[HUDI-2325] Add hive sync support to kafka connect (#3660)
Co-authored-by: Rajesh Mahindra <rmahindra@Rajeshs-MacBook-Pro.local>
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"name": "hudi-sink",
|
||||
"config": {
|
||||
"bootstrap.servers": "localhost:9092",
|
||||
"bootstrap.servers": "kafkabroker:9092",
|
||||
"connector.class": "org.apache.hudi.connect.HoodieSinkConnector",
|
||||
"tasks.max": "4",
|
||||
"key.converter": "org.apache.kafka.connect.storage.StringConverter",
|
||||
@@ -11,10 +11,21 @@
|
||||
"hoodie.table.name": "hudi-test-topic",
|
||||
"hoodie.table.type": "MERGE_ON_READ",
|
||||
"hoodie.metadata.enable": "false",
|
||||
"hoodie.base.path": "file:///tmp/hoodie/hudi-test-topic",
|
||||
"hoodie.base.path": "hdfs://namenode:8020/user/hive/warehouse/hudi-test-topic",
|
||||
"hoodie.datasource.write.recordkey.field": "volume",
|
||||
"hoodie.datasource.write.partitionpath.field": "date",
|
||||
"hoodie.schemaprovider.class": "org.apache.hudi.schema.SchemaRegistryProvider",
|
||||
"hoodie.deltastreamer.schemaprovider.registry.url": "http://localhost:8081/subjects/hudi-test-topic/versions/latest"
|
||||
}
|
||||
"hoodie.deltastreamer.schemaprovider.registry.url": "http://localhost:8082/subjects/hudi-test-topic/versions/latest",
|
||||
"hoodie.kafka.commit.interval.secs": 60,
|
||||
"hoodie.meta.sync.enable": "true",
|
||||
"hoodie.meta.sync.classes": "org.apache.hudi.hive.HiveSyncTool",
|
||||
"hoodie.datasource.hive_sync.table": "huditesttopic",
|
||||
"hoodie.datasource.hive_sync.partition_fields": "date",
|
||||
"hoodie.datasource.hive_sync.partition_extractor_class": "org.apache.hudi.hive.MultiPartKeysValueExtractor",
|
||||
"hoodie.datasource.hive_sync.use_jdbc": "false",
|
||||
"hoodie.datasource.hive_sync.mode": "hms",
|
||||
"dfs.client.use.datanode.hostname": "true",
|
||||
"hive.metastore.uris": "thrift://hivemetastore:9083",
|
||||
"hive.metastore.client.socket.timeout": "1500s"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
# limitations under the License.
|
||||
##
|
||||
|
||||
bootstrap.servers=localhost:9092
|
||||
bootstrap.servers=kafkabroker:9092
|
||||
group.id=hudi-connect-cluster
|
||||
key.converter=org.apache.kafka.connect.json.JsonConverter
|
||||
value.converter=org.apache.kafka.connect.json.JsonConverter
|
||||
|
||||
@@ -50,6 +50,7 @@ fi
|
||||
|
||||
## defaults
|
||||
rawDataFile=${HUDI_DIR}/docker/demo/data/batch_1.json
|
||||
kafkaBrokerHostname=kafkabroker
|
||||
kafkaTopicName=hudi-test-topic
|
||||
numKafkaPartitions=4
|
||||
recordKey=volume
|
||||
@@ -115,23 +116,23 @@ done
|
||||
if [ $recreateTopic = "Y" ]; then
|
||||
# First delete the existing topic
|
||||
echo "Delete Kafka topic $kafkaTopicName ..."
|
||||
${KAFKA_HOME}/bin/kafka-topics.sh --delete --topic ${kafkaTopicName} --bootstrap-server localhost:9092
|
||||
${KAFKA_HOME}/bin/kafka-topics.sh --delete --topic ${kafkaTopicName} --bootstrap-server ${kafkaBrokerHostname}:9092
|
||||
|
||||
# Create the topic with $numKafkaPartitions partitions (4 by default)
|
||||
echo "Create Kafka topic $kafkaTopicName ..."
|
||||
${KAFKA_HOME}/bin/kafka-topics.sh --create --topic ${kafkaTopicName} --partitions $numKafkaPartitions --replication-factor 1 --bootstrap-server localhost:9092
|
||||
${KAFKA_HOME}/bin/kafka-topics.sh --create --topic ${kafkaTopicName} --partitions $numKafkaPartitions --replication-factor 1 --bootstrap-server ${kafkaBrokerHostname}:9092
|
||||
fi
|
||||
|
||||
# Setup the schema registry
|
||||
export SCHEMA=$(sed 's|/\*|\n&|g;s|*/|&\n|g' ${schemaFile} | sed '/\/\*/,/*\//d' | jq tostring)
|
||||
curl -X POST -H "Content-Type: application/vnd.schemaregistry.v1+json" --data "{\"schema\": $SCHEMA}" http://localhost:8081/subjects/${kafkaTopicName}/versions
|
||||
curl -X POST -H "Content-Type: application/vnd.schemaregistry.v1+json" --data "{\"schema\": $SCHEMA}" http://localhost:8082/subjects/${kafkaTopicName}/versions
|
||||
curl -X GET http://localhost:8081/subjects/${kafkaTopicName}/versions/latest
|
||||
|
||||
# Generate kafka messages from raw records
|
||||
# Each record gets a unique key, and an equal number of messages is generated for each hudi partition
|
||||
partitions={}
|
||||
for ((i = 0; i < ${numHudiPartitions}; i++)); do
|
||||
partitions[$i]="partition-"$i
|
||||
partitions[$i]="partition_"$i
|
||||
done
|
||||
|
||||
events_file=/tmp/kcat-input.events
|
||||
@@ -170,5 +171,5 @@ for ((i = 1;i<=numBatch;i++)); do
|
||||
done
|
||||
|
||||
echo "publish to Kafka ..."
|
||||
grep -v '^$' ${events_file} | kcat -P -b localhost:9092 -t hudi-test-topic
|
||||
grep -v '^$' ${events_file} | kcat -P -b ${kafkaBrokerHostname}:9092 -t ${kafkaTopicName}
|
||||
done
|
||||
|
||||
Reference in New Issue
Block a user