
[HUDI-2325] Add hive sync support to kafka connect (#3660)

Co-authored-by: Rajesh Mahindra <rmahindra@Rajeshs-MacBook-Pro.local>
Author: rmahindra123
Date: 2021-11-23 15:48:06 -08:00 (committed by GitHub)
Parent: 969a5bf11e
Commit: fbff0799b9
11 changed files with 344 additions and 88 deletions

View File

@@ -1,7 +1,7 @@
{
"name": "hudi-sink",
"config": {
"bootstrap.servers": "localhost:9092",
"bootstrap.servers": "kafkabroker:9092",
"connector.class": "org.apache.hudi.connect.HoodieSinkConnector",
"tasks.max": "4",
"key.converter": "org.apache.kafka.connect.storage.StringConverter",
@@ -11,10 +11,21 @@
"hoodie.table.name": "hudi-test-topic",
"hoodie.table.type": "MERGE_ON_READ",
"hoodie.metadata.enable": "false",
"hoodie.base.path": "file:///tmp/hoodie/hudi-test-topic",
"hoodie.base.path": "hdfs://namenode:8020/user/hive/warehouse/hudi-test-topic",
"hoodie.datasource.write.recordkey.field": "volume",
"hoodie.datasource.write.partitionpath.field": "date",
"hoodie.schemaprovider.class": "org.apache.hudi.schema.SchemaRegistryProvider",
"hoodie.deltastreamer.schemaprovider.registry.url": "http://localhost:8081/subjects/hudi-test-topic/versions/latest"
}
"hoodie.deltastreamer.schemaprovider.registry.url": "http://localhost:8082/subjects/hudi-test-topic/versions/latest",
"hoodie.kafka.commit.interval.secs": 60,
"hoodie.meta.sync.enable": "true",
"hoodie.meta.sync.classes": "org.apache.hudi.hive.HiveSyncTool",
"hoodie.datasource.hive_sync.table": "huditesttopic",
"hoodie.datasource.hive_sync.partition_fields": "date",
"hoodie.datasource.hive_sync.partition_extractor_class": "org.apache.hudi.hive.MultiPartKeysValueExtractor",
"hoodie.datasource.hive_sync.use_jdbc": "false",
"hoodie.datasource.hive_sync.mode": "hms",
"dfs.client.use.datanode.hostname": "true",
"hive.metastore.uris": "thrift://hivemetastore:9083",
"hive.metastore.client.socket.timeout": "1500s"
}
}
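
The connector config above now writes to HDFS and enables Hudi's meta sync (HiveSyncTool in HMS mode against the hivemetastore Thrift URI). A config in this name/config JSON shape is normally registered with a running Kafka Connect cluster through its REST API; a minimal sketch, assuming the Connect worker's REST endpoint listens on the default port 8083 and the JSON above is saved as config-sink.json (both are assumptions, not part of this commit):

# Register the hudi-sink connector from the JSON config above.
# Host, port, and file name are illustrative assumptions.
curl -X POST -H "Content-Type: application/json" \
  --data @config-sink.json \
  http://localhost:8083/connectors

# Confirm the connector and its tasks are running.
curl -s http://localhost:8083/connectors/hudi-sink/status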

View File

@@ -15,7 +15,7 @@
# limitations under the License.
##
-bootstrap.servers=localhost:9092
+bootstrap.servers=kafkabroker:9092
group.id=hudi-connect-cluster
key.converter=org.apache.kafka.connect.json.JsonConverter
value.converter=org.apache.kafka.connect.json.JsonConverter
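
The worker properties above simply point the Connect cluster at the kafkabroker host instead of localhost. For reference, a minimal sketch of launching a distributed Connect worker with this file, assuming a standard Kafka distribution under ${KAFKA_HOME} and that the Hudi sink connector jars are already on the worker's plugin path (both are assumptions, not part of this commit):

# Start a Kafka Connect worker in distributed mode using the properties above.
# The properties file path is an illustrative assumption.
${KAFKA_HOME}/bin/connect-distributed.sh /path/to/connect-distributed.properties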

View File

@@ -50,6 +50,7 @@ fi
## defaults
rawDataFile=${HUDI_DIR}/docker/demo/data/batch_1.json
+kafkaBrokerHostname=kafkabroker
kafkaTopicName=hudi-test-topic
numKafkaPartitions=4
recordKey=volume
@@ -115,23 +116,23 @@ done
if [ $recreateTopic = "Y" ]; then
# First delete the existing topic
echo "Delete Kafka topic $kafkaTopicName ..."
-${KAFKA_HOME}/bin/kafka-topics.sh --delete --topic ${kafkaTopicName} --bootstrap-server localhost:9092
+${KAFKA_HOME}/bin/kafka-topics.sh --delete --topic ${kafkaTopicName} --bootstrap-server ${kafkaBrokerHostname}:9092
# Create the topic with 4 partitions
echo "Create Kafka topic $kafkaTopicName ..."
-${KAFKA_HOME}/bin/kafka-topics.sh --create --topic ${kafkaTopicName} --partitions $numKafkaPartitions --replication-factor 1 --bootstrap-server localhost:9092
+${KAFKA_HOME}/bin/kafka-topics.sh --create --topic ${kafkaTopicName} --partitions $numKafkaPartitions --replication-factor 1 --bootstrap-server ${kafkaBrokerHostname}:9092
fi
# Setup the schema registry
export SCHEMA=$(sed 's|/\*|\n&|g;s|*/|&\n|g' ${schemaFile} | sed '/\/\*/,/*\//d' | jq tostring)
-curl -X POST -H "Content-Type: application/vnd.schemaregistry.v1+json" --data "{\"schema\": $SCHEMA}" http://localhost:8081/subjects/${kafkaTopicName}/versions
+curl -X POST -H "Content-Type: application/vnd.schemaregistry.v1+json" --data "{\"schema\": $SCHEMA}" http://localhost:8082/subjects/${kafkaTopicName}/versions
curl -X GET http://localhost:8081/subjects/${kafkaTopicName}/versions/latest
# Generate kafka messages from raw records
# Each records with unique keys and generate equal messages across each hudi partition
partitions={}
for ((i = 0; i < ${numHudiPartitions}; i++)); do
partitions[$i]="partition-"$i
partitions[$i]="partition_"$i
done
events_file=/tmp/kcat-input.events
@@ -170,5 +171,5 @@ for ((i = 1;i<=numBatch;i++)); do
done
echo "publish to Kafka ..."
-grep -v '^$' ${events_file} | kcat -P -b localhost:9092 -t hudi-test-topic
+grep -v '^$' ${events_file} | kcat -P -b ${kafkaBrokerHostname}:9092 -t ${kafkaTopicName}
done
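
Once the connector starts committing and meta sync runs, the table should appear in the Hive Metastore under the configured hoodie.datasource.hive_sync.table name. A minimal sketch of verifying the sync with Beeline, assuming HiveServer2 is reachable at hiveserver:10000 (host and port are assumptions, not part of this commit); for a MERGE_ON_READ table, hive sync typically registers read-optimized and real-time views with _ro and _rt suffixes:

# List the synced table(s); the JDBC URL is an illustrative assumption.
beeline -u jdbc:hive2://hiveserver:10000 -e "show tables like 'huditesttopic*'"

# Query the read-optimized view of the MERGE_ON_READ table.
beeline -u jdbc:hive2://hiveserver:10000 -e "select count(*) from huditesttopic_ro"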