
[MINOR] Add more configuration to Kafka setup script (#3992)

* [MINOR] Add more configuration to Kafka setup script

* Add option to reuse Kafka topic

* Minor fixes to README
Ethan Guo
2021-11-22 18:03:38 -08:00
committed by GitHub
parent e22150fe15
commit 6aa710eae0
2 changed files with 70 additions and 34 deletions
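For quick reference, the options touched by this commit can be combined as in the sketch below. The flag semantics come from the updated usage text in setupKafka.sh; the concrete values are placeholders, not part of the commit.

```bash
# Hypothetical invocation (values are illustrative): generate 10 batches of
# 1000 records each, reuse the existing Kafka topic (-t), and start record
# keys at offset 5000 (-o).
cd $HUDI_DIR/hudi-kafka-connect/demo/
bash setupKafka.sh -n 1000 -b 10 -t -o 5000
```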

hudi-kafka-connect/README.md

@@ -61,7 +61,7 @@ Once downloaded and built, run the Zookeeper server and Kafka server using the c
 ```bash
 export KAFKA_HOME=/path/to/kafka_install_dir
-cd $KAFKA_KAFKA_HOME
+cd $KAFKA_HOME
 ./bin/zookeeper-server-start.sh ./config/zookeeper.properties
 ./bin/kafka-server-start.sh ./config/server.properties
 ```
@@ -71,8 +71,9 @@ Wait until the kafka cluster is up and running.
 ### 2 - Set up the schema registry
 Hudi leverages schema registry to obtain the latest schema when writing records. While it supports most popular schema
-registries, we use Confluent schema registry. Download the latest confluent platform and run the schema registry
-service.
+registries, we use Confluent schema registry. Download the
+latest [confluent platform](https://docs.confluent.io/platform/current/installation/index.html) and run the schema
+registry service.
 ```bash
 cd $CONFLUENT_DIR
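The README lines that follow this hunk (not shown here) start the registry service itself. A minimal sketch, assuming a stock Confluent Platform layout where the start script and its properties file sit under the install directory:

```bash
# Assumes a default Confluent Platform install; adjust paths per version.
cd $CONFLUENT_DIR
./bin/schema-registry-start ./etc/schema-registry/schema-registry.properties
```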
@@ -98,6 +99,13 @@ cd $HUDI_DIR/hudi-kafka-connect/demo/
 bash setupKafka.sh -n <total_kafka_messages>
 ```
+To generate data for long-running tests, you can add the `-b` option to specify the number of batches of data
+to generate, with each batch containing a number of messages and an idle time between batches, as follows:
+```bash
+bash setupKafka.sh -n <num_kafka_messages_per_batch> -b <num_batches>
+```
 ### 4 - Run the Sink connector worker (multiple workers can be run)
 The Kafka connect is a distributed platform, with the ability to run one or more workers (each running multiple tasks)
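Worker startup is covered later in the README and is unchanged by this commit. As a rough sketch, a distributed Connect worker is launched with the stock Kafka script; the properties path below is a placeholder for whatever connect-distributed configuration the demo ships:

```bash
# Placeholder properties path; the demo provides its own connect-distributed
# configuration that registers the Hudi sink connector.
cd $KAFKA_HOME
./bin/connect-distributed.sh /path/to/connect-distributed.properties
```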

hudi-kafka-connect/demo/setupKafka.sh

@@ -21,11 +21,14 @@
 #########################
 usage() {
   echo "Usage: $0"
-  echo " -n |--num-kafka-records, (required) number of kafka records to generate"
+  echo " -n |--num-kafka-records, (required) number of kafka records to generate in a batch"
+  echo " -b |--num-batch, (optional) number of batches of records to generate (default is 1)"
+  echo " -t |--reuse-topic, (optional) reuses the Kafka topic (default deletes and recreates the topic)"
   echo " -f |--raw-file, (optional) raw file for the kafka records"
   echo " -k |--kafka-topic, (optional) Topic name for Kafka"
   echo " -m |--num-kafka-partitions, (optional) number of kafka partitions"
   echo " -r |--record-key, (optional) field to use as record key"
+  echo " -o |--record-key-offset, (optional) record key offset to start with (default is 0)"
   echo " -l |--num-hudi-partitions, (optional) number of hudi partitions"
   echo " -p |--partition-key, (optional) field to use as partition"
   echo " -s |--schema-file, (optional) path of the file containing the schema of the records"
@@ -53,12 +56,23 @@ recordKey=volume
 numHudiPartitions=5
 partitionField=date
 schemaFile=${HUDI_DIR}/docker/demo/config/schema.avsc
+numBatch=1
+recordValue=0
+recreateTopic="Y"

-while getopts ":n:f:k:m:r:l:p:s:-:" opt; do
+while getopts ":n:b:tf:k:m:r:o:l:p:s:-:" opt; do
   case $opt in
   n)
-    num_records="$OPTARG"
-    printf "Argument num-kafka-records is %s\n" "$num_records"
+    numRecords="$OPTARG"
+    printf "Argument num-kafka-records is %s\n" "$numRecords"
+    ;;
+  b)
+    numBatch="$OPTARG"
+    printf "Argument num-batch is %s\n" "$numBatch"
+    ;;
+  t)
+    recreateTopic="N"
+    printf "Argument recreate-topic is N (reuse Kafka topic) \n"
     ;;
   k)
     rawDataFile="$OPTARG"
@@ -76,6 +90,10 @@ while getopts ":n:f:k:m:r:l:p:s:-:" opt; do
     recordKey="$OPTARG"
     printf "Argument record-key is %s\n" "$recordKey"
     ;;
+  o)
+    recordValue="$OPTARG"
+    printf "Argument record-key-offset is %s\n" "$recordValue"
+    ;;
   l)
     numHudiPartitions="$OPTARG"
     printf "Argument num-hudi-partitions is %s\n" "$numHudiPartitions"
@@ -84,7 +102,7 @@ while getopts ":n:f:k:m:r:l:p:s:-:" opt; do
     partitionField="$OPTARG"
     printf "Argument partition-key is %s\n" "$partitionField"
     ;;
-  p)
+  s)
     schemaFile="$OPTARG"
     printf "Argument schema-file is %s\n" "$schemaFile"
     ;;
@@ -94,11 +112,15 @@ while getopts ":n:f:k:m:r:l:p:s:-:" opt; do
   esac
 done

-# First delete the existing topic
-${KAFKA_HOME}/bin/kafka-topics.sh --delete --topic ${kafkaTopicName} --bootstrap-server localhost:9092
+if [ $recreateTopic = "Y" ]; then
+  # First delete the existing topic
+  echo "Delete Kafka topic $kafkaTopicName ..."
+  ${KAFKA_HOME}/bin/kafka-topics.sh --delete --topic ${kafkaTopicName} --bootstrap-server localhost:9092

-# Create the topic with 4 partitions
-${KAFKA_HOME}/bin/kafka-topics.sh --create --topic ${kafkaTopicName} --partitions $numKafkaPartitions --replication-factor 1 --bootstrap-server localhost:9092
+  # Create the topic with 4 partitions
+  echo "Create Kafka topic $kafkaTopicName ..."
+  ${KAFKA_HOME}/bin/kafka-topics.sh --create --topic ${kafkaTopicName} --partitions $numKafkaPartitions --replication-factor 1 --bootstrap-server localhost:9092
+fi

 # Setup the schema registry
 export SCHEMA=$(sed 's|/\*|\n&|g;s|*/|&\n|g' ${schemaFile} | sed '/\/\*/,/*\//d' | jq tostring)
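Whether -t (reuse) is appropriate depends on the topic already existing with the expected partition count. One way to check before running the script is the standard kafka-topics describe command; the topic name below is the demo default used by the kcat publish step, hudi-test-topic:

```bash
# Inspect the existing topic before deciding to reuse it with -t.
${KAFKA_HOME}/bin/kafka-topics.sh --describe --topic hudi-test-topic --bootstrap-server localhost:9092
```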
@@ -115,32 +137,38 @@ done
 events_file=/tmp/kcat-input.events
 rm -f ${events_file}

-recordValue=0
-num_records=$((num_records + 0))
-for (( ; ; )); do
-  while IFS= read line; do
-    for partitionValue in "${partitions[@]}"; do
-      echo $line | jq --arg recordKey $recordKey --arg recordValue $recordValue --arg partitionField $partitionField --arg partitionValue $partitionValue -c '.[$recordKey] = $recordValue | .[$partitionField] = $partitionValue' >>${events_file}
-      ((recordValue = recordValue + 1))
-      if [ $recordValue -gt $num_records ]; then
-        break
-      fi
-    done
-    if [ $recordValue -gt $num_records ]; then
-      break
-    fi
-    if [ $(($recordValue % 1000)) -eq 0 ]; then
-      sleep 1
-    fi
-  done <"$rawDataFile"
-  if [ $recordValue -gt $num_records ]; then
-    break
-  fi
-done
-
-grep -v '^$' ${events_file} | kcat -P -b localhost:9092 -t hudi-test-topic
+totalNumRecords=$((numRecords + recordValue))
+
+for ((i = 1;i<=numBatch;i++)); do
+  rm -f ${events_file}
+  date
+  echo "Start batch $i ..."
+  batchRecordSeq=0
+  for (( ; ; )); do
+    while IFS= read line; do
+      for partitionValue in "${partitions[@]}"; do
+        echo $line | jq --arg recordKey $recordKey --arg recordValue $recordValue --arg partitionField $partitionField --arg partitionValue $partitionValue -c '.[$recordKey] = $recordValue | .[$partitionField] = $partitionValue' >>${events_file}
+        ((recordValue = recordValue + 1))
+        ((batchRecordSeq = batchRecordSeq + 1))
+        if [ $batchRecordSeq -eq $numRecords ]; then
+          break
+        fi
+      done
+      if [ $batchRecordSeq -eq $numRecords ]; then
+        break
+      fi
+    done <"$rawDataFile"
+    if [ $batchRecordSeq -eq $numRecords ]; then
+      date
+      echo " Record key until $recordValue"
+      sleep 20
+      break
+    fi
+  done
+  echo "publish to Kafka ..."
+  grep -v '^$' ${events_file} | kcat -P -b localhost:9092 -t hudi-test-topic
+done
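The record-key bookkeeping in the rewritten loop can be summarized on its own: recordValue starts at the -o offset and never resets across batches, while batchRecordSeq counts records within the current batch only. A standalone sketch with illustrative values (variable names mirror the script):

```bash
#!/usr/bin/env bash
# Illustrative only: how record keys advance across batches in setupKafka.sh.
numRecords=1000   # -n, records per batch
numBatch=3        # -b, number of batches
recordValue=5000  # -o, starting record-key offset

for ((i = 1; i <= numBatch; i++)); do
  batchRecordSeq=0
  while [ $batchRecordSeq -lt $numRecords ]; do
    ((recordValue = recordValue + 1))
    ((batchRecordSeq = batchRecordSeq + 1))
  done
  echo "Batch $i produced record keys up to $recordValue"
done
```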