[MINOR] Add more configuration to Kafka setup script (#3992)
* [MINOR] Add more configuration to Kafka setup script * Add option to reuse Kafka topic * Minor fixes to README
This commit is contained in:
@@ -61,7 +61,7 @@ Once downloaded and built, run the Zookeeper server and Kafka server using the c
|
|||||||
|
|
||||||
```bash
|
```bash
|
||||||
export KAFKA_HOME=/path/to/kafka_install_dir
|
export KAFKA_HOME=/path/to/kafka_install_dir
|
||||||
cd $KAFKA_KAFKA_HOME
|
cd $KAFKA_HOME
|
||||||
./bin/zookeeper-server-start.sh ./config/zookeeper.properties
|
./bin/zookeeper-server-start.sh ./config/zookeeper.properties
|
||||||
./bin/kafka-server-start.sh ./config/server.properties
|
./bin/kafka-server-start.sh ./config/server.properties
|
||||||
```
|
```
|
||||||
@@ -71,8 +71,9 @@ Wait until the kafka cluster is up and running.
|
|||||||
### 2 - Set up the schema registry
|
### 2 - Set up the schema registry
|
||||||
|
|
||||||
Hudi leverages schema registry to obtain the latest schema when writing records. While it supports most popular schema
|
Hudi leverages schema registry to obtain the latest schema when writing records. While it supports most popular schema
|
||||||
registries, we use Confluent schema registry. Download the latest confluent platform and run the schema registry
|
registries, we use Confluent schema registry. Download the
|
||||||
service.
|
latest [confluent platform](https://docs.confluent.io/platform/current/installation/index.html) and run the schema
|
||||||
|
registry service.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
cd $CONFLUENT_DIR
|
cd $CONFLUENT_DIR
|
||||||
@@ -98,6 +99,13 @@ cd $HUDI_DIR/hudi-kafka-connect/demo/
|
|||||||
bash setupKafka.sh -n <total_kafka_messages>
|
bash setupKafka.sh -n <total_kafka_messages>
|
||||||
```
|
```
|
||||||
|
|
||||||
|
To generate data for long-running tests, you can add `-b` option to specify the number of batches of data
|
||||||
|
to generate, with each batch containing a number of messages and idle time between batches, as follows:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
bash setupKafka.sh -n <num_kafka_messages_per_batch> -b <num_batches>
|
||||||
|
```
|
||||||
|
|
||||||
### 4 - Run the Sink connector worker (multiple workers can be run)
|
### 4 - Run the Sink connector worker (multiple workers can be run)
|
||||||
|
|
||||||
The Kafka connect is a distributed platform, with the ability to run one or more workers (each running multiple tasks)
|
The Kafka connect is a distributed platform, with the ability to run one or more workers (each running multiple tasks)
|
||||||
|
|||||||
@@ -21,11 +21,14 @@
|
|||||||
#########################
|
#########################
|
||||||
usage() {
|
usage() {
|
||||||
echo "Usage: $0"
|
echo "Usage: $0"
|
||||||
echo " -n |--num-kafka-records, (required) number of kafka records to generate"
|
echo " -n |--num-kafka-records, (required) number of kafka records to generate in a batch"
|
||||||
|
echo " -b |--num-batch, (optional) number of batches of records to generate (default is 1)"
|
||||||
|
echo " -t |--reuse-topic, (optional) reuses the Kafka topic (default deletes and recreate the topic)"
|
||||||
echo " -f |--raw-file, (optional) raw file for the kafka records"
|
echo " -f |--raw-file, (optional) raw file for the kafka records"
|
||||||
echo " -k |--kafka-topic, (optional) Topic name for Kafka"
|
echo " -k |--kafka-topic, (optional) Topic name for Kafka"
|
||||||
echo " -m |--num-kafka-partitions, (optional) number of kafka partitions"
|
echo " -m |--num-kafka-partitions, (optional) number of kafka partitions"
|
||||||
echo " -r |--record-key, (optional) field to use as record key"
|
echo " -r |--record-key, (optional) field to use as record key"
|
||||||
|
echo " -o |--record-key-offset, (optional) record key offset to start with (default is 0)"
|
||||||
echo " -l |--num-hudi-partitions, (optional) number of hudi partitions"
|
echo " -l |--num-hudi-partitions, (optional) number of hudi partitions"
|
||||||
echo " -p |--partition-key, (optional) field to use as partition"
|
echo " -p |--partition-key, (optional) field to use as partition"
|
||||||
echo " -s |--schema-file, (optional) path of the file containing the schema of the records"
|
echo " -s |--schema-file, (optional) path of the file containing the schema of the records"
|
||||||
@@ -53,12 +56,23 @@ recordKey=volume
|
|||||||
numHudiPartitions=5
|
numHudiPartitions=5
|
||||||
partitionField=date
|
partitionField=date
|
||||||
schemaFile=${HUDI_DIR}/docker/demo/config/schema.avsc
|
schemaFile=${HUDI_DIR}/docker/demo/config/schema.avsc
|
||||||
|
numBatch=1
|
||||||
|
recordValue=0
|
||||||
|
recreateTopic="Y"
|
||||||
|
|
||||||
while getopts ":n:f:k:m:r:l:p:s:-:" opt; do
|
while getopts ":n:b:tf:k:m:r:o:l:p:s:-:" opt; do
|
||||||
case $opt in
|
case $opt in
|
||||||
n)
|
n)
|
||||||
num_records="$OPTARG"
|
numRecords="$OPTARG"
|
||||||
printf "Argument num-kafka-records is %s\n" "$num_records"
|
printf "Argument num-kafka-records is %s\n" "$numRecords"
|
||||||
|
;;
|
||||||
|
b)
|
||||||
|
numBatch="$OPTARG"
|
||||||
|
printf "Argument num-batch is %s\n" "$numBatch"
|
||||||
|
;;
|
||||||
|
t)
|
||||||
|
recreateTopic="N"
|
||||||
|
printf "Argument recreate-topic is N (reuse Kafka topic) \n"
|
||||||
;;
|
;;
|
||||||
k)
|
k)
|
||||||
rawDataFile="$OPTARG"
|
rawDataFile="$OPTARG"
|
||||||
@@ -76,6 +90,10 @@ while getopts ":n:f:k:m:r:l:p:s:-:" opt; do
|
|||||||
recordKey="$OPTARG"
|
recordKey="$OPTARG"
|
||||||
printf "Argument record-key is %s\n" "$recordKey"
|
printf "Argument record-key is %s\n" "$recordKey"
|
||||||
;;
|
;;
|
||||||
|
o)
|
||||||
|
recordValue="$OPTARG"
|
||||||
|
printf "Argument record-key-offset is %s\n" "$recordValue"
|
||||||
|
;;
|
||||||
l)
|
l)
|
||||||
numHudiPartitions="$OPTARG"
|
numHudiPartitions="$OPTARG"
|
||||||
printf "Argument num-hudi-partitions is %s\n" "$numHudiPartitions"
|
printf "Argument num-hudi-partitions is %s\n" "$numHudiPartitions"
|
||||||
@@ -84,7 +102,7 @@ while getopts ":n:f:k:m:r:l:p:s:-:" opt; do
|
|||||||
partitionField="$OPTARG"
|
partitionField="$OPTARG"
|
||||||
printf "Argument partition-key is %s\n" "$partitionField"
|
printf "Argument partition-key is %s\n" "$partitionField"
|
||||||
;;
|
;;
|
||||||
p)
|
s)
|
||||||
schemaFile="$OPTARG"
|
schemaFile="$OPTARG"
|
||||||
printf "Argument schema-file is %s\n" "$schemaFile"
|
printf "Argument schema-file is %s\n" "$schemaFile"
|
||||||
;;
|
;;
|
||||||
@@ -94,11 +112,15 @@ while getopts ":n:f:k:m:r:l:p:s:-:" opt; do
|
|||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
|
|
||||||
# First delete the existing topic
|
if [ $recreateTopic = "Y" ]; then
|
||||||
${KAFKA_HOME}/bin/kafka-topics.sh --delete --topic ${kafkaTopicName} --bootstrap-server localhost:9092
|
# First delete the existing topic
|
||||||
|
echo "Delete Kafka topic $kafkaTopicName ..."
|
||||||
|
${KAFKA_HOME}/bin/kafka-topics.sh --delete --topic ${kafkaTopicName} --bootstrap-server localhost:9092
|
||||||
|
|
||||||
# Create the topic with 4 partitions
|
# Create the topic with 4 partitions
|
||||||
${KAFKA_HOME}/bin/kafka-topics.sh --create --topic ${kafkaTopicName} --partitions $numKafkaPartitions --replication-factor 1 --bootstrap-server localhost:9092
|
echo "Create Kafka topic $kafkaTopicName ..."
|
||||||
|
${KAFKA_HOME}/bin/kafka-topics.sh --create --topic ${kafkaTopicName} --partitions $numKafkaPartitions --replication-factor 1 --bootstrap-server localhost:9092
|
||||||
|
fi
|
||||||
|
|
||||||
# Setup the schema registry
|
# Setup the schema registry
|
||||||
export SCHEMA=$(sed 's|/\*|\n&|g;s|*/|&\n|g' ${schemaFile} | sed '/\/\*/,/*\//d' | jq tostring)
|
export SCHEMA=$(sed 's|/\*|\n&|g;s|*/|&\n|g' ${schemaFile} | sed '/\/\*/,/*\//d' | jq tostring)
|
||||||
@@ -115,32 +137,38 @@ done
|
|||||||
events_file=/tmp/kcat-input.events
|
events_file=/tmp/kcat-input.events
|
||||||
rm -f ${events_file}
|
rm -f ${events_file}
|
||||||
|
|
||||||
recordValue=0
|
totalNumRecords=$((numRecords + recordValue))
|
||||||
num_records=$((num_records + 0))
|
|
||||||
|
|
||||||
for (( ; ; )); do
|
for ((i = 1;i<=numBatch;i++)); do
|
||||||
while IFS= read line; do
|
rm -f ${events_file}
|
||||||
for partitionValue in "${partitions[@]}"; do
|
date
|
||||||
echo $line | jq --arg recordKey $recordKey --arg recordValue $recordValue --arg partitionField $partitionField --arg partitionValue $partitionValue -c '.[$recordKey] = $recordValue | .[$partitionField] = $partitionValue' >>${events_file}
|
echo "Start batch $i ..."
|
||||||
((recordValue = recordValue + 1))
|
batchRecordSeq=0
|
||||||
|
for (( ; ; )); do
|
||||||
|
while IFS= read line; do
|
||||||
|
for partitionValue in "${partitions[@]}"; do
|
||||||
|
echo $line | jq --arg recordKey $recordKey --arg recordValue $recordValue --arg partitionField $partitionField --arg partitionValue $partitionValue -c '.[$recordKey] = $recordValue | .[$partitionField] = $partitionValue' >>${events_file}
|
||||||
|
((recordValue = recordValue + 1))
|
||||||
|
((batchRecordSeq = batchRecordSeq + 1))
|
||||||
|
|
||||||
if [ $recordValue -gt $num_records ]; then
|
if [ $batchRecordSeq -eq $numRecords ]; then
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
if [ $batchRecordSeq -eq $numRecords ]; then
|
||||||
break
|
break
|
||||||
fi
|
fi
|
||||||
done
|
done <"$rawDataFile"
|
||||||
|
|
||||||
if [ $recordValue -gt $num_records ]; then
|
if [ $batchRecordSeq -eq $numRecords ]; then
|
||||||
break
|
date
|
||||||
fi
|
echo " Record key until $recordValue"
|
||||||
|
sleep 20
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
if [ $(($recordValue % 1000)) -eq 0 ]; then
|
echo "publish to Kafka ..."
|
||||||
sleep 1
|
grep -v '^$' ${events_file} | kcat -P -b localhost:9092 -t hudi-test-topic
|
||||||
fi
|
|
||||||
done <"$rawDataFile"
|
|
||||||
|
|
||||||
if [ $recordValue -gt $num_records ]; then
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
done
|
done
|
||||||
|
|
||||||
grep -v '^$' ${events_file} | kcat -P -b localhost:9092 -t hudi-test-topic
|
|
||||||
|
|||||||
Reference in New Issue
Block a user