[HUDI-2330][HUDI-2335] Adding support for merge-on-read tables (#3679)
- Inserts go into logs, hashed by Kafka and Hudi partitions
- Fixed issues with the setupKafka script
- Bumped up the default commit interval to 300 seconds
- Minor renaming
This commit is contained in:
@@ -9,10 +9,11 @@
|
||||
"value.converter.schemas.enable": "false",
|
||||
"topics": "hudi-test-topic",
|
||||
"hoodie.table.name": "hudi-test-topic",
|
||||
"hoodie.table.type": "MERGE_ON_READ",
|
||||
"hoodie.base.path": "file:///tmp/hoodie/hudi-test-topic",
|
||||
"hoodie.datasource.write.recordkey.field": "volume",
|
||||
"hoodie.datasource.write.partitionpath.field": "date",
|
||||
"hoodie.schemaprovider.class": "org.apache.hudi.schema.SchemaRegistryProvider",
|
||||
"hoodie.deltastreamer.schemaprovider.registry.url": "http://localhost:8081/subjects/hudi-test-topic/versions/latest"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
131
hudi-kafka-connect/demo/setupKafka.sh
Normal file → Executable file
131
hudi-kafka-connect/demo/setupKafka.sh
Normal file → Executable file
@@ -16,38 +16,33 @@
|
||||
|
||||
#!/bin/bash

## Directories
# HOME_DIR is the invoking user's home directory (tilde expansion); the Hudi
# checkout and the Kafka installation are both looked up directly beneath it.
HOME_DIR=~
HUDI_DIR="${HOME_DIR}/hudi"
KAFKA_HOME="${HOME_DIR}/kafka"
|
||||
|
||||
#########################
|
||||
# The command line help #
|
||||
#########################
|
||||
# Print the supported command-line options to stdout and terminate.
# Arguments: none ($0 is used as the program name in the first line).
# Outputs:   nine lines of help text on stdout.
# Returns:   never returns; exits the current (sub)shell with status 1.
usage() {
  printf '%s\n' \
    "Usage: $0" \
    " -n |--num-kafka-records, (required) number of kafka records to generate" \
    " -f |--raw-file, (optional) raw file for the kafka records" \
    " -k |--kafka-topic, (optional) Topic name for Kafka" \
    " -m |--num-kafka-partitions, (optional) number of kafka partitions" \
    " -r |--record-key, (optional) field to use as record key" \
    " -l |--num-hudi-partitions, (optional) number of hudi partitions" \
    " -p |--partition-key, (optional) field to use as partition" \
    " -s |--schema-file, (optional) path of the file containing the schema of the records"
  exit 1
}
|
||||
|
||||
# Show the help text when the first argument is --help.
# usage terminates its shell with status 1, so it is run in a subshell here;
# that lets an explicit help request finish with status 0.
case "$1" in
  --help)
    (usage)
    exit 0
    ;;
esac
|
||||
|
||||
# At least one argument is required. usage prints the help text and ends the
# script with status 1, so nothing after it would ever run.
if (($# < 1)); then
  echo "Illegal number of parameters"
  usage
fi
|
||||
|
||||
## defaults
|
||||
@@ -61,71 +56,91 @@ schemaFile=${HUDI_DIR}/docker/demo/config/schema.avsc
|
||||
|
||||
# Parse the short command-line flags into the script's global settings.
# Globals (written): num_records, rawDataFile, kafkaTopicName,
#                    numKafkaPartitions, recordKey, numHudiPartitions,
#                    partitionField, schemaFile, opt, OPTIND, OPTARG
# Arguments: the script's command-line arguments.
# Outputs:   one "Argument <name> is <value>" line on stdout per accepted flag;
#            diagnostics for rejected flags on stderr.
# The flag letters follow the help text: -f is the raw file, -k the topic and
# -s the schema file. Only short flags are parsed; anything starting with "--"
# reaches the "-" arm and is reported as invalid.
parse_args() {
  while getopts ":n:f:k:m:r:l:p:s:-:" opt; do
    case "${opt}" in
      n)
        num_records="${OPTARG}"
        printf "Argument num-kafka-records is %s\n" "${num_records}"
        ;;
      f)
        rawDataFile="${OPTARG}"
        printf "Argument raw-file is %s\n" "${rawDataFile}"
        ;;
      k)
        kafkaTopicName="${OPTARG}"
        printf "Argument kafka-topic is %s\n" "${kafkaTopicName}"
        ;;
      m)
        numKafkaPartitions="${OPTARG}"
        printf "Argument num-kafka-partitions is %s\n" "${numKafkaPartitions}"
        ;;
      r)
        recordKey="${OPTARG}"
        printf "Argument record-key is %s\n" "${recordKey}"
        ;;
      l)
        numHudiPartitions="${OPTARG}"
        printf "Argument num-hudi-partitions is %s\n" "${numHudiPartitions}"
        ;;
      p)
        partitionField="${OPTARG}"
        printf "Argument partition-key is %s\n" "${partitionField}"
        ;;
      s)
        schemaFile="${OPTARG}"
        printf "Argument schema-file is %s\n" "${schemaFile}"
        ;;
      -)
        echo "Invalid option -$OPTARG" >&2
        ;;
      :)
        # Silent-mode getopts: a known flag was given without its argument.
        echo "Option -$OPTARG requires an argument" >&2
        ;;
      \?)
        # Silent-mode getopts: the flag letter is not in the option string.
        echo "Invalid option -$OPTARG" >&2
        ;;
    esac
  done
}

parse_args "$@"
|
||||
|
||||
# First delete the existing topic
"${KAFKA_HOME}/bin/kafka-topics.sh" --delete --topic "${kafkaTopicName}" --bootstrap-server localhost:9092

# Create the topic with the requested number of partitions
"${KAFKA_HOME}/bin/kafka-topics.sh" --create --topic "${kafkaTopicName}" --partitions "${numKafkaPartitions}" --replication-factor 1 --bootstrap-server localhost:9092

# Setup the schema registry
# The two sed passes put every /* ... */ comment on lines of its own and then
# delete those lines; jq encodes the remaining JSON document as one JSON string
# so it can be embedded in the request body below.
SCHEMA=$(sed 's|/\*|\n&|g;s|*/|&\n|g' "${schemaFile}" | sed '/\/\*/,/*\//d' | jq tostring)
export SCHEMA
# Register the schema under the topic's subject, then read it back.
curl -X POST -H "Content-Type: application/vnd.schemaregistry.v1+json" --data "{\"schema\": $SCHEMA}" "http://localhost:8081/subjects/${kafkaTopicName}/versions"
curl -X GET "http://localhost:8081/subjects/${kafkaTopicName}/versions/latest"
|
||||
|
||||
|
||||
# Generate kafka messages from raw records
|
||||
# Each record gets a unique key, and the messages are spread equally across the hudi partitions
|
||||
# Build the list of hudi partition values: partition-0 .. partition-(N-1),
# where N is numHudiPartitions. The array starts out truly empty, so N = 0
# yields no partitions at all.
partitions=()
for ((i = 0; i < numHudiPartitions; i++)); do
  partitions[i]="partition-${i}"
done
|
||||
|
||||
# Generate exactly num_records events into a local file, then publish the file
# to Kafka in a single kcat call.
#
# Every line of the raw data file is used once per hudi partition: jq sets the
# record-key field to a running counter (passed as a string via --arg) and the
# partition field to the partition value. The raw file is re-read from the
# start as often as needed to reach num_records.
events_file=/tmp/kcat-input.events
# Start from an empty file so the publish step always has something to read.
: >"${events_file}"

recordValue=0
# Arithmetic evaluation turns an unset or empty num_records into 0.
num_records=$((num_records + 0))

# Without input lines or partitions the generation loop could never advance.
if [[ ! -s "${rawDataFile}" ]]; then
  echo "Raw data file '${rawDataFile}' is missing or empty" >&2
  exit 1
fi
if ((${#partitions[@]} == 0)); then
  echo "No hudi partitions to generate records for" >&2
  exit 1
fi

while ((recordValue < num_records)); do
  # "|| [[ -n ... ]]" keeps a final line that lacks a trailing newline.
  while IFS= read -r line || [[ -n "${line}" ]]; do
    for partitionValue in "${partitions[@]}"; do
      printf '%s\n' "${line}" |
        jq --arg recordKey "${recordKey}" --arg recordValue "${recordValue}" --arg partitionField "${partitionField}" --arg partitionValue "${partitionValue}" -c '.[$recordKey] = $recordValue | .[$partitionField] = $partitionValue' >>"${events_file}"
      ((recordValue += 1))

      # Leave both the partition loop and the file-reading loop once the
      # requested number of records has been written.
      if ((recordValue >= num_records)); then
        break 2
      fi
    done
  done <"${rawDataFile}"
done

# Drop empty lines, then publish to the configured topic; fall back to the
# demo topic when no topic name is set.
grep -v '^$' "${events_file}" | kcat -P -b localhost:9092 -t "${kafkaTopicName:-hudi-test-topic}"
|
||||
|
||||
Reference in New Issue
Block a user