[HUDI-2330][HUDI-2335] Adding support for merge-on-read tables (#3679)

- Inserts go into logs, hashed by Kafka and Hudi partitions
- Fixed issues with the setupKafka script
- Bumped up the default commit interval to 300 seconds
- Minor renaming
2021-09-16 15:24:34 -07:00
parent b8dad628e5
commit 57d5da68aa
16 changed files with 315 additions and 124 deletions
--- a/hudi-kafka-connect/demo/config-sink.json
+++ b/hudi-kafka-connect/demo/config-sink.json
@@ -9,10 +9,11 @@
 		"value.converter.schemas.enable": "false",
 		"topics": "hudi-test-topic",
 		"hoodie.table.name": "hudi-test-topic",
+		"hoodie.table.type": "MERGE_ON_READ",
 		"hoodie.base.path": "file:///tmp/hoodie/hudi-test-topic",
 		"hoodie.datasource.write.recordkey.field": "volume",
 		"hoodie.datasource.write.partitionpath.field": "date",
 		"hoodie.schemaprovider.class": "org.apache.hudi.schema.SchemaRegistryProvider",
 		"hoodie.deltastreamer.schemaprovider.registry.url": "http://localhost:8081/subjects/hudi-test-topic/versions/latest"
-    }
+	}
 }
--- a/hudi-kafka-connect/demo/setupKafka.sh
+++ b/hudi-kafka-connect/demo/setupKafka.sh
@@ -16,38 +16,33 @@

 #!/bin/bash

-## Directories
-HOME_DIR=~
-HUDI_DIR=${HOME_DIR}/hudi
-KAFKA_HOME=${HOME_DIR}/kafka
-
 #########################
 # The command line help #
 #########################
 usage() {
-    echo "Usage: $0"
-    echo "   -n |--num-kafka-records, (required) number of kafka records to generate"
-    echo "   -f |--raw-file, (optional) raw file for the kafka records"
-    echo "   -k |--kafka-topic, (optional) Topic name for Kafka"
-    echo "   -m |--num-kafka-partitions, (optional) number of kafka partitions"
-    echo "   -r |--record-key, (optional) field to use as record key"
-    echo "   -l |--num-hudi-partitions, (optional) number of hudi partitions"
-    echo "   -p |--partition-key, (optional) field to use as partition"
-    echo "   -s |--schema-file, (optional) path of the file containing the schema of the records"
-    exit 1
+  echo "Usage: $0"
+  echo "   -n |--num-kafka-records, (required) number of kafka records to generate"
+  echo "   -f |--raw-file, (optional) raw file for the kafka records"
+  echo "   -k |--kafka-topic, (optional) Topic name for Kafka"
+  echo "   -m |--num-kafka-partitions, (optional) number of kafka partitions"
+  echo "   -r |--record-key, (optional) field to use as record key"
+  echo "   -l |--num-hudi-partitions, (optional) number of hudi partitions"
+  echo "   -p |--partition-key, (optional) field to use as partition"
+  echo "   -s |--schema-file, (optional) path of the file containing the schema of the records"
+  exit 1
 }

 case "$1" in
-   --help)
-       usage
-       exit 0
-       ;;
+--help)
+  usage
+  exit 0
+  ;;
 esac

 if [ $# -lt 1 ]; then
-    echo "Illegal number of parameters"
-    usage
-    exit 0
+  echo "Illegal number of parameters"
+  usage
+  exit 0
 fi

 ## defaults
@@ -61,71 +56,91 @@ schemaFile=${HUDI_DIR}/docker/demo/config/schema.avsc

 while getopts ":n:f:k:m:r:l:p:s:-:" opt; do
 case $opt in
-    n) num_records="$OPTARG"
+  n)
+    num_records="$OPTARG"
    printf "Argument num-kafka-records is %s\n" "$num_records"
    ;;
-    k) rawDataFile="$OPTARG"
+  f)
+    rawDataFile="$OPTARG"
    printf "Argument raw-file is %s\n" "$rawDataFile"
    ;;
-    f) kafkaTopicName="$OPTARG"
+  k)
+    kafkaTopicName="$OPTARG"
    printf "Argument kafka-topic is %s\n" "$kafkaTopicName"
    ;;
-    m) numKafkaPartitions="$OPTARG"
+  m)
+    numKafkaPartitions="$OPTARG"
    printf "Argument num-kafka-partitions is %s\n" "$numKafkaPartitions"
    ;;
-    r) recordKey="$OPTARG"
+  r)
+    recordKey="$OPTARG"
    printf "Argument record-key is %s\n" "$recordKey"
    ;;
-    l) numHudiPartitions="$OPTARG"
+  l)
+    numHudiPartitions="$OPTARG"
    printf "Argument num-hudi-partitions is %s\n" "$numHudiPartitions"
    ;;
-    p) partitionField="$OPTARG"
+  p)
+    partitionField="$OPTARG"
    printf "Argument partition-key is %s\n" "$partitionField"
    ;;
-    p) schemaFile="$OPTARG"
+  s)
+    schemaFile="$OPTARG"
    printf "Argument schema-file is %s\n" "$schemaFile"
    ;;
-    -) echo "Invalid option -$OPTARG" >&2
+  -)
+    echo "Invalid option -$OPTARG" >&2
    ;;
-esac
+  esac
 done

 # First delete the existing topic
-$KAFKA_HOME/bin/kafka-topics.sh --delete --topic ${kafkaTopicName} --bootstrap-server localhost:9092
+#${KAFKA_HOME}/bin/kafka-topics.sh --delete --topic ${kafkaTopicName} --bootstrap-server localhost:9092

 # Create the topic with 4 partitions
-$KAFKA_HOME/bin/kafka-topics.sh --create --topic ${kafkaTopicName} --partitions $numKafkaPartitions --replication-factor 1 --bootstrap-server localhost:9092
-
+#${KAFKA_HOME}/bin/kafka-topics.sh --create --topic ${kafkaTopicName} --partitions $numKafkaPartitions --replication-factor 1 --bootstrap-server localhost:9092

 # Setup the schema registry
-export SCHEMA=`sed 's|/\*|\n&|g;s|*/|&\n|g' ${schemaFile} | sed '/\/\*/,/*\//d' | jq tostring`
+export SCHEMA=$(sed 's|/\*|\n&|g;s|*/|&\n|g' ${schemaFile} | sed '/\/\*/,/*\//d' | jq tostring)
 curl -X POST -H "Content-Type: application/vnd.schemaregistry.v1+json" --data "{\"schema\": $SCHEMA}" http://localhost:8081/subjects/${kafkaTopicName}/versions
 curl -X GET http://localhost:8081/subjects/${kafkaTopicName}/versions/latest

-
 # Generate kafka messages from raw records
 # Each record gets a unique key, and an equal number of messages is generated for each hudi partition
 partitions={}
-for ((i=0; i<${numHudiPartitions}; i++))
-do
-    partitions[$i]="partition-"$i;
+for ((i = 0; i < ${numHudiPartitions}; i++)); do
+  partitions[$i]="partition-"$i
 done

-for ((recordValue=0; recordValue<=${num_records}; ))
-do 
-    while IFS= read line 
-    do
-        for partitionValue in "${partitions[@]}"
-        do
-            echo $line | jq --arg recordKey $recordKey --arg recordValue $recordValue --arg partitionField $partitionField --arg partitionValue $partitionValue -c '.[$recordKey] = $recordValue | .[$partitionField] = $partitionValue' | kafkacat -P -b localhost:9092 -t hudi-test-topic;
-            ((recordValue++));
-            if [ $recordValue -gt ${num_records} ]; then
-                exit 0
-            fi
-        done
-        
-        if [ $(( $recordValue % 1000 )) -eq 0 ]
-            then sleep 1
-        fi
-    done < "$rawDataFile"
-done 
+# Stage the generated events in a scratch file, then publish them in one batch.
+events_file=/tmp/kcat-input.events
+rm -f "${events_file}"
+# Without any input lines the endless loop below would never advance recordValue.
+[ -s "$rawDataFile" ] || { echo "Raw file $rawDataFile is missing or empty" >&2; exit 1; }
+
+recordValue=0
+num_records=$((num_records + 0)) # normalize to an integer (unset evaluates to 0)
+
+# Re-read the raw file as often as needed until recordValue exceeds num_records.
+for (( ; ; )); do
+  # -r keeps backslash escapes in the raw JSON lines intact
+  while IFS= read -r line; do
+    for partitionValue in "${partitions[@]}"; do
+      # Set the record-key and partition fields on the raw JSON record
+      printf '%s\n' "$line" | jq --arg recordKey "$recordKey" --arg recordValue "$recordValue" --arg partitionField "$partitionField" --arg partitionValue "$partitionValue" -c '.[$recordKey] = $recordValue | .[$partitionField] = $partitionValue' >>"${events_file}"
+      ((recordValue = recordValue + 1))
+
+      if [ "$recordValue" -gt "$num_records" ]; then
+        break 3 # leave all three loops at once
+      fi
+    done
+
+    # Pause for a second whenever the running count lands on a multiple of 1000
+    if [ $((recordValue % 1000)) -eq 0 ]; then
+      sleep 1
+    fi
+  done <"$rawDataFile"
+done
+
+# Drop blank lines and publish to the configured topic (kafkaTopicName)
+grep -v '^$' "${events_file}" | kcat -P -b localhost:9092 -t "${kafkaTopicName}"