[HUDI-2330][HUDI-2335] Adding support for merge-on-read tables (#3679)
- Inserts go into logs, hashed by Kafka and Hudi partitions - Fixed issues with the setupKafka script - Bumped up the default commit interval to 300 seconds - Minor renaming
This commit is contained in:
@@ -70,9 +70,9 @@ Wait until the kafka cluster is up and running.
|
||||
|
||||
### 2 - Set up the schema registry
|
||||
|
||||
Hudi leverages schema registry to obtain the latest schema when writing records. While it supports most popular schema registries,
|
||||
we use Confluent schema registry. Download the latest confluent schema registry code from https://github.com/confluentinc/schema-registry
|
||||
and start the schema registry service.
|
||||
Hudi leverages a schema registry to obtain the latest schema when writing records. While it supports most popular schema
|
||||
registries, we use the Confluent Schema Registry. Download the latest Confluent Platform and run the schema registry
|
||||
service.
|
||||
|
||||
```bash
|
||||
cd $CONFLUENT_DIR
|
||||
@@ -120,7 +120,7 @@ that can be changed based on the desired properties.
|
||||
|
||||
```bash
|
||||
curl -X DELETE http://localhost:8083/connectors/hudi-sink
|
||||
curl -X POST -H "Content-Type:application/json" -d @$HUDI-DIR/hudi-kafka-connect/demo/config-sink.json http://localhost:8083/connectors
|
||||
curl -X POST -H "Content-Type:application/json" -d @${HUDI_DIR}/hudi-kafka-connect/demo/config-sink.json http://localhost:8083/connectors
|
||||
```
|
||||
|
||||
Now, you should see that the connector is created and tasks are running.
|
||||
|
||||
@@ -9,10 +9,11 @@
|
||||
"value.converter.schemas.enable": "false",
|
||||
"topics": "hudi-test-topic",
|
||||
"hoodie.table.name": "hudi-test-topic",
|
||||
"hoodie.table.type": "MERGE_ON_READ",
|
||||
"hoodie.base.path": "file:///tmp/hoodie/hudi-test-topic",
|
||||
"hoodie.datasource.write.recordkey.field": "volume",
|
||||
"hoodie.datasource.write.partitionpath.field": "date",
|
||||
"hoodie.schemaprovider.class": "org.apache.hudi.schema.SchemaRegistryProvider",
|
||||
"hoodie.deltastreamer.schemaprovider.registry.url": "http://localhost:8081/subjects/hudi-test-topic/versions/latest"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
131
hudi-kafka-connect/demo/setupKafka.sh
Normal file → Executable file
131
hudi-kafka-connect/demo/setupKafka.sh
Normal file → Executable file
@@ -16,38 +16,33 @@
|
||||
|
||||
#!/bin/bash
|
||||
|
||||
## Directories
|
||||
HOME_DIR=~
|
||||
HUDI_DIR=${HOME_DIR}/hudi
|
||||
KAFKA_HOME=${HOME_DIR}/kafka
|
||||
|
||||
#########################
|
||||
# The command line help #
|
||||
#########################
|
||||
usage() {
|
||||
echo "Usage: $0"
|
||||
echo " -n |--num-kafka-records, (required) number of kafka records to generate"
|
||||
echo " -f |--raw-file, (optional) raw file for the kafka records"
|
||||
echo " -k |--kafka-topic, (optional) Topic name for Kafka"
|
||||
echo " -m |--num-kafka-partitions, (optional) number of kafka partitions"
|
||||
echo " -r |--record-key, (optional) field to use as record key"
|
||||
echo " -l |--num-hudi-partitions, (optional) number of hudi partitions"
|
||||
echo " -p |--partition-key, (optional) field to use as partition"
|
||||
echo " -s |--schema-file, (optional) path of the file containing the schema of the records"
|
||||
exit 1
|
||||
echo "Usage: $0"
|
||||
echo " -n |--num-kafka-records, (required) number of kafka records to generate"
|
||||
echo " -f |--raw-file, (optional) raw file for the kafka records"
|
||||
echo " -k |--kafka-topic, (optional) Topic name for Kafka"
|
||||
echo " -m |--num-kafka-partitions, (optional) number of kafka partitions"
|
||||
echo " -r |--record-key, (optional) field to use as record key"
|
||||
echo " -l |--num-hudi-partitions, (optional) number of hudi partitions"
|
||||
echo " -p |--partition-key, (optional) field to use as partition"
|
||||
echo " -s |--schema-file, (optional) path of the file containing the schema of the records"
|
||||
exit 1
|
||||
}
|
||||
|
||||
case "$1" in
|
||||
--help)
|
||||
usage
|
||||
exit 0
|
||||
;;
|
||||
--help)
|
||||
usage
|
||||
exit 0
|
||||
;;
|
||||
esac
|
||||
|
||||
if [ $# -lt 1 ]; then
|
||||
echo "Illegal number of parameters"
|
||||
usage
|
||||
exit 0
|
||||
echo "Illegal number of parameters"
|
||||
usage
|
||||
exit 0
|
||||
fi
|
||||
|
||||
## defaults
|
||||
@@ -61,71 +56,91 @@ schemaFile=${HUDI_DIR}/docker/demo/config/schema.avsc
|
||||
|
||||
while getopts ":n:f:k:m:r:l:p:s:-:" opt; do
|
||||
case $opt in
|
||||
n) num_records="$OPTARG"
|
||||
n)
|
||||
num_records="$OPTARG"
|
||||
printf "Argument num-kafka-records is %s\n" "$num_records"
|
||||
;;
|
||||
k) rawDataFile="$OPTARG"
|
||||
k)
|
||||
rawDataFile="$OPTARG"
|
||||
printf "Argument raw-file is %s\n" "$rawDataFile"
|
||||
;;
|
||||
f) kafkaTopicName="$OPTARG"
|
||||
f)
|
||||
kafkaTopicName="$OPTARG"
|
||||
printf "Argument kafka-topic is %s\n" "$kafkaTopicName"
|
||||
;;
|
||||
m) numKafkaPartitions="$OPTARG"
|
||||
m)
|
||||
numKafkaPartitions="$OPTARG"
|
||||
printf "Argument num-kafka-partitions is %s\n" "$numKafkaPartitions"
|
||||
;;
|
||||
r) recordKey="$OPTARG"
|
||||
r)
|
||||
recordKey="$OPTARG"
|
||||
printf "Argument record-key is %s\n" "$recordKey"
|
||||
;;
|
||||
l) numHudiPartitions="$OPTARG"
|
||||
l)
|
||||
numHudiPartitions="$OPTARG"
|
||||
printf "Argument num-hudi-partitions is %s\n" "$numHudiPartitions"
|
||||
;;
|
||||
p) partitionField="$OPTARG"
|
||||
p)
|
||||
partitionField="$OPTARG"
|
||||
printf "Argument partition-key is %s\n" "$partitionField"
|
||||
;;
|
||||
p) schemaFile="$OPTARG"
|
||||
p)
|
||||
schemaFile="$OPTARG"
|
||||
printf "Argument schema-file is %s\n" "$schemaFile"
|
||||
;;
|
||||
-) echo "Invalid option -$OPTARG" >&2
|
||||
-)
|
||||
echo "Invalid option -$OPTARG" >&2
|
||||
;;
|
||||
esac
|
||||
esac
|
||||
done
|
||||
|
||||
# First delete the existing topic
|
||||
$KAFKA_HOME/bin/kafka-topics.sh --delete --topic ${kafkaTopicName} --bootstrap-server localhost:9092
|
||||
#${KAFKA_HOME}/bin/kafka-topics.sh --delete --topic ${kafkaTopicName} --bootstrap-server localhost:9092
|
||||
|
||||
# Create the topic with the requested number of partitions (numKafkaPartitions)
|
||||
$KAFKA_HOME/bin/kafka-topics.sh --create --topic ${kafkaTopicName} --partitions $numKafkaPartitions --replication-factor 1 --bootstrap-server localhost:9092
|
||||
|
||||
#${KAFKA_HOME}/bin/kafka-topics.sh --create --topic ${kafkaTopicName} --partitions $numKafkaPartitions --replication-factor 1 --bootstrap-server localhost:9092
|
||||
|
||||
# Setup the schema registry
|
||||
export SCHEMA=`sed 's|/\*|\n&|g;s|*/|&\n|g' ${schemaFile} | sed '/\/\*/,/*\//d' | jq tostring`
|
||||
export SCHEMA=$(sed 's|/\*|\n&|g;s|*/|&\n|g' ${schemaFile} | sed '/\/\*/,/*\//d' | jq tostring)
|
||||
curl -X POST -H "Content-Type: application/vnd.schemaregistry.v1+json" --data "{\"schema\": $SCHEMA}" http://localhost:8081/subjects/${kafkaTopicName}/versions
|
||||
curl -X GET http://localhost:8081/subjects/${kafkaTopicName}/versions/latest
|
||||
|
||||
|
||||
# Generate kafka messages from raw records
|
||||
# Each record gets a unique key, and messages are generated evenly across the hudi partitions
|
||||
partitions={}
|
||||
for ((i=0; i<${numHudiPartitions}; i++))
|
||||
do
|
||||
partitions[$i]="partition-"$i;
|
||||
for ((i = 0; i < ${numHudiPartitions}; i++)); do
|
||||
partitions[$i]="partition-"$i
|
||||
done
|
||||
|
||||
for ((recordValue=0; recordValue<=${num_records}; ))
|
||||
do
|
||||
while IFS= read line
|
||||
do
|
||||
for partitionValue in "${partitions[@]}"
|
||||
do
|
||||
echo $line | jq --arg recordKey $recordKey --arg recordValue $recordValue --arg partitionField $partitionField --arg partitionValue $partitionValue -c '.[$recordKey] = $recordValue | .[$partitionField] = $partitionValue' | kafkacat -P -b localhost:9092 -t hudi-test-topic;
|
||||
((recordValue++));
|
||||
if [ $recordValue -gt ${num_records} ]; then
|
||||
exit 0
|
||||
fi
|
||||
done
|
||||
|
||||
if [ $(( $recordValue % 1000 )) -eq 0 ]
|
||||
then sleep 1
|
||||
fi
|
||||
done < "$rawDataFile"
|
||||
done
|
||||
events_file=/tmp/kcat-input.events
|
||||
rm -f ${events_file}
|
||||
|
||||
recordValue=0
|
||||
num_records=$((num_records + 0))
|
||||
|
||||
for (( ; ; )); do
|
||||
while IFS= read line; do
|
||||
for partitionValue in "${partitions[@]}"; do
|
||||
echo $line | jq --arg recordKey $recordKey --arg recordValue $recordValue --arg partitionField $partitionField --arg partitionValue $partitionValue -c '.[$recordKey] = $recordValue | .[$partitionField] = $partitionValue' >>${events_file}
|
||||
((recordValue = recordValue + 1))
|
||||
|
||||
if [ $recordValue -gt $num_records ]; then
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
if [ $recordValue -gt $num_records ]; then
|
||||
break
|
||||
fi
|
||||
|
||||
if [ $(($recordValue % 1000)) -eq 0 ]; then
|
||||
sleep 1
|
||||
fi
|
||||
done <"$rawDataFile"
|
||||
|
||||
if [ $recordValue -gt $num_records ]; then
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
grep -v '^$' ${events_file} | kcat -P -b localhost:9092 -t hudi-test-topic
|
||||
|
||||
@@ -18,17 +18,13 @@
|
||||
|
||||
package org.apache.hudi.connect;
|
||||
|
||||
import org.apache.hudi.common.util.StringUtils;
|
||||
import org.apache.hudi.connect.utils.KafkaConnectUtils;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.table.FileIdPrefixProvider;
|
||||
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.security.MessageDigest;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.util.Objects;
|
||||
import java.util.Properties;
|
||||
|
||||
public class KafkaConnectFileIdPrefixProvider extends FileIdPrefixProvider {
|
||||
@@ -52,18 +48,9 @@ public class KafkaConnectFileIdPrefixProvider extends FileIdPrefixProvider {
|
||||
// We use a combination of kafka partition and partition path as the file id, and then hash it
|
||||
// to generate a fixed sized hash.
|
||||
String rawFileIdPrefix = kafkaPartition + partitionPath;
|
||||
MessageDigest md;
|
||||
try {
|
||||
md = MessageDigest.getInstance("MD5");
|
||||
} catch (NoSuchAlgorithmException e) {
|
||||
LOG.error("Fatal error selecting hash algorithm", e);
|
||||
throw new HoodieException(e);
|
||||
}
|
||||
|
||||
byte[] digest = Objects.requireNonNull(md).digest(rawFileIdPrefix.getBytes(StandardCharsets.UTF_8));
|
||||
|
||||
String hashedPrefix = KafkaConnectUtils.hashDigest(rawFileIdPrefix);
|
||||
LOG.info("CreateFileId for Kafka Partition " + kafkaPartition + " : " + partitionPath + " = " + rawFileIdPrefix
|
||||
+ " === " + StringUtils.toHexString(digest).toUpperCase());
|
||||
return StringUtils.toHexString(digest).toUpperCase();
|
||||
+ " === " + hashedPrefix);
|
||||
return hashedPrefix;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -26,6 +26,7 @@ import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.StringUtils;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.keygen.BaseKeyGenerator;
|
||||
import org.apache.hudi.keygen.CustomAvroKeyGenerator;
|
||||
@@ -41,8 +42,12 @@ import org.apache.kafka.common.KafkaFuture;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.security.MessageDigest;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.Properties;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@@ -137,4 +142,16 @@ public class KafkaConnectUtils {
|
||||
return Option.empty();
|
||||
}
|
||||
}
|
||||
|
||||
public static String hashDigest(String stringToHash) {
|
||||
MessageDigest md;
|
||||
try {
|
||||
md = MessageDigest.getInstance("MD5");
|
||||
} catch (NoSuchAlgorithmException e) {
|
||||
LOG.error("Fatal error selecting hash algorithm", e);
|
||||
throw new HoodieException(e);
|
||||
}
|
||||
byte[] digest = Objects.requireNonNull(md).digest(stringToHash.getBytes(StandardCharsets.UTF_8));
|
||||
return StringUtils.toHexString(digest).toUpperCase();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -21,7 +21,9 @@ package org.apache.hudi.connect.writers;
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.common.model.HoodieAvroPayload;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordLocation;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.connect.utils.KafkaConnectUtils;
|
||||
import org.apache.hudi.keygen.KeyGenerator;
|
||||
import org.apache.hudi.schema.SchemaProvider;
|
||||
import org.apache.hudi.utilities.sources.helpers.AvroConvertor;
|
||||
@@ -46,17 +48,19 @@ public abstract class AbstractConnectWriter implements ConnectWriter<WriteStatus
|
||||
public static final String KAFKA_JSON_CONVERTER = "org.apache.kafka.connect.json.JsonConverter";
|
||||
public static final String KAFKA_STRING_CONVERTER = "org.apache.kafka.connect.storage.StringConverter";
|
||||
private static final Logger LOG = LogManager.getLogger(AbstractConnectWriter.class);
|
||||
protected final String instantTime;
|
||||
|
||||
private final KafkaConnectConfigs connectConfigs;
|
||||
private final KeyGenerator keyGenerator;
|
||||
private final SchemaProvider schemaProvider;
|
||||
protected final KafkaConnectConfigs connectConfigs;
|
||||
|
||||
public AbstractConnectWriter(KafkaConnectConfigs connectConfigs,
|
||||
KeyGenerator keyGenerator,
|
||||
SchemaProvider schemaProvider) {
|
||||
SchemaProvider schemaProvider, String instantTime) {
|
||||
this.connectConfigs = connectConfigs;
|
||||
this.keyGenerator = keyGenerator;
|
||||
this.schemaProvider = schemaProvider;
|
||||
this.instantTime = instantTime;
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -76,16 +80,22 @@ public abstract class AbstractConnectWriter implements ConnectWriter<WriteStatus
|
||||
throw new IOException("Unsupported Kafka Format type (" + connectConfigs.getKafkaValueConverter() + ")");
|
||||
}
|
||||
|
||||
HoodieRecord hoodieRecord = new HoodieRecord<>(keyGenerator.getKey(avroRecord.get()), new HoodieAvroPayload(avroRecord));
|
||||
// Tag records with a file ID based on kafka partition and hudi partition.
|
||||
HoodieRecord<?> hoodieRecord = new HoodieRecord<>(keyGenerator.getKey(avroRecord.get()), new HoodieAvroPayload(avroRecord));
|
||||
String fileId = KafkaConnectUtils.hashDigest(String.format("%s-%s", record.kafkaPartition(), hoodieRecord.getPartitionPath()));
|
||||
hoodieRecord.unseal();
|
||||
hoodieRecord.setCurrentLocation(new HoodieRecordLocation(instantTime, fileId));
|
||||
hoodieRecord.setNewLocation(new HoodieRecordLocation(instantTime, fileId));
|
||||
hoodieRecord.seal();
|
||||
writeHudiRecord(hoodieRecord);
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<WriteStatus> close() throws IOException {
|
||||
return flushHudiRecords();
|
||||
return flushRecords();
|
||||
}
|
||||
|
||||
protected abstract void writeHudiRecord(HoodieRecord<HoodieAvroPayload> record);
|
||||
protected abstract void writeHudiRecord(HoodieRecord<?> record);
|
||||
|
||||
protected abstract List<WriteStatus> flushHudiRecords() throws IOException;
|
||||
protected abstract List<WriteStatus> flushRecords() throws IOException;
|
||||
}
|
||||
|
||||
@@ -21,8 +21,9 @@ package org.apache.hudi.connect.writers;
|
||||
import org.apache.hudi.client.HoodieJavaWriteClient;
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.common.engine.HoodieEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieAvroPayload;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieTableType;
|
||||
import org.apache.hudi.common.table.HoodieTableConfig;
|
||||
import org.apache.hudi.common.util.DefaultSizeEstimator;
|
||||
import org.apache.hudi.common.util.HoodieRecordSizeEstimator;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
@@ -39,8 +40,8 @@ import org.apache.log4j.Logger;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* Specific implementation of a Hudi Writer that buffers all incoming records,
|
||||
@@ -52,9 +53,8 @@ public class BufferedConnectWriter extends AbstractConnectWriter {
|
||||
|
||||
private final HoodieEngineContext context;
|
||||
private final HoodieJavaWriteClient writeClient;
|
||||
private final String instantTime;
|
||||
private final HoodieWriteConfig config;
|
||||
private ExternalSpillableMap<String, HoodieRecord<HoodieAvroPayload>> bufferedRecords;
|
||||
private ExternalSpillableMap<String, HoodieRecord<?>> bufferedRecords;
|
||||
|
||||
public BufferedConnectWriter(HoodieEngineContext context,
|
||||
HoodieJavaWriteClient writeClient,
|
||||
@@ -63,10 +63,9 @@ public class BufferedConnectWriter extends AbstractConnectWriter {
|
||||
HoodieWriteConfig config,
|
||||
KeyGenerator keyGenerator,
|
||||
SchemaProvider schemaProvider) {
|
||||
super(connectConfigs, keyGenerator, schemaProvider);
|
||||
super(connectConfigs, keyGenerator, schemaProvider, instantTime);
|
||||
this.context = context;
|
||||
this.writeClient = writeClient;
|
||||
this.instantTime = instantTime;
|
||||
this.config = config;
|
||||
init();
|
||||
}
|
||||
@@ -88,12 +87,12 @@ public class BufferedConnectWriter extends AbstractConnectWriter {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeHudiRecord(HoodieRecord<HoodieAvroPayload> record) {
|
||||
public void writeHudiRecord(HoodieRecord<?> record) {
|
||||
bufferedRecords.put(record.getRecordKey(), record);
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<WriteStatus> flushHudiRecords() throws IOException {
|
||||
public List<WriteStatus> flushRecords() throws IOException {
|
||||
try {
|
||||
LOG.info("Number of entries in MemoryBasedMap => "
|
||||
+ bufferedRecords.getInMemoryMapNumEntries()
|
||||
@@ -102,15 +101,25 @@ public class BufferedConnectWriter extends AbstractConnectWriter {
|
||||
+ bufferedRecords.getDiskBasedMapNumEntries() + "Size of file spilled to disk => "
|
||||
+ bufferedRecords.getSizeOfFileOnDiskInBytes());
|
||||
List<WriteStatus> writeStatuses = new ArrayList<>();
|
||||
|
||||
boolean isMorTable = Option.ofNullable(connectConfigs.getString(HoodieTableConfig.TYPE))
|
||||
.map(t -> t.equals(HoodieTableType.MERGE_ON_READ.name()))
|
||||
.orElse(false);
|
||||
|
||||
// Write out all records if non-empty
|
||||
if (!bufferedRecords.isEmpty()) {
|
||||
writeStatuses = writeClient.bulkInsertPreppedRecords(
|
||||
bufferedRecords.values().stream().collect(Collectors.toList()),
|
||||
instantTime, Option.empty());
|
||||
if (isMorTable) {
|
||||
writeStatuses = writeClient.upsertPreppedRecords(
|
||||
new LinkedList<>(bufferedRecords.values()),
|
||||
instantTime);
|
||||
} else {
|
||||
writeStatuses = writeClient.bulkInsertPreppedRecords(
|
||||
new LinkedList<>(bufferedRecords.values()),
|
||||
instantTime, Option.empty());
|
||||
}
|
||||
}
|
||||
bufferedRecords.close();
|
||||
LOG.info("Flushed hudi records and got writeStatuses: "
|
||||
+ writeStatuses);
|
||||
LOG.info("Flushed hudi records and got writeStatuses: " + writeStatuses);
|
||||
return writeStatuses;
|
||||
} catch (Exception e) {
|
||||
throw new IOException("Write records failed", e);
|
||||
|
||||
@@ -67,7 +67,7 @@ public class KafkaConnectConfigs extends HoodieConfig {
|
||||
|
||||
public static final ConfigProperty<String> COORDINATOR_WRITE_TIMEOUT_SECS = ConfigProperty
|
||||
.key("hoodie.kafka.coordinator.write.timeout.secs")
|
||||
.defaultValue("60")
|
||||
.defaultValue("300")
|
||||
.withDocumentation("The timeout after sending an END_COMMIT until when "
|
||||
+ "the coordinator will wait for the write statuses from all the partitions"
|
||||
+ "to ignore the current commit and start a new commit.");
|
||||
|
||||
@@ -23,12 +23,10 @@ import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.common.HoodieJavaEngineContext;
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.common.engine.HoodieEngineContext;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.HoodieAvroPayload;
|
||||
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
||||
import org.apache.hudi.common.model.HoodieTableType;
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.connect.transaction.TransactionCoordinator;
|
||||
@@ -38,7 +36,6 @@ import org.apache.hudi.keygen.KeyGenerator;
|
||||
import org.apache.hudi.keygen.factory.HoodieAvroKeyGeneratorFactory;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
@@ -54,19 +51,16 @@ import java.util.Map;
|
||||
public class KafkaConnectTransactionServices implements ConnectTransactionServices {
|
||||
|
||||
private static final Logger LOG = LogManager.getLogger(KafkaConnectTransactionServices.class);
|
||||
private static final String TABLE_FORMAT = "PARQUET";
|
||||
|
||||
private final Option<HoodieTableMetaClient> tableMetaClient;
|
||||
private final Configuration hadoopConf;
|
||||
private final FileSystem fs;
|
||||
private final String tableBasePath;
|
||||
private final String tableName;
|
||||
private final HoodieEngineContext context;
|
||||
|
||||
private final HoodieJavaWriteClient<HoodieAvroPayload> javaClient;
|
||||
|
||||
public KafkaConnectTransactionServices(
|
||||
KafkaConnectConfigs connectConfigs) throws HoodieException {
|
||||
public KafkaConnectTransactionServices(KafkaConnectConfigs connectConfigs) throws HoodieException {
|
||||
HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder()
|
||||
.withProperties(connectConfigs.getProps()).build();
|
||||
|
||||
@@ -74,29 +68,25 @@ public class KafkaConnectTransactionServices implements ConnectTransactionServic
|
||||
tableName = writeConfig.getTableName();
|
||||
hadoopConf = KafkaConnectUtils.getDefaultHadoopConf();
|
||||
context = new HoodieJavaEngineContext(hadoopConf);
|
||||
fs = FSUtils.getFs(tableBasePath, hadoopConf);
|
||||
|
||||
try {
|
||||
KeyGenerator keyGenerator = HoodieAvroKeyGeneratorFactory.createKeyGenerator(
|
||||
new TypedProperties(connectConfigs.getProps()));
|
||||
|
||||
String recordKeyFields = KafkaConnectUtils.getRecordKeyColumns(keyGenerator);
|
||||
String partitionColumns = KafkaConnectUtils.getPartitionColumns(keyGenerator,
|
||||
new TypedProperties(connectConfigs.getProps()));
|
||||
|
||||
LOG.info(String.format("Setting record key %s and partitionfields %s for table %s",
|
||||
recordKeyFields,
|
||||
partitionColumns,
|
||||
tableBasePath + tableName));
|
||||
LOG.info(String.format("Setting record key %s and partition fields %s for table %s",
|
||||
recordKeyFields, partitionColumns, tableBasePath + tableName));
|
||||
|
||||
tableMetaClient = Option.of(HoodieTableMetaClient.withPropertyBuilder()
|
||||
.setTableType(HoodieTableType.COPY_ON_WRITE.name())
|
||||
.setTableName(tableName)
|
||||
.setPayloadClassName(HoodieAvroPayload.class.getName())
|
||||
.setBaseFileFormat(TABLE_FORMAT)
|
||||
.setRecordKeyFields(recordKeyFields)
|
||||
.setPartitionFields(partitionColumns)
|
||||
.setKeyGeneratorClassProp(writeConfig.getKeyGeneratorClass())
|
||||
.fromProperties(connectConfigs.getProps())
|
||||
.initTable(hadoopConf, tableBasePath));
|
||||
|
||||
javaClient = new HoodieJavaWriteClient<>(context, writeConfig);
|
||||
@@ -113,8 +103,7 @@ public class KafkaConnectTransactionServices implements ConnectTransactionServic
|
||||
}
|
||||
|
||||
public void endCommit(String commitTime, List<WriteStatus> writeStatuses, Map<String, String> extraMetadata) {
|
||||
javaClient.commit(commitTime, writeStatuses, Option.of(extraMetadata),
|
||||
HoodieActiveTimeline.COMMIT_ACTION, Collections.emptyMap());
|
||||
javaClient.commit(commitTime, writeStatuses, Option.of(extraMetadata));
|
||||
LOG.info("Ending Hudi commit " + commitTime);
|
||||
}
|
||||
|
||||
|
||||
@@ -148,7 +148,7 @@ public class TestAbstractConnectWriter {
|
||||
private List<HoodieRecord> writtenRecords;
|
||||
|
||||
public AbstractHudiConnectWriterTestWrapper(KafkaConnectConfigs connectConfigs, KeyGenerator keyGenerator, SchemaProvider schemaProvider) {
|
||||
super(connectConfigs, keyGenerator, schemaProvider);
|
||||
super(connectConfigs, keyGenerator, schemaProvider, "000");
|
||||
writtenRecords = new ArrayList<>();
|
||||
}
|
||||
|
||||
@@ -157,12 +157,12 @@ public class TestAbstractConnectWriter {
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void writeHudiRecord(HoodieRecord<HoodieAvroPayload> record) {
|
||||
protected void writeHudiRecord(HoodieRecord<?> record) {
|
||||
writtenRecords.add(record);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<WriteStatus> flushHudiRecords() {
|
||||
protected List<WriteStatus> flushRecords() {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -88,7 +88,7 @@ public class TestBufferedConnectWriter {
|
||||
Mockito.verify(mockHoodieJavaWriteClient, times(0))
|
||||
.bulkInsertPreppedRecords(anyList(), eq(COMMIT_TIME), eq(Option.empty()));
|
||||
|
||||
writer.flushHudiRecords();
|
||||
writer.flushRecords();
|
||||
final ArgumentCaptor<List<HoodieRecord>> actualRecords = ArgumentCaptor.forClass(List.class);
|
||||
Mockito.verify(mockHoodieJavaWriteClient, times(1))
|
||||
.bulkInsertPreppedRecords(actualRecords.capture(), eq(COMMIT_TIME), eq(Option.empty()));
|
||||
|
||||
Reference in New Issue
Block a user