
[HUDI-2332] Add clustering and compaction in Kafka Connect Sink (#3857)

* [HUDI-2332] Add clustering and compaction in Kafka Connect Sink

* Disable validation check on instant time for compaction and adjust configs

* Add javadocs

* Add clustering and compaction config

* Fix transaction causing missing records in the target table

* Add debugging logs

* Fix kafka offset sync in participant

* Adjust how clustering and compaction are configured in kafka-connect

* Fix clustering strategy

* Remove irrelevant changes from other published PRs

* Update clustering logic and others

* Update README

* Fix test failures

* Fix indentation

* Fix clustering config

* Add JavaCustomColumnsSortPartitioner and make async compaction enabled by default

* Add test for JavaCustomColumnsSortPartitioner

* Add more changes after IDE sync

* Update README with clarification

* Fix clustering logic after rebasing

* Remove unrelated changes
Y Ethan Guo authored on 2021-11-23 00:53:28 -08:00, committed by GitHub
parent 9ed28b1570
commit ca9bfa2a40
27 changed files with 1358 additions and 93 deletions
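At a high level, the change makes the sink's transaction coordinator schedule async clustering and compaction at each commit boundary, gated by configuration. As a rough illustration (not taken from this commit), the subset of sink properties that drives the new behavior might look like the sketch below; hoodie.kafka.compaction.async.enable is introduced in this diff, while the table-type and clustering keys are standard Hudi write configs and should be treated as assumptions here.

import java.util.HashMap;
import java.util.Map;

public class HudiSinkCompactionClusteringProps {
  // Illustrative only: builds the subset of sink properties relevant to this change.
  public static Map<String, String> build() {
    Map<String, String> props = new HashMap<>();
    // New in this commit; defaults to true, shown explicitly for clarity.
    props.put("hoodie.kafka.compaction.async.enable", "true");
    // Standard Hudi configs (assumed keys, not defined by this commit):
    // compaction only applies to MERGE_ON_READ tables, and clustering is
    // gated by the write config consulted via isAsyncClusteringEnabled().
    props.put("hoodie.table.type", "MERGE_ON_READ");
    props.put("hoodie.clustering.async.enabled", "true");
    return props;
  }
}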

KafkaConnectConfigs.java

@@ -73,6 +73,11 @@ public class KafkaConnectConfigs extends HoodieConfig {
+ "the coordinator will wait for the write statuses from all the partitions"
+ "to ignore the current commit and start a new commit.");
public static final ConfigProperty<String> ASYNC_COMPACT_ENABLE = ConfigProperty
.key("hoodie.kafka.compaction.async.enable")
.defaultValue("true")
.withDocumentation("Controls whether async compaction should be turned on for MOR table writing.");
public static final ConfigProperty<String> META_SYNC_ENABLE = ConfigProperty
.key("hoodie.meta.sync.enable")
.defaultValue("false")
@@ -121,6 +126,10 @@ public class KafkaConnectConfigs extends HoodieConfig {
return getString(KAFKA_VALUE_CONVERTER);
}
public Boolean isAsyncCompactEnabled() {
return getBoolean(ASYNC_COMPACT_ENABLE);
}
public Boolean isMetaSyncEnabled() {
return getBoolean(META_SYNC_ENABLE);
}
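The new accessor follows the existing ConfigProperty/getBoolean pattern, so a connector can opt out of compaction scheduling per deployment. A minimal usage sketch, assuming the newBuilder()/withProperties() builder methods and package that KafkaConnectConfigs already exposes elsewhere in the module:

import java.util.Properties;
import org.apache.hudi.connect.writers.KafkaConnectConfigs;

public class AsyncCompactFlagCheck {
  public static void main(String[] args) {
    Properties props = new Properties();
    // Override the new default of "true" to opt out of MOR compaction scheduling.
    props.put("hoodie.kafka.compaction.async.enable", "false");
    // Builder method names (newBuilder/withProperties) are assumed from the
    // existing KafkaConnectConfigs builder; treat them as illustrative.
    KafkaConnectConfigs configs = KafkaConnectConfigs.newBuilder()
        .withProperties(props)
        .build();
    System.out.println("Async compaction enabled: " + configs.isAsyncCompactEnabled());
  }
}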

KafkaConnectTransactionServices.java

@@ -22,6 +22,7 @@ import org.apache.hudi.client.HoodieJavaWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.client.common.HoodieJavaEngineContext;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.engine.EngineType;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.HoodieAvroPayload;
import org.apache.hudi.common.model.HoodieCommitMetadata;
@@ -54,6 +55,8 @@ public class KafkaConnectTransactionServices implements ConnectTransactionServic
private final Option<HoodieTableMetaClient> tableMetaClient;
private final Configuration hadoopConf;
private final HoodieWriteConfig writeConfig;
private final KafkaConnectConfigs connectConfigs;
private final String tableBasePath;
private final String tableName;
private final HoodieEngineContext context;
@@ -61,8 +64,11 @@ public class KafkaConnectTransactionServices implements ConnectTransactionServic
private final HoodieJavaWriteClient<HoodieAvroPayload> javaClient;
public KafkaConnectTransactionServices(KafkaConnectConfigs connectConfigs) throws HoodieException {
- HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder()
- .withProperties(connectConfigs.getProps()).build();
+ this.connectConfigs = connectConfigs;
+ this.writeConfig = HoodieWriteConfig.newBuilder()
+ .withEngineType(EngineType.JAVA)
+ .withProperties(connectConfigs.getProps())
+ .build();
tableBasePath = writeConfig.getBasePath();
tableName = writeConfig.getTableName();
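Passing EngineType.JAVA when building the write config matters because engine-aware defaults (for example, which clustering or compaction strategy implementation is picked when none is configured) should resolve to Java rather than Spark classes; the exact defaults are not shown in this diff and are mentioned only as context. A standalone sketch of the same construction, with hypothetical base path and table name:

import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.engine.EngineType;
import org.apache.hudi.config.HoodieWriteConfig;

public class JavaEngineWriteConfigSketch {
  public static HoodieWriteConfig build() {
    TypedProperties props = new TypedProperties();
    // Hypothetical values; in the sink these come from the connector properties.
    props.put("hoodie.base.path", "file:///tmp/hudi-kafka-sink");
    props.put("hoodie.table.name", "hudi_kafka_sink_table");
    return HoodieWriteConfig.newBuilder()
        .withEngineType(EngineType.JAVA) // same call the constructor above now makes
        .withProperties(props)
        .build();
  }
}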
@@ -95,6 +101,7 @@ public class KafkaConnectTransactionServices implements ConnectTransactionServic
}
}
@Override
public String startCommit() {
String newCommitTime = javaClient.startCommit();
javaClient.transitionInflight(newCommitTime);
@@ -102,11 +109,23 @@ public class KafkaConnectTransactionServices implements ConnectTransactionServic
return newCommitTime;
}
@Override
public void endCommit(String commitTime, List<WriteStatus> writeStatuses, Map<String, String> extraMetadata) {
javaClient.commit(commitTime, writeStatuses, Option.of(extraMetadata));
LOG.info("Ending Hudi commit " + commitTime);
// Schedule clustering and compaction as needed.
if (writeConfig.isAsyncClusteringEnabled()) {
javaClient.scheduleClustering(Option.empty()).ifPresent(
instantTs -> LOG.info("Scheduled clustering at instant time:" + instantTs));
}
if (isAsyncCompactionEnabled()) {
javaClient.scheduleCompaction(Option.empty()).ifPresent(
instantTs -> LOG.info("Scheduled compaction at instant time:" + instantTs));
}
}
@Override
public Map<String, String> fetchLatestExtraCommitMetadata() {
if (tableMetaClient.isPresent()) {
Option<HoodieCommitMetadata> metadata = KafkaConnectUtils.getCommitMetadataForLatestInstant(tableMetaClient.get());
@@ -119,4 +138,10 @@ public class KafkaConnectTransactionServices implements ConnectTransactionServic
}
throw new HoodieException("Fatal error retrieving Hoodie Extra Metadata since Table Meta Client is absent");
}
private boolean isAsyncCompactionEnabled() {
return tableMetaClient.isPresent()
&& HoodieTableType.MERGE_ON_READ.equals(tableMetaClient.get().getTableType())
&& connectConfigs.isAsyncCompactEnabled();
}
}
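Note that endCommit() only schedules the table services; executing the scheduled clustering and compaction plans happens outside this class. For orientation, a rough sketch of the coordinator-side calling sequence around this class follows; the extra-metadata key and the way write statuses are gathered from participants are illustrative assumptions, not code from this commit:

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.hudi.client.WriteStatus;

// Assumed to live alongside KafkaConnectTransactionServices, so no further Hudi imports are shown.
public class CoordinatorCommitSketch {
  static void commitOnce(KafkaConnectTransactionServices services, List<WriteStatus> statusesFromParticipants) {
    // 1. Open a new Hudi commit; its instant time is shared with the partition participants.
    String commitTime = services.startCommit();
    // 2. Persist the consumed Kafka offsets with the commit; the key is a hypothetical placeholder.
    Map<String, String> extraMetadata = new HashMap<>();
    extraMetadata.put("kafka.consumer.offsets", "0:1234,1:5678");
    // 3. Complete the commit; with this change, async clustering (and compaction for MOR
    //    tables, when hoodie.kafka.compaction.async.enable is true) is scheduled here too.
    services.endCommit(commitTime, statusesFromParticipants, extraMetadata);
  }
}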