
[HUDI-2332] Add clustering and compaction in Kafka Connect Sink (#3857)

* [HUDI-2332] Add clustering and compaction in Kafka Connect Sink

* Disable validation check on instant time for compaction and adjust configs

* Add javadocs

* Add clustering and compaction config

* Fix transaction causing missing records in the target table

* Add debugging logs

* Fix kafka offset sync in participant

* Adjust how clustering and compaction are configured in kafka-connect

* Fix clustering strategy

* Remove irrelevant changes from other published PRs

* Update clustering logic and others

* Update README

* Fix test failures

* Fix indentation

* Fix clustering config

* Add JavaCustomColumnsSortPartitioner and make async compaction enabled by default

* Add test for JavaCustomColumnsSortPartitioner

* Add more changes after IDE sync

* Update README with clarification

* Fix clustering logic after rebasing

* Remove unrelated changes
Author: Y Ethan Guo
Date: 2021-11-23 00:53:28 -08:00
Committed by: GitHub
Parent: 9ed28b1570
Commit: ca9bfa2a40

27 changed files with 1358 additions and 93 deletions

File: HoodieClusteringConfig.java

@@ -22,7 +22,9 @@ import org.apache.hudi.common.config.ConfigClassProperty;
 import org.apache.hudi.common.config.ConfigGroups;
 import org.apache.hudi.common.config.ConfigProperty;
 import org.apache.hudi.common.config.HoodieConfig;
+import org.apache.hudi.common.engine.EngineType;
 import org.apache.hudi.exception.HoodieException;
+import org.apache.hudi.exception.HoodieNotSupportedException;

 import java.io.File;
 import java.io.FileReader;
@@ -41,6 +43,14 @@ public class HoodieClusteringConfig extends HoodieConfig {
   // Any strategy specific params can be saved with this prefix
   public static final String CLUSTERING_STRATEGY_PARAM_PREFIX = "hoodie.clustering.plan.strategy.";
+  public static final String SPARK_SIZED_BASED_CLUSTERING_PLAN_STRATEGY =
+      "org.apache.hudi.client.clustering.plan.strategy.SparkSizeBasedClusteringPlanStrategy";
+  public static final String JAVA_SIZED_BASED_CLUSTERING_PLAN_STRATEGY =
+      "org.apache.hudi.client.clustering.plan.strategy.JavaSizeBasedClusteringPlanStrategy";
+  public static final String SPARK_SORT_AND_SIZE_EXECUTION_STRATEGY =
+      "org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy";
+  public static final String JAVA_SORT_AND_SIZE_EXECUTION_STRATEGY =
+      "org.apache.hudi.client.clustering.run.strategy.JavaSortAndSizeExecutionStrategy";

   // Any Space-filling curves optimize(z-order/hilbert) params can be saved with this prefix
   public static final String LAYOUT_OPTIMIZE_PARAM_PREFIX = "hoodie.layout.optimize.";
@@ -59,7 +69,7 @@ public class HoodieClusteringConfig extends HoodieConfig {
   public static final ConfigProperty<String> PLAN_STRATEGY_CLASS_NAME = ConfigProperty
       .key("hoodie.clustering.plan.strategy.class")
-      .defaultValue("org.apache.hudi.client.clustering.plan.strategy.SparkSizeBasedClusteringPlanStrategy")
+      .defaultValue(SPARK_SIZED_BASED_CLUSTERING_PLAN_STRATEGY)
       .sinceVersion("0.7.0")
       .withDocumentation("Config to provide a strategy class (subclass of ClusteringPlanStrategy) to create clustering plan "
           + "i.e select what file groups are being clustered. Default strategy, looks at the clustering small file size limit (determined by "
@@ -67,7 +77,7 @@ public class HoodieClusteringConfig extends HoodieConfig {
   public static final ConfigProperty<String> EXECUTION_STRATEGY_CLASS_NAME = ConfigProperty
       .key("hoodie.clustering.execution.strategy.class")
-      .defaultValue("org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy")
+      .defaultValue(SPARK_SORT_AND_SIZE_EXECUTION_STRATEGY)
       .sinceVersion("0.7.0")
       .withDocumentation("Config to provide a strategy class (subclass of RunClusteringStrategy) to define how the "
           + " clustering plan is executed. By default, we sort the file groups in the plan by the specified columns, while "
@@ -336,6 +346,12 @@ public class HoodieClusteringConfig extends HoodieConfig {
   public static class Builder {
     private final HoodieClusteringConfig clusteringConfig = new HoodieClusteringConfig();
+    private EngineType engineType = EngineType.SPARK;
+
+    public Builder withEngineType(EngineType engineType) {
+      this.engineType = engineType;
+      return this;
+    }

     public Builder fromFile(File propertiesFile) throws IOException {
       try (FileReader reader = new FileReader(propertiesFile)) {
@@ -455,9 +471,37 @@ public class HoodieClusteringConfig extends HoodieConfig {
     }

     public HoodieClusteringConfig build() {
+      clusteringConfig.setDefaultValue(
+          PLAN_STRATEGY_CLASS_NAME, getDefaultPlanStrategyClassName(engineType));
+      clusteringConfig.setDefaultValue(
+          EXECUTION_STRATEGY_CLASS_NAME, getDefaultExecutionStrategyClassName(engineType));
       clusteringConfig.setDefaults(HoodieClusteringConfig.class.getName());
       return clusteringConfig;
     }
+
+    private String getDefaultPlanStrategyClassName(EngineType engineType) {
+      switch (engineType) {
+        case SPARK:
+          return SPARK_SIZED_BASED_CLUSTERING_PLAN_STRATEGY;
+        case FLINK:
+        case JAVA:
+          return JAVA_SIZED_BASED_CLUSTERING_PLAN_STRATEGY;
+        default:
+          throw new HoodieNotSupportedException("Unsupported engine " + engineType);
+      }
+    }
+
+    private String getDefaultExecutionStrategyClassName(EngineType engineType) {
+      switch (engineType) {
+        case SPARK:
+          return SPARK_SORT_AND_SIZE_EXECUTION_STRATEGY;
+        case FLINK:
+        case JAVA:
+          return JAVA_SORT_AND_SIZE_EXECUTION_STRATEGY;
+        default:
+          throw new HoodieNotSupportedException("Unsupported engine " + engineType);
+      }
+    }
   }

   /**
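For illustration, a minimal sketch of how the engine-aware defaulting above plays out for the Java engine. Everything except the getString accessor appears in the diff; getString(ConfigProperty) is assumed to be inherited from the HoodieConfig base class:

    // Build a clustering config for the Java engine without setting strategy classes explicitly.
    HoodieClusteringConfig javaConfig = HoodieClusteringConfig.newBuilder()
        .withEngineType(EngineType.JAVA)
        .build();

    // build() now fills in the Java strategies as defaults instead of the Spark ones.
    // getString(ConfigProperty) is assumed from HoodieConfig.
    String planStrategy = javaConfig.getString(HoodieClusteringConfig.PLAN_STRATEGY_CLASS_NAME);
    // -> "org.apache.hudi.client.clustering.plan.strategy.JavaSizeBasedClusteringPlanStrategy"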

File: HoodieWriteConfig.java

@@ -2182,7 +2182,8 @@ public class HoodieWriteConfig extends HoodieConfig {
     writeConfig.setDefaultOnCondition(!isCompactionConfigSet,
         HoodieCompactionConfig.newBuilder().fromProperties(writeConfig.getProps()).build());
     writeConfig.setDefaultOnCondition(!isClusteringConfigSet,
-        HoodieClusteringConfig.newBuilder().fromProperties(writeConfig.getProps()).build());
+        HoodieClusteringConfig.newBuilder().withEngineType(engineType)
+            .fromProperties(writeConfig.getProps()).build());
     writeConfig.setDefaultOnCondition(!isMetricsConfigSet, HoodieMetricsConfig.newBuilder().fromProperties(
         writeConfig.getProps()).build());
     writeConfig.setDefaultOnCondition(!isBootstrapConfigSet,
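The one-line change above threads the write config's engine type into the clustering defaults, so a Java-based writer such as the Kafka Connect sink no longer inherits the Spark strategy classes. A usage sketch; withEngineType on HoodieWriteConfig.Builder is assumed to mirror the clustering builder's new method, and the base path is hypothetical:

    HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder()
        .withEngineType(EngineType.JAVA) // assumed builder method mirroring HoodieClusteringConfig.Builder
        .withPath("/tmp/hoodie-table")   // hypothetical base path
        .build();
    // Clustering defaults derived inside this write config now resolve to the Java strategies.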

File: BaseClusteringPlanActionExecutor.java

@@ -27,10 +27,15 @@ import org.apache.hudi.common.table.timeline.HoodieInstant;
 import org.apache.hudi.common.table.timeline.HoodieTimeline;
 import org.apache.hudi.common.table.timeline.TimelineMetadataUtils;
 import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.ReflectionUtils;
 import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.hudi.exception.HoodieIOException;
 import org.apache.hudi.table.HoodieTable;
 import org.apache.hudi.table.action.BaseActionExecutor;
+import org.apache.hudi.table.action.cluster.strategy.ClusteringPlanStrategy;
+
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;

 import java.io.IOException;
 import java.util.Collections;
@@ -38,6 +43,8 @@ import java.util.Map;

 public abstract class BaseClusteringPlanActionExecutor<T extends HoodieRecordPayload, I, K, O> extends BaseActionExecutor<T, I, K, O, Option<HoodieClusteringPlan>> {

+  private static final Logger LOG = LogManager.getLogger(BaseClusteringPlanActionExecutor.class);
+
   private final Option<Map<String, String>> extraMetadata;

   public BaseClusteringPlanActionExecutor(HoodieEngineContext context,
@@ -49,7 +56,32 @@ public abstract class BaseClusteringPlanActionExecutor<T extends HoodieRecordPay
     this.extraMetadata = extraMetadata;
   }

-  protected abstract Option<HoodieClusteringPlan> createClusteringPlan();
+  protected Option<HoodieClusteringPlan> createClusteringPlan() {
+    LOG.info("Checking if clustering needs to be run on " + config.getBasePath());
+    Option<HoodieInstant> lastClusteringInstant = table.getActiveTimeline().getCompletedReplaceTimeline().lastInstant();
+    int commitsSinceLastClustering = table.getActiveTimeline().getCommitsTimeline().filterCompletedInstants()
+        .findInstantsAfter(lastClusteringInstant.map(HoodieInstant::getTimestamp).orElse("0"), Integer.MAX_VALUE)
+        .countInstants();
+
+    if (config.inlineClusteringEnabled() && config.getInlineClusterMaxCommits() > commitsSinceLastClustering) {
+      LOG.info("Not scheduling inline clustering as only " + commitsSinceLastClustering
+          + " commits were found since last clustering " + lastClusteringInstant + ". Waiting for "
+          + config.getInlineClusterMaxCommits());
+      return Option.empty();
+    }
+
+    if (config.isAsyncClusteringEnabled() && config.getAsyncClusterMaxCommits() > commitsSinceLastClustering) {
+      LOG.info("Not scheduling async clustering as only " + commitsSinceLastClustering
+          + " commits were found since last clustering " + lastClusteringInstant + ". Waiting for "
+          + config.getAsyncClusterMaxCommits());
+      return Option.empty();
+    }
+
+    LOG.info("Generating clustering plan for table " + config.getBasePath());
+    ClusteringPlanStrategy strategy = (ClusteringPlanStrategy)
+        ReflectionUtils.loadClass(config.getClusteringPlanStrategyClass(), table, context, config);
+    return strategy.generateClusteringPlan();
+  }

   @Override
   public Option<HoodieClusteringPlan> execute() {
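The new createClusteringPlan() schedules clustering only once the number of completed commits since the last completed replace (clustering) instant reaches the configured threshold, and returns Option.empty() otherwise. A sketch of wiring up the async trigger through properties; the two key strings are assumed to be the standard Hudi configs backing isAsyncClusteringEnabled() and getAsyncClusterMaxCommits():

    java.util.Properties props = new java.util.Properties();
    props.put("hoodie.clustering.async.enabled", "true");   // assumed key behind isAsyncClusteringEnabled()
    props.put("hoodie.clustering.async.max.commits", "2");  // assumed key behind getAsyncClusterMaxCommits()

    HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder()
        .withEngineType(EngineType.JAVA)
        .fromProperties(props)
        .build();
    // With this config, the executor skips planning until at least 2 commits have
    // completed since the last clustering, then generates a plan via the Java strategy.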

File: ScheduleCompactionActionExecutor.java

@@ -19,6 +19,7 @@
 package org.apache.hudi.table.action.compact;

 import org.apache.hudi.avro.model.HoodieCompactionPlan;
+import org.apache.hudi.common.engine.EngineType;
 import org.apache.hudi.common.engine.HoodieEngineContext;
 import org.apache.hudi.common.model.HoodieFileGroupId;
 import org.apache.hudi.common.model.HoodieRecordPayload;
@@ -68,12 +69,15 @@ public class ScheduleCompactionActionExecutor<T extends HoodieRecordPayload, I,
   public Option<HoodieCompactionPlan> execute() {
     if (!config.getWriteConcurrencyMode().supportsOptimisticConcurrencyControl()
         && !config.getFailedWritesCleanPolicy().isLazy()) {
-      // if there are inflight writes, their instantTime must not be less than that of compaction instant time
-      table.getActiveTimeline().getCommitsTimeline().filterPendingExcludingCompaction().firstInstant()
-          .ifPresent(earliestInflight -> ValidationUtils.checkArgument(
-              HoodieTimeline.compareTimestamps(earliestInflight.getTimestamp(), HoodieTimeline.GREATER_THAN, instantTime),
-              "Earliest write inflight instant time must be later than compaction time. Earliest :" + earliestInflight
-                  + ", Compaction scheduled at " + instantTime));
+      // TODO(yihua): this validation is removed for Java client used by kafka-connect. Need to revisit this.
+      if (config.getEngineType() != EngineType.JAVA) {
+        // if there are inflight writes, their instantTime must not be less than that of compaction instant time
+        table.getActiveTimeline().getCommitsTimeline().filterPendingExcludingCompaction().firstInstant()
+            .ifPresent(earliestInflight -> ValidationUtils.checkArgument(
+                HoodieTimeline.compareTimestamps(earliestInflight.getTimestamp(), HoodieTimeline.GREATER_THAN, instantTime),
+                "Earliest write inflight instant time must be later than compaction time. Earliest :" + earliestInflight
+                    + ", Compaction scheduled at " + instantTime));
+      }

       // Committed and pending compaction instants should have strictly lower timestamps
       List<HoodieInstant> conflictingInstants = table.getActiveTimeline()
           .getWriteTimeline().filterCompletedAndCompactionInstants().getInstants()
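For context, a small sketch of the timestamp invariant that the now Java-skipped block enforces, using the same compareTimestamps call as the diff; the instant strings are hypothetical:

    // Hypothetical instants in Hudi's yyyyMMddHHmmss timestamp format.
    String compactionInstant = "20211123005328"; // compaction being scheduled
    String earliestInflight = "20211123010000";  // earliest pending write

    // The guard passes only when the earliest inflight write started strictly
    // after the compaction instant; for EngineType.JAVA it is now bypassed.
    boolean valid = HoodieTimeline.compareTimestamps(
        earliestInflight, HoodieTimeline.GREATER_THAN, compactionInstant);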