[HUDI-2332] Add clustering and compaction in Kafka Connect Sink (#3857)
* [HUDI-2332] Add clustering and compaction in Kafka Connect Sink * Disable validation check on instant time for compaction and adjust configs * Add javadocs * Add clustering and compaction config * Fix transaction causing missing records in the target table * Add debugging logs * Fix kafka offset sync in participant * Adjust how clustering and compaction are configured in kafka-connect * Fix clustering strategy * Remove irrelevant changes from other published PRs * Update clustering logic and others * Update README * Fix test failures * Fix indentation * Fix clustering config * Add JavaCustomColumnsSortPartitioner and make async compaction enabled by default * Add test for JavaCustomColumnsSortPartitioner * Add more changes after IDE sync * Update README with clarification * Fix clustering logic after rebasing * Remove unrelated changes
This commit is contained in:
@@ -22,7 +22,9 @@ import org.apache.hudi.common.config.ConfigClassProperty;
|
||||
import org.apache.hudi.common.config.ConfigGroups;
|
||||
import org.apache.hudi.common.config.ConfigProperty;
|
||||
import org.apache.hudi.common.config.HoodieConfig;
|
||||
import org.apache.hudi.common.engine.EngineType;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.exception.HoodieNotSupportedException;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileReader;
|
||||
@@ -41,6 +43,14 @@ public class HoodieClusteringConfig extends HoodieConfig {
|
||||
|
||||
// Any strategy specific params can be saved with this prefix
|
||||
public static final String CLUSTERING_STRATEGY_PARAM_PREFIX = "hoodie.clustering.plan.strategy.";
|
||||
public static final String SPARK_SIZED_BASED_CLUSTERING_PLAN_STRATEGY =
|
||||
"org.apache.hudi.client.clustering.plan.strategy.SparkSizeBasedClusteringPlanStrategy";
|
||||
public static final String JAVA_SIZED_BASED_CLUSTERING_PLAN_STRATEGY =
|
||||
"org.apache.hudi.client.clustering.plan.strategy.JavaSizeBasedClusteringPlanStrategy";
|
||||
public static final String SPARK_SORT_AND_SIZE_EXECUTION_STRATEGY =
|
||||
"org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy";
|
||||
public static final String JAVA_SORT_AND_SIZE_EXECUTION_STRATEGY =
|
||||
"org.apache.hudi.client.clustering.run.strategy.JavaSortAndSizeExecutionStrategy";
|
||||
|
||||
// Any Space-filling curves optimize(z-order/hilbert) params can be saved with this prefix
|
||||
public static final String LAYOUT_OPTIMIZE_PARAM_PREFIX = "hoodie.layout.optimize.";
|
||||
@@ -59,7 +69,7 @@ public class HoodieClusteringConfig extends HoodieConfig {
|
||||
|
||||
public static final ConfigProperty<String> PLAN_STRATEGY_CLASS_NAME = ConfigProperty
|
||||
.key("hoodie.clustering.plan.strategy.class")
|
||||
.defaultValue("org.apache.hudi.client.clustering.plan.strategy.SparkSizeBasedClusteringPlanStrategy")
|
||||
.defaultValue(SPARK_SIZED_BASED_CLUSTERING_PLAN_STRATEGY)
|
||||
.sinceVersion("0.7.0")
|
||||
.withDocumentation("Config to provide a strategy class (subclass of ClusteringPlanStrategy) to create clustering plan "
|
||||
+ "i.e select what file groups are being clustered. Default strategy, looks at the clustering small file size limit (determined by "
|
||||
@@ -67,7 +77,7 @@ public class HoodieClusteringConfig extends HoodieConfig {
|
||||
|
||||
public static final ConfigProperty<String> EXECUTION_STRATEGY_CLASS_NAME = ConfigProperty
|
||||
.key("hoodie.clustering.execution.strategy.class")
|
||||
.defaultValue("org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy")
|
||||
.defaultValue(SPARK_SORT_AND_SIZE_EXECUTION_STRATEGY)
|
||||
.sinceVersion("0.7.0")
|
||||
.withDocumentation("Config to provide a strategy class (subclass of RunClusteringStrategy) to define how the "
|
||||
+ " clustering plan is executed. By default, we sort the file groups in th plan by the specified columns, while "
|
||||
@@ -336,6 +346,12 @@ public class HoodieClusteringConfig extends HoodieConfig {
|
||||
public static class Builder {
|
||||
|
||||
private final HoodieClusteringConfig clusteringConfig = new HoodieClusteringConfig();
|
||||
private EngineType engineType = EngineType.SPARK;
|
||||
|
||||
public Builder withEngineType(EngineType engineType) {
|
||||
this.engineType = engineType;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder fromFile(File propertiesFile) throws IOException {
|
||||
try (FileReader reader = new FileReader(propertiesFile)) {
|
||||
@@ -455,9 +471,37 @@ public class HoodieClusteringConfig extends HoodieConfig {
|
||||
}
|
||||
|
||||
public HoodieClusteringConfig build() {
|
||||
clusteringConfig.setDefaultValue(
|
||||
PLAN_STRATEGY_CLASS_NAME, getDefaultPlanStrategyClassName(engineType));
|
||||
clusteringConfig.setDefaultValue(
|
||||
EXECUTION_STRATEGY_CLASS_NAME, getDefaultExecutionStrategyClassName(engineType));
|
||||
clusteringConfig.setDefaults(HoodieClusteringConfig.class.getName());
|
||||
return clusteringConfig;
|
||||
}
|
||||
|
||||
private String getDefaultPlanStrategyClassName(EngineType engineType) {
|
||||
switch (engineType) {
|
||||
case SPARK:
|
||||
return SPARK_SIZED_BASED_CLUSTERING_PLAN_STRATEGY;
|
||||
case FLINK:
|
||||
case JAVA:
|
||||
return JAVA_SIZED_BASED_CLUSTERING_PLAN_STRATEGY;
|
||||
default:
|
||||
throw new HoodieNotSupportedException("Unsupported engine " + engineType);
|
||||
}
|
||||
}
|
||||
|
||||
private String getDefaultExecutionStrategyClassName(EngineType engineType) {
|
||||
switch (engineType) {
|
||||
case SPARK:
|
||||
return SPARK_SORT_AND_SIZE_EXECUTION_STRATEGY;
|
||||
case FLINK:
|
||||
case JAVA:
|
||||
return JAVA_SORT_AND_SIZE_EXECUTION_STRATEGY;
|
||||
default:
|
||||
throw new HoodieNotSupportedException("Unsupported engine " + engineType);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -2182,7 +2182,8 @@ public class HoodieWriteConfig extends HoodieConfig {
|
||||
writeConfig.setDefaultOnCondition(!isCompactionConfigSet,
|
||||
HoodieCompactionConfig.newBuilder().fromProperties(writeConfig.getProps()).build());
|
||||
writeConfig.setDefaultOnCondition(!isClusteringConfigSet,
|
||||
HoodieClusteringConfig.newBuilder().fromProperties(writeConfig.getProps()).build());
|
||||
HoodieClusteringConfig.newBuilder().withEngineType(engineType)
|
||||
.fromProperties(writeConfig.getProps()).build());
|
||||
writeConfig.setDefaultOnCondition(!isMetricsConfigSet, HoodieMetricsConfig.newBuilder().fromProperties(
|
||||
writeConfig.getProps()).build());
|
||||
writeConfig.setDefaultOnCondition(!isBootstrapConfigSet,
|
||||
|
||||
@@ -27,10 +27,15 @@ import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.table.timeline.TimelineMetadataUtils;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.ReflectionUtils;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
import org.apache.hudi.table.action.BaseActionExecutor;
|
||||
import org.apache.hudi.table.action.cluster.strategy.ClusteringPlanStrategy;
|
||||
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
@@ -38,6 +43,8 @@ import java.util.Map;
|
||||
|
||||
public abstract class BaseClusteringPlanActionExecutor<T extends HoodieRecordPayload, I, K, O> extends BaseActionExecutor<T, I, K, O, Option<HoodieClusteringPlan>> {
|
||||
|
||||
private static final Logger LOG = LogManager.getLogger(BaseClusteringPlanActionExecutor.class);
|
||||
|
||||
private final Option<Map<String, String>> extraMetadata;
|
||||
|
||||
public BaseClusteringPlanActionExecutor(HoodieEngineContext context,
|
||||
@@ -49,7 +56,32 @@ public abstract class BaseClusteringPlanActionExecutor<T extends HoodieRecordPay
|
||||
this.extraMetadata = extraMetadata;
|
||||
}
|
||||
|
||||
protected abstract Option<HoodieClusteringPlan> createClusteringPlan();
|
||||
protected Option<HoodieClusteringPlan> createClusteringPlan() {
|
||||
LOG.info("Checking if clustering needs to be run on " + config.getBasePath());
|
||||
Option<HoodieInstant> lastClusteringInstant = table.getActiveTimeline().getCompletedReplaceTimeline().lastInstant();
|
||||
|
||||
int commitsSinceLastClustering = table.getActiveTimeline().getCommitsTimeline().filterCompletedInstants()
|
||||
.findInstantsAfter(lastClusteringInstant.map(HoodieInstant::getTimestamp).orElse("0"), Integer.MAX_VALUE)
|
||||
.countInstants();
|
||||
if (config.inlineClusteringEnabled() && config.getInlineClusterMaxCommits() > commitsSinceLastClustering) {
|
||||
LOG.info("Not scheduling inline clustering as only " + commitsSinceLastClustering
|
||||
+ " commits was found since last clustering " + lastClusteringInstant + ". Waiting for "
|
||||
+ config.getInlineClusterMaxCommits());
|
||||
return Option.empty();
|
||||
}
|
||||
|
||||
if (config.isAsyncClusteringEnabled() && config.getAsyncClusterMaxCommits() > commitsSinceLastClustering) {
|
||||
LOG.info("Not scheduling async clustering as only " + commitsSinceLastClustering
|
||||
+ " commits was found since last clustering " + lastClusteringInstant + ". Waiting for "
|
||||
+ config.getAsyncClusterMaxCommits());
|
||||
return Option.empty();
|
||||
}
|
||||
|
||||
LOG.info("Generating clustering plan for table " + config.getBasePath());
|
||||
ClusteringPlanStrategy strategy = (ClusteringPlanStrategy)
|
||||
ReflectionUtils.loadClass(config.getClusteringPlanStrategyClass(), table, context, config);
|
||||
return strategy.generateClusteringPlan();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Option<HoodieClusteringPlan> execute() {
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
package org.apache.hudi.table.action.compact;
|
||||
|
||||
import org.apache.hudi.avro.model.HoodieCompactionPlan;
|
||||
import org.apache.hudi.common.engine.EngineType;
|
||||
import org.apache.hudi.common.engine.HoodieEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieFileGroupId;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
@@ -68,12 +69,15 @@ public class ScheduleCompactionActionExecutor<T extends HoodieRecordPayload, I,
|
||||
public Option<HoodieCompactionPlan> execute() {
|
||||
if (!config.getWriteConcurrencyMode().supportsOptimisticConcurrencyControl()
|
||||
&& !config.getFailedWritesCleanPolicy().isLazy()) {
|
||||
// if there are inflight writes, their instantTime must not be less than that of compaction instant time
|
||||
table.getActiveTimeline().getCommitsTimeline().filterPendingExcludingCompaction().firstInstant()
|
||||
.ifPresent(earliestInflight -> ValidationUtils.checkArgument(
|
||||
HoodieTimeline.compareTimestamps(earliestInflight.getTimestamp(), HoodieTimeline.GREATER_THAN, instantTime),
|
||||
"Earliest write inflight instant time must be later than compaction time. Earliest :" + earliestInflight
|
||||
+ ", Compaction scheduled at " + instantTime));
|
||||
// TODO(yihua): this validation is removed for Java client used by kafka-connect. Need to revisit this.
|
||||
if (config.getEngineType() != EngineType.JAVA) {
|
||||
// if there are inflight writes, their instantTime must not be less than that of compaction instant time
|
||||
table.getActiveTimeline().getCommitsTimeline().filterPendingExcludingCompaction().firstInstant()
|
||||
.ifPresent(earliestInflight -> ValidationUtils.checkArgument(
|
||||
HoodieTimeline.compareTimestamps(earliestInflight.getTimestamp(), HoodieTimeline.GREATER_THAN, instantTime),
|
||||
"Earliest write inflight instant time must be later than compaction time. Earliest :" + earliestInflight
|
||||
+ ", Compaction scheduled at " + instantTime));
|
||||
}
|
||||
// Committed and pending compaction instants should have strictly lower timestamps
|
||||
List<HoodieInstant> conflictingInstants = table.getActiveTimeline()
|
||||
.getWriteTimeline().filterCompletedAndCompactionInstants().getInstants()
|
||||
|
||||
Reference in New Issue
Block a user