[HUDI-2332] Add clustering and compaction in Kafka Connect Sink (#3857)

* [HUDI-2332] Add clustering and compaction in Kafka Connect Sink

* Disable validation check on instant time for compaction and adjust configs

* Add javadocs

* Add clustering and compaction config

* Fix transaction causing missing records in the target table

* Add debugging logs

* Fix Kafka offset sync in participant

* Adjust how clustering and compaction are configured in kafka-connect

* Fix clustering strategy

* Remove irrelevant changes from other published PRs

* Update clustering logic and others

* Update README

* Fix test failures

* Fix indentation

* Fix clustering config

* Add JavaCustomColumnsSortPartitioner and make async compaction enabled by default

* Add test for JavaCustomColumnsSortPartitioner

* Add more changes after IDE sync

* Update README with clarification

* Fix clustering logic after rebasing

* Remove unrelated changes
Author: Y Ethan Guo
Date: 2021-11-23 00:53:28 -08:00
Committed by: GitHub
Parent: 9ed28b1570
Commit: ca9bfa2a40
27 changed files with 1358 additions and 93 deletions
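At a high level, the commit teaches the Hudi Kafka Connect sink to schedule and run table services (async compaction for MERGE_ON_READ tables, plus clustering) alongside the write path. As a rough sketch only, sink properties for this might look like the lines below; the key names are assumptions based on Hudi's Kafka Connect config conventions and are not taken from this diff:

# Hypothetical sink config sketch; verify key names against Hudi's KafkaConnectConfigs.
hoodie.kafka.compaction.async.enable=true   # async compaction (the commit makes this default-on)
hoodie.kafka.clustering.async.enable=true   # opt in to async clustering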

SparkRecentDaysClusteringPlanStrategy.java

@@ -23,6 +23,7 @@ import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.table.HoodieSparkCopyOnWriteTable;
import org.apache.hudi.table.HoodieSparkMergeOnReadTable;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
@@ -49,6 +50,7 @@ public class SparkRecentDaysClusteringPlanStrategy<T extends HoodieRecordPayload
    super(table, engineContext, writeConfig);
  }

  @Override
  protected List<String> filterPartitionPaths(List<String> partitionPaths) {
    int targetPartitionsForClustering = getWriteConfig().getTargetPartitionsForClustering();
    int skipPartitionsFromLatestForClustering = getWriteConfig().getSkipPartitionsFromLatestForClustering();
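For intuition, the filter above selects a window of recent partitions for clustering: skip the newest skipPartitionsFromLatestForClustering partitions (often still receiving writes) and keep at most targetPartitionsForClustering of the rest. A minimal standalone sketch of that selection, assuming date-formatted partition paths so that reverse lexicographic order means newest-first (names here are illustrative, not the exact Hudi implementation):

import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;

class RecentPartitionFilter {
  // Keep at most targetPartitions of the newest partitions, after skipping
  // the skipFromLatest most recent ones.
  static List<String> filterRecentPartitions(List<String> partitionPaths,
                                             int skipFromLatest,
                                             int targetPartitions) {
    return partitionPaths.stream()
        .sorted(Comparator.reverseOrder())  // newest (lexicographically largest) first
        .skip(skipFromLatest)               // e.g. skip today's still-growing partition
        .limit(targetPartitions)            // bound the clustering work per plan
        .collect(Collectors.toList());
  }
}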

RDDCustomColumnsSortPartitioner.java

@@ -18,18 +18,15 @@
package org.apache.hudi.execution.bulkinsert;

-import org.apache.avro.Schema;
-import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.common.config.SerializableSchema;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.config.HoodieWriteConfig;
-import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.table.BulkInsertPartitioner;
-import org.apache.spark.api.java.JavaRDD;
-import java.io.IOException;
+import org.apache.avro.Schema;
+import org.apache.spark.api.java.JavaRDD;
/**
* A partitioner that does sorting based on specified column values for each RDD partition.
@@ -57,7 +54,8 @@ public class RDDCustomColumnsSortPartitioner<T extends HoodieRecordPayload>
      int outputSparkPartitions) {
    final String[] sortColumns = this.sortColumnNames;
    final SerializableSchema schema = this.serializableSchema;
-    return records.sortBy(record -> getRecordSortColumnValues(record, sortColumns, schema),
+    return records.sortBy(
+        record -> HoodieAvroUtils.getRecordColumnValues(record, sortColumns, schema),
        true, outputSparkPartitions);
  }
@@ -66,26 +64,6 @@ public class RDDCustomColumnsSortPartitioner<T extends HoodieRecordPayload>
    return true;
  }

-  private static Object getRecordSortColumnValues(HoodieRecord<? extends HoodieRecordPayload> record,
-                                                  String[] sortColumns,
-                                                  SerializableSchema schema) {
-    try {
-      GenericRecord genericRecord = (GenericRecord) record.getData().getInsertValue(schema.get()).get();
-      if (sortColumns.length == 1) {
-        return HoodieAvroUtils.getNestedFieldVal(genericRecord, sortColumns[0], true);
-      } else {
-        StringBuilder sb = new StringBuilder();
-        for (String col : sortColumns) {
-          sb.append(HoodieAvroUtils.getNestedFieldValAsString(genericRecord, col, true));
-        }
-        return sb.toString();
-      }
-    } catch (IOException e) {
-      throw new HoodieIOException("Unable to read record with key:" + record.getKey(), e);
-    }
-  }

  private String[] getSortColumnName(HoodieWriteConfig config) {
    return config.getUserDefinedBulkInsertPartitionerSortColumns().split(",");
  }
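The deleted helper above did the per-record sort-key extraction inline; the replacement call site delegates to HoodieAvroUtils.getRecordColumnValues, which the commit message says is also reused by the new JavaCustomColumnsSortPartitioner for the Java engine. A sketch of how such a list-based partitioner can sort by column values, assuming (as the deleted code suggests) that multi-column keys compare by their concatenated string form; this is not the exact implementation:

import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;

import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.common.config.SerializableSchema;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;

class CustomColumnsSortExample {
  // Sort an in-memory record list by the configured sort columns.
  static <T extends HoodieRecordPayload> List<HoodieRecord<T>> sortByColumns(
      List<HoodieRecord<T>> records, String[] sortColumns, SerializableSchema schema) {
    return records.stream()
        .sorted(Comparator.comparing((HoodieRecord<T> record) ->
            HoodieAvroUtils.getRecordColumnValues(record, sortColumns, schema).toString()))
        .collect(Collectors.toList());
  }
}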

SparkClusteringPlanActionExecutor.java

@@ -18,20 +18,14 @@
package org.apache.hudi.table.action.cluster;

-import org.apache.hudi.avro.model.HoodieClusteringPlan;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
-import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.util.Option;
-import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.table.HoodieTable;
-import org.apache.hudi.table.action.cluster.strategy.ClusteringPlanStrategy;
-import org.apache.log4j.LogManager;
-import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaRDD;

import java.util.Map;
@@ -40,8 +34,6 @@ import java.util.Map;
public class SparkClusteringPlanActionExecutor<T extends HoodieRecordPayload> extends
    BaseClusteringPlanActionExecutor<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> {

-  private static final Logger LOG = LogManager.getLogger(SparkClusteringPlanActionExecutor.class);

  public SparkClusteringPlanActionExecutor(HoodieEngineContext context,
                                           HoodieWriteConfig config,
                                           HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> table,
@@ -49,33 +41,4 @@ public class SparkClusteringPlanActionExecutor<T extends HoodieRecordPayload> ex
                                           Option<Map<String, String>> extraMetadata) {
    super(context, config, table, instantTime, extraMetadata);
  }

-  @Override
-  protected Option<HoodieClusteringPlan> createClusteringPlan() {
-    LOG.info("Checking if clustering needs to be run on " + config.getBasePath());
-    Option<HoodieInstant> lastClusteringInstant = table.getActiveTimeline().getCompletedReplaceTimeline().lastInstant();
-    int commitsSinceLastClustering = table.getActiveTimeline().getCommitsTimeline().filterCompletedInstants()
-        .findInstantsAfter(lastClusteringInstant.map(HoodieInstant::getTimestamp).orElse("0"), Integer.MAX_VALUE)
-        .countInstants();
-
-    if (config.inlineClusteringEnabled() && config.getInlineClusterMaxCommits() > commitsSinceLastClustering) {
-      LOG.info("Not scheduling inline clustering as only " + commitsSinceLastClustering
-          + " commits was found since last clustering " + lastClusteringInstant + ". Waiting for "
-          + config.getInlineClusterMaxCommits());
-      return Option.empty();
-    }
-
-    if (config.isAsyncClusteringEnabled() && config.getAsyncClusterMaxCommits() > commitsSinceLastClustering) {
-      LOG.info("Not scheduling async clustering as only " + commitsSinceLastClustering
-          + " commits was found since last clustering " + lastClusteringInstant + ". Waiting for "
-          + config.getAsyncClusterMaxCommits());
-      return Option.empty();
-    }
-
-    LOG.info("Generating clustering plan for table " + config.getBasePath());
-    ClusteringPlanStrategy strategy = (ClusteringPlanStrategy)
-        ReflectionUtils.loadClass(config.getClusteringPlanStrategyClass(), table, context, config);
-    return strategy.generateClusteringPlan();
-  }
}
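The createClusteringPlan removed above (presumably hoisted into BaseClusteringPlanActionExecutor so that non-Spark engines, including the Kafka Connect sink's Java writer, can share the scheduling logic) gates plan generation on the number of commits since the last completed clustering. A worked example of that gate, with hypothetical numbers:

// Hypothetical values to illustrate the scheduling gate.
int commitsSinceLastClustering = 2;  // completed commits after the last replace instant
int asyncClusterMaxCommits = 4;      // config.getAsyncClusterMaxCommits()

// 4 > 2, so this round returns Option.empty(); a plan is generated only
// once at least 4 commits have accumulated since the last clustering.
boolean skipThisRound = asyncClusterMaxCommits > commitsSinceLastClustering;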