[HUDI-2332] Add clustering and compaction in Kafka Connect Sink (#3857)
* [HUDI-2332] Add clustering and compaction in Kafka Connect Sink
* Disable validation check on instant time for compaction and adjust configs
* Add javadocs
* Add clustering and compaction config
* Fix transaction causing missing records in the target table
* Add debugging logs
* Fix kafka offset sync in participant
* Adjust how clustering and compaction are configured in kafka-connect
* Fix clustering strategy
* Remove irrelevant changes from other published PRs
* Update clustering logic and others
* Update README
* Fix test failures
* Fix indentation
* Fix clustering config
* Add JavaCustomColumnsSortPartitioner and make async compaction enabled by default
* Add test for JavaCustomColumnsSortPartitioner
* Add more changes after IDE sync
* Update README with clarification
* Fix clustering logic after rebasing
* Remove unrelated changes
@@ -23,6 +23,7 @@ import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.table.HoodieSparkCopyOnWriteTable;
import org.apache.hudi.table.HoodieSparkMergeOnReadTable;

import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;

@@ -49,6 +50,7 @@ public class SparkRecentDaysClusteringPlanStrategy<T extends HoodieRecordPayload
    super(table, engineContext, writeConfig);
  }

  @Override
  protected List<String> filterPartitionPaths(List<String> partitionPaths) {
    int targetPartitionsForClustering = getWriteConfig().getTargetPartitionsForClustering();
    int skipPartitionsFromLatestForClustering = getWriteConfig().getSkipPartitionsFromLatestForClustering();
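The hunk above only shows the configuration reads. As a rough, self-contained illustration of the "recent days" selection such a strategy performs, the sketch below sorts partition paths newest-first (assuming date-based partition paths), skips the most recent N, and caps the result at the configured target count. Class and method names are illustrative, not the actual Hudi implementation.

import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;

// Standalone sketch (not the Hudi code): choose the partitions a
// recent-days clustering plan strategy would consider.
public class RecentPartitionsFilterSketch {

  static List<String> filterPartitionPaths(List<String> partitionPaths,
                                           int targetPartitionsForClustering,
                                           int skipPartitionsFromLatestForClustering) {
    return partitionPaths.stream()
        .sorted(Comparator.reverseOrder())                         // newest partition paths first
        .skip(Math.max(skipPartitionsFromLatestForClustering, 0))  // optionally leave the latest partitions alone
        .limit(targetPartitionsForClustering > 0
            ? targetPartitionsForClustering
            : partitionPaths.size())                               // cap at the configured target, or keep all
        .collect(Collectors.toList());
  }
}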
@@ -18,18 +18,15 @@

package org.apache.hudi.execution.bulkinsert;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.common.config.SerializableSchema;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.table.BulkInsertPartitioner;
import org.apache.spark.api.java.JavaRDD;

import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.spark.api.java.JavaRDD;

/**
 * A partitioner that does sorting based on specified column values for each RDD partition.
@@ -57,7 +54,8 @@ public class RDDCustomColumnsSortPartitioner<T extends HoodieRecordPayload>
      int outputSparkPartitions) {
    final String[] sortColumns = this.sortColumnNames;
    final SerializableSchema schema = this.serializableSchema;
    return records.sortBy(record -> getRecordSortColumnValues(record, sortColumns, schema),
    return records.sortBy(
        record -> HoodieAvroUtils.getRecordColumnValues(record, sortColumns, schema),
        true, outputSparkPartitions);
  }

@@ -66,26 +64,6 @@ public class RDDCustomColumnsSortPartitioner<T extends HoodieRecordPayload>
    return true;
  }

  private static Object getRecordSortColumnValues(HoodieRecord<? extends HoodieRecordPayload> record,
                                                  String[] sortColumns,
                                                  SerializableSchema schema) {
    try {
      GenericRecord genericRecord = (GenericRecord) record.getData().getInsertValue(schema.get()).get();
      if (sortColumns.length == 1) {
        return HoodieAvroUtils.getNestedFieldVal(genericRecord, sortColumns[0], true);
      } else {
        StringBuilder sb = new StringBuilder();
        for (String col : sortColumns) {
          sb.append(HoodieAvroUtils.getNestedFieldValAsString(genericRecord, col, true));
        }

        return sb.toString();
      }
    } catch (IOException e) {
      throw new HoodieIOException("Unable to read record with key:" + record.getKey(), e);
    }
  }

  private String[] getSortColumnName(HoodieWriteConfig config) {
    return config.getUserDefinedBulkInsertPartitionerSortColumns().split(",");
  }

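Both the Spark partitioner above and the JavaCustomColumnsSortPartitioner added by this commit order records by the values of user-configured sort columns; the private helper is dropped in favor of the shared HoodieAvroUtils.getRecordColumnValues. As a rough, self-contained sketch of the sort-key idea (plain Avro only, top-level fields only; names are illustrative, not the Hudi API):

import org.apache.avro.generic.GenericRecord;

// Illustrative sketch: concatenate the string form of each configured sort
// column into a single comparable key. The real helper also resolves nested
// field paths such as "a.b.c".
public class SortKeySketch {

  static String sortKey(GenericRecord record, String[] sortColumns) {
    StringBuilder sb = new StringBuilder();
    for (String col : sortColumns) {
      sb.append(record.get(col)); // appends "null" when the field is absent
    }
    return sb.toString();
  }
}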
@@ -18,20 +18,14 @@

package org.apache.hudi.table.action.cluster;

import org.apache.hudi.avro.model.HoodieClusteringPlan;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.action.cluster.strategy.ClusteringPlanStrategy;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaRDD;

import java.util.Map;

@@ -40,8 +34,6 @@ import java.util.Map;
public class SparkClusteringPlanActionExecutor<T extends HoodieRecordPayload> extends
    BaseClusteringPlanActionExecutor<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> {

  private static final Logger LOG = LogManager.getLogger(SparkClusteringPlanActionExecutor.class);

  public SparkClusteringPlanActionExecutor(HoodieEngineContext context,
                                           HoodieWriteConfig config,
                                           HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> table,
@@ -49,33 +41,4 @@ public class SparkClusteringPlanActionExecutor<T extends HoodieRecordPayload> ex
                                           Option<Map<String, String>> extraMetadata) {
    super(context, config, table, instantTime, extraMetadata);
  }

  @Override
  protected Option<HoodieClusteringPlan> createClusteringPlan() {
    LOG.info("Checking if clustering needs to be run on " + config.getBasePath());
    Option<HoodieInstant> lastClusteringInstant = table.getActiveTimeline().getCompletedReplaceTimeline().lastInstant();

    int commitsSinceLastClustering = table.getActiveTimeline().getCommitsTimeline().filterCompletedInstants()
        .findInstantsAfter(lastClusteringInstant.map(HoodieInstant::getTimestamp).orElse("0"), Integer.MAX_VALUE)
        .countInstants();
    if (config.inlineClusteringEnabled() && config.getInlineClusterMaxCommits() > commitsSinceLastClustering) {
      LOG.info("Not scheduling inline clustering as only " + commitsSinceLastClustering
          + " commits was found since last clustering " + lastClusteringInstant + ". Waiting for "
          + config.getInlineClusterMaxCommits());
      return Option.empty();
    }

    if (config.isAsyncClusteringEnabled() && config.getAsyncClusterMaxCommits() > commitsSinceLastClustering) {
      LOG.info("Not scheduling async clustering as only " + commitsSinceLastClustering
          + " commits was found since last clustering " + lastClusteringInstant + ". Waiting for "
          + config.getAsyncClusterMaxCommits());
      return Option.empty();
    }

    LOG.info("Generating clustering plan for table " + config.getBasePath());
    ClusteringPlanStrategy strategy = (ClusteringPlanStrategy)
        ReflectionUtils.loadClass(config.getClusteringPlanStrategyClass(), table, context, config);
    return strategy.generateClusteringPlan();
  }

}

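The createClusteringPlan body removed from the Spark executor above boils down to a threshold check: only plan clustering once enough completed commits have accumulated since the last completed clustering (replacecommit). A minimal sketch of that decision, with illustrative names rather than Hudi APIs:

// Sketch of the scheduling decision shown in createClusteringPlan above.
public class ClusteringScheduleSketch {

  static boolean shouldScheduleClustering(int commitsSinceLastClustering,
                                          boolean inlineClusteringEnabled, int inlineClusterMaxCommits,
                                          boolean asyncClusteringEnabled, int asyncClusterMaxCommits) {
    if (inlineClusteringEnabled && inlineClusterMaxCommits > commitsSinceLastClustering) {
      return false; // too few commits since the last clustering for inline scheduling
    }
    if (asyncClusteringEnabled && asyncClusterMaxCommits > commitsSinceLastClustering) {
      return false; // too few commits since the last clustering for async scheduling
    }
    return true; // enough commit activity: generate a clustering plan
  }
}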