[HUDI-3042] Refactor clustering executors (#4847)
@@ -41,13 +41,13 @@ import java.io.IOException;
import java.util.Collections;
import java.util.Map;

-public abstract class BaseClusteringPlanActionExecutor<T extends HoodieRecordPayload, I, K, O> extends BaseActionExecutor<T, I, K, O, Option<HoodieClusteringPlan>> {
+public class ClusteringPlanActionExecutor<T extends HoodieRecordPayload, I, K, O> extends BaseActionExecutor<T, I, K, O, Option<HoodieClusteringPlan>> {

-  private static final Logger LOG = LogManager.getLogger(BaseClusteringPlanActionExecutor.class);
+  private static final Logger LOG = LogManager.getLogger(ClusteringPlanActionExecutor.class);

  private final Option<Map<String, String>> extraMetadata;

-  public BaseClusteringPlanActionExecutor(HoodieEngineContext context,
+  public ClusteringPlanActionExecutor(HoodieEngineContext context,
                                      HoodieWriteConfig config,
                                      HoodieTable<T, I, K, O> table,
                                      String instantTime,
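This hunk renames the plan scheduler from BaseClusteringPlanActionExecutor to ClusteringPlanActionExecutor and drops the abstract modifier, so a single engine-agnostic class now schedules clustering plans directly. As a rough sketch of how the renamed executor is driven (the five-argument constructor is inferred from the parameter list and the extraMetadata field above, and execute() is the BaseActionExecutor entry point; this call site is illustrative, not part of the patch):

  Option<HoodieClusteringPlan> plan =
      new ClusteringPlanActionExecutor<>(context, config, table, instantTime, Option.empty()).execute();
  plan.ifPresent(p -> LOG.info("Scheduled clustering with " + p.getInputGroups().size() + " input groups"));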
@@ -18,12 +18,18 @@

package org.apache.hudi.table.action.commit;

import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.avro.model.HoodieClusteringGroup;
import org.apache.hudi.avro.model.HoodieClusteringPlan;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.client.transaction.TransactionManager;
import org.apache.hudi.client.utils.TransactionUtils;
import org.apache.hudi.common.data.HoodieData;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.engine.TaskContextSupplier;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieFileGroupId;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.model.HoodieWriteStat;
@@ -31,10 +37,15 @@ import org.apache.hudi.common.model.WriteOperationType;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieInstant.State;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.util.ClusteringUtils;
import org.apache.hudi.common.util.CommitUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieClusteringException;
import org.apache.hudi.exception.HoodieCommitException;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.table.HoodieTable;
@@ -42,7 +53,9 @@ import org.apache.hudi.table.WorkloadProfile;
import org.apache.hudi.table.WorkloadStat;
import org.apache.hudi.table.action.BaseActionExecutor;
import org.apache.hudi.table.action.HoodieWriteMetadata;
import org.apache.hudi.table.action.cluster.strategy.ClusteringExecutionStrategy;

import org.apache.avro.Schema;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;

@@ -53,6 +66,8 @@ import java.time.Instant;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

public abstract class BaseCommitActionExecutor<T extends HoodieRecordPayload, I, K, O, R>
    extends BaseActionExecutor<T, I, K, O, R> {
@@ -200,4 +215,65 @@ public abstract class BaseCommitActionExecutor<T extends HoodieRecordPayload, I,

  protected abstract Iterator<List<WriteStatus>> handleUpdate(String partitionPath, String fileId,
                                                              Iterator<HoodieRecord<T>> recordItr) throws IOException;

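  /**
   * Executes an already-scheduled clustering plan: transitions the pending replacecommit to
   * inflight, delegates the rewrite to the configured ClusteringExecutionStrategy, updates the
   * index, records which file groups were replaced, validates the result, and finally commits
   * when auto-commit is enabled.
   */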
  protected HoodieWriteMetadata<HoodieData<WriteStatus>> executeClustering(HoodieClusteringPlan clusteringPlan) {
    HoodieInstant instant = HoodieTimeline.getReplaceCommitRequestedInstant(instantTime);
    // Mark instant as clustering inflight
    table.getActiveTimeline().transitionReplaceRequestedToInflight(instant, Option.empty());
    table.getMetaClient().reloadActiveTimeline();

    final Schema schema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()));
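    // The execution strategy is instantiated reflectively from the write config; whatever class
    // is configured must expose a (HoodieTable, HoodieEngineContext, HoodieWriteConfig)
    // constructor, since those are the argument types passed to ReflectionUtils.loadClass below.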
    HoodieWriteMetadata<HoodieData<WriteStatus>> writeMetadata = (
        (ClusteringExecutionStrategy<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>>)
            ReflectionUtils.loadClass(config.getClusteringExecutionStrategyClass(),
                new Class<?>[] {HoodieTable.class, HoodieEngineContext.class, HoodieWriteConfig.class}, table, context, config))
        .performClustering(clusteringPlan, schema, instantTime);
    HoodieData<WriteStatus> writeStatusList = writeMetadata.getWriteStatuses();
    HoodieData<WriteStatus> statuses = updateIndex(writeStatusList, writeMetadata);
    writeMetadata.setWriteStats(statuses.map(WriteStatus::getStat).collectAsList());
    writeMetadata.setPartitionToReplaceFileIds(getPartitionToReplacedFileIds(clusteringPlan, writeMetadata));
    validateWriteResult(clusteringPlan, writeMetadata);
    commitOnAutoCommit(writeMetadata);
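    // If auto-commit did not populate the commit metadata (i.e. auto-commit is disabled), build
    // it here so that callers completing the commit later still have a full HoodieCommitMetadata.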
    if (!writeMetadata.getCommitMetadata().isPresent()) {
      HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(writeMetadata.getWriteStats().get(), writeMetadata.getPartitionToReplaceFileIds(),
          extraMetadata, operationType, getSchemaToStoreInCommit(), getCommitActionType());
      writeMetadata.setCommitMetadata(Option.of(commitMetadata));
    }
    return writeMetadata;
  }

  private HoodieData<WriteStatus> updateIndex(HoodieData<WriteStatus> writeStatuses, HoodieWriteMetadata<HoodieData<WriteStatus>> result) {
    Instant indexStartTime = Instant.now();
    // Update the index back
    HoodieData<WriteStatus> statuses = table.getIndex().updateLocation(writeStatuses, context, table);
    result.setIndexUpdateDuration(Duration.between(indexStartTime, Instant.now()));
    result.setWriteStatuses(statuses);
    return statuses;
  }

  private Map<String, List<String>> getPartitionToReplacedFileIds(HoodieClusteringPlan clusteringPlan, HoodieWriteMetadata<HoodieData<WriteStatus>> writeMetadata) {
    Set<HoodieFileGroupId> newFilesWritten = writeMetadata.getWriteStats().get().stream()
        .map(s -> new HoodieFileGroupId(s.getPartitionPath(), s.getFileId())).collect(Collectors.toSet());
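    // Every clustered file group is considered replaced unless the strategy wrote a new file into
    // that same file group. SparkSingleFileSortExecutionStrategy is special-cased because it
    // rewrites data back under the original file group id, which would otherwise make the group
    // look "not replaced" and leave the old file slice live.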
    return ClusteringUtils.getFileGroupsFromClusteringPlan(clusteringPlan)
        .filter(fg -> "org.apache.hudi.client.clustering.run.strategy.SparkSingleFileSortExecutionStrategy"
            .equals(config.getClusteringExecutionStrategyClass())
            || !newFilesWritten.contains(fg))
        .collect(Collectors.groupingBy(HoodieFileGroupId::getPartitionPath, Collectors.mapping(HoodieFileGroupId::getFileId, Collectors.toList())));
  }

  /**
   * Validate actions taken by clustering. In the first implementation, we validate that at least
   * one new file is written. This can be extended with more checks, e.g. that the number of
   * records read equals the number of records written. These validations could also live in
   * BaseCommitActionExecutor so that pre-commit hooks are reused across multiple actions.
   */
  private void validateWriteResult(HoodieClusteringPlan clusteringPlan, HoodieWriteMetadata<HoodieData<WriteStatus>> writeMetadata) {
    if (writeMetadata.getWriteStatuses().isEmpty()) {
      throw new HoodieClusteringException("Clustering plan produced 0 WriteStatus for " + instantTime
          + " #groups: " + clusteringPlan.getInputGroups().size() + " expected at least "
          + clusteringPlan.getInputGroups().stream().mapToInt(HoodieClusteringGroup::getNumOutputFileGroups).sum()
          + " write statuses");
    }
  }

}
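The reflective load in executeClustering makes the execution strategy a pluggable extension point: any class named by the clustering execution strategy config can be used, as long as it matches the shapes visible above. The sketch below is illustrative and not part of this commit; the class name is hypothetical, while the (HoodieTable, HoodieEngineContext, HoodieWriteConfig) constructor and the performClustering signature are taken directly from the reflective call and cast in the patch.

  public class PassThroughClusteringStrategy<T extends HoodieRecordPayload>
      extends ClusteringExecutionStrategy<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>> {

    public PassThroughClusteringStrategy(HoodieTable table, HoodieEngineContext engineContext, HoodieWriteConfig writeConfig) {
      super(table, engineContext, writeConfig);
    }

    @Override
    public HoodieWriteMetadata<HoodieData<WriteStatus>> performClustering(HoodieClusteringPlan clusteringPlan, Schema schema, String instantTime) {
      // A real strategy rewrites the file groups listed in clusteringPlan and returns their
      // write statuses; returning no statuses would trip validateWriteResult above.
      throw new UnsupportedOperationException("Illustrative sketch only");
    }
  }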