[HUDI-3042] Refactor clustering executors (#4847)
This commit is contained in:
@@ -41,13 +41,13 @@ import java.io.IOException;
|
|||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
public abstract class BaseClusteringPlanActionExecutor<T extends HoodieRecordPayload, I, K, O> extends BaseActionExecutor<T, I, K, O, Option<HoodieClusteringPlan>> {
|
public class ClusteringPlanActionExecutor<T extends HoodieRecordPayload, I, K, O> extends BaseActionExecutor<T, I, K, O, Option<HoodieClusteringPlan>> {
|
||||||
|
|
||||||
private static final Logger LOG = LogManager.getLogger(BaseClusteringPlanActionExecutor.class);
|
private static final Logger LOG = LogManager.getLogger(ClusteringPlanActionExecutor.class);
|
||||||
|
|
||||||
private final Option<Map<String, String>> extraMetadata;
|
private final Option<Map<String, String>> extraMetadata;
|
||||||
|
|
||||||
public BaseClusteringPlanActionExecutor(HoodieEngineContext context,
|
public ClusteringPlanActionExecutor(HoodieEngineContext context,
|
||||||
HoodieWriteConfig config,
|
HoodieWriteConfig config,
|
||||||
HoodieTable<T, I, K, O> table,
|
HoodieTable<T, I, K, O> table,
|
||||||
String instantTime,
|
String instantTime,
|
||||||
@@ -18,12 +18,18 @@
|
|||||||
|
|
||||||
package org.apache.hudi.table.action.commit;
|
package org.apache.hudi.table.action.commit;
|
||||||
|
|
||||||
|
import org.apache.hudi.avro.HoodieAvroUtils;
|
||||||
|
import org.apache.hudi.avro.model.HoodieClusteringGroup;
|
||||||
|
import org.apache.hudi.avro.model.HoodieClusteringPlan;
|
||||||
import org.apache.hudi.client.WriteStatus;
|
import org.apache.hudi.client.WriteStatus;
|
||||||
import org.apache.hudi.client.transaction.TransactionManager;
|
import org.apache.hudi.client.transaction.TransactionManager;
|
||||||
import org.apache.hudi.client.utils.TransactionUtils;
|
import org.apache.hudi.client.utils.TransactionUtils;
|
||||||
|
import org.apache.hudi.common.data.HoodieData;
|
||||||
import org.apache.hudi.common.engine.HoodieEngineContext;
|
import org.apache.hudi.common.engine.HoodieEngineContext;
|
||||||
import org.apache.hudi.common.engine.TaskContextSupplier;
|
import org.apache.hudi.common.engine.TaskContextSupplier;
|
||||||
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
||||||
|
import org.apache.hudi.common.model.HoodieFileGroupId;
|
||||||
|
import org.apache.hudi.common.model.HoodieKey;
|
||||||
import org.apache.hudi.common.model.HoodieRecord;
|
import org.apache.hudi.common.model.HoodieRecord;
|
||||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||||
import org.apache.hudi.common.model.HoodieWriteStat;
|
import org.apache.hudi.common.model.HoodieWriteStat;
|
||||||
@@ -31,10 +37,15 @@ import org.apache.hudi.common.model.WriteOperationType;
|
|||||||
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
|
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
|
||||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||||
import org.apache.hudi.common.table.timeline.HoodieInstant.State;
|
import org.apache.hudi.common.table.timeline.HoodieInstant.State;
|
||||||
|
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||||
|
import org.apache.hudi.common.util.ClusteringUtils;
|
||||||
|
import org.apache.hudi.common.util.CommitUtils;
|
||||||
import org.apache.hudi.common.util.Option;
|
import org.apache.hudi.common.util.Option;
|
||||||
|
import org.apache.hudi.common.util.ReflectionUtils;
|
||||||
import org.apache.hudi.common.util.StringUtils;
|
import org.apache.hudi.common.util.StringUtils;
|
||||||
import org.apache.hudi.common.util.collection.Pair;
|
import org.apache.hudi.common.util.collection.Pair;
|
||||||
import org.apache.hudi.config.HoodieWriteConfig;
|
import org.apache.hudi.config.HoodieWriteConfig;
|
||||||
|
import org.apache.hudi.exception.HoodieClusteringException;
|
||||||
import org.apache.hudi.exception.HoodieCommitException;
|
import org.apache.hudi.exception.HoodieCommitException;
|
||||||
import org.apache.hudi.exception.HoodieIOException;
|
import org.apache.hudi.exception.HoodieIOException;
|
||||||
import org.apache.hudi.table.HoodieTable;
|
import org.apache.hudi.table.HoodieTable;
|
||||||
@@ -42,7 +53,9 @@ import org.apache.hudi.table.WorkloadProfile;
|
|||||||
import org.apache.hudi.table.WorkloadStat;
|
import org.apache.hudi.table.WorkloadStat;
|
||||||
import org.apache.hudi.table.action.BaseActionExecutor;
|
import org.apache.hudi.table.action.BaseActionExecutor;
|
||||||
import org.apache.hudi.table.action.HoodieWriteMetadata;
|
import org.apache.hudi.table.action.HoodieWriteMetadata;
|
||||||
|
import org.apache.hudi.table.action.cluster.strategy.ClusteringExecutionStrategy;
|
||||||
|
|
||||||
|
import org.apache.avro.Schema;
|
||||||
import org.apache.log4j.LogManager;
|
import org.apache.log4j.LogManager;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
|
|
||||||
@@ -53,6 +66,8 @@ import java.time.Instant;
|
|||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
public abstract class BaseCommitActionExecutor<T extends HoodieRecordPayload, I, K, O, R>
|
public abstract class BaseCommitActionExecutor<T extends HoodieRecordPayload, I, K, O, R>
|
||||||
extends BaseActionExecutor<T, I, K, O, R> {
|
extends BaseActionExecutor<T, I, K, O, R> {
|
||||||
@@ -200,4 +215,65 @@ public abstract class BaseCommitActionExecutor<T extends HoodieRecordPayload, I,
|
|||||||
|
|
||||||
protected abstract Iterator<List<WriteStatus>> handleUpdate(String partitionPath, String fileId,
|
protected abstract Iterator<List<WriteStatus>> handleUpdate(String partitionPath, String fileId,
|
||||||
Iterator<HoodieRecord<T>> recordItr) throws IOException;
|
Iterator<HoodieRecord<T>> recordItr) throws IOException;
|
||||||
|
|
||||||
|
protected HoodieWriteMetadata<HoodieData<WriteStatus>> executeClustering(HoodieClusteringPlan clusteringPlan) {
|
||||||
|
HoodieInstant instant = HoodieTimeline.getReplaceCommitRequestedInstant(instantTime);
|
||||||
|
// Mark instant as clustering inflight
|
||||||
|
table.getActiveTimeline().transitionReplaceRequestedToInflight(instant, Option.empty());
|
||||||
|
table.getMetaClient().reloadActiveTimeline();
|
||||||
|
|
||||||
|
final Schema schema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()));
|
||||||
|
HoodieWriteMetadata<HoodieData<WriteStatus>> writeMetadata = (
|
||||||
|
(ClusteringExecutionStrategy<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>>)
|
||||||
|
ReflectionUtils.loadClass(config.getClusteringExecutionStrategyClass(),
|
||||||
|
new Class<?>[] {HoodieTable.class, HoodieEngineContext.class, HoodieWriteConfig.class}, table, context, config))
|
||||||
|
.performClustering(clusteringPlan, schema, instantTime);
|
||||||
|
HoodieData<WriteStatus> writeStatusList = writeMetadata.getWriteStatuses();
|
||||||
|
HoodieData<WriteStatus> statuses = updateIndex(writeStatusList, writeMetadata);
|
||||||
|
writeMetadata.setWriteStats(statuses.map(WriteStatus::getStat).collectAsList());
|
||||||
|
writeMetadata.setPartitionToReplaceFileIds(getPartitionToReplacedFileIds(clusteringPlan, writeMetadata));
|
||||||
|
validateWriteResult(clusteringPlan, writeMetadata);
|
||||||
|
commitOnAutoCommit(writeMetadata);
|
||||||
|
if (!writeMetadata.getCommitMetadata().isPresent()) {
|
||||||
|
HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(writeMetadata.getWriteStats().get(), writeMetadata.getPartitionToReplaceFileIds(),
|
||||||
|
extraMetadata, operationType, getSchemaToStoreInCommit(), getCommitActionType());
|
||||||
|
writeMetadata.setCommitMetadata(Option.of(commitMetadata));
|
||||||
|
}
|
||||||
|
return writeMetadata;
|
||||||
|
}
|
||||||
|
|
||||||
|
private HoodieData<WriteStatus> updateIndex(HoodieData<WriteStatus> writeStatuses, HoodieWriteMetadata<HoodieData<WriteStatus>> result) {
|
||||||
|
Instant indexStartTime = Instant.now();
|
||||||
|
// Update the index back
|
||||||
|
HoodieData<WriteStatus> statuses = table.getIndex().updateLocation(writeStatuses, context, table);
|
||||||
|
result.setIndexUpdateDuration(Duration.between(indexStartTime, Instant.now()));
|
||||||
|
result.setWriteStatuses(statuses);
|
||||||
|
return statuses;
|
||||||
|
}
|
||||||
|
|
||||||
|
private Map<String, List<String>> getPartitionToReplacedFileIds(HoodieClusteringPlan clusteringPlan, HoodieWriteMetadata<HoodieData<WriteStatus>> writeMetadata) {
|
||||||
|
Set<HoodieFileGroupId> newFilesWritten = writeMetadata.getWriteStats().get().stream()
|
||||||
|
.map(s -> new HoodieFileGroupId(s.getPartitionPath(), s.getFileId())).collect(Collectors.toSet());
|
||||||
|
|
||||||
|
return ClusteringUtils.getFileGroupsFromClusteringPlan(clusteringPlan)
|
||||||
|
.filter(fg -> "org.apache.hudi.client.clustering.run.strategy.SparkSingleFileSortExecutionStrategy"
|
||||||
|
.equals(config.getClusteringExecutionStrategyClass())
|
||||||
|
|| !newFilesWritten.contains(fg))
|
||||||
|
.collect(Collectors.groupingBy(HoodieFileGroupId::getPartitionPath, Collectors.mapping(HoodieFileGroupId::getFileId, Collectors.toList())));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Validate actions taken by clustering. In the first implementation, we validate at least one new file is written.
|
||||||
|
* But we can extend this to add more validation. E.g. number of records read = number of records written etc.
|
||||||
|
* We can also make these validations in BaseCommitActionExecutor to reuse pre-commit hooks for multiple actions.
|
||||||
|
*/
|
||||||
|
private void validateWriteResult(HoodieClusteringPlan clusteringPlan, HoodieWriteMetadata<HoodieData<WriteStatus>> writeMetadata) {
|
||||||
|
if (writeMetadata.getWriteStatuses().isEmpty()) {
|
||||||
|
throw new HoodieClusteringException("Clustering plan produced 0 WriteStatus for " + instantTime
|
||||||
|
+ " #groups: " + clusteringPlan.getInputGroups().size() + " expected at least "
|
||||||
|
+ clusteringPlan.getInputGroups().stream().mapToInt(HoodieClusteringGroup::getNumOutputFileGroups).sum()
|
||||||
|
+ " write statuses");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -24,6 +24,8 @@ import org.apache.hudi.avro.model.HoodieClusteringGroup;
|
|||||||
import org.apache.hudi.avro.model.HoodieClusteringPlan;
|
import org.apache.hudi.avro.model.HoodieClusteringPlan;
|
||||||
import org.apache.hudi.client.WriteStatus;
|
import org.apache.hudi.client.WriteStatus;
|
||||||
import org.apache.hudi.client.common.JavaTaskContextSupplier;
|
import org.apache.hudi.client.common.JavaTaskContextSupplier;
|
||||||
|
import org.apache.hudi.common.data.HoodieData;
|
||||||
|
import org.apache.hudi.common.data.HoodieList;
|
||||||
import org.apache.hudi.common.engine.HoodieEngineContext;
|
import org.apache.hudi.common.engine.HoodieEngineContext;
|
||||||
import org.apache.hudi.common.model.ClusteringOperation;
|
import org.apache.hudi.common.model.ClusteringOperation;
|
||||||
import org.apache.hudi.common.model.HoodieAvroRecord;
|
import org.apache.hudi.common.model.HoodieAvroRecord;
|
||||||
@@ -71,7 +73,7 @@ import static org.apache.hudi.config.HoodieClusteringConfig.PLAN_STRATEGY_SORT_C
|
|||||||
* Clustering strategy for Java engine.
|
* Clustering strategy for Java engine.
|
||||||
*/
|
*/
|
||||||
public abstract class JavaExecutionStrategy<T extends HoodieRecordPayload<T>>
|
public abstract class JavaExecutionStrategy<T extends HoodieRecordPayload<T>>
|
||||||
extends ClusteringExecutionStrategy<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> {
|
extends ClusteringExecutionStrategy<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>> {
|
||||||
|
|
||||||
private static final Logger LOG = LogManager.getLogger(JavaExecutionStrategy.class);
|
private static final Logger LOG = LogManager.getLogger(JavaExecutionStrategy.class);
|
||||||
|
|
||||||
@@ -81,7 +83,7 @@ public abstract class JavaExecutionStrategy<T extends HoodieRecordPayload<T>>
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public HoodieWriteMetadata<List<WriteStatus>> performClustering(
|
public HoodieWriteMetadata<HoodieData<WriteStatus>> performClustering(
|
||||||
HoodieClusteringPlan clusteringPlan, Schema schema, String instantTime) {
|
HoodieClusteringPlan clusteringPlan, Schema schema, String instantTime) {
|
||||||
// execute clustering for each group and collect WriteStatus
|
// execute clustering for each group and collect WriteStatus
|
||||||
List<WriteStatus> writeStatusList = new ArrayList<>();
|
List<WriteStatus> writeStatusList = new ArrayList<>();
|
||||||
@@ -90,8 +92,8 @@ public abstract class JavaExecutionStrategy<T extends HoodieRecordPayload<T>>
|
|||||||
inputGroup, clusteringPlan.getStrategy().getStrategyParams(),
|
inputGroup, clusteringPlan.getStrategy().getStrategyParams(),
|
||||||
Option.ofNullable(clusteringPlan.getPreserveHoodieMetadata()).orElse(false),
|
Option.ofNullable(clusteringPlan.getPreserveHoodieMetadata()).orElse(false),
|
||||||
instantTime)));
|
instantTime)));
|
||||||
HoodieWriteMetadata<List<WriteStatus>> writeMetadata = new HoodieWriteMetadata<>();
|
HoodieWriteMetadata<HoodieData<WriteStatus>> writeMetadata = new HoodieWriteMetadata<>();
|
||||||
writeMetadata.setWriteStatuses(writeStatusList);
|
writeMetadata.setWriteStatuses(HoodieList.of(writeStatusList));
|
||||||
return writeMetadata;
|
return writeMetadata;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -49,7 +49,7 @@ import org.apache.hudi.table.action.HoodieWriteMetadata;
|
|||||||
import org.apache.hudi.table.action.bootstrap.HoodieBootstrapWriteMetadata;
|
import org.apache.hudi.table.action.bootstrap.HoodieBootstrapWriteMetadata;
|
||||||
import org.apache.hudi.table.action.clean.CleanActionExecutor;
|
import org.apache.hudi.table.action.clean.CleanActionExecutor;
|
||||||
import org.apache.hudi.table.action.clean.CleanPlanActionExecutor;
|
import org.apache.hudi.table.action.clean.CleanPlanActionExecutor;
|
||||||
import org.apache.hudi.table.action.cluster.JavaClusteringPlanActionExecutor;
|
import org.apache.hudi.table.action.cluster.ClusteringPlanActionExecutor;
|
||||||
import org.apache.hudi.table.action.cluster.JavaExecuteClusteringCommitActionExecutor;
|
import org.apache.hudi.table.action.cluster.JavaExecuteClusteringCommitActionExecutor;
|
||||||
import org.apache.hudi.table.action.commit.JavaBulkInsertCommitActionExecutor;
|
import org.apache.hudi.table.action.commit.JavaBulkInsertCommitActionExecutor;
|
||||||
import org.apache.hudi.table.action.commit.JavaBulkInsertPreppedCommitActionExecutor;
|
import org.apache.hudi.table.action.commit.JavaBulkInsertPreppedCommitActionExecutor;
|
||||||
@@ -70,10 +70,11 @@ import org.apache.hudi.table.action.savepoint.SavepointActionExecutor;
|
|||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import javax.annotation.Nonnull;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import javax.annotation.Nonnull;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
@@ -192,7 +193,7 @@ public class HoodieJavaCopyOnWriteTable<T extends HoodieRecordPayload>
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Option<HoodieClusteringPlan> scheduleClustering(final HoodieEngineContext context, final String instantTime, final Option<Map<String, String>> extraMetadata) {
|
public Option<HoodieClusteringPlan> scheduleClustering(final HoodieEngineContext context, final String instantTime, final Option<Map<String, String>> extraMetadata) {
|
||||||
return new JavaClusteringPlanActionExecutor<>(context, config, this, instantTime, extraMetadata).execute();
|
return new ClusteringPlanActionExecutor<>(context, config, this, instantTime, extraMetadata).execute();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|||||||
@@ -1,43 +0,0 @@
|
|||||||
/*
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one
|
|
||||||
* or more contributor license agreements. See the NOTICE file
|
|
||||||
* distributed with this work for additional information
|
|
||||||
* regarding copyright ownership. The ASF licenses this file
|
|
||||||
* to you under the Apache License, Version 2.0 (the
|
|
||||||
* "License"); you may not use this file except in compliance
|
|
||||||
* with the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing,
|
|
||||||
* software distributed under the License is distributed on an
|
|
||||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
||||||
* KIND, either express or implied. See the License for the
|
|
||||||
* specific language governing permissions and limitations
|
|
||||||
* under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.apache.hudi.table.action.cluster;
|
|
||||||
|
|
||||||
import org.apache.hudi.client.WriteStatus;
|
|
||||||
import org.apache.hudi.common.engine.HoodieEngineContext;
|
|
||||||
import org.apache.hudi.common.model.HoodieKey;
|
|
||||||
import org.apache.hudi.common.model.HoodieRecord;
|
|
||||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
|
||||||
import org.apache.hudi.common.util.Option;
|
|
||||||
import org.apache.hudi.config.HoodieWriteConfig;
|
|
||||||
import org.apache.hudi.table.HoodieTable;
|
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
public class JavaClusteringPlanActionExecutor<T extends HoodieRecordPayload> extends
|
|
||||||
BaseClusteringPlanActionExecutor<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> {
|
|
||||||
|
|
||||||
public JavaClusteringPlanActionExecutor(
|
|
||||||
HoodieEngineContext context, HoodieWriteConfig config,
|
|
||||||
HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> table,
|
|
||||||
String instantTime, Option<Map<String, String>> extraMetadata) {
|
|
||||||
super(context, config, table, instantTime, extraMetadata);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -19,46 +19,32 @@
|
|||||||
|
|
||||||
package org.apache.hudi.table.action.cluster;
|
package org.apache.hudi.table.action.cluster;
|
||||||
|
|
||||||
import org.apache.hudi.avro.HoodieAvroUtils;
|
|
||||||
import org.apache.hudi.avro.model.HoodieClusteringGroup;
|
|
||||||
import org.apache.hudi.avro.model.HoodieClusteringPlan;
|
import org.apache.hudi.avro.model.HoodieClusteringPlan;
|
||||||
import org.apache.hudi.client.WriteStatus;
|
import org.apache.hudi.client.WriteStatus;
|
||||||
|
import org.apache.hudi.common.data.HoodieData;
|
||||||
import org.apache.hudi.common.engine.HoodieEngineContext;
|
import org.apache.hudi.common.engine.HoodieEngineContext;
|
||||||
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
|
||||||
import org.apache.hudi.common.model.HoodieFileGroupId;
|
|
||||||
import org.apache.hudi.common.model.HoodieKey;
|
|
||||||
import org.apache.hudi.common.model.HoodieRecord;
|
|
||||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||||
import org.apache.hudi.common.model.WriteOperationType;
|
import org.apache.hudi.common.model.WriteOperationType;
|
||||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
|
||||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||||
import org.apache.hudi.common.util.ClusteringUtils;
|
import org.apache.hudi.common.util.ClusteringUtils;
|
||||||
import org.apache.hudi.common.util.CommitUtils;
|
|
||||||
import org.apache.hudi.common.util.Option;
|
|
||||||
import org.apache.hudi.common.util.ReflectionUtils;
|
|
||||||
import org.apache.hudi.common.util.collection.Pair;
|
import org.apache.hudi.common.util.collection.Pair;
|
||||||
import org.apache.hudi.config.HoodieWriteConfig;
|
import org.apache.hudi.config.HoodieWriteConfig;
|
||||||
import org.apache.hudi.exception.HoodieClusteringException;
|
import org.apache.hudi.exception.HoodieClusteringException;
|
||||||
import org.apache.hudi.table.HoodieTable;
|
import org.apache.hudi.table.HoodieTable;
|
||||||
import org.apache.hudi.table.action.HoodieWriteMetadata;
|
import org.apache.hudi.table.action.HoodieWriteMetadata;
|
||||||
import org.apache.hudi.table.action.cluster.strategy.ClusteringExecutionStrategy;
|
|
||||||
import org.apache.hudi.table.action.commit.BaseJavaCommitActionExecutor;
|
import org.apache.hudi.table.action.commit.BaseJavaCommitActionExecutor;
|
||||||
|
|
||||||
import org.apache.avro.Schema;
|
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Set;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
public class JavaExecuteClusteringCommitActionExecutor<T extends HoodieRecordPayload<T>>
|
public class JavaExecuteClusteringCommitActionExecutor<T extends HoodieRecordPayload<T>>
|
||||||
extends BaseJavaCommitActionExecutor<T> {
|
extends BaseJavaCommitActionExecutor<T> {
|
||||||
|
|
||||||
private final HoodieClusteringPlan clusteringPlan;
|
private final HoodieClusteringPlan clusteringPlan;
|
||||||
|
|
||||||
public JavaExecuteClusteringCommitActionExecutor(
|
public JavaExecuteClusteringCommitActionExecutor(HoodieEngineContext context,
|
||||||
HoodieEngineContext context, HoodieWriteConfig config, HoodieTable table,
|
HoodieWriteConfig config,
|
||||||
String instantTime) {
|
HoodieTable table,
|
||||||
|
String instantTime) {
|
||||||
super(context, config, table, instantTime, WriteOperationType.CLUSTER);
|
super(context, config, table, instantTime, WriteOperationType.CLUSTER);
|
||||||
this.clusteringPlan = ClusteringUtils.getClusteringPlan(
|
this.clusteringPlan = ClusteringUtils.getClusteringPlan(
|
||||||
table.getMetaClient(), HoodieTimeline.getReplaceCommitRequestedInstant(instantTime))
|
table.getMetaClient(), HoodieTimeline.getReplaceCommitRequestedInstant(instantTime))
|
||||||
@@ -68,56 +54,13 @@ public class JavaExecuteClusteringCommitActionExecutor<T extends HoodieRecordPay
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public HoodieWriteMetadata<List<WriteStatus>> execute() {
|
public HoodieWriteMetadata<List<WriteStatus>> execute() {
|
||||||
HoodieInstant instant = HoodieTimeline.getReplaceCommitRequestedInstant(instantTime);
|
HoodieWriteMetadata<HoodieData<WriteStatus>> writeMetadata = executeClustering(clusteringPlan);
|
||||||
// Mark instant as clustering inflight
|
List<WriteStatus> transformedWriteStatuses = writeMetadata.getWriteStatuses().collectAsList();
|
||||||
table.getActiveTimeline().transitionReplaceRequestedToInflight(instant, Option.empty());
|
return writeMetadata.clone(transformedWriteStatuses);
|
||||||
table.getMetaClient().reloadActiveTimeline();
|
|
||||||
|
|
||||||
final Schema schema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()));
|
|
||||||
HoodieWriteMetadata<List<WriteStatus>> writeMetadata = (
|
|
||||||
(ClusteringExecutionStrategy<T, List<HoodieRecord<? extends HoodieRecordPayload>>, List<HoodieKey>, List<WriteStatus>>)
|
|
||||||
ReflectionUtils.loadClass(config.getClusteringExecutionStrategyClass(),
|
|
||||||
new Class<?>[] {HoodieTable.class, HoodieEngineContext.class, HoodieWriteConfig.class}, table, context, config))
|
|
||||||
.performClustering(clusteringPlan, schema, instantTime);
|
|
||||||
List<WriteStatus> writeStatusList = writeMetadata.getWriteStatuses();
|
|
||||||
List<WriteStatus> statuses = updateIndex(writeStatusList, writeMetadata);
|
|
||||||
writeMetadata.setWriteStats(statuses.stream().map(WriteStatus::getStat).collect(Collectors.toList()));
|
|
||||||
writeMetadata.setPartitionToReplaceFileIds(getPartitionToReplacedFileIds(writeMetadata));
|
|
||||||
validateWriteResult(writeMetadata);
|
|
||||||
commitOnAutoCommit(writeMetadata);
|
|
||||||
if (!writeMetadata.getCommitMetadata().isPresent()) {
|
|
||||||
HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(writeMetadata.getWriteStats().get(), writeMetadata.getPartitionToReplaceFileIds(),
|
|
||||||
extraMetadata, operationType, getSchemaToStoreInCommit(), getCommitActionType());
|
|
||||||
writeMetadata.setCommitMetadata(Option.of(commitMetadata));
|
|
||||||
}
|
|
||||||
return writeMetadata;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Validate actions taken by clustering. In the first implementation, we validate at least one new file is written.
|
|
||||||
* But we can extend this to add more validation. E.g. number of records read = number of records written etc.
|
|
||||||
* We can also make these validations in BaseCommitActionExecutor to reuse pre-commit hooks for multiple actions.
|
|
||||||
*/
|
|
||||||
private void validateWriteResult(HoodieWriteMetadata<List<WriteStatus>> writeMetadata) {
|
|
||||||
if (writeMetadata.getWriteStatuses().isEmpty()) {
|
|
||||||
throw new HoodieClusteringException("Clustering plan produced 0 WriteStatus for " + instantTime
|
|
||||||
+ " #groups: " + clusteringPlan.getInputGroups().size() + " expected at least "
|
|
||||||
+ clusteringPlan.getInputGroups().stream().mapToInt(HoodieClusteringGroup::getNumOutputFileGroups).sum()
|
|
||||||
+ " write statuses");
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected String getCommitActionType() {
|
protected String getCommitActionType() {
|
||||||
return HoodieTimeline.REPLACE_COMMIT_ACTION;
|
return HoodieTimeline.REPLACE_COMMIT_ACTION;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
protected Map<String, List<String>> getPartitionToReplacedFileIds(HoodieWriteMetadata<List<WriteStatus>> writeMetadata) {
|
|
||||||
Set<HoodieFileGroupId> newFilesWritten = writeMetadata.getWriteStats().get().stream()
|
|
||||||
.map(s -> new HoodieFileGroupId(s.getPartitionPath(), s.getFileId())).collect(Collectors.toSet());
|
|
||||||
return ClusteringUtils.getFileGroupsFromClusteringPlan(clusteringPlan)
|
|
||||||
.filter(fg -> !newFilesWritten.contains(fg))
|
|
||||||
.collect(Collectors.groupingBy(fg -> fg.getPartitionPath(), Collectors.mapping(fg -> fg.getFileId(), Collectors.toList())));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -26,6 +26,7 @@ import org.apache.hudi.client.WriteStatus;
|
|||||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||||
import org.apache.hudi.client.utils.ConcatenatingIterator;
|
import org.apache.hudi.client.utils.ConcatenatingIterator;
|
||||||
import org.apache.hudi.common.config.TypedProperties;
|
import org.apache.hudi.common.config.TypedProperties;
|
||||||
|
import org.apache.hudi.common.data.HoodieData;
|
||||||
import org.apache.hudi.common.engine.HoodieEngineContext;
|
import org.apache.hudi.common.engine.HoodieEngineContext;
|
||||||
import org.apache.hudi.common.model.ClusteringOperation;
|
import org.apache.hudi.common.model.ClusteringOperation;
|
||||||
import org.apache.hudi.common.model.HoodieAvroRecord;
|
import org.apache.hudi.common.model.HoodieAvroRecord;
|
||||||
@@ -42,6 +43,7 @@ import org.apache.hudi.common.util.StringUtils;
|
|||||||
import org.apache.hudi.common.util.collection.Pair;
|
import org.apache.hudi.common.util.collection.Pair;
|
||||||
import org.apache.hudi.config.HoodieClusteringConfig;
|
import org.apache.hudi.config.HoodieClusteringConfig;
|
||||||
import org.apache.hudi.config.HoodieWriteConfig;
|
import org.apache.hudi.config.HoodieWriteConfig;
|
||||||
|
import org.apache.hudi.data.HoodieJavaRDD;
|
||||||
import org.apache.hudi.exception.HoodieClusteringException;
|
import org.apache.hudi.exception.HoodieClusteringException;
|
||||||
import org.apache.hudi.exception.HoodieIOException;
|
import org.apache.hudi.exception.HoodieIOException;
|
||||||
import org.apache.hudi.execution.bulkinsert.RDDCustomColumnsSortPartitioner;
|
import org.apache.hudi.execution.bulkinsert.RDDCustomColumnsSortPartitioner;
|
||||||
@@ -82,7 +84,7 @@ import static org.apache.hudi.config.HoodieClusteringConfig.PLAN_STRATEGY_SORT_C
|
|||||||
* Clustering strategy to submit multiple spark jobs and union the results.
|
* Clustering strategy to submit multiple spark jobs and union the results.
|
||||||
*/
|
*/
|
||||||
public abstract class MultipleSparkJobExecutionStrategy<T extends HoodieRecordPayload<T>>
|
public abstract class MultipleSparkJobExecutionStrategy<T extends HoodieRecordPayload<T>>
|
||||||
extends ClusteringExecutionStrategy<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> {
|
extends ClusteringExecutionStrategy<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>> {
|
||||||
private static final Logger LOG = LogManager.getLogger(MultipleSparkJobExecutionStrategy.class);
|
private static final Logger LOG = LogManager.getLogger(MultipleSparkJobExecutionStrategy.class);
|
||||||
|
|
||||||
public MultipleSparkJobExecutionStrategy(HoodieTable table, HoodieEngineContext engineContext, HoodieWriteConfig writeConfig) {
|
public MultipleSparkJobExecutionStrategy(HoodieTable table, HoodieEngineContext engineContext, HoodieWriteConfig writeConfig) {
|
||||||
@@ -90,7 +92,7 @@ public abstract class MultipleSparkJobExecutionStrategy<T extends HoodieRecordPa
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public HoodieWriteMetadata<JavaRDD<WriteStatus>> performClustering(final HoodieClusteringPlan clusteringPlan, final Schema schema, final String instantTime) {
|
public HoodieWriteMetadata<HoodieData<WriteStatus>> performClustering(final HoodieClusteringPlan clusteringPlan, final Schema schema, final String instantTime) {
|
||||||
JavaSparkContext engineContext = HoodieSparkEngineContext.getSparkContext(getEngineContext());
|
JavaSparkContext engineContext = HoodieSparkEngineContext.getSparkContext(getEngineContext());
|
||||||
// execute clustering for each group async and collect WriteStatus
|
// execute clustering for each group async and collect WriteStatus
|
||||||
Stream<JavaRDD<WriteStatus>> writeStatusRDDStream = FutureUtils.allOf(
|
Stream<JavaRDD<WriteStatus>> writeStatusRDDStream = FutureUtils.allOf(
|
||||||
@@ -105,8 +107,8 @@ public abstract class MultipleSparkJobExecutionStrategy<T extends HoodieRecordPa
|
|||||||
JavaRDD<WriteStatus>[] writeStatuses = convertStreamToArray(writeStatusRDDStream);
|
JavaRDD<WriteStatus>[] writeStatuses = convertStreamToArray(writeStatusRDDStream);
|
||||||
JavaRDD<WriteStatus> writeStatusRDD = engineContext.union(writeStatuses);
|
JavaRDD<WriteStatus> writeStatusRDD = engineContext.union(writeStatuses);
|
||||||
|
|
||||||
HoodieWriteMetadata<JavaRDD<WriteStatus>> writeMetadata = new HoodieWriteMetadata<>();
|
HoodieWriteMetadata<HoodieData<WriteStatus>> writeMetadata = new HoodieWriteMetadata<>();
|
||||||
writeMetadata.setWriteStatuses(writeStatusRDD);
|
writeMetadata.setWriteStatuses(HoodieJavaRDD.of(writeStatusRDD));
|
||||||
return writeMetadata;
|
return writeMetadata;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -25,6 +25,7 @@ import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
|||||||
import org.apache.hudi.client.utils.ConcatenatingIterator;
|
import org.apache.hudi.client.utils.ConcatenatingIterator;
|
||||||
import org.apache.hudi.common.config.SerializableSchema;
|
import org.apache.hudi.common.config.SerializableSchema;
|
||||||
import org.apache.hudi.common.config.TypedProperties;
|
import org.apache.hudi.common.config.TypedProperties;
|
||||||
|
import org.apache.hudi.common.data.HoodieData;
|
||||||
import org.apache.hudi.common.engine.HoodieEngineContext;
|
import org.apache.hudi.common.engine.HoodieEngineContext;
|
||||||
import org.apache.hudi.common.engine.TaskContextSupplier;
|
import org.apache.hudi.common.engine.TaskContextSupplier;
|
||||||
import org.apache.hudi.common.model.ClusteringGroupInfo;
|
import org.apache.hudi.common.model.ClusteringGroupInfo;
|
||||||
@@ -37,6 +38,7 @@ import org.apache.hudi.common.model.HoodieRecordPayload;
|
|||||||
import org.apache.hudi.common.model.RewriteAvroPayload;
|
import org.apache.hudi.common.model.RewriteAvroPayload;
|
||||||
import org.apache.hudi.common.util.Option;
|
import org.apache.hudi.common.util.Option;
|
||||||
import org.apache.hudi.config.HoodieWriteConfig;
|
import org.apache.hudi.config.HoodieWriteConfig;
|
||||||
|
import org.apache.hudi.data.HoodieJavaRDD;
|
||||||
import org.apache.hudi.exception.HoodieClusteringException;
|
import org.apache.hudi.exception.HoodieClusteringException;
|
||||||
import org.apache.hudi.exception.HoodieIOException;
|
import org.apache.hudi.exception.HoodieIOException;
|
||||||
import org.apache.hudi.io.storage.HoodieFileReaderFactory;
|
import org.apache.hudi.io.storage.HoodieFileReaderFactory;
|
||||||
@@ -71,7 +73,7 @@ import java.util.stream.StreamSupport;
|
|||||||
* MultipleSparkJobExecution strategy is not ideal for use cases that require large number of clustering groups
|
* MultipleSparkJobExecution strategy is not ideal for use cases that require large number of clustering groups
|
||||||
*/
|
*/
|
||||||
public abstract class SingleSparkJobExecutionStrategy<T extends HoodieRecordPayload<T>>
|
public abstract class SingleSparkJobExecutionStrategy<T extends HoodieRecordPayload<T>>
|
||||||
extends ClusteringExecutionStrategy<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> {
|
extends ClusteringExecutionStrategy<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>> {
|
||||||
private static final Logger LOG = LogManager.getLogger(SingleSparkJobExecutionStrategy.class);
|
private static final Logger LOG = LogManager.getLogger(SingleSparkJobExecutionStrategy.class);
|
||||||
|
|
||||||
public SingleSparkJobExecutionStrategy(HoodieTable table, HoodieEngineContext engineContext, HoodieWriteConfig writeConfig) {
|
public SingleSparkJobExecutionStrategy(HoodieTable table, HoodieEngineContext engineContext, HoodieWriteConfig writeConfig) {
|
||||||
@@ -79,7 +81,7 @@ public abstract class SingleSparkJobExecutionStrategy<T extends HoodieRecordPayl
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public HoodieWriteMetadata<JavaRDD<WriteStatus>> performClustering(final HoodieClusteringPlan clusteringPlan, final Schema schema, final String instantTime) {
|
public HoodieWriteMetadata<HoodieData<WriteStatus>> performClustering(final HoodieClusteringPlan clusteringPlan, final Schema schema, final String instantTime) {
|
||||||
JavaSparkContext engineContext = HoodieSparkEngineContext.getSparkContext(getEngineContext());
|
JavaSparkContext engineContext = HoodieSparkEngineContext.getSparkContext(getEngineContext());
|
||||||
final TaskContextSupplier taskContextSupplier = getEngineContext().getTaskContextSupplier();
|
final TaskContextSupplier taskContextSupplier = getEngineContext().getTaskContextSupplier();
|
||||||
final SerializableSchema serializableSchema = new SerializableSchema(schema);
|
final SerializableSchema serializableSchema = new SerializableSchema(schema);
|
||||||
@@ -104,8 +106,8 @@ public abstract class SingleSparkJobExecutionStrategy<T extends HoodieRecordPayl
|
|||||||
).iterator();
|
).iterator();
|
||||||
});
|
});
|
||||||
|
|
||||||
HoodieWriteMetadata<JavaRDD<WriteStatus>> writeMetadata = new HoodieWriteMetadata<>();
|
HoodieWriteMetadata<HoodieData<WriteStatus>> writeMetadata = new HoodieWriteMetadata<>();
|
||||||
writeMetadata.setWriteStatuses(writeStatusRDD);
|
writeMetadata.setWriteStatuses(HoodieJavaRDD.of(writeStatusRDD));
|
||||||
return writeMetadata;
|
return writeMetadata;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -59,7 +59,7 @@ import org.apache.hudi.table.action.bootstrap.HoodieBootstrapWriteMetadata;
|
|||||||
import org.apache.hudi.table.action.bootstrap.SparkBootstrapCommitActionExecutor;
|
import org.apache.hudi.table.action.bootstrap.SparkBootstrapCommitActionExecutor;
|
||||||
import org.apache.hudi.table.action.clean.CleanActionExecutor;
|
import org.apache.hudi.table.action.clean.CleanActionExecutor;
|
||||||
import org.apache.hudi.table.action.clean.CleanPlanActionExecutor;
|
import org.apache.hudi.table.action.clean.CleanPlanActionExecutor;
|
||||||
import org.apache.hudi.table.action.cluster.SparkClusteringPlanActionExecutor;
|
import org.apache.hudi.table.action.cluster.ClusteringPlanActionExecutor;
|
||||||
import org.apache.hudi.table.action.cluster.SparkExecuteClusteringCommitActionExecutor;
|
import org.apache.hudi.table.action.cluster.SparkExecuteClusteringCommitActionExecutor;
|
||||||
import org.apache.hudi.table.action.commit.SparkBulkInsertCommitActionExecutor;
|
import org.apache.hudi.table.action.commit.SparkBulkInsertCommitActionExecutor;
|
||||||
import org.apache.hudi.table.action.commit.SparkBulkInsertPreppedCommitActionExecutor;
|
import org.apache.hudi.table.action.commit.SparkBulkInsertPreppedCommitActionExecutor;
|
||||||
@@ -244,7 +244,7 @@ public class HoodieSparkCopyOnWriteTable<T extends HoodieRecordPayload>
|
|||||||
public Option<HoodieClusteringPlan> scheduleClustering(HoodieEngineContext context,
|
public Option<HoodieClusteringPlan> scheduleClustering(HoodieEngineContext context,
|
||||||
String instantTime,
|
String instantTime,
|
||||||
Option<Map<String, String>> extraMetadata) {
|
Option<Map<String, String>> extraMetadata) {
|
||||||
return new SparkClusteringPlanActionExecutor<>(context, config,this, instantTime, extraMetadata).execute();
|
return new ClusteringPlanActionExecutor<>(context, config,this, instantTime, extraMetadata).execute();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|||||||
@@ -1,44 +0,0 @@
|
|||||||
/*
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one
|
|
||||||
* or more contributor license agreements. See the NOTICE file
|
|
||||||
* distributed with this work for additional information
|
|
||||||
* regarding copyright ownership. The ASF licenses this file
|
|
||||||
* to you under the Apache License, Version 2.0 (the
|
|
||||||
* "License"); you may not use this file except in compliance
|
|
||||||
* with the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.apache.hudi.table.action.cluster;
|
|
||||||
|
|
||||||
import org.apache.hudi.client.WriteStatus;
|
|
||||||
import org.apache.hudi.common.engine.HoodieEngineContext;
|
|
||||||
import org.apache.hudi.common.model.HoodieKey;
|
|
||||||
import org.apache.hudi.common.model.HoodieRecord;
|
|
||||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
|
||||||
import org.apache.hudi.common.util.Option;
|
|
||||||
import org.apache.hudi.config.HoodieWriteConfig;
|
|
||||||
import org.apache.hudi.table.HoodieTable;
|
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
|
||||||
|
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
@SuppressWarnings("checkstyle:LineLength")
|
|
||||||
public class SparkClusteringPlanActionExecutor<T extends HoodieRecordPayload> extends
|
|
||||||
BaseClusteringPlanActionExecutor<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> {
|
|
||||||
|
|
||||||
public SparkClusteringPlanActionExecutor(HoodieEngineContext context,
|
|
||||||
HoodieWriteConfig config,
|
|
||||||
HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> table,
|
|
||||||
String instantTime,
|
|
||||||
Option<Map<String, String>> extraMetadata) {
|
|
||||||
super(context, config, table, instantTime, extraMetadata);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -18,111 +18,48 @@
|
|||||||
|
|
||||||
package org.apache.hudi.table.action.cluster;
|
package org.apache.hudi.table.action.cluster;
|
||||||
|
|
||||||
import org.apache.hudi.avro.HoodieAvroUtils;
|
|
||||||
import org.apache.hudi.avro.model.HoodieClusteringGroup;
|
|
||||||
import org.apache.hudi.avro.model.HoodieClusteringPlan;
|
import org.apache.hudi.avro.model.HoodieClusteringPlan;
|
||||||
import org.apache.hudi.client.WriteStatus;
|
import org.apache.hudi.client.WriteStatus;
|
||||||
import org.apache.hudi.client.clustering.run.strategy.SparkSingleFileSortExecutionStrategy;
|
import org.apache.hudi.common.data.HoodieData;
|
||||||
import org.apache.hudi.common.engine.HoodieEngineContext;
|
import org.apache.hudi.common.engine.HoodieEngineContext;
|
||||||
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
|
||||||
import org.apache.hudi.common.model.HoodieFileGroupId;
|
|
||||||
import org.apache.hudi.common.model.HoodieKey;
|
|
||||||
import org.apache.hudi.common.model.HoodieRecord;
|
|
||||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||||
import org.apache.hudi.common.model.WriteOperationType;
|
import org.apache.hudi.common.model.WriteOperationType;
|
||||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
|
||||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||||
import org.apache.hudi.common.util.ClusteringUtils;
|
import org.apache.hudi.common.util.ClusteringUtils;
|
||||||
import org.apache.hudi.common.util.CommitUtils;
|
|
||||||
import org.apache.hudi.common.util.Option;
|
|
||||||
import org.apache.hudi.common.util.ReflectionUtils;
|
|
||||||
import org.apache.hudi.common.util.collection.Pair;
|
import org.apache.hudi.common.util.collection.Pair;
|
||||||
import org.apache.hudi.config.HoodieWriteConfig;
|
import org.apache.hudi.config.HoodieWriteConfig;
|
||||||
|
import org.apache.hudi.data.HoodieJavaRDD;
|
||||||
import org.apache.hudi.exception.HoodieClusteringException;
|
import org.apache.hudi.exception.HoodieClusteringException;
|
||||||
import org.apache.hudi.table.HoodieTable;
|
import org.apache.hudi.table.HoodieTable;
|
||||||
import org.apache.hudi.table.action.HoodieWriteMetadata;
|
import org.apache.hudi.table.action.HoodieWriteMetadata;
|
||||||
import org.apache.hudi.table.action.cluster.strategy.ClusteringExecutionStrategy;
|
|
||||||
import org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor;
|
import org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor;
|
||||||
|
|
||||||
import org.apache.avro.Schema;
|
|
||||||
import org.apache.log4j.LogManager;
|
|
||||||
import org.apache.log4j.Logger;
|
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Set;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
public class SparkExecuteClusteringCommitActionExecutor<T extends HoodieRecordPayload<T>>
|
public class SparkExecuteClusteringCommitActionExecutor<T extends HoodieRecordPayload<T>>
|
||||||
extends BaseSparkCommitActionExecutor<T> {
|
extends BaseSparkCommitActionExecutor<T> {
|
||||||
|
|
||||||
private static final Logger LOG = LogManager.getLogger(SparkExecuteClusteringCommitActionExecutor.class);
|
|
||||||
private final HoodieClusteringPlan clusteringPlan;
|
private final HoodieClusteringPlan clusteringPlan;
|
||||||
|
|
||||||
public SparkExecuteClusteringCommitActionExecutor(HoodieEngineContext context,
|
public SparkExecuteClusteringCommitActionExecutor(HoodieEngineContext context,
|
||||||
HoodieWriteConfig config, HoodieTable table,
|
HoodieWriteConfig config, HoodieTable table,
|
||||||
String instantTime) {
|
String instantTime) {
|
||||||
super(context, config, table, instantTime, WriteOperationType.CLUSTER);
|
super(context, config, table, instantTime, WriteOperationType.CLUSTER);
|
||||||
this.clusteringPlan = ClusteringUtils.getClusteringPlan(table.getMetaClient(), HoodieTimeline.getReplaceCommitRequestedInstant(instantTime))
|
this.clusteringPlan = ClusteringUtils.getClusteringPlan(
|
||||||
.map(Pair::getRight).orElseThrow(() -> new HoodieClusteringException("Unable to read clustering plan for instant: " + instantTime));
|
table.getMetaClient(), HoodieTimeline.getReplaceCommitRequestedInstant(instantTime))
|
||||||
|
.map(Pair::getRight).orElseThrow(() -> new HoodieClusteringException(
|
||||||
|
"Unable to read clustering plan for instant: " + instantTime));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public HoodieWriteMetadata<JavaRDD<WriteStatus>> execute() {
|
public HoodieWriteMetadata<JavaRDD<WriteStatus>> execute() {
|
||||||
HoodieInstant instant = HoodieTimeline.getReplaceCommitRequestedInstant(instantTime);
|
HoodieWriteMetadata<HoodieData<WriteStatus>> writeMetadata = executeClustering(clusteringPlan);
|
||||||
// Mark instant as clustering inflight
|
JavaRDD<WriteStatus> transformedWriteStatuses = HoodieJavaRDD.getJavaRDD(writeMetadata.getWriteStatuses());
|
||||||
table.getActiveTimeline().transitionReplaceRequestedToInflight(instant, Option.empty());
|
return writeMetadata.clone(transformedWriteStatuses);
|
||||||
table.getMetaClient().reloadActiveTimeline();
|
|
||||||
|
|
||||||
final Schema schema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()));
|
|
||||||
HoodieWriteMetadata<JavaRDD<WriteStatus>> writeMetadata = ((ClusteringExecutionStrategy<T, JavaRDD<HoodieRecord<? extends HoodieRecordPayload>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>>)
|
|
||||||
ReflectionUtils.loadClass(config.getClusteringExecutionStrategyClass(),
|
|
||||||
new Class<?>[] {HoodieTable.class, HoodieEngineContext.class, HoodieWriteConfig.class}, table, context, config))
|
|
||||||
.performClustering(clusteringPlan, schema, instantTime);
|
|
||||||
JavaRDD<WriteStatus> writeStatusRDD = writeMetadata.getWriteStatuses();
|
|
||||||
JavaRDD<WriteStatus> statuses = updateIndex(writeStatusRDD, writeMetadata);
|
|
||||||
writeMetadata.setWriteStats(statuses.map(WriteStatus::getStat).collect());
|
|
||||||
writeMetadata.setPartitionToReplaceFileIds(getPartitionToReplacedFileIds(writeMetadata));
|
|
||||||
commitOnAutoCommit(writeMetadata);
|
|
||||||
if (!writeMetadata.getCommitMetadata().isPresent()) {
|
|
||||||
HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(writeMetadata.getWriteStats().get(), writeMetadata.getPartitionToReplaceFileIds(),
|
|
||||||
extraMetadata, operationType, getSchemaToStoreInCommit(), getCommitActionType());
|
|
||||||
writeMetadata.setCommitMetadata(Option.of(commitMetadata));
|
|
||||||
}
|
|
||||||
return writeMetadata;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Validate actions taken by clustering. In the first implementation, we validate at least one new file is written.
|
|
||||||
* But we can extend this to add more validation. E.g. number of records read = number of records written etc.
|
|
||||||
* We can also make these validations in BaseCommitActionExecutor to reuse pre-commit hooks for multiple actions.
|
|
||||||
*/
|
|
||||||
private void validateWriteResult(HoodieWriteMetadata<JavaRDD<WriteStatus>> writeMetadata) {
|
|
||||||
if (writeMetadata.getWriteStatuses().isEmpty()) {
|
|
||||||
throw new HoodieClusteringException("Clustering plan produced 0 WriteStatus for " + instantTime
|
|
||||||
+ " #groups: " + clusteringPlan.getInputGroups().size() + " expected at least "
|
|
||||||
+ clusteringPlan.getInputGroups().stream().mapToInt(HoodieClusteringGroup::getNumOutputFileGroups).sum()
|
|
||||||
+ " write statuses");
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected String getCommitActionType() {
|
protected String getCommitActionType() {
|
||||||
return HoodieTimeline.REPLACE_COMMIT_ACTION;
|
return HoodieTimeline.REPLACE_COMMIT_ACTION;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
protected Map<String, List<String>> getPartitionToReplacedFileIds(HoodieWriteMetadata<JavaRDD<WriteStatus>> writeMetadata) {
|
|
||||||
Set<HoodieFileGroupId> newFilesWritten = writeMetadata.getWriteStats().get().stream()
|
|
||||||
.map(s -> new HoodieFileGroupId(s.getPartitionPath(), s.getFileId())).collect(Collectors.toSet());
|
|
||||||
// for the below execution strategy, new file group id would be same as old file group id
|
|
||||||
if (SparkSingleFileSortExecutionStrategy.class.getName().equals(config.getClusteringExecutionStrategyClass())) {
|
|
||||||
return ClusteringUtils.getFileGroupsFromClusteringPlan(clusteringPlan)
|
|
||||||
.collect(Collectors.groupingBy(fg -> fg.getPartitionPath(), Collectors.mapping(fg -> fg.getFileId(), Collectors.toList())));
|
|
||||||
}
|
|
||||||
return ClusteringUtils.getFileGroupsFromClusteringPlan(clusteringPlan)
|
|
||||||
.filter(fg -> !newFilesWritten.contains(fg))
|
|
||||||
.collect(Collectors.groupingBy(fg -> fg.getPartitionPath(), Collectors.mapping(fg -> fg.getFileId(), Collectors.toList())));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user