From b50f4b491c969675d00cec9655f70855de4db88a Mon Sep 17 00:00:00 2001
From: Raymond Xu <2701446+xushiyan@users.noreply.github.com>
Date: Fri, 25 Feb 2022 05:39:43 -0800
Subject: [PATCH] [HUDI-3042] Refactor clustering executors (#4847)

---
 ...java => ClusteringPlanActionExecutor.java} |  6 +-
 .../commit/BaseCommitActionExecutor.java      | 76 +++++++++++++++++
 .../run/strategy/JavaExecutionStrategy.java   | 10 ++-
 .../table/HoodieJavaCopyOnWriteTable.java     |  7 +-
 .../JavaClusteringPlanActionExecutor.java     | 43 ----------
 ...ExecuteClusteringCommitActionExecutor.java | 73 ++---------------
 .../MultipleSparkJobExecutionStrategy.java    | 10 ++-
 .../SingleSparkJobExecutionStrategy.java      | 10 ++-
 .../table/HoodieSparkCopyOnWriteTable.java    |  4 +-
 .../SparkClusteringPlanActionExecutor.java    | 44 ----------
 ...ExecuteClusteringCommitActionExecutor.java | 81 +++---------------
 11 files changed, 120 insertions(+), 244 deletions(-)
 rename hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/{BaseClusteringPlanActionExecutor.java => ClusteringPlanActionExecutor.java} (93%)
 delete mode 100644 hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/cluster/JavaClusteringPlanActionExecutor.java
 delete mode 100644 hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/cluster/SparkClusteringPlanActionExecutor.java

diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/BaseClusteringPlanActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/ClusteringPlanActionExecutor.java
similarity index 93%
rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/BaseClusteringPlanActionExecutor.java
rename to hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/ClusteringPlanActionExecutor.java
index a1820ed93..15ead5efb 100644
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/BaseClusteringPlanActionExecutor.java
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/ClusteringPlanActionExecutor.java
@@ -41,13 +41,13 @@ import java.io.IOException;
 import java.util.Collections;
 import java.util.Map;
 
-public abstract class BaseClusteringPlanActionExecutor<T extends HoodieRecordPayload, I, K, O> extends BaseActionExecutor<T, I, K, O, Option<HoodieClusteringPlan>> {
+public class ClusteringPlanActionExecutor<T extends HoodieRecordPayload, I, K, O> extends BaseActionExecutor<T, I, K, O, Option<HoodieClusteringPlan>> {
 
-  private static final Logger LOG = LogManager.getLogger(BaseClusteringPlanActionExecutor.class);
+  private static final Logger LOG = LogManager.getLogger(ClusteringPlanActionExecutor.class);
 
   private final Option<Map<String, String>> extraMetadata;
 
-  public BaseClusteringPlanActionExecutor(HoodieEngineContext context,
+  public ClusteringPlanActionExecutor(HoodieEngineContext context,
                                       HoodieWriteConfig config,
                                       HoodieTable<T, I, K, O> table,
                                       String instantTime,
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseCommitActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseCommitActionExecutor.java
index 351f95ef5..a9cb03a2b 100644
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseCommitActionExecutor.java
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseCommitActionExecutor.java
@@ -18,12 +18,18 @@
 
 package org.apache.hudi.table.action.commit;
 
+import org.apache.hudi.avro.HoodieAvroUtils;
+import org.apache.hudi.avro.model.HoodieClusteringGroup;
+import org.apache.hudi.avro.model.HoodieClusteringPlan;
 import org.apache.hudi.client.WriteStatus;
 import org.apache.hudi.client.transaction.TransactionManager;
 import org.apache.hudi.client.utils.TransactionUtils;
+import org.apache.hudi.common.data.HoodieData;
 import org.apache.hudi.common.engine.HoodieEngineContext;
 import org.apache.hudi.common.engine.TaskContextSupplier;
 import org.apache.hudi.common.model.HoodieCommitMetadata;
+import org.apache.hudi.common.model.HoodieFileGroupId;
+import org.apache.hudi.common.model.HoodieKey;
 import org.apache.hudi.common.model.HoodieRecord;
 import org.apache.hudi.common.model.HoodieRecordPayload;
 import org.apache.hudi.common.model.HoodieWriteStat;
@@ -31,10 +37,15 @@ import org.apache.hudi.common.model.WriteOperationType;
 import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
 import org.apache.hudi.common.table.timeline.HoodieInstant;
 import org.apache.hudi.common.table.timeline.HoodieInstant.State;
+import org.apache.hudi.common.table.timeline.HoodieTimeline;
+import org.apache.hudi.common.util.ClusteringUtils;
+import org.apache.hudi.common.util.CommitUtils;
 import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.ReflectionUtils;
 import org.apache.hudi.common.util.StringUtils;
 import org.apache.hudi.common.util.collection.Pair;
 import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.hudi.exception.HoodieClusteringException;
 import org.apache.hudi.exception.HoodieCommitException;
 import org.apache.hudi.exception.HoodieIOException;
 import org.apache.hudi.table.HoodieTable;
@@ -42,7 +53,9 @@ import org.apache.hudi.table.WorkloadProfile;
 import org.apache.hudi.table.WorkloadStat;
 import org.apache.hudi.table.action.BaseActionExecutor;
 import org.apache.hudi.table.action.HoodieWriteMetadata;
+import org.apache.hudi.table.action.cluster.strategy.ClusteringExecutionStrategy;
 
+import org.apache.avro.Schema;
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
 
@@ -53,6 +66,8 @@ import java.time.Instant;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
+import java.util.stream.Collectors;
 
 public abstract class BaseCommitActionExecutor<T extends HoodieRecordPayload, I, K, O, R> extends BaseActionExecutor<T, I, K, O, R> {
 
@@ -200,4 +215,65 @@ public abstract class BaseCommitActionExecutor<T extends HoodieRecordPayload, I
 
   protected abstract Iterator<List<WriteStatus>> handleUpdate(String partitionPath, String fileId, Iterator<HoodieRecord<T>> recordItr) throws IOException;
+
+  protected HoodieWriteMetadata<HoodieData<WriteStatus>> executeClustering(HoodieClusteringPlan clusteringPlan) {
+    HoodieInstant instant = HoodieTimeline.getReplaceCommitRequestedInstant(instantTime);
+    // Mark instant as clustering inflight
+    table.getActiveTimeline().transitionReplaceRequestedToInflight(instant, Option.empty());
+    table.getMetaClient().reloadActiveTimeline();
+
+    final Schema schema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()));
+    HoodieWriteMetadata<HoodieData<WriteStatus>> writeMetadata = (
+        (ClusteringExecutionStrategy<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>>)
+            ReflectionUtils.loadClass(config.getClusteringExecutionStrategyClass(),
+                new Class<?>[] {HoodieTable.class, HoodieEngineContext.class, HoodieWriteConfig.class}, table, context, config))
+        .performClustering(clusteringPlan, schema, instantTime);
+    HoodieData<WriteStatus> writeStatusList = writeMetadata.getWriteStatuses();
+    HoodieData<WriteStatus> statuses = updateIndex(writeStatusList, writeMetadata);
+    writeMetadata.setWriteStats(statuses.map(WriteStatus::getStat).collectAsList());
+    writeMetadata.setPartitionToReplaceFileIds(getPartitionToReplacedFileIds(clusteringPlan, writeMetadata));
+    validateWriteResult(clusteringPlan, writeMetadata);
+    commitOnAutoCommit(writeMetadata);
+    if (!writeMetadata.getCommitMetadata().isPresent()) {
+      HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(writeMetadata.getWriteStats().get(), writeMetadata.getPartitionToReplaceFileIds(),
+          extraMetadata, operationType, getSchemaToStoreInCommit(), getCommitActionType());
+      writeMetadata.setCommitMetadata(Option.of(commitMetadata));
+    }
+    return writeMetadata;
+  }
+
+  private HoodieData<WriteStatus> updateIndex(HoodieData<WriteStatus> writeStatuses, HoodieWriteMetadata<HoodieData<WriteStatus>> result) {
+    Instant indexStartTime = Instant.now();
+    // Update the index back
+    HoodieData<WriteStatus> statuses = table.getIndex().updateLocation(writeStatuses, context, table);
+    result.setIndexUpdateDuration(Duration.between(indexStartTime, Instant.now()));
+    result.setWriteStatuses(statuses);
+    return statuses;
+  }
+
+  private Map<String, List<String>> getPartitionToReplacedFileIds(HoodieClusteringPlan clusteringPlan, HoodieWriteMetadata<HoodieData<WriteStatus>> writeMetadata) {
+    Set<HoodieFileGroupId> newFilesWritten = writeMetadata.getWriteStats().get().stream()
+        .map(s -> new HoodieFileGroupId(s.getPartitionPath(), s.getFileId())).collect(Collectors.toSet());
+
+    return ClusteringUtils.getFileGroupsFromClusteringPlan(clusteringPlan)
+        .filter(fg -> "org.apache.hudi.client.clustering.run.strategy.SparkSingleFileSortExecutionStrategy"
+            .equals(config.getClusteringExecutionStrategyClass())
+            || !newFilesWritten.contains(fg))
+        .collect(Collectors.groupingBy(HoodieFileGroupId::getPartitionPath, Collectors.mapping(HoodieFileGroupId::getFileId, Collectors.toList())));
+  }
+
+  /**
+   * Validate actions taken by clustering. In the first implementation, we validate at least one new file is written.
+   * But we can extend this to add more validation. E.g. number of records read = number of records written etc.
+   * We can also make these validations in BaseCommitActionExecutor to reuse pre-commit hooks for multiple actions.
+   */
+  private void validateWriteResult(HoodieClusteringPlan clusteringPlan, HoodieWriteMetadata<HoodieData<WriteStatus>> writeMetadata) {
+    if (writeMetadata.getWriteStatuses().isEmpty()) {
+      throw new HoodieClusteringException("Clustering plan produced 0 WriteStatus for " + instantTime
+          + " #groups: " + clusteringPlan.getInputGroups().size() + " expected at least "
+          + clusteringPlan.getInputGroups().stream().mapToInt(HoodieClusteringGroup::getNumOutputFileGroups).sum()
+          + " write statuses");
+    }
+  }
 }
diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/JavaExecutionStrategy.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/JavaExecutionStrategy.java
index 8cbe0d7fa..7d7609f0f 100644
--- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/JavaExecutionStrategy.java
+++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/JavaExecutionStrategy.java
@@ -24,6 +24,8 @@ import org.apache.hudi.avro.model.HoodieClusteringGroup;
 import org.apache.hudi.avro.model.HoodieClusteringPlan;
 import org.apache.hudi.client.WriteStatus;
 import org.apache.hudi.client.common.JavaTaskContextSupplier;
+import org.apache.hudi.common.data.HoodieData;
+import org.apache.hudi.common.data.HoodieList;
 import org.apache.hudi.common.engine.HoodieEngineContext;
 import org.apache.hudi.common.model.ClusteringOperation;
 import org.apache.hudi.common.model.HoodieAvroRecord;
@@ -71,7 +73,7 @@ import static org.apache.hudi.config.HoodieClusteringConfig.PLAN_STRATEGY_SORT_C
  * Clustering strategy for Java engine.
  */
 public abstract class JavaExecutionStrategy<T extends HoodieRecordPayload<T>>
-    extends ClusteringExecutionStrategy<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> {
+    extends ClusteringExecutionStrategy<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>> {
 
   private static final Logger LOG = LogManager.getLogger(JavaExecutionStrategy.class);
 
@@ -81,7 +83,7 @@ public abstract class JavaExecutionStrategy<T extends HoodieRecordPayload<T>>
   }
 
   @Override
-  public HoodieWriteMetadata<List<WriteStatus>> performClustering(
+  public HoodieWriteMetadata<HoodieData<WriteStatus>> performClustering(
       HoodieClusteringPlan clusteringPlan, Schema schema, String instantTime) {
     // execute clustering for each group and collect WriteStatus
     List<WriteStatus> writeStatusList = new ArrayList<>();
@@ -90,8 +92,8 @@ public abstract class JavaExecutionStrategy<T extends HoodieRecordPayload<T>>
         inputGroup, clusteringPlan.getStrategy().getStrategyParams(),
         Option.ofNullable(clusteringPlan.getPreserveHoodieMetadata()).orElse(false), instantTime)));
-    HoodieWriteMetadata<List<WriteStatus>> writeMetadata = new HoodieWriteMetadata<>();
-    writeMetadata.setWriteStatuses(writeStatusList);
+    HoodieWriteMetadata<HoodieData<WriteStatus>> writeMetadata = new HoodieWriteMetadata<>();
+    writeMetadata.setWriteStatuses(HoodieList.of(writeStatusList));
     return writeMetadata;
   }
 
diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/HoodieJavaCopyOnWriteTable.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/HoodieJavaCopyOnWriteTable.java
index f8590e9bd..447ed3e96 100644
--- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/HoodieJavaCopyOnWriteTable.java
+++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/HoodieJavaCopyOnWriteTable.java
@@ -49,7 +49,7 @@ import org.apache.hudi.table.action.HoodieWriteMetadata;
 import org.apache.hudi.table.action.bootstrap.HoodieBootstrapWriteMetadata;
 import org.apache.hudi.table.action.clean.CleanActionExecutor;
 import org.apache.hudi.table.action.clean.CleanPlanActionExecutor;
-import org.apache.hudi.table.action.cluster.JavaClusteringPlanActionExecutor;
+import org.apache.hudi.table.action.cluster.ClusteringPlanActionExecutor;
 import org.apache.hudi.table.action.cluster.JavaExecuteClusteringCommitActionExecutor;
 import org.apache.hudi.table.action.commit.JavaBulkInsertCommitActionExecutor;
 import org.apache.hudi.table.action.commit.JavaBulkInsertPreppedCommitActionExecutor;
@@ -70,10 +70,11 @@ import org.apache.hudi.table.action.savepoint.SavepointActionExecutor;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import javax.annotation.Nonnull;
+
 import java.io.IOException;
 import java.util.Collections;
 import java.util.Iterator;
-import javax.annotation.Nonnull;
 import java.util.List;
 import java.util.Map;
 
@@ -192,7 +193,7 @@ public class HoodieJavaCopyOnWriteTable
 
   @Override
   public Option<HoodieClusteringPlan> scheduleClustering(final HoodieEngineContext context, final String instantTime, final Option<Map<String, String>> extraMetadata) {
-    return new JavaClusteringPlanActionExecutor<>(context, config, this, instantTime, extraMetadata).execute();
+    return new ClusteringPlanActionExecutor<>(context, config, this, instantTime, extraMetadata).execute();
   }
 
   @Override
diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/cluster/JavaClusteringPlanActionExecutor.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/cluster/JavaClusteringPlanActionExecutor.java
deleted file mode 100644
index 1d78ecc2b..000000000
--- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/cluster/JavaClusteringPlanActionExecutor.java
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.hudi.table.action.cluster;
-
-import org.apache.hudi.client.WriteStatus;
-import org.apache.hudi.common.engine.HoodieEngineContext;
-import org.apache.hudi.common.model.HoodieKey;
-import org.apache.hudi.common.model.HoodieRecord;
-import org.apache.hudi.common.model.HoodieRecordPayload;
-import org.apache.hudi.common.util.Option;
-import org.apache.hudi.config.HoodieWriteConfig;
-import org.apache.hudi.table.HoodieTable;
-
-import java.util.List;
-import java.util.Map;
-
-public class JavaClusteringPlanActionExecutor<T extends HoodieRecordPayload<T>> extends
-    BaseClusteringPlanActionExecutor<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> {
-
-  public JavaClusteringPlanActionExecutor(
-      HoodieEngineContext context, HoodieWriteConfig config,
-      HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> table,
-      String instantTime, Option<Map<String, String>> extraMetadata) {
-    super(context, config, table, instantTime, extraMetadata);
-  }
-}
diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/cluster/JavaExecuteClusteringCommitActionExecutor.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/cluster/JavaExecuteClusteringCommitActionExecutor.java
index 83364bdc3..168d55814 100644
--- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/cluster/JavaExecuteClusteringCommitActionExecutor.java
+++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/cluster/JavaExecuteClusteringCommitActionExecutor.java
@@ -19,46 +19,32 @@
 
 package org.apache.hudi.table.action.cluster;
 
-import org.apache.hudi.avro.HoodieAvroUtils;
-import org.apache.hudi.avro.model.HoodieClusteringGroup;
 import org.apache.hudi.avro.model.HoodieClusteringPlan;
 import org.apache.hudi.client.WriteStatus;
+import org.apache.hudi.common.data.HoodieData;
 import org.apache.hudi.common.engine.HoodieEngineContext;
-import org.apache.hudi.common.model.HoodieCommitMetadata;
-import org.apache.hudi.common.model.HoodieFileGroupId;
-import org.apache.hudi.common.model.HoodieKey;
-import org.apache.hudi.common.model.HoodieRecord;
 import org.apache.hudi.common.model.HoodieRecordPayload;
 import org.apache.hudi.common.model.WriteOperationType;
-import org.apache.hudi.common.table.timeline.HoodieInstant;
 import org.apache.hudi.common.table.timeline.HoodieTimeline;
 import org.apache.hudi.common.util.ClusteringUtils;
-import org.apache.hudi.common.util.CommitUtils;
-import org.apache.hudi.common.util.Option;
-import org.apache.hudi.common.util.ReflectionUtils;
 import org.apache.hudi.common.util.collection.Pair;
 import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.hudi.exception.HoodieClusteringException;
 import org.apache.hudi.table.HoodieTable;
 import org.apache.hudi.table.action.HoodieWriteMetadata;
-import org.apache.hudi.table.action.cluster.strategy.ClusteringExecutionStrategy;
 import org.apache.hudi.table.action.commit.BaseJavaCommitActionExecutor;
 
-import org.apache.avro.Schema;
-
 import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.stream.Collectors;
 
 public class JavaExecuteClusteringCommitActionExecutor<T extends HoodieRecordPayload<T>>
     extends BaseJavaCommitActionExecutor<T> {
 
   private final HoodieClusteringPlan clusteringPlan;
 
-  public JavaExecuteClusteringCommitActionExecutor(
-      HoodieEngineContext context, HoodieWriteConfig config, HoodieTable table,
-      String instantTime) {
+  public JavaExecuteClusteringCommitActionExecutor(HoodieEngineContext context,
+                                                   HoodieWriteConfig config,
+                                                   HoodieTable table,
+                                                   String instantTime) {
     super(context, config, table, instantTime, WriteOperationType.CLUSTER);
     this.clusteringPlan = ClusteringUtils.getClusteringPlan(
         table.getMetaClient(), HoodieTimeline.getReplaceCommitRequestedInstant(instantTime))
@@ -68,56 +54,13 @@ public class JavaExecuteClusteringCommitActionExecutor<T extends HoodieRecordPa
 
   @Override
   public HoodieWriteMetadata<List<WriteStatus>> execute() {
-    HoodieInstant instant = HoodieTimeline.getReplaceCommitRequestedInstant(instantTime);
-    // Mark instant as clustering inflight
-    table.getActiveTimeline().transitionReplaceRequestedToInflight(instant, Option.empty());
-    table.getMetaClient().reloadActiveTimeline();
-
-    final Schema schema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()));
-    HoodieWriteMetadata<List<WriteStatus>> writeMetadata = (
-        (ClusteringExecutionStrategy<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>>)
-        ReflectionUtils.loadClass(config.getClusteringExecutionStrategyClass(),
-            new Class<?>[] {HoodieTable.class, HoodieEngineContext.class, HoodieWriteConfig.class}, table, context, config))
-        .performClustering(clusteringPlan, schema, instantTime);
-    List<WriteStatus> writeStatusList = writeMetadata.getWriteStatuses();
-    List<WriteStatus> statuses = updateIndex(writeStatusList, writeMetadata);
-    writeMetadata.setWriteStats(statuses.stream().map(WriteStatus::getStat).collect(Collectors.toList()));
-    writeMetadata.setPartitionToReplaceFileIds(getPartitionToReplacedFileIds(writeMetadata));
-    validateWriteResult(writeMetadata);
-    commitOnAutoCommit(writeMetadata);
-    if (!writeMetadata.getCommitMetadata().isPresent()) {
-      HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(writeMetadata.getWriteStats().get(), writeMetadata.getPartitionToReplaceFileIds(),
-          extraMetadata, operationType, getSchemaToStoreInCommit(), getCommitActionType());
-      writeMetadata.setCommitMetadata(Option.of(commitMetadata));
-    }
-    return writeMetadata;
-  }
-
-  /**
-   * Validate actions taken by clustering. In the first implementation, we validate at least one new file is written.
-   * But we can extend this to add more validation. E.g. number of records read = number of records written etc.
-   * We can also make these validations in BaseCommitActionExecutor to reuse pre-commit hooks for multiple actions.
-   */
-  private void validateWriteResult(HoodieWriteMetadata<List<WriteStatus>> writeMetadata) {
-    if (writeMetadata.getWriteStatuses().isEmpty()) {
-      throw new HoodieClusteringException("Clustering plan produced 0 WriteStatus for " + instantTime
-          + " #groups: " + clusteringPlan.getInputGroups().size() + " expected at least "
-          + clusteringPlan.getInputGroups().stream().mapToInt(HoodieClusteringGroup::getNumOutputFileGroups).sum()
-          + " write statuses");
-    }
+    HoodieWriteMetadata<HoodieData<WriteStatus>> writeMetadata = executeClustering(clusteringPlan);
+    List<WriteStatus> transformedWriteStatuses = writeMetadata.getWriteStatuses().collectAsList();
+    return writeMetadata.clone(transformedWriteStatuses);
   }
 
   @Override
   protected String getCommitActionType() {
     return HoodieTimeline.REPLACE_COMMIT_ACTION;
   }
-
-  @Override
-  protected Map<String, List<String>> getPartitionToReplacedFileIds(HoodieWriteMetadata<List<WriteStatus>> writeMetadata) {
-    Set<HoodieFileGroupId> newFilesWritten = writeMetadata.getWriteStats().get().stream()
-        .map(s -> new HoodieFileGroupId(s.getPartitionPath(), s.getFileId())).collect(Collectors.toSet());
-    return ClusteringUtils.getFileGroupsFromClusteringPlan(clusteringPlan)
-        .filter(fg -> !newFilesWritten.contains(fg))
-        .collect(Collectors.groupingBy(fg -> fg.getPartitionPath(), Collectors.mapping(fg -> fg.getFileId(), Collectors.toList())));
-  }
 }
diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/MultipleSparkJobExecutionStrategy.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/MultipleSparkJobExecutionStrategy.java
index 694fdd547..91d1f4e4e 100644
--- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/MultipleSparkJobExecutionStrategy.java
+++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/MultipleSparkJobExecutionStrategy.java
@@ -26,6 +26,7 @@ import org.apache.hudi.client.WriteStatus;
 import org.apache.hudi.client.common.HoodieSparkEngineContext;
 import org.apache.hudi.client.utils.ConcatenatingIterator;
 import org.apache.hudi.common.config.TypedProperties;
+import org.apache.hudi.common.data.HoodieData;
 import org.apache.hudi.common.engine.HoodieEngineContext;
 import org.apache.hudi.common.model.ClusteringOperation;
 import org.apache.hudi.common.model.HoodieAvroRecord;
@@ -42,6 +43,7 @@ import org.apache.hudi.common.util.StringUtils;
 import org.apache.hudi.common.util.collection.Pair;
 import org.apache.hudi.config.HoodieClusteringConfig;
 import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.hudi.data.HoodieJavaRDD;
 import org.apache.hudi.exception.HoodieClusteringException;
 import org.apache.hudi.exception.HoodieIOException;
 import org.apache.hudi.execution.bulkinsert.RDDCustomColumnsSortPartitioner;
@@ -82,7 +84,7 @@ import static org.apache.hudi.config.HoodieClusteringConfig.PLAN_STRATEGY_SORT_C
  * Clustering strategy to submit multiple spark jobs and union the results.
  */
 public abstract class MultipleSparkJobExecutionStrategy<T extends HoodieRecordPayload<T>>
-    extends ClusteringExecutionStrategy<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> {
+    extends ClusteringExecutionStrategy<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>> {
 
   private static final Logger LOG = LogManager.getLogger(MultipleSparkJobExecutionStrategy.class);
 
   public MultipleSparkJobExecutionStrategy(HoodieTable table, HoodieEngineContext engineContext, HoodieWriteConfig writeConfig) {
@@ -90,7 +92,7 @@ public abstract class MultipleSparkJobExecutionStrategy<T extends HoodieRecordP
   }
 
   @Override
-  public HoodieWriteMetadata<JavaRDD<WriteStatus>> performClustering(final HoodieClusteringPlan clusteringPlan, final Schema schema, final String instantTime) {
+  public HoodieWriteMetadata<HoodieData<WriteStatus>> performClustering(final HoodieClusteringPlan clusteringPlan, final Schema schema, final String instantTime) {
     JavaSparkContext engineContext = HoodieSparkEngineContext.getSparkContext(getEngineContext());
     // execute clustering for each group async and collect WriteStatus
     Stream<JavaRDD<WriteStatus>> writeStatusRDDStream = FutureUtils.allOf(
@@ -105,8 +107,8 @@ public abstract class MultipleSparkJobExecutionStrategy<T extends HoodieRecordP
     JavaRDD<WriteStatus>[] writeStatuses = convertStreamToArray(writeStatusRDDStream);
     JavaRDD<WriteStatus> writeStatusRDD = engineContext.union(writeStatuses);
-    HoodieWriteMetadata<JavaRDD<WriteStatus>> writeMetadata = new HoodieWriteMetadata<>();
-    writeMetadata.setWriteStatuses(writeStatusRDD);
+    HoodieWriteMetadata<HoodieData<WriteStatus>> writeMetadata = new HoodieWriteMetadata<>();
+    writeMetadata.setWriteStatuses(HoodieJavaRDD.of(writeStatusRDD));
     return writeMetadata;
   }
 
diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/SingleSparkJobExecutionStrategy.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/SingleSparkJobExecutionStrategy.java
index e4941848b..1158d0ada 100644
--- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/SingleSparkJobExecutionStrategy.java
+++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/SingleSparkJobExecutionStrategy.java
@@ -25,6 +25,7 @@ import org.apache.hudi.client.common.HoodieSparkEngineContext;
 import org.apache.hudi.client.utils.ConcatenatingIterator;
 import org.apache.hudi.common.config.SerializableSchema;
 import org.apache.hudi.common.config.TypedProperties;
+import org.apache.hudi.common.data.HoodieData;
 import org.apache.hudi.common.engine.HoodieEngineContext;
 import org.apache.hudi.common.engine.TaskContextSupplier;
 import org.apache.hudi.common.model.ClusteringGroupInfo;
@@ -37,6 +38,7 @@ import org.apache.hudi.common.model.HoodieRecordPayload;
 import org.apache.hudi.common.model.RewriteAvroPayload;
 import org.apache.hudi.common.util.Option;
 import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.hudi.data.HoodieJavaRDD;
 import org.apache.hudi.exception.HoodieClusteringException;
 import org.apache.hudi.exception.HoodieIOException;
 import org.apache.hudi.io.storage.HoodieFileReaderFactory;
@@ -71,7 +73,7 @@ import java.util.stream.StreamSupport;
  * MultipleSparkJobExecution strategy is not ideal for use cases that require large number of clustering groups
  */
 public abstract class SingleSparkJobExecutionStrategy<T extends HoodieRecordPayload<T>>
-    extends ClusteringExecutionStrategy<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> {
+    extends ClusteringExecutionStrategy<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>> {
 
   private static final Logger LOG = LogManager.getLogger(SingleSparkJobExecutionStrategy.class);
 
   public SingleSparkJobExecutionStrategy(HoodieTable table, HoodieEngineContext engineContext, HoodieWriteConfig writeConfig) {
@@ -79,7 +81,7 @@ public abstract class SingleSparkJobExecutionStrategy<T extends HoodieRecordPay
   }
 
   @Override
-  public HoodieWriteMetadata<JavaRDD<WriteStatus>> performClustering(final HoodieClusteringPlan clusteringPlan, final Schema schema, final String instantTime) {
+  public HoodieWriteMetadata<HoodieData<WriteStatus>> performClustering(final HoodieClusteringPlan clusteringPlan, final Schema schema, final String instantTime) {
     JavaSparkContext engineContext = HoodieSparkEngineContext.getSparkContext(getEngineContext());
     final TaskContextSupplier taskContextSupplier = getEngineContext().getTaskContextSupplier();
     final SerializableSchema serializableSchema = new SerializableSchema(schema);
@@ -104,8 +106,8 @@ public abstract class SingleSparkJobExecutionStrategy<T extends HoodieRecordPay
-    HoodieWriteMetadata<JavaRDD<WriteStatus>> writeMetadata = new HoodieWriteMetadata<>();
-    writeMetadata.setWriteStatuses(writeStatusRDD);
+    HoodieWriteMetadata<HoodieData<WriteStatus>> writeMetadata = new HoodieWriteMetadata<>();
+    writeMetadata.setWriteStatuses(HoodieJavaRDD.of(writeStatusRDD));
     return writeMetadata;
   }
 
diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkCopyOnWriteTable.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkCopyOnWriteTable.java
index 6a3305575..31bd43661 100644
--- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkCopyOnWriteTable.java
+++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkCopyOnWriteTable.java
@@ -59,7 +59,7 @@ import org.apache.hudi.table.action.bootstrap.HoodieBootstrapWriteMetadata;
 import org.apache.hudi.table.action.bootstrap.SparkBootstrapCommitActionExecutor;
 import org.apache.hudi.table.action.clean.CleanActionExecutor;
 import org.apache.hudi.table.action.clean.CleanPlanActionExecutor;
-import org.apache.hudi.table.action.cluster.SparkClusteringPlanActionExecutor;
+import org.apache.hudi.table.action.cluster.ClusteringPlanActionExecutor;
 import org.apache.hudi.table.action.cluster.SparkExecuteClusteringCommitActionExecutor;
 import org.apache.hudi.table.action.commit.SparkBulkInsertCommitActionExecutor;
 import org.apache.hudi.table.action.commit.SparkBulkInsertPreppedCommitActionExecutor;
@@ -244,7 +244,7 @@ public class HoodieSparkCopyOnWriteTable
   public Option<HoodieClusteringPlan> scheduleClustering(HoodieEngineContext context,
                                                          String instantTime,
                                                          Option<Map<String, String>> extraMetadata) {
-    return new SparkClusteringPlanActionExecutor<>(context, config,this, instantTime, extraMetadata).execute();
+    return new ClusteringPlanActionExecutor<>(context, config,this, instantTime, extraMetadata).execute();
   }
 
   @Override
diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/cluster/SparkClusteringPlanActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/cluster/SparkClusteringPlanActionExecutor.java
deleted file mode 100644
index 81a0a74ae..000000000
--- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/cluster/SparkClusteringPlanActionExecutor.java
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hudi.table.action.cluster;
-
-import org.apache.hudi.client.WriteStatus;
-import org.apache.hudi.common.engine.HoodieEngineContext;
-import org.apache.hudi.common.model.HoodieKey;
-import org.apache.hudi.common.model.HoodieRecord;
-import org.apache.hudi.common.model.HoodieRecordPayload;
-import org.apache.hudi.common.util.Option;
-import org.apache.hudi.config.HoodieWriteConfig;
-import org.apache.hudi.table.HoodieTable;
-import org.apache.spark.api.java.JavaRDD;
-
-import java.util.Map;
-
-@SuppressWarnings("checkstyle:LineLength")
-public class SparkClusteringPlanActionExecutor<T extends HoodieRecordPayload<T>> extends
-    BaseClusteringPlanActionExecutor<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> {
-
-  public SparkClusteringPlanActionExecutor(HoodieEngineContext context,
-                                           HoodieWriteConfig config,
-                                           HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> table,
-                                           String instantTime,
-                                           Option<Map<String, String>> extraMetadata) {
-    super(context, config, table, instantTime, extraMetadata);
-  }
-}
diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/cluster/SparkExecuteClusteringCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/cluster/SparkExecuteClusteringCommitActionExecutor.java
index c8896e2cd..594a91042 100644
--- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/cluster/SparkExecuteClusteringCommitActionExecutor.java
+++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/cluster/SparkExecuteClusteringCommitActionExecutor.java
@@ -18,111 +18,48 @@
 
 package org.apache.hudi.table.action.cluster;
 
-import org.apache.hudi.avro.HoodieAvroUtils;
-import org.apache.hudi.avro.model.HoodieClusteringGroup;
 import org.apache.hudi.avro.model.HoodieClusteringPlan;
 import org.apache.hudi.client.WriteStatus;
-import org.apache.hudi.client.clustering.run.strategy.SparkSingleFileSortExecutionStrategy;
+import org.apache.hudi.common.data.HoodieData;
 import org.apache.hudi.common.engine.HoodieEngineContext;
-import org.apache.hudi.common.model.HoodieCommitMetadata;
-import org.apache.hudi.common.model.HoodieFileGroupId;
-import org.apache.hudi.common.model.HoodieKey;
-import org.apache.hudi.common.model.HoodieRecord;
 import org.apache.hudi.common.model.HoodieRecordPayload;
 import org.apache.hudi.common.model.WriteOperationType;
-import org.apache.hudi.common.table.timeline.HoodieInstant;
 import org.apache.hudi.common.table.timeline.HoodieTimeline;
 import org.apache.hudi.common.util.ClusteringUtils;
-import org.apache.hudi.common.util.CommitUtils;
-import org.apache.hudi.common.util.Option;
-import org.apache.hudi.common.util.ReflectionUtils;
 import org.apache.hudi.common.util.collection.Pair;
 import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.hudi.data.HoodieJavaRDD;
 import org.apache.hudi.exception.HoodieClusteringException;
 import org.apache.hudi.table.HoodieTable;
 import org.apache.hudi.table.action.HoodieWriteMetadata;
-import org.apache.hudi.table.action.cluster.strategy.ClusteringExecutionStrategy;
 import org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor;
 
-import org.apache.avro.Schema;
-import org.apache.log4j.LogManager;
-import org.apache.log4j.Logger;
 import org.apache.spark.api.java.JavaRDD;
 
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.stream.Collectors;
-
 public class SparkExecuteClusteringCommitActionExecutor<T extends HoodieRecordPayload<T>>
     extends BaseSparkCommitActionExecutor<T> {
 
-  private static final Logger LOG = LogManager.getLogger(SparkExecuteClusteringCommitActionExecutor.class);
   private final HoodieClusteringPlan clusteringPlan;
 
   public SparkExecuteClusteringCommitActionExecutor(HoodieEngineContext context,
                                                     HoodieWriteConfig config, HoodieTable table,
                                                     String instantTime) {
     super(context, config, table, instantTime, WriteOperationType.CLUSTER);
-    this.clusteringPlan = ClusteringUtils.getClusteringPlan(table.getMetaClient(), HoodieTimeline.getReplaceCommitRequestedInstant(instantTime))
-        .map(Pair::getRight).orElseThrow(() -> new HoodieClusteringException("Unable to read clustering plan for instant: " + instantTime));
+    this.clusteringPlan = ClusteringUtils.getClusteringPlan(
+        table.getMetaClient(), HoodieTimeline.getReplaceCommitRequestedInstant(instantTime))
+        .map(Pair::getRight).orElseThrow(() -> new HoodieClusteringException(
+            "Unable to read clustering plan for instant: " + instantTime));
   }
 
   @Override
   public HoodieWriteMetadata<JavaRDD<WriteStatus>> execute() {
-    HoodieInstant instant = HoodieTimeline.getReplaceCommitRequestedInstant(instantTime);
-    // Mark instant as clustering inflight
-    table.getActiveTimeline().transitionReplaceRequestedToInflight(instant, Option.empty());
-    table.getMetaClient().reloadActiveTimeline();
-
-    final Schema schema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()));
-    HoodieWriteMetadata<JavaRDD<WriteStatus>> writeMetadata = ((ClusteringExecutionStrategy<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>>)
-        ReflectionUtils.loadClass(config.getClusteringExecutionStrategyClass(),
-            new Class<?>[] {HoodieTable.class, HoodieEngineContext.class, HoodieWriteConfig.class}, table, context, config))
-        .performClustering(clusteringPlan, schema, instantTime);
-    JavaRDD<WriteStatus> writeStatusRDD = writeMetadata.getWriteStatuses();
-    JavaRDD<WriteStatus> statuses = updateIndex(writeStatusRDD, writeMetadata);
-    writeMetadata.setWriteStats(statuses.map(WriteStatus::getStat).collect());
-    writeMetadata.setPartitionToReplaceFileIds(getPartitionToReplacedFileIds(writeMetadata));
-    commitOnAutoCommit(writeMetadata);
-    if (!writeMetadata.getCommitMetadata().isPresent()) {
-      HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(writeMetadata.getWriteStats().get(), writeMetadata.getPartitionToReplaceFileIds(),
-          extraMetadata, operationType, getSchemaToStoreInCommit(), getCommitActionType());
-      writeMetadata.setCommitMetadata(Option.of(commitMetadata));
-    }
-    return writeMetadata;
-  }
-
-  /**
-   * Validate actions taken by clustering. In the first implementation, we validate at least one new file is written.
-   * But we can extend this to add more validation. E.g. number of records read = number of records written etc.
-   * We can also make these validations in BaseCommitActionExecutor to reuse pre-commit hooks for multiple actions.
-   */
-  private void validateWriteResult(HoodieWriteMetadata<JavaRDD<WriteStatus>> writeMetadata) {
-    if (writeMetadata.getWriteStatuses().isEmpty()) {
-      throw new HoodieClusteringException("Clustering plan produced 0 WriteStatus for " + instantTime
-          + " #groups: " + clusteringPlan.getInputGroups().size() + " expected at least "
-          + clusteringPlan.getInputGroups().stream().mapToInt(HoodieClusteringGroup::getNumOutputFileGroups).sum()
-          + " write statuses");
-    }
+    HoodieWriteMetadata<HoodieData<WriteStatus>> writeMetadata = executeClustering(clusteringPlan);
+    JavaRDD<WriteStatus> transformedWriteStatuses = HoodieJavaRDD.getJavaRDD(writeMetadata.getWriteStatuses());
+    return writeMetadata.clone(transformedWriteStatuses);
   }
 
   @Override
   protected String getCommitActionType() {
     return HoodieTimeline.REPLACE_COMMIT_ACTION;
   }
-
-  @Override
-  protected Map<String, List<String>> getPartitionToReplacedFileIds(HoodieWriteMetadata<JavaRDD<WriteStatus>> writeMetadata) {
-    Set<HoodieFileGroupId> newFilesWritten = writeMetadata.getWriteStats().get().stream()
-        .map(s -> new HoodieFileGroupId(s.getPartitionPath(), s.getFileId())).collect(Collectors.toSet());
-    // for the below execution strategy, new file group id would be same as old file group id
-    if (SparkSingleFileSortExecutionStrategy.class.getName().equals(config.getClusteringExecutionStrategyClass())) {
-      return ClusteringUtils.getFileGroupsFromClusteringPlan(clusteringPlan)
-          .collect(Collectors.groupingBy(fg -> fg.getPartitionPath(), Collectors.mapping(fg -> fg.getFileId(), Collectors.toList())));
-    }
-    return ClusteringUtils.getFileGroupsFromClusteringPlan(clusteringPlan)
-        .filter(fg -> !newFilesWritten.contains(fg))
-        .collect(Collectors.groupingBy(fg -> fg.getPartitionPath(), Collectors.mapping(fg -> fg.getFileId(), Collectors.toList())));
-  }
 }
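
Editor's note (illustrative, not part of the committed change): the core move in this patch is a template-method split. BaseCommitActionExecutor now owns the engine-neutral clustering workflow (executeClustering, written against the HoodieData abstraction), and the Java and Spark commit executors shrink to thin adapters that convert the resulting HoodieData<WriteStatus> back to their native containers (List via collectAsList, JavaRDD via HoodieJavaRDD.getJavaRDD). A minimal, self-contained Java sketch of that shape follows; EngineData, ListData, BaseExecutor, and ListExecutor are hypothetical stand-in names invented for illustration, not Hudi APIs.

    import java.util.Arrays;
    import java.util.List;

    public class ClusteringRefactorSketch {

      // Stand-in for Hudi's HoodieData<T>: the engine-neutral container the
      // shared executor logic is written against.
      interface EngineData<T> {
        List<T> collectAsList();
      }

      // Stand-in for one engine binding (compare HoodieList / HoodieJavaRDD),
      // here simply backed by an in-memory List.
      static final class ListData<T> implements EngineData<T> {
        private final List<T> items;

        ListData(List<T> items) {
          this.items = items;
        }

        static <T> ListData<T> of(List<T> items) {
          return new ListData<>(items);
        }

        @Override
        public List<T> collectAsList() {
          return items;
        }
      }

      // Stand-in for BaseCommitActionExecutor: the clustering workflow lives
      // here exactly once, instead of being duplicated per engine.
      abstract static class BaseExecutor {
        protected final EngineData<String> executeClustering(String plan) {
          // ...transition the instant, run the strategy, update the index,
          // validate the result... (elided in this sketch)
          return ListData.of(Arrays.asList("writeStatus-for-" + plan));
        }
      }

      // Stand-in for an engine-specific executor: its only remaining job is
      // converting the engine-neutral result back to its native representation.
      static final class ListExecutor extends BaseExecutor {
        List<String> execute(String plan) {
          return executeClustering(plan).collectAsList();
        }
      }

      public static void main(String[] args) {
        System.out.println(new ListExecutor().execute("plan-001"));
        // prints: [writeStatus-for-plan-001]
      }
    }

The payoff of this structure is visible in the diffstat above: the index update, commit-metadata construction, write-result validation, and replaced-file-id bookkeeping now live in one place, which is why the patch removes roughly twice as many lines as it adds.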