From b4441abcf74951ec0ce28593b96baa84456a97d3 Mon Sep 17 00:00:00 2001 From: zhangyue19921010 <69956021+zhangyue19921010@users.noreply.github.com> Date: Tue, 10 Aug 2021 01:10:15 +0800 Subject: [PATCH] [HUDI-2194] Skip the latest N partitions when choosing partitions to create ClusteringPlan (#3300) * skip from latest partitions based on hoodie.clustering.plan.strategy.daybased.skipfromlatest.partitions && 0(default means skip nothing) * change config verison * add ut Co-authored-by: yuezhang --- .../hudi/config/HoodieClusteringConfig.java | 11 ++++ .../apache/hudi/config/HoodieWriteConfig.java | 4 ++ ...SparkRecentDaysClusteringPlanStrategy.java | 2 + ...SparkRecentDaysClusteringPlanStrategy.java | 66 +++++++++++++++++++ 4 files changed, 83 insertions(+) create mode 100644 hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/clustering/plan/strategy/TestSparkRecentDaysClusteringPlanStrategy.java diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieClusteringConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieClusteringConfig.java index e97cec6d2..9e0461198 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieClusteringConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieClusteringConfig.java @@ -80,6 +80,12 @@ public class HoodieClusteringConfig extends HoodieConfig { .sinceVersion("0.9.0") .withDocumentation("Config to control frequency of async clustering"); + public static final ConfigProperty CLUSTERING_SKIP_PARTITIONS_FROM_LATEST = ConfigProperty + .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "daybased.skipfromlatest.partitions") + .defaultValue("0") + .sinceVersion("0.9.0") + .withDocumentation("Number of partitions to skip from latest when choosing partitions to create ClusteringPlan"); + public static final ConfigProperty CLUSTERING_PLAN_SMALL_FILE_LIMIT = ConfigProperty .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "small.file.limit") .defaultValue(String.valueOf(600 * 1024 * 1024L)) @@ -165,6 +171,11 @@ public class HoodieClusteringConfig extends HoodieConfig { return this; } + public Builder withClusteringSkipPartitionsFromLatest(int clusteringSkipPartitionsFromLatest) { + clusteringConfig.setValue(CLUSTERING_SKIP_PARTITIONS_FROM_LATEST, String.valueOf(clusteringSkipPartitionsFromLatest)); + return this; + } + public Builder withClusteringPlanSmallFileLimit(long clusteringSmallFileLimit) { clusteringConfig.setValue(CLUSTERING_PLAN_SMALL_FILE_LIMIT, String.valueOf(clusteringSmallFileLimit)); return this; diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java index 6e87257b3..7e9c0cc66 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java @@ -773,6 +773,10 @@ public class HoodieWriteConfig extends HoodieConfig { return getInt(HoodieClusteringConfig.CLUSTERING_TARGET_PARTITIONS); } + public int getSkipPartitionsFromLatestForClustering() { + return getInt(HoodieClusteringConfig.CLUSTERING_SKIP_PARTITIONS_FROM_LATEST); + } + public String getClusteringSortColumns() { return getString(HoodieClusteringConfig.CLUSTERING_SORT_COLUMNS_PROPERTY); } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/plan/strategy/SparkRecentDaysClusteringPlanStrategy.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/plan/strategy/SparkRecentDaysClusteringPlanStrategy.java index 21f1609e8..5c132773c 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/plan/strategy/SparkRecentDaysClusteringPlanStrategy.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/plan/strategy/SparkRecentDaysClusteringPlanStrategy.java @@ -51,8 +51,10 @@ public class SparkRecentDaysClusteringPlanStrategy filterPartitionPaths(List partitionPaths) { int targetPartitionsForClustering = getWriteConfig().getTargetPartitionsForClustering(); + int skipPartitionsFromLatestForClustering = getWriteConfig().getSkipPartitionsFromLatestForClustering(); return partitionPaths.stream() .sorted(Comparator.reverseOrder()) + .skip(Math.max(skipPartitionsFromLatestForClustering, 0)) .limit(targetPartitionsForClustering > 0 ? targetPartitionsForClustering : partitionPaths.size()) .collect(Collectors.toList()); } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/clustering/plan/strategy/TestSparkRecentDaysClusteringPlanStrategy.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/clustering/plan/strategy/TestSparkRecentDaysClusteringPlanStrategy.java new file mode 100644 index 000000000..a9c7b7154 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/clustering/plan/strategy/TestSparkRecentDaysClusteringPlanStrategy.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.client.clustering.plan.strategy; + +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.config.HoodieClusteringConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieSparkCopyOnWriteTable; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.Mock; + +import java.util.ArrayList; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertSame; + +public class TestSparkRecentDaysClusteringPlanStrategy { + @Mock + HoodieSparkCopyOnWriteTable table; + @Mock + HoodieSparkEngineContext context; + HoodieWriteConfig hoodieWriteConfig; + + @BeforeEach + public void setUp() { + this.hoodieWriteConfig = HoodieWriteConfig + .newBuilder() + .withPath("Fake_Table_Path") + .withClusteringConfig(HoodieClusteringConfig + .newBuilder() + .withClusteringSkipPartitionsFromLatest(1) + .withClusteringTargetPartitions(1) + .build()) + .build(); + } + + @Test + public void testFilterPartitionPaths() { + SparkRecentDaysClusteringPlanStrategy sg = new SparkRecentDaysClusteringPlanStrategy(table, context, hoodieWriteConfig); + ArrayList fakeTimeBasedPartitionsPath = new ArrayList<>(); + fakeTimeBasedPartitionsPath.add("20210718"); + fakeTimeBasedPartitionsPath.add("20210716"); + fakeTimeBasedPartitionsPath.add("20210719"); + List list = sg.filterPartitionPaths(fakeTimeBasedPartitionsPath); + assertEquals(1, list.size()); + assertSame("20210718", list.get(0)); + } +}