[HUDI-2194] Skip the latest N partitions when choosing partitions to create ClusteringPlan (#3300)

* skip from latest partitions based on hoodie.clustering.plan.strategy.daybased.skipfromlatest.partitions && 0(default means skip nothing) * change config verison * add ut Co-authored-by: yuezhang <yuezhang@freewheel.tv>
2021-08-10 01:10:15 +08:00
parent 41a9986a76
commit b4441abcf7
4 changed files with 83 additions and 0 deletions
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieClusteringConfig.java
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieClusteringConfig.java
@@ -80,6 +80,12 @@ public class HoodieClusteringConfig extends HoodieConfig {
      .sinceVersion("0.9.0")
      .withDocumentation("Config to control frequency of async clustering");

+  public static final ConfigProperty<String> CLUSTERING_SKIP_PARTITIONS_FROM_LATEST = ConfigProperty
+          .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "daybased.skipfromlatest.partitions")
+          .defaultValue("0")
+          .sinceVersion("0.9.0")
+          .withDocumentation("Number of partitions to skip from latest when choosing partitions to create ClusteringPlan");
+
  public static final ConfigProperty<String> CLUSTERING_PLAN_SMALL_FILE_LIMIT = ConfigProperty
      .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "small.file.limit")
      .defaultValue(String.valueOf(600 * 1024 * 1024L))
@@ -165,6 +171,11 @@ public class HoodieClusteringConfig extends HoodieConfig {
      return this;
    }

+    public Builder withClusteringSkipPartitionsFromLatest(int clusteringSkipPartitionsFromLatest) {
+      clusteringConfig.setValue(CLUSTERING_SKIP_PARTITIONS_FROM_LATEST, String.valueOf(clusteringSkipPartitionsFromLatest));
+      return this;
+    }
+
    public Builder withClusteringPlanSmallFileLimit(long clusteringSmallFileLimit) {
      clusteringConfig.setValue(CLUSTERING_PLAN_SMALL_FILE_LIMIT, String.valueOf(clusteringSmallFileLimit));
      return this;
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java
@@ -773,6 +773,10 @@ public class HoodieWriteConfig extends HoodieConfig {
    return getInt(HoodieClusteringConfig.CLUSTERING_TARGET_PARTITIONS);
  }

+  public int getSkipPartitionsFromLatestForClustering() {
+    return getInt(HoodieClusteringConfig.CLUSTERING_SKIP_PARTITIONS_FROM_LATEST);
+  }
+
  public String getClusteringSortColumns() {
    return getString(HoodieClusteringConfig.CLUSTERING_SORT_COLUMNS_PROPERTY);
  }
--- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/plan/strategy/SparkRecentDaysClusteringPlanStrategy.java
+++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/plan/strategy/SparkRecentDaysClusteringPlanStrategy.java
@@ -51,8 +51,10 @@ public class SparkRecentDaysClusteringPlanStrategy<T extends HoodieRecordPayload

  protected List<String> filterPartitionPaths(List<String> partitionPaths) {
    int targetPartitionsForClustering = getWriteConfig().getTargetPartitionsForClustering();
+    int skipPartitionsFromLatestForClustering = getWriteConfig().getSkipPartitionsFromLatestForClustering();
    return partitionPaths.stream()
        .sorted(Comparator.reverseOrder())
+        .skip(Math.max(skipPartitionsFromLatestForClustering, 0))
        .limit(targetPartitionsForClustering > 0 ? targetPartitionsForClustering : partitionPaths.size())
        .collect(Collectors.toList());
  }
--- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/clustering/plan/strategy/TestSparkRecentDaysClusteringPlanStrategy.java
+++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/clustering/plan/strategy/TestSparkRecentDaysClusteringPlanStrategy.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.client.clustering.plan.strategy;
+
+import org.apache.hudi.client.common.HoodieSparkEngineContext;
+import org.apache.hudi.config.HoodieClusteringConfig;
+import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.hudi.table.HoodieSparkCopyOnWriteTable;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.mockito.Mock;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertSame;
+
+public class TestSparkRecentDaysClusteringPlanStrategy {
+  @Mock
+  HoodieSparkCopyOnWriteTable table;
+  @Mock
+  HoodieSparkEngineContext context;
+  HoodieWriteConfig hoodieWriteConfig;
+
+  @BeforeEach
+  public void setUp() {
+    this.hoodieWriteConfig = HoodieWriteConfig
+            .newBuilder()
+            .withPath("Fake_Table_Path")
+            .withClusteringConfig(HoodieClusteringConfig
+                    .newBuilder()
+                    .withClusteringSkipPartitionsFromLatest(1)
+                    .withClusteringTargetPartitions(1)
+                    .build())
+            .build();
+  }
+
+  @Test
+  public void testFilterPartitionPaths() {
+    SparkRecentDaysClusteringPlanStrategy sg = new SparkRecentDaysClusteringPlanStrategy(table, context, hoodieWriteConfig);
+    ArrayList<String> fakeTimeBasedPartitionsPath = new ArrayList<>();
+    fakeTimeBasedPartitionsPath.add("20210718");
+    fakeTimeBasedPartitionsPath.add("20210716");
+    fakeTimeBasedPartitionsPath.add("20210719");
+    List list = sg.filterPartitionPaths(fakeTimeBasedPartitionsPath);
+    assertEquals(1, list.size());
+    assertSame("20210718", list.get(0));
+  }
+}