[HUDI-3095] abstract partition filter logic to enable code reuse (#4454)

* [HUDI-3095] abstract partition filter logic to enable code reuse * [HUDI-3095] address reviews
2021-12-31 13:37:52 +08:00
parent e88b5fd450
commit 2444f40a4b
16 changed files with 368 additions and 272 deletions
--- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/plan/strategy/SparkRecentDaysClusteringPlanStrategy.java
+++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/plan/strategy/SparkRecentDaysClusteringPlanStrategy.java
@@ -1,63 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hudi.client.clustering.plan.strategy;
-
-import org.apache.hudi.client.common.HoodieSparkEngineContext;
-import org.apache.hudi.common.model.HoodieRecordPayload;
-import org.apache.hudi.config.HoodieWriteConfig;
-import org.apache.hudi.table.HoodieSparkCopyOnWriteTable;
-import org.apache.hudi.table.HoodieSparkMergeOnReadTable;
-
-import org.apache.log4j.LogManager;
-import org.apache.log4j.Logger;
-
-import java.util.Comparator;
-import java.util.List;
-import java.util.stream.Collectors;
-
-/**
- * Clustering Strategy that only looks at latest 'daybased.lookback.partitions' partitions.
- */
-public class SparkRecentDaysClusteringPlanStrategy<T extends HoodieRecordPayload<T>>
-    extends SparkSizeBasedClusteringPlanStrategy<T> {
-  private static final Logger LOG = LogManager.getLogger(SparkRecentDaysClusteringPlanStrategy.class);
-
-  public SparkRecentDaysClusteringPlanStrategy(HoodieSparkCopyOnWriteTable<T> table,
-                                               HoodieSparkEngineContext engineContext,
-                                               HoodieWriteConfig writeConfig) {
-    super(table, engineContext, writeConfig);
-  }
-
-  public SparkRecentDaysClusteringPlanStrategy(HoodieSparkMergeOnReadTable<T> table,
-                                               HoodieSparkEngineContext engineContext,
-                                               HoodieWriteConfig writeConfig) {
-    super(table, engineContext, writeConfig);
-  }
-
-  @Override
-  protected List<String> filterPartitionPaths(List<String> partitionPaths) {
-    int targetPartitionsForClustering = getWriteConfig().getTargetPartitionsForClustering();
-    int skipPartitionsFromLatestForClustering = getWriteConfig().getSkipPartitionsFromLatestForClustering();
-    return partitionPaths.stream()
-        .sorted(Comparator.reverseOrder())
-        .skip(Math.max(skipPartitionsFromLatestForClustering, 0))
-        .limit(targetPartitionsForClustering > 0 ? targetPartitionsForClustering : partitionPaths.size())
-        .collect(Collectors.toList());
-  }
-}
--- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/plan/strategy/SparkSelectedPartitionsClusteringPlanStrategy.java
+++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/plan/strategy/SparkSelectedPartitionsClusteringPlanStrategy.java
@@ -1,66 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hudi.client.clustering.plan.strategy;
-
-import org.apache.hudi.client.common.HoodieSparkEngineContext;
-import org.apache.hudi.common.model.HoodieRecordPayload;
-import org.apache.hudi.config.HoodieWriteConfig;
-import org.apache.hudi.table.HoodieSparkCopyOnWriteTable;
-import org.apache.hudi.table.HoodieSparkMergeOnReadTable;
-import org.apache.log4j.LogManager;
-import org.apache.log4j.Logger;
-
-import java.util.List;
-import java.util.stream.Collectors;
-
-import static org.apache.hudi.config.HoodieClusteringConfig.CLUSTERING_STRATEGY_PARAM_PREFIX;
-
-/**
- * Clustering Strategy to filter just specified partitions from [begin, end]. Note both begin and end are inclusive.
- */
-public class SparkSelectedPartitionsClusteringPlanStrategy<T extends HoodieRecordPayload<T>>
-    extends SparkSizeBasedClusteringPlanStrategy<T> {
-  private static final Logger LOG = LogManager.getLogger(SparkSelectedPartitionsClusteringPlanStrategy.class);
-
-  public static final String CONF_BEGIN_PARTITION = CLUSTERING_STRATEGY_PARAM_PREFIX + "cluster.begin.partition";
-  public static final String CONF_END_PARTITION = CLUSTERING_STRATEGY_PARAM_PREFIX + "cluster.end.partition";
-  
-  public SparkSelectedPartitionsClusteringPlanStrategy(HoodieSparkCopyOnWriteTable<T> table,
-                                                       HoodieSparkEngineContext engineContext,
-                                                       HoodieWriteConfig writeConfig) {
-    super(table, engineContext, writeConfig);
-  }
-
-  public SparkSelectedPartitionsClusteringPlanStrategy(HoodieSparkMergeOnReadTable<T> table,
-                                                       HoodieSparkEngineContext engineContext,
-                                                       HoodieWriteConfig writeConfig) {
-    super(table, engineContext, writeConfig);
-  }
-
-  @Override
-  protected List<String> filterPartitionPaths(List<String> partitionPaths) {
-    String beginPartition = getWriteConfig().getProps().getProperty(CONF_BEGIN_PARTITION);
-    String endPartition = getWriteConfig().getProps().getProperty(CONF_END_PARTITION);
-    List<String> filteredPartitions = partitionPaths.stream()
-        .filter(path -> path.compareTo(beginPartition) >= 0 && path.compareTo(endPartition) <= 0)
-        .collect(Collectors.toList());
-    LOG.info("Filtered to the following partitions: " + filteredPartitions);
-    return filteredPartitions;
-  }
-}
--- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/plan/strategy/SparkSingleFileSortPlanStrategy.java
+++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/plan/strategy/SparkSingleFileSortPlanStrategy.java
@@ -48,6 +48,7 @@ public class SparkSingleFileSortPlanStrategy<T extends HoodieRecordPayload<T>>
    super(table, engineContext, writeConfig);
  }

+  @Override
  protected Stream<HoodieClusteringGroup> buildClusteringGroupsForPartition(String partitionPath, List<FileSlice> fileSlices) {
    List<Pair<List<FileSlice>, Integer>> fileSliceGroups = fileSlices.stream()
        .map(fileSlice -> Pair.of(Collections.singletonList(fileSlice), 1)).collect(Collectors.toList());
--- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/plan/strategy/SparkSizeBasedClusteringPlanStrategy.java
+++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/plan/strategy/SparkSizeBasedClusteringPlanStrategy.java
@@ -115,11 +115,6 @@ public class SparkSizeBasedClusteringPlanStrategy<T extends HoodieRecordPayload<
    return params;
  }

-  @Override
-  protected List<String> filterPartitionPaths(List<String> partitionPaths) {
-    return partitionPaths;
-  }
-
  @Override
  protected Stream<FileSlice> getFileSlicesEligibleForClustering(final String partition) {
    return super.getFileSlicesEligibleForClustering(partition)
--- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/clustering/plan/strategy/TestSparkRecentDaysClusteringPlanStrategy.java
+++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/clustering/plan/strategy/TestSparkRecentDaysClusteringPlanStrategy.java
@@ -1,66 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hudi.client.clustering.plan.strategy;
-
-import org.apache.hudi.client.common.HoodieSparkEngineContext;
-import org.apache.hudi.config.HoodieClusteringConfig;
-import org.apache.hudi.config.HoodieWriteConfig;
-import org.apache.hudi.table.HoodieSparkCopyOnWriteTable;
-import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Test;
-import org.mockito.Mock;
-
-import java.util.ArrayList;
-import java.util.List;
-
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertSame;
-
-public class TestSparkRecentDaysClusteringPlanStrategy {
-  @Mock
-  HoodieSparkCopyOnWriteTable table;
-  @Mock
-  HoodieSparkEngineContext context;
-  HoodieWriteConfig hoodieWriteConfig;
-
-  @BeforeEach
-  public void setUp() {
-    this.hoodieWriteConfig = HoodieWriteConfig
-            .newBuilder()
-            .withPath("Fake_Table_Path")
-            .withClusteringConfig(HoodieClusteringConfig
-                    .newBuilder()
-                    .withClusteringSkipPartitionsFromLatest(1)
-                    .withClusteringTargetPartitions(1)
-                    .build())
-            .build();
-  }
-
-  @Test
-  public void testFilterPartitionPaths() {
-    SparkRecentDaysClusteringPlanStrategy sg = new SparkRecentDaysClusteringPlanStrategy(table, context, hoodieWriteConfig);
-    ArrayList<String> fakeTimeBasedPartitionsPath = new ArrayList<>();
-    fakeTimeBasedPartitionsPath.add("20210718");
-    fakeTimeBasedPartitionsPath.add("20210716");
-    fakeTimeBasedPartitionsPath.add("20210719");
-    List list = sg.filterPartitionPaths(fakeTimeBasedPartitionsPath);
-    assertEquals(1, list.size());
-    assertSame("20210718", list.get(0));
-  }
-}
--- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/cluster/strategy/TestSparkClusteringPlanPartitionFilter.java
+++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/cluster/strategy/TestSparkClusteringPlanPartitionFilter.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hudi.table.action.cluster.strategy;
+
+import org.apache.hudi.client.clustering.plan.strategy.SparkSizeBasedClusteringPlanStrategy;
+import org.apache.hudi.client.common.HoodieSparkEngineContext;
+import org.apache.hudi.config.HoodieClusteringConfig;
+import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.hudi.table.HoodieSparkCopyOnWriteTable;
+import org.apache.hudi.table.action.cluster.ClusteringPlanPartitionFilterMode;
+
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.mockito.Mock;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertSame;
+
+public class TestSparkClusteringPlanPartitionFilter {
+  @Mock
+  HoodieSparkCopyOnWriteTable table;
+  @Mock
+  HoodieSparkEngineContext context;
+  HoodieWriteConfig.Builder hoodieWriteConfigBuilder;
+
+  @BeforeEach
+  public void setUp() {
+    this.hoodieWriteConfigBuilder = HoodieWriteConfig
+            .newBuilder()
+            .withPath("Fake_Table_Path");
+  }
+
+  @Test
+  public void testFilterPartitionNoFilter() {
+    HoodieWriteConfig config = hoodieWriteConfigBuilder.withClusteringConfig(HoodieClusteringConfig.newBuilder()
+            .withClusteringPlanPartitionFilterMode(ClusteringPlanPartitionFilterMode.NONE)
+            .build())
+        .build();
+
+    PartitionAwareClusteringPlanStrategy sg = new SparkSizeBasedClusteringPlanStrategy(table, context, config);
+    ArrayList<String> fakeTimeBasedPartitionsPath = new ArrayList<>();
+    fakeTimeBasedPartitionsPath.add("20210718");
+    fakeTimeBasedPartitionsPath.add("20210716");
+    fakeTimeBasedPartitionsPath.add("20210719");
+    List list = sg.filterPartitionPaths(fakeTimeBasedPartitionsPath);
+    assertEquals(3, list.size());
+  }
+
+  @Test
+  public void testFilterPartitionRecentDays() {
+    HoodieWriteConfig config = hoodieWriteConfigBuilder.withClusteringConfig(HoodieClusteringConfig.newBuilder()
+            .withClusteringSkipPartitionsFromLatest(1)
+            .withClusteringTargetPartitions(1)
+            .withClusteringPlanPartitionFilterMode(ClusteringPlanPartitionFilterMode.RECENT_DAYS)
+            .build())
+        .build();
+
+    PartitionAwareClusteringPlanStrategy sg = new SparkSizeBasedClusteringPlanStrategy(table, context, config);
+    ArrayList<String> fakeTimeBasedPartitionsPath = new ArrayList<>();
+    fakeTimeBasedPartitionsPath.add("20210718");
+    fakeTimeBasedPartitionsPath.add("20210716");
+    fakeTimeBasedPartitionsPath.add("20210719");
+    List list = sg.filterPartitionPaths(fakeTimeBasedPartitionsPath);
+    assertEquals(1, list.size());
+    assertSame("20210718", list.get(0));
+  }
+
+  @Test
+  public void testFilterPartitionSelectedPartitions() {
+    HoodieWriteConfig config = hoodieWriteConfigBuilder.withClusteringConfig(HoodieClusteringConfig.newBuilder()
+            .withClusteringPartitionFilterBeginPartition("20211222")
+            .withClusteringPartitionFilterEndPartition("20211223")
+            .withClusteringPlanPartitionFilterMode(ClusteringPlanPartitionFilterMode.SELECTED_PARTITIONS)
+            .build())
+        .build();
+
+    PartitionAwareClusteringPlanStrategy sg = new SparkSizeBasedClusteringPlanStrategy(table, context, config);
+    ArrayList<String> fakeTimeBasedPartitionsPath = new ArrayList<>();
+    fakeTimeBasedPartitionsPath.add("20211220");
+    fakeTimeBasedPartitionsPath.add("20211221");
+    fakeTimeBasedPartitionsPath.add("20211222");
+    fakeTimeBasedPartitionsPath.add("20211224");
+    List list = sg.filterPartitionPaths(fakeTimeBasedPartitionsPath);
+    assertEquals(1, list.size());
+    assertSame("20211222", list.get(0));
+  }
+}