[HUDI-3438] Avoid getSmallFiles if hoodie.parquet.small.file.limit is 0 (#4823)

Co-authored-by: Hui An <hui.an@shopee.com>
2022-02-18 21:57:04 +08:00
parent fba5822ee3
commit 5009138d04
5 changed files with 26 additions and 4 deletions
--- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/UpsertPartitioner.java
+++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/UpsertPartitioner.java
@@ -165,7 +165,7 @@ public class UpsertPartitioner<T extends HoodieRecordPayload<T>> extends SparkHo

        List<SmallFile> smallFiles =
            filterSmallFilesInClustering(partitionPathToPendingClusteringFileGroupsId.getOrDefault(partitionPath, Collections.emptySet()),
-                partitionSmallFilesMap.get(partitionPath));
+                partitionSmallFilesMap.getOrDefault(partitionPath, new ArrayList<>()));

        this.smallFiles.addAll(smallFiles);

@@ -241,6 +241,11 @@ public class UpsertPartitioner<T extends HoodieRecordPayload<T>> extends SparkHo
  private Map<String, List<SmallFile>> getSmallFilesForPartitions(List<String> partitionPaths, HoodieEngineContext context) {
    JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context);
    Map<String, List<SmallFile>> partitionSmallFilesMap = new HashMap<>();
+
+    if (config.getParquetSmallFileLimit() <= 0) {
+      return partitionSmallFilesMap;
+    }
+
    if (partitionPaths != null && partitionPaths.size() > 0) {
      context.setJobStatus(this.getClass().getSimpleName(), "Getting small files from partitions");
      JavaRDD<String> partitionPathRdds = jsc.parallelize(partitionPaths, partitionPaths.size());
--- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkUpsertDeltaCommitPartitioner.java
+++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkUpsertDeltaCommitPartitioner.java
@@ -97,6 +97,10 @@ public class SparkUpsertDeltaCommitPartitioner<T extends HoodieRecordPayload<T>>
              .collect(Collectors.toList());
    }

+    if (config.getParquetSmallFileLimit() <= 0) {
+      return Collections.emptyList();
+    }
+
    // If we cannot index log files, then we choose the smallest parquet file in the partition and add inserts to
    // it. Doing this over time for a partition, we ensure that we handle small file issues
    return table.getSliceView()