[HUDI-724] Parallelize getSmallFiles for partitions (#1421)

Co-authored-by: Feichi Feng <feicfeng@amazon.com>
2020-03-30 01:14:38 -06:00
parent fa36082554
commit 1f5b0c77d6
6 changed files with 37 additions and 24 deletions
--- a/hudi-client/src/main/java/org/apache/hudi/client/HoodieWriteClient.java
+++ b/hudi-client/src/main/java/org/apache/hudi/client/HoodieWriteClient.java
@@ -489,9 +489,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> extends AbstractHo

  private Partitioner getPartitioner(HoodieTable table, boolean isUpsert, WorkloadProfile profile) {
    if (isUpsert) {
-      return table.getUpsertPartitioner(profile);
+      return table.getUpsertPartitioner(profile, jsc); // hand the Spark context through to the table
    } else {
-      return table.getInsertPartitioner(profile);
+      return table.getInsertPartitioner(profile, jsc); // same for the insert path
    }
  }

--- a/hudi-client/src/main/java/org/apache/hudi/table/HoodieCopyOnWriteTable.java
+++ b/hudi-client/src/main/java/org/apache/hudi/table/HoodieCopyOnWriteTable.java
@@ -81,6 +81,7 @@ import java.util.Map;
 import java.util.Set;
 import java.util.stream.Collectors;

+import org.apache.spark.api.java.function.PairFunction;
 import scala.Tuple2;

 /**
@@ -142,16 +143,16 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
  }

  @Override
-  public Partitioner getUpsertPartitioner(WorkloadProfile profile) {
+  public Partitioner getUpsertPartitioner(WorkloadProfile profile, JavaSparkContext jsc) {
    if (profile == null) {
      throw new HoodieUpsertException("Need workload profile to construct the upsert partitioner.");
    }
-    return new UpsertPartitioner(profile);
+    return new UpsertPartitioner(profile, jsc); // the constructor forwards jsc to assignInserts
  }

  @Override
-  public Partitioner getInsertPartitioner(WorkloadProfile profile) {
-    return getUpsertPartitioner(profile);
+  public Partitioner getInsertPartitioner(WorkloadProfile profile, JavaSparkContext jsc) {
+    return getUpsertPartitioner(profile, jsc); // inserts reuse the upsert partitioner
  }

  @Override
@@ -573,14 +574,14 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
     */
    protected HoodieRollingStatMetadata rollingStatMetadata;

-    UpsertPartitioner(WorkloadProfile profile) {
+    UpsertPartitioner(WorkloadProfile profile, JavaSparkContext jsc) {
      updateLocationToBucket = new HashMap<>();
      partitionPathToInsertBuckets = new HashMap<>();
      bucketInfoMap = new HashMap<>();
      globalStat = profile.getGlobalStat();
      rollingStatMetadata = getRollingStats();
      assignUpdates(profile);
-      assignInserts(profile);
+      assignInserts(profile, jsc);

      LOG.info("Total Buckets :" + totalBuckets + ", buckets info => " + bucketInfoMap + ", \n"
          + "Partition to insert buckets => " + partitionPathToInsertBuckets + ", \n"
@@ -610,18 +611,24 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
      return bucket;
    }

-    private void assignInserts(WorkloadProfile profile) {
+    private void assignInserts(WorkloadProfile profile, JavaSparkContext jsc) {
      // for new inserts, compute buckets depending on how many records we have for each partition
      Set<String> partitionPaths = profile.getPartitionPaths();
      long averageRecordSize =
          averageBytesPerRecord(metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(),
              config.getCopyOnWriteRecordSizeEstimate());
      LOG.info("AvgRecordSize => " + averageRecordSize);
+
+      Map<String, List<SmallFile>> partitionSmallFilesMap =
+              getSmallFilesForPartitions(new ArrayList<String>(partitionPaths), jsc);
+
      for (String partitionPath : partitionPaths) {
        WorkloadStat pStat = profile.getWorkloadStat(partitionPath);
        if (pStat.getNumInserts() > 0) {

-          List<SmallFile> smallFiles = getSmallFiles(partitionPath);
+          List<SmallFile> smallFiles = partitionSmallFilesMap.get(partitionPath);
+          this.smallFiles.addAll(smallFiles);
+
          LOG.info("For partitionPath : " + partitionPath + " Small Files => " + smallFiles);

          long totalUnassignedInserts = pStat.getNumInserts();
@@ -684,6 +691,18 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
      }
    }

+    private Map<String, List<SmallFile>> getSmallFilesForPartitions(List<String> partitionPaths, JavaSparkContext jsc) {
+      // Builds a partition path -> small files map by running getSmallFiles over the paths as a Spark RDD.
+      Map<String, List<SmallFile>> partitionSmallFilesMap = new HashMap<>();
+      if (partitionPaths != null && partitionPaths.size() > 0) {
+        JavaRDD<String> partitionPathRdds = jsc.parallelize(partitionPaths, partitionPaths.size());
+        partitionSmallFilesMap = partitionPathRdds.mapToPair((PairFunction<String, String, List<SmallFile>>)
+            partitionPath -> new Tuple2<>(partitionPath, getSmallFiles(partitionPath))).collectAsMap();
+      }
+      // Still the empty HashMap from above when partitionPaths is null or empty.
+      return partitionSmallFilesMap;
+    }
+
    /**
     * Returns a list of small files in the given partition path.
     */
@@ -706,8 +725,6 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
            sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename), FSUtils.getFileId(filename));
            sf.sizeBytes = file.getFileSize();
            smallFileLocations.add(sf);
-            // Update the global small files list
-            smallFiles.add(sf);
          }
        }
      }
--- a/hudi-client/src/main/java/org/apache/hudi/table/HoodieMergeOnReadTable.java
+++ b/hudi-client/src/main/java/org/apache/hudi/table/HoodieMergeOnReadTable.java
@@ -89,11 +89,11 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends Hoodi
  }

  @Override
-  public Partitioner getUpsertPartitioner(WorkloadProfile profile) {
+  public Partitioner getUpsertPartitioner(WorkloadProfile profile, JavaSparkContext jsc) {
    if (profile == null) {
      throw new HoodieUpsertException("Need workload profile to construct the upsert partitioner.");
    }
-    mergeOnReadUpsertPartitioner = new MergeOnReadUpsertPartitioner(profile);
+    mergeOnReadUpsertPartitioner = new MergeOnReadUpsertPartitioner(profile, jsc); // stored, then returned
    return mergeOnReadUpsertPartitioner;
  }

@@ -325,8 +325,8 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends Hoodi
   */
  class MergeOnReadUpsertPartitioner extends HoodieCopyOnWriteTable.UpsertPartitioner {

-    MergeOnReadUpsertPartitioner(WorkloadProfile profile) {
-      super(profile);
+    MergeOnReadUpsertPartitioner(WorkloadProfile profile, JavaSparkContext jsc) {
+      super(profile, jsc); // delegates to the HoodieCopyOnWriteTable.UpsertPartitioner constructor
    }

    @Override
@@ -376,16 +376,12 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends Hoodi
            sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename), FSUtils.getFileId(filename));
            sf.sizeBytes = getTotalFileSize(smallFileSlice);
            smallFileLocations.add(sf);
-            // Update the global small files list
-            smallFiles.add(sf);
          } else {
            HoodieLogFile logFile = smallFileSlice.getLogFiles().findFirst().get();
            sf.location = new HoodieRecordLocation(FSUtils.getBaseCommitTimeFromLogPath(logFile.getPath()),
                FSUtils.getFileIdFromLogPath(logFile.getPath()));
            sf.sizeBytes = getTotalFileSize(smallFileSlice);
            smallFileLocations.add(sf);
-            // Update the global small files list
-            smallFiles.add(sf);
          }
        }
      }
--- a/hudi-client/src/main/java/org/apache/hudi/table/HoodieTable.java
+++ b/hudi-client/src/main/java/org/apache/hudi/table/HoodieTable.java
@@ -118,12 +118,12 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
  /**
   * Provides a partitioner to perform the upsert operation, based on the workload profile.
   */
-  public abstract Partitioner getUpsertPartitioner(WorkloadProfile profile);
+  public abstract Partitioner getUpsertPartitioner(WorkloadProfile profile, JavaSparkContext jsc); // jsc: Spark context

  /**
   * Provides a partitioner to perform the insert operation, based on the workload profile.
   */
-  public abstract Partitioner getInsertPartitioner(WorkloadProfile profile);
+  public abstract Partitioner getInsertPartitioner(WorkloadProfile profile, JavaSparkContext jsc); // jsc: Spark context

  /**
   * Return whether this HoodieTable implementation can benefit from workload profiling.