[HUDI-4101] When BucketIndexPartitioner take partition path for dispersion may cause the fileID of the task to not be loaded correctly (#5763)

Co-authored-by: john.wick <john.wick@vipshop.com>
2022-06-06 21:53:55 +08:00
parent 21ab0ff8be
commit 132c0aa8c7
1 changed files with 6 additions and 19 deletions
--- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bucket/BucketStreamWriteFunction.java
+++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bucket/BucketStreamWriteFunction.java
@@ -57,11 +57,6 @@ public class BucketStreamWriteFunction<I> extends StreamWriteFunction<I> {
  private String indexKeyFields;
  /**
   * BucketID should be loaded in this task.
   */
  private Set<Integer> bucketToLoad;
  /**
   * BucketID to file group mapping in each partition.
   * Map(partition -> Map(bucketId, fileID)).
@@ -91,7 +86,6 @@ public class BucketStreamWriteFunction<I> extends StreamWriteFunction<I> {
    this.indexKeyFields = config.getString(FlinkOptions.INDEX_KEY_FIELD);
    this.taskID = getRuntimeContext().getIndexOfThisSubtask();
    this.parallelism = getRuntimeContext().getNumberOfParallelSubtasks();
    this.bucketToLoad = getBucketToLoad();
    this.bucketIndex = new HashMap<>();
    this.incBucketIndex = new HashSet<>();
  }
@@ -136,19 +130,12 @@ public class BucketStreamWriteFunction<I> extends StreamWriteFunction<I> {
  }
  /**
-   * Bootstrap bucket info from existing file system,
+   * Determine whether the current fileID belongs to the current task.
-   * bucketNum % totalParallelism == this taskID belongs to this task.
+   * (partition + curBucket) % numPartitions == this taskID belongs to this task.
   */
-  private Set<Integer> getBucketToLoad() {
+  public boolean isBucketToLoad(int bucketNumber, String partition) {
-    Set<Integer> bucketToLoad = new HashSet<>();
+    int globalHash = ((partition + bucketNumber).hashCode()) & Integer.MAX_VALUE;
-    for (int i = 0; i < bucketNum; i++) {
+    return BucketIdentifier.mod(globalHash, parallelism) == taskID;
      int partitionOfBucket = BucketIdentifier.mod(i, parallelism);
      if (partitionOfBucket == taskID) {
        bucketToLoad.add(i);
      }
    }
    LOG.info("Bucket number that belongs to task [{}/{}]: {}", taskID, parallelism, bucketToLoad);
    return bucketToLoad;
  }
  /**
@@ -167,7 +154,7 @@ public class BucketStreamWriteFunction<I> extends StreamWriteFunction<I> {
    this.writeClient.getHoodieTable().getFileSystemView().getAllFileGroups(partition).forEach(fileGroup -> {
      String fileID = fileGroup.getFileGroupId().getFileId();
      int bucketNumber = BucketIdentifier.bucketIdFromFileId(fileID);
-      if (bucketToLoad.contains(bucketNumber)) {
+      if (isBucketToLoad(bucketNumber, partition)) {
        LOG.info(String.format("Should load this partition bucket %s with fileID %s", bucketNumber, fileID));
        if (bucketToFileIDMap.containsKey(bucketNumber)) {
          throw new RuntimeException(String.format("Duplicate fileID %s from bucket %s of partition %s found "