[HUDI-4397] Flink Inline Cluster and Compact plan distribute strategy changed from rebalance to hash to avoid potential multiple threads accessing the same file (#6106)
Co-authored-by: jerryyue <jerryyue@didiglobal.com>
This commit is contained in:
@@ -18,6 +18,7 @@
|
|||||||
|
|
||||||
package org.apache.hudi.sink.utils;
|
package org.apache.hudi.sink.utils;
|
||||||
|
|
||||||
|
import org.apache.hudi.common.model.ClusteringOperation;
|
||||||
import org.apache.hudi.common.model.HoodieKey;
|
import org.apache.hudi.common.model.HoodieKey;
|
||||||
import org.apache.hudi.common.model.HoodieRecord;
|
import org.apache.hudi.common.model.HoodieRecord;
|
||||||
import org.apache.hudi.configuration.FlinkOptions;
|
import org.apache.hudi.configuration.FlinkOptions;
|
||||||
@@ -65,6 +66,7 @@ import org.apache.flink.table.types.logical.RowType;
|
|||||||
|
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Utilities to generate all kinds of sub-pipelines.
|
* Utilities to generate all kinds of sub-pipelines.
|
||||||
@@ -357,7 +359,7 @@ public class Pipelines {
|
|||||||
*
|
*
|
||||||
* <pre>
|
* <pre>
|
||||||
* /=== | task1 | ===\
|
* /=== | task1 | ===\
|
||||||
* | plan generation | ===> re-balance | commit |
|
* | plan generation | ===> hash | commit |
|
||||||
* \=== | task2 | ===/
|
* \=== | task2 | ===/
|
||||||
*
|
*
|
||||||
* Note: both the compaction plan generation task and commission task are singleton.
|
* Note: both the compaction plan generation task and commission task are singleton.
|
||||||
@@ -372,7 +374,7 @@ public class Pipelines {
|
|||||||
TypeInformation.of(CompactionPlanEvent.class),
|
TypeInformation.of(CompactionPlanEvent.class),
|
||||||
new CompactionPlanOperator(conf))
|
new CompactionPlanOperator(conf))
|
||||||
.setParallelism(1) // plan generate must be singleton
|
.setParallelism(1) // plan generate must be singleton
|
||||||
.rebalance()
|
.keyBy(plan -> plan.getOperation().getFileGroupId().getFileId())
|
||||||
.transform("compact_task",
|
.transform("compact_task",
|
||||||
TypeInformation.of(CompactionCommitEvent.class),
|
TypeInformation.of(CompactionCommitEvent.class),
|
||||||
new ProcessOperator<>(new CompactFunction(conf)))
|
new ProcessOperator<>(new CompactFunction(conf)))
|
||||||
@@ -392,7 +394,7 @@ public class Pipelines {
|
|||||||
*
|
*
|
||||||
* <pre>
|
* <pre>
|
||||||
* /=== | task1 | ===\
|
* /=== | task1 | ===\
|
||||||
* | plan generation | ===> re-balance | commit |
|
* | plan generation | ===> hash | commit |
|
||||||
* \=== | task2 | ===/
|
* \=== | task2 | ===/
|
||||||
*
|
*
|
||||||
* Note: both the clustering plan generation task and commission task are singleton.
|
* Note: both the clustering plan generation task and commission task are singleton.
|
||||||
@@ -408,7 +410,9 @@ public class Pipelines {
|
|||||||
TypeInformation.of(ClusteringPlanEvent.class),
|
TypeInformation.of(ClusteringPlanEvent.class),
|
||||||
new ClusteringPlanOperator(conf))
|
new ClusteringPlanOperator(conf))
|
||||||
.setParallelism(1) // plan generate must be singleton
|
.setParallelism(1) // plan generate must be singleton
|
||||||
.rebalance()
|
.keyBy(plan -> plan.getClusteringGroupInfo().getOperations()
|
||||||
|
.stream().map(ClusteringOperation::getFileId)
|
||||||
|
.collect(Collectors.joining()))
|
||||||
.transform("clustering_task",
|
.transform("clustering_task",
|
||||||
TypeInformation.of(ClusteringCommitEvent.class),
|
TypeInformation.of(ClusteringCommitEvent.class),
|
||||||
new ClusteringOperator(conf, rowType))
|
new ClusteringOperator(conf, rowType))
|
||||||
|
|||||||
Reference in New Issue
Block a user