[HUDI-2207] Support independent flink hudi clustering function
This commit is contained in:
@@ -1379,7 +1379,7 @@ public abstract class BaseHoodieWriteClient<T extends HoodieRecordPayload, I, K,
|
||||
return scheduleClustering(extraMetadata);
|
||||
}
|
||||
|
||||
protected void rollbackInflightClustering(HoodieInstant inflightInstant, HoodieTable table) {
|
||||
public void rollbackInflightClustering(HoodieInstant inflightInstant, HoodieTable table) {
|
||||
Option<HoodiePendingRollbackInfo> pendingRollbackInstantInfo = getPendingRollbackInfo(table.getMetaClient(), inflightInstant.getTimestamp(), false);
|
||||
String commitTime = pendingRollbackInstantInfo.map(entry -> entry.getRollbackInstant().getTimestamp()).orElse(HoodieActiveTimeline.createNewInstantTime());
|
||||
table.scheduleRollback(context, commitTime, inflightInstant, false, config.shouldRollbackUsingMarkers());
|
||||
|
||||
@@ -51,6 +51,8 @@ public class HoodieClusteringConfig extends HoodieConfig {
|
||||
public static final String CLUSTERING_STRATEGY_PARAM_PREFIX = "hoodie.clustering.plan.strategy.";
|
||||
public static final String SPARK_SIZED_BASED_CLUSTERING_PLAN_STRATEGY =
|
||||
"org.apache.hudi.client.clustering.plan.strategy.SparkSizeBasedClusteringPlanStrategy";
|
||||
public static final String FLINK_SIZED_BASED_CLUSTERING_PLAN_STRATEGY =
|
||||
"org.apache.hudi.client.clustering.plan.strategy.FlinkSizeBasedClusteringPlanStrategy";
|
||||
public static final String JAVA_SIZED_BASED_CLUSTERING_PLAN_STRATEGY =
|
||||
"org.apache.hudi.client.clustering.plan.strategy.JavaSizeBasedClusteringPlanStrategy";
|
||||
public static final String SPARK_SORT_AND_SIZE_EXECUTION_STRATEGY =
|
||||
|
||||
@@ -25,20 +25,20 @@ import org.apache.hudi.io.WriteHandleFactory;
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* Repartition input records into at least expected number of output spark partitions. It should give below guarantees -
|
||||
* Output spark partition will have records from only one hoodie partition. - Average records per output spark
|
||||
* partitions should be almost equal to (#inputRecords / #outputSparkPartitions) to avoid possible skews.
|
||||
* Repartition input records into at least expected number of output partitions. It should give below guarantees -
|
||||
* Output partition will have records from only one hoodie partition. - Average records per output
|
||||
* partitions should be almost equal to (#inputRecords / #outputPartitions) to avoid possible skews.
|
||||
*/
|
||||
public interface BulkInsertPartitioner<I> extends Serializable {
|
||||
|
||||
/**
|
||||
* Repartitions the input records into at least expected number of output spark partitions.
|
||||
* Repartitions the input records into at least expected number of output partitions.
|
||||
*
|
||||
* @param records Input Hoodie records
|
||||
* @param outputSparkPartitions Expected number of output partitions
|
||||
* @param records Input Hoodie records
|
||||
* @param outputPartitions Expected number of output partitions
|
||||
* @return
|
||||
*/
|
||||
I repartitionRecords(I records, int outputSparkPartitions);
|
||||
I repartitionRecords(I records, int outputPartitions);
|
||||
|
||||
/**
|
||||
* @return {@code true} if the records within a partition are sorted; {@code false} otherwise.
|
||||
@@ -48,6 +48,7 @@ public interface BulkInsertPartitioner<I> extends Serializable {
|
||||
/**
|
||||
* Return file group id prefix for the given data partition.
|
||||
 * By default, return a new file group id prefix, so that incoming records will route to a fresh new file group.
|
||||
*
|
||||
* @param partitionId data partition
|
||||
* @return
|
||||
*/
|
||||
@@ -57,6 +58,7 @@ public interface BulkInsertPartitioner<I> extends Serializable {
|
||||
|
||||
/**
|
||||
* Return write handle factory for the given partition.
|
||||
*
|
||||
* @param partitionId data partition
|
||||
* @return
|
||||
*/
|
||||
|
||||
@@ -70,6 +70,9 @@ public abstract class ClusteringPlanStrategy<T extends HoodieRecordPayload,I,K,O
|
||||
String sparkSizeBasedClassName = HoodieClusteringConfig.SPARK_SIZED_BASED_CLUSTERING_PLAN_STRATEGY;
|
||||
String sparkSelectedPartitionsClassName = "org.apache.hudi.client.clustering.plan.strategy.SparkSelectedPartitionsClusteringPlanStrategy";
|
||||
String sparkRecentDaysClassName = "org.apache.hudi.client.clustering.plan.strategy.SparkRecentDaysClusteringPlanStrategy";
|
||||
String flinkSizeBasedClassName = HoodieClusteringConfig.FLINK_SIZED_BASED_CLUSTERING_PLAN_STRATEGY;
|
||||
String flinkSelectedPartitionsClassName = "org.apache.hudi.client.clustering.plan.strategy.FlinkSelectedPartitionsClusteringPlanStrategy";
|
||||
String flinkRecentDaysClassName = "org.apache.hudi.client.clustering.plan.strategy.FlinkRecentDaysClusteringPlanStrategy";
|
||||
String javaSelectedPartitionClassName = "org.apache.hudi.client.clustering.plan.strategy.JavaRecentDaysClusteringPlanStrategy";
|
||||
String javaSizeBasedClassName = HoodieClusteringConfig.JAVA_SIZED_BASED_CLUSTERING_PLAN_STRATEGY;
|
||||
|
||||
@@ -82,6 +85,14 @@ public abstract class ClusteringPlanStrategy<T extends HoodieRecordPayload,I,K,O
|
||||
config.setValue(HoodieClusteringConfig.PLAN_PARTITION_FILTER_MODE_NAME, ClusteringPlanPartitionFilterMode.SELECTED_PARTITIONS.name());
|
||||
LOG.warn(String.format(logStr, className, sparkSizeBasedClassName, HoodieClusteringConfig.PLAN_PARTITION_FILTER_MODE_NAME.key(), ClusteringPlanPartitionFilterMode.SELECTED_PARTITIONS.name()));
|
||||
return sparkSizeBasedClassName;
|
||||
} else if (flinkRecentDaysClassName.equals(className)) {
|
||||
config.setValue(HoodieClusteringConfig.PLAN_PARTITION_FILTER_MODE_NAME, ClusteringPlanPartitionFilterMode.RECENT_DAYS.name());
|
||||
LOG.warn(String.format(logStr, className, sparkSizeBasedClassName, HoodieClusteringConfig.PLAN_PARTITION_FILTER_MODE_NAME.key(), ClusteringPlanPartitionFilterMode.RECENT_DAYS.name()));
|
||||
return flinkSizeBasedClassName;
|
||||
} else if (flinkSelectedPartitionsClassName.equals(className)) {
|
||||
config.setValue(HoodieClusteringConfig.PLAN_PARTITION_FILTER_MODE_NAME, ClusteringPlanPartitionFilterMode.SELECTED_PARTITIONS.name());
|
||||
LOG.warn(String.format(logStr, className, sparkSizeBasedClassName, HoodieClusteringConfig.PLAN_PARTITION_FILTER_MODE_NAME.key(), ClusteringPlanPartitionFilterMode.SELECTED_PARTITIONS.name()));
|
||||
return flinkSizeBasedClassName;
|
||||
} else if (javaSelectedPartitionClassName.equals(className)) {
|
||||
config.setValue(HoodieClusteringConfig.PLAN_PARTITION_FILTER_MODE_NAME, ClusteringPlanPartitionFilterMode.RECENT_DAYS.name());
|
||||
LOG.warn(String.format(logStr, className, javaSizeBasedClassName, HoodieClusteringConfig.PLAN_PARTITION_FILTER_MODE_NAME.key(), ClusteringPlanPartitionFilterMode.SELECTED_PARTITIONS.name()));
|
||||
|
||||
Reference in New Issue
Block a user