1
0

[HUDI-3085] Improve bulk insert partitioner abstraction (#4441)

This commit is contained in:
Yuwei XIAO
2022-04-25 18:42:17 +08:00
committed by GitHub
parent 9054b85961
commit f2ba0fead2
9 changed files with 65 additions and 45 deletions

View File

@@ -18,12 +18,18 @@
package org.apache.hudi.table;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.io.WriteHandleFactory;
import java.io.Serializable;
/**
* Repartitions input records into at least the expected number of output spark partitions. It should give the
* following guarantees: (1) each output spark partition has records from only one hoodie partition; (2) the average
* number of records per output spark partition is almost equal to (#inputRecords / #outputSparkPartitions), to avoid
* possible skews.
*/
public interface BulkInsertPartitioner<I> {
public interface BulkInsertPartitioner<I> extends Serializable {
/**
* Repartitions the input records into at least expected number of output spark partitions.
@@ -38,4 +44,24 @@ public interface BulkInsertPartitioner<I> {
* @return {@code true} if the records within a partition are sorted; {@code false} otherwise.
*/
boolean arePartitionRecordsSorted();
/**
 * Supplies the file group id prefix to use for the given data partition.
 *
 * <p>By default, a new file group id prefix is obtained from {@link FSUtils#createNewFileIdPfx()} on every
 * call, so that incoming records are routed to a fresh file group. The default implementation does not
 * consult {@code partitionId}.
 *
 * @param partitionId data partition
 * @return the file group id prefix for that partition
 */
default String getFileIdPfx(int partitionId) {
  final String newFileIdPfx = FSUtils.createNewFileIdPfx();
  return newFileIdPfx;
}
/**
 * Supplies the write handle factory to use for the given partition.
 *
 * <p>The default implementation does not consult {@code partitionId} and always answers with
 * {@link Option#empty()}.
 *
 * @param partitionId data partition
 * @return the write handle factory for that partition, or an empty {@link Option} by default
 */
default Option<WriteHandleFactory> getWriteHandleFactory(int partitionId) {
  final Option<WriteHandleFactory> noFactory = Option.empty();
  return noFactory;
}
}

View File

@@ -42,7 +42,7 @@ public abstract class BaseBulkInsertHelper<T extends HoodieRecordPayload, I, K,
/**
 * Abstract bulk-insert entry point; the concrete behavior is supplied by subclasses that are not
 * visible in this excerpt.
 *
 * NOTE(review): this excerpt is a rendered diff hunk (7 lines before, 7 lines after, 8 lines shown),
 * so the parameter list below contains both sides of a one-line change. Presumably
 * {@code Option<BulkInsertPartitioner> userDefinedBulkInsertPartitioner} is the removed parameter and
 * {@code BulkInsertPartitioner partitioner} is its replacement; only one of them belongs to the real
 * signature at any revision. TODO confirm against the actual file.
 */
public abstract O bulkInsert(I inputRecords, String instantTime,
HoodieTable<T, I, K, O> table, HoodieWriteConfig config,
boolean performDedupe,
Option<BulkInsertPartitioner> userDefinedBulkInsertPartitioner,
BulkInsertPartitioner partitioner,
boolean addMetadataFields,
int parallelism,
WriteHandleFactory writeHandleFactory);