[HUDI-3085] Improve bulk insert partitioner abstraction (#4441)
This commit is contained in:
@@ -18,12 +18,18 @@
|
||||
|
||||
package org.apache.hudi.table;
|
||||
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.io.WriteHandleFactory;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* Repartitions input records into at least the expected number of output spark partitions. It should give the guarantees below -
|
||||
* Each output spark partition will have records from only one hoodie partition. - The average number of records per output spark
|
||||
* partition should be almost equal to (#inputRecords / #outputSparkPartitions) to avoid possible skews.
|
||||
*/
|
||||
public interface BulkInsertPartitioner<I> {
|
||||
public interface BulkInsertPartitioner<I> extends Serializable {
|
||||
|
||||
/**
|
||||
* Repartitions the input records into at least expected number of output spark partitions.
|
||||
@@ -38,4 +44,24 @@ public interface BulkInsertPartitioner<I> {
|
||||
* @return {@code true} if the records within a partition are sorted; {@code false} otherwise.
|
||||
*/
|
||||
boolean arePartitionRecordsSorted();
|
||||
|
||||
/**
|
||||
* Returns the file group id prefix for the given data partition.
|
||||
* By default, returns a new file group id prefix (the {@code partitionId} argument is not used), so that incoming records will route to a fresh new file group.
|
||||
* @param partitionId data partition
|
||||
* @return the file group id prefix to use for the given data partition
|
||||
*/
|
||||
default String getFileIdPfx(int partitionId) {
|
||||
return FSUtils.createNewFileIdPfx();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the write handle factory for the given partition. By default, returns an empty {@code Option} (the {@code partitionId} argument is not used).
|
||||
* @param partitionId data partition
|
||||
* @return an {@code Option} holding the write handle factory for the given partition; empty in this default implementation
|
||||
*/
|
||||
default Option<WriteHandleFactory> getWriteHandleFactory(int partitionId) {
|
||||
return Option.empty();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -42,7 +42,7 @@ public abstract class BaseBulkInsertHelper<T extends HoodieRecordPayload, I, K,
|
||||
public abstract O bulkInsert(I inputRecords, String instantTime,
|
||||
HoodieTable<T, I, K, O> table, HoodieWriteConfig config,
|
||||
boolean performDedupe,
|
||||
Option<BulkInsertPartitioner> userDefinedBulkInsertPartitioner,
|
||||
BulkInsertPartitioner partitioner,
|
||||
boolean addMetadataFields,
|
||||
int parallelism,
|
||||
WriteHandleFactory writeHandleFactory);
|
||||
|
||||
Reference in New Issue
Block a user