[HUDI-3085] Improve bulk insert partitioner abstraction (#4441)
@@ -41,6 +41,7 @@ import org.apache.hudi.common.util.StringUtils;
 import org.apache.hudi.common.util.collection.Pair;
 import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.hudi.exception.HoodieClusteringException;
+import org.apache.hudi.execution.bulkinsert.JavaBulkInsertInternalPartitionerFactory;
 import org.apache.hudi.execution.bulkinsert.JavaCustomColumnsSortPartitioner;
 import org.apache.hudi.io.IOUtils;
 import org.apache.hudi.io.storage.HoodieFileReader;
@@ -121,16 +122,16 @@ public abstract class JavaExecutionStrategy<T extends HoodieRecordPayload<T>>
    *
    * @param strategyParams Strategy parameters containing columns to sort the data by when clustering.
    * @param schema Schema of the data including metadata fields.
-   * @return empty for now.
+   * @return partitioner for the java engine
    */
-  protected Option<BulkInsertPartitioner<List<HoodieRecord<T>>>> getPartitioner(Map<String, String> strategyParams, Schema schema) {
+  protected BulkInsertPartitioner<List<HoodieRecord<T>>> getPartitioner(Map<String, String> strategyParams, Schema schema) {
     if (strategyParams.containsKey(PLAN_STRATEGY_SORT_COLUMNS.key())) {
-      return Option.of(new JavaCustomColumnsSortPartitioner(
+      return new JavaCustomColumnsSortPartitioner(
           strategyParams.get(PLAN_STRATEGY_SORT_COLUMNS.key()).split(","),
           HoodieAvroUtils.addMetadataFields(schema),
-          getWriteConfig().isConsistentLogicalTimestampEnabled()));
+          getWriteConfig().isConsistentLogicalTimestampEnabled());
     } else {
-      return Option.empty();
+      return JavaBulkInsertInternalPartitionerFactory.get(getWriteConfig().getBulkInsertSortMode());
     }
   }
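After this hunk the strategy always receives a concrete partitioner: the sort-column partitioner when PLAN_STRATEGY_SORT_COLUMNS is set, otherwise the engine default chosen by JavaBulkInsertInternalPartitionerFactory from the configured bulk insert sort mode. A minimal sketch of a caller inside a JavaExecutionStrategy subclass consuming the new contract, written against the reworked JavaBulkInsertHelper.bulkInsert signature shown further below; the method name, its parameters, and the getHoodieTable()/newInstance() accessors are illustrative assumptions, not part of this commit:

  // Hedged sketch (fragment of a hypothetical strategy subclass): the partitioner
  // is always present now, so there is no Option unwrapping and no duplicated
  // sort-mode fallback at the call site.
  public List<WriteStatus> clusterRecords(List<HoodieRecord<T>> inputRecords,
                                          int numOutputGroups,
                                          String instantTime,
                                          Map<String, String> strategyParams,
                                          Schema schema) {
    BulkInsertPartitioner<List<HoodieRecord<T>>> partitioner = getPartitioner(strategyParams, schema);
    // Hand the partitioner straight to the helper; the write handle factory given
    // here is only a fallback (see the getWriteHandleFactory hook in the last hunk).
    return (List<WriteStatus>) JavaBulkInsertHelper.newInstance().bulkInsert(
        inputRecords, instantTime, getHoodieTable(), getWriteConfig(),
        false, partitioner, true, numOutputGroups, new CreateHandleFactory(false));
  }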
@@ -77,8 +77,11 @@ public class JavaBulkInsertHelper<T extends HoodieRecordPayload, R> extends Base
           config.shouldAllowMultiWriteOnSameInstant());
     }
 
+    BulkInsertPartitioner partitioner = userDefinedBulkInsertPartitioner.orElse(JavaBulkInsertInternalPartitionerFactory.get(config.getBulkInsertSortMode()));
+
     // write new files
-    List<WriteStatus> writeStatuses = bulkInsert(inputRecords, instantTime, table, config, performDedupe, userDefinedBulkInsertPartitioner, false, config.getBulkInsertShuffleParallelism(), new CreateHandleFactory(false));
+    List<WriteStatus> writeStatuses = bulkInsert(inputRecords, instantTime, table, config, performDedupe, partitioner, false,
+        config.getBulkInsertShuffleParallelism(), new CreateHandleFactory(false));
     //update index
     ((BaseJavaCommitActionExecutor) executor).updateIndexAndCommitIfNeeded(writeStatuses, result);
     return result;
@@ -90,7 +93,7 @@ public class JavaBulkInsertHelper<T extends HoodieRecordPayload, R> extends Base
                                       HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> table,
                                       HoodieWriteConfig config,
                                       boolean performDedupe,
-                                      Option<BulkInsertPartitioner> userDefinedBulkInsertPartitioner,
+                                      BulkInsertPartitioner partitioner,
                                       boolean useWriterSchema,
                                       int parallelism,
                                       WriteHandleFactory writeHandleFactory) {
@@ -103,12 +106,7 @@ public class JavaBulkInsertHelper<T extends HoodieRecordPayload, R> extends Base
           parallelism, table);
     }
 
-    final List<HoodieRecord<T>> repartitionedRecords;
-    BulkInsertPartitioner partitioner = userDefinedBulkInsertPartitioner.isPresent()
-        ? userDefinedBulkInsertPartitioner.get()
-        : JavaBulkInsertInternalPartitionerFactory.get(config.getBulkInsertSortMode());
-    // only List is supported for Java partitioner, but it is not enforced by BulkInsertPartitioner API. To improve this, TODO HUDI-3463
-    repartitionedRecords = (List<HoodieRecord<T>>) partitioner.repartitionRecords(dedupedRecords, parallelism);
+    final List<HoodieRecord<T>> repartitionedRecords = (List<HoodieRecord<T>>) partitioner.repartitionRecords(dedupedRecords, parallelism);
 
     FileIdPrefixProvider fileIdPrefixProvider = (FileIdPrefixProvider) ReflectionUtils.loadClass(
         config.getFileIdPrefixProviderClassName(),
@@ -119,7 +117,8 @@ public class JavaBulkInsertHelper<T extends HoodieRecordPayload, R> extends Base
     new JavaLazyInsertIterable<>(repartitionedRecords.iterator(), true,
         config, instantTime, table,
         fileIdPrefixProvider.createFilePrefix(""), table.getTaskContextSupplier(),
-        new CreateHandleFactory<>()).forEachRemaining(writeStatuses::addAll);
+        // Always get the first WriteHandleFactory, as there is only a single data partition for hudi java engine.
+        (WriteHandleFactory) partitioner.getWriteHandleFactory(0).orElse(writeHandleFactory)).forEachRemaining(writeStatuses::addAll);
 
     return writeStatuses;
   }
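The last hunk is the heart of the abstraction change: the write handle factory is now negotiated through the partitioner via partitioner.getWriteHandleFactory(0), with the factory passed into bulkInsert(...) used only as a fallback. Below is a sketch of a custom Java-engine partitioner that plugs into this hook; the class, its sort order, and the import paths are assumptions for illustration and are not part of this commit, and the hook's Option<WriteHandleFactory> getWriteHandleFactory(int) signature is inferred from the call site in the hunk above.

  import java.util.Comparator;
  import java.util.List;
  import java.util.stream.Collectors;

  import org.apache.hudi.common.model.HoodieRecord;
  import org.apache.hudi.common.model.HoodieRecordPayload;
  import org.apache.hudi.common.util.Option;
  import org.apache.hudi.io.CreateHandleFactory;
  import org.apache.hudi.io.WriteHandleFactory;
  import org.apache.hudi.table.BulkInsertPartitioner;

  // Hypothetical example partitioner (not part of this commit): keeps the Java
  // engine's single in-memory partition, sorts by record key, and supplies its
  // own write handle factory through the hook used by JavaBulkInsertHelper.
  public class SortByRecordKeyJavaPartitioner<T extends HoodieRecordPayload<T>>
      implements BulkInsertPartitioner<List<HoodieRecord<T>>> {

    @Override
    public List<HoodieRecord<T>> repartitionRecords(List<HoodieRecord<T>> records, int outputPartitions) {
      // The Java engine operates on a single partition, so outputPartitions is ignored here.
      return records.stream()
          .sorted(Comparator.comparing(HoodieRecord::getRecordKey))
          .collect(Collectors.toList());
    }

    @Override
    public boolean arePartitionRecordsSorted() {
      return true;
    }

    @Override
    public Option<WriteHandleFactory> getWriteHandleFactory(int idx) {
      // JavaBulkInsertHelper asks only for index 0; returning Option.empty() would
      // make it fall back to the WriteHandleFactory passed into bulkInsert(...).
      WriteHandleFactory factory = new CreateHandleFactory(false);
      return Option.of(factory);
    }
  }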