[HUDI-1468] Support custom clustering strategies and preserve commit metadata as part of clustering (#3419)

Co-authored-by: Satish Kotha <satishkotha@uber.com>
Author: Sagar Sumit
Date: 2021-08-07 08:23:08 +05:30
Committed by: GitHub
Parent: 9ce548edb1
Commit: 70b6bd485f
34 changed files with 1150 additions and 343 deletions

BaseJavaCommitActionExecutor.java

@@ -216,7 +216,7 @@ public abstract class BaseJavaCommitActionExecutor<T extends HoodieRecordPayload
     }
   }
 
-  protected Map<String, List<String>> getPartitionToReplacedFileIds(List<WriteStatus> writeStatuses) {
+  protected Map<String, List<String>> getPartitionToReplacedFileIds(HoodieWriteMetadata<List<WriteStatus>> writeMetadata) {
     return Collections.emptyMap();
   }
 
@@ -330,7 +330,7 @@ public abstract class BaseJavaCommitActionExecutor<T extends HoodieRecordPayload
     List<WriteStatus> statuses = table.getIndex().updateLocation(writeStatuses, context, table);
     result.setIndexUpdateDuration(Duration.between(indexStartTime, Instant.now()));
     result.setWriteStatuses(statuses);
-    result.setPartitionToReplaceFileIds(getPartitionToReplacedFileIds(statuses));
+    result.setPartitionToReplaceFileIds(getPartitionToReplacedFileIds(result));
     commitOnAutoCommit(result);
   }
 }
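
The signature change above is the heart of this refactor: getPartitionToReplacedFileIds now receives the full HoodieWriteMetadata instead of a bare write-status list, so replace-commit executors (clustering, insert overwrite) can derive the replaced file groups from everything the write produced. A minimal sketch of an override under the new signature; the grouping logic is illustrative only, not the commit's actual clustering strategy, and the package names are assumed from the imports visible in this diff and upstream Hudi:

import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.model.HoodieWriteStat;
import org.apache.hudi.table.action.HoodieWriteMetadata;

// Illustrative only: group the *written* file IDs by partition path.
// A real strategy would instead decide which existing file groups the
// newly written files replace.
@Override
protected Map<String, List<String>> getPartitionToReplacedFileIds(
    HoodieWriteMetadata<List<WriteStatus>> writeMetadata) {
  return writeMetadata.getWriteStatuses().stream()
      .map(WriteStatus::getStat)
      .collect(Collectors.groupingBy(
          HoodieWriteStat::getPartitionPath,
          Collectors.mapping(HoodieWriteStat::getFileId, Collectors.toList())));
}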

JavaBulkInsertHelper.java

@@ -71,7 +71,7 @@ public class JavaBulkInsertHelper<T extends HoodieRecordPayload, R> extends Abst
         table.getMetaClient().getCommitActionType(), instantTime), Option.empty(),
         config.shouldAllowMultiWriteOnSameInstant());
     // write new files
-    List<WriteStatus> writeStatuses = bulkInsert(inputRecords, instantTime, table, config, performDedupe, userDefinedBulkInsertPartitioner, false, config.getBulkInsertShuffleParallelism());
+    List<WriteStatus> writeStatuses = bulkInsert(inputRecords, instantTime, table, config, performDedupe, userDefinedBulkInsertPartitioner, false, config.getBulkInsertShuffleParallelism(), false);
     //update index
     ((BaseJavaCommitActionExecutor) executor).updateIndexAndCommitIfNeeded(writeStatuses, result);
     return result;
@@ -85,7 +85,8 @@ public class JavaBulkInsertHelper<T extends HoodieRecordPayload, R> extends Abst
                                 boolean performDedupe,
                                 Option<BulkInsertPartitioner<T>> userDefinedBulkInsertPartitioner,
                                 boolean useWriterSchema,
-                                int parallelism) {
+                                int parallelism,
+                                boolean preserveHoodieMetadata) {
     // De-dupe/merge if needed
     List<HoodieRecord<T>> dedupedRecords = inputRecords;
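
The two hunks above thread a new preserveHoodieMetadata flag through bulkInsert. Existing callers pass false, keeping today's behavior of regenerating the Hoodie meta columns; clustering can pass true so rewritten records keep their original record key, partition path, and commit metadata. A hedged sketch of a call site under the new signature; 'helper' and the argument values are illustrative, not taken from this diff:

// 'helper' is assumed to be an in-scope JavaBulkInsertHelper instance.
List<WriteStatus> writeStatuses = helper.bulkInsert(
    inputRecords,                               // records being rewritten
    instantTime,                                // replace-commit instant
    table,
    config,
    false,                                      // performDedupe
    Option.empty(),                             // no custom partitioner
    false,                                      // useWriterSchema
    config.getBulkInsertShuffleParallelism(),   // parallelism
    true);                                      // preserveHoodieMetadata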

JavaInsertOverwriteCommitActionExecutor.java

@@ -64,9 +64,9 @@ public class JavaInsertOverwriteCommitActionExecutor<T extends HoodieRecordPayload
   }
 
   @Override
-  protected Map<String, List<String>> getPartitionToReplacedFileIds(List<WriteStatus> writeStatuses) {
+  protected Map<String, List<String>> getPartitionToReplacedFileIds(HoodieWriteMetadata<List<WriteStatus>> writeResult) {
     return context.mapToPair(
-        writeStatuses.stream().map(status -> status.getStat().getPartitionPath()).distinct().collect(Collectors.toList()),
+        writeResult.getWriteStatuses().stream().map(status -> status.getStat().getPartitionPath()).distinct().collect(Collectors.toList()),
         partitionPath ->
             Pair.of(partitionPath, getAllExistingFileIds(partitionPath)), 1
     );
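
Here the executor only switches its source of partition paths from the raw status list to writeResult.getWriteStatuses(); the replaced files for insert overwrite remain all file IDs already present in each written partition. A plain-Java rendering of what the parallelized mapToPair computes, using the executor's own getAllExistingFileIds helper visible above:

// Sequential equivalent of the mapToPair call: pair every distinct
// partition that received writes with the file IDs currently live in it.
Map<String, List<String>> replaced = writeResult.getWriteStatuses().stream()
    .map(status -> status.getStat().getPartitionPath())
    .distinct()
    .collect(Collectors.toMap(
        partitionPath -> partitionPath,
        this::getAllExistingFileIds));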

JavaInsertOverwriteTableCommitActionExecutor.java

@@ -27,6 +27,7 @@ import org.apache.hudi.common.model.WriteOperationType;
 import org.apache.hudi.common.util.collection.Pair;
 import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.hudi.table.HoodieTable;
+import org.apache.hudi.table.action.HoodieWriteMetadata;
 
 import java.util.HashMap;
 import java.util.List;
@@ -48,7 +49,7 @@ public class JavaInsertOverwriteTableCommitActionExecutor<T extends HoodieRecordPayload
   }
 
   @Override
-  protected Map<String, List<String>> getPartitionToReplacedFileIds(List<WriteStatus> writeStatuses) {
+  protected Map<String, List<String>> getPartitionToReplacedFileIds(HoodieWriteMetadata<List<WriteStatus>> writeResult) {
     Map<String, List<String>> partitionToExistingFileIds = new HashMap<>();
     List<String> partitionPaths = FSUtils.getAllPartitionPaths(context,
         table.getMetaClient().getBasePath(), config.useFileListingMetadata(),
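
The hunk above cuts off mid-statement. For the table-level overwrite, every existing file group in every partition of the table is replaced, not just the written partitions. A sketch of how the method plausibly finishes, assuming the getAllExistingFileIds helper shown in the previous file:

// Assumed continuation (not visible in the truncated hunk): map every
// partition in the table to all of its currently existing file IDs.
for (String partitionPath : partitionPaths) {
  partitionToExistingFileIds.put(partitionPath, getAllExistingFileIds(partitionPath));
}
return partitionToExistingFileIds;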