[HUDI-2917] rollback insert data appended to log file when using Hbase Index (#4840)

Co-authored-by: guanziyue <guanziyue@gmail.com>
Author: Sivabalan Narayanan
Date: 2022-02-28 08:13:17 -05:00
Committed by: GitHub
parent 193215201c
commit 4a59876c8b
11 changed files with 340 additions and 61 deletions
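Summary of the change: with an index that can index log files (e.g. the HBase index), inserts may be appended to log files of existing file groups rather than written to new base files, so rolling them back requires knowing where records actually landed, not just what came in. The commit splits the workload profile into input stats and partitioner-assigned output stats, and saves the profile to the inflight file only after the partitioner has run. Below is a minimal sketch of that split; the names mirror the diff (hasOutputWorkLoadStats, getInputPartitionPathStatMap, updateOutputPartitionPathStatMap), but the stat bookkeeping is reduced to plain record counts, so this is an illustration, not the real WorkloadProfile/WorkloadStat.

// Simplified model of the input/output workload-stat split (illustration only).
import java.util.HashMap;
import java.util.Map;

class WorkloadProfileSketch {
  // What the writer was asked to write, keyed by partition path.
  private final Map<String, Long> inputPartitionPathStatMap = new HashMap<>();
  // What the partitioner actually assigned, keyed by partition path.
  private final Map<String, Long> outputPartitionPathStatMap = new HashMap<>();
  // Output stats are only tracked when the index can index log files,
  // because only then can inserts land in log files that rollback must find.
  private final boolean hasOutputWorkLoadStats;

  WorkloadProfileSketch(Map<String, Long> inputStats, boolean canIndexLogFiles) {
    this.inputPartitionPathStatMap.putAll(inputStats);
    this.hasOutputWorkLoadStats = canIndexLogFiles;
  }

  boolean hasOutputWorkLoadStats() {
    return hasOutputWorkLoadStats;
  }

  Map<String, Long> getInputPartitionPathStatMap() {
    return inputPartitionPathStatMap;
  }

  Map<String, Long> getOutputPartitionPathStatMap() {
    return outputPartitionPathStatMap;
  }

  // Called by the partitioner once it knows the actual buckets/file groups;
  // the inflight metadata is saved only after this has happened.
  void updateOutputPartitionPathStatMap(String partitionPath, long numRecords) {
    outputPartitionPathStatMap.merge(partitionPath, numRecords, Long::sum);
  }
}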


@@ -156,19 +156,22 @@ public abstract class BaseSparkCommitActionExecutor<T extends HoodieRecordPayloa
       LOG.info("RDD PreppedRecords was persisted at: " + inputRecordsRDD.getStorageLevel());
     }
 
-    WorkloadProfile profile = null;
+    WorkloadProfile workloadProfile = null;
     if (isWorkloadProfileNeeded()) {
       context.setJobStatus(this.getClass().getSimpleName(), "Building workload profile");
-      profile = new WorkloadProfile(buildProfile(inputRecordsRDD), operationType);
-      LOG.info("Workload profile :" + profile);
-      saveWorkloadProfileMetadataToInflight(profile, instantTime);
+      workloadProfile = new WorkloadProfile(buildProfile(inputRecordsRDD), operationType, table.getIndex().canIndexLogFiles());
+      LOG.info("Input workload profile :" + workloadProfile);
     }
 
+    // partition using the insert partitioner
+    final Partitioner partitioner = getPartitioner(workloadProfile);
+    if (isWorkloadProfileNeeded()) {
+      saveWorkloadProfileMetadataToInflight(workloadProfile, instantTime);
+    }
+
     // handle records update with clustering
     JavaRDD<HoodieRecord<T>> inputRecordsRDDWithClusteringUpdate = clusteringHandleUpdate(inputRecordsRDD);
 
-    // partition using the insert partitioner
-    final Partitioner partitioner = getPartitioner(profile);
     context.setJobStatus(this.getClass().getSimpleName(), "Doing partition and writing data");
     JavaRDD<HoodieRecord<T>> partitionedRecords = partition(inputRecordsRDDWithClusteringUpdate, partitioner);
     JavaRDD<WriteStatus> writeStatusRDD = partitionedRecords.mapPartitionsWithIndex((partition, recordItr) -> {


@@ -90,7 +90,7 @@ public class SparkBucketIndexPartitioner<T extends HoodieRecordPayload<T>> exten
   private void assignUpdates(WorkloadProfile profile) {
     updatePartitionPathFileIds = new HashMap<>();
     // each update location gets a partition
-    Set<Entry<String, WorkloadStat>> partitionStatEntries = profile.getPartitionPathStatMap()
+    Set<Entry<String, WorkloadStat>> partitionStatEntries = profile.getInputPartitionPathStatMap()
         .entrySet();
     for (Entry<String, WorkloadStat> partitionStat : partitionStatEntries) {
       if (!updatePartitionPathFileIds.containsKey(partitionStat.getKey())) {


@@ -26,6 +26,7 @@ import org.apache.hudi.common.model.HoodieCommitMetadata;
 import org.apache.hudi.common.model.HoodieKey;
 import org.apache.hudi.common.model.HoodieRecordLocation;
 import org.apache.hudi.common.model.HoodieRecordPayload;
+import org.apache.hudi.common.model.HoodieWriteStat;
 import org.apache.hudi.common.table.timeline.HoodieInstant;
 import org.apache.hudi.common.table.timeline.HoodieTimeline;
 import org.apache.hudi.common.util.NumericUtils;
@@ -100,11 +101,19 @@ public class UpsertPartitioner<T extends HoodieRecordPayload<T>> extends SparkHo
   private void assignUpdates(WorkloadProfile profile) {
     // each update location gets a partition
-    Set<Entry<String, WorkloadStat>> partitionStatEntries = profile.getPartitionPathStatMap().entrySet();
+    Set<Entry<String, WorkloadStat>> partitionStatEntries = profile.getInputPartitionPathStatMap().entrySet();
     for (Map.Entry<String, WorkloadStat> partitionStat : partitionStatEntries) {
+      WorkloadStat outputWorkloadStats = profile.getOutputPartitionPathStatMap().getOrDefault(partitionStat.getKey(), new WorkloadStat());
       for (Map.Entry<String, Pair<String, Long>> updateLocEntry :
           partitionStat.getValue().getUpdateLocationToCount().entrySet()) {
         addUpdateBucket(partitionStat.getKey(), updateLocEntry.getKey());
+        if (profile.hasOutputWorkLoadStats()) {
+          HoodieRecordLocation hoodieRecordLocation = new HoodieRecordLocation(updateLocEntry.getValue().getKey(), updateLocEntry.getKey());
+          outputWorkloadStats.addUpdates(hoodieRecordLocation, updateLocEntry.getValue().getValue());
+        }
       }
+      if (profile.hasOutputWorkLoadStats()) {
+        profile.updateOutputPartitionPathStatMap(partitionStat.getKey(), outputWorkloadStats);
+      }
     }
   }
@@ -161,6 +170,7 @@ public class UpsertPartitioner<T extends HoodieRecordPayload<T>> extends SparkHo
     for (String partitionPath : partitionPaths) {
       WorkloadStat pStat = profile.getWorkloadStat(partitionPath);
+      WorkloadStat outputWorkloadStats = profile.getOutputPartitionPathStatMap().getOrDefault(partitionPath, new WorkloadStat());
       if (pStat.getNumInserts() > 0) {
         List<SmallFile> smallFiles =
@@ -189,6 +199,9 @@ public class UpsertPartitioner<T extends HoodieRecordPayload<T>> extends SparkHo
             bucket = addUpdateBucket(partitionPath, smallFile.location.getFileId());
             LOG.info("Assigning " + recordsToAppend + " inserts to new update bucket " + bucket);
           }
+          if (profile.hasOutputWorkLoadStats()) {
+            outputWorkloadStats.addInserts(smallFile.location, recordsToAppend);
+          }
           bucketNumbers.add(bucket);
           recordsPerBucket.add(recordsToAppend);
           totalUnassignedInserts -= recordsToAppend;
@@ -218,6 +231,9 @@ public class UpsertPartitioner<T extends HoodieRecordPayload<T>> extends SparkHo
           }
           BucketInfo bucketInfo = new BucketInfo(BucketType.INSERT, FSUtils.createNewFileIdPfx(), partitionPath);
           bucketInfoMap.put(totalBuckets, bucketInfo);
+          if (profile.hasOutputWorkLoadStats()) {
+            outputWorkloadStats.addInserts(new HoodieRecordLocation(HoodieWriteStat.NULL_COMMIT, bucketInfo.getFileIdPrefix()), recordsPerBucket.get(recordsPerBucket.size() - 1));
+          }
           totalBuckets++;
         }
       }
@@ -235,6 +251,9 @@ public class UpsertPartitioner<T extends HoodieRecordPayload<T>> extends SparkHo
         LOG.info("Total insert buckets for partition path " + partitionPath + " => " + insertBuckets);
         partitionPathToInsertBucketInfos.put(partitionPath, insertBuckets);
       }
+      if (profile.hasOutputWorkLoadStats()) {
+        profile.updateOutputPartitionPathStatMap(partitionPath, outputWorkloadStats);
+      }
     }
   }
@@ -302,6 +321,11 @@ public class UpsertPartitioner<T extends HoodieRecordPayload<T>> extends SparkHo
     return totalBuckets;
   }
 
+  @Override
+  public int getNumPartitions() {
+    return totalBuckets;
+  }
+
   @Override
   public int getPartition(Object key) {
     Tuple2<HoodieKey, Option<HoodieRecordLocation>> keyLocation =
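The final hunk adds a getNumPartitions() override next to getPartition(Object), exposing the bucket count through Hudi's partitioner abstraction. For illustration only, here is a stand-in showing the analogous contract on a plain Spark Partitioner; it is simplified and not the real UpsertPartitioner, whose getPartition routes by HoodieKey and record location rather than by hash.

// Illustrative stand-in: every bucket maps to one Spark partition, so the
// partitioner must report the total bucket count alongside the routing rule.
import org.apache.spark.Partitioner;

class BucketPartitionerSketch extends Partitioner {
  private final int totalBuckets;

  BucketPartitionerSketch(int totalBuckets) {
    this.totalBuckets = totalBuckets;
  }

  @Override
  public int numPartitions() {
    return totalBuckets; // Spark sizes the shuffle from this value
  }

  @Override
  public int getPartition(Object key) {
    // Stable routing: the same key always lands in the same bucket.
    return Math.floorMod(key.hashCode(), totalBuckets);
  }
}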