[HUDI-2917] rollback insert data appended to log file when using Hbase Index (#4840)
Co-authored-by: guanziyue <guanziyue@gmail.com>
This commit is contained in:
committed by
GitHub
parent
193215201c
commit
4a59876c8b
@@ -156,19 +156,22 @@ public abstract class BaseSparkCommitActionExecutor<T extends HoodieRecordPayloa
|
||||
LOG.info("RDD PreppedRecords was persisted at: " + inputRecordsRDD.getStorageLevel());
|
||||
}
|
||||
|
||||
WorkloadProfile profile = null;
|
||||
WorkloadProfile workloadProfile = null;
|
||||
if (isWorkloadProfileNeeded()) {
|
||||
context.setJobStatus(this.getClass().getSimpleName(), "Building workload profile");
|
||||
profile = new WorkloadProfile(buildProfile(inputRecordsRDD), operationType);
|
||||
LOG.info("Workload profile :" + profile);
|
||||
saveWorkloadProfileMetadataToInflight(profile, instantTime);
|
||||
workloadProfile = new WorkloadProfile(buildProfile(inputRecordsRDD), operationType, table.getIndex().canIndexLogFiles());
|
||||
LOG.info("Input workload profile :" + workloadProfile);
|
||||
}
|
||||
|
||||
// partition using the insert partitioner
|
||||
final Partitioner partitioner = getPartitioner(workloadProfile);
|
||||
if (isWorkloadProfileNeeded()) {
|
||||
saveWorkloadProfileMetadataToInflight(workloadProfile, instantTime);
|
||||
}
|
||||
|
||||
// handle records update with clustering
|
||||
JavaRDD<HoodieRecord<T>> inputRecordsRDDWithClusteringUpdate = clusteringHandleUpdate(inputRecordsRDD);
|
||||
|
||||
// partition using the insert partitioner
|
||||
final Partitioner partitioner = getPartitioner(profile);
|
||||
context.setJobStatus(this.getClass().getSimpleName(), "Doing partition and writing data");
|
||||
JavaRDD<HoodieRecord<T>> partitionedRecords = partition(inputRecordsRDDWithClusteringUpdate, partitioner);
|
||||
JavaRDD<WriteStatus> writeStatusRDD = partitionedRecords.mapPartitionsWithIndex((partition, recordItr) -> {
|
||||
|
||||
@@ -90,7 +90,7 @@ public class SparkBucketIndexPartitioner<T extends HoodieRecordPayload<T>> exten
|
||||
private void assignUpdates(WorkloadProfile profile) {
|
||||
updatePartitionPathFileIds = new HashMap<>();
|
||||
// each update location gets a partition
|
||||
Set<Entry<String, WorkloadStat>> partitionStatEntries = profile.getPartitionPathStatMap()
|
||||
Set<Entry<String, WorkloadStat>> partitionStatEntries = profile.getInputPartitionPathStatMap()
|
||||
.entrySet();
|
||||
for (Entry<String, WorkloadStat> partitionStat : partitionStatEntries) {
|
||||
if (!updatePartitionPathFileIds.containsKey(partitionStat.getKey())) {
|
||||
|
||||
@@ -26,6 +26,7 @@ import org.apache.hudi.common.model.HoodieCommitMetadata;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecordLocation;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.model.HoodieWriteStat;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.util.NumericUtils;
|
||||
@@ -100,11 +101,19 @@ public class UpsertPartitioner<T extends HoodieRecordPayload<T>> extends SparkHo
|
||||
|
||||
private void assignUpdates(WorkloadProfile profile) {
|
||||
// each update location gets a partition
|
||||
Set<Entry<String, WorkloadStat>> partitionStatEntries = profile.getPartitionPathStatMap().entrySet();
|
||||
Set<Entry<String, WorkloadStat>> partitionStatEntries = profile.getInputPartitionPathStatMap().entrySet();
|
||||
for (Map.Entry<String, WorkloadStat> partitionStat : partitionStatEntries) {
|
||||
WorkloadStat outputWorkloadStats = profile.getOutputPartitionPathStatMap().getOrDefault(partitionStat.getKey(), new WorkloadStat());
|
||||
for (Map.Entry<String, Pair<String, Long>> updateLocEntry :
|
||||
partitionStat.getValue().getUpdateLocationToCount().entrySet()) {
|
||||
addUpdateBucket(partitionStat.getKey(), updateLocEntry.getKey());
|
||||
if (profile.hasOutputWorkLoadStats()) {
|
||||
HoodieRecordLocation hoodieRecordLocation = new HoodieRecordLocation(updateLocEntry.getValue().getKey(), updateLocEntry.getKey());
|
||||
outputWorkloadStats.addUpdates(hoodieRecordLocation, updateLocEntry.getValue().getValue());
|
||||
}
|
||||
}
|
||||
if (profile.hasOutputWorkLoadStats()) {
|
||||
profile.updateOutputPartitionPathStatMap(partitionStat.getKey(), outputWorkloadStats);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -161,6 +170,7 @@ public class UpsertPartitioner<T extends HoodieRecordPayload<T>> extends SparkHo
|
||||
|
||||
for (String partitionPath : partitionPaths) {
|
||||
WorkloadStat pStat = profile.getWorkloadStat(partitionPath);
|
||||
WorkloadStat outputWorkloadStats = profile.getOutputPartitionPathStatMap().getOrDefault(partitionPath, new WorkloadStat());
|
||||
if (pStat.getNumInserts() > 0) {
|
||||
|
||||
List<SmallFile> smallFiles =
|
||||
@@ -189,6 +199,9 @@ public class UpsertPartitioner<T extends HoodieRecordPayload<T>> extends SparkHo
|
||||
bucket = addUpdateBucket(partitionPath, smallFile.location.getFileId());
|
||||
LOG.info("Assigning " + recordsToAppend + " inserts to new update bucket " + bucket);
|
||||
}
|
||||
if (profile.hasOutputWorkLoadStats()) {
|
||||
outputWorkloadStats.addInserts(smallFile.location, recordsToAppend);
|
||||
}
|
||||
bucketNumbers.add(bucket);
|
||||
recordsPerBucket.add(recordsToAppend);
|
||||
totalUnassignedInserts -= recordsToAppend;
|
||||
@@ -218,6 +231,9 @@ public class UpsertPartitioner<T extends HoodieRecordPayload<T>> extends SparkHo
|
||||
}
|
||||
BucketInfo bucketInfo = new BucketInfo(BucketType.INSERT, FSUtils.createNewFileIdPfx(), partitionPath);
|
||||
bucketInfoMap.put(totalBuckets, bucketInfo);
|
||||
if (profile.hasOutputWorkLoadStats()) {
|
||||
outputWorkloadStats.addInserts(new HoodieRecordLocation(HoodieWriteStat.NULL_COMMIT, bucketInfo.getFileIdPrefix()), recordsPerBucket.get(recordsPerBucket.size() - 1));
|
||||
}
|
||||
totalBuckets++;
|
||||
}
|
||||
}
|
||||
@@ -235,6 +251,9 @@ public class UpsertPartitioner<T extends HoodieRecordPayload<T>> extends SparkHo
|
||||
LOG.info("Total insert buckets for partition path " + partitionPath + " => " + insertBuckets);
|
||||
partitionPathToInsertBucketInfos.put(partitionPath, insertBuckets);
|
||||
}
|
||||
if (profile.hasOutputWorkLoadStats()) {
|
||||
profile.updateOutputPartitionPathStatMap(partitionPath, outputWorkloadStats);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -302,6 +321,11 @@ public class UpsertPartitioner<T extends HoodieRecordPayload<T>> extends SparkHo
|
||||
return totalBuckets;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getNumPartitions() {
|
||||
return totalBuckets;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getPartition(Object key) {
|
||||
Tuple2<HoodieKey, Option<HoodieRecordLocation>> keyLocation =
|
||||
|
||||
Reference in New Issue
Block a user