[HUDI-3624] Check all instants before starting a commit in metadata table (#5098)
This commit is contained in:
@@ -136,21 +136,29 @@ public class SparkHoodieBackedTableMetadataWriter extends HoodieBackedTableMetad
|
||||
compactIfNecessary(writeClient, instantTime);
|
||||
}
|
||||
|
||||
if (!metadataMetaClient.getActiveTimeline().filterCompletedInstants().containsInstant(instantTime)) {
|
||||
if (!metadataMetaClient.getActiveTimeline().containsInstant(instantTime)) {
|
||||
// if this is a new commit being applied to metadata for the first time
|
||||
writeClient.startCommitWithTime(instantTime);
|
||||
} else {
|
||||
// this code path refers to a re-attempted commit that got committed to metadata table, but failed in datatable.
|
||||
// for eg, lets say compaction c1 on 1st attempt succeeded in metadata table and failed before committing to datatable.
|
||||
// when retried again, data table will first rollback pending compaction. these will be applied to metadata table, but all changes
|
||||
// are upserts to metadata table and so only a new delta commit will be created.
|
||||
// once rollback is complete, compaction will be retried again, which will eventually hit this code block where the respective commit is
|
||||
// already part of completed commit. So, we have to manually remove the completed instant and proceed.
|
||||
// and it is for the same reason we enabled withAllowMultiWriteOnSameInstant for metadata table.
|
||||
HoodieInstant alreadyCompletedInstant = metadataMetaClient.getActiveTimeline().filterCompletedInstants().filter(entry -> entry.getTimestamp().equals(instantTime)).lastInstant().get();
|
||||
HoodieActiveTimeline.deleteInstantFile(metadataMetaClient.getFs(), metadataMetaClient.getMetaPath(), alreadyCompletedInstant);
|
||||
metadataMetaClient.reloadActiveTimeline();
|
||||
Option<HoodieInstant> alreadyCompletedInstant = metadataMetaClient.getActiveTimeline().filterCompletedInstants().filter(entry -> entry.getTimestamp().equals(instantTime)).lastInstant();
|
||||
if (alreadyCompletedInstant.isPresent()) {
|
||||
// this code path refers to a re-attempted commit that got committed to metadata table, but failed in datatable.
|
||||
// for eg, lets say compaction c1 on 1st attempt succeeded in metadata table and failed before committing to datatable.
|
||||
// when retried again, data table will first rollback pending compaction. these will be applied to metadata table, but all changes
|
||||
// are upserts to metadata table and so only a new delta commit will be created.
|
||||
// once rollback is complete, compaction will be retried again, which will eventually hit this code block where the respective commit is
|
||||
// already part of completed commit. So, we have to manually remove the completed instant and proceed.
|
||||
// and it is for the same reason we enabled withAllowMultiWriteOnSameInstant for metadata table.
|
||||
HoodieActiveTimeline.deleteInstantFile(metadataMetaClient.getFs(), metadataMetaClient.getMetaPath(), alreadyCompletedInstant.get());
|
||||
metadataMetaClient.reloadActiveTimeline();
|
||||
}
|
||||
// If the alreadyCompletedInstant is empty, that means there is a requested or inflight
|
||||
// instant with the same instant time. This happens for data table clean action which
|
||||
// reuses the same instant time without rollback first. It is a no-op here as the
|
||||
// clean plan is the same, so we don't need to delete the requested and inflight instant
|
||||
// files in the active timeline.
|
||||
}
|
||||
|
||||
List<WriteStatus> statuses = writeClient.upsertPreppedRecords(preppedRecordRDD, instantTime).collect();
|
||||
statuses.forEach(writeStatus -> {
|
||||
if (writeStatus.hasErrors()) {
|
||||
|
||||
Reference in New Issue
Block a user