[HUDI-1937] Rollback unfinished replace commit to allow updates (#3869)
* [HUDI-1937] Roll back an unfinished replace commit to allow updates while clustering
* Revert and delete the requested replacecommit as well
* Roll back pending clustering instants transactionally
* Avoid double locking, and add a config to enable the rollback
* Rename the config to make clear that rollback happens only on conflict
This commit is contained in:
@@ -22,10 +22,15 @@ import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieFileGroupId;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.table.action.cluster.strategy.UpdateStrategy;
|
||||
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* Allow ingestion commits during clustering job.
|
||||
@@ -37,8 +42,19 @@ public class SparkAllowUpdateStrategy<T extends HoodieRecordPayload<T>> extends
|
||||
super(engineContext, fileGroupsInPendingClustering);
|
||||
}
|
||||
|
||||
private List<HoodieFileGroupId> getGroupIdsWithUpdate(JavaRDD<HoodieRecord<T>> inputRecords) {
|
||||
List<HoodieFileGroupId> fileGroupIdsWithUpdates = inputRecords
|
||||
.filter(record -> record.getCurrentLocation() != null)
|
||||
.map(record -> new HoodieFileGroupId(record.getPartitionPath(), record.getCurrentLocation().getFileId())).distinct().collect();
|
||||
return fileGroupIdsWithUpdates;
|
||||
}
|
||||
|
||||
@Override
|
||||
public JavaRDD<HoodieRecord<T>> handleUpdate(JavaRDD<HoodieRecord<T>> taggedRecordsRDD) {
|
||||
return taggedRecordsRDD;
|
||||
public Pair<JavaRDD<HoodieRecord<T>>, Set<HoodieFileGroupId>> handleUpdate(JavaRDD<HoodieRecord<T>> taggedRecordsRDD) {
|
||||
List<HoodieFileGroupId> fileGroupIdsWithRecordUpdate = getGroupIdsWithUpdate(taggedRecordsRDD);
|
||||
Set<HoodieFileGroupId> fileGroupIdsWithUpdatesAndPendingClustering = fileGroupIdsWithRecordUpdate.stream()
|
||||
.filter(f -> fileGroupsInPendingClustering.contains(f))
|
||||
.collect(Collectors.toSet());
|
||||
return Pair.of(taggedRecordsRDD, fileGroupIdsWithUpdatesAndPendingClustering);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -22,14 +22,18 @@ import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieFileGroupId;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.exception.HoodieClusteringUpdateException;
|
||||
import org.apache.hudi.table.action.cluster.strategy.UpdateStrategy;
|
||||
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Update strategy based on following.
|
||||
@@ -50,7 +54,7 @@ public class SparkRejectUpdateStrategy<T extends HoodieRecordPayload<T>> extends
|
||||
}
|
||||
|
||||
@Override
|
||||
public JavaRDD<HoodieRecord<T>> handleUpdate(JavaRDD<HoodieRecord<T>> taggedRecordsRDD) {
|
||||
public Pair<JavaRDD<HoodieRecord<T>>, Set<HoodieFileGroupId>> handleUpdate(JavaRDD<HoodieRecord<T>> taggedRecordsRDD) {
|
||||
List<HoodieFileGroupId> fileGroupIdsWithRecordUpdate = getGroupIdsWithUpdate(taggedRecordsRDD);
|
||||
fileGroupIdsWithRecordUpdate.forEach(fileGroupIdWithRecordUpdate -> {
|
||||
if (fileGroupsInPendingClustering.contains(fileGroupIdWithRecordUpdate)) {
|
||||
@@ -61,7 +65,7 @@ public class SparkRejectUpdateStrategy<T extends HoodieRecordPayload<T>> extends
|
||||
throw new HoodieClusteringUpdateException(msg);
|
||||
}
|
||||
});
|
||||
return taggedRecordsRDD;
|
||||
return Pair.of(taggedRecordsRDD, Collections.emptySet());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -77,6 +77,8 @@ import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.apache.hudi.common.util.ClusteringUtils.getAllFileGroupsInPendingClusteringPlans;
|
||||
|
||||
public abstract class BaseSparkCommitActionExecutor<T extends HoodieRecordPayload> extends
|
||||
BaseCommitActionExecutor<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>, HoodieWriteMetadata> {
|
||||
|
||||
@@ -118,7 +120,27 @@ public abstract class BaseSparkCommitActionExecutor<T extends HoodieRecordPayloa
|
||||
table.getFileSystemView().getFileGroupsInPendingClustering().map(entry -> entry.getKey()).collect(Collectors.toSet());
|
||||
UpdateStrategy updateStrategy = (UpdateStrategy)ReflectionUtils
|
||||
.loadClass(config.getClusteringUpdatesStrategyClass(), this.context, fileGroupsInPendingClustering);
|
||||
return (JavaRDD<HoodieRecord<T>>)updateStrategy.handleUpdate(inputRecordsRDD);
|
||||
Pair<JavaRDD<HoodieRecord<T>>, Set<HoodieFileGroupId>> recordsAndPendingClusteringFileGroups =
|
||||
(Pair<JavaRDD<HoodieRecord<T>>, Set<HoodieFileGroupId>>)updateStrategy.handleUpdate(inputRecordsRDD);
|
||||
Set<HoodieFileGroupId> fileGroupsWithUpdatesAndPendingClustering = recordsAndPendingClusteringFileGroups.getRight();
|
||||
if (fileGroupsWithUpdatesAndPendingClustering.isEmpty()) {
|
||||
return recordsAndPendingClusteringFileGroups.getLeft();
|
||||
}
|
||||
// there are filegroups pending clustering and receiving updates, so rollback the pending clustering instants
|
||||
// there could be race condition, for example, if the clustering completes after instants are fetched but before rollback completed
|
||||
if (config.isRollbackPendingClustering()) {
|
||||
Set<HoodieInstant> pendingClusteringInstantsToRollback = getAllFileGroupsInPendingClusteringPlans(table.getMetaClient()).entrySet().stream()
|
||||
.filter(e -> fileGroupsWithUpdatesAndPendingClustering.contains(e.getKey()))
|
||||
.map(Map.Entry::getValue)
|
||||
.collect(Collectors.toSet());
|
||||
pendingClusteringInstantsToRollback.forEach(instant -> {
|
||||
String commitTime = HoodieActiveTimeline.createNewInstantTime();
|
||||
table.scheduleRollback(context, commitTime, instant, false, config.shouldRollbackUsingMarkers());
|
||||
table.rollback(context, commitTime, instant, true, true);
|
||||
});
|
||||
table.getMetaClient().reloadActiveTimeline();
|
||||
}
|
||||
return recordsAndPendingClusteringFileGroups.getLeft();
|
||||
} else {
|
||||
return inputRecordsRDD;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user