Implement Savepoints and required metadata timeline (#86)
- Introduce avro to save clean metadata with details about the last commit that was retained - Save rollback metadata in the meta timeline - Create savepoint metadata and add API to createSavepoint, deleteSavepoint and rollbackToSavepoint - Savepointed commit should not be rolledback or cleaned or archived - introduce cli commands to show, create and rollback to savepoints - Write unit tests to test savepoints and rollbackToSavepoints
This commit is contained in:
@@ -17,6 +17,14 @@
|
||||
package com.uber.hoodie;
|
||||
|
||||
import com.codahale.metrics.Timer;
|
||||
import com.google.common.base.Preconditions;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Maps;
|
||||
import com.uber.hoodie.avro.model.HoodieCleanMetadata;
|
||||
import com.uber.hoodie.avro.model.HoodieRollbackMetadata;
|
||||
import com.uber.hoodie.avro.model.HoodieSavepointMetadata;
|
||||
import com.uber.hoodie.common.HoodieCleanStat;
|
||||
import com.uber.hoodie.common.HoodieRollbackStat;
|
||||
import com.uber.hoodie.common.model.HoodieCommitMetadata;
|
||||
import com.uber.hoodie.common.model.HoodieKey;
|
||||
import com.uber.hoodie.common.model.HoodieRecord;
|
||||
@@ -27,12 +35,15 @@ import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import com.uber.hoodie.common.table.HoodieTimeline;
|
||||
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
|
||||
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
||||
import com.uber.hoodie.common.util.AvroUtils;
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.exception.HoodieCommitException;
|
||||
import com.uber.hoodie.exception.HoodieException;
|
||||
import com.uber.hoodie.exception.HoodieIOException;
|
||||
import com.uber.hoodie.exception.HoodieInsertException;
|
||||
import com.uber.hoodie.exception.HoodieRollbackException;
|
||||
import com.uber.hoodie.exception.HoodieSavepointException;
|
||||
import com.uber.hoodie.exception.HoodieUpsertException;
|
||||
import com.uber.hoodie.func.BulkInsertMapFunction;
|
||||
import com.uber.hoodie.index.HoodieIndex;
|
||||
@@ -62,14 +73,19 @@ import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.text.ParseException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.Date;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.spark.util.AccumulatorV2;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
import scala.Option;
|
||||
import scala.Tuple2;
|
||||
|
||||
@@ -359,7 +375,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
||||
// We cannot have unbounded commit files. Archive commits if we have to archive
|
||||
archiveLog.archiveIfRequired();
|
||||
// Call clean to cleanup if there is anything to cleanup after the commit,
|
||||
clean();
|
||||
clean(commitTime);
|
||||
if (writeContext != null) {
|
||||
long durationInMs = metrics.getDurationInMs(writeContext.stop());
|
||||
metrics.updateCommitMetrics(
|
||||
@@ -379,6 +395,143 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Savepoint the latest commit. The data files and commit files for that commit will never be rolledback,
|
||||
* cleaned or archived. This gives an option to rollback the state to the savepoint anytime.
|
||||
* Savepoint needs to be manually created and deleted.
|
||||
*
|
||||
* Savepoint should be on a commit that is not cleaned.
|
||||
*
|
||||
* @param savePointMetadata - metadata about the savepoint
|
||||
* @return true if the savepoint was created successfully
|
||||
*/
|
||||
public boolean savepoint(HoodieSavepointMetadata savePointMetadata) {
|
||||
HoodieTable<T> table = HoodieTable
|
||||
.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config);
|
||||
if (table.getCompletedCommitTimeline().empty()) {
|
||||
throw new HoodieSavepointException("Could not savepoint. Commit timeline is empty");
|
||||
}
|
||||
|
||||
String latestCommit = table.getCompletedCommitTimeline().lastInstant().get().getTimestamp();
|
||||
logger.info("Savepointing latest commit " + latestCommit);
|
||||
return savepoint(latestCommit, savePointMetadata);
|
||||
}
|
||||
|
||||
/**
|
||||
* Savepoint a specific commit. The data files and commit files for that commit will never be rolledback,
|
||||
* cleaned or archived. This gives an option to rollback the state to the savepoint anytime.
|
||||
* Savepoint needs to be manually created and deleted.
|
||||
*
|
||||
* Savepoint should be on a commit that is not cleaned.
|
||||
*
|
||||
* @param savePointMetadata - metadata about the savepoint
|
||||
* @return true if the savepoint was created successfully
|
||||
*/
|
||||
public boolean savepoint(String commitTime, HoodieSavepointMetadata savePointMetadata) {
|
||||
HoodieTable<T> table = HoodieTable
|
||||
.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config);
|
||||
Optional<HoodieInstant> cleanInstant = table.getCompletedCleanTimeline().lastInstant();
|
||||
|
||||
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
|
||||
if(!table.getCompletedCommitTimeline().containsInstant(commitInstant)) {
|
||||
throw new HoodieSavepointException("Could not savepoint non-existing commit " + commitInstant);
|
||||
}
|
||||
|
||||
try {
|
||||
// Check the last commit that was not cleaned and check if savepoint time is > that commit
|
||||
String lastCommitRetained;
|
||||
if (cleanInstant.isPresent()) {
|
||||
HoodieCleanMetadata cleanMetadata = AvroUtils.deserializeHoodieCleanMetadata(
|
||||
table.getActiveTimeline().getInstantDetails(cleanInstant.get()).get());
|
||||
lastCommitRetained = cleanMetadata.getEarliestCommitToRetain();
|
||||
} else {
|
||||
lastCommitRetained =
|
||||
table.getCompletedCommitTimeline().firstInstant().get().getTimestamp();
|
||||
}
|
||||
|
||||
// Cannot allow savepoint time on a commit that could have been cleaned
|
||||
Preconditions.checkArgument(table.getActiveTimeline()
|
||||
.compareTimestamps(commitTime, lastCommitRetained, HoodieTimeline.GREATER_OR_EQUAL),
|
||||
"Could not savepoint commit " + commitTime + " as this is beyond the lookup window "
|
||||
+ lastCommitRetained);
|
||||
|
||||
// Nothing to save in the savepoint
|
||||
table.getActiveTimeline().saveAsComplete(
|
||||
new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION, commitTime),
|
||||
AvroUtils.serializeSavepointMetadata(savePointMetadata));
|
||||
logger.info("Savepoint " + commitTime + " created");
|
||||
return true;
|
||||
} catch (IOException e) {
|
||||
throw new HoodieSavepointException("Failed to savepoint " + commitTime, e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete a savepoint that was created. Once the savepoint is deleted, the commit can be rolledback
|
||||
* and cleaner may clean up data files.
|
||||
*
|
||||
* @param savepointTime - delete the savepoint
|
||||
* @return true if the savepoint was deleted successfully
|
||||
*/
|
||||
public void deleteSavepoint(String savepointTime) {
|
||||
HoodieTable<T> table = HoodieTable
|
||||
.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config);
|
||||
HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
|
||||
|
||||
HoodieInstant savePoint =
|
||||
new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION, savepointTime);
|
||||
boolean isSavepointPresent =
|
||||
table.getCompletedSavepointTimeline().containsInstant(savePoint);
|
||||
if (!isSavepointPresent) {
|
||||
logger.warn("No savepoint present " + savepointTime);
|
||||
return;
|
||||
}
|
||||
|
||||
activeTimeline.revertToInflight(savePoint);
|
||||
activeTimeline.deleteInflight(
|
||||
new HoodieInstant(true, HoodieTimeline.SAVEPOINT_ACTION, savepointTime));
|
||||
logger.info("Savepoint " + savepointTime + " deleted");
|
||||
}
|
||||
|
||||
/**
|
||||
* Rollback the state to the savepoint.
|
||||
* WARNING: This rollsback recent commits and deleted data files. Queries accessing the files
|
||||
* will mostly fail. This should be done during a downtime.
|
||||
*
|
||||
* @param savepointTime - savepoint time to rollback to
|
||||
* @return true if the savepoint was rollecback to successfully
|
||||
*/
|
||||
public boolean rollbackToSavepoint(String savepointTime) {
|
||||
HoodieTable<T> table = HoodieTable
|
||||
.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config);
|
||||
HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
|
||||
HoodieTimeline commitTimeline = table.getCompletedCommitTimeline();
|
||||
|
||||
HoodieInstant savePoint =
|
||||
new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION, savepointTime);
|
||||
boolean isSavepointPresent =
|
||||
table.getCompletedSavepointTimeline().containsInstant(savePoint);
|
||||
if (!isSavepointPresent) {
|
||||
throw new HoodieRollbackException("No savepoint for commitTime " + savepointTime);
|
||||
}
|
||||
|
||||
List<String> commitsToRollback =
|
||||
commitTimeline.findInstantsAfter(savepointTime, Integer.MAX_VALUE).getInstants()
|
||||
.map(HoodieInstant::getTimestamp).collect(Collectors.toList());
|
||||
logger.info("Rolling back commits " + commitsToRollback);
|
||||
|
||||
rollback(commitsToRollback);
|
||||
|
||||
// Make sure the rollback was successful
|
||||
Optional<HoodieInstant> lastInstant =
|
||||
activeTimeline.reload().getCommitTimeline().filterCompletedInstants().lastInstant();
|
||||
Preconditions.checkArgument(lastInstant.isPresent());
|
||||
Preconditions.checkArgument(lastInstant.get().getTimestamp().equals(savepointTime),
|
||||
savepointTime + "is not the last commit after rolling back " + commitsToRollback
|
||||
+ ", last commit was " + lastInstant.get().getTimestamp());
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Rollback the (inflight/committed) record changes with the given commit time.
|
||||
* Three steps:
|
||||
@@ -388,81 +541,135 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
||||
* (4) Finally delete .commit or .inflight file,
|
||||
*/
|
||||
public boolean rollback(final String commitTime) throws HoodieRollbackException {
|
||||
rollback(Lists.newArrayList(commitTime));
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
private void rollback(List<String> commits) {
|
||||
if(commits.isEmpty()) {
|
||||
logger.info("List of commits to rollback is empty");
|
||||
return;
|
||||
}
|
||||
|
||||
final Timer.Context context = metrics.getRollbackCtx();
|
||||
String startRollbackTime = HoodieActiveTimeline.COMMIT_FORMATTER.format(new Date());
|
||||
|
||||
// Create a Hoodie table which encapsulated the commits and files visible
|
||||
HoodieTable<T> table = HoodieTable
|
||||
.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config);
|
||||
HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
|
||||
HoodieTimeline inflightTimeline = activeTimeline.getCommitTimeline().filterInflights();
|
||||
HoodieTimeline commitTimeline = activeTimeline.getCommitTimeline().filterCompletedInstants();
|
||||
HoodieTimeline inflightTimeline = table.getInflightCommitTimeline();
|
||||
HoodieTimeline commitTimeline = table.getCompletedCommitTimeline();
|
||||
|
||||
// Check if any of the commits is a savepoint - do not allow rollback on those commits
|
||||
List<String> savepoints =
|
||||
table.getCompletedSavepointTimeline().getInstants().map(HoodieInstant::getTimestamp)
|
||||
.collect(Collectors.toList());
|
||||
commits.forEach(s -> {
|
||||
if (savepoints.contains(s)) {
|
||||
throw new HoodieRollbackException(
|
||||
"Could not rollback a savepointed commit. Delete savepoint first before rolling back"
|
||||
+ s);
|
||||
}
|
||||
});
|
||||
|
||||
try {
|
||||
if (commitTimeline.lastInstant().isPresent()
|
||||
&& !commitTimeline.findInstantsAfter(commitTime, Integer.MAX_VALUE).empty()) {
|
||||
throw new HoodieRollbackException("Found commits after time :" + commitTime +
|
||||
if (commitTimeline.empty() && inflightTimeline.empty()) {
|
||||
// nothing to rollback
|
||||
logger.info("No commits to rollback " + commits);
|
||||
}
|
||||
|
||||
// Make sure only the last n commits are being rolled back
|
||||
// If there is a commit in-between or after that is not rolled back, then abort
|
||||
String lastCommit = commits.get(commits.size() - 1);
|
||||
if (!commitTimeline.empty() && !commitTimeline
|
||||
.findInstantsAfter(lastCommit, Integer.MAX_VALUE).empty()) {
|
||||
throw new HoodieRollbackException("Found commits after time :" + lastCommit +
|
||||
", please rollback greater commits first");
|
||||
}
|
||||
|
||||
List<String> inflights = inflightTimeline.getInstants().map(HoodieInstant::getTimestamp)
|
||||
.collect(Collectors.toList());
|
||||
if (!inflights.isEmpty() && inflights.indexOf(commitTime) != inflights.size() - 1) {
|
||||
throw new HoodieRollbackException(
|
||||
"Found in-flight commits after time :" + commitTime +
|
||||
", please rollback greater commits first");
|
||||
if (!inflights.isEmpty() && inflights.indexOf(lastCommit) != inflights.size() - 1) {
|
||||
throw new HoodieRollbackException(
|
||||
"Found in-flight commits after time :" + lastCommit +
|
||||
", please rollback greater commits first");
|
||||
}
|
||||
|
||||
if (inflights.contains(commitTime) || (commitTimeline.lastInstant().isPresent()
|
||||
&& commitTimeline.lastInstant().get().getTimestamp().equals(commitTime))) {
|
||||
// 1. Atomically unpublish this commit
|
||||
if(!inflights.contains(commitTime)) {
|
||||
// This is completed commit, first revert it to inflight to unpublish data
|
||||
activeTimeline.revertToInflight(
|
||||
new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime));
|
||||
}
|
||||
// 2. Revert the index changes
|
||||
logger.info("Clean out index changes at time: " + commitTime);
|
||||
if (!index.rollbackCommit(commitTime)) {
|
||||
throw new HoodieRollbackException(
|
||||
"Clean out index changes failed, for time :" + commitTime);
|
||||
}
|
||||
// Atomically unpublish all the commits
|
||||
commits.stream().filter(s -> !inflights.contains(s))
|
||||
.map(s -> new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, s))
|
||||
.forEach(activeTimeline::revertToInflight);
|
||||
logger.info("Unpublished " + commits);
|
||||
|
||||
// 3. Delete the new generated parquet files
|
||||
logger.info("Clean out all parquet files generated at time: " + commitTime);
|
||||
final Accumulator<Integer> numFilesDeletedAccu = jsc.accumulator(0);
|
||||
jsc.parallelize(
|
||||
FSUtils.getAllPartitionPaths(fs, table.getMetaClient().getBasePath()))
|
||||
.foreach((VoidFunction<String>) partitionPath -> {
|
||||
// Scan all partitions files with this commit time
|
||||
FileSystem fs1 = FSUtils.getFs();
|
||||
FileStatus[] toBeDeleted =
|
||||
fs1.listStatus(new Path(config.getBasePath(), partitionPath),
|
||||
path -> {
|
||||
return commitTime
|
||||
.equals(FSUtils.getCommitTime(path.getName()));
|
||||
});
|
||||
for (FileStatus file : toBeDeleted) {
|
||||
boolean success = fs1.delete(file.getPath(), false);
|
||||
logger.info("Delete file " + file.getPath() + "\t" + success);
|
||||
if (success) {
|
||||
numFilesDeletedAccu.add(1);
|
||||
// cleanup index entries
|
||||
commits.stream().forEach(s -> {
|
||||
if (!index.rollbackCommit(s)) {
|
||||
throw new HoodieRollbackException(
|
||||
"Clean out index changes failed, for time :" + s);
|
||||
}
|
||||
});
|
||||
logger.info("Index rolled back for commits " + commits);
|
||||
|
||||
// delete all the data files for all these commits
|
||||
logger.info("Clean out all parquet files generated for commits: " + commits);
|
||||
final LongAccumulator numFilesDeletedCounter = jsc.sc().longAccumulator();
|
||||
List<HoodieRollbackStat> stats = jsc.parallelize(
|
||||
FSUtils.getAllPartitionPaths(fs, table.getMetaClient().getBasePath()))
|
||||
.map((Function<String, HoodieRollbackStat>) partitionPath -> {
|
||||
// Scan all partitions files with this commit time
|
||||
logger.info("Cleaning path " + partitionPath);
|
||||
FileSystem fs1 = FSUtils.getFs();
|
||||
FileStatus[] toBeDeleted =
|
||||
fs1.listStatus(new Path(config.getBasePath(), partitionPath), path -> {
|
||||
if(!path.toString().contains(".parquet")) {
|
||||
return false;
|
||||
}
|
||||
String fileCommitTime = FSUtils.getCommitTime(path.getName());
|
||||
return commits.contains(fileCommitTime);
|
||||
});
|
||||
Map<FileStatus, Boolean> results = Maps.newHashMap();
|
||||
for (FileStatus file : toBeDeleted) {
|
||||
boolean success = fs1.delete(file.getPath(), false);
|
||||
results.put(file, success);
|
||||
logger.info("Delete file " + file.getPath() + "\t" + success);
|
||||
if (success) {
|
||||
numFilesDeletedCounter.add(1);
|
||||
}
|
||||
});
|
||||
// 4. Remove commit
|
||||
logger.info("Clean out metadata files at time: " + commitTime);
|
||||
activeTimeline.deleteInflight(
|
||||
new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, commitTime));
|
||||
}
|
||||
return HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath)
|
||||
.withDeletedFileResults(results).build();
|
||||
}).collect();
|
||||
|
||||
if (context != null) {
|
||||
long durationInMs = metrics.getDurationInMs(context.stop());
|
||||
int numFilesDeleted = numFilesDeletedAccu.value();
|
||||
metrics.updateRollbackMetrics(durationInMs, numFilesDeleted);
|
||||
}
|
||||
// Remove the rolled back inflight commits
|
||||
commits.stream().map(s -> new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, s))
|
||||
.forEach(activeTimeline::deleteInflight);
|
||||
logger.info("Deleted inflight commits " + commits);
|
||||
|
||||
Optional<Long> durationInMs = Optional.empty();
|
||||
if (context != null) {
|
||||
durationInMs = Optional.of(metrics.getDurationInMs(context.stop()));
|
||||
Long numFilesDeleted = numFilesDeletedCounter.value();
|
||||
metrics.updateRollbackMetrics(durationInMs.get(), numFilesDeleted);
|
||||
}
|
||||
HoodieRollbackMetadata rollbackMetadata =
|
||||
AvroUtils.convertRollbackMetadata(startRollbackTime, durationInMs, commits, stats);
|
||||
table.getActiveTimeline().saveAsComplete(
|
||||
new HoodieInstant(false, HoodieTimeline.ROLLBACK_ACTION, startRollbackTime),
|
||||
AvroUtils.serializeRollbackMetadata(rollbackMetadata));
|
||||
logger.info("Commits " + commits + " rollback is complete");
|
||||
|
||||
if (!table.getActiveTimeline().getCleanerTimeline().empty()) {
|
||||
logger.info("Cleaning up older rollback meta files");
|
||||
// Cleanup of older cleaner meta files
|
||||
// TODO - make the commit archival generic and archive rollback metadata
|
||||
FSUtils.deleteOlderRollbackMetaFiles(fs, table.getMetaClient().getMetaPath(),
|
||||
table.getActiveTimeline().getRollbackTimeline().getInstants());
|
||||
}
|
||||
return true;
|
||||
} catch (IOException e) {
|
||||
throw new HoodieRollbackException("Failed to rollback " +
|
||||
config.getBasePath() + " at commit time" + commitTime, e);
|
||||
config.getBasePath() + " commits " + commits, e);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -476,37 +683,58 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
||||
/**
|
||||
* Clean up any stale/old files/data lying around (either on file storage or index storage)
|
||||
*/
|
||||
private void clean() throws HoodieIOException {
|
||||
private void clean(String startCleanTime) throws HoodieIOException {
|
||||
try {
|
||||
logger.info("Cleaner started");
|
||||
final Timer.Context context = metrics.getCleanCtx();
|
||||
|
||||
// Create a Hoodie table which encapsulated the commits and files visible
|
||||
HoodieTable<T> table = HoodieTable
|
||||
.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config);
|
||||
|
||||
List<String> partitionsToClean = FSUtils.getAllPartitionPaths(fs, table.getMetaClient().getBasePath());
|
||||
List<String> partitionsToClean =
|
||||
FSUtils.getAllPartitionPaths(fs, table.getMetaClient().getBasePath());
|
||||
// shuffle to distribute cleaning work across partitions evenly
|
||||
Collections.shuffle(partitionsToClean);
|
||||
logger.info("Partitions to clean up : " + partitionsToClean + ", with policy " + config.getCleanerPolicy());
|
||||
if(partitionsToClean.isEmpty()) {
|
||||
logger.info("Partitions to clean up : " + partitionsToClean + ", with policy " + config
|
||||
.getCleanerPolicy());
|
||||
if (partitionsToClean.isEmpty()) {
|
||||
logger.info("Nothing to clean here mom. It is already clean");
|
||||
return;
|
||||
}
|
||||
|
||||
int cleanerParallelism = Math.min(partitionsToClean.size(), config.getCleanerParallelism());
|
||||
int numFilesDeleted = jsc.parallelize(partitionsToClean, cleanerParallelism)
|
||||
.map((Function<String, Integer>) partitionPathToClean -> {
|
||||
List<HoodieCleanStat> cleanStats = jsc.parallelize(partitionsToClean, cleanerParallelism)
|
||||
.map((Function<String, HoodieCleanStat>) partitionPathToClean -> {
|
||||
HoodieCleaner cleaner = new HoodieCleaner(table, config);
|
||||
return cleaner.clean(partitionPathToClean);
|
||||
})
|
||||
.reduce((Function2<Integer, Integer, Integer>) (v1, v2) -> v1 + v2);
|
||||
.collect();
|
||||
|
||||
logger.info("Cleaned " + numFilesDeleted + " files");
|
||||
// Emit metrics (duration, numFilesDeleted) if needed
|
||||
Optional<Long> durationInMs = Optional.empty();
|
||||
if (context != null) {
|
||||
long durationInMs = metrics.getDurationInMs(context.stop());
|
||||
logger.info("cleanerElaspsedTime (Minutes): " + durationInMs / (1000 * 60));
|
||||
metrics.updateCleanMetrics(durationInMs, numFilesDeleted);
|
||||
durationInMs = Optional.of(metrics.getDurationInMs(context.stop()));
|
||||
logger.info("cleanerElaspsedTime (Minutes): " + durationInMs.get() / (1000 * 60));
|
||||
}
|
||||
|
||||
// Create the metadata and save it
|
||||
HoodieCleanMetadata metadata =
|
||||
AvroUtils.convertCleanMetadata(startCleanTime, durationInMs, cleanStats);
|
||||
logger.info("Cleaned " + metadata.getTotalFilesDeleted() + " files");
|
||||
metrics.updateCleanMetrics(durationInMs.orElseGet(() -> -1L),
|
||||
metadata.getTotalFilesDeleted());
|
||||
|
||||
table.getActiveTimeline().saveAsComplete(
|
||||
new HoodieInstant(false, HoodieTimeline.CLEAN_ACTION, startCleanTime),
|
||||
AvroUtils.serializeCleanMetadata(metadata));
|
||||
logger.info("Marked clean started on " + startCleanTime + " as complete");
|
||||
|
||||
if (!table.getActiveTimeline().getCleanerTimeline().empty()) {
|
||||
// Cleanup of older cleaner meta files
|
||||
// TODO - make the commit archival generic and archive clean metadata
|
||||
FSUtils.deleteOlderCleanMetaFiles(fs, table.getMetaClient().getMetaPath(),
|
||||
table.getActiveTimeline().getCleanerTimeline().getInstants());
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new HoodieIOException("Failed to clean up after commit", e);
|
||||
|
||||
@@ -17,7 +17,7 @@
|
||||
package com.uber.hoodie.config;
|
||||
|
||||
import com.google.common.base.Preconditions;
|
||||
import com.uber.hoodie.io.HoodieCleaner;
|
||||
import com.uber.hoodie.common.model.HoodieCleaningPolicy;
|
||||
|
||||
import javax.annotation.concurrent.Immutable;
|
||||
import java.io.File;
|
||||
@@ -32,7 +32,7 @@ import java.util.Properties;
|
||||
public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
||||
public static final String CLEANER_POLICY_PROP = "hoodie.cleaner.policy";
|
||||
private static final String DEFAULT_CLEANER_POLICY =
|
||||
HoodieCleaner.CleaningPolicy.KEEP_LATEST_COMMITS.name();
|
||||
HoodieCleaningPolicy.KEEP_LATEST_COMMITS.name();
|
||||
|
||||
public static final String CLEANER_FILE_VERSIONS_RETAINED_PROP =
|
||||
"hoodie.cleaner.fileversions.retained";
|
||||
@@ -94,7 +94,7 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
||||
}
|
||||
}
|
||||
|
||||
public Builder withCleanerPolicy(HoodieCleaner.CleaningPolicy policy) {
|
||||
public Builder withCleanerPolicy(HoodieCleaningPolicy policy) {
|
||||
props.setProperty(CLEANER_POLICY_PROP, policy.name());
|
||||
return this;
|
||||
}
|
||||
@@ -164,7 +164,7 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
||||
setDefaultOnCondition(props, !props.containsKey(CLEANER_PARALLELISM),
|
||||
CLEANER_PARALLELISM, DEFAULT_CLEANER_PARALLELISM);
|
||||
|
||||
HoodieCleaner.CleaningPolicy.valueOf(props.getProperty(CLEANER_POLICY_PROP));
|
||||
HoodieCleaningPolicy.valueOf(props.getProperty(CLEANER_POLICY_PROP));
|
||||
Preconditions.checkArgument(
|
||||
Integer.parseInt(props.getProperty(MAX_COMMITS_TO_KEEP)) > Integer
|
||||
.parseInt(props.getProperty(MIN_COMMITS_TO_KEEP)));
|
||||
|
||||
@@ -18,6 +18,7 @@ package com.uber.hoodie.config;
|
||||
|
||||
|
||||
import com.google.common.base.Preconditions;
|
||||
import com.uber.hoodie.common.model.HoodieCleaningPolicy;
|
||||
import com.uber.hoodie.index.HoodieIndex;
|
||||
import com.uber.hoodie.io.HoodieCleaner;
|
||||
import com.uber.hoodie.metrics.MetricsReporterType;
|
||||
@@ -97,8 +98,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
|
||||
/**
|
||||
* compaction properties
|
||||
**/
|
||||
public HoodieCleaner.CleaningPolicy getCleanerPolicy() {
|
||||
return HoodieCleaner.CleaningPolicy
|
||||
public HoodieCleaningPolicy getCleanerPolicy() {
|
||||
return HoodieCleaningPolicy
|
||||
.valueOf(props.getProperty(HoodieCompactionConfig.CLEANER_POLICY_PROP));
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,28 @@
|
||||
/*
|
||||
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.uber.hoodie.exception;
|
||||
|
||||
public class HoodieSavepointException extends HoodieException {
|
||||
|
||||
public HoodieSavepointException(String msg, Throwable e) {
|
||||
super(msg, e);
|
||||
}
|
||||
|
||||
public HoodieSavepointException(String msg) {
|
||||
super(msg);
|
||||
}
|
||||
}
|
||||
@@ -16,6 +16,10 @@
|
||||
|
||||
package com.uber.hoodie.io;
|
||||
|
||||
import com.clearspring.analytics.util.Lists;
|
||||
import com.google.common.collect.Maps;
|
||||
import com.uber.hoodie.common.HoodieCleanStat;
|
||||
import com.uber.hoodie.common.model.HoodieCleaningPolicy;
|
||||
import com.uber.hoodie.common.model.HoodieDataFile;
|
||||
import com.uber.hoodie.common.table.HoodieTimeline;
|
||||
import com.uber.hoodie.common.table.TableFileSystemView;
|
||||
@@ -33,6 +37,8 @@ import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
@@ -47,13 +53,6 @@ import java.util.stream.Collectors;
|
||||
public class HoodieCleaner {
|
||||
private static Logger logger = LogManager.getLogger(HoodieCleaner.class);
|
||||
|
||||
|
||||
public enum CleaningPolicy {
|
||||
KEEP_LATEST_FILE_VERSIONS,
|
||||
KEEP_LATEST_COMMITS
|
||||
}
|
||||
|
||||
|
||||
private final TableFileSystemView fileSystemView;
|
||||
private final HoodieTimeline commitTimeline;
|
||||
private HoodieTable hoodieTable;
|
||||
@@ -86,13 +85,18 @@ public class HoodieCleaner {
|
||||
fileSystemView.getEveryVersionInPartition(partitionPath)
|
||||
.collect(Collectors.toList());
|
||||
List<String> deletePaths = new ArrayList<>();
|
||||
List<String> savepoints = hoodieTable.getSavepoints();
|
||||
|
||||
for (List<HoodieDataFile> versionsForFileId : fileVersions) {
|
||||
int keepVersions = config.getCleanerFileVersionsRetained();
|
||||
Iterator<HoodieDataFile> commitItr = versionsForFileId.iterator();
|
||||
while (commitItr.hasNext() && keepVersions > 0) {
|
||||
// Skip this most recent version
|
||||
commitItr.next();
|
||||
HoodieDataFile next = commitItr.next();
|
||||
if(savepoints.contains(next.getCommitTime())) {
|
||||
// do not clean datafiles that are savepointed
|
||||
continue;
|
||||
}
|
||||
keepVersions--;
|
||||
}
|
||||
// Delete the remaining files
|
||||
@@ -130,6 +134,8 @@ public class HoodieCleaner {
|
||||
"Cleaning " + partitionPath + ", retaining latest " + commitsRetained + " commits. ");
|
||||
List<String> deletePaths = new ArrayList<>();
|
||||
|
||||
List<String> savepoints = hoodieTable.getSavepoints();
|
||||
|
||||
// determine if we have enough commits, to start cleaning.
|
||||
if (commitTimeline.countInstants() > commitsRetained) {
|
||||
HoodieInstant earliestCommitToRetain =
|
||||
@@ -146,6 +152,10 @@ public class HoodieCleaner {
|
||||
// i.e always spare the last commit.
|
||||
for (HoodieDataFile afile : fileList) {
|
||||
String fileCommitTime = afile.getCommitTime();
|
||||
if(savepoints.contains(fileCommitTime)) {
|
||||
// do not clean up a savepoint data file
|
||||
continue;
|
||||
}
|
||||
// Dont delete the latest commit and also the last commit before the earliest commit we are retaining
|
||||
// The window of commit retain == max query run time. So a query could be running which still
|
||||
// uses this file.
|
||||
@@ -196,30 +206,42 @@ public class HoodieCleaner {
|
||||
*
|
||||
* @throws IllegalArgumentException if unknown cleaning policy is provided
|
||||
*/
|
||||
public int clean(String partitionPath) throws IOException {
|
||||
CleaningPolicy policy = config.getCleanerPolicy();
|
||||
public HoodieCleanStat clean(String partitionPath) throws IOException {
|
||||
HoodieCleaningPolicy policy = config.getCleanerPolicy();
|
||||
List<String> deletePaths;
|
||||
if (policy == CleaningPolicy.KEEP_LATEST_COMMITS) {
|
||||
Optional<HoodieInstant> earliestCommitToRetain = Optional.empty();
|
||||
if (policy == HoodieCleaningPolicy.KEEP_LATEST_COMMITS) {
|
||||
deletePaths = getFilesToCleanKeepingLatestCommits(partitionPath);
|
||||
} else if (policy == CleaningPolicy.KEEP_LATEST_FILE_VERSIONS) {
|
||||
int commitsRetained = config.getCleanerCommitsRetained();
|
||||
if (commitTimeline.countInstants() > commitsRetained) {
|
||||
earliestCommitToRetain =
|
||||
commitTimeline.nthInstant(commitTimeline.countInstants() - commitsRetained);
|
||||
}
|
||||
} else if (policy == HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS) {
|
||||
deletePaths = getFilesToCleanKeepingLatestVersions(partitionPath);
|
||||
} else {
|
||||
throw new IllegalArgumentException("Unknown cleaning policy : " + policy.name());
|
||||
}
|
||||
|
||||
// perform the actual deletes
|
||||
Map<FileStatus, Boolean> deletedFiles = Maps.newHashMap();
|
||||
for (String deletePath : deletePaths) {
|
||||
logger.info("Working on delete path :" + deletePath);
|
||||
FileStatus[] deleteVersions = fs.globStatus(new Path(deletePath));
|
||||
if (deleteVersions != null) {
|
||||
for (FileStatus deleteVersion : deleteVersions) {
|
||||
if (fs.delete(deleteVersion.getPath(), false)) {
|
||||
logger.info("Cleaning file at path :" + deleteVersion.getPath());
|
||||
boolean deleteResult = fs.delete(deleteVersion.getPath(), false);
|
||||
deletedFiles.put(deleteVersion, deleteResult);
|
||||
if (deleteResult) {
|
||||
logger.info("Cleaned file at path :" + deleteVersion.getPath());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
logger.info(deletePaths.size() + " files deleted for partition path:" + partitionPath);
|
||||
return deletePaths.size();
|
||||
|
||||
logger.info(deletePaths.size() + " patterns used to delete in partition path:" + partitionPath);
|
||||
return HoodieCleanStat.newBuilder().withPolicy(policy).withDeletePathPattern(deletePaths)
|
||||
.withPartitionPath(partitionPath).withEarliestCommitRetained(earliestCommitToRetain)
|
||||
.withDeletedFileResults(deletedFiles).build();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -25,6 +25,7 @@ import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.common.file.HoodieAppendLog;
|
||||
import com.uber.hoodie.exception.HoodieCommitException;
|
||||
import com.uber.hoodie.exception.HoodieIOException;
|
||||
import com.uber.hoodie.table.HoodieTable;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.Text;
|
||||
@@ -71,17 +72,17 @@ public class HoodieCommitArchiveLog {
|
||||
}
|
||||
|
||||
private Stream<HoodieInstant> getCommitsToArchive() {
|
||||
|
||||
int maxCommitsToKeep = config.getMaxCommitsToKeep();
|
||||
int minCommitsToKeep = config.getMinCommitsToKeep();
|
||||
|
||||
HoodieTableMetaClient metaClient =
|
||||
new HoodieTableMetaClient(fs, config.getBasePath(), true);
|
||||
HoodieTimeline commitTimeline =
|
||||
metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants();
|
||||
HoodieTable table = HoodieTable.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config);
|
||||
HoodieTimeline commitTimeline = table.getCompletedCommitTimeline();
|
||||
List<String> savepoints = table.getSavepoints();
|
||||
|
||||
if (!commitTimeline.empty() && commitTimeline.countInstants() > maxCommitsToKeep) {
|
||||
// Actually do the commits
|
||||
return commitTimeline.getInstants()
|
||||
return commitTimeline.getInstants().filter(s -> !savepoints.contains(s.getTimestamp()))
|
||||
.limit(commitTimeline.countInstants() - minCommitsToKeep);
|
||||
}
|
||||
return Stream.empty();
|
||||
|
||||
@@ -98,7 +98,7 @@ public class HoodieMetrics {
|
||||
}
|
||||
}
|
||||
|
||||
public void updateRollbackMetrics(long durationInMs, int numFilesDeleted) {
|
||||
public void updateRollbackMetrics(long durationInMs, long numFilesDeleted) {
|
||||
if (config.isMetricsOn()) {
|
||||
logger.info(String.format("Sending rollback metrics (duration=%d, numFilesDeleted=$d)",
|
||||
durationInMs, numFilesDeleted));
|
||||
|
||||
@@ -21,6 +21,7 @@ import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import com.uber.hoodie.common.table.HoodieTimeline;
|
||||
import com.uber.hoodie.common.table.TableFileSystemView;
|
||||
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
|
||||
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
||||
import com.uber.hoodie.common.table.view.HoodieTableFileSystemView;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.WriteStatus;
|
||||
@@ -35,6 +36,7 @@ import org.apache.spark.Partitioner;
|
||||
import java.io.Serializable;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* Abstract implementation of a HoodieTable
|
||||
@@ -111,6 +113,36 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
||||
return getCommitTimeline().filterCompletedInstants();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get only the inflights (no-completed) commit timeline
|
||||
* @return
|
||||
*/
|
||||
public HoodieTimeline getInflightCommitTimeline() {
|
||||
return getCommitTimeline().filterInflights();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Get only the completed (no-inflights) clean timeline
|
||||
* @return
|
||||
*/
|
||||
public HoodieTimeline getCompletedCleanTimeline() {
|
||||
return getActiveTimeline().getCleanerTimeline().filterCompletedInstants();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get only the completed (no-inflights) savepoint timeline
|
||||
* @return
|
||||
*/
|
||||
public HoodieTimeline getCompletedSavepointTimeline() {
|
||||
return getActiveTimeline().getSavePointTimeline().filterCompletedInstants();
|
||||
}
|
||||
|
||||
public List<String> getSavepoints() {
|
||||
return getCompletedSavepointTimeline().getInstants().map(HoodieInstant::getTimestamp)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
public HoodieActiveTimeline getActiveTimeline() {
|
||||
return metaClient.getActiveTimeline();
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user