1
0

Ensure Cleaner and Archiver do not delete file-slices and workload marked for compaction

This commit is contained in:
Balaji Varadarajan
2018-05-31 14:16:19 -07:00
committed by vinoth chandar
parent 0a0451a765
commit 9b78523d62
10 changed files with 666 additions and 76 deletions

View File

@@ -16,6 +16,7 @@
package com.uber.hoodie.io;
import com.uber.hoodie.common.model.CompactionOperation;
import com.uber.hoodie.common.model.FileSlice;
import com.uber.hoodie.common.model.HoodieCleaningPolicy;
import com.uber.hoodie.common.model.HoodieDataFile;
@@ -25,14 +26,17 @@ import com.uber.hoodie.common.model.HoodieTableType;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.TableFileSystemView;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.common.table.view.HoodieTableFileSystemView;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.table.HoodieTable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
@@ -48,6 +52,7 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
private final TableFileSystemView fileSystemView;
private final HoodieTimeline commitTimeline;
private final Map<String, CompactionOperation> fileIdToPendingCompactionOperations;
private HoodieTable<T> hoodieTable;
private HoodieWriteConfig config;
@@ -56,9 +61,12 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
this.fileSystemView = hoodieTable.getCompletedFileSystemView();
this.commitTimeline = hoodieTable.getCompletedCommitTimeline();
this.config = config;
this.fileIdToPendingCompactionOperations =
((HoodieTableFileSystemView)hoodieTable.getRTFileSystemView()).getFileIdToPendingCompaction().entrySet()
.stream().map(entry -> Pair.of(entry.getKey(), entry.getValue().getValue()))
.collect(Collectors.toMap(Pair::getKey, Pair::getValue));
}
/**
* Selects the older versions of files for cleaning, such that it bounds the number of versions of
* each file. This policy is useful, if you are simply interested in querying the table, and you
@@ -81,8 +89,8 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
while (fileSliceIterator.hasNext() && keepVersions > 0) {
// Skip this most recent version
FileSlice nextSlice = fileSliceIterator.next();
HoodieDataFile dataFile = nextSlice.getDataFile().get();
if (savepointedFiles.contains(dataFile.getFileName())) {
Optional<HoodieDataFile> dataFile = nextSlice.getDataFile();
if (dataFile.isPresent() && savepointedFiles.contains(dataFile.get().getFileName())) {
// do not clean up a savepoint data file
continue;
}
@@ -91,12 +99,16 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
// Delete the remaining files
while (fileSliceIterator.hasNext()) {
FileSlice nextSlice = fileSliceIterator.next();
HoodieDataFile dataFile = nextSlice.getDataFile().get();
deletePaths.add(dataFile.getFileStatus().getPath().toString());
if (hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ) {
// If merge on read, then clean the log files for the commits as well
deletePaths.addAll(nextSlice.getLogFiles().map(file -> file.getPath().toString())
.collect(Collectors.toList()));
if (!isFileSliceNeededForPendingCompaction(nextSlice)) {
if (nextSlice.getDataFile().isPresent()) {
HoodieDataFile dataFile = nextSlice.getDataFile().get();
deletePaths.add(dataFile.getFileStatus().getPath().toString());
}
if (hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ) {
// If merge on read, then clean the log files for the commits as well
deletePaths.addAll(nextSlice.getLogFiles().map(file -> file.getPath().toString())
.collect(Collectors.toList()));
}
}
}
}
@@ -133,17 +145,21 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
.collect(Collectors.toList());
for (HoodieFileGroup fileGroup : fileGroups) {
List<FileSlice> fileSliceList = fileGroup.getAllFileSlices().collect(Collectors.toList());
HoodieDataFile dataFile = fileSliceList.get(0).getDataFile().get();
String lastVersion = dataFile.getCommitTime();
if (fileSliceList.isEmpty()) {
continue;
}
String lastVersion = fileSliceList.get(0).getBaseInstantTime();
String lastVersionBeforeEarliestCommitToRetain = getLatestVersionBeforeCommit(fileSliceList,
earliestCommitToRetain);
// Ensure there are more than 1 version of the file (we only clean old files from updates)
// i.e always spare the last commit.
for (FileSlice aSlice : fileSliceList) {
HoodieDataFile aFile = aSlice.getDataFile().get();
String fileCommitTime = aFile.getCommitTime();
if (savepointedFiles.contains(aFile.getFileName())) {
Optional<HoodieDataFile> aFile = aSlice.getDataFile();
String fileCommitTime = aSlice.getBaseInstantTime();
if (aFile.isPresent() && savepointedFiles.contains(aFile.get().getFileName())) {
// do not clean up a savepoint data file
continue;
}
@@ -159,11 +175,14 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
}
// Always keep the last commit
if (HoodieTimeline
if (!isFileSliceNeededForPendingCompaction(aSlice)
&& HoodieTimeline
.compareTimestamps(earliestCommitToRetain.getTimestamp(), fileCommitTime,
HoodieTimeline.GREATER)) {
// this is a commit, that should be cleaned.
deletePaths.add(aFile.getFileStatus().getPath().toString());
if (aFile.isPresent()) {
deletePaths.add(aFile.get().getFileStatus().getPath().toString());
}
if (hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ) {
// If merge on read, then clean the log files for the commits as well
deletePaths.addAll(aSlice.getLogFiles().map(file -> file.getPath().toString())
@@ -183,7 +202,7 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
private String getLatestVersionBeforeCommit(List<FileSlice> fileSliceList,
HoodieInstant commitTime) {
for (FileSlice file : fileSliceList) {
String fileCommitTime = file.getDataFile().get().getCommitTime();
String fileCommitTime = file.getBaseInstantTime();
if (HoodieTimeline
.compareTimestamps(commitTime.getTimestamp(), fileCommitTime, HoodieTimeline.GREATER)) {
// fileList is sorted on the reverse, so the first commit we find <= commitTime is the
@@ -226,4 +245,19 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
}
return earliestCommitToRetain;
}
}
/**
* Determine if file slice needed to be preserved for pending compaction
* @param fileSlice File Slice
* @return true if file slice needs to be preserved, false otherwise.
*/
private boolean isFileSliceNeededForPendingCompaction(FileSlice fileSlice) {
CompactionOperation op = fileIdToPendingCompactionOperations.get(fileSlice.getFileId());
if (null != op) {
// If file slice's instant time is newer or same as that of operation, do not clean
return HoodieTimeline.compareTimestamps(fileSlice.getBaseInstantTime(), op.getBaseInstantTime(),
HoodieTimeline.GREATER_OR_EQUAL);
}
return false;
}
}

View File

@@ -16,6 +16,10 @@
package com.uber.hoodie.io;
import static com.uber.hoodie.common.table.HoodieTimeline.COMMIT_ACTION;
import static com.uber.hoodie.common.table.HoodieTimeline.DELTA_COMMIT_ACTION;
import static com.uber.hoodie.common.table.HoodieTimeline.LESSER_OR_EQUAL;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Maps;
@@ -32,6 +36,7 @@ import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.log.HoodieLogFormat;
import com.uber.hoodie.common.table.log.block.HoodieAvroDataBlock;
import com.uber.hoodie.common.table.log.block.HoodieLogBlock;
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
import com.uber.hoodie.common.table.timeline.HoodieArchivedTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.common.util.AvroUtils;
@@ -42,6 +47,7 @@ import com.uber.hoodie.exception.HoodieIOException;
import com.uber.hoodie.table.HoodieTable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
@@ -100,7 +106,7 @@ public class HoodieCommitArchiveLog {
/**
* Check if commits need to be archived. If yes, archive commits.
*/
public boolean archiveIfRequired(final JavaSparkContext jsc) {
public boolean archiveIfRequired(final JavaSparkContext jsc) throws IOException {
try {
List<HoodieInstant> instantsToArchive = getInstantsToArchive(jsc).collect(Collectors.toList());
boolean success = true;
@@ -144,23 +150,34 @@ public class HoodieCommitArchiveLog {
//TODO (na) : Add a way to return actions associated with a timeline and then merge/unify
// with logic above to avoid Stream.concats
HoodieTimeline commitTimeline = table.getCompletedCommitTimeline();
Optional<HoodieInstant> oldestPendingCompactionInstant =
table.getActiveTimeline().filterPendingCompactionTimeline().firstInstant();
// We cannot have any holes in the commit timeline. We cannot archive any commits which are
// made after the first savepoint present.
Optional<HoodieInstant> firstSavepoint = table.getCompletedSavepointTimeline().firstInstant();
if (!commitTimeline.empty() && commitTimeline.countInstants() > maxCommitsToKeep) {
// Actually do the commits
instants = Stream.concat(instants, commitTimeline.getInstants().filter(s -> {
// if no savepoint present, then dont filter
return !(firstSavepoint.isPresent() && HoodieTimeline
.compareTimestamps(firstSavepoint.get().getTimestamp(), s.getTimestamp(),
HoodieTimeline.LESSER_OR_EQUAL));
}).limit(commitTimeline.countInstants() - minCommitsToKeep));
instants = Stream.concat(instants, commitTimeline.getInstants()
.filter(s -> {
// if no savepoint present, then dont filter
return !(firstSavepoint.isPresent() && HoodieTimeline
.compareTimestamps(firstSavepoint.get().getTimestamp(), s.getTimestamp(),
HoodieTimeline.LESSER_OR_EQUAL));
})
.filter(s -> {
// Ensure commits >= oldest pending compaction commit is retained
return oldestPendingCompactionInstant.map(instant -> {
return HoodieTimeline.compareTimestamps(instant.getTimestamp(), s.getTimestamp(), HoodieTimeline.GREATER);
}).orElse(true);
})
.limit(commitTimeline.countInstants() - minCommitsToKeep));
}
return instants;
}
private boolean deleteArchivedInstants(List<HoodieInstant> archivedInstants) {
private boolean deleteArchivedInstants(List<HoodieInstant> archivedInstants) throws IOException {
log.info("Deleting instants " + archivedInstants);
boolean success = true;
for (HoodieInstant archivedInstant : archivedInstants) {
@@ -174,6 +191,48 @@ public class HoodieCommitArchiveLog {
throw new HoodieIOException("Failed to delete archived instant " + archivedInstant, e);
}
}
// Remove older meta-data from auxiliary path too
Optional<HoodieInstant> latestCommitted =
archivedInstants.stream()
.filter(i -> {
return i.isCompleted()
&& (i.getAction().equals(COMMIT_ACTION) || (i.getAction().equals(DELTA_COMMIT_ACTION)));
})
.sorted(Comparator.comparing(HoodieInstant::getTimestamp).reversed()).findFirst();
if (latestCommitted.isPresent()) {
success &= deleteAllInstantsOlderorEqualsInAuxMetaFolder(latestCommitted.get());
}
return success;
}
/**
* Remove older instants from auxiliary meta folder
*
* @param thresholdInstant Hoodie Instant
* @return success if all eligible file deleted successfully
* @throws IOException in case of error
*/
private boolean deleteAllInstantsOlderorEqualsInAuxMetaFolder(HoodieInstant thresholdInstant)
throws IOException {
List<HoodieInstant> instants =
HoodieTableMetaClient.scanHoodieInstantsFromFileSystem(metaClient.getFs(),
new Path(metaClient.getMetaAuxiliaryPath()),
HoodieActiveTimeline.VALID_EXTENSIONS_IN_ACTIVE_TIMELINE);
List<HoodieInstant> instantsToBeDeleted =
instants.stream().filter(instant1 -> HoodieTimeline.compareTimestamps(instant1.getTimestamp(),
thresholdInstant.getTimestamp(), LESSER_OR_EQUAL)).collect(Collectors.toList());
boolean success = true;
for (HoodieInstant deleteInstant : instantsToBeDeleted) {
log.info("Deleting instant " + deleteInstant + " in auxiliary meta path " + metaClient.getMetaAuxiliaryPath());
Path metaFile = new Path(metaClient.getMetaAuxiliaryPath(), deleteInstant.getFileName());
if (metaClient.getFs().exists(metaFile)) {
success &= metaClient.getFs().delete(metaFile, false);
log.info("Deleted instant file in auxiliary metapath : " + metaFile);
}
}
return success;
}
@@ -212,7 +271,7 @@ public class HoodieCommitArchiveLog {
archivedMetaWrapper.setActionType(ActionType.clean.name());
break;
}
case HoodieTimeline.COMMIT_ACTION: {
case COMMIT_ACTION: {
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
.fromBytes(commitTimeline.getInstantDetails(hoodieInstant).get());
archivedMetaWrapper.setHoodieCommitMetadata(commitMetadataConverter(commitMetadata));