1
0

FileSystemView and Timeline level changes to support Async Compaction

This commit is contained in:
Balaji Varadarajan
2018-05-23 16:54:53 -07:00
committed by vinoth chandar
parent 44caf0d40c
commit 6d01ae8ca0
20 changed files with 892 additions and 132 deletions

View File

@@ -398,7 +398,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
});
HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
Optional<HoodieInstant> instant = activeTimeline.filterInflights().lastInstant();
Optional<HoodieInstant> instant = activeTimeline.filterInflightsExcludingCompaction().lastInstant();
activeTimeline.saveToInflight(instant.get(),
Optional.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
} catch (IOException io) {
@@ -692,7 +692,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
HoodieTable<T> table = HoodieTable.getHoodieTable(
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config, jsc);
HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
HoodieTimeline commitTimeline = table.getMetaClient().getCommitsTimeline();
HoodieTimeline commitTimeline = table.getMetaClient().getCommitsAndCompactionTimeline();
HoodieInstant savePoint = new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION,
savepointTime);
@@ -709,8 +709,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
rollback(commitsToRollback);
// Make sure the rollback was successful
Optional<HoodieInstant> lastInstant = activeTimeline.reload().getCommitsTimeline()
.filterCompletedInstants().lastInstant();
Optional<HoodieInstant> lastInstant = activeTimeline.reload().getCommitsAndCompactionTimeline()
.filterCompletedAndCompactionInstants().lastInstant();
Preconditions.checkArgument(lastInstant.isPresent());
Preconditions.checkArgument(lastInstant.get().getTimestamp().equals(savepointTime),
savepointTime + "is not the last commit after rolling back " + commitsToRollback
@@ -1051,7 +1051,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
private void rollbackInflightCommits() {
HoodieTable<T> table = HoodieTable.getHoodieTable(
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config, jsc);
HoodieTimeline inflightTimeline = table.getMetaClient().getCommitsTimeline().filterInflights();
HoodieTimeline inflightTimeline = table.getMetaClient().getCommitsTimeline().filterInflightsExcludingCompaction();
List<String> commits = inflightTimeline.getInstants().map(HoodieInstant::getTimestamp)
.collect(Collectors.toList());
Collections.reverse(commits);

View File

@@ -113,7 +113,7 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieIOH
.filter(fileSlice1 -> fileSlice1.getFileId().equals(fileId)).findFirst();
String baseInstantTime = commitTime;
if (fileSlice.isPresent()) {
baseInstantTime = fileSlice.get().getBaseCommitTime();
baseInstantTime = fileSlice.get().getBaseInstantTime();
} else {
// This means there is no base data file, start appending to a new log file
fileSlice = Optional.of(new FileSlice(baseInstantTime, this.fileId));

View File

@@ -182,7 +182,7 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
.getLatestFileSlices(partitionPath).map(
s -> {
List<HoodieLogFile> logFiles = s.getLogFiles().sorted(HoodieLogFile
.getLogVersionComparator().reversed()).collect(Collectors.toList());
.getBaseInstantAndLogVersionComparator().reversed()).collect(Collectors.toList());
totalLogFiles.add((long) logFiles.size());
totalFileSlices.add(1L);
return new CompactionOperation(s.getDataFile(), partitionPath, logFiles, config);

View File

@@ -230,7 +230,7 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
// This needs to be done since GlobalIndex at the moment does not store the latest commit time
Map<String, String> fileIdToLatestCommitTimeMap =
hoodieIndex.isGlobal() ? this.getRTFileSystemView().getLatestFileSlices(partitionPath)
.collect(Collectors.toMap(FileSlice::getFileId, FileSlice::getBaseCommitTime)) : null;
.collect(Collectors.toMap(FileSlice::getFileId, FileSlice::getBaseInstantTime)) : null;
commitMetadata.getPartitionToWriteStats().get(partitionPath).stream()
.filter(wStat -> {
if (wStat != null
@@ -341,7 +341,8 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
// TODO - check if index.isglobal then small files are log files too
Optional<FileSlice> smallFileSlice = getRTFileSystemView()
.getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp()).filter(
// Use the merged file-slice for small file selection
.getLatestMergedFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp()).filter(
fileSlice -> fileSlice.getLogFiles().count() < 1
&& fileSlice.getDataFile().get().getFileSize() < config
.getParquetSmallFileLimit()).sorted((FileSlice left, FileSlice right) ->

View File

@@ -119,7 +119,8 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
* Get the real time view of the file system for this table
*/
public TableFileSystemView.RealtimeView getRTFileSystemView() {
// NOTE(review): this listing is a diff with its +/- markers stripped. The single-line return
// below looks like the removed (pre-change) statement and the two-line return after it looks
// like its replacement — confirm against the commit before treating either as live code.
return new HoodieTableFileSystemView(metaClient, getCompletedCommitTimeline());
// Builds the view from metaClient's commits-and-compaction timeline, narrowed by
// filterCompletedAndCompactionInstants(), instead of from getCompletedCommitTimeline().
return new HoodieTableFileSystemView(metaClient,
metaClient.getCommitsAndCompactionTimeline().filterCompletedAndCompactionInstants());
}
/**
@@ -140,7 +141,7 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
* Get only the inflight (non-completed) commit timeline
*/
public HoodieTimeline getInflightCommitTimeline() {
// NOTE(review): this listing is a diff with its +/- markers stripped. The first return looks
// like the removed statement and the second like its replacement — confirm against the commit.
return metaClient.getCommitsTimeline().filterInflights();
// Same commits timeline, but filtered with filterInflightsExcludingCompaction() rather than
// filterInflights(); presumably this leaves inflight compaction instants out — verify against
// the HoodieTimeline implementation.
return metaClient.getCommitsTimeline().filterInflightsExcludingCompaction();
}
/**

View File

@@ -595,8 +595,9 @@ public class TestMergeOnReadTable {
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), allFiles);
dataFilesToRead = roView.getLatestDataFiles();
List<HoodieDataFile> dataFilesList = dataFilesToRead.collect(Collectors.toList());
assertTrue("RealtimeTableView should list the parquet files we wrote in the delta commit",
dataFilesToRead.findAny().isPresent());
dataFilesList.size() > 0);
/**
* Write 2 (only updates + inserts, written to .log file + correction of existing parquet
@@ -624,7 +625,8 @@ public class TestMergeOnReadTable {
roView = new HoodieTableFileSystemView(metaClient,
hoodieTable.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants(), allFiles);
dataFilesToRead = roView.getLatestDataFiles();
Map<String, Long> parquetFileIdToNewSize = dataFilesToRead.collect(
List<HoodieDataFile> newDataFilesList = dataFilesToRead.collect(Collectors.toList());
Map<String, Long> parquetFileIdToNewSize = newDataFilesList.stream().collect(
Collectors.toMap(HoodieDataFile::getFileId, HoodieDataFile::getFileSize));
assertTrue(parquetFileIdToNewSize.entrySet().stream()