1
0

HUDI-148 Small File selection logic for MOR must correctly skip fileIds selected for pending compaction

This commit is contained in:
Balaji Varadarajan
2019-06-08 12:40:08 -07:00
committed by vinoth chandar
parent 8c9980f4f5
commit a1483f2c5f
11 changed files with 48 additions and 27 deletions

View File

@@ -442,7 +442,7 @@ public class CompactionAdminClient extends AbstractHoodieClient {
.sorted(HoodieLogFile.getLogFileComparator())
.collect(Collectors.toList());
FileSlice fileSliceForCompaction =
fileSystemView.getLatestFileSlicesBeforeOrOn(operation.getPartitionPath(), operation.getBaseInstantTime())
fileSystemView.getLatestFileSlicesBeforeOrOn(operation.getPartitionPath(), operation.getBaseInstantTime(), true)
.filter(fs -> fs.getFileId().equals(operation.getFileId())).findFirst().get();
int maxUsedVersion =
fileSliceForCompaction.getLogFiles().findFirst().map(HoodieLogFile::getLogVersion)

View File

@@ -384,7 +384,7 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
// TODO : choose last N small files since there can be multiple small files written to a single partition
// by different spark partitions in a single batch
Optional<FileSlice> smallFileSlice = getRTFileSystemView()
.getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp()).filter(
.getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp(), false).filter(
fileSlice -> fileSlice.getLogFiles().count() < 1
&& fileSlice.getDataFile().get().getFileSize() < config
.getParquetSmallFileLimit()).sorted((FileSlice left, FileSlice right) ->
@@ -394,9 +394,10 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
allSmallFileSlices.add(smallFileSlice.get());
}
} else {
// If we can index log files, we can add more inserts to log files.
// If we can index log files, we can add more inserts to log files for fileIds including those under
// pending compaction.
List<FileSlice> allFileSlices = getRTFileSystemView()
.getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp())
.getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp(), true)
.collect(Collectors.toList());
for (FileSlice fileSlice : allFileSlices) {
if (isSmallFile(partitionPath, fileSlice)) {

View File

@@ -865,7 +865,7 @@ public class TestCleaner extends TestHoodieClientBase {
String baseInstantForCompaction = fileIdToLatestInstantBeforeCompaction.get(fileId);
Optional<FileSlice> fileSliceForCompaction =
hoodieTable.getRTFileSystemView().getLatestFileSlicesBeforeOrOn(DEFAULT_FIRST_PARTITION_PATH,
baseInstantForCompaction).filter(fs -> fs.getFileId().equals(fileId)).findFirst();
baseInstantForCompaction, true).filter(fs -> fs.getFileId().equals(fileId)).findFirst();
Assert.assertTrue("Base Instant for Compaction must be preserved", fileSliceForCompaction.isPresent());
Assert.assertTrue("FileSlice has data-file", fileSliceForCompaction.get().getDataFile().isPresent());
Assert.assertEquals("FileSlice has log-files", 2,

View File

@@ -280,7 +280,7 @@ public class TestCompactionAdminClient extends TestHoodieClientBase {
final HoodieTableFileSystemView newFsView =
new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
// Expect all file-slices whose base-commit is the same as the compaction commit to contain no new log files
newFsView.getLatestFileSlicesBeforeOrOn(HoodieTestUtils.DEFAULT_PARTITION_PATHS[0], compactionInstant)
newFsView.getLatestFileSlicesBeforeOrOn(HoodieTestUtils.DEFAULT_PARTITION_PATHS[0], compactionInstant, true)
.filter(fs -> fs.getBaseInstantTime().equals(compactionInstant)).forEach(fs -> {
Assert.assertFalse("No Data file must be present", fs.getDataFile().isPresent());
Assert.assertTrue("No Log Files", fs.getLogFiles().count() == 0);
@@ -342,7 +342,7 @@ public class TestCompactionAdminClient extends TestHoodieClientBase {
final HoodieTableFileSystemView newFsView =
new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
// Expect all file-slices whose base-commit is the same as the compaction commit to contain no new log files
newFsView.getLatestFileSlicesBeforeOrOn(HoodieTestUtils.DEFAULT_PARTITION_PATHS[0], compactionInstant)
newFsView.getLatestFileSlicesBeforeOrOn(HoodieTestUtils.DEFAULT_PARTITION_PATHS[0], compactionInstant, true)
.filter(fs -> fs.getBaseInstantTime().equals(compactionInstant))
.filter(fs -> fs.getFileId().equals(op.getFileId())).forEach(fs -> {
Assert.assertFalse("No Data file must be present", fs.getDataFile().isPresent());