
[HUDI-2480] FileSlice after pending compaction-requested instant-time… (#3703)

* [HUDI-2480] FileSlice after pending compaction-requested instant-time is ignored by MOR snapshot reader

* include file slice after a pending compaction for spark reader

Co-authored-by: garyli1019 <yanjia.gary.li@gmail.com>
commit a2eb2b0b0a (parent 88067f57a2)
Author: Danny Chan
Date: 2021-11-25 22:30:09 +08:00
Committed by: GitHub
4 changed files with 32 additions and 5 deletions
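
The gist of the fix: the MOR snapshot readers built their file-system view from the completed commits timeline only, so once a compaction plan had been scheduled (a compaction instant sitting on the timeline in the requested state), the file slice created after that instant was filtered out and the newest log data vanished from snapshot reads. The patch switches the readers to the commits-and-compaction timeline with completed and compaction instants retained. Below is a minimal sketch of the two variants, using only types and calls that appear in the hunks that follow; the sketch class and method names are made up for illustration.

import org.apache.hadoop.fs.FileStatus;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;

final class SnapshotViewSketch {

  // Pre-patch behaviour: only completed commits/delta-commits are visible, so a
  // slice written after a pending compaction-requested instant is dropped.
  static HoodieTableFileSystemView viewBeforeFix(HoodieTableMetaClient metaClient,
                                                 FileStatus[] fileStatuses) {
    HoodieTimeline completedOnly = metaClient.getActiveTimeline()
        .getCommitsTimeline().filterCompletedInstants();
    return new HoodieTableFileSystemView(metaClient, completedOnly, fileStatuses);
  }

  // Post-patch behaviour: completed instants plus pending compaction instants, so
  // the latest file slice after the compaction-requested instant stays valid.
  static HoodieTableFileSystemView viewAfterFix(HoodieTableMetaClient metaClient,
                                                FileStatus[] fileStatuses) {
    HoodieTimeline completedAndCompaction = metaClient
        .getCommitsAndCompactionTimeline().filterCompletedAndCompactionInstants();
    return new HoodieTableFileSystemView(metaClient, completedAndCompaction, fileStatuses);
  }
}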


@@ -303,8 +303,8 @@ public class HoodieTableSource implements
       }
       HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient,
-          metaClient.getActiveTimeline().getCommitsTimeline()
-              .filterCompletedInstants(), fileStatuses);
+          // file-slice after pending compaction-requested instant-time is also considered valid
+          metaClient.getCommitsAndCompactionTimeline().filterCompletedAndCompactionInstants(), fileStatuses);
       String latestCommit = fsView.getLastInstant().get().getTimestamp();
       final String mergeType = this.conf.getString(FlinkOptions.MERGE_TYPE);
       final AtomicInteger cnt = new AtomicInteger(0);


@@ -463,6 +463,32 @@ public class TestInputFormat {
     TestData.assertRowDataEquals(result, TestData.DATA_SET_INSERT);
   }
 
+  /**
+   * Test reading file groups with compaction plan scheduled and delta logs.
+   * File-slice after pending compaction-requested instant-time should also be considered valid.
+   */
+  @Test
+  void testReadMORWithCompactionPlanScheduled() throws Exception {
+    Map<String, String> options = new HashMap<>();
+    // compact for each commit
+    options.put(FlinkOptions.COMPACTION_DELTA_COMMITS.key(), "1");
+    options.put(FlinkOptions.COMPACTION_ASYNC_ENABLED.key(), "false");
+    beforeEach(HoodieTableType.MERGE_ON_READ, options);
+
+    // write three commits
+    for (int i = 0; i < 6; i += 2) {
+      List<RowData> dataset = TestData.dataSetInsert(i + 1, i + 2);
+      TestData.writeData(dataset, conf);
+    }
+
+    InputFormat<RowData, ?> inputFormat1 = this.tableSource.getInputFormat();
+    assertThat(inputFormat1, instanceOf(MergeOnReadInputFormat.class));
+
+    List<RowData> actual = readData(inputFormat1);
+    final List<RowData> expected = TestData.dataSetInsert(1, 2, 3, 4, 5, 6);
+    TestData.assertRowDataEquals(actual, expected);
+  }
+
   // -------------------------------------------------------------------------
   //  Utilities
   // -------------------------------------------------------------------------
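
The test drives the scenario directly: COMPACTION_DELTA_COMMITS is set to 1 and async compaction is disabled, so a compaction plan gets scheduled but never executed, and the later delta commits land in the file slice after that pending compaction-requested instant; before this patch the snapshot read would miss those rows instead of returning all of 1..6. As a hedged aside, a helper like the one below (not part of the test, and assuming HoodieTimeline's filterPendingCompactionTimeline and empty behave as their names suggest) could assert that the timeline really does carry a pending compaction:

import org.apache.hudi.common.table.HoodieTableMetaClient;

final class PendingCompactionCheck {
  // True when a compaction instant was requested (or is inflight) but not yet
  // completed -- the situation the test above sets up on purpose.
  static boolean hasPendingCompaction(HoodieTableMetaClient metaClient) {
    return !metaClient.getActiveTimeline().filterPendingCompactionTimeline().empty();
  }
}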


@@ -237,7 +237,7 @@ public class HoodieRealtimeInputFormatUtils extends HoodieInputFormatUtils {
     try {
       // Both commit and delta-commits are included - pick the latest completed one
       Option<HoodieInstant> latestCompletedInstant =
-          metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().lastInstant();
+          metaClient.getCommitsAndCompactionTimeline().filterCompletedAndCompactionInstants().lastInstant();
       Stream<FileSlice> latestFileSlices = latestCompletedInstant
           .map(instant -> fsView.getLatestMergedFileSlicesBeforeOrOn(relPartitionPath, instant.getTimestamp()))
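
With the inclusive timeline, getLatestMergedFileSlicesBeforeOrOn resolves the slice that folds the pending compaction's base-file slot together with the log files written after the compaction request, instead of falling back to the last fully compacted slice. A small hedged sketch of how one might inspect what that stream carries (the helper class is illustrative, not part of the patch):

import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.apache.hudi.common.model.FileSlice;

final class FileSliceInspection {
  // Collects the log-file paths carried by the resolved slices; with the inclusive
  // timeline these include logs written after the pending compaction request.
  static List<String> logFilePaths(Stream<FileSlice> latestFileSlices) {
    return latestFileSlices
        .flatMap(FileSlice::getLogFiles)
        .map(logFile -> logFile.getPath().toString())
        .collect(Collectors.toList());
  }
}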


@@ -151,8 +151,9 @@ class MergeOnReadSnapshotRelation(val sqlContext: SQLContext,
       // Load files from the global paths if it has defined to be compatible with the original mode
       val inMemoryFileIndex = HoodieSparkUtils.createInMemoryFileIndex(sqlContext.sparkSession, globPaths.get)
       val fsView = new HoodieTableFileSystemView(metaClient,
-        metaClient.getActiveTimeline.getCommitsTimeline
-          .filterCompletedInstants, inMemoryFileIndex.allFiles().toArray)
+        // file-slice after pending compaction-requested instant-time is also considered valid
+        metaClient.getCommitsAndCompactionTimeline.filterCompletedAndCompactionInstants,
+        inMemoryFileIndex.allFiles().toArray)
       val partitionPaths = fsView.getLatestBaseFiles.iterator().asScala.toList.map(_.getFileStatus.getPath.getParent)
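
On the Spark side, MergeOnReadSnapshotRelation serves the ordinary snapshot query, so nothing changes for callers: a plain read on the table path now simply includes the records committed after a compaction plan was scheduled but not yet executed. A minimal usage sketch with the standard Hudi Spark datasource (basePath is a placeholder):

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

final class MorSnapshotReadSketch {
  // The default read on a MOR table path is a snapshot query; with this patch it
  // also returns rows written after a pending, not-yet-executed compaction.
  static Dataset<Row> readSnapshot(SparkSession spark, String basePath) {
    return spark.read().format("hudi").load(basePath);
  }
}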