1
0

[HUDI-1740] Fix insert-overwrite API archival (#2784)

- fix problem of archiving replace commits
- Fix problem when getting empty replacecommit.requested
- Improved the logic for handling empty and non-empty requested/inflight commit files. Added unit tests covering both the empty and non-empty inflight file cases, and cleaned up some unused test utility methods

Co-authored-by: yorkzero831 <yorkzero8312@gmail.com>
Co-authored-by: zheren.yu <zheren.yu@paypay-corp.co.jp>
This commit is contained in:
Susu Dong
2021-05-22 05:52:13 +09:00
committed by GitHub
parent 99b14a78e3
commit 685f77b5dd
12 changed files with 169 additions and 107 deletions

View File

@@ -241,6 +241,10 @@ public class TestHoodieTimelineArchiveLog extends HoodieClientTestHarness {
.withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 3).build())
.build();
// when using insert_overwrite or insert_overwrite_table,
// the first commit may have no replaceFileIds
createReplaceMetadataWithoutReplaceFileId("000");
int numCommits = 4;
int commitInstant = 100;
for (int i = 0; i < numCommits; i++) {
@@ -251,7 +255,7 @@ public class TestHoodieTimelineArchiveLog extends HoodieClientTestHarness {
metaClient = HoodieTableMetaClient.reload(metaClient);
HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient);
assertEquals(4, timeline.countInstants(), "Loaded 4 commits and the count should match");
assertEquals(5, timeline.countInstants(), "Loaded 5 commits and the count should match");
HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, table);
boolean result = archiveLog.archiveIfRequired(context);
assertTrue(result);
@@ -513,13 +517,7 @@ public class TestHoodieTimelineArchiveLog extends HoodieClientTestHarness {
HoodieCommitMetadata hoodieCommitMetadata = new HoodieCommitMetadata();
hoodieCommitMetadata.setOperationType(WriteOperationType.INSERT);
HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath)
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2).forTable("test-commitMetadata-converter")
.withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 5).build())
.build();
metaClient = HoodieTableMetaClient.reload(metaClient);
HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient);
HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, table);
org.apache.hudi.avro.model.HoodieCommitMetadata expectedCommitMetadata = MetadataConversionUtils
.convertCommitMetadata(hoodieCommitMetadata);
@@ -681,6 +679,22 @@ public class TestHoodieTimelineArchiveLog extends HoodieClientTestHarness {
assertEquals(notArchivedInstants, Arrays.asList(notArchivedInstant1, notArchivedInstant2, notArchivedInstant3), "");
}
/**
 * Registers a replace commit at {@code instantTime} on the test table, supplying requested,
 * inflight and completed metadata that all use the INSERT_OVERWRITE_TABLE operation type.
 * No replace file ids are added to the completed metadata in this helper.
 *
 * @param instantTime instant time handed to {@code addReplaceCommit}
 * @throws Exception declared by the signature; NOTE(review): presumably raised by the
 *                   HoodieTestTable call when the timeline files cannot be written - confirm
 */
private void createReplaceMetadataWithoutReplaceFileId(String instantTime) throws Exception {
  final WriteOperationType operationType = WriteOperationType.INSERT_OVERWRITE_TABLE;

  // Completed and inflight metadata carry only the operation type.
  HoodieReplaceCommitMetadata completed = new HoodieReplaceCommitMetadata();
  completed.setOperationType(operationType);
  HoodieCommitMetadata inflight = new HoodieCommitMetadata();
  inflight.setOperationType(operationType);

  // Requested metadata for a replace instant that has no previous replace commit.
  HoodieRequestedReplaceMetadata.Builder requestedBuilder = HoodieRequestedReplaceMetadata.newBuilder();
  requestedBuilder.setOperationType(operationType.toString());
  requestedBuilder.setVersion(1);
  requestedBuilder.setExtraMetadata(Collections.emptyMap());
  HoodieRequestedReplaceMetadata requested = requestedBuilder.build();

  HoodieTestTable testTable = HoodieTestTable.of(metaClient);
  testTable.addReplaceCommit(instantTime, Option.of(requested), Option.of(inflight), completed);
}
private void createReplaceMetadata(String instantTime) throws Exception {
String fileId1 = "file-" + instantTime + "-1";
String fileId2 = "file-" + instantTime + "-2";
@@ -691,11 +705,13 @@ public class TestHoodieTimelineArchiveLog extends HoodieClientTestHarness {
.setVersion(1)
.setExtraMetadata(Collections.emptyMap())
.build();
HoodieReplaceCommitMetadata replaceMetadata = new HoodieReplaceCommitMetadata();
replaceMetadata.addReplaceFileId(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, fileId1);
replaceMetadata.setOperationType(WriteOperationType.INSERT_OVERWRITE);
HoodieReplaceCommitMetadata completeReplaceMetadata = new HoodieReplaceCommitMetadata();
HoodieCommitMetadata inflightReplaceMetadata = new HoodieCommitMetadata();
completeReplaceMetadata.addReplaceFileId(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, fileId1);
completeReplaceMetadata.setOperationType(WriteOperationType.INSERT_OVERWRITE);
inflightReplaceMetadata.setOperationType(WriteOperationType.INSERT_OVERWRITE);
HoodieTestTable.of(metaClient)
.addReplaceCommit(instantTime, requestedReplaceMetadata, replaceMetadata)
.addReplaceCommit(instantTime, Option.of(requestedReplaceMetadata), Option.of(inflightReplaceMetadata), completeReplaceMetadata)
.withBaseFilesInPartition(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, fileId1, fileId2);
}

View File

@@ -843,10 +843,11 @@ public class TestCleaner extends HoodieClientTestBase {
assertTrue(testTable.baseFileExists(p1, "00000000000001", file1P1C0));
// make next replacecommit, with 1 clustering operation. logically delete p0. No change to p1
// notice that clustering generates empty inflight commit files
Map<String, String> partitionAndFileId002 = testTable.forReplaceCommit("00000000000002").getFileIdsWithBaseFilesInPartitions(p0);
String file2P0C1 = partitionAndFileId002.get(p0);
Pair<HoodieRequestedReplaceMetadata, HoodieReplaceCommitMetadata> replaceMetadata = generateReplaceCommitMetadata(p0, file1P0C0, file2P0C1);
testTable.addReplaceCommit("00000000000002", replaceMetadata.getKey(), replaceMetadata.getValue());
testTable.addReplaceCommit("00000000000002", Option.of(replaceMetadata.getKey()), Option.empty(), replaceMetadata.getValue());
// run cleaner
List<HoodieCleanStat> hoodieCleanStatsTwo = runCleaner(config);
@@ -856,10 +857,11 @@ public class TestCleaner extends HoodieClientTestBase {
assertTrue(testTable.baseFileExists(p1, "00000000000001", file1P1C0));
// make next replacecommit, with 1 clustering operation. Replace data in p1. No change to p0
// notice that clustering generates empty inflight commit files
Map<String, String> partitionAndFileId003 = testTable.forReplaceCommit("00000000000003").getFileIdsWithBaseFilesInPartitions(p1);
String file3P1C2 = partitionAndFileId003.get(p1);
replaceMetadata = generateReplaceCommitMetadata(p1, file1P1C0, file3P1C2);
testTable.addReplaceCommit("00000000000003", replaceMetadata.getKey(), replaceMetadata.getValue());
testTable.addReplaceCommit("00000000000003", Option.of(replaceMetadata.getKey()), Option.empty(), replaceMetadata.getValue());
// run cleaner
List<HoodieCleanStat> hoodieCleanStatsThree = runCleaner(config);
@@ -870,10 +872,11 @@ public class TestCleaner extends HoodieClientTestBase {
assertTrue(testTable.baseFileExists(p1, "00000000000001", file1P1C0));
// make next replacecommit, with 1 clustering operation. Replace data in p0 again
// notice that clustering generates empty inflight commit files
Map<String, String> partitionAndFileId004 = testTable.forReplaceCommit("00000000000004").getFileIdsWithBaseFilesInPartitions(p0);
String file4P0C3 = partitionAndFileId004.get(p0);
replaceMetadata = generateReplaceCommitMetadata(p0, file2P0C1, file4P0C3);
testTable.addReplaceCommit("00000000000004", replaceMetadata.getKey(), replaceMetadata.getValue());
testTable.addReplaceCommit("00000000000004", Option.of(replaceMetadata.getKey()), Option.empty(), replaceMetadata.getValue());
// run cleaner
List<HoodieCleanStat> hoodieCleanStatsFour = runCleaner(config);
@@ -885,10 +888,11 @@ public class TestCleaner extends HoodieClientTestBase {
assertTrue(testTable.baseFileExists(p1, "00000000000001", file1P1C0));
// make next replacecommit, with 1 clustering operation. Replace all data in p1. no new files created
// notice that clustering generates empty inflight commit files
Map<String, String> partitionAndFileId005 = testTable.forReplaceCommit("00000000000005").getFileIdsWithBaseFilesInPartitions(p1);
String file4P1C4 = partitionAndFileId005.get(p1);
replaceMetadata = generateReplaceCommitMetadata(p0, file3P1C2, file4P1C4);
testTable.addReplaceCommit("00000000000005", replaceMetadata.getKey(), replaceMetadata.getValue());
testTable.addReplaceCommit("00000000000005", Option.of(replaceMetadata.getKey()), Option.empty(), replaceMetadata.getValue());
List<HoodieCleanStat> hoodieCleanStatsFive = runCleaner(config, 2);
assertTrue(testTable.baseFileExists(p0, "00000000000004", file4P0C3));