[HUDI-850] Avoid unnecessary listings in incremental cleaning mode (#1576)
This commit is contained in:
committed by
GitHub
parent
c4b71622b9
commit
506447fd4f
@@ -104,7 +104,8 @@ public class TestCleaner extends TestHoodieClientBase {
|
||||
*/
|
||||
private void insertFirstBigBatchForClientCleanerTest(HoodieWriteConfig cfg, HoodieWriteClient client,
|
||||
Function2<List<HoodieRecord>, String, Integer> recordGenFunction,
|
||||
Function3<JavaRDD<WriteStatus>, HoodieWriteClient, JavaRDD<HoodieRecord>, String> insertFn) throws Exception {
|
||||
Function3<JavaRDD<WriteStatus>, HoodieWriteClient, JavaRDD<HoodieRecord>, String> insertFn,
|
||||
HoodieCleaningPolicy cleaningPolicy) throws Exception {
|
||||
|
||||
/*
|
||||
* do a big insert (this is basically same as insert part of upsert, just adding it here so we can catch breakages
|
||||
@@ -127,11 +128,15 @@ public class TestCleaner extends TestHoodieClientBase {
|
||||
HoodieTable table = HoodieTable.create(metaClient, client.getConfig(), jsc);
|
||||
|
||||
assertFalse(table.getCompletedCommitsTimeline().empty());
|
||||
String instantTime = table.getCompletedCommitsTimeline().getInstants().findFirst().get().getTimestamp();
|
||||
assertFalse(table.getCompletedCleanTimeline().empty());
|
||||
assertEquals(instantTime,
|
||||
table.getCompletedCleanTimeline().getInstants().findFirst().get().getTimestamp(),
|
||||
"The clean instant should be the same as the commit instant");
|
||||
if (cleaningPolicy.equals(HoodieCleaningPolicy.KEEP_LATEST_COMMITS)) {
|
||||
// We no longer write empty cleaner plans when there are not enough commits present
|
||||
assertTrue(table.getCompletedCleanTimeline().empty());
|
||||
} else {
|
||||
String instantTime = table.getCompletedCommitsTimeline().getInstants().findFirst().get().getTimestamp();
|
||||
assertFalse(table.getCompletedCleanTimeline().empty());
|
||||
assertEquals(instantTime, table.getCompletedCleanTimeline().getInstants().findFirst().get().getTimestamp(),
|
||||
"The clean instant should be the same as the commit instant");
|
||||
}
|
||||
|
||||
HoodieIndex index = HoodieIndex.createIndex(cfg, jsc);
|
||||
List<HoodieRecord> taggedRecords = index.tagLocation(jsc.parallelize(records, 1), jsc, table).collect();
|
||||
@@ -201,7 +206,8 @@ public class TestCleaner extends TestHoodieClientBase {
|
||||
final Function2<List<HoodieRecord>, String, Integer> recordUpsertGenWrappedFunction =
|
||||
generateWrapRecordsFn(isPreppedAPI, cfg, dataGen::generateUniqueUpdates);
|
||||
|
||||
insertFirstBigBatchForClientCleanerTest(cfg, client, recordInsertGenWrappedFunction, insertFn);
|
||||
insertFirstBigBatchForClientCleanerTest(cfg, client, recordInsertGenWrappedFunction, insertFn,
|
||||
HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS);
|
||||
|
||||
Map<HoodieFileGroupId, FileSlice> compactionFileIdToLatestFileSlice = new HashMap<>();
|
||||
metaClient = HoodieTableMetaClient.reload(metaClient);
|
||||
@@ -349,7 +355,7 @@ public class TestCleaner extends TestHoodieClientBase {
|
||||
int maxCommits = 3; // keep upto 3 commits from the past
|
||||
HoodieWriteConfig cfg = getConfigBuilder()
|
||||
.withCompactionConfig(HoodieCompactionConfig.newBuilder()
|
||||
.withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainCommits(maxCommits).build())
|
||||
.withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(maxCommits).build())
|
||||
.withParallelism(1, 1).withBulkInsertParallelism(1).withFinalizeWriteParallelism(1)
|
||||
.withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build())
|
||||
.build();
|
||||
@@ -361,7 +367,8 @@ public class TestCleaner extends TestHoodieClientBase {
|
||||
final Function2<List<HoodieRecord>, String, Integer> recordUpsertGenWrappedFunction =
|
||||
generateWrapRecordsFn(isPreppedAPI, cfg, dataGen::generateUniqueUpdates);
|
||||
|
||||
insertFirstBigBatchForClientCleanerTest(cfg, client, recordInsertGenWrappedFunction, insertFn);
|
||||
insertFirstBigBatchForClientCleanerTest(cfg, client, recordInsertGenWrappedFunction, insertFn,
|
||||
HoodieCleaningPolicy.KEEP_LATEST_COMMITS);
|
||||
|
||||
// Keep doing some writes and clean inline. Make sure we have expected number of files remaining.
|
||||
HoodieTestUtils.monotonicIncreasingCommitTimestamps(8, 1).forEach(newCommitTime -> {
|
||||
@@ -376,7 +383,9 @@ public class TestCleaner extends TestHoodieClientBase {
|
||||
metaClient = HoodieTableMetaClient.reload(metaClient);
|
||||
HoodieTable table1 = HoodieTable.create(metaClient, cfg, jsc);
|
||||
HoodieTimeline activeTimeline = table1.getCompletedCommitsTimeline();
|
||||
Option<HoodieInstant> earliestRetainedCommit = activeTimeline.nthFromLastInstant(maxCommits - 1);
|
||||
// NOTE: See CleanPlanner#getFilesToCleanKeepingLatestCommits. We explicitly keep one commit before earliest
|
||||
// commit
|
||||
Option<HoodieInstant> earliestRetainedCommit = activeTimeline.nthFromLastInstant(maxCommits);
|
||||
Set<HoodieInstant> acceptableCommits = activeTimeline.getInstants().collect(Collectors.toSet());
|
||||
if (earliestRetainedCommit.isPresent()) {
|
||||
acceptableCommits
|
||||
@@ -751,12 +760,7 @@ public class TestCleaner extends TestHoodieClientBase {
|
||||
metaClient = HoodieTableMetaClient.reload(metaClient);
|
||||
|
||||
List<HoodieCleanStat> hoodieCleanStatsOne = runCleaner(config, simulateFailureRetry);
|
||||
assertEquals(0,
|
||||
getCleanStat(hoodieCleanStatsOne, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH).getSuccessDeleteFiles()
|
||||
.size(), "Must not clean any files");
|
||||
assertEquals(0,
|
||||
getCleanStat(hoodieCleanStatsOne, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH).getSuccessDeleteFiles()
|
||||
.size(), "Must not clean any files");
|
||||
assertEquals(0, hoodieCleanStatsOne.size(), "Must not scan any partitions and clean any files");
|
||||
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000",
|
||||
file1P0C0));
|
||||
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000",
|
||||
@@ -786,12 +790,7 @@ public class TestCleaner extends TestHoodieClientBase {
|
||||
new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "001"),
|
||||
Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
|
||||
List<HoodieCleanStat> hoodieCleanStatsTwo = runCleaner(config, simulateFailureRetry);
|
||||
assertEquals(0,
|
||||
getCleanStat(hoodieCleanStatsTwo, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH).getSuccessDeleteFiles()
|
||||
.size(), "Must not clean any files");
|
||||
assertEquals(0,
|
||||
getCleanStat(hoodieCleanStatsTwo, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH).getSuccessDeleteFiles()
|
||||
.size(), "Must not clean any files");
|
||||
assertEquals(0, hoodieCleanStatsTwo.size(), "Must not scan any partitions and clean any files");
|
||||
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001",
|
||||
file2P0C1));
|
||||
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001",
|
||||
|
||||
Reference in New Issue
Block a user