1
0

[HUDI-3494] Consider triggering condition of MOR compaction during archival (#4974)

This commit is contained in:
Y Ethan Guo
2022-03-16 22:28:11 -07:00
committed by GitHub
parent 95e6e53810
commit 5ba2d9ab2f
5 changed files with 400 additions and 38 deletions

View File

@@ -25,6 +25,7 @@ import org.apache.hudi.common.model.CompactionOperation;
import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.model.HoodieFileGroupId;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.table.timeline.TimelineMetadataUtils;
@@ -59,13 +60,13 @@ public class CompactionUtils {
/**
* Generate compaction operation from file-slice.
*
* @param partitionPath Partition path
* @param fileSlice File Slice
* @param partitionPath Partition path
* @param fileSlice File Slice
* @param metricsCaptureFunction Metrics Capture function
* @return Compaction Operation
*/
public static HoodieCompactionOperation buildFromFileSlice(String partitionPath, FileSlice fileSlice,
Option<Function<Pair<String, FileSlice>, Map<String, Double>>> metricsCaptureFunction) {
Option<Function<Pair<String, FileSlice>, Map<String, Double>>> metricsCaptureFunction) {
HoodieCompactionOperation.Builder builder = HoodieCompactionOperation.newBuilder();
builder.setPartitionPath(partitionPath);
builder.setFileId(fileSlice.getFileId());
@@ -87,12 +88,12 @@ public class CompactionUtils {
* Generate compaction plan from file-slices.
*
* @param partitionFileSlicePairs list of partition file-slice pairs
* @param extraMetadata Extra Metadata
* @param metricsCaptureFunction Metrics Capture function
* @param extraMetadata Extra Metadata
* @param metricsCaptureFunction Metrics Capture function
*/
public static HoodieCompactionPlan buildFromFileSlices(List<Pair<String, FileSlice>> partitionFileSlicePairs,
Option<Map<String, String>> extraMetadata,
Option<Function<Pair<String, FileSlice>, Map<String, Double>>> metricsCaptureFunction) {
Option<Map<String, String>> extraMetadata,
Option<Function<Pair<String, FileSlice>, Map<String, Double>>> metricsCaptureFunction) {
HoodieCompactionPlan.Builder builder = HoodieCompactionPlan.newBuilder();
extraMetadata.ifPresent(builder::setExtraMetadata);
@@ -195,10 +196,76 @@ public class CompactionUtils {
/**
* Return all pending compaction instant times.
*
*
* @return
*/
public static List<HoodieInstant> getPendingCompactionInstantTimes(HoodieTableMetaClient metaClient) {
return metaClient.getActiveTimeline().filterPendingCompactionTimeline().getInstants().collect(Collectors.toList());
}
/**
* Returns a pair of (timeline containing the delta commits after the latest completed
* compaction commit, the completed compaction commit instant), if the latest completed
* compaction commit is present; a pair of (timeline containing all the delta commits,
* the first delta commit instant), if there is no completed compaction commit.
*
* @param activeTimeline Active timeline of a table.
* @return Pair of timeline containing delta commits and an instant.
*/
public static Option<Pair<HoodieTimeline, HoodieInstant>> getDeltaCommitsSinceLatestCompaction(
HoodieActiveTimeline activeTimeline) {
Option<HoodieInstant> lastCompaction = activeTimeline.getCommitTimeline()
.filterCompletedInstants().lastInstant();
HoodieTimeline deltaCommits = activeTimeline.getDeltaCommitTimeline();
HoodieInstant latestInstant;
if (lastCompaction.isPresent()) {
latestInstant = lastCompaction.get();
// timeline containing the delta commits after the latest completed compaction commit,
// and the completed compaction commit instant
return Option.of(Pair.of(deltaCommits.findInstantsAfter(
latestInstant.getTimestamp(), Integer.MAX_VALUE), lastCompaction.get()));
} else {
if (deltaCommits.countInstants() > 0) {
latestInstant = deltaCommits.firstInstant().get();
// timeline containing all the delta commits, and the first delta commit instant
return Option.of(Pair.of(deltaCommits.findInstantsAfterOrEquals(
latestInstant.getTimestamp(), Integer.MAX_VALUE), latestInstant));
} else {
return Option.empty();
}
}
}
/**
* Gets the oldest instant to retain for MOR compaction.
* If there is no completed compaction,
* num delta commits >= "hoodie.compact.inline.max.delta.commits"
* If there is a completed compaction,
* num delta commits after latest completed compaction >= "hoodie.compact.inline.max.delta.commits"
*
* @param activeTimeline Active timeline of a table.
* @param maxDeltaCommits Maximum number of delta commits that trigger the compaction plan,
* i.e., "hoodie.compact.inline.max.delta.commits".
* @return the oldest instant to keep for MOR compaction.
*/
public static Option<HoodieInstant> getOldestInstantToRetainForCompaction(
HoodieActiveTimeline activeTimeline, int maxDeltaCommits) {
Option<Pair<HoodieTimeline, HoodieInstant>> deltaCommitsInfoOption =
CompactionUtils.getDeltaCommitsSinceLatestCompaction(activeTimeline);
if (deltaCommitsInfoOption.isPresent()) {
Pair<HoodieTimeline, HoodieInstant> deltaCommitsInfo = deltaCommitsInfoOption.get();
HoodieTimeline deltaCommitTimeline = deltaCommitsInfo.getLeft();
int numDeltaCommits = deltaCommitTimeline.countInstants();
if (numDeltaCommits < maxDeltaCommits) {
return Option.of(deltaCommitsInfo.getRight());
} else {
// delta commits with the last one to keep
List<HoodieInstant> instants = deltaCommitTimeline.getInstants()
.limit(numDeltaCommits - maxDeltaCommits + 1).collect(Collectors.toList());
return Option.of(instants.get(instants.size() - 1));
}
}
return Option.empty();
}
}

View File

@@ -27,6 +27,9 @@ import org.apache.hudi.common.model.HoodieFileGroupId;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.table.timeline.versioning.compaction.CompactionPlanMigrator;
import org.apache.hudi.common.testutils.CompactionTestUtils.DummyHoodieBaseFile;
import org.apache.hudi.common.testutils.HoodieCommonTestHarness;
@@ -35,15 +38,20 @@ import org.apache.hudi.common.util.collection.Pair;
import org.apache.hadoop.fs.Path;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
import static org.apache.hudi.common.testutils.CompactionTestUtils.createCompactionPlan;
import static org.apache.hudi.common.testutils.CompactionTestUtils.scheduleCompaction;
@@ -230,11 +238,95 @@ public class TestCompactionUtils extends HoodieCommonTestHarness {
setupAndValidateCompactionOperations(metaClient, false, 0, 0, 0, 0);
}
@ParameterizedTest
@ValueSource(booleans = {true, false})
public void testGetDeltaCommitsSinceLatestCompaction(boolean hasCompletedCompaction) {
HoodieActiveTimeline timeline = prepareTimeline(hasCompletedCompaction);
Pair<HoodieTimeline, HoodieInstant> actual =
CompactionUtils.getDeltaCommitsSinceLatestCompaction(timeline).get();
if (hasCompletedCompaction) {
Stream<HoodieInstant> instants = actual.getLeft().getInstants();
assertEquals(
Stream.of(
new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "07"),
new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "08"),
new HoodieInstant(true, HoodieTimeline.DELTA_COMMIT_ACTION, "09"))
.collect(Collectors.toList()),
actual.getLeft().getInstants().collect(Collectors.toList()));
assertEquals(
new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "06"),
actual.getRight());
} else {
assertEquals(
Stream.of(
new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "01"),
new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "02"),
new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "03"),
new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "04"),
new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "05"),
new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "07"),
new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "08"),
new HoodieInstant(true, HoodieTimeline.DELTA_COMMIT_ACTION, "09"))
.collect(Collectors.toList()),
actual.getLeft().getInstants().collect(Collectors.toList()));
assertEquals(
new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "01"),
actual.getRight());
}
}
@Test
public void testGetDeltaCommitsSinceLatestCompactionWithEmptyDeltaCommits() {
HoodieActiveTimeline timeline = new MockHoodieActiveTimeline();
assertEquals(Option.empty(), CompactionUtils.getDeltaCommitsSinceLatestCompaction(timeline));
}
@ParameterizedTest
@ValueSource(booleans = {true, false})
public void testGetOldestInstantToKeepForCompaction(boolean hasCompletedCompaction) {
HoodieActiveTimeline timeline = prepareTimeline(hasCompletedCompaction);
Option<HoodieInstant> actual = CompactionUtils.getOldestInstantToRetainForCompaction(timeline, 20);
if (hasCompletedCompaction) {
assertEquals(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "06"), actual.get());
} else {
assertEquals(new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "01"), actual.get());
}
actual = CompactionUtils.getOldestInstantToRetainForCompaction(timeline, 3);
assertEquals(new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "07"), actual.get());
actual = CompactionUtils.getOldestInstantToRetainForCompaction(timeline, 2);
assertEquals(new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "08"), actual.get());
}
@Test
public void testGetOldestInstantToKeepForCompactionWithEmptyDeltaCommits() {
HoodieActiveTimeline timeline = new MockHoodieActiveTimeline();
assertEquals(Option.empty(), CompactionUtils.getOldestInstantToRetainForCompaction(timeline, 20));
}
private HoodieActiveTimeline prepareTimeline(boolean hasCompletedCompaction) {
if (hasCompletedCompaction) {
return new MockHoodieActiveTimeline(
Stream.of("01", "02", "03", "04", "05", "07", "08"),
Stream.of("06"),
Stream.of(Pair.of("09", HoodieTimeline.DELTA_COMMIT_ACTION)));
} else {
return new MockHoodieActiveTimeline(
Stream.of("01", "02", "03", "04", "05", "07", "08"),
Stream.empty(),
Stream.of(
Pair.of("06", HoodieTimeline.COMMIT_ACTION),
Pair.of("09", HoodieTimeline.DELTA_COMMIT_ACTION)));
}
}
/**
* Validates if generated compaction plan matches with input file-slices.
*
* @param input File Slices with partition-path
* @param plan Compaction Plan
* @param plan Compaction Plan
*/
private void testFileSlicesCompactionPlanEquality(List<Pair<String, FileSlice>> input, HoodieCompactionPlan plan) {
assertEquals(input.size(), plan.getOperations().size(), "All file-slices present");
@@ -245,12 +337,12 @@ public class TestCompactionUtils extends HoodieCommonTestHarness {
/**
* Validates if generated compaction operation matches with input file slice and partition path.
*
* @param slice File Slice
* @param op HoodieCompactionOperation
* @param slice File Slice
* @param op HoodieCompactionOperation
* @param expPartitionPath Partition path
*/
private void testFileSliceCompactionOpEquality(FileSlice slice, HoodieCompactionOperation op, String expPartitionPath,
int version) {
int version) {
assertEquals(expPartitionPath, op.getPartitionPath(), "Partition path is correct");
assertEquals(slice.getBaseInstantTime(), op.getBaseInstantTime(), "Same base-instant");
assertEquals(slice.getFileId(), op.getFileId(), "Same file-id");
@@ -270,4 +362,24 @@ public class TestCompactionUtils extends HoodieCommonTestHarness {
protected HoodieTableType getTableType() {
return HoodieTableType.MERGE_ON_READ;
}
class MockHoodieActiveTimeline extends HoodieActiveTimeline {
public MockHoodieActiveTimeline() {
super();
this.setInstants(new ArrayList<>());
}
public MockHoodieActiveTimeline(
Stream<String> completedDeltaCommits,
Stream<String> completedCompactionCommits,
Stream<Pair<String, String>> inflights) {
super();
this.setInstants(Stream.concat(
Stream.concat(completedDeltaCommits.map(s -> new HoodieInstant(false, DELTA_COMMIT_ACTION, s)),
completedCompactionCommits.map(s -> new HoodieInstant(false, COMMIT_ACTION, s))),
inflights.map(s -> new HoodieInstant(true, s.getRight(), s.getLeft())))
.sorted(Comparator.comparing(HoodieInstant::getFileName)).collect(Collectors.toList()));
}
}
}