1
0

[HUDI-1740] Fix insert-overwrite API archival (#2784)

- Fix problem of archiving replace commits
- Fix problem when getting empty replacecommit.requested
- Improved the handling of empty and non-empty requested/inflight commit files. Added unit tests covering both the empty and non-empty inflight-file cases, and cleaned up some unused test util methods.

Co-authored-by: yorkzero831 <yorkzero8312@gmail.com>
Co-authored-by: zheren.yu <zheren.yu@paypay-corp.co.jp>
This commit is contained in:
Susu Dong
2021-05-22 05:52:13 +09:00
committed by GitHub
parent 99b14a78e3
commit 685f77b5dd
12 changed files with 169 additions and 107 deletions

View File

@@ -68,6 +68,11 @@ public class ReplaceArchivalHelper implements Serializable {
public static boolean deleteReplacedFileGroups(HoodieEngineContext context, HoodieTableMetaClient metaClient,
TableFileSystemView fileSystemView,
HoodieInstant instant, List<String> replacedPartitions) {
// There is no file id to be replaced in the very first replace commit file for insert overwrite operation
if (replacedPartitions.isEmpty()) {
LOG.warn("Found no partition files to replace");
return true;
}
context.setJobStatus(ReplaceArchivalHelper.class.getSimpleName(), "Delete replaced file groups");
List<Boolean> f = context.map(replacedPartitions, partition -> {
Stream<FileSlice> fileSlices = fileSystemView.getReplacedFileGroupsBeforeOrOn(instant.getTimestamp(), partition)

View File

@@ -21,6 +21,7 @@ package org.apache.hudi.client.utils;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.io.IOException;
import org.apache.hudi.avro.model.HoodieArchivedMetaEntry;
import org.apache.hudi.avro.model.HoodieCompactionPlan;
import org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata;
@@ -37,8 +38,8 @@ import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.table.timeline.TimelineMetadataUtils;
import org.apache.hudi.common.util.CleanerUtils;
import org.apache.hudi.common.util.ClusteringUtils;
import org.apache.hudi.common.util.CompactionUtils;
import org.apache.hudi.common.util.Option;
/**
* Helper class to convert between different action related payloads and {@link HoodieArchivedMetaEntry}.
@@ -72,10 +73,21 @@ public class MetadataConversionUtils {
HoodieReplaceCommitMetadata replaceCommitMetadata = HoodieReplaceCommitMetadata
.fromBytes(metaClient.getActiveTimeline().getInstantDetails(hoodieInstant).get(), HoodieReplaceCommitMetadata.class);
archivedMetaWrapper.setHoodieReplaceCommitMetadata(ReplaceArchivalHelper.convertReplaceCommitMetadata(replaceCommitMetadata));
} else if (hoodieInstant.isInflight()) {
// inflight replacecommit files have the same meta data body as HoodieCommitMetadata
// so we could re-use it without further creating an inflight extension.
// Or inflight replacecommit files are empty under clustering circumstance
Option<HoodieCommitMetadata> inflightCommitMetadata = getInflightReplaceMetadata(metaClient, hoodieInstant);
if (inflightCommitMetadata.isPresent()) {
archivedMetaWrapper.setHoodieInflightReplaceMetadata(convertCommitMetadata(inflightCommitMetadata.get()));
}
} else {
HoodieRequestedReplaceMetadata requestedReplaceMetadata =
ClusteringUtils.getRequestedReplaceMetadata(metaClient, hoodieInstant).get();
archivedMetaWrapper.setHoodieRequestedReplaceMetadata(requestedReplaceMetadata);
// we may have cases with empty HoodieRequestedReplaceMetadata e.g. insert_overwrite_table or insert_overwrite
// without clustering. However, we should revisit the requested commit file standardization
Option<HoodieRequestedReplaceMetadata> requestedReplaceMetadata = getRequestedReplaceMetadata(metaClient, hoodieInstant);
if (requestedReplaceMetadata.isPresent()) {
archivedMetaWrapper.setHoodieRequestedReplaceMetadata(requestedReplaceMetadata.get());
}
}
archivedMetaWrapper.setActionType(ActionType.replacecommit.name());
break;
@@ -107,14 +119,25 @@ public class MetadataConversionUtils {
return archivedMetaWrapper;
}
public static HoodieArchivedMetaEntry createMetaWrapper(HoodieInstant hoodieInstant,
HoodieCommitMetadata hoodieCommitMetadata) {
HoodieArchivedMetaEntry archivedMetaWrapper = new HoodieArchivedMetaEntry();
archivedMetaWrapper.setCommitTime(hoodieInstant.getTimestamp());
archivedMetaWrapper.setActionState(hoodieInstant.getState().name());
archivedMetaWrapper.setHoodieCommitMetadata(convertCommitMetadata(hoodieCommitMetadata));
archivedMetaWrapper.setActionType(ActionType.commit.name());
return archivedMetaWrapper;
/**
 * Reads the commit metadata stored in an inflight replacecommit instant.
 *
 * <p>Inflight replacecommit files can legitimately be empty (e.g. when users opt in to
 * clustering), in which case there is no metadata to deserialize and an empty Option
 * is returned instead of failing.
 *
 * @param metaClient meta client used to access the active timeline
 * @param instant    the inflight replacecommit instant to read
 * @return the deserialized {@link HoodieCommitMetadata}, or {@code Option.empty()} when
 *         the inflight file is absent or has no content
 * @throws IOException if the instant content cannot be deserialized
 */
public static Option<HoodieCommitMetadata> getInflightReplaceMetadata(HoodieTableMetaClient metaClient, HoodieInstant instant) throws IOException {
  Option<byte[]> content = metaClient.getActiveTimeline().getInstantDetails(instant);
  boolean hasContent = content.isPresent() && content.get().length > 0;
  if (!hasContent) {
    // Empty inflight file: nothing to convert, signal absence to the caller.
    return Option.empty();
  }
  HoodieCommitMetadata metadata = HoodieCommitMetadata.fromBytes(content.get(), HoodieCommitMetadata.class);
  return Option.of(metadata);
}
/**
 * Reads the {@link HoodieRequestedReplaceMetadata} stored in a requested replacecommit instant.
 *
 * <p>Requested commit files can be empty in certain cases (e.g. insert_overwrite or
 * insert_overwrite_table without clustering); an empty Option is returned for those.
 * However, requested files appear to be supposed to carry metadata, so the
 * standardization of requested commit files should be revisited.
 * TODO: revisit requested commit file standardization, see
 * https://issues.apache.org/jira/browse/HUDI-1739
 *
 * @param metaClient meta client used to access the active timeline
 * @param instant    the requested replacecommit instant to read
 * @return the deserialized requested replace metadata, or {@code Option.empty()} when
 *         the requested file is absent or has no content
 * @throws IOException if the instant content cannot be deserialized
 */
public static Option<HoodieRequestedReplaceMetadata> getRequestedReplaceMetadata(HoodieTableMetaClient metaClient, HoodieInstant instant) throws IOException {
  Option<byte[]> content = metaClient.getActiveTimeline().getInstantDetails(instant);
  boolean hasContent = content.isPresent() && content.get().length > 0;
  if (!hasContent) {
    // Empty requested file: nothing to deserialize, signal absence to the caller.
    return Option.empty();
  }
  return Option.of(TimelineMetadataUtils.deserializeRequestedReplaceMetadata(content.get()));
}
public static org.apache.hudi.avro.model.HoodieCommitMetadata convertCommitMetadata(

View File

@@ -296,7 +296,6 @@ public class HoodieTimelineArchiveLog<T extends HoodieAvroPayload, I, K, O> {
public void archive(HoodieEngineContext context, List<HoodieInstant> instants) throws HoodieCommitException {
try {
HoodieTimeline commitTimeline = metaClient.getActiveTimeline().getAllCommitsTimeline().filterCompletedInstants();
Schema wrapperSchema = HoodieArchivedMetaEntry.getClassSchema();
LOG.info("Wrapper schema " + wrapperSchema.toString());
List<IndexedRecord> records = new ArrayList<>();
@@ -308,7 +307,7 @@ public class HoodieTimelineArchiveLog<T extends HoodieAvroPayload, I, K, O> {
}
try {
deleteAnyLeftOverMarkerFiles(context, hoodieInstant);
records.add(convertToAvroRecord(commitTimeline, hoodieInstant));
records.add(convertToAvroRecord(hoodieInstant));
if (records.size() >= this.config.getCommitArchivalBatchSize()) {
writeToFile(wrapperSchema, records);
}
@@ -365,8 +364,8 @@ public class HoodieTimelineArchiveLog<T extends HoodieAvroPayload, I, K, O> {
}
}
private IndexedRecord convertToAvroRecord(HoodieTimeline commitTimeline, HoodieInstant hoodieInstant)
throws IOException {
private IndexedRecord convertToAvroRecord(HoodieInstant hoodieInstant)
throws IOException {
return MetadataConversionUtils.createMetaWrapper(hoodieInstant, metaClient);
}
}