1
0

[HUDI-2675] Fix the exception 'Not an Avro data file' when archive and clean (#4016)

This commit is contained in:
董可伦
2021-12-30 11:53:17 +08:00
committed by GitHub
parent 674c149234
commit 436becf3ea
11 changed files with 209 additions and 78 deletions

View File

@@ -18,15 +18,10 @@
package org.apache.hudi.io;
import org.apache.hudi.avro.model.HoodieActionInstant;
import org.apache.hudi.avro.model.HoodieCleanMetadata;
import org.apache.hudi.avro.model.HoodieCleanerPlan;
import org.apache.hudi.avro.model.HoodieRollbackMetadata;
import org.apache.hudi.client.utils.MetadataConversionUtils;
import org.apache.hudi.common.HoodieCleanStat;
import org.apache.hudi.common.config.HoodieMetadataConfig;
import org.apache.hudi.common.fs.HoodieWrapperFileSystem;
import org.apache.hudi.common.model.HoodieCleaningPolicy;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.model.WriteOperationType;
@@ -35,13 +30,11 @@ import org.apache.hudi.common.table.timeline.HoodieArchivedTimeline;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieInstant.State;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.table.timeline.versioning.clean.CleanPlanV2MigrationHandler;
import org.apache.hudi.common.table.view.FileSystemViewStorageConfig;
import org.apache.hudi.common.testutils.HoodieMetadataTestTable;
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
import org.apache.hudi.common.testutils.HoodieTestTable;
import org.apache.hudi.common.testutils.HoodieTestUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieCompactionConfig;
import org.apache.hudi.config.HoodieWriteConfig;
@@ -70,11 +63,9 @@ import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import static org.apache.hudi.common.util.CleanerUtils.convertCleanMetadata;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -502,8 +493,9 @@ public class TestHoodieTimelineArchiveLog extends HoodieClientTestHarness {
HoodieTimeline.CLEAN_ACTION), expectedActiveInstants, commitsAfterArchival);
}
@Test
public void testArchiveCompletedRollbackAndClean() throws Exception {
@ParameterizedTest
@ValueSource(booleans = {true, false})
public void testArchiveCompletedRollbackAndClean(boolean isEmpty) throws Exception {
init();
int minInstantsToKeep = 2;
int maxInstantsToKeep = 10;
@@ -519,11 +511,11 @@ public class TestHoodieTimelineArchiveLog extends HoodieClientTestHarness {
int startInstant = 1;
for (int i = 0; i < maxInstantsToKeep + 1; i++, startInstant++) {
createCleanMetadata(startInstant + "", false);
createCleanMetadata(startInstant + "", false, isEmpty || i % 2 == 0);
}
for (int i = 0; i < maxInstantsToKeep + 1; i++, startInstant += 2) {
createCommitAndRollbackFile(startInstant + 1 + "", startInstant + "", false);
createCommitAndRollbackFile(startInstant + 1 + "", startInstant + "", false, isEmpty || i % 2 == 0);
}
HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient);
@@ -701,31 +693,16 @@ public class TestHoodieTimelineArchiveLog extends HoodieClientTestHarness {
return allInstants;
}
// Creates a clean instant on the test timeline.
// If inflightOnly, only the inflight clean plan is written; otherwise a completed
// clean (plan + zero-file clean metadata for a random default partition) is added.
// NOTE(review): this is the pre-commit version being removed by this diff — it is
// moved (with an added isEmpty flag) into HoodieClientTestHarness.
private HoodieInstant createCleanMetadata(String instantTime, boolean inflightOnly) throws IOException {
// Empty cleaner plan: no earliest instant, no policy, no partitions to clean.
HoodieCleanerPlan cleanerPlan = new HoodieCleanerPlan(new HoodieActionInstant("", "", ""), "", new HashMap<>(),
CleanPlanV2MigrationHandler.VERSION, new HashMap<>());
if (inflightOnly) {
HoodieTestTable.of(metaClient).addInflightClean(instantTime, cleanerPlan);
} else {
// Stats report an arbitrary default partition with nothing deleted/failed.
HoodieCleanStat cleanStats = new HoodieCleanStat(
HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS,
HoodieTestUtils.DEFAULT_PARTITION_PATHS[new Random().nextInt(HoodieTestUtils.DEFAULT_PARTITION_PATHS.length)],
Collections.emptyList(),
Collections.emptyList(),
Collections.emptyList(),
instantTime);
HoodieCleanMetadata cleanMetadata = convertCleanMetadata(instantTime, Option.of(0L), Collections.singletonList(cleanStats));
HoodieTestTable.of(metaClient).addClean(instantTime, cleanerPlan, cleanMetadata);
}
// First flag marks the instant as inflight when only the plan was written.
return new HoodieInstant(inflightOnly, "clean", instantTime);
}
// Diff overlay: the two direct calls below are the removed implementation; the
// single delegating call is its replacement, forwarding with isEmpty=false so
// existing callers keep writing non-empty rollback metadata files.
// (Typo "rollbackTIme" is preserved from the original parameter name.)
private void createCommitAndRollbackFile(String commitToRollback, String rollbackTIme, boolean isRollbackInflight) throws IOException {
HoodieTestDataGenerator.createCommitFile(basePath, commitToRollback, wrapperFs.getConf());
createRollbackMetadata(rollbackTIme, commitToRollback, isRollbackInflight);
createCommitAndRollbackFile(commitToRollback, rollbackTIme, isRollbackInflight, false);
}
private HoodieInstant createRollbackMetadata(String rollbackTime, String commitToRollback, boolean inflight) throws IOException {
// Writes a commit file for commitToRollback, then a rollback instant for it.
// isEmpty asks createRollbackMetadata to write a zero-byte completed rollback
// file — the condition this commit's archive/clean fix must tolerate
// (avoiding "Not an Avro data file" when such files are read back).
private void createCommitAndRollbackFile(String commitToRollback, String rollbackTIme, boolean isRollbackInflight, boolean isEmpty) throws IOException {
HoodieTestDataGenerator.createCommitFile(basePath, commitToRollback, wrapperFs.getConf());
createRollbackMetadata(rollbackTIme, commitToRollback, isRollbackInflight, isEmpty);
}
private HoodieInstant createRollbackMetadata(String rollbackTime, String commitToRollback, boolean inflight, boolean isEmpty) throws IOException {
if (inflight) {
HoodieTestTable.of(metaClient).addInflightRollback(rollbackTime);
} else {
@@ -738,7 +715,7 @@ public class TestHoodieTimelineArchiveLog extends HoodieClientTestHarness {
.setPartitionMetadata(Collections.emptyMap())
.setInstantsRollback(Collections.emptyList())
.build();
HoodieTestTable.of(metaClient).addRollback(rollbackTime, hoodieRollbackMetadata);
HoodieTestTable.of(metaClient).addRollback(rollbackTime, hoodieRollbackMetadata, isEmpty);
}
return new HoodieInstant(inflight, "rollback", rollbackTime);
}

View File

@@ -259,7 +259,7 @@ public class TestCleaner extends HoodieClientTestBase {
* @param insertFn Insert API to be tested
* @param upsertFn Upsert API to be tested
* @param isPreppedAPI Flag to indicate if a prepped-version is used. If true, a wrapper function will be used during
* record generation to also tag the regards (de-dupe is implicit as we use uniq record-gen APIs)
* record generation to also tag the regards (de-dupe is implicit as we use unique record-gen APIs)
* @throws Exception in case of errors
*/
private void testInsertAndCleanByVersions(
@@ -274,7 +274,7 @@ public class TestCleaner extends HoodieClientTestBase {
.withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build())
.withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).build())
.build();
try (SparkRDDWriteClient client = getHoodieWriteClient(cfg);) {
try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
final Function2<List<HoodieRecord>, String, Integer> recordInsertGenWrappedFunction =
generateWrapRecordsFn(isPreppedAPI, cfg, dataGen::generateInserts);
@@ -429,7 +429,7 @@ public class TestCleaner extends HoodieClientTestBase {
* @param insertFn Insert API to be tested
* @param upsertFn Upsert API to be tested
* @param isPreppedAPI Flag to indicate if a prepped-version is used. If true, a wrapper function will be used during
* record generation to also tag the regards (de-dupe is implicit as we use uniq record-gen APIs)
* record generation to also tag the regards (de-dupe is implicit as we use unique record-gen APIs)
* @throws Exception in case of errors
*/
private void testInsertAndCleanByCommits(
@@ -550,10 +550,10 @@ public class TestCleaner extends HoodieClientTestBase {
HoodieActiveTimeline timeline = metaClient.reloadActiveTimeline();
assertTrue(timeline.getTimelineOfActions(
CollectionUtils.createSet(HoodieTimeline.ROLLBACK_ACTION)).filterCompletedInstants().countInstants() == 3);
Option<HoodieInstant> rolleBackInstantForFailedCommit = timeline.getTimelineOfActions(
Option<HoodieInstant> rollBackInstantForFailedCommit = timeline.getTimelineOfActions(
CollectionUtils.createSet(HoodieTimeline.ROLLBACK_ACTION)).filterCompletedInstants().lastInstant();
HoodieRollbackMetadata rollbackMetadata = TimelineMetadataUtils.deserializeAvroMetadata(
timeline.getInstantDetails(rolleBackInstantForFailedCommit.get()).get(), HoodieRollbackMetadata.class);
timeline.getInstantDetails(rollBackInstantForFailedCommit.get()).get(), HoodieRollbackMetadata.class);
// Rollback of one of the failed writes should have deleted 3 files
assertEquals(3, rollbackMetadata.getTotalFilesDeleted());
}
@@ -750,6 +750,59 @@ public class TestCleaner extends HoodieClientTestBase {
assertTrue(testTable.baseFileExists(p0, "00000000000003", file3P0C2));
}
@Test
// Verifies the cleaner survives empty (zero-byte) completed clean files on the
// timeline: each runCleaner pass should delete nothing, remove one corrupt
// completed clean instant, and leave exactly one requested clean behind.
public void testCleanEmptyInstants() throws Exception {
HoodieWriteConfig config =
HoodieWriteConfig.newBuilder()
.withPath(basePath)
.withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build())
.withCompactionConfig(HoodieCompactionConfig.newBuilder()
.withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).build())
.build();
metaClient = HoodieTableMetaClient.reload(metaClient);
int commitCount = 20;
int cleanCount = 20;
int startInstant = 1;
// 20 ordinary commits followed by 20 clean instants whose completed files are
// empty (third arg isEmpty=true) — the corruption under test.
for (int i = 0; i < commitCount; i++, startInstant++) {
String commitTime = makeNewCommitTime(startInstant);
HoodieTestTable.of(metaClient).addCommit(commitTime);
}
for (int i = 0; i < cleanCount; i++, startInstant++) {
String commitTime = makeNewCommitTime(startInstant);
createCleanMetadata(commitTime + "", false, true);
}
// First cleaner run: must not delete data files despite the corrupt instants.
List<HoodieCleanStat> cleanStats = runCleaner(config);
HoodieActiveTimeline timeline = metaClient.reloadActiveTimeline();
assertEquals(0, cleanStats.size(), "Must not clean any files");
assertEquals(1, timeline.getTimelineOfActions(
CollectionUtils.createSet(HoodieTimeline.CLEAN_ACTION)).filterInflightsAndRequested().countInstants());
assertEquals(0, timeline.getTimelineOfActions(
CollectionUtils.createSet(HoodieTimeline.CLEAN_ACTION)).filterInflights().countInstants());
// Pre-decrement: one empty completed clean was rolled back, so 19 remain.
assertEquals(--cleanCount, timeline.getTimelineOfActions(
CollectionUtils.createSet(HoodieTimeline.CLEAN_ACTION)).filterCompletedInstants().countInstants());
// The rolled-back instant (latest clean time) is now back in requested state.
assertTrue(timeline.getTimelineOfActions(
CollectionUtils.createSet(HoodieTimeline.CLEAN_ACTION)).filterInflightsAndRequested().containsInstant(makeNewCommitTime(--startInstant)));
// Second run repeats the pattern on the next empty instant: 18 completed left.
cleanStats = runCleaner(config);
timeline = metaClient.reloadActiveTimeline();
assertEquals(0, cleanStats.size(), "Must not clean any files");
assertEquals(1, timeline.getTimelineOfActions(
CollectionUtils.createSet(HoodieTimeline.CLEAN_ACTION)).filterInflightsAndRequested().countInstants());
assertEquals(0, timeline.getTimelineOfActions(
CollectionUtils.createSet(HoodieTimeline.CLEAN_ACTION)).filterInflights().countInstants());
assertEquals(--cleanCount, timeline.getTimelineOfActions(
CollectionUtils.createSet(HoodieTimeline.CLEAN_ACTION)).filterCompletedInstants().countInstants());
assertTrue(timeline.getTimelineOfActions(
CollectionUtils.createSet(HoodieTimeline.CLEAN_ACTION)).filterInflightsAndRequested().containsInstant(makeNewCommitTime(--startInstant)));
}
/**
* Test HoodieTable.clean() Cleaning by versions logic for MOR table with Log files.
*/
@@ -1425,7 +1478,7 @@ public class TestCleaner extends HoodieClientTestBase {
*
* @param insertFn Insert API to be tested
* @param isPreppedAPI Flag to indicate if a prepped-version is used. If true, a wrapper function will be used during
* record generation to also tag the regards (de-dupe is implicit as we use uniq record-gen APIs)
* record generation to also tag the regards (de-dupe is implicit as we use unique record-gen APIs)
* @throws Exception in case of errors
*/
private void testInsertAndCleanFailedWritesByVersions(
@@ -1441,7 +1494,7 @@ public class TestCleaner extends HoodieClientTestBase {
.withParallelism(1, 1).withBulkInsertParallelism(1).withFinalizeWriteParallelism(1).withDeleteParallelism(1)
.withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build())
.build();
try (SparkRDDWriteClient client = getHoodieWriteClient(cfg);) {
try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
final Function2<List<HoodieRecord>, String, Integer> recordInsertGenWrappedFunction =
generateWrapRecordsFn(isPreppedAPI, cfg, dataGen::generateInserts);
@@ -1474,10 +1527,10 @@ public class TestCleaner extends HoodieClientTestBase {
HoodieActiveTimeline timeline = metaClient.reloadActiveTimeline();
assertTrue(timeline.getTimelineOfActions(
CollectionUtils.createSet(HoodieTimeline.ROLLBACK_ACTION)).filterCompletedInstants().countInstants() == 3);
Option<HoodieInstant> rolleBackInstantForFailedCommit = timeline.getTimelineOfActions(
Option<HoodieInstant> rollBackInstantForFailedCommit = timeline.getTimelineOfActions(
CollectionUtils.createSet(HoodieTimeline.ROLLBACK_ACTION)).filterCompletedInstants().lastInstant();
HoodieRollbackMetadata rollbackMetadata = TimelineMetadataUtils.deserializeAvroMetadata(
timeline.getInstantDetails(rolleBackInstantForFailedCommit.get()).get(), HoodieRollbackMetadata.class);
timeline.getInstantDetails(rollBackInstantForFailedCommit.get()).get(), HoodieRollbackMetadata.class);
// Rollback of one of the failed writes should have deleted 3 files
assertEquals(3, rollbackMetadata.getTotalFilesDeleted());
}

View File

@@ -17,13 +17,18 @@
package org.apache.hudi.testutils;
import org.apache.hudi.avro.model.HoodieActionInstant;
import org.apache.hudi.avro.model.HoodieCleanMetadata;
import org.apache.hudi.avro.model.HoodieCleanerPlan;
import org.apache.hudi.client.HoodieReadClient;
import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.client.SparkTaskContextSupplier;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.HoodieCleanStat;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.model.HoodieCleaningPolicy;
import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.model.HoodieFileGroup;
import org.apache.hudi.common.model.HoodieRecord;
@@ -31,7 +36,9 @@ import org.apache.hudi.common.model.HoodieRecordLocation;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.table.HoodieTableConfig;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.table.timeline.versioning.clean.CleanPlanV2MigrationHandler;
import org.apache.hudi.common.table.view.FileSystemViewStorageConfig;
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
import org.apache.hudi.common.table.view.TableFileSystemView;
@@ -86,12 +93,14 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Random;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.stream.Collectors;
import scala.Tuple2;
import static org.apache.hudi.common.util.CleanerUtils.convertCleanMetadata;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
@@ -674,7 +683,7 @@ public abstract class HoodieClientTestHarness extends HoodieCommonTestHarness im
Assertions.assertEquals(MetadataPartitionType.values().length, metadataTablePartitions.size());
// Metadata table should automatically compact and clean
// versions are +1 as autoclean / compaction happens end of commits
// versions are +1 as autoClean / compaction happens end of commits
int numFileVersions = metadataWriteConfig.getCleanerFileVersionsRetained() + 1;
HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metadataMetaClient, metadataMetaClient.getActiveTimeline());
metadataTablePartitions.forEach(partition -> {
@@ -685,4 +694,27 @@ public abstract class HoodieClientTestHarness extends HoodieCommonTestHarness im
+ numFileVersions + " but was " + latestSlices.size());
});
}
// Backward-compatible overload: writes a non-empty clean instant
// (delegates with isEmpty=false), preserving the pre-existing signature
// for callers that predate the empty-file test support.
public HoodieInstant createCleanMetadata(String instantTime, boolean inflightOnly) throws IOException {
return createCleanMetadata(instantTime, inflightOnly, false);
}
// Creates a clean instant on the test timeline, moved here from
// TestHoodieTimelineArchiveLog so multiple test classes can share it.
// inflightOnly: write only the inflight plan, no completed metadata.
// isEmpty: when completing the clean, write the completed file as a
// zero-byte file to simulate the corruption this commit's fix handles.
public HoodieInstant createCleanMetadata(String instantTime, boolean inflightOnly, boolean isEmpty) throws IOException {
// Empty cleaner plan: no earliest instant, no policy, no partitions to clean.
HoodieCleanerPlan cleanerPlan = new HoodieCleanerPlan(new HoodieActionInstant("", "", ""), "", new HashMap<>(),
CleanPlanV2MigrationHandler.VERSION, new HashMap<>());
if (inflightOnly) {
HoodieTestTable.of(metaClient).addInflightClean(instantTime, cleanerPlan);
} else {
// Stats report an arbitrary default partition with nothing deleted/failed.
HoodieCleanStat cleanStats = new HoodieCleanStat(
HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS,
HoodieTestUtils.DEFAULT_PARTITION_PATHS[new Random().nextInt(HoodieTestUtils.DEFAULT_PARTITION_PATHS.length)],
Collections.emptyList(),
Collections.emptyList(),
Collections.emptyList(),
instantTime);
HoodieCleanMetadata cleanMetadata = convertCleanMetadata(instantTime, Option.of(0L), Collections.singletonList(cleanStats));
// isEmpty is forwarded so addClean can emit an empty completed file.
HoodieTestTable.of(metaClient).addClean(instantTime, cleanerPlan, cleanMetadata, isEmpty);
}
// First flag marks the instant as inflight when only the plan was written.
return new HoodieInstant(inflightOnly, "clean", instantTime);
}
}