[HUDI-2675] Fix the exception 'Not an Avro data file' when archive and clean (#4016)
This commit is contained in:
@@ -18,15 +18,10 @@
|
||||
|
||||
package org.apache.hudi.io;
|
||||
|
||||
import org.apache.hudi.avro.model.HoodieActionInstant;
|
||||
import org.apache.hudi.avro.model.HoodieCleanMetadata;
|
||||
import org.apache.hudi.avro.model.HoodieCleanerPlan;
|
||||
import org.apache.hudi.avro.model.HoodieRollbackMetadata;
|
||||
import org.apache.hudi.client.utils.MetadataConversionUtils;
|
||||
import org.apache.hudi.common.HoodieCleanStat;
|
||||
import org.apache.hudi.common.config.HoodieMetadataConfig;
|
||||
import org.apache.hudi.common.fs.HoodieWrapperFileSystem;
|
||||
import org.apache.hudi.common.model.HoodieCleaningPolicy;
|
||||
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
||||
import org.apache.hudi.common.model.HoodieTableType;
|
||||
import org.apache.hudi.common.model.WriteOperationType;
|
||||
@@ -35,13 +30,11 @@ import org.apache.hudi.common.table.timeline.HoodieArchivedTimeline;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant.State;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.table.timeline.versioning.clean.CleanPlanV2MigrationHandler;
|
||||
import org.apache.hudi.common.table.view.FileSystemViewStorageConfig;
|
||||
import org.apache.hudi.common.testutils.HoodieMetadataTestTable;
|
||||
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
|
||||
import org.apache.hudi.common.testutils.HoodieTestTable;
|
||||
import org.apache.hudi.common.testutils.HoodieTestUtils;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.config.HoodieCompactionConfig;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
@@ -70,11 +63,9 @@ import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Random;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import static org.apache.hudi.common.util.CleanerUtils.convertCleanMetadata;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
@@ -502,8 +493,9 @@ public class TestHoodieTimelineArchiveLog extends HoodieClientTestHarness {
|
||||
HoodieTimeline.CLEAN_ACTION), expectedActiveInstants, commitsAfterArchival);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testArchiveCompletedRollbackAndClean() throws Exception {
|
||||
@ParameterizedTest
|
||||
@ValueSource(booleans = {true, false})
|
||||
public void testArchiveCompletedRollbackAndClean(boolean isEmpty) throws Exception {
|
||||
init();
|
||||
int minInstantsToKeep = 2;
|
||||
int maxInstantsToKeep = 10;
|
||||
@@ -519,11 +511,11 @@ public class TestHoodieTimelineArchiveLog extends HoodieClientTestHarness {
|
||||
|
||||
int startInstant = 1;
|
||||
for (int i = 0; i < maxInstantsToKeep + 1; i++, startInstant++) {
|
||||
createCleanMetadata(startInstant + "", false);
|
||||
createCleanMetadata(startInstant + "", false, isEmpty || i % 2 == 0);
|
||||
}
|
||||
|
||||
for (int i = 0; i < maxInstantsToKeep + 1; i++, startInstant += 2) {
|
||||
createCommitAndRollbackFile(startInstant + 1 + "", startInstant + "", false);
|
||||
createCommitAndRollbackFile(startInstant + 1 + "", startInstant + "", false, isEmpty || i % 2 == 0);
|
||||
}
|
||||
|
||||
HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient);
|
||||
@@ -701,31 +693,16 @@ public class TestHoodieTimelineArchiveLog extends HoodieClientTestHarness {
|
||||
return allInstants;
|
||||
}
|
||||
|
||||
private HoodieInstant createCleanMetadata(String instantTime, boolean inflightOnly) throws IOException {
|
||||
HoodieCleanerPlan cleanerPlan = new HoodieCleanerPlan(new HoodieActionInstant("", "", ""), "", new HashMap<>(),
|
||||
CleanPlanV2MigrationHandler.VERSION, new HashMap<>());
|
||||
if (inflightOnly) {
|
||||
HoodieTestTable.of(metaClient).addInflightClean(instantTime, cleanerPlan);
|
||||
} else {
|
||||
HoodieCleanStat cleanStats = new HoodieCleanStat(
|
||||
HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS,
|
||||
HoodieTestUtils.DEFAULT_PARTITION_PATHS[new Random().nextInt(HoodieTestUtils.DEFAULT_PARTITION_PATHS.length)],
|
||||
Collections.emptyList(),
|
||||
Collections.emptyList(),
|
||||
Collections.emptyList(),
|
||||
instantTime);
|
||||
HoodieCleanMetadata cleanMetadata = convertCleanMetadata(instantTime, Option.of(0L), Collections.singletonList(cleanStats));
|
||||
HoodieTestTable.of(metaClient).addClean(instantTime, cleanerPlan, cleanMetadata);
|
||||
}
|
||||
return new HoodieInstant(inflightOnly, "clean", instantTime);
|
||||
}
|
||||
|
||||
private void createCommitAndRollbackFile(String commitToRollback, String rollbackTIme, boolean isRollbackInflight) throws IOException {
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, commitToRollback, wrapperFs.getConf());
|
||||
createRollbackMetadata(rollbackTIme, commitToRollback, isRollbackInflight);
|
||||
createCommitAndRollbackFile(commitToRollback, rollbackTIme, isRollbackInflight, false);
|
||||
}
|
||||
|
||||
private HoodieInstant createRollbackMetadata(String rollbackTime, String commitToRollback, boolean inflight) throws IOException {
|
||||
private void createCommitAndRollbackFile(String commitToRollback, String rollbackTIme, boolean isRollbackInflight, boolean isEmpty) throws IOException {
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, commitToRollback, wrapperFs.getConf());
|
||||
createRollbackMetadata(rollbackTIme, commitToRollback, isRollbackInflight, isEmpty);
|
||||
}
|
||||
|
||||
private HoodieInstant createRollbackMetadata(String rollbackTime, String commitToRollback, boolean inflight, boolean isEmpty) throws IOException {
|
||||
if (inflight) {
|
||||
HoodieTestTable.of(metaClient).addInflightRollback(rollbackTime);
|
||||
} else {
|
||||
@@ -738,7 +715,7 @@ public class TestHoodieTimelineArchiveLog extends HoodieClientTestHarness {
|
||||
.setPartitionMetadata(Collections.emptyMap())
|
||||
.setInstantsRollback(Collections.emptyList())
|
||||
.build();
|
||||
HoodieTestTable.of(metaClient).addRollback(rollbackTime, hoodieRollbackMetadata);
|
||||
HoodieTestTable.of(metaClient).addRollback(rollbackTime, hoodieRollbackMetadata, isEmpty);
|
||||
}
|
||||
return new HoodieInstant(inflight, "rollback", rollbackTime);
|
||||
}
|
||||
|
||||
@@ -259,7 +259,7 @@ public class TestCleaner extends HoodieClientTestBase {
|
||||
* @param insertFn Insert API to be tested
|
||||
* @param upsertFn Upsert API to be tested
|
||||
* @param isPreppedAPI Flag to indicate if a prepped-version is used. If true, a wrapper function will be used during
|
||||
* record generation to also tag the regards (de-dupe is implicit as we use uniq record-gen APIs)
|
||||
* record generation to also tag the records (de-dupe is implicit as we use unique record-gen APIs)
|
||||
* @throws Exception in case of errors
|
||||
*/
|
||||
private void testInsertAndCleanByVersions(
|
||||
@@ -274,7 +274,7 @@ public class TestCleaner extends HoodieClientTestBase {
|
||||
.withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build())
|
||||
.withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).build())
|
||||
.build();
|
||||
try (SparkRDDWriteClient client = getHoodieWriteClient(cfg);) {
|
||||
try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
|
||||
|
||||
final Function2<List<HoodieRecord>, String, Integer> recordInsertGenWrappedFunction =
|
||||
generateWrapRecordsFn(isPreppedAPI, cfg, dataGen::generateInserts);
|
||||
@@ -429,7 +429,7 @@ public class TestCleaner extends HoodieClientTestBase {
|
||||
* @param insertFn Insert API to be tested
|
||||
* @param upsertFn Upsert API to be tested
|
||||
* @param isPreppedAPI Flag to indicate if a prepped-version is used. If true, a wrapper function will be used during
|
||||
* record generation to also tag the regards (de-dupe is implicit as we use uniq record-gen APIs)
|
||||
* record generation to also tag the records (de-dupe is implicit as we use unique record-gen APIs)
|
||||
* @throws Exception in case of errors
|
||||
*/
|
||||
private void testInsertAndCleanByCommits(
|
||||
@@ -550,10 +550,10 @@ public class TestCleaner extends HoodieClientTestBase {
|
||||
HoodieActiveTimeline timeline = metaClient.reloadActiveTimeline();
|
||||
assertTrue(timeline.getTimelineOfActions(
|
||||
CollectionUtils.createSet(HoodieTimeline.ROLLBACK_ACTION)).filterCompletedInstants().countInstants() == 3);
|
||||
Option<HoodieInstant> rolleBackInstantForFailedCommit = timeline.getTimelineOfActions(
|
||||
Option<HoodieInstant> rollBackInstantForFailedCommit = timeline.getTimelineOfActions(
|
||||
CollectionUtils.createSet(HoodieTimeline.ROLLBACK_ACTION)).filterCompletedInstants().lastInstant();
|
||||
HoodieRollbackMetadata rollbackMetadata = TimelineMetadataUtils.deserializeAvroMetadata(
|
||||
timeline.getInstantDetails(rolleBackInstantForFailedCommit.get()).get(), HoodieRollbackMetadata.class);
|
||||
timeline.getInstantDetails(rollBackInstantForFailedCommit.get()).get(), HoodieRollbackMetadata.class);
|
||||
// Rollback of one of the failed writes should have deleted 3 files
|
||||
assertEquals(3, rollbackMetadata.getTotalFilesDeleted());
|
||||
}
|
||||
@@ -750,6 +750,59 @@ public class TestCleaner extends HoodieClientTestBase {
|
||||
assertTrue(testTable.baseFileExists(p0, "00000000000003", file3P0C2));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCleanEmptyInstants() throws Exception {
|
||||
HoodieWriteConfig config =
|
||||
HoodieWriteConfig.newBuilder()
|
||||
.withPath(basePath)
|
||||
.withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build())
|
||||
.withCompactionConfig(HoodieCompactionConfig.newBuilder()
|
||||
.withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).build())
|
||||
.build();
|
||||
metaClient = HoodieTableMetaClient.reload(metaClient);
|
||||
|
||||
int commitCount = 20;
|
||||
int cleanCount = 20;
|
||||
|
||||
int startInstant = 1;
|
||||
for (int i = 0; i < commitCount; i++, startInstant++) {
|
||||
String commitTime = makeNewCommitTime(startInstant);
|
||||
HoodieTestTable.of(metaClient).addCommit(commitTime);
|
||||
}
|
||||
|
||||
for (int i = 0; i < cleanCount; i++, startInstant++) {
|
||||
String commitTime = makeNewCommitTime(startInstant);
|
||||
createCleanMetadata(commitTime + "", false, true);
|
||||
}
|
||||
|
||||
List<HoodieCleanStat> cleanStats = runCleaner(config);
|
||||
HoodieActiveTimeline timeline = metaClient.reloadActiveTimeline();
|
||||
|
||||
assertEquals(0, cleanStats.size(), "Must not clean any files");
|
||||
assertEquals(1, timeline.getTimelineOfActions(
|
||||
CollectionUtils.createSet(HoodieTimeline.CLEAN_ACTION)).filterInflightsAndRequested().countInstants());
|
||||
assertEquals(0, timeline.getTimelineOfActions(
|
||||
CollectionUtils.createSet(HoodieTimeline.CLEAN_ACTION)).filterInflights().countInstants());
|
||||
assertEquals(--cleanCount, timeline.getTimelineOfActions(
|
||||
CollectionUtils.createSet(HoodieTimeline.CLEAN_ACTION)).filterCompletedInstants().countInstants());
|
||||
assertTrue(timeline.getTimelineOfActions(
|
||||
CollectionUtils.createSet(HoodieTimeline.CLEAN_ACTION)).filterInflightsAndRequested().containsInstant(makeNewCommitTime(--startInstant)));
|
||||
|
||||
cleanStats = runCleaner(config);
|
||||
timeline = metaClient.reloadActiveTimeline();
|
||||
|
||||
assertEquals(0, cleanStats.size(), "Must not clean any files");
|
||||
assertEquals(1, timeline.getTimelineOfActions(
|
||||
CollectionUtils.createSet(HoodieTimeline.CLEAN_ACTION)).filterInflightsAndRequested().countInstants());
|
||||
assertEquals(0, timeline.getTimelineOfActions(
|
||||
CollectionUtils.createSet(HoodieTimeline.CLEAN_ACTION)).filterInflights().countInstants());
|
||||
assertEquals(--cleanCount, timeline.getTimelineOfActions(
|
||||
CollectionUtils.createSet(HoodieTimeline.CLEAN_ACTION)).filterCompletedInstants().countInstants());
|
||||
assertTrue(timeline.getTimelineOfActions(
|
||||
CollectionUtils.createSet(HoodieTimeline.CLEAN_ACTION)).filterInflightsAndRequested().containsInstant(makeNewCommitTime(--startInstant)));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Test HoodieTable.clean() Cleaning by versions logic for MOR table with Log files.
|
||||
*/
|
||||
@@ -1425,7 +1478,7 @@ public class TestCleaner extends HoodieClientTestBase {
|
||||
*
|
||||
* @param insertFn Insert API to be tested
|
||||
* @param isPreppedAPI Flag to indicate if a prepped-version is used. If true, a wrapper function will be used during
|
||||
* record generation to also tag the regards (de-dupe is implicit as we use uniq record-gen APIs)
|
||||
* record generation to also tag the records (de-dupe is implicit as we use unique record-gen APIs)
|
||||
* @throws Exception in case of errors
|
||||
*/
|
||||
private void testInsertAndCleanFailedWritesByVersions(
|
||||
@@ -1441,7 +1494,7 @@ public class TestCleaner extends HoodieClientTestBase {
|
||||
.withParallelism(1, 1).withBulkInsertParallelism(1).withFinalizeWriteParallelism(1).withDeleteParallelism(1)
|
||||
.withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build())
|
||||
.build();
|
||||
try (SparkRDDWriteClient client = getHoodieWriteClient(cfg);) {
|
||||
try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
|
||||
|
||||
final Function2<List<HoodieRecord>, String, Integer> recordInsertGenWrappedFunction =
|
||||
generateWrapRecordsFn(isPreppedAPI, cfg, dataGen::generateInserts);
|
||||
@@ -1474,10 +1527,10 @@ public class TestCleaner extends HoodieClientTestBase {
|
||||
HoodieActiveTimeline timeline = metaClient.reloadActiveTimeline();
|
||||
assertTrue(timeline.getTimelineOfActions(
|
||||
CollectionUtils.createSet(HoodieTimeline.ROLLBACK_ACTION)).filterCompletedInstants().countInstants() == 3);
|
||||
Option<HoodieInstant> rolleBackInstantForFailedCommit = timeline.getTimelineOfActions(
|
||||
Option<HoodieInstant> rollBackInstantForFailedCommit = timeline.getTimelineOfActions(
|
||||
CollectionUtils.createSet(HoodieTimeline.ROLLBACK_ACTION)).filterCompletedInstants().lastInstant();
|
||||
HoodieRollbackMetadata rollbackMetadata = TimelineMetadataUtils.deserializeAvroMetadata(
|
||||
timeline.getInstantDetails(rolleBackInstantForFailedCommit.get()).get(), HoodieRollbackMetadata.class);
|
||||
timeline.getInstantDetails(rollBackInstantForFailedCommit.get()).get(), HoodieRollbackMetadata.class);
|
||||
// Rollback of one of the failed writes should have deleted 3 files
|
||||
assertEquals(3, rollbackMetadata.getTotalFilesDeleted());
|
||||
}
|
||||
|
||||
@@ -17,13 +17,18 @@
|
||||
|
||||
package org.apache.hudi.testutils;
|
||||
|
||||
import org.apache.hudi.avro.model.HoodieActionInstant;
|
||||
import org.apache.hudi.avro.model.HoodieCleanMetadata;
|
||||
import org.apache.hudi.avro.model.HoodieCleanerPlan;
|
||||
import org.apache.hudi.client.HoodieReadClient;
|
||||
import org.apache.hudi.client.SparkRDDWriteClient;
|
||||
import org.apache.hudi.client.SparkTaskContextSupplier;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.common.HoodieCleanStat;
|
||||
import org.apache.hudi.common.engine.HoodieEngineContext;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.FileSlice;
|
||||
import org.apache.hudi.common.model.HoodieCleaningPolicy;
|
||||
import org.apache.hudi.common.model.HoodieFileFormat;
|
||||
import org.apache.hudi.common.model.HoodieFileGroup;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
@@ -31,7 +36,9 @@ import org.apache.hudi.common.model.HoodieRecordLocation;
|
||||
import org.apache.hudi.common.model.HoodieTableType;
|
||||
import org.apache.hudi.common.table.HoodieTableConfig;
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.table.timeline.versioning.clean.CleanPlanV2MigrationHandler;
|
||||
import org.apache.hudi.common.table.view.FileSystemViewStorageConfig;
|
||||
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
|
||||
import org.apache.hudi.common.table.view.TableFileSystemView;
|
||||
@@ -86,12 +93,14 @@ import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Properties;
|
||||
import java.util.Random;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import scala.Tuple2;
|
||||
|
||||
import static org.apache.hudi.common.util.CleanerUtils.convertCleanMetadata;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
@@ -674,7 +683,7 @@ public abstract class HoodieClientTestHarness extends HoodieCommonTestHarness im
|
||||
Assertions.assertEquals(MetadataPartitionType.values().length, metadataTablePartitions.size());
|
||||
|
||||
// Metadata table should automatically compact and clean
|
||||
// versions are +1 as autoclean / compaction happens end of commits
|
||||
// versions are +1 as autoClean / compaction happen at the end of commits
|
||||
int numFileVersions = metadataWriteConfig.getCleanerFileVersionsRetained() + 1;
|
||||
HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metadataMetaClient, metadataMetaClient.getActiveTimeline());
|
||||
metadataTablePartitions.forEach(partition -> {
|
||||
@@ -685,4 +694,27 @@ public abstract class HoodieClientTestHarness extends HoodieCommonTestHarness im
|
||||
+ numFileVersions + " but was " + latestSlices.size());
|
||||
});
|
||||
}
|
||||
|
||||
public HoodieInstant createCleanMetadata(String instantTime, boolean inflightOnly) throws IOException {
|
||||
return createCleanMetadata(instantTime, inflightOnly, false);
|
||||
}
|
||||
|
||||
public HoodieInstant createCleanMetadata(String instantTime, boolean inflightOnly, boolean isEmpty) throws IOException {
|
||||
HoodieCleanerPlan cleanerPlan = new HoodieCleanerPlan(new HoodieActionInstant("", "", ""), "", new HashMap<>(),
|
||||
CleanPlanV2MigrationHandler.VERSION, new HashMap<>());
|
||||
if (inflightOnly) {
|
||||
HoodieTestTable.of(metaClient).addInflightClean(instantTime, cleanerPlan);
|
||||
} else {
|
||||
HoodieCleanStat cleanStats = new HoodieCleanStat(
|
||||
HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS,
|
||||
HoodieTestUtils.DEFAULT_PARTITION_PATHS[new Random().nextInt(HoodieTestUtils.DEFAULT_PARTITION_PATHS.length)],
|
||||
Collections.emptyList(),
|
||||
Collections.emptyList(),
|
||||
Collections.emptyList(),
|
||||
instantTime);
|
||||
HoodieCleanMetadata cleanMetadata = convertCleanMetadata(instantTime, Option.of(0L), Collections.singletonList(cleanStats));
|
||||
HoodieTestTable.of(metaClient).addClean(instantTime, cleanerPlan, cleanMetadata, isEmpty);
|
||||
}
|
||||
return new HoodieInstant(inflightOnly, "clean", instantTime);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user