
[HUDI-3191] Removing duplicating file-listing process w/in Hive's MOR FileInputFormats (#4556)

Author: Alexey Kudinkin
Date: 2022-02-03 14:01:41 -08:00
Committed by: GitHub
Parent: 5927bdd1c0
Commit: 69dfcda116
22 changed files with 493 additions and 197 deletions
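
The diffs below repeat one core change: rather than collecting individual base-file paths and handing those to the Hive input format, the tests now hand over the parent (partition) directories and leave the single file-listing pass to the input format itself. A minimal sketch of the recurring pattern, with the names (tableView, inputPaths) taken from the hunks below:

    List<String> inputPaths = tableView.getLatestBaseFiles()
        .map(baseFile -> new Path(baseFile.getPath()).getParent().toString())
        .collect(Collectors.toList());
    List<GenericRecord> recordsRead =
        HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths, basePath());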

TestHoodieMergeOnReadTable.java

@@ -18,6 +18,7 @@
 package org.apache.hudi.table;
+import org.apache.hadoop.fs.Path;
 import org.apache.hudi.client.HoodieReadClient;
 import org.apache.hudi.client.SparkRDDWriteClient;
 import org.apache.hudi.client.WriteStatus;
@@ -189,8 +190,10 @@ public class TestHoodieMergeOnReadTable extends SparkClientFunctionalTestHarness
     assertTrue(fileIdToNewSize.entrySet().stream().anyMatch(entry -> fileIdToSize.get(entry.getKey()) < entry.getValue()));
-    List<String> dataFiles = roView.getLatestBaseFiles().map(HoodieBaseFile::getPath).collect(Collectors.toList());
-    List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), dataFiles,
+    List<String> inputPaths = roView.getLatestBaseFiles()
+        .map(baseFile -> new Path(baseFile.getPath()).getParent().toString())
+        .collect(Collectors.toList());
+    List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths,
         basePath(), new JobConf(hadoopConf()), true, false);
     // Wrote 20 records in 2 batches
     assertEquals(40, recordsRead.size(), "Must contain 40 records");
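
The getParent() call above maps each base file to its partition directory. For illustration (a hypothetical path, not taken from this commit):

    Path baseFilePath = new Path("/tmp/hoodie/2016/03/15/abc123_1-0-1_001.parquet");
    // Path#getParent drops the last path component, leaving the partition directory
    String partitionDir = baseFilePath.getParent().toString(); // "/tmp/hoodie/2016/03/15"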

TestHoodieSparkMergeOnReadTableCompaction.java

@@ -22,6 +22,7 @@ package org.apache.hudi.table.functional;
 import org.apache.hudi.client.SparkRDDWriteClient;
 import org.apache.hudi.client.WriteStatus;
 import org.apache.hudi.common.model.HoodieTableType;
+import org.apache.hudi.common.model.HoodieWriteStat;
 import org.apache.hudi.common.table.HoodieTableMetaClient;
 import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
 import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
@@ -113,13 +114,14 @@ public class TestHoodieSparkMergeOnReadTableCompaction extends SparkClientFuncti
     JavaRDD records = jsc().parallelize(dataGen.generateInserts(instant, numRecords), 2);
     metaClient = HoodieTableMetaClient.reload(metaClient);
     client.startCommitWithTime(instant);
-    List<WriteStatus> writeStatues = client.upsert(records, instant).collect();
-    org.apache.hudi.testutils.Assertions.assertNoWriteErrors(writeStatues);
+    List<WriteStatus> writeStatuses = client.upsert(records, instant).collect();
+    org.apache.hudi.testutils.Assertions.assertNoWriteErrors(writeStatuses);
     if (doCommit) {
-      Assertions.assertTrue(client.commitStats(instant, writeStatues.stream().map(WriteStatus::getStat).collect(Collectors.toList()),
-          Option.empty(), metaClient.getCommitActionType()));
+      List<HoodieWriteStat> writeStats = writeStatuses.stream().map(WriteStatus::getStat).collect(Collectors.toList());
+      boolean committed = client.commitStats(instant, writeStats, Option.empty(), metaClient.getCommitActionType());
+      Assertions.assertTrue(committed);
     }
     metaClient = HoodieTableMetaClient.reload(metaClient);
-    return writeStatues;
+    return writeStatuses;
   }
 }

TestHoodieSparkMergeOnReadTableInsertUpdateDelete.java

@@ -19,6 +19,7 @@
 package org.apache.hudi.table.functional;
+import org.apache.hadoop.fs.Path;
 import org.apache.hudi.client.SparkRDDWriteClient;
 import org.apache.hudi.client.WriteStatus;
 import org.apache.hudi.common.model.FileSlice;
@@ -213,8 +214,11 @@ public class TestHoodieSparkMergeOnReadTableInsertUpdateDelete extends SparkClie
     dataFilesToRead = tableView.getLatestBaseFiles();
     assertTrue(dataFilesToRead.findAny().isPresent());
-    List<String> dataFiles = tableView.getLatestBaseFiles().map(HoodieBaseFile::getPath).collect(Collectors.toList());
-    List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), dataFiles, basePath(), new JobConf(hadoopConf()), true, false);
+    List<String> inputPaths = tableView.getLatestBaseFiles()
+        .map(baseFile -> new Path(baseFile.getPath()).getParent().toString())
+        .collect(Collectors.toList());
+    List<GenericRecord> recordsRead =
+        HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths, basePath(), new JobConf(hadoopConf()), true, false);
     // Wrote 20 records and deleted 20 records, so remaining 20-20 = 0
     assertEquals(0, recordsRead.size(), "Must contain 0 records");
   }
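
Passing directories matters because Hive's FileInputFormat derives its splits from the paths configured on the JobConf, so handing over the partition directory lets a single listing inside the input format discover base and log files together. A sketch of how such paths are typically wired into a job (an assumption about what getRecordsUsingInputFormat does internally, not something shown in this diff):

    JobConf jobConf = new JobConf(hadoopConf());
    // org.apache.hadoop.mapred.FileInputFormat accepts a comma-separated list of input paths
    FileInputFormat.setInputPaths(jobConf, String.join(",", inputPaths));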

TestHoodieSparkMergeOnReadTableRollback.java

@@ -24,6 +24,7 @@ import org.apache.hudi.client.WriteStatus;
 import org.apache.hudi.common.config.HoodieMetadataConfig;
 import org.apache.hudi.common.model.FileSlice;
 import org.apache.hudi.common.model.HoodieBaseFile;
+import org.apache.hudi.common.model.HoodieCommitMetadata;
 import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy;
 import org.apache.hudi.common.model.HoodieFileGroup;
 import org.apache.hudi.common.model.HoodieRecord;
@@ -40,6 +41,7 @@ import org.apache.hudi.common.table.view.TableFileSystemView;
 import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
 import org.apache.hudi.common.testutils.HoodieTestTable;
 import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.collection.Pair;
 import org.apache.hudi.config.HoodieCompactionConfig;
 import org.apache.hudi.config.HoodieStorageConfig;
 import org.apache.hudi.config.HoodieWriteConfig;
@@ -64,6 +66,7 @@ import java.io.IOException;
 import java.nio.file.Files;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collection;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -172,10 +175,12 @@ public class TestHoodieSparkMergeOnReadTableRollback extends SparkClientFunction
     JavaRDD<HoodieRecord> writeRecords = jsc().parallelize(records, 1);
     JavaRDD<WriteStatus> writeStatusJavaRDD = client.upsert(writeRecords, newCommitTime);
-    client.commit(newCommitTime, writeStatusJavaRDD);
     List<WriteStatus> statuses = writeStatusJavaRDD.collect();
     assertNoWriteErrors(statuses);
+    client.commit(newCommitTime, jsc().parallelize(statuses));
     HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient);
     Option<HoodieInstant> deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
@@ -208,8 +213,10 @@ public class TestHoodieSparkMergeOnReadTableRollback extends SparkClientFunction
     copyOfRecords = dataGen.generateUpdates(commitTime1, copyOfRecords);
     copyOfRecords.addAll(dataGen.generateInserts(commitTime1, 200));
-    List<String> dataFiles = tableView.getLatestBaseFiles().map(HoodieBaseFile::getPath).collect(Collectors.toList());
-    List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), dataFiles,
+    List<String> inputPaths = tableView.getLatestBaseFiles()
+        .map(baseFile -> new Path(baseFile.getPath()).getParent().toString())
+        .collect(Collectors.toList());
+    List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths,
         basePath());
     assertEquals(200, recordsRead.size());
@@ -225,8 +232,10 @@ public class TestHoodieSparkMergeOnReadTableRollback extends SparkClientFunction
         .contains(commitTime1)).map(fileStatus -> fileStatus.getPath().toString()).collect(Collectors.toList());
     assertEquals(0, remainingFiles.size(), "There files should have been rolled-back "
         + "when rolling back commit " + commitTime1 + " but are still remaining. Files: " + remainingFiles);
-    dataFiles = tableView.getLatestBaseFiles().map(HoodieBaseFile::getPath).collect(Collectors.toList());
-    recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), dataFiles, basePath());
+    inputPaths = tableView.getLatestBaseFiles()
+        .map(baseFile -> new Path(baseFile.getPath()).getParent().toString())
+        .collect(Collectors.toList());
+    recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths, basePath());
     assertEquals(200, recordsRead.size());
   }
@@ -241,8 +250,10 @@ public class TestHoodieSparkMergeOnReadTableRollback extends SparkClientFunction
     copyOfRecords = dataGen.generateUpdates(commitTime2, copyOfRecords);
     copyOfRecords.addAll(dataGen.generateInserts(commitTime2, 200));
-    List<String> dataFiles = tableView.getLatestBaseFiles().map(HoodieBaseFile::getPath).collect(Collectors.toList());
-    List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), dataFiles,
+    List<String> inputPaths = tableView.getLatestBaseFiles()
+        .map(baseFile -> new Path(baseFile.getPath()).getParent().toString())
+        .collect(Collectors.toList());
+    List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths,
         basePath());
     assertEquals(200, recordsRead.size());
@@ -262,8 +273,10 @@ public class TestHoodieSparkMergeOnReadTableRollback extends SparkClientFunction
     metaClient = HoodieTableMetaClient.reload(metaClient);
     hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient);
     tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles);
-    dataFiles = tableView.getLatestBaseFiles().map(HoodieBaseFile::getPath).collect(Collectors.toList());
-    recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), dataFiles, basePath());
+    inputPaths = tableView.getLatestBaseFiles()
+        .map(baseFile -> new Path(baseFile.getPath()).getParent().toString())
+        .collect(Collectors.toList());
+    recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths, basePath());
     // check that the number of records read is still correct after rollback operation
     assertEquals(200, recordsRead.size());
@@ -275,11 +288,13 @@ public class TestHoodieSparkMergeOnReadTableRollback extends SparkClientFunction
     thirdClient.startCommitWithTime(newCommitTime);
     writeStatusJavaRDD = thirdClient.upsert(writeRecords, newCommitTime);
     statuses = writeStatusJavaRDD.collect();
-    thirdClient.commit(newCommitTime, writeStatusJavaRDD);
     // Verify there are no errors
     assertNoWriteErrors(statuses);
+    thirdClient.commit(newCommitTime, jsc().parallelize(statuses));
     metaClient = HoodieTableMetaClient.reload(metaClient);
     String compactionInstantTime = thirdClient.scheduleCompaction(Option.empty()).get().toString();
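
This hunk and several below reorder the write flow: statuses are collected and verified before the commit, and the commit is fed a re-parallelized RDD of the already materialized statuses rather than the original lazy RDD, which Spark would otherwise re-evaluate. The recurring shape, with names from the diff:

    List<WriteStatus> statuses = writeStatusJavaRDD.collect();   // materialize the write results once
    assertNoWriteErrors(statuses);                               // fail fast before committing
    client.commit(newCommitTime, jsc().parallelize(statuses));   // commit the collected results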
@@ -317,8 +332,8 @@ public class TestHoodieSparkMergeOnReadTableRollback extends SparkClientFunction
     HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, properties);
     try (final SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
       HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
       /*
        * Write 1 (only inserts)
        */
@@ -329,20 +344,29 @@ public class TestHoodieSparkMergeOnReadTableRollback extends SparkClientFunction
       JavaRDD<HoodieRecord> writeRecords = jsc().parallelize(records, 1);
       JavaRDD<WriteStatus> writeStatusJavaRDD = client.upsert(writeRecords, newCommitTime);
-      client.commit(newCommitTime, writeStatusJavaRDD);
       List<WriteStatus> statuses = writeStatusJavaRDD.collect();
       assertNoWriteErrors(statuses);
+      client.commit(newCommitTime, jsc().parallelize(statuses));
+      client.close();
-      HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient);
+      Option<Pair<HoodieInstant, HoodieCommitMetadata>> instantCommitMetadataPairOpt =
+          metaClient.getActiveTimeline().getLastCommitMetadataWithValidData();
-      Option<HoodieInstant> deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
-      assertTrue(deltaCommit.isPresent());
-      assertEquals("001", deltaCommit.get().getTimestamp(), "Delta commit should be 001");
+      assertTrue(instantCommitMetadataPairOpt.isPresent());
+      HoodieInstant commitInstant = instantCommitMetadataPairOpt.get().getKey();
+      assertEquals("001", commitInstant.getTimestamp());
+      assertEquals(HoodieTimeline.DELTA_COMMIT_ACTION, commitInstant.getAction());
+      assertEquals(200, getTotalRecordsWritten(instantCommitMetadataPairOpt.get().getValue()));
       Option<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
       assertFalse(commit.isPresent());
+      HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient);
       FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable);
       HoodieTableFileSystemView tableView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles);
       Stream<HoodieBaseFile> dataFilesToRead = tableView.getLatestBaseFiles();
@@ -352,6 +376,7 @@ public class TestHoodieSparkMergeOnReadTableRollback extends SparkClientFunction
       dataFilesToRead = tableView.getLatestBaseFiles();
       assertTrue(dataFilesToRead.findAny().isPresent(),
           "Should list the base files we wrote in the delta commit");
       /*
        * Write 2 (inserts + updates)
        */
@@ -368,7 +393,9 @@ public class TestHoodieSparkMergeOnReadTableRollback extends SparkClientFunction
       copyOfRecords = dataGen.generateUpdates(newCommitTime, copyOfRecords);
       copyOfRecords.addAll(dataGen.generateInserts(newCommitTime, 200));
-      List<String> dataFiles = tableView.getLatestBaseFiles().map(hf -> hf.getPath()).collect(Collectors.toList());
+      List<String> dataFiles = tableView.getLatestBaseFiles()
+          .map(baseFile -> new Path(baseFile.getPath()).getParent().toString())
+          .collect(Collectors.toList());
       List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), dataFiles,
           basePath());
       assertEquals(200, recordsRead.size());
@@ -376,7 +403,9 @@ public class TestHoodieSparkMergeOnReadTableRollback extends SparkClientFunction
       statuses = nClient.upsert(jsc().parallelize(copyOfRecords, 1), newCommitTime).collect();
       // Verify there are no errors
       assertNoWriteErrors(statuses);
-      nClient.commit(newCommitTime, writeStatusJavaRDD);
+      nClient.commit(newCommitTime, jsc().parallelize(statuses));
       copyOfRecords.clear();
     }
@@ -393,11 +422,12 @@ public class TestHoodieSparkMergeOnReadTableRollback extends SparkClientFunction
       writeRecords = jsc().parallelize(records, 1);
       writeStatusJavaRDD = client.upsert(writeRecords, newCommitTime);
-      client.commit(newCommitTime, writeStatusJavaRDD);
       statuses = writeStatusJavaRDD.collect();
       // Verify there are no errors
       assertNoWriteErrors(statuses);
+      client.commit(newCommitTime, jsc().parallelize(statuses));
       metaClient = HoodieTableMetaClient.reload(metaClient);
       String compactionInstantTime = "004";
@@ -414,11 +444,12 @@ public class TestHoodieSparkMergeOnReadTableRollback extends SparkClientFunction
       writeRecords = jsc().parallelize(records, 1);
       writeStatusJavaRDD = client.upsert(writeRecords, newCommitTime);
-      client.commit(newCommitTime, writeStatusJavaRDD);
       statuses = writeStatusJavaRDD.collect();
       // Verify there are no errors
       assertNoWriteErrors(statuses);
+      client.commit(newCommitTime, jsc().parallelize(statuses));
       metaClient = HoodieTableMetaClient.reload(metaClient);
       compactionInstantTime = "006";
@@ -447,7 +478,9 @@ public class TestHoodieSparkMergeOnReadTableRollback extends SparkClientFunction
       statuses = client.upsert(jsc().parallelize(copyOfRecords, 1), newCommitTime).collect();
       // Verify there are no errors
       assertNoWriteErrors(statuses);
-      client.commit(newCommitTime, writeStatusJavaRDD);
+      client.commit(newCommitTime, jsc().parallelize(statuses));
       copyOfRecords.clear();
       // Rollback latest commit first
@@ -471,6 +504,13 @@ public class TestHoodieSparkMergeOnReadTableRollback extends SparkClientFunction
     }
   }
+
+  private long getTotalRecordsWritten(HoodieCommitMetadata commitMetadata) {
+    return commitMetadata.getPartitionToWriteStats().values().stream()
+        .flatMap(Collection::stream)
+        .map(stat -> stat.getNumWrites() + stat.getNumUpdateWrites())
+        .reduce(0L, Long::sum);
+  }
+
   @ParameterizedTest
   @ValueSource(booleans = {true, false})
   void testMORTableRestore(boolean restoreAfterCompaction) throws Exception {
@@ -523,8 +563,6 @@ public class TestHoodieSparkMergeOnReadTableRollback extends SparkClientFunction
     JavaRDD<HoodieRecord> writeRecords = jsc().parallelize(records, 1);
     JavaRDD<WriteStatus> writeStatusJavaRDD = client.upsert(writeRecords, newCommitTime);
     client.commit(newCommitTime, writeStatusJavaRDD);
-    List<WriteStatus> statuses = writeStatusJavaRDD.collect();
-    assertNoWriteErrors(statuses);
     return records;
   }
@@ -541,8 +579,10 @@ public class TestHoodieSparkMergeOnReadTableRollback extends SparkClientFunction
     HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient);
     FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable);
     HoodieTableFileSystemView tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles);
-    List<String> dataFiles = tableView.getLatestBaseFiles().map(hf -> hf.getPath()).collect(Collectors.toList());
-    List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), dataFiles,
+    List<String> inputPaths = tableView.getLatestBaseFiles()
+        .map(hf -> new Path(hf.getPath()).getParent().toString())
+        .collect(Collectors.toList());
+    List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths,
         basePath());
     assertRecords(expectedRecords, recordsRead);
   }
@@ -603,9 +643,8 @@ public class TestHoodieSparkMergeOnReadTableRollback extends SparkClientFunction
     List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 100);
     JavaRDD<HoodieRecord> recordsRDD = jsc().parallelize(records, 1);
-    JavaRDD<WriteStatus> statuses = writeClient.insert(recordsRDD, newCommitTime);
-    // trigger an action
-    List<WriteStatus> writeStatuses = statuses.collect();
+    List<WriteStatus> writeStatuses = ((JavaRDD<WriteStatus>) writeClient.insert(recordsRDD, newCommitTime)).collect();
     // Ensure that inserts are written to only log files
     assertEquals(0,