1
0

[HUDI-999] [RFC-12] Parallelize fetching of source data files/partitions (#1924)

This commit is contained in:
Udit Mehrotra
2020-08-06 23:44:57 -07:00
committed by GitHub
parent b51646dcc7
commit ab453f2623
4 changed files with 95 additions and 52 deletions

View File

@@ -171,9 +171,9 @@ public class TestBootstrap extends HoodieClientTestBase {
} else {
df.write().format("parquet").mode(SaveMode.Overwrite).save(srcPath);
}
String filePath = FileStatusUtils.toPath(BootstrapUtils.getAllLeafFoldersWithFiles(metaClient.getFs(), srcPath,
(status) -> status.getName().endsWith(".parquet")).stream().findAny().map(p -> p.getValue().stream().findAny())
.orElse(null).get().getPath()).toString();
String filePath = FileStatusUtils.toPath(BootstrapUtils.getAllLeafFoldersWithFiles(metaClient, metaClient.getFs(),
srcPath, jsc).stream().findAny().map(p -> p.getValue().stream().findAny())
.orElse(null).get().getPath()).toString();
ParquetFileReader reader = ParquetFileReader.open(metaClient.getHadoopConf(), new Path(filePath));
MessageType schema = reader.getFooter().getFileMetaData().getSchema();
return new AvroSchemaConverter().convert(schema);
@@ -266,8 +266,8 @@ public class TestBootstrap extends HoodieClientTestBase {
client.rollBackInflightBootstrap();
metaClient.reloadActiveTimeline();
assertEquals(0, metaClient.getCommitsTimeline().countInstants());
assertEquals(0L, BootstrapUtils.getAllLeafFoldersWithFiles(metaClient.getFs(), basePath,
(status) -> status.getName().endsWith(".parquet")).stream().flatMap(f -> f.getValue().stream()).count());
assertEquals(0L, BootstrapUtils.getAllLeafFoldersWithFiles(metaClient, metaClient.getFs(), basePath, jsc)
.stream().flatMap(f -> f.getValue().stream()).count());
BootstrapIndex index = BootstrapIndex.getBootstrapIndex(metaClient);
assertFalse(index.useIndex());
@@ -292,8 +292,8 @@ public class TestBootstrap extends HoodieClientTestBase {
String updateSPath = tmpFolder.toAbsolutePath().toString() + "/data2";
generateNewDataSetAndReturnSchema(updateTimestamp, totalRecords, partitions, updateSPath);
JavaRDD<HoodieRecord> updateBatch =
generateInputBatch(jsc, BootstrapUtils.getAllLeafFoldersWithFiles(metaClient.getFs(), updateSPath,
(status) -> status.getName().endsWith("parquet")), schema);
generateInputBatch(jsc, BootstrapUtils.getAllLeafFoldersWithFiles(metaClient, metaClient.getFs(), updateSPath, jsc),
schema);
String newInstantTs = client.startCommit();
client.upsert(updateBatch, newInstantTs);
checkBootstrapResults(totalRecords, schema, newInstantTs, false, numInstantsAfterBootstrap + 1,
@@ -353,9 +353,8 @@ public class TestBootstrap extends HoodieClientTestBase {
bootstrapped.registerTempTable("bootstrapped");
original.registerTempTable("original");
if (checkNumRawFiles) {
List<HoodieFileStatus> files = BootstrapUtils.getAllLeafFoldersWithFiles(metaClient.getFs(), bootstrapBasePath,
(status) -> status.getName().endsWith(".parquet"))
.stream().flatMap(x -> x.getValue().stream()).collect(Collectors.toList());
List<HoodieFileStatus> files = BootstrapUtils.getAllLeafFoldersWithFiles(metaClient, metaClient.getFs(),
bootstrapBasePath, jsc).stream().flatMap(x -> x.getValue().stream()).collect(Collectors.toList());
assertEquals(files.size() * numVersions,
sqlContext.sql("select distinct _hoodie_file_name from bootstrapped").count());
}