[HUDI-999] [RFC-12] Parallelize fetching of source data files/partitions (#1924)
This commit is contained in:
@@ -171,9 +171,9 @@ public class TestBootstrap extends HoodieClientTestBase {
|
||||
} else {
|
||||
df.write().format("parquet").mode(SaveMode.Overwrite).save(srcPath);
|
||||
}
|
||||
String filePath = FileStatusUtils.toPath(BootstrapUtils.getAllLeafFoldersWithFiles(metaClient.getFs(), srcPath,
|
||||
(status) -> status.getName().endsWith(".parquet")).stream().findAny().map(p -> p.getValue().stream().findAny())
|
||||
.orElse(null).get().getPath()).toString();
|
||||
String filePath = FileStatusUtils.toPath(BootstrapUtils.getAllLeafFoldersWithFiles(metaClient, metaClient.getFs(),
|
||||
srcPath, jsc).stream().findAny().map(p -> p.getValue().stream().findAny())
|
||||
.orElse(null).get().getPath()).toString();
|
||||
ParquetFileReader reader = ParquetFileReader.open(metaClient.getHadoopConf(), new Path(filePath));
|
||||
MessageType schema = reader.getFooter().getFileMetaData().getSchema();
|
||||
return new AvroSchemaConverter().convert(schema);
|
||||
@@ -266,8 +266,8 @@ public class TestBootstrap extends HoodieClientTestBase {
|
||||
client.rollBackInflightBootstrap();
|
||||
metaClient.reloadActiveTimeline();
|
||||
assertEquals(0, metaClient.getCommitsTimeline().countInstants());
|
||||
assertEquals(0L, BootstrapUtils.getAllLeafFoldersWithFiles(metaClient.getFs(), basePath,
|
||||
(status) -> status.getName().endsWith(".parquet")).stream().flatMap(f -> f.getValue().stream()).count());
|
||||
assertEquals(0L, BootstrapUtils.getAllLeafFoldersWithFiles(metaClient, metaClient.getFs(), basePath, jsc)
|
||||
.stream().flatMap(f -> f.getValue().stream()).count());
|
||||
|
||||
BootstrapIndex index = BootstrapIndex.getBootstrapIndex(metaClient);
|
||||
assertFalse(index.useIndex());
|
||||
@@ -292,8 +292,8 @@ public class TestBootstrap extends HoodieClientTestBase {
|
||||
String updateSPath = tmpFolder.toAbsolutePath().toString() + "/data2";
|
||||
generateNewDataSetAndReturnSchema(updateTimestamp, totalRecords, partitions, updateSPath);
|
||||
JavaRDD<HoodieRecord> updateBatch =
|
||||
generateInputBatch(jsc, BootstrapUtils.getAllLeafFoldersWithFiles(metaClient.getFs(), updateSPath,
|
||||
(status) -> status.getName().endsWith("parquet")), schema);
|
||||
generateInputBatch(jsc, BootstrapUtils.getAllLeafFoldersWithFiles(metaClient, metaClient.getFs(), updateSPath, jsc),
|
||||
schema);
|
||||
String newInstantTs = client.startCommit();
|
||||
client.upsert(updateBatch, newInstantTs);
|
||||
checkBootstrapResults(totalRecords, schema, newInstantTs, false, numInstantsAfterBootstrap + 1,
|
||||
@@ -353,9 +353,8 @@ public class TestBootstrap extends HoodieClientTestBase {
|
||||
bootstrapped.registerTempTable("bootstrapped");
|
||||
original.registerTempTable("original");
|
||||
if (checkNumRawFiles) {
|
||||
List<HoodieFileStatus> files = BootstrapUtils.getAllLeafFoldersWithFiles(metaClient.getFs(), bootstrapBasePath,
|
||||
(status) -> status.getName().endsWith(".parquet"))
|
||||
.stream().flatMap(x -> x.getValue().stream()).collect(Collectors.toList());
|
||||
List<HoodieFileStatus> files = BootstrapUtils.getAllLeafFoldersWithFiles(metaClient, metaClient.getFs(),
|
||||
bootstrapBasePath, jsc).stream().flatMap(x -> x.getValue().stream()).collect(Collectors.toList());
|
||||
assertEquals(files.size() * numVersions,
|
||||
sqlContext.sql("select distinct _hoodie_file_name from bootstrapped").count());
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user