1
0

[HUDI-999] [RFC-12] Parallelize fetching of source data files/partitions (#1924)

This commit is contained in:
Udit Mehrotra
2020-08-06 23:44:57 -07:00
committed by GitHub
parent b51646dcc7
commit ab453f2623
4 changed files with 95 additions and 52 deletions

View File

@@ -33,7 +33,6 @@ import org.apache.hudi.common.bootstrap.index.BootstrapIndex;
import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.BootstrapFileMapping; import org.apache.hudi.common.model.BootstrapFileMapping;
import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.HoodieRecordPayload;
@@ -280,13 +279,8 @@ public class BootstrapCommitActionExecutor<T extends HoodieRecordPayload<T>>
* @throws IOException * @throws IOException
*/ */
private Map<BootstrapMode, List<Pair<String, List<HoodieFileStatus>>>> listAndProcessSourcePartitions() throws IOException { private Map<BootstrapMode, List<Pair<String, List<HoodieFileStatus>>>> listAndProcessSourcePartitions() throws IOException {
List<Pair<String, List<HoodieFileStatus>>> folders = List<Pair<String, List<HoodieFileStatus>>> folders = BootstrapUtils.getAllLeafFoldersWithFiles(
BootstrapUtils.getAllLeafFoldersWithFiles(bootstrapSourceFileSystem, table.getMetaClient(), bootstrapSourceFileSystem, config.getBootstrapSourceBasePath(), jsc);
config.getBootstrapSourceBasePath(), path -> {
// TODO: Needs to be abstracted out when supporting different formats
// TODO: Remove hoodieFilter
return path.getName().endsWith(HoodieFileFormat.PARQUET.getFileExtension());
});
LOG.info("Fetching Bootstrap Schema !!"); LOG.info("Fetching Bootstrap Schema !!");
BootstrapSchemaProvider sourceSchemaProvider = new BootstrapSchemaProvider(config); BootstrapSchemaProvider sourceSchemaProvider = new BootstrapSchemaProvider(config);

View File

@@ -18,14 +18,20 @@
package org.apache.hudi.table.action.bootstrap; package org.apache.hudi.table.action.bootstrap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hudi.avro.model.HoodieFileStatus; import org.apache.hudi.avro.model.HoodieFileStatus;
import org.apache.hudi.common.bootstrap.FileStatusUtils; import org.apache.hudi.common.bootstrap.FileStatusUtils;
import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.common.util.collection.Pair;
import org.apache.hadoop.fs.FileSystem; import org.apache.spark.api.java.JavaSparkContext;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
@@ -39,23 +45,62 @@ public class BootstrapUtils {
/** /**
* Returns leaf folders with files under a path. * Returns leaf folders with files under a path.
* @param metaClient Hoodie table metadata client
* @param fs File System * @param fs File System
* @param basePathStr Base Path to look for leaf folders * @param jsc Java spark context
* @param filePathFilter Filters to skip directories/paths
* @return list of partition paths with files under them. * @return list of partition paths with files under them.
* @throws IOException * @throws IOException
*/ */
public static List<Pair<String, List<HoodieFileStatus>>> getAllLeafFoldersWithFiles(FileSystem fs, String basePathStr, public static List<Pair<String, List<HoodieFileStatus>>> getAllLeafFoldersWithFiles(HoodieTableMetaClient metaClient,
PathFilter filePathFilter) throws IOException { FileSystem fs, String basePathStr, JavaSparkContext jsc) throws IOException {
final Path basePath = new Path(basePathStr); final Path basePath = new Path(basePathStr);
final String baseFileExtension = metaClient.getTableConfig().getBaseFileFormat().getFileExtension();
final Map<Integer, List<String>> levelToPartitions = new HashMap<>(); final Map<Integer, List<String>> levelToPartitions = new HashMap<>();
final Map<String, List<HoodieFileStatus>> partitionToFiles = new HashMap<>(); final Map<String, List<HoodieFileStatus>> partitionToFiles = new HashMap<>();
FSUtils.processFiles(fs, basePathStr, (status) -> { PathFilter filePathFilter = getFilePathFilter(baseFileExtension);
if (status.isFile() && filePathFilter.accept(status.getPath())) { PathFilter metaPathFilter = getExcludeMetaPathFilter();
String relativePath = FSUtils.getRelativePartitionPath(basePath, status.getPath().getParent());
FileStatus[] topLevelStatuses = fs.listStatus(basePath);
List<String> subDirectories = new ArrayList<>();
List<Pair<HoodieFileStatus, Pair<Integer, String>>> result = new ArrayList<>();
for (FileStatus topLevelStatus: topLevelStatuses) {
if (topLevelStatus.isFile() && filePathFilter.accept(topLevelStatus.getPath())) {
String relativePath = FSUtils.getRelativePartitionPath(basePath, topLevelStatus.getPath().getParent());
Integer level = (int) relativePath.chars().filter(ch -> ch == '/').count();
HoodieFileStatus hoodieFileStatus = FileStatusUtils.fromFileStatus(topLevelStatus);
result.add(Pair.of(hoodieFileStatus, Pair.of(level, relativePath)));
} else if (metaPathFilter.accept(topLevelStatus.getPath())) {
subDirectories.add(topLevelStatus.getPath().toString());
}
}
if (subDirectories.size() > 0) {
result.addAll(jsc.parallelize(subDirectories, subDirectories.size()).flatMap(directory -> {
PathFilter pathFilter = getFilePathFilter(baseFileExtension);
Path path = new Path(directory);
FileSystem fileSystem = path.getFileSystem(new Configuration());
RemoteIterator<LocatedFileStatus> itr = fileSystem.listFiles(path, true);
List<Pair<HoodieFileStatus, Pair<Integer, String>>> res = new ArrayList<>();
while (itr.hasNext()) {
FileStatus status = itr.next();
if (pathFilter.accept(status.getPath())) {
String relativePath = FSUtils.getRelativePartitionPath(new Path(basePathStr), status.getPath().getParent());
Integer level = (int) relativePath.chars().filter(ch -> ch == '/').count();
HoodieFileStatus hoodieFileStatus = FileStatusUtils.fromFileStatus(status);
res.add(Pair.of(hoodieFileStatus, Pair.of(level, relativePath)));
}
}
return res.iterator();
}).collect());
}
result.forEach(val -> {
String relativePath = val.getRight().getRight();
List<HoodieFileStatus> statusList = partitionToFiles.get(relativePath); List<HoodieFileStatus> statusList = partitionToFiles.get(relativePath);
if (null == statusList) { if (null == statusList) {
Integer level = (int) relativePath.chars().filter(ch -> ch == '/').count(); Integer level = val.getRight().getLeft();
List<String> dirs = levelToPartitions.get(level); List<String> dirs = levelToPartitions.get(level);
if (null == dirs) { if (null == dirs) {
dirs = new ArrayList<>(); dirs = new ArrayList<>();
@@ -65,13 +110,23 @@ public class BootstrapUtils {
statusList = new ArrayList<>(); statusList = new ArrayList<>();
partitionToFiles.put(relativePath, statusList); partitionToFiles.put(relativePath, statusList);
} }
statusList.add(FileStatusUtils.fromFileStatus(status)); statusList.add(val.getLeft());
} });
return true;
}, true);
OptionalInt maxLevelOpt = levelToPartitions.keySet().stream().mapToInt(x -> x).max(); OptionalInt maxLevelOpt = levelToPartitions.keySet().stream().mapToInt(x -> x).max();
int maxLevel = maxLevelOpt.orElse(-1); int maxLevel = maxLevelOpt.orElse(-1);
return maxLevel >= 0 ? levelToPartitions.get(maxLevel).stream() return maxLevel >= 0 ? levelToPartitions.get(maxLevel).stream()
.map(d -> Pair.of(d, partitionToFiles.get(d))).collect(Collectors.toList()) : new ArrayList<>(); .map(d -> Pair.of(d, partitionToFiles.get(d))).collect(Collectors.toList()) : new ArrayList<>();
} }
/**
 * Builds a filter that accepts only paths whose final name component ends with the given extension.
 *
 * @param baseFileExtension suffix a file name must end with to be accepted
 * @return filter accepting paths whose name ends with {@code baseFileExtension}
 */
private static PathFilter getFilePathFilter(String baseFileExtension) {
  return candidate -> candidate.getName().endsWith(baseFileExtension);
}
/**
 * Builds a filter that rejects the meta folder and anything located underneath it.
 *
 * <p>A path is rejected only when one of its components is exactly
 * {@link HoodieTableMetaClient#METAFOLDER_NAME}. Matching whole components, rather than searching the
 * full path string for a substring, keeps unrelated directories whose names merely contain the meta
 * folder name (as a prefix, suffix or infix) from being skipped during listing.
 *
 * @return filter accepting every path that is neither the meta folder nor nested inside one
 */
private static PathFilter getExcludeMetaPathFilter() {
  // Avoid listing and including any folders under the metafolder
  return (path) -> {
    // Walk from the path up to the root; getParent() yields null once the root is passed.
    for (Path current = path; current != null; current = current.getParent()) {
      if (HoodieTableMetaClient.METAFOLDER_NAME.equals(current.getName())) {
        return false;
      }
    }
    return true;
  };
}
} }

View File

@@ -63,20 +63,15 @@ public class TestBootstrapUtils extends HoodieClientTestBase {
} }
}); });
List<Pair<String, List<HoodieFileStatus>>> collected = List<Pair<String, List<HoodieFileStatus>>> collected = BootstrapUtils.getAllLeafFoldersWithFiles(metaClient,
BootstrapUtils.getAllLeafFoldersWithFiles(metaClient.getFs(), basePath, (status) -> { metaClient.getFs(), basePath, jsc);
return true;
});
assertEquals(3, collected.size()); assertEquals(3, collected.size());
collected.stream().forEach(k -> { collected.stream().forEach(k -> {
assertEquals(2, k.getRight().size()); assertEquals(2, k.getRight().size());
}); });
// Simulate reading from un-partitioned dataset // Simulate reading from un-partitioned dataset
collected = collected = BootstrapUtils.getAllLeafFoldersWithFiles(metaClient, metaClient.getFs(), basePath + "/" + folders.get(0), jsc);
BootstrapUtils.getAllLeafFoldersWithFiles(metaClient.getFs(), basePath + "/" + folders.get(0), (status) -> {
return true;
});
assertEquals(1, collected.size()); assertEquals(1, collected.size());
collected.stream().forEach(k -> { collected.stream().forEach(k -> {
assertEquals(2, k.getRight().size()); assertEquals(2, k.getRight().size());

View File

@@ -171,8 +171,8 @@ public class TestBootstrap extends HoodieClientTestBase {
} else { } else {
df.write().format("parquet").mode(SaveMode.Overwrite).save(srcPath); df.write().format("parquet").mode(SaveMode.Overwrite).save(srcPath);
} }
String filePath = FileStatusUtils.toPath(BootstrapUtils.getAllLeafFoldersWithFiles(metaClient.getFs(), srcPath, String filePath = FileStatusUtils.toPath(BootstrapUtils.getAllLeafFoldersWithFiles(metaClient, metaClient.getFs(),
(status) -> status.getName().endsWith(".parquet")).stream().findAny().map(p -> p.getValue().stream().findAny()) srcPath, jsc).stream().findAny().map(p -> p.getValue().stream().findAny())
.orElse(null).get().getPath()).toString(); .orElse(null).get().getPath()).toString();
ParquetFileReader reader = ParquetFileReader.open(metaClient.getHadoopConf(), new Path(filePath)); ParquetFileReader reader = ParquetFileReader.open(metaClient.getHadoopConf(), new Path(filePath));
MessageType schema = reader.getFooter().getFileMetaData().getSchema(); MessageType schema = reader.getFooter().getFileMetaData().getSchema();
@@ -266,8 +266,8 @@ public class TestBootstrap extends HoodieClientTestBase {
client.rollBackInflightBootstrap(); client.rollBackInflightBootstrap();
metaClient.reloadActiveTimeline(); metaClient.reloadActiveTimeline();
assertEquals(0, metaClient.getCommitsTimeline().countInstants()); assertEquals(0, metaClient.getCommitsTimeline().countInstants());
assertEquals(0L, BootstrapUtils.getAllLeafFoldersWithFiles(metaClient.getFs(), basePath, assertEquals(0L, BootstrapUtils.getAllLeafFoldersWithFiles(metaClient, metaClient.getFs(), basePath, jsc)
(status) -> status.getName().endsWith(".parquet")).stream().flatMap(f -> f.getValue().stream()).count()); .stream().flatMap(f -> f.getValue().stream()).count());
BootstrapIndex index = BootstrapIndex.getBootstrapIndex(metaClient); BootstrapIndex index = BootstrapIndex.getBootstrapIndex(metaClient);
assertFalse(index.useIndex()); assertFalse(index.useIndex());
@@ -292,8 +292,8 @@ public class TestBootstrap extends HoodieClientTestBase {
String updateSPath = tmpFolder.toAbsolutePath().toString() + "/data2"; String updateSPath = tmpFolder.toAbsolutePath().toString() + "/data2";
generateNewDataSetAndReturnSchema(updateTimestamp, totalRecords, partitions, updateSPath); generateNewDataSetAndReturnSchema(updateTimestamp, totalRecords, partitions, updateSPath);
JavaRDD<HoodieRecord> updateBatch = JavaRDD<HoodieRecord> updateBatch =
generateInputBatch(jsc, BootstrapUtils.getAllLeafFoldersWithFiles(metaClient.getFs(), updateSPath, generateInputBatch(jsc, BootstrapUtils.getAllLeafFoldersWithFiles(metaClient, metaClient.getFs(), updateSPath, jsc),
(status) -> status.getName().endsWith("parquet")), schema); schema);
String newInstantTs = client.startCommit(); String newInstantTs = client.startCommit();
client.upsert(updateBatch, newInstantTs); client.upsert(updateBatch, newInstantTs);
checkBootstrapResults(totalRecords, schema, newInstantTs, false, numInstantsAfterBootstrap + 1, checkBootstrapResults(totalRecords, schema, newInstantTs, false, numInstantsAfterBootstrap + 1,
@@ -353,9 +353,8 @@ public class TestBootstrap extends HoodieClientTestBase {
bootstrapped.registerTempTable("bootstrapped"); bootstrapped.registerTempTable("bootstrapped");
original.registerTempTable("original"); original.registerTempTable("original");
if (checkNumRawFiles) { if (checkNumRawFiles) {
List<HoodieFileStatus> files = BootstrapUtils.getAllLeafFoldersWithFiles(metaClient.getFs(), bootstrapBasePath, List<HoodieFileStatus> files = BootstrapUtils.getAllLeafFoldersWithFiles(metaClient, metaClient.getFs(),
(status) -> status.getName().endsWith(".parquet")) bootstrapBasePath, jsc).stream().flatMap(x -> x.getValue().stream()).collect(Collectors.toList());
.stream().flatMap(x -> x.getValue().stream()).collect(Collectors.toList());
assertEquals(files.size() * numVersions, assertEquals(files.size() * numVersions,
sqlContext.sql("select distinct _hoodie_file_name from bootstrapped").count()); sqlContext.sql("select distinct _hoodie_file_name from bootstrapped").count());
} }