1
0

[HUDI-2634] Improved the metadata table bootstrap for very large tables. (#3873)

* [HUDI-2634] Improved the metadata table bootstrap for very large tables.

Following improvements are implemented:
1. Memory overhead reduction:
  - Existing code caches FileStatus for each file in memory.
  - Created a new class DirectoryInfo which is used to cache a director's file list with parts of the FileStatus (only filename and file len). This reduces the memory requirements.

2. Improved parallelism:
  - Existing code collects all the listing to the Driver and then creates HoodieRecord on the Driver.
  - This takes a long time for large tables (11million HoodieRecords to be created)
  - Created a new function in SparkRDDWriteClient specifically for bootstrap commit. In it, the HoodieRecord creation is parallelized across executors so it completes fast.

3. Fixed setting to limit the number of parallel listings:
  - Existing code had a bug wherein 1500 executors were hardcoded to perform listing. This leads to exception due to limit in the spark's result memory.
  - Corrected the use of the config.

Result:
Dataset has 1299 partitions and 12Million files.
file listing time=1.5mins
HoodieRecord creation time=13seconds
deltacommit duration=2.6mins

Co-authored-by: Sivabalan Narayanan <n.siva.b@gmail.com>
This commit is contained in:
Prashant Wason
2021-11-10 19:37:48 -08:00
committed by GitHub
parent 90f9b4562a
commit 77b0440eb4
8 changed files with 188 additions and 76 deletions

View File

@@ -97,6 +97,13 @@ public abstract class HoodieData<T> implements Serializable {
*/
public abstract HoodieData<T> distinct();
/**
* Unions this {@link HoodieData} with other {@link HoodieData}.
* @param other {@link HoodieData} of interest.
* @return the union of two as as instance of {@link HoodieData}.
*/
public abstract HoodieData<T> union(HoodieData<T> other);
/**
* @return collected results in {@link List<T>}.
*/

View File

@@ -132,6 +132,14 @@ public class HoodieList<T> extends HoodieData<T> {
return HoodieList.of(new ArrayList<>(new HashSet<>(listData)));
}
@Override
public HoodieData<T> union(HoodieData<T> other) {
List<T> unionResult = new ArrayList<>();
unionResult.addAll(listData);
unionResult.addAll(other.collectAsList());
return HoodieList.of(unionResult);
}
@Override
public List<T> collectAsList() {
return listData;

View File

@@ -239,7 +239,7 @@ public class FSUtils {
/**
* Recursively processes all files in the base-path. If excludeMetaFolder is set, the meta-folder and all its subdirs
* are skipped
*
*
* @param fs File System
* @param basePathStr Base-Path
* @param consumer Callback for processing
@@ -431,17 +431,29 @@ public class FSUtils {
public static String makeLogFileName(String fileId, String logFileExtension, String baseCommitTime, int version,
String writeToken) {
String suffix =
(writeToken == null) ? String.format("%s_%s%s.%d", fileId, baseCommitTime, logFileExtension, version)
: String.format("%s_%s%s.%d_%s", fileId, baseCommitTime, logFileExtension, version, writeToken);
String suffix = (writeToken == null)
? String.format("%s_%s%s.%d", fileId, baseCommitTime, logFileExtension, version)
: String.format("%s_%s%s.%d_%s", fileId, baseCommitTime, logFileExtension, version, writeToken);
return LOG_FILE_PREFIX + suffix;
}
public static boolean isBaseFile(Path path) {
String extension = getFileExtension(path.getName());
return HoodieFileFormat.BASE_FILE_EXTENSIONS.contains(extension);
}
public static boolean isLogFile(Path logPath) {
Matcher matcher = LOG_FILE_PATTERN.matcher(logPath.getName());
return matcher.find() && logPath.getName().contains(".log");
}
/**
* Returns true if the given path is a Base file or a Log file.
*/
public static boolean isDataFile(Path path) {
return isBaseFile(path) || isLogFile(path);
}
/**
* Get the names of all the base and log files in the given partition path.
*/

View File

@@ -18,6 +18,11 @@
package org.apache.hudi.common.model;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.stream.Collectors;
/**
* Hoodie file format.
*/
@@ -27,6 +32,11 @@ public enum HoodieFileFormat {
HFILE(".hfile"),
ORC(".orc");
public static final Set<String> BASE_FILE_EXTENSIONS = Arrays.stream(HoodieFileFormat.values())
.map(HoodieFileFormat::getFileExtension)
.filter(x -> !x.equals(HoodieFileFormat.HOODIE_LOG.getFileExtension()))
.collect(Collectors.toCollection(HashSet::new));
private final String extension;
HoodieFileFormat(String extension) {