1
0

[HUDI-1479] Use HoodieEngineContext to parallelize fetching of partition paths (#2417)

* [HUDI-1479] Use HoodieEngineContext to parallelize fetching of partition paths

* Adding testClass for FileSystemBackedTableMetadata

Co-authored-by: Nishith Agarwal <nagarwal@uber.com>
This commit is contained in:
Udit Mehrotra
2021-01-10 21:19:52 -08:00
committed by GitHub
parent 23e93d05c0
commit 7ce3ac778e
38 changed files with 509 additions and 100 deletions

View File

@@ -39,6 +39,7 @@ public final class HoodieMetadataConfig extends DefaultHoodieConfig {
// Validate contents of Metadata Table on each access against the actual filesystem
public static final String METADATA_VALIDATE_PROP = METADATA_PREFIX + ".validate";
public static final boolean DEFAULT_METADATA_VALIDATE = false;
public static final boolean DEFAULT_METADATA_ENABLE_FOR_READERS = false;
// Parallelism for inserts
public static final String METADATA_INSERT_PARALLELISM_PROP = METADATA_PREFIX + ".insert.parallelism";
@@ -62,10 +63,6 @@ public final class HoodieMetadataConfig extends DefaultHoodieConfig {
public static final String CLEANER_COMMITS_RETAINED_PROP = METADATA_PREFIX + ".cleaner.commits.retained";
public static final int DEFAULT_CLEANER_COMMITS_RETAINED = 3;
// We can set the default to true for readers, as it will internally default to listing from filesystem if metadata
// table is not found
public static final boolean DEFAULT_METADATA_ENABLE_FOR_READERS = true;
private HoodieMetadataConfig(Properties props) {
super(props);
}

View File

@@ -0,0 +1,90 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.common.engine;
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.common.function.SerializableConsumer;
import org.apache.hudi.common.function.SerializableFunction;
import org.apache.hudi.common.function.SerializablePairFunction;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import static java.util.stream.Collectors.toList;
import static org.apache.hudi.common.function.FunctionWrapper.throwingFlatMapWrapper;
import static org.apache.hudi.common.function.FunctionWrapper.throwingForeachWrapper;
import static org.apache.hudi.common.function.FunctionWrapper.throwingMapToPairWrapper;
import static org.apache.hudi.common.function.FunctionWrapper.throwingMapWrapper;
/**
 * Engine context for purely local (single-JVM) execution.
 *
 * <p>All operations are backed by the Java Stream API rather than a distributed engine, so the
 * {@code parallelism} hints passed by callers are ignored — the common fork-join pool decides how
 * parallel streams are split. Intended for query-engine integrations that need a
 * {@code HoodieEngineContext} outside of Spark/Flink.
 */
public final class HoodieLocalEngineContext extends HoodieEngineContext {

  /**
   * Creates a context backed by a {@link LocalTaskContextSupplier}, whose partition, stage and
   * attempt ids are all fixed at zero.
   *
   * @param conf hadoop configuration used by operations executed through this context
   */
  public HoodieLocalEngineContext(Configuration conf) {
    this(conf, new LocalTaskContextSupplier());
  }

  /**
   * Creates a context with an explicit task-context supplier.
   *
   * @param conf                hadoop configuration used by operations executed through this context
   * @param taskContextSupplier supplier of per-task ids and properties
   */
  public HoodieLocalEngineContext(Configuration conf, TaskContextSupplier taskContextSupplier) {
    super(new SerializableConfiguration(conf), taskContextSupplier);
  }

  @Override
  public <I, O> List<O> map(List<I> data, SerializableFunction<I, O> func, int parallelism) {
    // Parallelism hint ignored; a parallel stream over the list does the fan-out.
    Stream<O> mapped = data.parallelStream().map(throwingMapWrapper(func));
    return mapped.collect(Collectors.toList());
  }

  @Override
  public <I, O> List<O> flatMap(List<I> data, SerializableFunction<I, Stream<O>> func, int parallelism) {
    Stream<O> flattened = data.parallelStream().flatMap(throwingFlatMapWrapper(func));
    return flattened.collect(Collectors.toList());
  }

  @Override
  public <I> void foreach(List<I> data, SerializableConsumer<I> consumer, int parallelism) {
    // Deliberately sequential: side-effecting consumers are applied in list order.
    data.forEach(throwingForeachWrapper(consumer));
  }

  @Override
  public <I, K, V> Map<K, V> mapToPair(List<I> data, SerializablePairFunction<I, K, V> func, Integer parallelism) {
    // On duplicate keys the later pair wins, mirroring last-write-wins semantics.
    return data.stream()
        .map(throwingMapToPairWrapper(func))
        .collect(Collectors.toMap(pair -> pair.getLeft(), pair -> pair.getRight(), (first, second) -> second));
  }

  @Override
  public void setProperty(EngineProperty key, String value) {
    // No engine properties in local mode; intentionally a no-op.
  }

  @Override
  public Option<String> getProperty(EngineProperty key) {
    // No engine properties in local mode.
    return Option.empty();
  }

  @Override
  public void setJobStatus(String activeModule, String activityDescription) {
    // No job UI to report status to in local mode; intentionally a no-op.
  }
}

View File

@@ -0,0 +1,45 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.common.engine;
import org.apache.hudi.common.util.Option;
import java.util.function.Supplier;
/**
 * Task context for local (single-JVM) execution.
 *
 * <p>There is only ever one local "task", so the partition id, stage id and attempt id are all
 * fixed at zero, and no engine properties are exposed.
 */
public final class LocalTaskContextSupplier extends TaskContextSupplier {

  // Constant ids for the single local task.
  private static final Integer LOCAL_PARTITION_ID = 0;
  private static final Integer LOCAL_STAGE_ID = 0;
  private static final Long LOCAL_ATTEMPT_ID = 0L;

  @Override
  public Supplier<Integer> getPartitionIdSupplier() {
    return () -> LOCAL_PARTITION_ID;
  }

  @Override
  public Supplier<Integer> getStageIdSupplier() {
    return () -> LOCAL_STAGE_ID;
  }

  @Override
  public Supplier<Long> getAttemptIdSupplier() {
    return () -> LOCAL_ATTEMPT_ID;
  }

  @Override
  public Option<String> getProperty(EngineProperty prop) {
    // No engine properties in local mode.
    return Option.empty();
  }
}

View File

@@ -19,6 +19,7 @@
package org.apache.hudi.common.fs;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.model.HoodiePartitionMetadata;
@@ -252,13 +253,14 @@ public class FSUtils {
}
}
public static List<String> getAllPartitionPaths(FileSystem fs, String basePathStr, boolean useFileListingFromMetadata, boolean verifyListings,
public static List<String> getAllPartitionPaths(HoodieEngineContext engineContext, FileSystem fs, String basePathStr,
boolean useFileListingFromMetadata, boolean verifyListings,
boolean assumeDatePartitioning) throws IOException {
if (assumeDatePartitioning) {
return getAllPartitionFoldersThreeLevelsDown(fs, basePathStr);
} else {
HoodieTableMetadata tableMetadata = HoodieTableMetadata.create(fs.getConf(), basePathStr, "/tmp/", useFileListingFromMetadata,
verifyListings, false, false);
HoodieTableMetadata tableMetadata = HoodieTableMetadata.create(engineContext, basePathStr, "/tmp/",
useFileListingFromMetadata, verifyListings, false, false);
return tableMetadata.getAllPartitionPaths();
}
}

View File

@@ -19,6 +19,7 @@
package org.apache.hudi.common.table.view;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.util.Functions.Function2;
@@ -159,12 +160,11 @@ public class FileSystemViewManager {
return new HoodieTableFileSystemView(metaClient, timeline, viewConf.isIncrementalTimelineSyncEnabled());
}
public static HoodieTableFileSystemView createInMemoryFileSystemView(HoodieTableMetaClient metaClient,
boolean useFileListingFromMetadata,
boolean verifyListings) {
public static HoodieTableFileSystemView createInMemoryFileSystemView(HoodieEngineContext engineContext,
HoodieTableMetaClient metaClient, boolean useFileListingFromMetadata, boolean verifyListings) {
LOG.info("Creating InMemory based view for basePath " + metaClient.getBasePath());
if (useFileListingFromMetadata) {
return new HoodieMetadataFileSystemView(metaClient,
return new HoodieMetadataFileSystemView(engineContext, metaClient,
metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(),
true,
verifyListings);

View File

@@ -23,6 +23,7 @@ import org.apache.avro.Schema;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.avro.model.HoodieMetadataRecord;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.metrics.Registry;
import org.apache.hudi.common.model.HoodiePartitionMetadata;
@@ -33,7 +34,6 @@ import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.util.HoodieTimer;
import org.apache.hudi.common.util.Option;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
@@ -54,6 +54,7 @@ public abstract class BaseTableMetadata implements HoodieTableMetadata {
static final long MAX_MEMORY_SIZE_IN_BYTES = 1024 * 1024 * 1024;
static final int BUFFER_SIZE = 10 * 1024 * 1024;
protected final transient HoodieEngineContext engineContext;
protected final SerializableConfiguration hadoopConf;
protected final String datasetBasePath;
protected boolean enabled;
@@ -66,10 +67,11 @@ public abstract class BaseTableMetadata implements HoodieTableMetadata {
protected final String spillableMapDirectory;
private transient HoodieMetadataMergedInstantRecordScanner timelineRecordScanner;
protected BaseTableMetadata(Configuration hadoopConf, String datasetBasePath, String spillableMapDirectory,
protected BaseTableMetadata(HoodieEngineContext engineContext, String datasetBasePath, String spillableMapDirectory,
boolean enabled, boolean validateLookups, boolean enableMetrics,
boolean assumeDatePartitioning) {
this.hadoopConf = new SerializableConfiguration(hadoopConf);
this.engineContext = engineContext;
this.hadoopConf = new SerializableConfiguration(engineContext.getHadoopConf());
this.datasetBasePath = datasetBasePath;
this.spillableMapDirectory = spillableMapDirectory;
@@ -102,7 +104,8 @@ public abstract class BaseTableMetadata implements HoodieTableMetadata {
LOG.error("Failed to retrieve list of partition from metadata", e);
}
}
return new FileSystemBackedTableMetadata(hadoopConf, datasetBasePath, assumeDatePartitioning).getAllPartitionPaths();
return new FileSystemBackedTableMetadata(engineContext, hadoopConf, datasetBasePath,
assumeDatePartitioning).getAllPartitionPaths();
}
/**
@@ -155,7 +158,8 @@ public abstract class BaseTableMetadata implements HoodieTableMetadata {
if (validateLookups) {
// Validate the Metadata Table data by listing the partitions from the file system
timer.startTimer();
FileSystemBackedTableMetadata fileSystemBackedTableMetadata = new FileSystemBackedTableMetadata(hadoopConf, datasetBasePath, assumeDatePartitioning);
FileSystemBackedTableMetadata fileSystemBackedTableMetadata = new FileSystemBackedTableMetadata(engineContext,
hadoopConf, datasetBasePath, assumeDatePartitioning);
List<String> actualPartitions = fileSystemBackedTableMetadata.getAllPartitionPaths();
metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.VALIDATE_PARTITIONS_STR, timer.endTimer()));
@@ -290,4 +294,8 @@ public abstract class BaseTableMetadata implements HoodieTableMetadata {
protected void closeReaders() {
timelineRecordScanner = null;
}
protected HoodieEngineContext getEngineContext() {
return engineContext;
}
}

View File

@@ -19,23 +19,36 @@
package org.apache.hudi.metadata;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodiePartitionMetadata;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.stream.Collectors;
public class FileSystemBackedTableMetadata implements HoodieTableMetadata {
private static final int DEFAULT_LISTING_PARALLELISM = 1500;
private final transient HoodieEngineContext engineContext;
private final SerializableConfiguration hadoopConf;
private final String datasetBasePath;
private final boolean assumeDatePartitioning;
public FileSystemBackedTableMetadata(SerializableConfiguration conf, String datasetBasePath, boolean assumeDatePartitioning) {
public FileSystemBackedTableMetadata(HoodieEngineContext engineContext, SerializableConfiguration conf, String datasetBasePath,
boolean assumeDatePartitioning) {
this.engineContext = engineContext;
this.hadoopConf = conf;
this.datasetBasePath = datasetBasePath;
this.assumeDatePartitioning = assumeDatePartitioning;
@@ -49,12 +62,47 @@ public class FileSystemBackedTableMetadata implements HoodieTableMetadata {
@Override
public List<String> getAllPartitionPaths() throws IOException {
FileSystem fs = new Path(datasetBasePath).getFileSystem(hadoopConf.get());
if (assumeDatePartitioning) {
FileSystem fs = new Path(datasetBasePath).getFileSystem(hadoopConf.get());
return FSUtils.getAllPartitionFoldersThreeLevelsDown(fs, datasetBasePath);
} else {
return FSUtils.getAllFoldersWithPartitionMetaFile(fs, datasetBasePath);
}
List<Path> pathsToList = new LinkedList<>();
pathsToList.add(new Path(datasetBasePath));
List<String> partitionPaths = new ArrayList<>();
while (!pathsToList.isEmpty()) {
// TODO: Get the parallelism from HoodieWriteConfig
int listingParallelism = Math.min(DEFAULT_LISTING_PARALLELISM, pathsToList.size());
// List all directories in parallel
List<Pair<Path, FileStatus[]>> dirToFileListing = engineContext.map(pathsToList, path -> {
FileSystem fileSystem = path.getFileSystem(hadoopConf.get());
return Pair.of(path, fileSystem.listStatus(path));
}, listingParallelism);
pathsToList.clear();
// If the listing reveals a directory, add it to queue. If the listing reveals a hoodie partition, add it to
// the results.
dirToFileListing.forEach(p -> {
Option<FileStatus> partitionMetaFile = Option.fromJavaOptional(Arrays.stream(p.getRight()).parallel()
.filter(fs -> fs.getPath().getName().equals(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE))
.findFirst());
if (partitionMetaFile.isPresent()) {
// Is a partition.
String partitionName = FSUtils.getRelativePartitionPath(new Path(datasetBasePath), p.getLeft());
partitionPaths.add(partitionName);
} else {
// Add sub-dirs to the queue
pathsToList.addAll(Arrays.stream(p.getRight())
.filter(fs -> fs.isDirectory() && !fs.getPath().getName().equals(HoodieTableMetaClient.METAFOLDER_NAME))
.map(fs -> fs.getPath())
.collect(Collectors.toList()));
}
});
}
return partitionPaths;
}
@Override
@@ -64,6 +112,6 @@ public class FileSystemBackedTableMetadata implements HoodieTableMetadata {
@Override
public boolean isInSync() {
throw new UnsupportedOperationException();
return true;
}
}

View File

@@ -21,6 +21,8 @@ package org.apache.hudi.metadata;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.avro.model.HoodieMetadataRecord;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.engine.HoodieLocalEngineContext;
import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.model.HoodieLogFile;
@@ -70,15 +72,15 @@ public class HoodieBackedTableMetadata extends BaseTableMetadata {
private transient HoodieFileReader<GenericRecord> baseFileReader;
private transient HoodieMetadataMergedLogRecordScanner logRecordScanner;
public HoodieBackedTableMetadata(Configuration conf, String datasetBasePath, String spillableMapDirectory,
boolean enabled, boolean validateLookups, boolean assumeDatePartitioning) {
this(conf, datasetBasePath, spillableMapDirectory, enabled, validateLookups, false, assumeDatePartitioning);
public HoodieBackedTableMetadata(Configuration conf, String datasetBasePath, String spillableMapDirectory, boolean enabled,
boolean validateLookups, boolean assumeDatePartitioning) {
this(new HoodieLocalEngineContext(conf), datasetBasePath, spillableMapDirectory, enabled, validateLookups,
false, assumeDatePartitioning);
}
public HoodieBackedTableMetadata(Configuration conf, String datasetBasePath, String spillableMapDirectory,
boolean enabled, boolean validateLookups, boolean enableMetrics,
boolean assumeDatePartitioning) {
super(conf, datasetBasePath, spillableMapDirectory, enabled, validateLookups, enableMetrics, assumeDatePartitioning);
public HoodieBackedTableMetadata(HoodieEngineContext engineContext, String datasetBasePath, String spillableMapDirectory,
boolean enabled, boolean validateLookups, boolean enableMetrics, boolean assumeDatePartitioning) {
super(engineContext, datasetBasePath, spillableMapDirectory, enabled, validateLookups, enableMetrics, assumeDatePartitioning);
this.metadataBasePath = HoodieTableMetadata.getMetadataTableBasePath(datasetBasePath);
if (enabled) {
try {

View File

@@ -22,6 +22,8 @@ import java.io.IOException;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.table.view.FileSystemViewStorageConfig;
@@ -40,12 +42,13 @@ public class HoodieMetadataFileSystemView extends HoodieTableFileSystemView {
this.tableMetadata = tableMetadata;
}
public HoodieMetadataFileSystemView(HoodieTableMetaClient metaClient,
public HoodieMetadataFileSystemView(HoodieEngineContext engineContext,
HoodieTableMetaClient metaClient,
HoodieTimeline visibleActiveTimeline,
boolean useFileListingFromMetadata,
boolean verifyListings) {
super(metaClient, visibleActiveTimeline);
this.tableMetadata = HoodieTableMetadata.create(metaClient.getHadoopConf(), metaClient.getBasePath(),
this.tableMetadata = HoodieTableMetadata.create(engineContext, metaClient.getBasePath(),
FileSystemViewStorageConfig.DEFAULT_VIEW_SPILLABLE_DIR, useFileListingFromMetadata, verifyListings,
false, false);
}

View File

@@ -18,10 +18,11 @@
package org.apache.hudi.metadata;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.util.Option;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
@@ -68,10 +69,16 @@ public interface HoodieTableMetadata extends Serializable {
return basePath.endsWith(METADATA_TABLE_REL_PATH);
}
static HoodieTableMetadata create(Configuration conf, String datasetBasePath, String spillableMapPath, boolean useFileListingFromMetadata,
boolean verifyListings, boolean enableMetrics, boolean shouldAssumeDatePartitioning) {
return new HoodieBackedTableMetadata(conf, datasetBasePath, spillableMapPath, useFileListingFromMetadata, verifyListings,
enableMetrics, shouldAssumeDatePartitioning);
static HoodieTableMetadata create(HoodieEngineContext engineContext, String datasetBasePath,
String spillableMapPath, boolean useFileListingFromMetadata, boolean verifyListings,
boolean enableMetrics, boolean shouldAssumeDatePartitioning) {
if (useFileListingFromMetadata) {
return new HoodieBackedTableMetadata(engineContext, datasetBasePath, spillableMapPath, useFileListingFromMetadata,
verifyListings, enableMetrics, shouldAssumeDatePartitioning);
} else {
return new FileSystemBackedTableMetadata(engineContext, new SerializableConfiguration(engineContext.getHadoopConf()),
datasetBasePath, shouldAssumeDatePartitioning);
}
}
/**