1
0

[HUDI-1450] Use metadata table for listing in HoodieROTablePathFilter (apache#2326)

[HUDI-1394] [RFC-15] Use metadata table (if present) to get all partition paths (apache#2351)
This commit is contained in:
Udit Mehrotra
2020-12-31 01:20:02 -08:00
committed by vinoth chandar
parent 298808baaf
commit 4e64226844
38 changed files with 308 additions and 102 deletions

View File

@@ -0,0 +1,151 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.common.config;
import javax.annotation.concurrent.Immutable;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.Properties;
/**
 * Configurations used by the Hudi Metadata Table.
 *
 * <p>All properties are prefixed with {@code hoodie.metadata.} and control whether the internal
 * metadata table is enabled, whether its listings are validated against the file system, and how
 * it is compacted, cleaned and archived.
 */
@Immutable
public final class HoodieMetadataConfig extends DefaultHoodieConfig {

  public static final String METADATA_PREFIX = "hoodie.metadata";

  // Enable the internal Metadata Table which saves file listings
  public static final String METADATA_ENABLE_PROP = METADATA_PREFIX + ".enable";
  public static final boolean DEFAULT_METADATA_ENABLE = false;

  // Validate contents of Metadata Table on each access against the actual filesystem
  public static final String METADATA_VALIDATE_PROP = METADATA_PREFIX + ".validate";
  public static final boolean DEFAULT_METADATA_VALIDATE = false;

  // Parallelism for inserts
  public static final String METADATA_INSERT_PARALLELISM_PROP = METADATA_PREFIX + ".insert.parallelism";
  public static final int DEFAULT_METADATA_INSERT_PARALLELISM = 1;

  // Async clean
  public static final String METADATA_ASYNC_CLEAN_PROP = METADATA_PREFIX + ".clean.async";
  public static final boolean DEFAULT_METADATA_ASYNC_CLEAN = false;

  // Maximum delta commits before compaction occurs
  public static final String METADATA_COMPACT_NUM_DELTA_COMMITS_PROP = METADATA_PREFIX + ".compact.max.delta.commits";
  public static final int DEFAULT_METADATA_COMPACT_NUM_DELTA_COMMITS = 24;

  // Archival settings
  public static final String MIN_COMMITS_TO_KEEP_PROP = METADATA_PREFIX + ".keep.min.commits";
  public static final int DEFAULT_MIN_COMMITS_TO_KEEP = 20;
  public static final String MAX_COMMITS_TO_KEEP_PROP = METADATA_PREFIX + ".keep.max.commits";
  public static final int DEFAULT_MAX_COMMITS_TO_KEEP = 30;

  // Cleaner commits retained
  public static final String CLEANER_COMMITS_RETAINED_PROP = METADATA_PREFIX + ".cleaner.commits.retained";
  public static final int DEFAULT_CLEANER_COMMITS_RETAINED = 3;

  // We can set the default to true for readers, as it will internally default to listing from filesystem if metadata
  // table is not found
  public static final boolean DEFAULT_METADATA_ENABLE_FOR_READERS = true;

  private HoodieMetadataConfig(Properties props) {
    super(props);
  }

  /**
   * @return a new {@link Builder} for assembling a {@link HoodieMetadataConfig}.
   */
  public static HoodieMetadataConfig.Builder newBuilder() {
    return new Builder();
  }

  /**
   * Fluent builder for {@link HoodieMetadataConfig}.
   */
  public static class Builder {

    private final Properties props = new Properties();

    /**
     * Load properties from the given file.
     *
     * @param propertiesFile properties file on the local file system
     * @return this builder
     * @throws IOException if the file cannot be read
     */
    public Builder fromFile(File propertiesFile) throws IOException {
      // Read with an explicit UTF-8 charset; FileReader would silently use the
      // platform-default charset, which varies across environments.
      try (Reader reader = new InputStreamReader(new FileInputStream(propertiesFile), StandardCharsets.UTF_8)) {
        this.props.load(reader);
        return this;
      }
    }

    /** Copy all entries from the given properties into this builder. */
    public Builder fromProperties(Properties props) {
      this.props.putAll(props);
      return this;
    }

    /** Enable or disable the internal metadata table. */
    public Builder enable(boolean enable) {
      props.setProperty(METADATA_ENABLE_PROP, String.valueOf(enable));
      return this;
    }

    /** Enable or disable validation of metadata table contents against the file system. */
    public Builder validate(boolean validate) {
      props.setProperty(METADATA_VALIDATE_PROP, String.valueOf(validate));
      return this;
    }

    /** Set the parallelism used for inserts into the metadata table. */
    public Builder withInsertParallelism(int parallelism) {
      props.setProperty(METADATA_INSERT_PARALLELISM_PROP, String.valueOf(parallelism));
      return this;
    }

    /** Enable or disable asynchronous cleaning for the metadata table. */
    public Builder withAsyncClean(boolean asyncClean) {
      props.setProperty(METADATA_ASYNC_CLEAN_PROP, String.valueOf(asyncClean));
      return this;
    }

    /** Set the maximum number of delta commits to accumulate before compaction is triggered. */
    public Builder withMaxNumDeltaCommitsBeforeCompaction(int maxNumDeltaCommitsBeforeCompaction) {
      props.setProperty(METADATA_COMPACT_NUM_DELTA_COMMITS_PROP, String.valueOf(maxNumDeltaCommitsBeforeCompaction));
      return this;
    }

    /**
     * Set the archival window for the metadata table's timeline.
     *
     * @param minToKeep minimum number of commits to keep
     * @param maxToKeep maximum number of commits to keep
     */
    public Builder archiveCommitsWith(int minToKeep, int maxToKeep) {
      props.setProperty(MIN_COMMITS_TO_KEEP_PROP, String.valueOf(minToKeep));
      props.setProperty(MAX_COMMITS_TO_KEEP_PROP, String.valueOf(maxToKeep));
      return this;
    }

    /** Set the number of commits the cleaner retains on the metadata table. */
    public Builder retainCommits(int commitsRetained) {
      props.setProperty(CLEANER_COMMITS_RETAINED_PROP, String.valueOf(commitsRetained));
      return this;
    }

    /**
     * Build the immutable config, filling in defaults for any property not explicitly set.
     */
    public HoodieMetadataConfig build() {
      // Apply defaults BEFORE constructing the config, so the result does not depend on
      // whether the superclass copies or shares the Properties reference.
      setDefaultOnCondition(props, !props.containsKey(METADATA_ENABLE_PROP), METADATA_ENABLE_PROP,
          String.valueOf(DEFAULT_METADATA_ENABLE));
      setDefaultOnCondition(props, !props.containsKey(METADATA_VALIDATE_PROP), METADATA_VALIDATE_PROP,
          String.valueOf(DEFAULT_METADATA_VALIDATE));
      setDefaultOnCondition(props, !props.containsKey(METADATA_INSERT_PARALLELISM_PROP), METADATA_INSERT_PARALLELISM_PROP,
          String.valueOf(DEFAULT_METADATA_INSERT_PARALLELISM));
      setDefaultOnCondition(props, !props.containsKey(METADATA_ASYNC_CLEAN_PROP), METADATA_ASYNC_CLEAN_PROP,
          String.valueOf(DEFAULT_METADATA_ASYNC_CLEAN));
      setDefaultOnCondition(props, !props.containsKey(METADATA_COMPACT_NUM_DELTA_COMMITS_PROP),
          METADATA_COMPACT_NUM_DELTA_COMMITS_PROP, String.valueOf(DEFAULT_METADATA_COMPACT_NUM_DELTA_COMMITS));
      setDefaultOnCondition(props, !props.containsKey(CLEANER_COMMITS_RETAINED_PROP), CLEANER_COMMITS_RETAINED_PROP,
          String.valueOf(DEFAULT_CLEANER_COMMITS_RETAINED));
      setDefaultOnCondition(props, !props.containsKey(MAX_COMMITS_TO_KEEP_PROP), MAX_COMMITS_TO_KEEP_PROP,
          String.valueOf(DEFAULT_MAX_COMMITS_TO_KEEP));
      setDefaultOnCondition(props, !props.containsKey(MIN_COMMITS_TO_KEEP_PROP), MIN_COMMITS_TO_KEEP_PROP,
          String.valueOf(DEFAULT_MIN_COMMITS_TO_KEEP));
      return new HoodieMetadataConfig(props);
    }
  }
}

View File

@@ -29,6 +29,7 @@ import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.exception.InvalidHoodiePathException;
import org.apache.hudi.metadata.HoodieTableMetadata;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
@@ -251,12 +252,14 @@ public class FSUtils {
}
}
public static List<String> getAllPartitionPaths(FileSystem fs, String basePathStr, boolean assumeDatePartitioning)
throws IOException {
public static List<String> getAllPartitionPaths(FileSystem fs, String basePathStr, boolean useFileListingFromMetadata, boolean verifyListings,
boolean assumeDatePartitioning) throws IOException {
if (assumeDatePartitioning) {
return getAllPartitionFoldersThreeLevelsDown(fs, basePathStr);
} else {
return getAllFoldersWithPartitionMetaFile(fs, basePathStr);
HoodieTableMetadata tableMetadata = HoodieTableMetadata.create(fs.getConf(), basePathStr, "/tmp/", useFileListingFromMetadata,
verifyListings, false, false);
return tableMetadata.getAllPartitionPaths();
}
}

View File

@@ -22,6 +22,7 @@ import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.util.Functions.Function2;
import org.apache.hudi.metadata.HoodieMetadataFileSystemView;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
@@ -158,6 +159,22 @@ public class FileSystemViewManager {
return new HoodieTableFileSystemView(metaClient, timeline, viewConf.isIncrementalTimelineSyncEnabled());
}
/**
 * Create an in-memory file system view for the table.
 *
 * @param metaClient                 meta client for the dataset
 * @param useFileListingFromMetadata whether file listings should come from the metadata table
 * @param verifyListings             whether metadata-table listings are validated against the file system
 * @return a metadata-backed view when enabled, otherwise a plain in-memory view
 */
public static HoodieTableFileSystemView createInMemoryFileSystemView(HoodieTableMetaClient metaClient,
                                                                     boolean useFileListingFromMetadata,
                                                                     boolean verifyListings) {
  LOG.info("Creating InMemory based view for basePath " + metaClient.getBasePath());
  HoodieTimeline completedCommitsTimeline =
      metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
  if (!useFileListingFromMetadata) {
    return new HoodieTableFileSystemView(metaClient, completedCommitsTimeline);
  }
  return new HoodieMetadataFileSystemView(metaClient, completedCommitsTimeline, true, verifyListings);
}
/**
* Create a remote file System view for a table.
*

View File

@@ -0,0 +1,69 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.metadata;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.io.IOException;
import java.util.List;
/**
 * {@link HoodieTableMetadata} implementation that answers listing queries by going directly to the
 * file system, without consulting a metadata table.
 */
public class FileSystemBackedTableMetadata implements HoodieTableMetadata {

  private final SerializableConfiguration hadoopConf;
  private final String datasetBasePath;
  private final boolean assumeDatePartitioning;

  public FileSystemBackedTableMetadata(SerializableConfiguration conf, String datasetBasePath, boolean assumeDatePartitioning) {
    this.hadoopConf = conf;
    this.datasetBasePath = datasetBasePath;
    this.assumeDatePartitioning = assumeDatePartitioning;
  }

  /**
   * List all data files under the given partition straight from the file system.
   */
  @Override
  public FileStatus[] getAllFilesInPartition(Path partitionPath) throws IOException {
    return FSUtils.getAllDataFilesInPartition(partitionPath.getFileSystem(hadoopConf.get()), partitionPath);
  }

  /**
   * Enumerate every partition path of the dataset by walking the file system, either via the
   * three-levels-down date layout or by locating partition metafiles.
   */
  @Override
  public List<String> getAllPartitionPaths() throws IOException {
    FileSystem fileSystem = new Path(datasetBasePath).getFileSystem(hadoopConf.get());
    return assumeDatePartitioning
        ? FSUtils.getAllPartitionFoldersThreeLevelsDown(fileSystem, datasetBasePath)
        : FSUtils.getAllFoldersWithPartitionMetaFile(fileSystem, datasetBasePath);
  }

  /**
   * Not supported: a purely file-system backed listing has no synced instant.
   */
  @Override
  public Option<String> getSyncedInstantTime() {
    throw new UnsupportedOperationException();
  }

  /**
   * Not supported for the file-system backed implementation.
   */
  @Override
  public boolean isInSync() {
    throw new UnsupportedOperationException();
  }
}

View File

@@ -30,7 +30,6 @@ import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.avro.model.HoodieMetadataRecord;
@@ -85,7 +84,7 @@ public class HoodieBackedTableMetadata implements HoodieTableMetadata {
private final String spillableMapDirectory;
// Readers for the base and log file which store the metadata
private transient HoodieFileReader<GenericRecord> basefileReader;
private transient HoodieFileReader<GenericRecord> baseFileReader;
private transient HoodieMetadataMergedLogRecordScanner logRecordScanner;
public HoodieBackedTableMetadata(Configuration conf, String datasetBasePath, String spillableMapDirectory,
@@ -108,7 +107,7 @@ public class HoodieBackedTableMetadata implements HoodieTableMetadata {
try {
this.metaClient = new HoodieTableMetaClient(hadoopConf.get(), metadataBasePath);
} catch (TableNotFoundException e) {
LOG.error("Metadata table was not found at path " + metadataBasePath);
LOG.warn("Metadata table was not found at path " + metadataBasePath);
this.enabled = false;
} catch (Exception e) {
LOG.error("Failed to initialize metadata table at path " + metadataBasePath, e);
@@ -144,9 +143,7 @@ public class HoodieBackedTableMetadata implements HoodieTableMetadata {
LOG.error("Failed to retrieve list of partition from metadata", e);
}
}
FileSystem fs = FSUtils.getFs(datasetBasePath, hadoopConf.get());
return FSUtils.getAllPartitionPaths(fs, datasetBasePath, assumeDatePartitioning);
return new FileSystemBackedTableMetadata(hadoopConf, datasetBasePath, assumeDatePartitioning).getAllPartitionPaths();
}
/**
@@ -199,7 +196,8 @@ public class HoodieBackedTableMetadata implements HoodieTableMetadata {
if (validateLookups) {
// Validate the Metadata Table data by listing the partitions from the file system
timer.startTimer();
List<String> actualPartitions = FSUtils.getAllPartitionPaths(metaClient.getFs(), datasetBasePath, false);
FileSystemBackedTableMetadata fileSystemBackedTableMetadata = new FileSystemBackedTableMetadata(hadoopConf, datasetBasePath, assumeDatePartitioning);
List<String> actualPartitions = fileSystemBackedTableMetadata.getAllPartitionPaths();
metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.VALIDATE_PARTITIONS_STR, timer.endTimer()));
Collections.sort(actualPartitions);
@@ -287,9 +285,9 @@ public class HoodieBackedTableMetadata implements HoodieTableMetadata {
// Retrieve record from base file
HoodieRecord<HoodieMetadataPayload> hoodieRecord = null;
if (basefileReader != null) {
if (baseFileReader != null) {
HoodieTimer timer = new HoodieTimer().startTimer();
Option<GenericRecord> baseRecord = basefileReader.getRecordByKey(key);
Option<GenericRecord> baseRecord = baseFileReader.getRecordByKey(key);
if (baseRecord.isPresent()) {
hoodieRecord = SpillableMapUtils.convertToHoodieRecordPayload(baseRecord.get(),
metaClient.getTableConfig().getPayloadClass());
@@ -338,7 +336,7 @@ public class HoodieBackedTableMetadata implements HoodieTableMetadata {
Option<HoodieBaseFile> basefile = latestSlices.get(0).getBaseFile();
if (basefile.isPresent()) {
String basefilePath = basefile.get().getPath();
basefileReader = HoodieFileReaderFactory.getFileReader(hadoopConf.get(), new Path(basefilePath));
baseFileReader = HoodieFileReaderFactory.getFileReader(hadoopConf.get(), new Path(basefilePath));
LOG.info("Opened metadata base file from " + basefilePath + " at instant " + basefile.get().getCommitTime());
}
@@ -365,9 +363,9 @@ public class HoodieBackedTableMetadata implements HoodieTableMetadata {
}
public void closeReaders() {
if (basefileReader != null) {
basefileReader.close();
basefileReader = null;
if (baseFileReader != null) {
baseFileReader.close();
baseFileReader = null;
}
logRecordScanner = null;
}

View File

@@ -0,0 +1,63 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.metadata;
import java.io.IOException;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.table.view.FileSystemViewStorageConfig;
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
/**
 * {@code HoodieTableFileSystemView} implementation that retrieves partition file listings from the
 * Metadata Table instead of listing the file system directly.
 */
public class HoodieMetadataFileSystemView extends HoodieTableFileSystemView {

// Listing provider backed by the metadata table; all partition file lookups go through it.
private final HoodieTableMetadata tableMetadata;

/**
 * Create a view backed by an externally supplied {@link HoodieTableMetadata}.
 *
 * @param metaClient                    meta client for the dataset
 * @param tableMetadata                 metadata-table backed listing provider
 * @param visibleActiveTimeline         timeline of instants visible to this view
 * @param enableIncrementalTimelineSync whether incremental timeline sync is enabled
 */
public HoodieMetadataFileSystemView(HoodieTableMetaClient metaClient, HoodieTableMetadata tableMetadata,
HoodieTimeline visibleActiveTimeline, boolean enableIncrementalTimelineSync) {
super(metaClient, visibleActiveTimeline, enableIncrementalTimelineSync);
this.tableMetadata = tableMetadata;
}

/**
 * Create a view that instantiates its own {@link HoodieTableMetadata} for the dataset base path.
 *
 * @param metaClient                 meta client for the dataset
 * @param visibleActiveTimeline      timeline of instants visible to this view
 * @param useFileListingFromMetadata whether listings should come from the metadata table
 * @param verifyListings             whether metadata listings are validated against the file system
 */
public HoodieMetadataFileSystemView(HoodieTableMetaClient metaClient,
HoodieTimeline visibleActiveTimeline,
boolean useFileListingFromMetadata,
boolean verifyListings) {
super(metaClient, visibleActiveTimeline);
// Spillable map dir comes from the view-storage default; the two trailing booleans are passed
// as false — NOTE(review): confirm their semantics against HoodieTableMetadata.create.
this.tableMetadata = HoodieTableMetadata.create(metaClient.getHadoopConf(), metaClient.getBasePath(),
FileSystemViewStorageConfig.DEFAULT_VIEW_SPILLABLE_DIR, useFileListingFromMetadata, verifyListings,
false, false);
}

/**
 * Return all the files in the partition by reading from the Metadata Table.
 *
 * @param partitionPath The absolute path of the partition
 * @throws IOException if the listing cannot be retrieved
 */
@Override
protected FileStatus[] listPartition(Path partitionPath) throws IOException {
return tableMetadata.getAllFilesInPartition(partitionPath);
}
}