[HUDI-1450] Use metadata table for listing in HoodieROTablePathFilter (apache#2326)
[HUDI-1394] [RFC-15] Use metadata table (if present) to get all partition paths (apache#2351)
This commit is contained in:
committed by vinoth chandar
parent 298808baaf
commit 4e64226844
@@ -0,0 +1,151 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.common.config;
|
||||
|
||||
import javax.annotation.concurrent.Immutable;
|
||||
import java.io.File;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.util.Properties;
|
||||
|
||||
/**
|
||||
* Configurations used by the HUDI Metadata Table.
|
||||
*/
|
||||
@Immutable
|
||||
public final class HoodieMetadataConfig extends DefaultHoodieConfig {
|
||||
|
||||
public static final String METADATA_PREFIX = "hoodie.metadata";
|
||||
|
||||
// Enable the internal Metadata Table which saves file listings
|
||||
public static final String METADATA_ENABLE_PROP = METADATA_PREFIX + ".enable";
|
||||
public static final boolean DEFAULT_METADATA_ENABLE = false;
|
||||
|
||||
// Validate contents of Metadata Table on each access against the actual filesystem
|
||||
public static final String METADATA_VALIDATE_PROP = METADATA_PREFIX + ".validate";
|
||||
public static final boolean DEFAULT_METADATA_VALIDATE = false;
|
||||
|
||||
// Parallelism for inserts
|
||||
public static final String METADATA_INSERT_PARALLELISM_PROP = METADATA_PREFIX + ".insert.parallelism";
|
||||
public static final int DEFAULT_METADATA_INSERT_PARALLELISM = 1;
|
||||
|
||||
// Async clean
|
||||
public static final String METADATA_ASYNC_CLEAN_PROP = METADATA_PREFIX + ".clean.async";
|
||||
public static final boolean DEFAULT_METADATA_ASYNC_CLEAN = false;
|
||||
|
||||
// Maximum delta commits before compaction occurs
|
||||
public static final String METADATA_COMPACT_NUM_DELTA_COMMITS_PROP = METADATA_PREFIX + ".compact.max.delta.commits";
|
||||
public static final int DEFAULT_METADATA_COMPACT_NUM_DELTA_COMMITS = 24;
|
||||
|
||||
// Archival settings
|
||||
public static final String MIN_COMMITS_TO_KEEP_PROP = METADATA_PREFIX + ".keep.min.commits";
|
||||
public static final int DEFAULT_MIN_COMMITS_TO_KEEP = 20;
|
||||
public static final String MAX_COMMITS_TO_KEEP_PROP = METADATA_PREFIX + ".keep.max.commits";
|
||||
public static final int DEFAULT_MAX_COMMITS_TO_KEEP = 30;
|
||||
|
||||
// Cleaner commits retained
|
||||
public static final String CLEANER_COMMITS_RETAINED_PROP = METADATA_PREFIX + ".cleaner.commits.retained";
|
||||
public static final int DEFAULT_CLEANER_COMMITS_RETAINED = 3;
|
||||
|
||||
// We can set the default to true for readers, as it will internally default to listing from filesystem if metadata
|
||||
// table is not found
|
||||
public static final boolean DEFAULT_METADATA_ENABLE_FOR_READERS = true;
|
||||
|
||||
private HoodieMetadataConfig(Properties props) {
|
||||
super(props);
|
||||
}
|
||||
|
||||
public static HoodieMetadataConfig.Builder newBuilder() {
|
||||
return new Builder();
|
||||
}
|
||||
|
||||
public static class Builder {
|
||||
|
||||
private final Properties props = new Properties();
|
||||
|
||||
public Builder fromFile(File propertiesFile) throws IOException {
|
||||
try (FileReader reader = new FileReader(propertiesFile)) {
|
||||
this.props.load(reader);
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
||||
public Builder fromProperties(Properties props) {
|
||||
this.props.putAll(props);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder enable(boolean enable) {
|
||||
props.setProperty(METADATA_ENABLE_PROP, String.valueOf(enable));
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder validate(boolean validate) {
|
||||
props.setProperty(METADATA_VALIDATE_PROP, String.valueOf(validate));
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder withInsertParallelism(int parallelism) {
|
||||
props.setProperty(METADATA_INSERT_PARALLELISM_PROP, String.valueOf(parallelism));
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder withAsyncClean(boolean asyncClean) {
|
||||
props.setProperty(METADATA_ASYNC_CLEAN_PROP, String.valueOf(asyncClean));
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder withMaxNumDeltaCommitsBeforeCompaction(int maxNumDeltaCommitsBeforeCompaction) {
|
||||
props.setProperty(METADATA_COMPACT_NUM_DELTA_COMMITS_PROP, String.valueOf(maxNumDeltaCommitsBeforeCompaction));
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder archiveCommitsWith(int minToKeep, int maxToKeep) {
|
||||
props.setProperty(MIN_COMMITS_TO_KEEP_PROP, String.valueOf(minToKeep));
|
||||
props.setProperty(MAX_COMMITS_TO_KEEP_PROP, String.valueOf(maxToKeep));
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder retainCommits(int commitsRetained) {
|
||||
props.setProperty(CLEANER_COMMITS_RETAINED_PROP, String.valueOf(commitsRetained));
|
||||
return this;
|
||||
}
|
||||
|
||||
public HoodieMetadataConfig build() {
|
||||
HoodieMetadataConfig config = new HoodieMetadataConfig(props);
|
||||
setDefaultOnCondition(props, !props.containsKey(METADATA_ENABLE_PROP), METADATA_ENABLE_PROP,
|
||||
String.valueOf(DEFAULT_METADATA_ENABLE));
|
||||
setDefaultOnCondition(props, !props.containsKey(METADATA_VALIDATE_PROP), METADATA_VALIDATE_PROP,
|
||||
String.valueOf(DEFAULT_METADATA_VALIDATE));
|
||||
setDefaultOnCondition(props, !props.containsKey(METADATA_INSERT_PARALLELISM_PROP), METADATA_INSERT_PARALLELISM_PROP,
|
||||
String.valueOf(DEFAULT_METADATA_INSERT_PARALLELISM));
|
||||
setDefaultOnCondition(props, !props.containsKey(METADATA_ASYNC_CLEAN_PROP), METADATA_ASYNC_CLEAN_PROP,
|
||||
String.valueOf(DEFAULT_METADATA_ASYNC_CLEAN));
|
||||
setDefaultOnCondition(props, !props.containsKey(METADATA_COMPACT_NUM_DELTA_COMMITS_PROP),
|
||||
METADATA_COMPACT_NUM_DELTA_COMMITS_PROP, String.valueOf(DEFAULT_METADATA_COMPACT_NUM_DELTA_COMMITS));
|
||||
setDefaultOnCondition(props, !props.containsKey(CLEANER_COMMITS_RETAINED_PROP), CLEANER_COMMITS_RETAINED_PROP,
|
||||
String.valueOf(DEFAULT_CLEANER_COMMITS_RETAINED));
|
||||
setDefaultOnCondition(props, !props.containsKey(MAX_COMMITS_TO_KEEP_PROP), MAX_COMMITS_TO_KEEP_PROP,
|
||||
String.valueOf(DEFAULT_MAX_COMMITS_TO_KEEP));
|
||||
setDefaultOnCondition(props, !props.containsKey(MIN_COMMITS_TO_KEEP_PROP), MIN_COMMITS_TO_KEEP_PROP,
|
||||
String.valueOf(DEFAULT_MIN_COMMITS_TO_KEEP));
|
||||
|
||||
return config;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -29,6 +29,7 @@ import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
import org.apache.hudi.exception.InvalidHoodiePathException;
|
||||
import org.apache.hudi.metadata.HoodieTableMetadata;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
@@ -251,12 +252,14 @@ public class FSUtils {
|
||||
}
|
||||
}
|
||||
|
||||
public static List<String> getAllPartitionPaths(FileSystem fs, String basePathStr, boolean assumeDatePartitioning)
|
||||
throws IOException {
|
||||
public static List<String> getAllPartitionPaths(FileSystem fs, String basePathStr, boolean useFileListingFromMetadata, boolean verifyListings,
|
||||
boolean assumeDatePartitioning) throws IOException {
|
||||
if (assumeDatePartitioning) {
|
||||
return getAllPartitionFoldersThreeLevelsDown(fs, basePathStr);
|
||||
} else {
|
||||
return getAllFoldersWithPartitionMetaFile(fs, basePathStr);
|
||||
HoodieTableMetadata tableMetadata = HoodieTableMetadata.create(fs.getConf(), basePathStr, "/tmp/", useFileListingFromMetadata,
|
||||
verifyListings, false, false);
|
||||
return tableMetadata.getAllPartitionPaths();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -22,6 +22,7 @@ import org.apache.hudi.common.config.SerializableConfiguration;
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.util.Functions.Function2;
|
||||
import org.apache.hudi.metadata.HoodieMetadataFileSystemView;
|
||||
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
@@ -158,6 +159,22 @@ public class FileSystemViewManager {
|
||||
return new HoodieTableFileSystemView(metaClient, timeline, viewConf.isIncrementalTimelineSyncEnabled());
|
||||
}
|
||||
|
||||
public static HoodieTableFileSystemView createInMemoryFileSystemView(HoodieTableMetaClient metaClient,
|
||||
boolean useFileListingFromMetadata,
|
||||
boolean verifyListings) {
|
||||
LOG.info("Creating InMemory based view for basePath " + metaClient.getBasePath());
|
||||
if (useFileListingFromMetadata) {
|
||||
return new HoodieMetadataFileSystemView(metaClient,
|
||||
metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(),
|
||||
true,
|
||||
verifyListings);
|
||||
}
|
||||
|
||||
return new HoodieTableFileSystemView(metaClient,
|
||||
metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Create a remote file System view for a table.
|
||||
*
|
||||
|
||||
@@ -0,0 +1,69 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.metadata;
|
||||
|
||||
import org.apache.hudi.common.config.SerializableConfiguration;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
public class FileSystemBackedTableMetadata implements HoodieTableMetadata {
|
||||
|
||||
private final SerializableConfiguration hadoopConf;
|
||||
private final String datasetBasePath;
|
||||
private final boolean assumeDatePartitioning;
|
||||
|
||||
public FileSystemBackedTableMetadata(SerializableConfiguration conf, String datasetBasePath, boolean assumeDatePartitioning) {
|
||||
this.hadoopConf = conf;
|
||||
this.datasetBasePath = datasetBasePath;
|
||||
this.assumeDatePartitioning = assumeDatePartitioning;
|
||||
}
|
||||
|
||||
@Override
|
||||
public FileStatus[] getAllFilesInPartition(Path partitionPath) throws IOException {
|
||||
FileSystem fs = partitionPath.getFileSystem(hadoopConf.get());
|
||||
return FSUtils.getAllDataFilesInPartition(fs, partitionPath);
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<String> getAllPartitionPaths() throws IOException {
|
||||
FileSystem fs = new Path(datasetBasePath).getFileSystem(hadoopConf.get());
|
||||
if (assumeDatePartitioning) {
|
||||
return FSUtils.getAllPartitionFoldersThreeLevelsDown(fs, datasetBasePath);
|
||||
} else {
|
||||
return FSUtils.getAllFoldersWithPartitionMetaFile(fs, datasetBasePath);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Option<String> getSyncedInstantTime() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isInSync() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
}
|
||||
@@ -30,7 +30,6 @@ import org.apache.avro.Schema;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hudi.avro.HoodieAvroUtils;
|
||||
import org.apache.hudi.avro.model.HoodieMetadataRecord;
|
||||
@@ -85,7 +84,7 @@ public class HoodieBackedTableMetadata implements HoodieTableMetadata {
|
||||
private final String spillableMapDirectory;
|
||||
|
||||
// Readers for the base and log file which store the metadata
|
||||
private transient HoodieFileReader<GenericRecord> basefileReader;
|
||||
private transient HoodieFileReader<GenericRecord> baseFileReader;
|
||||
private transient HoodieMetadataMergedLogRecordScanner logRecordScanner;
|
||||
|
||||
public HoodieBackedTableMetadata(Configuration conf, String datasetBasePath, String spillableMapDirectory,
|
||||
@@ -108,7 +107,7 @@ public class HoodieBackedTableMetadata implements HoodieTableMetadata {
|
||||
try {
|
||||
this.metaClient = new HoodieTableMetaClient(hadoopConf.get(), metadataBasePath);
|
||||
} catch (TableNotFoundException e) {
|
||||
LOG.error("Metadata table was not found at path " + metadataBasePath);
|
||||
LOG.warn("Metadata table was not found at path " + metadataBasePath);
|
||||
this.enabled = false;
|
||||
} catch (Exception e) {
|
||||
LOG.error("Failed to initialize metadata table at path " + metadataBasePath, e);
|
||||
@@ -144,9 +143,7 @@ public class HoodieBackedTableMetadata implements HoodieTableMetadata {
|
||||
LOG.error("Failed to retrieve list of partition from metadata", e);
|
||||
}
|
||||
}
|
||||
|
||||
FileSystem fs = FSUtils.getFs(datasetBasePath, hadoopConf.get());
|
||||
return FSUtils.getAllPartitionPaths(fs, datasetBasePath, assumeDatePartitioning);
|
||||
return new FileSystemBackedTableMetadata(hadoopConf, datasetBasePath, assumeDatePartitioning).getAllPartitionPaths();
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -199,7 +196,8 @@ public class HoodieBackedTableMetadata implements HoodieTableMetadata {
|
||||
if (validateLookups) {
|
||||
// Validate the Metadata Table data by listing the partitions from the file system
|
||||
timer.startTimer();
|
||||
List<String> actualPartitions = FSUtils.getAllPartitionPaths(metaClient.getFs(), datasetBasePath, false);
|
||||
FileSystemBackedTableMetadata fileSystemBackedTableMetadata = new FileSystemBackedTableMetadata(hadoopConf, datasetBasePath, assumeDatePartitioning);
|
||||
List<String> actualPartitions = fileSystemBackedTableMetadata.getAllPartitionPaths();
|
||||
metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.VALIDATE_PARTITIONS_STR, timer.endTimer()));
|
||||
|
||||
Collections.sort(actualPartitions);
|
||||
@@ -287,9 +285,9 @@ public class HoodieBackedTableMetadata implements HoodieTableMetadata {
|
||||
|
||||
// Retrieve record from base file
|
||||
HoodieRecord<HoodieMetadataPayload> hoodieRecord = null;
|
||||
if (basefileReader != null) {
|
||||
if (baseFileReader != null) {
|
||||
HoodieTimer timer = new HoodieTimer().startTimer();
|
||||
Option<GenericRecord> baseRecord = basefileReader.getRecordByKey(key);
|
||||
Option<GenericRecord> baseRecord = baseFileReader.getRecordByKey(key);
|
||||
if (baseRecord.isPresent()) {
|
||||
hoodieRecord = SpillableMapUtils.convertToHoodieRecordPayload(baseRecord.get(),
|
||||
metaClient.getTableConfig().getPayloadClass());
|
||||
@@ -338,7 +336,7 @@ public class HoodieBackedTableMetadata implements HoodieTableMetadata {
|
||||
Option<HoodieBaseFile> basefile = latestSlices.get(0).getBaseFile();
|
||||
if (basefile.isPresent()) {
|
||||
String basefilePath = basefile.get().getPath();
|
||||
basefileReader = HoodieFileReaderFactory.getFileReader(hadoopConf.get(), new Path(basefilePath));
|
||||
baseFileReader = HoodieFileReaderFactory.getFileReader(hadoopConf.get(), new Path(basefilePath));
|
||||
LOG.info("Opened metadata base file from " + basefilePath + " at instant " + basefile.get().getCommitTime());
|
||||
}
|
||||
|
||||
@@ -365,9 +363,9 @@ public class HoodieBackedTableMetadata implements HoodieTableMetadata {
|
||||
}
|
||||
|
||||
public void closeReaders() {
|
||||
if (basefileReader != null) {
|
||||
basefileReader.close();
|
||||
basefileReader = null;
|
||||
if (baseFileReader != null) {
|
||||
baseFileReader.close();
|
||||
baseFileReader = null;
|
||||
}
|
||||
logRecordScanner = null;
|
||||
}
|
||||
|
||||
@@ -0,0 +1,63 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.metadata;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.table.view.FileSystemViewStorageConfig;
|
||||
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
|
||||
|
||||
/**
|
||||
* {@code HoodieTableFileSystemView} implementation that retrieved partition listings from the Metadata Table.
|
||||
*/
|
||||
public class HoodieMetadataFileSystemView extends HoodieTableFileSystemView {
|
||||
|
||||
private final HoodieTableMetadata tableMetadata;
|
||||
|
||||
public HoodieMetadataFileSystemView(HoodieTableMetaClient metaClient, HoodieTableMetadata tableMetadata,
|
||||
HoodieTimeline visibleActiveTimeline, boolean enableIncrementalTimelineSync) {
|
||||
super(metaClient, visibleActiveTimeline, enableIncrementalTimelineSync);
|
||||
this.tableMetadata = tableMetadata;
|
||||
}
|
||||
|
||||
public HoodieMetadataFileSystemView(HoodieTableMetaClient metaClient,
|
||||
HoodieTimeline visibleActiveTimeline,
|
||||
boolean useFileListingFromMetadata,
|
||||
boolean verifyListings) {
|
||||
super(metaClient, visibleActiveTimeline);
|
||||
this.tableMetadata = HoodieTableMetadata.create(metaClient.getHadoopConf(), metaClient.getBasePath(),
|
||||
FileSystemViewStorageConfig.DEFAULT_VIEW_SPILLABLE_DIR, useFileListingFromMetadata, verifyListings,
|
||||
false, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return all the files in the partition by reading from the Metadata Table.
|
||||
*
|
||||
* @param partitionPath The absolute path of the partition
|
||||
* @throws IOException
|
||||
*/
|
||||
@Override
|
||||
protected FileStatus[] listPartition(Path partitionPath) throws IOException {
|
||||
return tableMetadata.getAllFilesInPartition(partitionPath);
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user