1
0

[HUDI-1450] Use metadata table for listing in HoodieROTablePathFilter (apache#2326)

[HUDI-1394] [RFC-15] Use metadata table (if present) to get all partition paths (apache#2351)
This commit is contained in:
Udit Mehrotra
2020-12-31 01:20:02 -08:00
committed by vinoth chandar
parent 298808baaf
commit 4e64226844
38 changed files with 308 additions and 102 deletions

View File

@@ -19,6 +19,8 @@
package org.apache.hudi.dla;
import com.beust.jcommander.Parameter;
import org.apache.hudi.common.config.HoodieMetadataConfig;
import org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor;
import java.io.Serializable;
@@ -68,6 +70,12 @@ public class DLASyncConfig implements Serializable {
@Parameter(names = {"--hive-style-partitioning"}, description = "Use DLA hive style partitioning, true if like the following style: field1=value1/field2=value2")
public Boolean useDLASyncHiveStylePartitioning = false;
@Parameter(names = {"--use-file-listing-from-metadata"}, description = "Fetch file listing from Hudi's metadata")
public Boolean useFileListingFromMetadata = HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS;
@Parameter(names = {"--verify-metadata-file-listing"}, description = "Verify file listing from Hudi's metadata against file system")
public Boolean verifyMetadataFileListing = HoodieMetadataConfig.DEFAULT_METADATA_VALIDATE;
@Parameter(names = {"--help", "-h"}, help = true)
public Boolean help = false;
@@ -88,6 +96,8 @@ public class DLASyncConfig implements Serializable {
newConfig.skipROSuffix = cfg.skipROSuffix;
newConfig.skipRTSync = cfg.skipRTSync;
newConfig.useDLASyncHiveStylePartitioning = cfg.useDLASyncHiveStylePartitioning;
newConfig.useFileListingFromMetadata = cfg.useFileListingFromMetadata;
newConfig.verifyMetadataFileListing = cfg.verifyMetadataFileListing;
newConfig.supportTimestamp = cfg.supportTimestamp;
return newConfig;
}
@@ -99,6 +109,8 @@ public class DLASyncConfig implements Serializable {
+ ", basePath='" + basePath + '\'' + ", partitionFields=" + partitionFields + ", partitionValueExtractorClass='"
+ partitionValueExtractorClass + '\'' + ", assumeDatePartitioning=" + assumeDatePartitioning
+ ", useDLASyncHiveStylePartitioning=" + useDLASyncHiveStylePartitioning
+ ", useFileListingFromMetadata=" + useFileListingFromMetadata
+ ", verifyMetadataFileListing=" + verifyMetadataFileListing
+ ", help=" + help + '}';
}
}

View File

@@ -70,7 +70,8 @@ public class HoodieDLAClient extends AbstractSyncHoodieClient {
private PartitionValueExtractor partitionValueExtractor;
public HoodieDLAClient(DLASyncConfig syncConfig, FileSystem fs) {
super(syncConfig.basePath, syncConfig.assumeDatePartitioning, fs);
super(syncConfig.basePath, syncConfig.assumeDatePartitioning, syncConfig.useFileListingFromMetadata,
syncConfig.verifyMetadataFileListing, fs);
this.dlaConfig = syncConfig;
try {
this.partitionValueExtractor =

View File

@@ -18,6 +18,8 @@
package org.apache.hudi.hive;
import org.apache.hudi.common.config.HoodieMetadataConfig;
import com.beust.jcommander.Parameter;
import java.io.Serializable;
@@ -77,6 +79,12 @@ public class HiveSyncConfig implements Serializable {
@Parameter(names = {"--skip-ro-suffix"}, description = "Skip the `_ro` suffix for Read optimized table, when registering")
public Boolean skipROSuffix = false;
@Parameter(names = {"--use-file-listing-from-metadata"}, description = "Fetch file listing from Hudi's metadata")
public Boolean useFileListingFromMetadata = HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS;
@Parameter(names = {"--verify-metadata-file-listing"}, description = "Verify file listing from Hudi's metadata against file system")
public Boolean verifyMetadataFileListing = HoodieMetadataConfig.DEFAULT_METADATA_VALIDATE;
@Parameter(names = {"--help", "-h"}, help = true)
public Boolean help = false;
@@ -99,6 +107,8 @@ public class HiveSyncConfig implements Serializable {
newConfig.jdbcUrl = cfg.jdbcUrl;
newConfig.tableName = cfg.tableName;
newConfig.usePreApacheInputFormat = cfg.usePreApacheInputFormat;
newConfig.useFileListingFromMetadata = cfg.useFileListingFromMetadata;
newConfig.verifyMetadataFileListing = cfg.verifyMetadataFileListing;
newConfig.supportTimestamp = cfg.supportTimestamp;
newConfig.decodePartition = cfg.decodePartition;
return newConfig;
@@ -107,23 +117,25 @@ public class HiveSyncConfig implements Serializable {
@Override
public String toString() {
return "HiveSyncConfig{"
+ "databaseName='" + databaseName + '\''
+ ", tableName='" + tableName + '\''
+ ", baseFileFormat='" + baseFileFormat + '\''
+ ", hiveUser='" + hiveUser + '\''
+ ", hivePass='" + hivePass + '\''
+ ", jdbcUrl='" + jdbcUrl + '\''
+ ", basePath='" + basePath + '\''
+ ", partitionFields=" + partitionFields
+ ", partitionValueExtractorClass='" + partitionValueExtractorClass + '\''
+ ", assumeDatePartitioning=" + assumeDatePartitioning
+ ", usePreApacheInputFormat=" + usePreApacheInputFormat
+ ", useJdbc=" + useJdbc
+ ", autoCreateDatabase=" + autoCreateDatabase
+ ", skipROSuffix=" + skipROSuffix
+ ", help=" + help
+ ", supportTimestamp=" + supportTimestamp
+ ", decodePartition=" + decodePartition
+ '}';
+ "databaseName='" + databaseName + '\''
+ ", tableName='" + tableName + '\''
+ ", baseFileFormat='" + baseFileFormat + '\''
+ ", hiveUser='" + hiveUser + '\''
+ ", hivePass='" + hivePass + '\''
+ ", jdbcUrl='" + jdbcUrl + '\''
+ ", basePath='" + basePath + '\''
+ ", partitionFields=" + partitionFields
+ ", partitionValueExtractorClass='" + partitionValueExtractorClass + '\''
+ ", assumeDatePartitioning=" + assumeDatePartitioning
+ ", usePreApacheInputFormat=" + usePreApacheInputFormat
+ ", useJdbc=" + useJdbc
+ ", autoCreateDatabase=" + autoCreateDatabase
+ ", skipROSuffix=" + skipROSuffix
+ ", help=" + help
+ ", supportTimestamp=" + supportTimestamp
+ ", decodePartition=" + decodePartition
+ ", useFileListingFromMetadata=" + useFileListingFromMetadata
+ ", verifyMetadataFileListing=" + verifyMetadataFileListing
+ '}';
}
}

View File

@@ -76,7 +76,7 @@ public class HoodieHiveClient extends AbstractSyncHoodieClient {
private HiveConf configuration;
public HoodieHiveClient(HiveSyncConfig cfg, HiveConf configuration, FileSystem fs) {
super(cfg.basePath, cfg.assumeDatePartitioning, fs);
super(cfg.basePath, cfg.assumeDatePartitioning, cfg.useFileListingFromMetadata, cfg.verifyMetadataFileListing, fs);
this.syncConfig = cfg;
this.fs = fs;

View File

@@ -40,18 +40,25 @@ import java.util.List;
import java.util.Map;
public abstract class AbstractSyncHoodieClient {
private static final Logger LOG = LogManager.getLogger(AbstractSyncHoodieClient.class);
protected final HoodieTableMetaClient metaClient;
protected final HoodieTableType tableType;
protected final FileSystem fs;
private String basePath;
private boolean assumeDatePartitioning;
private boolean useFileListingFromMetadata;
private boolean verifyMetadataFileListing;
public AbstractSyncHoodieClient(String basePath, boolean assumeDatePartitioning, FileSystem fs) {
public AbstractSyncHoodieClient(String basePath, boolean assumeDatePartitioning, boolean useFileListingFromMetadata,
boolean verifyMetadataFileListing, FileSystem fs) {
this.metaClient = new HoodieTableMetaClient(fs.getConf(), basePath, true);
this.tableType = metaClient.getTableType();
this.basePath = basePath;
this.assumeDatePartitioning = assumeDatePartitioning;
this.useFileListingFromMetadata = useFileListingFromMetadata;
this.verifyMetadataFileListing = verifyMetadataFileListing;
this.fs = fs;
}
@@ -120,7 +127,7 @@ public abstract class AbstractSyncHoodieClient {
if (!lastCommitTimeSynced.isPresent()) {
LOG.info("Last commit time synced is not known, listing all partitions in " + basePath + ",FS :" + fs);
try {
return FSUtils.getAllPartitionPaths(fs, basePath, assumeDatePartitioning);
return FSUtils.getAllPartitionPaths(fs, basePath, useFileListingFromMetadata, verifyMetadataFileListing, assumeDatePartitioning);
} catch (IOException e) {
throw new HoodieIOException("Failed to list all partitions in " + basePath, e);
}