1
0

[HUDI-2883] Refactor hive sync tool / config to use reflection and standardize configs (#4175)

- Refactor hive sync tool / config to use reflection and standardize configs

Co-authored-by: sivabalan <n.siva.b@gmail.com>
Co-authored-by: Rajesh Mahindra <rmahindra@Rajeshs-MacBook-Pro.local>
Co-authored-by: Raymond Xu <2701446+xushiyan@users.noreply.github.com>
This commit is contained in:
Rajesh Mahindra
2022-03-21 19:56:31 -07:00
committed by GitHub
parent 9b6e138af2
commit 5f570ea151
43 changed files with 1521 additions and 1217 deletions

View File

@@ -17,17 +17,31 @@
package org.apache.hudi.sync.common;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import java.util.Properties;
/**
* Base class to sync Hudi meta data with Metastores to make
* Hudi table queryable through external systems.
*/
public abstract class AbstractSyncTool {
protected Properties props;
protected FileSystem fileSystem;
protected final Configuration conf;
protected final FileSystem fs;
protected TypedProperties props;
public AbstractSyncTool(Properties props, FileSystem fileSystem) {
public AbstractSyncTool(TypedProperties props, Configuration conf, FileSystem fs) {
this.props = props;
this.fileSystem = fileSystem;
this.conf = conf;
this.fs = fs;
}
/**
* Legacy constructor retained for sync tools that still pass plain {@link Properties}.
* Wraps {@code props} in a {@link TypedProperties}, takes the Hadoop configuration from
* {@code fileSystem.getConf()}, and delegates to the three-argument constructor.
*
* @param props sync properties; copied into a new {@link TypedProperties}.
* @param fileSystem file system of the table; also the source of the Hadoop {@link Configuration}.
* @deprecated use {@link #AbstractSyncTool(TypedProperties, Configuration, FileSystem)} instead.
*/
@Deprecated
public AbstractSyncTool(Properties props, FileSystem fileSystem) {
this(new TypedProperties(props), fileSystem.getConf(), fileSystem);
}
public abstract void syncHoodieTable();

View File

@@ -0,0 +1,188 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.sync.common;
import org.apache.hudi.common.config.ConfigProperty;
import org.apache.hudi.common.config.HoodieConfig;
import org.apache.hudi.common.config.HoodieMetadataConfig;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.table.HoodieTableConfig;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
import com.beust.jcommander.Parameter;
import java.util.Collections;
import java.util.List;
import java.util.function.Function;
/**
* Configs needed to sync data into external meta stores, catalogs, etc.
*/
public class HoodieSyncConfig extends HoodieConfig {

  // ---------------------------------------------------------------------------------------------
  // JCommander-annotated fields. The constructor below also fills each of them from properties.
  // ---------------------------------------------------------------------------------------------

  @Parameter(
      names = {"--database"},
      description = "name of the target database in meta store",
      required = true)
  public String databaseName;

  @Parameter(
      names = {"--table"},
      description = "name of the target table in meta store",
      required = true)
  public String tableName;

  @Parameter(
      names = {"--base-path"},
      description = "Base path of the hoodie table to sync",
      required = true)
  public String basePath;

  @Parameter(
      names = {"--base-file-format"},
      description = "Format of the base files (PARQUET (or) HFILE)")
  public String baseFileFormat;

  @Parameter(
      names = {"--partitioned-by"},
      description = "Fields in the schema partitioned by")
  public List<String> partitionFields;

  @Parameter(
      names = {"--partition-value-extractor"},
      description = "Class which implements PartitionValueExtractor "
          + "to extract the partition values from HDFS path")
  public String partitionValueExtractorClass;

  @Parameter(
      names = {"--assume-date-partitioning"},
      description = "Assume standard yyyy/mm/dd partitioning, this"
          + " exists to support backward compatibility. If you use hoodie 0.3.x, do not set this parameter")
  public Boolean assumeDatePartitioning;

  @Parameter(
      names = {"--decode-partition"},
      description = "Decode the partition value if the partition has encoded during writing")
  public Boolean decodePartition;

  @Parameter(
      names = {"--use-file-listing-from-metadata"},
      description = "Fetch file listing from Hudi's metadata")
  public Boolean useFileListingFromMetadata;

  @Parameter(
      names = {"--conditional-sync"},
      description = "If true, only sync on conditions like schema change or partition change.")
  public Boolean isConditionalSync;

  @Parameter(
      names = {"--spark-version"},
      description = "The spark version")
  public String sparkVersion;

  // ---------------------------------------------------------------------------------------------
  // Config properties. Declaration order matters: each inference function is declared before the
  // property that references it, because static initializers run top to bottom.
  // ---------------------------------------------------------------------------------------------

  /** Base path of the table being synced; empty string when not set. */
  public static final ConfigProperty<String> META_SYNC_BASE_PATH = ConfigProperty
      .key("hoodie.datasource.meta.sync.base.path")
      .defaultValue("")
      .withDocumentation("Base path of the hoodie table to sync");

  /** Switch for syncing to an external meta store or catalog; defaults to {@code "false"}. */
  public static final ConfigProperty<String> META_SYNC_ENABLED = ConfigProperty
      .key("hoodie.datasource.meta.sync.enable")
      .defaultValue("false")
      .withDocumentation("Enable Syncing the Hudi Table with an external meta store or data catalog.");

  // TODO: change the prefix of the following configs from hive_sync to meta_sync

  /** Destination database name; defaults to {@code "default"}. */
  public static final ConfigProperty<String> META_SYNC_DATABASE_NAME = ConfigProperty
      .key("hoodie.datasource.hive_sync.database")
      .defaultValue("default")
      .withDocumentation("The name of the destination database that we should sync the hudi table to.");

  /**
   * Derives the destination table name when none was given explicitly: the write-config table
   * name wins, then the table-config table name, otherwise nothing is inferred.
   */
  public static final Function<HoodieConfig, Option<String>> TABLE_NAME_INFERENCE_FUNCTION = cfg -> {
    if (cfg.contains(HoodieTableConfig.HOODIE_WRITE_TABLE_NAME_KEY)) {
      return Option.of(cfg.getString(HoodieTableConfig.HOODIE_WRITE_TABLE_NAME_KEY));
    }
    if (cfg.contains(HoodieTableConfig.HOODIE_TABLE_NAME_KEY)) {
      return Option.of(cfg.getString(HoodieTableConfig.HOODIE_TABLE_NAME_KEY));
    }
    return Option.empty();
  };

  /** Destination table name; inferred via {@link #TABLE_NAME_INFERENCE_FUNCTION}, else {@code "unknown"}. */
  public static final ConfigProperty<String> META_SYNC_TABLE_NAME = ConfigProperty
      .key("hoodie.datasource.hive_sync.table")
      .defaultValue("unknown")
      .withInferFunction(TABLE_NAME_INFERENCE_FUNCTION)
      .withDocumentation("The name of the destination table that we should sync the hudi table to.");

  /** Base file format used for the sync; defaults to {@code "PARQUET"}. */
  public static final ConfigProperty<String> META_SYNC_BASE_FILE_FORMAT = ConfigProperty
      .key("hoodie.datasource.hive_sync.base_file_format")
      .defaultValue("PARQUET")
      .withDocumentation("Base file format for the sync.");

  /**
   * Derives the partition fields from the key generator's partition-path field setting when they
   * were not given explicitly; infers nothing if that setting is absent.
   */
  public static final Function<HoodieConfig, Option<String>> PARTITION_FIELDS_INFERENCE_FUNCTION = cfg -> {
    if (!cfg.contains(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME)) {
      return Option.empty();
    }
    return Option.of(cfg.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME));
  };

  /** Partition fields; inferred via {@link #PARTITION_FIELDS_INFERENCE_FUNCTION}, else empty. */
  public static final ConfigProperty<String> META_SYNC_PARTITION_FIELDS = ConfigProperty
      .key("hoodie.datasource.hive_sync.partition_fields")
      .defaultValue("")
      .withInferFunction(PARTITION_FIELDS_INFERENCE_FUNCTION)
      .withDocumentation("Field in the table to use for determining hive partition columns.");

  /**
   * Picks a partition value extractor class name from the partition-path setting:
   * no partition-path field configured selects the non-partitioned extractor; exactly one
   * comma-separated field with hive-style partitioning set to the literal {@code "true"} selects
   * the hive-style extractor; every other case selects the multi-part-keys extractor.
   */
  public static final Function<HoodieConfig, Option<String>> PARTITION_EXTRACTOR_CLASS_FUNCTION = cfg -> {
    if (!cfg.contains(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME)) {
      return Option.of("org.apache.hudi.hive.NonPartitionedExtractor");
    }
    String[] partitionColumns = cfg.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME).split(",");
    // Short-circuit order is significant: the hive-style flag is only read when it is present.
    boolean singleHiveStyleColumn = partitionColumns.length == 1
        && cfg.contains(KeyGeneratorOptions.HIVE_STYLE_PARTITIONING_ENABLE)
        && cfg.getString(KeyGeneratorOptions.HIVE_STYLE_PARTITIONING_ENABLE).equals("true");
    if (singleHiveStyleColumn) {
      return Option.of("org.apache.hudi.hive.HiveStylePartitionValueExtractor");
    }
    return Option.of("org.apache.hudi.hive.MultiPartKeysValueExtractor");
  };

  /** Extractor class name; inferred via {@link #PARTITION_EXTRACTOR_CLASS_FUNCTION}. */
  public static final ConfigProperty<String> META_SYNC_PARTITION_EXTRACTOR_CLASS = ConfigProperty
      .key("hoodie.datasource.hive_sync.partition_extractor_class")
      .defaultValue("org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor")
      .withInferFunction(PARTITION_EXTRACTOR_CLASS_FUNCTION)
      .withDocumentation("Class which implements PartitionValueExtractor to extract the partition values, "
          + "default 'SlashEncodedDayPartitionValueExtractor'.");

  /** Whether yyyy/mm/dd partitioning is assumed; defaults to {@code "false"}. */
  public static final ConfigProperty<String> META_SYNC_ASSUME_DATE_PARTITION = ConfigProperty
      .key("hoodie.datasource.hive_sync.assume_date_partitioning")
      .defaultValue("false")
      .withDocumentation("Assume partitioning is yyyy/mm/dd");

  /** Whether file listing uses the metadata table; default taken from {@link HoodieMetadataConfig}. */
  public static final ConfigProperty<Boolean> META_SYNC_USE_FILE_LISTING_FROM_METADATA = ConfigProperty
      .key("hoodie.meta.sync.metadata_file_listing")
      .defaultValue(HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS)
      .withDocumentation("Enable the internal metadata table for file listing for syncing with metastores");

  /** Whether sync only happens on schema or partition change; defaults to {@code "false"}. */
  public static final ConfigProperty<String> META_SYNC_CONDITIONAL_SYNC = ConfigProperty
      .key("hoodie.datasource.meta_sync.condition.sync")
      .defaultValue("false")
      .withDocumentation("If true, only sync on conditions like schema change or partition change.");

  /** Spark version used for the sync; empty string when not set. */
  public static final ConfigProperty<String> META_SYNC_SPARK_VERSION = ConfigProperty
      .key("hoodie.meta_sync.spark.version")
      .defaultValue("")
      .withDocumentation("The spark version used when syncing with a metastore.");

  /**
   * Builds the sync config from the given properties, applies {@link #setDefaults()}, and then
   * copies each setting (or its default) into the matching public field.
   *
   * @param props properties backing this config; the partition field list is read from it directly.
   */
  public HoodieSyncConfig(TypedProperties props) {
    super(props);
    setDefaults();

    basePath = getStringOrDefault(META_SYNC_BASE_PATH);
    databaseName = getStringOrDefault(META_SYNC_DATABASE_NAME);
    tableName = getStringOrDefault(META_SYNC_TABLE_NAME);
    baseFileFormat = getStringOrDefault(META_SYNC_BASE_FILE_FORMAT);

    // Partition fields are a comma-separated list; absent key yields an empty list.
    String partitionFieldsKey = META_SYNC_PARTITION_FIELDS.key();
    partitionFields = props.getStringList(partitionFieldsKey, ",", Collections.emptyList());
    partitionValueExtractorClass = getStringOrDefault(META_SYNC_PARTITION_EXTRACTOR_CLASS);

    assumeDatePartitioning = getBooleanOrDefault(META_SYNC_ASSUME_DATE_PARTITION);
    // The decode flag mirrors the writer's URL-encode-partitioning option.
    decodePartition = getBooleanOrDefault(KeyGeneratorOptions.URL_ENCODE_PARTITIONING);
    useFileListingFromMetadata = getBooleanOrDefault(META_SYNC_USE_FILE_LISTING_FROM_METADATA);
    isConditionalSync = getBooleanOrDefault(META_SYNC_CONDITIONAL_SYNC);
    sparkVersion = getStringOrDefault(META_SYNC_SPARK_VERSION);
  }

  /** Applies defaults before the fields are read; currently only the table name default is set. */
  protected void setDefaults() {
    setDefaultValue(META_SYNC_TABLE_NAME);
  }
}

View File

@@ -0,0 +1,72 @@
package org.apache.hudi.sync.common.util;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.sync.common.AbstractSyncTool;
import org.apache.hudi.sync.common.HoodieSyncConfig;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import java.util.Properties;
/**
* Helper class for syncing Hudi commit data with external metastores.
*/
public class SyncUtilHelpers {
  private static final Logger LOG = LogManager.getLogger(SyncUtilHelpers.class);

  /**
   * Create an instance of an implementation of {@link AbstractSyncTool} that will sync all the relevant meta information
   * with an external metastore such as Hive etc. to ensure Hoodie tables can be queried or read via external systems.
   *
   * @param metaSyncFQCN   The class that implements the sync of the metadata.
   * @param props          property map.
   * @param hadoopConfig   Hadoop confs.
   * @param fs             Filesystem used.
   * @param targetBasePath The target base path that contains the hoodie table.
   * @param baseFileFormat The file format used by the hoodie table; when {@code null} the value already present in
   *                       {@code props}, or else the config default, is used.
   * @throws HoodieException if the sync tool cannot be instantiated or the sync itself fails.
   */
  public static void runHoodieMetaSync(String metaSyncFQCN,
                                       TypedProperties props,
                                       Configuration hadoopConfig,
                                       FileSystem fs,
                                       String targetBasePath,
                                       String baseFileFormat) {
    try {
      instantiateMetaSyncTool(metaSyncFQCN, props, hadoopConfig, fs, targetBasePath, baseFileFormat).syncHoodieTable();
    } catch (Throwable e) {
      throw new HoodieException("Could not sync using the meta sync class " + metaSyncFQCN, e);
    }
  }

  /**
   * Reflectively builds the sync tool named by {@code metaSyncFQCN}.
   *
   * <p>The caller's properties are copied, then the base path and (if given) the base file format are written
   * into the copy. A constructor taking {@code (TypedProperties, Configuration, FileSystem)} is preferred;
   * when the class does not declare one, the deprecated {@code (Properties, FileSystem)} constructor is tried.
   *
   * @param metaSyncFQCN   fully qualified class name of the sync tool.
   * @param props          property map; not modified.
   * @param hadoopConfig   Hadoop confs passed to the preferred constructor.
   * @param fs             Filesystem passed to either constructor.
   * @param targetBasePath base path stored under {@link HoodieSyncConfig#META_SYNC_BASE_PATH}.
   * @param baseFileFormat base file format stored under {@link HoodieSyncConfig#META_SYNC_BASE_FILE_FORMAT};
   *                       skipped when {@code null}.
   * @return the instantiated sync tool.
   * @throws HoodieException if the fallback constructor cannot be loaded.
   */
  static AbstractSyncTool instantiateMetaSyncTool(String metaSyncFQCN,
                                                  TypedProperties props,
                                                  Configuration hadoopConfig,
                                                  FileSystem fs,
                                                  String targetBasePath,
                                                  String baseFileFormat) {
    TypedProperties properties = new TypedProperties();
    properties.putAll(props);
    properties.put(HoodieSyncConfig.META_SYNC_BASE_PATH.key(), targetBasePath);
    // Properties is Hashtable-backed and rejects null values; leaving the key untouched lets the
    // caller-supplied or default base file format apply instead of failing with an NPE.
    if (baseFileFormat != null) {
      properties.put(HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT.key(), baseFileFormat);
    }

    Class<?>[] preferredSignature = {TypedProperties.class, Configuration.class, FileSystem.class};
    if (ReflectionUtils.hasConstructor(metaSyncFQCN, preferredSignature)) {
      return (AbstractSyncTool) ReflectionUtils.loadClass(metaSyncFQCN, preferredSignature,
          properties, hadoopConfig, fs);
    }

    LOG.warn("Falling back to deprecated constructor for class: " + metaSyncFQCN);
    try {
      return (AbstractSyncTool) ReflectionUtils.loadClass(metaSyncFQCN,
          new Class<?>[] {Properties.class, FileSystem.class}, properties, fs);
    } catch (Throwable t) {
      throw new HoodieException("Could not load meta sync class " + metaSyncFQCN, t);
    }
  }
}

View File

@@ -0,0 +1,124 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.sync.common.util;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.sync.common.AbstractSyncTool;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.util.Properties;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
public class TestSyncUtilHelpers {
  private static final String BASE_PATH = "/tmp/test";
  private static final String BASE_FORMAT = "PARQUET";

  private Configuration hadoopConf;
  private FileSystem fileSystem;

  /** Resolves a file system for the test base path and reuses its configuration. */
  @BeforeEach
  public void setUp() throws IOException {
    Configuration seedConf = new Configuration();
    fileSystem = FSUtils.getFs(BASE_PATH, seedConf);
    hadoopConf = fileSystem.getConf();
  }

  /** A class with the current three-argument constructor is instantiated directly. */
  @Test
  public void testCreateValidSyncClass() {
    AbstractSyncTool created = instantiate(ValidMetaSyncClass.class.getName(), new TypedProperties());
    assertTrue(created instanceof ValidMetaSyncClass);
  }

  /**
   * Ensure it still works for the deprecated constructor of {@link AbstractSyncTool}
   * as we implemented the fallback.
   */
  @Test
  public void testCreateDeprecatedSyncClass() {
    Properties plainProps = new Properties();
    AbstractSyncTool created =
        instantiate(DeprecatedMetaSyncClass.class.getName(), new TypedProperties(plainProps));
    assertTrue(created instanceof DeprecatedMetaSyncClass);
  }

  /** A class offering neither supported constructor is rejected with a descriptive message. */
  @Test
  public void testCreateInvalidSyncClass() {
    String invalidClassName = InvalidSyncClass.class.getName();
    HoodieException thrown = assertThrows(HoodieException.class,
        () -> instantiate(invalidClassName, new TypedProperties()));

    String expectedMessage = "Could not load meta sync class " + invalidClassName;
    assertTrue(thrown.getMessage().contains(expectedMessage));
  }

  /** Calls the helper under test with this fixture's configuration, file system, path and format. */
  private AbstractSyncTool instantiate(String syncClassName, TypedProperties props) {
    return SyncUtilHelpers.instantiateMetaSyncTool(
        syncClassName, props, hadoopConf, fileSystem, BASE_PATH, BASE_FORMAT);
  }

  /** Sync tool exposing the current constructor signature. */
  public static class ValidMetaSyncClass extends AbstractSyncTool {
    public ValidMetaSyncClass(TypedProperties props, Configuration conf, FileSystem fs) {
      super(props, conf, fs);
    }

    @Override
    public void syncHoodieTable() {
      throw new HoodieException("Method unimplemented as its a test class");
    }
  }

  /** Sync tool exposing only the deprecated constructor signature. */
  public static class DeprecatedMetaSyncClass extends AbstractSyncTool {
    public DeprecatedMetaSyncClass(Properties props, FileSystem fileSystem) {
      super(props, fileSystem);
    }

    @Override
    public void syncHoodieTable() {
      throw new HoodieException("Method unimplemented as its a test class");
    }
  }

  /** Not a sync tool: its only constructor matches neither supported signature. */
  public static class InvalidSyncClass {
    public InvalidSyncClass(Properties props) {
    }
  }
}

View File

@@ -0,0 +1,29 @@
###
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
###
log4j.rootLogger=WARN, CONSOLE
log4j.logger.org.apache.hudi=DEBUG
# CONSOLE is set to be a ConsoleAppender.
log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender
# CONSOLE uses PatternLayout.
log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout
log4j.appender.CONSOLE.layout.ConversionPattern=[%-5p] %d %c %x - %m%n
log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter
log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true
log4j.appender.CONSOLE.filter.a.LevelMin=WARN
log4j.appender.CONSOLE.filter.a.LevelMax=FATAL

View File

@@ -0,0 +1,30 @@
###
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
###
log4j.rootLogger=WARN, CONSOLE
log4j.logger.org.apache=INFO
log4j.logger.org.apache.hudi=DEBUG
# CONSOLE is set to be a ConsoleAppender.
log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender
# CONSOLE uses PatternLayout.
log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout
log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter
log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true
log4j.appender.CONSOLE.filter.a.LevelMin=WARN
log4j.appender.CONSOLE.filter.a.LevelMax=FATAL