[HUDI-2883] Refactor hive sync tool / config to use reflection and standardize configs (#4175)
- Refactor hive sync tool / config to use reflection and standardize configs Co-authored-by: sivabalan <n.siva.b@gmail.com> Co-authored-by: Rajesh Mahindra <rmahindra@Rajeshs-MacBook-Pro.local> Co-authored-by: Raymond Xu <2701446+xushiyan@users.noreply.github.com>
This commit is contained in:
@@ -23,6 +23,8 @@ import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat;
|
||||
import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe;
|
||||
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.HoodieFileFormat;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
@@ -41,7 +43,6 @@ import org.apache.parquet.schema.MessageType;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Properties;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
@@ -63,8 +64,8 @@ public class DLASyncTool extends AbstractSyncTool {
|
||||
private final String snapshotTableName;
|
||||
private final Option<String> roTableTableName;
|
||||
|
||||
public DLASyncTool(Properties properties, FileSystem fs) {
|
||||
super(properties, fs);
|
||||
public DLASyncTool(TypedProperties properties, Configuration conf, FileSystem fs) {
|
||||
super(properties, conf, fs);
|
||||
this.hoodieDLAClient = new HoodieDLAClient(Utils.propertiesToConfig(properties), fs);
|
||||
this.cfg = Utils.propertiesToConfig(properties);
|
||||
switch (hoodieDLAClient.getTableType()) {
|
||||
@@ -205,7 +206,8 @@ public class DLASyncTool extends AbstractSyncTool {
|
||||
cmd.usage();
|
||||
System.exit(1);
|
||||
}
|
||||
FileSystem fs = FSUtils.getFs(cfg.basePath, new Configuration());
|
||||
new DLASyncTool(Utils.configToProperties(cfg), fs).syncHoodieTable();
|
||||
Configuration hadoopConf = new Configuration();
|
||||
FileSystem fs = FSUtils.getFs(cfg.basePath, hadoopConf);
|
||||
new DLASyncTool(Utils.configToProperties(cfg), hadoopConf, fs).syncHoodieTable();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,12 +18,12 @@
|
||||
|
||||
package org.apache.hudi.dla.util;
|
||||
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.common.util.StringUtils;
|
||||
import org.apache.hudi.dla.DLASyncConfig;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Properties;
|
||||
|
||||
public class Utils {
|
||||
public static String DLA_DATABASE_OPT_KEY = "hoodie.datasource.dla_sync.database";
|
||||
@@ -39,8 +39,8 @@ public class Utils {
|
||||
public static String DLA_SKIP_RT_SYNC = "hoodie.datasource.dla_sync.skip_rt_sync";
|
||||
public static String DLA_SYNC_HIVE_STYLE_PARTITIONING = "hoodie.datasource.dla_sync.hive.style.partitioning";
|
||||
|
||||
public static Properties configToProperties(DLASyncConfig cfg) {
|
||||
Properties properties = new Properties();
|
||||
public static TypedProperties configToProperties(DLASyncConfig cfg) {
|
||||
TypedProperties properties = new TypedProperties();
|
||||
properties.put(DLA_DATABASE_OPT_KEY, cfg.databaseName);
|
||||
properties.put(DLA_TABLE_OPT_KEY, cfg.tableName);
|
||||
properties.put(DLA_USER_OPT_KEY, cfg.dlaUser);
|
||||
@@ -54,7 +54,7 @@ public class Utils {
|
||||
return properties;
|
||||
}
|
||||
|
||||
public static DLASyncConfig propertiesToConfig(Properties properties) {
|
||||
public static DLASyncConfig propertiesToConfig(TypedProperties properties) {
|
||||
DLASyncConfig config = new DLASyncConfig();
|
||||
config.databaseName = properties.getProperty(DLA_DATABASE_OPT_KEY);
|
||||
config.tableName = properties.getProperty(DLA_TABLE_OPT_KEY);
|
||||
|
||||
@@ -18,27 +18,16 @@
|
||||
|
||||
package org.apache.hudi.hive;
|
||||
|
||||
import org.apache.hudi.common.config.HoodieMetadataConfig;
|
||||
import org.apache.hudi.common.config.ConfigProperty;
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.sync.common.HoodieSyncConfig;
|
||||
|
||||
import com.beust.jcommander.Parameter;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Configs needed to sync data into Hive.
|
||||
* Configs needed to sync data into the Hive Metastore.
|
||||
*/
|
||||
public class HiveSyncConfig implements Serializable {
|
||||
|
||||
@Parameter(names = {"--database"}, description = "name of the target database in Hive", required = true)
|
||||
public String databaseName;
|
||||
|
||||
@Parameter(names = {"--table"}, description = "name of the target table in Hive", required = true)
|
||||
public String tableName;
|
||||
|
||||
@Parameter(names = {"--base-file-format"}, description = "Format of the base files (PARQUET (or) HFILE)")
|
||||
public String baseFileFormat = "PARQUET";
|
||||
public class HiveSyncConfig extends HoodieSyncConfig {
|
||||
|
||||
@Parameter(names = {"--user"}, description = "Hive username")
|
||||
public String hiveUser;
|
||||
@@ -52,48 +41,31 @@ public class HiveSyncConfig implements Serializable {
|
||||
@Parameter(names = {"--metastore-uris"}, description = "Hive metastore uris")
|
||||
public String metastoreUris;
|
||||
|
||||
@Parameter(names = {"--base-path"}, description = "Basepath of hoodie table to sync", required = true)
|
||||
public String basePath;
|
||||
|
||||
@Parameter(names = "--partitioned-by", description = "Fields in the schema partitioned by")
|
||||
public List<String> partitionFields = new ArrayList<>();
|
||||
|
||||
@Parameter(names = "--partition-value-extractor", description = "Class which implements PartitionValueExtractor "
|
||||
+ "to extract the partition values from HDFS path")
|
||||
public String partitionValueExtractorClass = SlashEncodedDayPartitionValueExtractor.class.getName();
|
||||
|
||||
@Parameter(names = {"--assume-date-partitioning"}, description = "Assume standard yyyy/mm/dd partitioning, this"
|
||||
+ " exists to support backward compatibility. If you use hoodie 0.3.x, do not set this parameter")
|
||||
public Boolean assumeDatePartitioning = false;
|
||||
|
||||
@Parameter(names = {"--use-pre-apache-input-format"},
|
||||
description = "Use InputFormat under com.uber.hoodie package "
|
||||
+ "instead of org.apache.hudi package. Use this when you are in the process of migrating from "
|
||||
+ "com.uber.hoodie to org.apache.hudi. Stop using this after you migrated the table definition to "
|
||||
+ "org.apache.hudi input format.")
|
||||
public Boolean usePreApacheInputFormat = false;
|
||||
public Boolean usePreApacheInputFormat;
|
||||
|
||||
@Parameter(names = {"--bucket-spec"}, description = "bucket spec stored in metastore", required = false)
|
||||
public String bucketSpec;
|
||||
|
||||
@Deprecated
|
||||
@Parameter(names = {"--use-jdbc"}, description = "Hive jdbc connect url")
|
||||
public Boolean useJdbc = true;
|
||||
public Boolean useJdbc;
|
||||
|
||||
@Parameter(names = {"--sync-mode"}, description = "Mode to choose for Hive ops. Valid values are hms, jdbc and hiveql")
|
||||
public String syncMode;
|
||||
|
||||
@Parameter(names = {"--auto-create-database"}, description = "Auto create hive database")
|
||||
public Boolean autoCreateDatabase = true;
|
||||
public Boolean autoCreateDatabase;
|
||||
|
||||
@Parameter(names = {"--ignore-exceptions"}, description = "Ignore hive exceptions")
|
||||
public Boolean ignoreExceptions = false;
|
||||
public Boolean ignoreExceptions;
|
||||
|
||||
@Parameter(names = {"--skip-ro-suffix"}, description = "Skip the `_ro` suffix for Read optimized table, when registering")
|
||||
public Boolean skipROSuffix = false;
|
||||
|
||||
@Parameter(names = {"--use-file-listing-from-metadata"}, description = "Fetch file listing from Hudi's metadata")
|
||||
public Boolean useFileListingFromMetadata = HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS;
|
||||
public Boolean skipROSuffix;
|
||||
|
||||
@Parameter(names = {"--table-properties"}, description = "Table properties to hive table")
|
||||
public String tableProperties;
|
||||
@@ -106,64 +78,170 @@ public class HiveSyncConfig implements Serializable {
|
||||
|
||||
@Parameter(names = {"--support-timestamp"}, description = "'INT64' with original type TIMESTAMP_MICROS is converted to hive 'timestamp' type."
|
||||
+ "Disabled by default for backward compatibility.")
|
||||
public Boolean supportTimestamp = false;
|
||||
|
||||
@Parameter(names = {"--decode-partition"}, description = "Decode the partition value if the partition has encoded during writing")
|
||||
public Boolean decodePartition = false;
|
||||
public Boolean supportTimestamp;
|
||||
|
||||
@Parameter(names = {"--managed-table"}, description = "Create a managed table")
|
||||
public Boolean createManagedTable = false;
|
||||
public Boolean createManagedTable;
|
||||
|
||||
@Parameter(names = {"--batch-sync-num"}, description = "The number of partitions one batch when synchronous partitions to hive")
|
||||
public Integer batchSyncNum = 1000;
|
||||
public Integer batchSyncNum;
|
||||
|
||||
@Parameter(names = {"--spark-datasource"}, description = "Whether sync this table as spark data source table.")
|
||||
public Boolean syncAsSparkDataSourceTable = true;
|
||||
public Boolean syncAsSparkDataSourceTable;
|
||||
|
||||
@Parameter(names = {"--spark-schema-length-threshold"}, description = "The maximum length allowed in a single cell when storing additional schema information in Hive's metastore.")
|
||||
public int sparkSchemaLengthThreshold = 4000;
|
||||
public int sparkSchemaLengthThreshold;
|
||||
|
||||
@Parameter(names = {"--with-operation-field"}, description = "Whether to include the '_hoodie_operation' field in the metadata fields")
|
||||
public Boolean withOperationField = false;
|
||||
|
||||
@Parameter(names = {"--conditional-sync"}, description = "If true, only sync on conditions like schema change or partition change.")
|
||||
public Boolean isConditionalSync = false;
|
||||
|
||||
@Parameter(names = {"--spark-version"}, description = "The spark version", required = false)
|
||||
public String sparkVersion;
|
||||
|
||||
@Parameter(names = {"--sync-comment"}, description = "synchronize table comments to hive")
|
||||
public boolean syncComment = false;
|
||||
|
||||
// enhance the similar function in child class
|
||||
public static HiveSyncConfig copy(HiveSyncConfig cfg) {
|
||||
HiveSyncConfig newConfig = new HiveSyncConfig();
|
||||
newConfig.basePath = cfg.basePath;
|
||||
newConfig.assumeDatePartitioning = cfg.assumeDatePartitioning;
|
||||
newConfig.databaseName = cfg.databaseName;
|
||||
newConfig.hivePass = cfg.hivePass;
|
||||
newConfig.hiveUser = cfg.hiveUser;
|
||||
newConfig.partitionFields = cfg.partitionFields;
|
||||
newConfig.partitionValueExtractorClass = cfg.partitionValueExtractorClass;
|
||||
newConfig.jdbcUrl = cfg.jdbcUrl;
|
||||
newConfig.metastoreUris = cfg.metastoreUris;
|
||||
newConfig.tableName = cfg.tableName;
|
||||
newConfig.bucketSpec = cfg.bucketSpec;
|
||||
newConfig.usePreApacheInputFormat = cfg.usePreApacheInputFormat;
|
||||
newConfig.useFileListingFromMetadata = cfg.useFileListingFromMetadata;
|
||||
newConfig.supportTimestamp = cfg.supportTimestamp;
|
||||
newConfig.decodePartition = cfg.decodePartition;
|
||||
newConfig.tableProperties = cfg.tableProperties;
|
||||
newConfig.serdeProperties = cfg.serdeProperties;
|
||||
newConfig.createManagedTable = cfg.createManagedTable;
|
||||
newConfig.batchSyncNum = cfg.batchSyncNum;
|
||||
newConfig.syncAsSparkDataSourceTable = cfg.syncAsSparkDataSourceTable;
|
||||
newConfig.sparkSchemaLengthThreshold = cfg.sparkSchemaLengthThreshold;
|
||||
newConfig.withOperationField = cfg.withOperationField;
|
||||
newConfig.isConditionalSync = cfg.isConditionalSync;
|
||||
newConfig.sparkVersion = cfg.sparkVersion;
|
||||
newConfig.syncComment = cfg.syncComment;
|
||||
return newConfig;
|
||||
// HIVE SYNC SPECIFIC CONFIGS
|
||||
// NOTE: DO NOT USE uppercase for the keys as they are internally lower-cased. Using upper-cases causes
|
||||
// unexpected issues with config getting reset
|
||||
public static final ConfigProperty<String> HIVE_SYNC_ENABLED = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.enable")
|
||||
.defaultValue("false")
|
||||
.withDocumentation("When set to true, register/sync the table to Apache Hive metastore.");
|
||||
|
||||
public static final ConfigProperty<String> HIVE_USER = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.username")
|
||||
.defaultValue("hive")
|
||||
.withDocumentation("hive user name to use");
|
||||
|
||||
public static final ConfigProperty<String> HIVE_PASS = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.password")
|
||||
.defaultValue("hive")
|
||||
.withDocumentation("hive password to use");
|
||||
|
||||
public static final ConfigProperty<String> HIVE_URL = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.jdbcurl")
|
||||
.defaultValue("jdbc:hive2://localhost:10000")
|
||||
.withDocumentation("Hive metastore url");
|
||||
|
||||
public static final ConfigProperty<String> HIVE_USE_PRE_APACHE_INPUT_FORMAT = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.use_pre_apache_input_format")
|
||||
.defaultValue("false")
|
||||
.withDocumentation("Flag to choose InputFormat under com.uber.hoodie package instead of org.apache.hudi package. "
|
||||
+ "Use this when you are in the process of migrating from "
|
||||
+ "com.uber.hoodie to org.apache.hudi. Stop using this after you migrated the table definition to org.apache.hudi input format");
|
||||
|
||||
/**
|
||||
* @deprecated Use {@link #HIVE_SYNC_MODE} instead of this config from 0.9.0
|
||||
*/
|
||||
@Deprecated
|
||||
public static final ConfigProperty<String> HIVE_USE_JDBC = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.use_jdbc")
|
||||
.defaultValue("true")
|
||||
.deprecatedAfter("0.9.0")
|
||||
.withDocumentation("Use JDBC when hive synchronization is enabled");
|
||||
|
||||
public static final ConfigProperty<String> METASTORE_URIS = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.metastore.uris")
|
||||
.defaultValue("thrift://localhost:9083")
|
||||
.withDocumentation("Hive metastore url");
|
||||
|
||||
public static final ConfigProperty<String> HIVE_AUTO_CREATE_DATABASE = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.auto_create_database")
|
||||
.defaultValue("true")
|
||||
.withDocumentation("Auto create hive database if does not exists");
|
||||
|
||||
public static final ConfigProperty<String> HIVE_IGNORE_EXCEPTIONS = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.ignore_exceptions")
|
||||
.defaultValue("false")
|
||||
.withDocumentation("Ignore exceptions when syncing with Hive.");
|
||||
|
||||
public static final ConfigProperty<String> HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.skip_ro_suffix")
|
||||
.defaultValue("false")
|
||||
.withDocumentation("Skip the _ro suffix for Read optimized table, when registering");
|
||||
|
||||
public static final ConfigProperty<String> HIVE_SUPPORT_TIMESTAMP_TYPE = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.support_timestamp")
|
||||
.defaultValue("false")
|
||||
.withDocumentation("‘INT64’ with original type TIMESTAMP_MICROS is converted to hive ‘timestamp’ type. "
|
||||
+ "Disabled by default for backward compatibility.");
|
||||
|
||||
public static final ConfigProperty<String> HIVE_TABLE_PROPERTIES = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.table_properties")
|
||||
.noDefaultValue()
|
||||
.withDocumentation("Additional properties to store with table.");
|
||||
|
||||
public static final ConfigProperty<String> HIVE_TABLE_SERDE_PROPERTIES = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.serde_properties")
|
||||
.noDefaultValue()
|
||||
.withDocumentation("Serde properties to hive table.");
|
||||
|
||||
public static final ConfigProperty<String> HIVE_SYNC_AS_DATA_SOURCE_TABLE = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.sync_as_datasource")
|
||||
.defaultValue("true")
|
||||
.withDocumentation("");
|
||||
|
||||
public static final ConfigProperty<Integer> HIVE_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.schema_string_length_thresh")
|
||||
.defaultValue(4000)
|
||||
.withDocumentation("");
|
||||
|
||||
// Create table as managed table
|
||||
public static final ConfigProperty<Boolean> HIVE_CREATE_MANAGED_TABLE = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.create_managed_table")
|
||||
.defaultValue(false)
|
||||
.withDocumentation("Whether to sync the table as managed table.");
|
||||
|
||||
public static final ConfigProperty<Integer> HIVE_BATCH_SYNC_PARTITION_NUM = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.batch_num")
|
||||
.defaultValue(1000)
|
||||
.withDocumentation("The number of partitions one batch when synchronous partitions to hive.");
|
||||
|
||||
public static final ConfigProperty<String> HIVE_SYNC_MODE = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.mode")
|
||||
.noDefaultValue()
|
||||
.withDocumentation("Mode to choose for Hive ops. Valid values are hms, jdbc and hiveql.");
|
||||
|
||||
public static final ConfigProperty<Boolean> HIVE_SYNC_BUCKET_SYNC = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.bucket_sync")
|
||||
.defaultValue(false)
|
||||
.withDocumentation("Whether sync hive metastore bucket specification when using bucket index."
|
||||
+ "The specification is 'CLUSTERED BY (trace_id) SORTED BY (trace_id ASC) INTO 65536 BUCKETS'");
|
||||
|
||||
public static final ConfigProperty<String> HIVE_SYNC_BUCKET_SYNC_SPEC = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.bucket_sync_spec")
|
||||
.defaultValue("")
|
||||
.withDocumentation("The hive metastore bucket specification when using bucket index."
|
||||
+ "The specification is 'CLUSTERED BY (trace_id) SORTED BY (trace_id ASC) INTO 65536 BUCKETS'");
|
||||
|
||||
public static final ConfigProperty<String> HIVE_SYNC_COMMENT = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.sync_comment")
|
||||
.defaultValue("false")
|
||||
.withDocumentation("Whether to sync the table column comments while syncing the table.");
|
||||
|
||||
public HiveSyncConfig() {
|
||||
this(new TypedProperties());
|
||||
}
|
||||
|
||||
public HiveSyncConfig(TypedProperties props) {
|
||||
super(props);
|
||||
this.hiveUser = getStringOrDefault(HIVE_USER);
|
||||
this.hivePass = getStringOrDefault(HIVE_PASS);
|
||||
this.jdbcUrl = getStringOrDefault(HIVE_URL);
|
||||
this.usePreApacheInputFormat = getBooleanOrDefault(HIVE_USE_PRE_APACHE_INPUT_FORMAT);
|
||||
this.useJdbc = getBooleanOrDefault(HIVE_USE_JDBC);
|
||||
this.metastoreUris = getStringOrDefault(METASTORE_URIS);
|
||||
this.syncMode = getString(HIVE_SYNC_MODE);
|
||||
this.autoCreateDatabase = getBooleanOrDefault(HIVE_AUTO_CREATE_DATABASE);
|
||||
this.ignoreExceptions = getBooleanOrDefault(HIVE_IGNORE_EXCEPTIONS);
|
||||
this.skipROSuffix = getBooleanOrDefault(HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE);
|
||||
this.tableProperties = getString(HIVE_TABLE_PROPERTIES);
|
||||
this.serdeProperties = getString(HIVE_TABLE_SERDE_PROPERTIES);
|
||||
this.supportTimestamp = getBooleanOrDefault(HIVE_SUPPORT_TIMESTAMP_TYPE);
|
||||
this.batchSyncNum = getIntOrDefault(HIVE_BATCH_SYNC_PARTITION_NUM);
|
||||
this.syncAsSparkDataSourceTable = getBooleanOrDefault(HIVE_SYNC_AS_DATA_SOURCE_TABLE);
|
||||
this.sparkSchemaLengthThreshold = getIntOrDefault(HIVE_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD);
|
||||
this.createManagedTable = getBooleanOrDefault(HIVE_CREATE_MANAGED_TABLE);
|
||||
this.bucketSpec = getStringOrDefault(HIVE_SYNC_BUCKET_SYNC_SPEC);
|
||||
this.syncComment = getBooleanOrDefault(HIVE_SYNC_COMMENT);
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -197,6 +275,7 @@ public class HiveSyncConfig implements Serializable {
|
||||
+ ", sparkSchemaLengthThreshold=" + sparkSchemaLengthThreshold
|
||||
+ ", withOperationField=" + withOperationField
|
||||
+ ", isConditionalSync=" + isConditionalSync
|
||||
+ ", sparkVersion=" + sparkVersion
|
||||
+ ", syncComment=" + syncComment
|
||||
+ '}';
|
||||
}
|
||||
|
||||
@@ -25,6 +25,8 @@ import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.hive.conf.HiveConf;
|
||||
import org.apache.hadoop.hive.metastore.api.FieldSchema;
|
||||
import org.apache.hadoop.hive.metastore.api.Partition;
|
||||
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.HoodieFileFormat;
|
||||
import org.apache.hudi.common.model.HoodieTableType;
|
||||
@@ -70,40 +72,53 @@ public class HiveSyncTool extends AbstractSyncTool {
|
||||
public static final String SUFFIX_SNAPSHOT_TABLE = "_rt";
|
||||
public static final String SUFFIX_READ_OPTIMIZED_TABLE = "_ro";
|
||||
|
||||
protected final HiveSyncConfig cfg;
|
||||
protected final HiveSyncConfig hiveSyncConfig;
|
||||
protected HoodieHiveClient hoodieHiveClient = null;
|
||||
protected String snapshotTableName = null;
|
||||
protected Option<String> roTableName = null;
|
||||
|
||||
public HiveSyncTool(HiveSyncConfig cfg, HiveConf configuration, FileSystem fs) {
|
||||
super(configuration.getAllProperties(), fs);
|
||||
public HiveSyncTool(TypedProperties props, Configuration conf, FileSystem fs) {
|
||||
super(props, conf, fs);
|
||||
this.hiveSyncConfig = new HiveSyncConfig(props);
|
||||
init(hiveSyncConfig, new HiveConf(conf, HiveConf.class));
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
public HiveSyncTool(HiveSyncConfig hiveSyncConfig, HiveConf hiveConf, FileSystem fs) {
|
||||
super(hiveSyncConfig.getProps(), hiveConf, fs);
|
||||
this.hiveSyncConfig = hiveSyncConfig;
|
||||
init(hiveSyncConfig, hiveConf);
|
||||
}
|
||||
|
||||
private void init(HiveSyncConfig hiveSyncConfig, HiveConf hiveConf) {
|
||||
try {
|
||||
this.hoodieHiveClient = new HoodieHiveClient(cfg, configuration, fs);
|
||||
if (StringUtils.isNullOrEmpty(hiveConf.get(HiveConf.ConfVars.METASTOREURIS.varname))) {
|
||||
hiveConf.set(HiveConf.ConfVars.METASTOREURIS.varname, hiveSyncConfig.metastoreUris);
|
||||
}
|
||||
this.hoodieHiveClient = new HoodieHiveClient(hiveSyncConfig, hiveConf, fs);
|
||||
} catch (RuntimeException e) {
|
||||
if (cfg.ignoreExceptions) {
|
||||
if (hiveSyncConfig.ignoreExceptions) {
|
||||
LOG.error("Got runtime exception when hive syncing, but continuing as ignoreExceptions config is set ", e);
|
||||
} else {
|
||||
throw new HoodieHiveSyncException("Got runtime exception when hive syncing", e);
|
||||
}
|
||||
}
|
||||
|
||||
this.cfg = cfg;
|
||||
// Set partitionFields to empty, when the NonPartitionedExtractor is used
|
||||
if (NonPartitionedExtractor.class.getName().equals(cfg.partitionValueExtractorClass)) {
|
||||
if (NonPartitionedExtractor.class.getName().equals(hiveSyncConfig.partitionValueExtractorClass)) {
|
||||
LOG.warn("Set partitionFields to empty, since the NonPartitionedExtractor is used");
|
||||
cfg.partitionFields = new ArrayList<>();
|
||||
hiveSyncConfig.partitionFields = new ArrayList<>();
|
||||
}
|
||||
if (hoodieHiveClient != null) {
|
||||
switch (hoodieHiveClient.getTableType()) {
|
||||
case COPY_ON_WRITE:
|
||||
this.snapshotTableName = cfg.tableName;
|
||||
this.snapshotTableName = hiveSyncConfig.tableName;
|
||||
this.roTableName = Option.empty();
|
||||
break;
|
||||
case MERGE_ON_READ:
|
||||
this.snapshotTableName = cfg.tableName + SUFFIX_SNAPSHOT_TABLE;
|
||||
this.roTableName = cfg.skipROSuffix ? Option.of(cfg.tableName) :
|
||||
Option.of(cfg.tableName + SUFFIX_READ_OPTIMIZED_TABLE);
|
||||
this.snapshotTableName = hiveSyncConfig.tableName + SUFFIX_SNAPSHOT_TABLE;
|
||||
this.roTableName = hiveSyncConfig.skipROSuffix ? Option.of(hiveSyncConfig.tableName) :
|
||||
Option.of(hiveSyncConfig.tableName + SUFFIX_READ_OPTIMIZED_TABLE);
|
||||
break;
|
||||
default:
|
||||
LOG.error("Unknown table type " + hoodieHiveClient.getTableType());
|
||||
@@ -116,10 +131,13 @@ public class HiveSyncTool extends AbstractSyncTool {
|
||||
public void syncHoodieTable() {
|
||||
try {
|
||||
if (hoodieHiveClient != null) {
|
||||
LOG.info("Syncing target hoodie table with hive table(" + hiveSyncConfig.tableName + "). Hive metastore URL :"
|
||||
+ hiveSyncConfig.jdbcUrl + ", basePath :" + hiveSyncConfig.basePath);
|
||||
|
||||
doSync();
|
||||
}
|
||||
} catch (RuntimeException re) {
|
||||
throw new HoodieException("Got runtime exception when hive syncing " + cfg.tableName, re);
|
||||
throw new HoodieException("Got runtime exception when hive syncing " + hiveSyncConfig.tableName, re);
|
||||
} finally {
|
||||
if (hoodieHiveClient != null) {
|
||||
hoodieHiveClient.close();
|
||||
@@ -150,18 +168,19 @@ public class HiveSyncTool extends AbstractSyncTool {
|
||||
+ " of type " + hoodieHiveClient.getTableType());
|
||||
|
||||
// check if the database exists else create it
|
||||
if (cfg.autoCreateDatabase) {
|
||||
if (hiveSyncConfig.autoCreateDatabase) {
|
||||
try {
|
||||
if (!hoodieHiveClient.doesDataBaseExist(cfg.databaseName)) {
|
||||
hoodieHiveClient.createDatabase(cfg.databaseName);
|
||||
if (!hoodieHiveClient.doesDataBaseExist(hiveSyncConfig.databaseName)) {
|
||||
hoodieHiveClient.createDatabase(hiveSyncConfig.databaseName);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
// this is harmless since table creation will fail anyways, creation of DB is needed for in-memory testing
|
||||
LOG.warn("Unable to create database", e);
|
||||
}
|
||||
} else {
|
||||
if (!hoodieHiveClient.doesDataBaseExist(cfg.databaseName)) {
|
||||
throw new HoodieHiveSyncException("hive database does not exist " + cfg.databaseName);
|
||||
if (!hoodieHiveClient.doesDataBaseExist(hiveSyncConfig.databaseName)) {
|
||||
LOG.error("Hive database does not exist " + hiveSyncConfig.databaseName);
|
||||
throw new HoodieHiveSyncException("hive database does not exist " + hiveSyncConfig.databaseName);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -181,7 +200,7 @@ public class HiveSyncTool extends AbstractSyncTool {
|
||||
if (hoodieHiveClient.isBootstrap()
|
||||
&& hoodieHiveClient.getTableType() == HoodieTableType.MERGE_ON_READ
|
||||
&& !readAsOptimized) {
|
||||
cfg.syncAsSparkDataSourceTable = false;
|
||||
hiveSyncConfig.syncAsSparkDataSourceTable = false;
|
||||
}
|
||||
|
||||
// Sync schema if needed
|
||||
@@ -200,7 +219,7 @@ public class HiveSyncTool extends AbstractSyncTool {
|
||||
// Sync the partitions if needed
|
||||
boolean partitionsChanged = syncPartitions(tableName, writtenPartitionsSince, isDropPartition);
|
||||
boolean meetSyncConditions = schemaChanged || partitionsChanged;
|
||||
if (!cfg.isConditionalSync || meetSyncConditions) {
|
||||
if (!hiveSyncConfig.isConditionalSync || meetSyncConditions) {
|
||||
hoodieHiveClient.updateLastCommitTimeSynced(tableName);
|
||||
}
|
||||
LOG.info("Sync complete for " + tableName);
|
||||
@@ -216,10 +235,10 @@ public class HiveSyncTool extends AbstractSyncTool {
|
||||
private boolean syncSchema(String tableName, boolean tableExists, boolean useRealTimeInputFormat,
|
||||
boolean readAsOptimized, MessageType schema) {
|
||||
// Append spark table properties & serde properties
|
||||
Map<String, String> tableProperties = ConfigUtils.toMap(cfg.tableProperties);
|
||||
Map<String, String> serdeProperties = ConfigUtils.toMap(cfg.serdeProperties);
|
||||
if (cfg.syncAsSparkDataSourceTable) {
|
||||
Map<String, String> sparkTableProperties = getSparkTableProperties(cfg.sparkSchemaLengthThreshold, schema);
|
||||
Map<String, String> tableProperties = ConfigUtils.toMap(hiveSyncConfig.tableProperties);
|
||||
Map<String, String> serdeProperties = ConfigUtils.toMap(hiveSyncConfig.serdeProperties);
|
||||
if (hiveSyncConfig.syncAsSparkDataSourceTable) {
|
||||
Map<String, String> sparkTableProperties = getSparkTableProperties(hiveSyncConfig.sparkSchemaLengthThreshold, schema);
|
||||
Map<String, String> sparkSerdeProperties = getSparkSerdeProperties(readAsOptimized);
|
||||
tableProperties.putAll(sparkTableProperties);
|
||||
serdeProperties.putAll(sparkSerdeProperties);
|
||||
@@ -228,10 +247,10 @@ public class HiveSyncTool extends AbstractSyncTool {
|
||||
// Check and sync schema
|
||||
if (!tableExists) {
|
||||
LOG.info("Hive table " + tableName + " is not found. Creating it");
|
||||
HoodieFileFormat baseFileFormat = HoodieFileFormat.valueOf(cfg.baseFileFormat.toUpperCase());
|
||||
HoodieFileFormat baseFileFormat = HoodieFileFormat.valueOf(hiveSyncConfig.baseFileFormat.toUpperCase());
|
||||
String inputFormatClassName = HoodieInputFormatUtils.getInputFormatClassName(baseFileFormat, useRealTimeInputFormat);
|
||||
|
||||
if (baseFileFormat.equals(HoodieFileFormat.PARQUET) && cfg.usePreApacheInputFormat) {
|
||||
if (baseFileFormat.equals(HoodieFileFormat.PARQUET) && hiveSyncConfig.usePreApacheInputFormat) {
|
||||
// Parquet input format had an InputFormat class visible under the old naming scheme.
|
||||
inputFormatClassName = useRealTimeInputFormat
|
||||
? com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat.class.getName()
|
||||
@@ -250,12 +269,12 @@ public class HiveSyncTool extends AbstractSyncTool {
|
||||
} else {
|
||||
// Check if the table schema has evolved
|
||||
Map<String, String> tableSchema = hoodieHiveClient.getTableSchema(tableName);
|
||||
SchemaDifference schemaDiff = HiveSchemaUtil.getSchemaDifference(schema, tableSchema, cfg.partitionFields, cfg.supportTimestamp);
|
||||
SchemaDifference schemaDiff = HiveSchemaUtil.getSchemaDifference(schema, tableSchema, hiveSyncConfig.partitionFields, hiveSyncConfig.supportTimestamp);
|
||||
if (!schemaDiff.isEmpty()) {
|
||||
LOG.info("Schema difference found for " + tableName);
|
||||
hoodieHiveClient.updateTableDefinition(tableName, schema);
|
||||
// Sync the table properties if the schema has changed
|
||||
if (cfg.tableProperties != null || cfg.syncAsSparkDataSourceTable) {
|
||||
if (hiveSyncConfig.tableProperties != null || hiveSyncConfig.syncAsSparkDataSourceTable) {
|
||||
hoodieHiveClient.updateTableProperties(tableName, tableProperties);
|
||||
LOG.info("Sync table properties for " + tableName + ", table properties is: " + tableProperties);
|
||||
}
|
||||
@@ -265,7 +284,7 @@ public class HiveSyncTool extends AbstractSyncTool {
|
||||
}
|
||||
}
|
||||
|
||||
if (cfg.syncComment) {
|
||||
if (hiveSyncConfig.syncComment) {
|
||||
Schema avroSchemaWithoutMetadataFields = hoodieHiveClient.getAvroSchemaWithoutMetadataFields();
|
||||
Map<String, String> newComments = avroSchemaWithoutMetadataFields.getFields()
|
||||
.stream().collect(Collectors.toMap(Schema.Field::name, field -> StringUtils.isNullOrEmpty(field.doc()) ? "" : field.doc()));
|
||||
@@ -290,7 +309,7 @@ public class HiveSyncTool extends AbstractSyncTool {
|
||||
// The following code refers to the spark code in
|
||||
// https://github.com/apache/spark/blob/master/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
|
||||
GroupType originGroupType = schema.asGroupType();
|
||||
List<String> partitionNames = cfg.partitionFields;
|
||||
List<String> partitionNames = hiveSyncConfig.partitionFields;
|
||||
List<Type> partitionCols = new ArrayList<>();
|
||||
List<Type> dataCols = new ArrayList<>();
|
||||
Map<String, Type> column2Field = new HashMap<>();
|
||||
@@ -319,8 +338,8 @@ public class HiveSyncTool extends AbstractSyncTool {
|
||||
|
||||
Map<String, String> sparkProperties = new HashMap<>();
|
||||
sparkProperties.put("spark.sql.sources.provider", "hudi");
|
||||
if (!StringUtils.isNullOrEmpty(cfg.sparkVersion)) {
|
||||
sparkProperties.put("spark.sql.create.version", cfg.sparkVersion);
|
||||
if (!StringUtils.isNullOrEmpty(hiveSyncConfig.sparkVersion)) {
|
||||
sparkProperties.put("spark.sql.create.version", hiveSyncConfig.sparkVersion);
|
||||
}
|
||||
// Split the schema string to multi-parts according the schemaLengthThreshold size.
|
||||
String schemaString = Parquet2SparkSchemaUtils.convertToSparkSchemaJson(reOrderedType);
|
||||
@@ -344,7 +363,7 @@ public class HiveSyncTool extends AbstractSyncTool {
|
||||
|
||||
private Map<String, String> getSparkSerdeProperties(boolean readAsOptimized) {
|
||||
Map<String, String> sparkSerdeProperties = new HashMap<>();
|
||||
sparkSerdeProperties.put("path", cfg.basePath);
|
||||
sparkSerdeProperties.put("path", hiveSyncConfig.basePath);
|
||||
sparkSerdeProperties.put(ConfigUtils.IS_QUERY_AS_RO_TABLE, String.valueOf(readAsOptimized));
|
||||
return sparkSerdeProperties;
|
||||
}
|
||||
|
||||
@@ -18,15 +18,24 @@
|
||||
|
||||
package org.apache.hudi.hive.replication;
|
||||
|
||||
import com.beust.jcommander.Parameter;
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.hive.HiveSyncConfig;
|
||||
|
||||
import com.beust.jcommander.Parameter;
|
||||
|
||||
public class GlobalHiveSyncConfig extends HiveSyncConfig {
|
||||
@Parameter(names = {"--replicated-timestamp"}, description = "Add globally replicated timestamp to enable consistent reads across clusters")
|
||||
public String globallyReplicatedTimeStamp;
|
||||
|
||||
public GlobalHiveSyncConfig() {
|
||||
}
|
||||
|
||||
public GlobalHiveSyncConfig(TypedProperties props) {
|
||||
super(props);
|
||||
}
|
||||
|
||||
public static GlobalHiveSyncConfig copy(GlobalHiveSyncConfig cfg) {
|
||||
GlobalHiveSyncConfig newConfig = new GlobalHiveSyncConfig();
|
||||
GlobalHiveSyncConfig newConfig = new GlobalHiveSyncConfig(cfg.getProps());
|
||||
newConfig.basePath = cfg.basePath;
|
||||
newConfig.assumeDatePartitioning = cfg.assumeDatePartitioning;
|
||||
newConfig.databaseName = cfg.databaseName;
|
||||
|
||||
@@ -48,9 +48,9 @@ public class GlobalHiveSyncTool extends HiveSyncTool {
|
||||
@Override
|
||||
protected void syncHoodieTable(String tableName, boolean useRealtimeInputFormat, boolean readAsOptimized) {
|
||||
super.syncHoodieTable(tableName, useRealtimeInputFormat, readAsOptimized);
|
||||
if (((GlobalHiveSyncConfig)cfg).globallyReplicatedTimeStamp != null) {
|
||||
if (((GlobalHiveSyncConfig) hiveSyncConfig).globallyReplicatedTimeStamp != null) {
|
||||
hoodieHiveClient.updateLastReplicatedTimeStamp(tableName,
|
||||
((GlobalHiveSyncConfig) cfg).globallyReplicatedTimeStamp);
|
||||
((GlobalHiveSyncConfig) hiveSyncConfig).globallyReplicatedTimeStamp);
|
||||
}
|
||||
LOG.info("Sync complete for " + tableName);
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -22,6 +22,7 @@ import org.apache.hudi.avro.HoodieAvroWriteSupport;
|
||||
import org.apache.hudi.common.bloom.BloomFilter;
|
||||
import org.apache.hudi.common.bloom.BloomFilterFactory;
|
||||
import org.apache.hudi.common.bloom.BloomFilterTypeCode;
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.HoodieAvroPayload;
|
||||
import org.apache.hudi.common.model.HoodieBaseFile;
|
||||
@@ -75,7 +76,6 @@ import java.time.ZonedDateTime;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.time.temporal.ChronoUnit;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
@@ -89,16 +89,21 @@ import static org.junit.jupiter.api.Assertions.fail;
|
||||
@SuppressWarnings("SameParameterValue")
|
||||
public class HiveTestUtil {
|
||||
|
||||
public static final String DB_NAME = "testdb";
|
||||
public static final String TABLE_NAME = "test1";
|
||||
public static String basePath;
|
||||
public static TypedProperties hiveSyncProps;
|
||||
public static HiveTestService hiveTestService;
|
||||
public static FileSystem fileSystem;
|
||||
public static QueryBasedDDLExecutor ddlExecutor;
|
||||
|
||||
private static ZooKeeperServer zkServer;
|
||||
private static HiveServer2 hiveServer;
|
||||
public static HiveTestService hiveTestService;
|
||||
private static ZookeeperTestService zkService;
|
||||
private static Configuration configuration;
|
||||
public static HiveSyncConfig hiveSyncConfig;
|
||||
private static HiveSyncConfig hiveSyncConfig;
|
||||
private static DateTimeFormatter dtfOut;
|
||||
public static FileSystem fileSystem;
|
||||
private static Set<String> createdTablesSet = new HashSet<>();
|
||||
public static QueryBasedDDLExecutor ddlExecutor;
|
||||
|
||||
public static void setUp() throws IOException, InterruptedException, HiveException, MetaException {
|
||||
configuration = new Configuration();
|
||||
@@ -112,16 +117,21 @@ public class HiveTestUtil {
|
||||
}
|
||||
fileSystem = FileSystem.get(configuration);
|
||||
|
||||
hiveSyncConfig = new HiveSyncConfig();
|
||||
hiveSyncConfig.jdbcUrl = hiveTestService.getJdbcHive2Url();
|
||||
hiveSyncConfig.hiveUser = "";
|
||||
hiveSyncConfig.hivePass = "";
|
||||
hiveSyncConfig.databaseName = "testdb";
|
||||
hiveSyncConfig.tableName = "test1";
|
||||
hiveSyncConfig.basePath = Files.createTempDirectory("hivesynctest" + Instant.now().toEpochMilli()).toUri().toString();
|
||||
hiveSyncConfig.assumeDatePartitioning = true;
|
||||
hiveSyncConfig.usePreApacheInputFormat = false;
|
||||
hiveSyncConfig.partitionFields = Collections.singletonList("datestr");
|
||||
basePath = Files.createTempDirectory("hivesynctest" + Instant.now().toEpochMilli()).toUri().toString();
|
||||
|
||||
hiveSyncProps = new TypedProperties();
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_URL.key(), hiveTestService.getJdbcHive2Url());
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_USER.key(), "");
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_PASS.key(), "");
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_DATABASE_NAME.key(), DB_NAME);
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_TABLE_NAME.key(), TABLE_NAME);
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_BASE_PATH.key(), basePath);
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_ASSUME_DATE_PARTITION.key(), "true");
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_USE_PRE_APACHE_INPUT_FORMAT.key(), "false");
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "datestr");
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_BATCH_SYNC_PARTITION_NUM.key(), "3");
|
||||
|
||||
hiveSyncConfig = new HiveSyncConfig(hiveSyncProps);
|
||||
|
||||
dtfOut = DateTimeFormatter.ofPattern("yyyy/MM/dd");
|
||||
ddlExecutor = new HiveQueryDDLExecutor(hiveSyncConfig, fileSystem, getHiveConf());
|
||||
@@ -138,18 +148,18 @@ public class HiveTestUtil {
|
||||
}
|
||||
|
||||
public static void clear() throws IOException, HiveException, MetaException {
|
||||
fileSystem.delete(new Path(hiveSyncConfig.basePath), true);
|
||||
fileSystem.delete(new Path(basePath), true);
|
||||
HoodieTableMetaClient.withPropertyBuilder()
|
||||
.setTableType(HoodieTableType.COPY_ON_WRITE)
|
||||
.setTableName(hiveSyncConfig.tableName)
|
||||
.setTableName(TABLE_NAME)
|
||||
.setPayloadClass(HoodieAvroPayload.class)
|
||||
.initTable(configuration, hiveSyncConfig.basePath);
|
||||
.initTable(configuration, basePath);
|
||||
|
||||
for (String tableName : createdTablesSet) {
|
||||
ddlExecutor.runSQL("drop table if exists " + tableName);
|
||||
}
|
||||
createdTablesSet.clear();
|
||||
ddlExecutor.runSQL("drop database if exists " + hiveSyncConfig.databaseName + " cascade");
|
||||
ddlExecutor.runSQL("drop database if exists " + DB_NAME + " cascade");
|
||||
}
|
||||
|
||||
public static HiveConf getHiveConf() {
|
||||
@@ -189,7 +199,7 @@ public class HiveTestUtil {
|
||||
|
||||
public static void createCOWTable(String instantTime, int numberOfPartitions, boolean useSchemaFromCommitMetadata)
|
||||
throws IOException, URISyntaxException {
|
||||
createCOWTable(instantTime, numberOfPartitions, useSchemaFromCommitMetadata, hiveSyncConfig.basePath, hiveSyncConfig.databaseName, hiveSyncConfig.tableName);
|
||||
createCOWTable(instantTime, numberOfPartitions, useSchemaFromCommitMetadata, basePath, DB_NAME, TABLE_NAME);
|
||||
}
|
||||
|
||||
public static void createReplaceCommit(String instantTime, String partitions, WriteOperationType type, boolean isParquetSchemaSimple, boolean useSchemaFromCommitMetadata)
|
||||
@@ -205,13 +215,13 @@ public class HiveTestUtil {
|
||||
|
||||
public static void createCOWTableWithSchema(String instantTime, String schemaFileName)
|
||||
throws IOException, URISyntaxException {
|
||||
Path path = new Path(hiveSyncConfig.basePath);
|
||||
FileIOUtils.deleteDirectory(new File(hiveSyncConfig.basePath));
|
||||
Path path = new Path(basePath);
|
||||
FileIOUtils.deleteDirectory(new File(basePath));
|
||||
HoodieTableMetaClient.withPropertyBuilder()
|
||||
.setTableType(HoodieTableType.COPY_ON_WRITE)
|
||||
.setTableName(hiveSyncConfig.tableName)
|
||||
.setTableName(TABLE_NAME)
|
||||
.setPayloadClass(HoodieAvroPayload.class)
|
||||
.initTable(configuration, hiveSyncConfig.basePath);
|
||||
.initTable(configuration, basePath);
|
||||
|
||||
boolean result = fileSystem.mkdirs(path);
|
||||
checkResult(result);
|
||||
@@ -219,7 +229,7 @@ public class HiveTestUtil {
|
||||
|
||||
HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
|
||||
String partitionPath = dateTime.format(dtfOut);
|
||||
Path partPath = new Path(hiveSyncConfig.basePath + "/" + partitionPath);
|
||||
Path partPath = new Path(basePath + "/" + partitionPath);
|
||||
fileSystem.makeQualified(partPath);
|
||||
fileSystem.mkdirs(partPath);
|
||||
List<HoodieWriteStat> writeStats = new ArrayList<>();
|
||||
@@ -233,30 +243,30 @@ public class HiveTestUtil {
|
||||
writeStats.add(writeStat);
|
||||
writeStats.forEach(s -> commitMetadata.addWriteStat(partitionPath, s));
|
||||
commitMetadata.addMetadata(HoodieCommitMetadata.SCHEMA_KEY, schema.toString());
|
||||
createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName);
|
||||
createCommitFile(commitMetadata, instantTime, hiveSyncConfig.basePath);
|
||||
createdTablesSet.add(DB_NAME + "." + TABLE_NAME);
|
||||
createCommitFile(commitMetadata, instantTime, basePath);
|
||||
}
|
||||
|
||||
public static void createMORTable(String commitTime, String deltaCommitTime, int numberOfPartitions,
|
||||
boolean createDeltaCommit, boolean useSchemaFromCommitMetadata)
|
||||
throws IOException, URISyntaxException, InterruptedException {
|
||||
Path path = new Path(hiveSyncConfig.basePath);
|
||||
FileIOUtils.deleteDirectory(new File(hiveSyncConfig.basePath));
|
||||
Path path = new Path(basePath);
|
||||
FileIOUtils.deleteDirectory(new File(basePath));
|
||||
HoodieTableMetaClient.withPropertyBuilder()
|
||||
.setTableType(HoodieTableType.MERGE_ON_READ)
|
||||
.setTableName(hiveSyncConfig.tableName)
|
||||
.setTableName(TABLE_NAME)
|
||||
.setPayloadClass(HoodieAvroPayload.class)
|
||||
.initTable(configuration, hiveSyncConfig.basePath);
|
||||
.initTable(configuration, basePath);
|
||||
|
||||
boolean result = fileSystem.mkdirs(path);
|
||||
checkResult(result);
|
||||
ZonedDateTime dateTime = ZonedDateTime.now();
|
||||
HoodieCommitMetadata commitMetadata = createPartitions(numberOfPartitions, true,
|
||||
useSchemaFromCommitMetadata, dateTime, commitTime, hiveSyncConfig.basePath);
|
||||
useSchemaFromCommitMetadata, dateTime, commitTime, basePath);
|
||||
createdTablesSet
|
||||
.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_READ_OPTIMIZED_TABLE);
|
||||
.add(DB_NAME + "." + TABLE_NAME + HiveSyncTool.SUFFIX_READ_OPTIMIZED_TABLE);
|
||||
createdTablesSet
|
||||
.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE);
|
||||
.add(DB_NAME + "." + TABLE_NAME + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE);
|
||||
HoodieCommitMetadata compactionMetadata = new HoodieCommitMetadata();
|
||||
commitMetadata.getPartitionToWriteStats()
|
||||
.forEach((key, value) -> value.forEach(l -> compactionMetadata.addWriteStat(key, l)));
|
||||
@@ -274,26 +284,26 @@ public class HiveTestUtil {
|
||||
public static void addCOWPartitions(int numberOfPartitions, boolean isParquetSchemaSimple,
|
||||
boolean useSchemaFromCommitMetadata, ZonedDateTime startFrom, String instantTime) throws IOException, URISyntaxException {
|
||||
HoodieCommitMetadata commitMetadata =
|
||||
createPartitions(numberOfPartitions, isParquetSchemaSimple, useSchemaFromCommitMetadata, startFrom, instantTime, hiveSyncConfig.basePath);
|
||||
createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName);
|
||||
createCommitFile(commitMetadata, instantTime, hiveSyncConfig.basePath);
|
||||
createPartitions(numberOfPartitions, isParquetSchemaSimple, useSchemaFromCommitMetadata, startFrom, instantTime, basePath);
|
||||
createdTablesSet.add(DB_NAME + "." + TABLE_NAME);
|
||||
createCommitFile(commitMetadata, instantTime, basePath);
|
||||
}
|
||||
|
||||
public static void addCOWPartition(String partitionPath, boolean isParquetSchemaSimple,
|
||||
boolean useSchemaFromCommitMetadata, String instantTime) throws IOException, URISyntaxException {
|
||||
HoodieCommitMetadata commitMetadata =
|
||||
createPartition(partitionPath, isParquetSchemaSimple, useSchemaFromCommitMetadata, instantTime);
|
||||
createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName);
|
||||
createCommitFile(commitMetadata, instantTime, hiveSyncConfig.basePath);
|
||||
createdTablesSet.add(DB_NAME + "." + TABLE_NAME);
|
||||
createCommitFile(commitMetadata, instantTime, basePath);
|
||||
}
|
||||
|
||||
public static void addMORPartitions(int numberOfPartitions, boolean isParquetSchemaSimple, boolean isLogSchemaSimple,
|
||||
boolean useSchemaFromCommitMetadata, ZonedDateTime startFrom, String instantTime, String deltaCommitTime)
|
||||
throws IOException, URISyntaxException, InterruptedException {
|
||||
HoodieCommitMetadata commitMetadata = createPartitions(numberOfPartitions, isParquetSchemaSimple,
|
||||
useSchemaFromCommitMetadata, startFrom, instantTime, hiveSyncConfig.basePath);
|
||||
createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_READ_OPTIMIZED_TABLE);
|
||||
createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE);
|
||||
useSchemaFromCommitMetadata, startFrom, instantTime, basePath);
|
||||
createdTablesSet.add(DB_NAME + "." + TABLE_NAME + HiveSyncTool.SUFFIX_READ_OPTIMIZED_TABLE);
|
||||
createdTablesSet.add(DB_NAME + "." + TABLE_NAME + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE);
|
||||
HoodieCommitMetadata compactionMetadata = new HoodieCommitMetadata();
|
||||
commitMetadata.getPartitionToWriteStats()
|
||||
.forEach((key, value) -> value.forEach(l -> compactionMetadata.addWriteStat(key, l)));
|
||||
@@ -346,7 +356,7 @@ public class HiveTestUtil {
|
||||
private static HoodieCommitMetadata createPartition(String partitionPath, boolean isParquetSchemaSimple,
|
||||
boolean useSchemaFromCommitMetadata, String instantTime) throws IOException, URISyntaxException {
|
||||
HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
|
||||
Path partPath = new Path(hiveSyncConfig.basePath + "/" + partitionPath);
|
||||
Path partPath = new Path(basePath + "/" + partitionPath);
|
||||
fileSystem.makeQualified(partPath);
|
||||
fileSystem.mkdirs(partPath);
|
||||
List<HoodieWriteStat> writeStats = createTestData(partPath, isParquetSchemaSimple, instantTime);
|
||||
@@ -471,7 +481,7 @@ public class HiveTestUtil {
|
||||
|
||||
public static void createReplaceCommitFile(HoodieCommitMetadata commitMetadata, String instantTime) throws IOException {
|
||||
byte[] bytes = commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8);
|
||||
Path fullPath = new Path(hiveSyncConfig.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/"
|
||||
Path fullPath = new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/"
|
||||
+ HoodieTimeline.makeReplaceFileName(instantTime));
|
||||
FSDataOutputStream fsout = fileSystem.create(fullPath, true);
|
||||
fsout.write(bytes);
|
||||
@@ -480,13 +490,13 @@ public class HiveTestUtil {
|
||||
|
||||
public static void createCommitFileWithSchema(HoodieCommitMetadata commitMetadata, String instantTime, boolean isSimpleSchema) throws IOException {
|
||||
addSchemaToCommitMetadata(commitMetadata, isSimpleSchema, true);
|
||||
createCommitFile(commitMetadata, instantTime, hiveSyncConfig.basePath);
|
||||
createCommitFile(commitMetadata, instantTime, basePath);
|
||||
}
|
||||
|
||||
private static void createCompactionCommitFile(HoodieCommitMetadata commitMetadata, String instantTime)
|
||||
throws IOException {
|
||||
byte[] bytes = commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8);
|
||||
Path fullPath = new Path(hiveSyncConfig.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/"
|
||||
Path fullPath = new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/"
|
||||
+ HoodieTimeline.makeCommitFileName(instantTime));
|
||||
FSDataOutputStream fsout = fileSystem.create(fullPath, true);
|
||||
fsout.write(bytes);
|
||||
@@ -496,7 +506,7 @@ public class HiveTestUtil {
|
||||
private static void createDeltaCommitFile(HoodieCommitMetadata deltaCommitMetadata, String deltaCommitTime)
|
||||
throws IOException {
|
||||
byte[] bytes = deltaCommitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8);
|
||||
Path fullPath = new Path(hiveSyncConfig.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/"
|
||||
Path fullPath = new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/"
|
||||
+ HoodieTimeline.makeDeltaFileName(deltaCommitTime));
|
||||
FSDataOutputStream fsout = fileSystem.create(fullPath, true);
|
||||
fsout.write(bytes);
|
||||
|
||||
@@ -45,6 +45,65 @@
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-common</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.beust</groupId>
|
||||
<artifactId>jcommander</artifactId>
|
||||
</dependency>
|
||||
|
||||
<!-- Required for Testing -->
|
||||
<dependency>
|
||||
<groupId>org.apache.logging.log4j</groupId>
|
||||
<artifactId>log4j-core</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.junit.jupiter</groupId>
|
||||
<artifactId>junit-jupiter-api</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.junit.jupiter</groupId>
|
||||
<artifactId>junit-jupiter-engine</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.junit.vintage</groupId>
|
||||
<artifactId>junit-vintage-engine</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.junit.jupiter</groupId>
|
||||
<artifactId>junit-jupiter-params</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.mockito</groupId>
|
||||
<artifactId>mockito-junit-jupiter</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.junit.platform</groupId>
|
||||
<artifactId>junit-platform-runner</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.junit.platform</groupId>
|
||||
<artifactId>junit-platform-suite-api</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.junit.platform</groupId>
|
||||
<artifactId>junit-platform-commons</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
|
||||
@@ -17,17 +17,31 @@
|
||||
|
||||
package org.apache.hudi.sync.common;
|
||||
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
|
||||
import java.util.Properties;
|
||||
|
||||
/**
|
||||
* Base class to sync Hudi meta data with Metastores to make
|
||||
* Hudi table queryable through external systems.
|
||||
*/
|
||||
public abstract class AbstractSyncTool {
|
||||
protected Properties props;
|
||||
protected FileSystem fileSystem;
|
||||
protected final Configuration conf;
|
||||
protected final FileSystem fs;
|
||||
protected TypedProperties props;
|
||||
|
||||
public AbstractSyncTool(Properties props, FileSystem fileSystem) {
|
||||
public AbstractSyncTool(TypedProperties props, Configuration conf, FileSystem fs) {
|
||||
this.props = props;
|
||||
this.fileSystem = fileSystem;
|
||||
this.conf = conf;
|
||||
this.fs = fs;
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
public AbstractSyncTool(Properties props, FileSystem fileSystem) {
|
||||
this(new TypedProperties(props), fileSystem.getConf(), fileSystem);
|
||||
}
|
||||
|
||||
public abstract void syncHoodieTable();
|
||||
|
||||
@@ -0,0 +1,188 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.sync.common;
|
||||
|
||||
import org.apache.hudi.common.config.ConfigProperty;
|
||||
import org.apache.hudi.common.config.HoodieConfig;
|
||||
import org.apache.hudi.common.config.HoodieMetadataConfig;
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.common.table.HoodieTableConfig;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
|
||||
|
||||
import com.beust.jcommander.Parameter;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.function.Function;
|
||||
|
||||
/**
|
||||
* Configs needed to sync data into external meta stores, catalogs, etc.
|
||||
*/
|
||||
public class HoodieSyncConfig extends HoodieConfig {
|
||||
|
||||
@Parameter(names = {"--database"}, description = "name of the target database in meta store", required = true)
|
||||
public String databaseName;
|
||||
|
||||
@Parameter(names = {"--table"}, description = "name of the target table in meta store", required = true)
|
||||
public String tableName;
|
||||
|
||||
@Parameter(names = {"--base-path"}, description = "Base path of the hoodie table to sync", required = true)
|
||||
public String basePath;
|
||||
|
||||
@Parameter(names = {"--base-file-format"}, description = "Format of the base files (PARQUET (or) HFILE)")
|
||||
public String baseFileFormat;
|
||||
|
||||
@Parameter(names = "--partitioned-by", description = "Fields in the schema partitioned by")
|
||||
public List<String> partitionFields;
|
||||
|
||||
@Parameter(names = "--partition-value-extractor", description = "Class which implements PartitionValueExtractor "
|
||||
+ "to extract the partition values from HDFS path")
|
||||
public String partitionValueExtractorClass;
|
||||
|
||||
@Parameter(names = {"--assume-date-partitioning"}, description = "Assume standard yyyy/mm/dd partitioning, this"
|
||||
+ " exists to support backward compatibility. If you use hoodie 0.3.x, do not set this parameter")
|
||||
public Boolean assumeDatePartitioning;
|
||||
|
||||
@Parameter(names = {"--decode-partition"}, description = "Decode the partition value if the partition has encoded during writing")
|
||||
public Boolean decodePartition;
|
||||
|
||||
@Parameter(names = {"--use-file-listing-from-metadata"}, description = "Fetch file listing from Hudi's metadata")
|
||||
public Boolean useFileListingFromMetadata;
|
||||
|
||||
@Parameter(names = {"--conditional-sync"}, description = "If true, only sync on conditions like schema change or partition change.")
|
||||
public Boolean isConditionalSync;
|
||||
|
||||
@Parameter(names = {"--spark-version"}, description = "The spark version")
|
||||
public String sparkVersion;
|
||||
|
||||
public static final ConfigProperty<String> META_SYNC_BASE_PATH = ConfigProperty
|
||||
.key("hoodie.datasource.meta.sync.base.path")
|
||||
.defaultValue("")
|
||||
.withDocumentation("Base path of the hoodie table to sync");
|
||||
|
||||
public static final ConfigProperty<String> META_SYNC_ENABLED = ConfigProperty
|
||||
.key("hoodie.datasource.meta.sync.enable")
|
||||
.defaultValue("false")
|
||||
.withDocumentation("Enable Syncing the Hudi Table with an external meta store or data catalog.");
|
||||
|
||||
// ToDo change the prefix of the following configs from hive_sync to meta_sync
|
||||
public static final ConfigProperty<String> META_SYNC_DATABASE_NAME = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.database")
|
||||
.defaultValue("default")
|
||||
.withDocumentation("The name of the destination database that we should sync the hudi table to.");
|
||||
|
||||
// If the table name for the metastore destination is not provided, pick it up from write or table configs.
|
||||
public static final Function<HoodieConfig, Option<String>> TABLE_NAME_INFERENCE_FUNCTION = cfg -> {
|
||||
if (cfg.contains(HoodieTableConfig.HOODIE_WRITE_TABLE_NAME_KEY)) {
|
||||
return Option.of(cfg.getString(HoodieTableConfig.HOODIE_WRITE_TABLE_NAME_KEY));
|
||||
} else if (cfg.contains(HoodieTableConfig.HOODIE_TABLE_NAME_KEY)) {
|
||||
return Option.of(cfg.getString(HoodieTableConfig.HOODIE_TABLE_NAME_KEY));
|
||||
} else {
|
||||
return Option.empty();
|
||||
}
|
||||
};
|
||||
public static final ConfigProperty<String> META_SYNC_TABLE_NAME = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.table")
|
||||
.defaultValue("unknown")
|
||||
.withInferFunction(TABLE_NAME_INFERENCE_FUNCTION)
|
||||
.withDocumentation("The name of the destination table that we should sync the hudi table to.");
|
||||
|
||||
public static final ConfigProperty<String> META_SYNC_BASE_FILE_FORMAT = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.base_file_format")
|
||||
.defaultValue("PARQUET")
|
||||
.withDocumentation("Base file format for the sync.");
|
||||
|
||||
// If partition fields are not explicitly provided, obtain from the KeyGeneration Configs
|
||||
public static final Function<HoodieConfig, Option<String>> PARTITION_FIELDS_INFERENCE_FUNCTION = cfg -> {
|
||||
if (cfg.contains(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME)) {
|
||||
return Option.of(cfg.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME));
|
||||
} else {
|
||||
return Option.empty();
|
||||
}
|
||||
};
|
||||
public static final ConfigProperty<String> META_SYNC_PARTITION_FIELDS = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.partition_fields")
|
||||
.defaultValue("")
|
||||
.withInferFunction(PARTITION_FIELDS_INFERENCE_FUNCTION)
|
||||
.withDocumentation("Field in the table to use for determining hive partition columns.");
|
||||
|
||||
// If partition value extraction class is not explicitly provided, configure based on the partition fields.
|
||||
public static final Function<HoodieConfig, Option<String>> PARTITION_EXTRACTOR_CLASS_FUNCTION = cfg -> {
|
||||
if (!cfg.contains(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME)) {
|
||||
return Option.of("org.apache.hudi.hive.NonPartitionedExtractor");
|
||||
} else {
|
||||
int numOfPartFields = cfg.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME).split(",").length;
|
||||
if (numOfPartFields == 1
|
||||
&& cfg.contains(KeyGeneratorOptions.HIVE_STYLE_PARTITIONING_ENABLE)
|
||||
&& cfg.getString(KeyGeneratorOptions.HIVE_STYLE_PARTITIONING_ENABLE).equals("true")) {
|
||||
return Option.of("org.apache.hudi.hive.HiveStylePartitionValueExtractor");
|
||||
} else {
|
||||
return Option.of("org.apache.hudi.hive.MultiPartKeysValueExtractor");
|
||||
}
|
||||
}
|
||||
};
|
||||
public static final ConfigProperty<String> META_SYNC_PARTITION_EXTRACTOR_CLASS = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.partition_extractor_class")
|
||||
.defaultValue("org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor")
|
||||
.withInferFunction(PARTITION_EXTRACTOR_CLASS_FUNCTION)
|
||||
.withDocumentation("Class which implements PartitionValueExtractor to extract the partition values, "
|
||||
+ "default 'SlashEncodedDayPartitionValueExtractor'.");
|
||||
|
||||
public static final ConfigProperty<String> META_SYNC_ASSUME_DATE_PARTITION = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.assume_date_partitioning")
|
||||
.defaultValue("false")
|
||||
.withDocumentation("Assume partitioning is yyyy/mm/dd");
|
||||
|
||||
public static final ConfigProperty<Boolean> META_SYNC_USE_FILE_LISTING_FROM_METADATA = ConfigProperty
|
||||
.key("hoodie.meta.sync.metadata_file_listing")
|
||||
.defaultValue(HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS)
|
||||
.withDocumentation("Enable the internal metadata table for file listing for syncing with metastores");
|
||||
|
||||
public static final ConfigProperty<String> META_SYNC_CONDITIONAL_SYNC = ConfigProperty
|
||||
.key("hoodie.datasource.meta_sync.condition.sync")
|
||||
.defaultValue("false")
|
||||
.withDocumentation("If true, only sync on conditions like schema change or partition change.");
|
||||
|
||||
public static final ConfigProperty<String> META_SYNC_SPARK_VERSION = ConfigProperty
|
||||
.key("hoodie.meta_sync.spark.version")
|
||||
.defaultValue("")
|
||||
.withDocumentation("The spark version used when syncing with a metastore.");
|
||||
|
||||
public HoodieSyncConfig(TypedProperties props) {
|
||||
super(props);
|
||||
setDefaults();
|
||||
|
||||
this.basePath = getStringOrDefault(META_SYNC_BASE_PATH);
|
||||
this.databaseName = getStringOrDefault(META_SYNC_DATABASE_NAME);
|
||||
this.tableName = getStringOrDefault(META_SYNC_TABLE_NAME);
|
||||
this.baseFileFormat = getStringOrDefault(META_SYNC_BASE_FILE_FORMAT);
|
||||
this.partitionFields = props.getStringList(META_SYNC_PARTITION_FIELDS.key(), ",", Collections.emptyList());
|
||||
this.partitionValueExtractorClass = getStringOrDefault(META_SYNC_PARTITION_EXTRACTOR_CLASS);
|
||||
this.assumeDatePartitioning = getBooleanOrDefault(META_SYNC_ASSUME_DATE_PARTITION);
|
||||
this.decodePartition = getBooleanOrDefault(KeyGeneratorOptions.URL_ENCODE_PARTITIONING);
|
||||
this.useFileListingFromMetadata = getBooleanOrDefault(META_SYNC_USE_FILE_LISTING_FROM_METADATA);
|
||||
this.isConditionalSync = getBooleanOrDefault(META_SYNC_CONDITIONAL_SYNC);
|
||||
this.sparkVersion = getStringOrDefault(META_SYNC_SPARK_VERSION);
|
||||
}
|
||||
|
||||
protected void setDefaults() {
|
||||
this.setDefaultValue(META_SYNC_TABLE_NAME);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,72 @@
|
||||
package org.apache.hudi.sync.common.util;
|
||||
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.common.util.ReflectionUtils;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.sync.common.AbstractSyncTool;
|
||||
import org.apache.hudi.sync.common.HoodieSyncConfig;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import java.util.Properties;
|
||||
|
||||
/**
|
||||
* Helper class for syncing Hudi commit data with external metastores.
|
||||
*/
|
||||
public class SyncUtilHelpers {
|
||||
private static final Logger LOG = LogManager.getLogger(SyncUtilHelpers.class);
|
||||
|
||||
/**
|
||||
* Create an instance of an implementation of {@link AbstractSyncTool} that will sync all the relevant meta information
|
||||
* with an external metastore such as Hive etc. to ensure Hoodie tables can be queried or read via external systems.
|
||||
*
|
||||
* @param metaSyncFQCN The class that implements the sync of the metadata.
|
||||
* @param props property map.
|
||||
* @param hadoopConfig Hadoop confs.
|
||||
* @param fs Filesystem used.
|
||||
* @param targetBasePath The target base path that contains the hoodie table.
|
||||
* @param baseFileFormat The file format used by the hoodie table (defaults to PARQUET).
|
||||
*/
|
||||
public static void runHoodieMetaSync(String metaSyncFQCN,
|
||||
TypedProperties props,
|
||||
Configuration hadoopConfig,
|
||||
FileSystem fs,
|
||||
String targetBasePath,
|
||||
String baseFileFormat) {
|
||||
try {
|
||||
instantiateMetaSyncTool(metaSyncFQCN, props, hadoopConfig, fs, targetBasePath, baseFileFormat).syncHoodieTable();
|
||||
} catch (Throwable e) {
|
||||
throw new HoodieException("Could not sync using the meta sync class " + metaSyncFQCN, e);
|
||||
}
|
||||
}
|
||||
|
||||
static AbstractSyncTool instantiateMetaSyncTool(String metaSyncFQCN,
|
||||
TypedProperties props,
|
||||
Configuration hadoopConfig,
|
||||
FileSystem fs,
|
||||
String targetBasePath,
|
||||
String baseFileFormat) {
|
||||
TypedProperties properties = new TypedProperties();
|
||||
properties.putAll(props);
|
||||
properties.put(HoodieSyncConfig.META_SYNC_BASE_PATH.key(), targetBasePath);
|
||||
properties.put(HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT.key(), baseFileFormat);
|
||||
|
||||
if (ReflectionUtils.hasConstructor(metaSyncFQCN,
|
||||
new Class<?>[] {TypedProperties.class, Configuration.class, FileSystem.class})) {
|
||||
return ((AbstractSyncTool) ReflectionUtils.loadClass(metaSyncFQCN,
|
||||
new Class<?>[] {TypedProperties.class, Configuration.class, FileSystem.class},
|
||||
properties, hadoopConfig, fs));
|
||||
} else {
|
||||
LOG.warn("Falling back to deprecated constructor for class: " + metaSyncFQCN);
|
||||
try {
|
||||
return ((AbstractSyncTool) ReflectionUtils.loadClass(metaSyncFQCN,
|
||||
new Class<?>[] {Properties.class, FileSystem.class}, properties, fs));
|
||||
} catch (Throwable t) {
|
||||
throw new HoodieException("Could not load meta sync class " + metaSyncFQCN, t);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,124 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.sync.common.util;
|
||||
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.sync.common.AbstractSyncTool;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Properties;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
public class TestSyncUtilHelpers {
|
||||
private static final String BASE_PATH = "/tmp/test";
|
||||
private static final String BASE_FORMAT = "PARQUET";
|
||||
|
||||
private Configuration hadoopConf;
|
||||
private FileSystem fileSystem;
|
||||
|
||||
@BeforeEach
|
||||
public void setUp() throws IOException {
|
||||
fileSystem = FSUtils.getFs(BASE_PATH, new Configuration());
|
||||
hadoopConf = fileSystem.getConf();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCreateValidSyncClass() {
|
||||
AbstractSyncTool metaSyncTool = SyncUtilHelpers.instantiateMetaSyncTool(
|
||||
ValidMetaSyncClass.class.getName(),
|
||||
new TypedProperties(),
|
||||
hadoopConf,
|
||||
fileSystem,
|
||||
BASE_PATH,
|
||||
BASE_FORMAT
|
||||
);
|
||||
assertTrue(metaSyncTool instanceof ValidMetaSyncClass);
|
||||
}
|
||||
|
||||
/**
|
||||
* Ensure it still works for the deprecated constructor of {@link AbstractSyncTool}
|
||||
* as we implemented the fallback.
|
||||
*/
|
||||
@Test
|
||||
public void testCreateDeprecatedSyncClass() {
|
||||
Properties properties = new Properties();
|
||||
AbstractSyncTool deprecatedMetaSyncClass = SyncUtilHelpers.instantiateMetaSyncTool(
|
||||
DeprecatedMetaSyncClass.class.getName(),
|
||||
new TypedProperties(properties),
|
||||
hadoopConf,
|
||||
fileSystem,
|
||||
BASE_PATH,
|
||||
BASE_FORMAT
|
||||
);
|
||||
assertTrue(deprecatedMetaSyncClass instanceof DeprecatedMetaSyncClass);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCreateInvalidSyncClass() {
|
||||
Exception exception = assertThrows(HoodieException.class, () -> {
|
||||
SyncUtilHelpers.instantiateMetaSyncTool(
|
||||
InvalidSyncClass.class.getName(),
|
||||
new TypedProperties(),
|
||||
hadoopConf,
|
||||
fileSystem,
|
||||
BASE_PATH,
|
||||
BASE_FORMAT
|
||||
);
|
||||
});
|
||||
|
||||
String expectedMessage = "Could not load meta sync class " + InvalidSyncClass.class.getName();
|
||||
assertTrue(exception.getMessage().contains(expectedMessage));
|
||||
|
||||
}
|
||||
|
||||
public static class ValidMetaSyncClass extends AbstractSyncTool {
|
||||
public ValidMetaSyncClass(TypedProperties props, Configuration conf, FileSystem fs) {
|
||||
super(props, conf, fs);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void syncHoodieTable() {
|
||||
throw new HoodieException("Method unimplemented as its a test class");
|
||||
}
|
||||
}
|
||||
|
||||
public static class DeprecatedMetaSyncClass extends AbstractSyncTool {
|
||||
public DeprecatedMetaSyncClass(Properties props, FileSystem fileSystem) {
|
||||
super(props, fileSystem);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void syncHoodieTable() {
|
||||
throw new HoodieException("Method unimplemented as its a test class");
|
||||
}
|
||||
}
|
||||
|
||||
public static class InvalidSyncClass {
|
||||
public InvalidSyncClass(Properties props) {
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,29 @@
|
||||
###
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
###
|
||||
log4j.rootLogger=WARN, CONSOLE
|
||||
log4j.logger.org.apache.hudi=DEBUG
|
||||
|
||||
# CONSOLE is set to be a ConsoleAppender.
|
||||
log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender
|
||||
# CONSOLE uses PatternLayout.
|
||||
log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout
|
||||
log4j.appender.CONSOLE.layout.ConversionPattern=[%-5p] %d %c %x - %m%n
|
||||
log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter
|
||||
log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true
|
||||
log4j.appender.CONSOLE.filter.a.LevelMin=WARN
|
||||
log4j.appender.CONSOLE.filter.a.LevelMax=FATAL
|
||||
@@ -0,0 +1,30 @@
|
||||
###
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
###
|
||||
log4j.rootLogger=WARN, CONSOLE
|
||||
log4j.logger.org.apache=INFO
|
||||
log4j.logger.org.apache.hudi=DEBUG
|
||||
|
||||
# CONSOLE is set to be a ConsoleAppender.
|
||||
log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender
|
||||
# CONSOLE uses PatternLayout.
|
||||
log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout
|
||||
log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
|
||||
log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter
|
||||
log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true
|
||||
log4j.appender.CONSOLE.filter.a.LevelMin=WARN
|
||||
log4j.appender.CONSOLE.filter.a.LevelMax=FATAL
|
||||
Reference in New Issue
Block a user