[HUDI-3730] Improve meta sync class design and hierarchies (#5854)
* [HUDI-3730] Improve meta sync class design and hierarchies (#5754) * Implements class design proposed in RFC-55 Co-authored-by: jian.feng <fengjian428@gmial.com> Co-authored-by: jian.feng <jian.feng@shopee.com>
This commit is contained in:
@@ -1,128 +0,0 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.sync.adb;
|
||||
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.util.StringUtils;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.hive.PartitionValueExtractor;
|
||||
import org.apache.hudi.hive.SchemaDifference;
|
||||
import org.apache.hudi.sync.common.AbstractSyncHoodieClient;
|
||||
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public abstract class AbstractAdbSyncHoodieClient extends AbstractSyncHoodieClient {
|
||||
protected AdbSyncConfig adbSyncConfig;
|
||||
protected PartitionValueExtractor partitionValueExtractor;
|
||||
protected HoodieTimeline activeTimeline;
|
||||
|
||||
public AbstractAdbSyncHoodieClient(AdbSyncConfig syncConfig, FileSystem fs) {
|
||||
super(syncConfig.basePath, syncConfig.assumeDatePartitioning,
|
||||
syncConfig.useFileListingFromMetadata, false, fs);
|
||||
this.adbSyncConfig = syncConfig;
|
||||
final String clazz = adbSyncConfig.partitionValueExtractorClass;
|
||||
try {
|
||||
this.partitionValueExtractor = (PartitionValueExtractor) Class.forName(clazz).newInstance();
|
||||
} catch (Exception e) {
|
||||
throw new HoodieException("Fail to init PartitionValueExtractor class " + clazz, e);
|
||||
}
|
||||
|
||||
activeTimeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
|
||||
}
|
||||
|
||||
public List<PartitionEvent> getPartitionEvents(Map<List<String>, String> tablePartitions,
|
||||
List<String> partitionStoragePartitions) {
|
||||
Map<String, String> paths = new HashMap<>();
|
||||
|
||||
for (Map.Entry<List<String>, String> entry : tablePartitions.entrySet()) {
|
||||
List<String> partitionValues = entry.getKey();
|
||||
String fullTablePartitionPath = entry.getValue();
|
||||
paths.put(String.join(", ", partitionValues), fullTablePartitionPath);
|
||||
}
|
||||
List<PartitionEvent> events = new ArrayList<>();
|
||||
for (String storagePartition : partitionStoragePartitions) {
|
||||
Path storagePartitionPath = FSUtils.getPartitionPath(adbSyncConfig.basePath, storagePartition);
|
||||
String fullStoragePartitionPath = Path.getPathWithoutSchemeAndAuthority(storagePartitionPath).toUri().getPath();
|
||||
// Check if the partition values or if hdfs path is the same
|
||||
List<String> storagePartitionValues = partitionValueExtractor.extractPartitionValuesInPath(storagePartition);
|
||||
if (adbSyncConfig.useHiveStylePartitioning) {
|
||||
String partition = String.join("/", storagePartitionValues);
|
||||
storagePartitionPath = FSUtils.getPartitionPath(adbSyncConfig.basePath, partition);
|
||||
fullStoragePartitionPath = Path.getPathWithoutSchemeAndAuthority(storagePartitionPath).toUri().getPath();
|
||||
}
|
||||
if (!storagePartitionValues.isEmpty()) {
|
||||
String storageValue = String.join(", ", storagePartitionValues);
|
||||
if (!paths.containsKey(storageValue)) {
|
||||
events.add(PartitionEvent.newPartitionAddEvent(storagePartition));
|
||||
} else if (!paths.get(storageValue).equals(fullStoragePartitionPath)) {
|
||||
events.add(PartitionEvent.newPartitionUpdateEvent(storagePartition));
|
||||
}
|
||||
}
|
||||
}
|
||||
return events;
|
||||
}
|
||||
|
||||
public void close() {
|
||||
|
||||
}
|
||||
|
||||
public abstract Map<List<String>, String> scanTablePartitions(String tableName) throws Exception;
|
||||
|
||||
public abstract void updateTableDefinition(String tableName, SchemaDifference schemaDiff) throws Exception;
|
||||
|
||||
public abstract boolean databaseExists(String databaseName) throws Exception;
|
||||
|
||||
public abstract void createDatabase(String databaseName) throws Exception;
|
||||
|
||||
public abstract void dropTable(String tableName);
|
||||
|
||||
protected String getDatabasePath() {
|
||||
String dbLocation = adbSyncConfig.dbLocation;
|
||||
Path dbLocationPath;
|
||||
if (StringUtils.isNullOrEmpty(dbLocation)) {
|
||||
if (new Path(adbSyncConfig.basePath).isRoot()) {
|
||||
dbLocationPath = new Path(adbSyncConfig.basePath);
|
||||
} else {
|
||||
dbLocationPath = new Path(adbSyncConfig.basePath).getParent();
|
||||
}
|
||||
} else {
|
||||
dbLocationPath = new Path(dbLocation);
|
||||
}
|
||||
return generateAbsolutePathStr(dbLocationPath);
|
||||
}
|
||||
|
||||
protected String generateAbsolutePathStr(Path path) {
|
||||
String absolutePathStr = path.toString();
|
||||
if (path.toUri().getScheme() == null) {
|
||||
absolutePathStr = getDefaultFs() + absolutePathStr;
|
||||
}
|
||||
return absolutePathStr.endsWith("/") ? absolutePathStr : absolutePathStr + "/";
|
||||
}
|
||||
|
||||
protected String getDefaultFs() {
|
||||
return fs.getConf().get("fs.defaultFS");
|
||||
}
|
||||
}
|
||||
@@ -20,62 +20,19 @@ package org.apache.hudi.sync.adb;
|
||||
|
||||
import org.apache.hudi.common.config.ConfigProperty;
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.sync.common.HoodieSyncConfig;
|
||||
import org.apache.hudi.common.util.StringUtils;
|
||||
import org.apache.hudi.hive.HiveSyncConfig;
|
||||
|
||||
import com.beust.jcommander.Parameter;
|
||||
import com.beust.jcommander.ParametersDelegate;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
|
||||
import java.util.Properties;
|
||||
|
||||
/**
|
||||
* Configs needed to sync data into Alibaba Cloud AnalyticDB(ADB).
|
||||
*/
|
||||
public class AdbSyncConfig extends HoodieSyncConfig {
|
||||
|
||||
@Parameter(names = {"--user"}, description = "Adb username", required = true)
|
||||
public String adbUser;
|
||||
|
||||
@Parameter(names = {"--pass"}, description = "Adb password", required = true)
|
||||
public String adbPass;
|
||||
|
||||
@Parameter(names = {"--jdbc-url"}, description = "Adb jdbc connect url", required = true)
|
||||
public String jdbcUrl;
|
||||
|
||||
@Parameter(names = {"--skip-ro-suffix"}, description = "Whether skip the `_ro` suffix for read optimized table when syncing")
|
||||
public Boolean skipROSuffix;
|
||||
|
||||
@Parameter(names = {"--skip-rt-sync"}, description = "Whether skip the rt table when syncing")
|
||||
public Boolean skipRTSync;
|
||||
|
||||
@Parameter(names = {"--hive-style-partitioning"}, description = "Whether use hive style partitioning, true if like the following style: field1=value1/field2=value2")
|
||||
public Boolean useHiveStylePartitioning;
|
||||
|
||||
@Parameter(names = {"--support-timestamp"}, description = "If true, converts int64(timestamp_micros) to timestamp type")
|
||||
public Boolean supportTimestamp;
|
||||
|
||||
@Parameter(names = {"--spark-datasource"}, description = "Whether sync this table as spark data source table")
|
||||
public Boolean syncAsSparkDataSourceTable;
|
||||
|
||||
@Parameter(names = {"--table-properties"}, description = "Table properties, to support read hoodie table as datasource table", required = true)
|
||||
public String tableProperties;
|
||||
|
||||
@Parameter(names = {"--serde-properties"}, description = "Serde properties, to support read hoodie table as datasource table", required = true)
|
||||
public String serdeProperties;
|
||||
|
||||
@Parameter(names = {"--spark-schema-length-threshold"}, description = "The maximum length allowed in a single cell when storing additional schema information in Hive's metastore")
|
||||
public int sparkSchemaLengthThreshold;
|
||||
|
||||
@Parameter(names = {"--db-location"}, description = "Database location")
|
||||
public String dbLocation;
|
||||
|
||||
@Parameter(names = {"--auto-create-database"}, description = "Whether auto create adb database")
|
||||
public Boolean autoCreateDatabase = true;
|
||||
|
||||
@Parameter(names = {"--skip-last-commit-time-sync"}, description = "Whether skip last commit time syncing")
|
||||
public Boolean skipLastCommitTimeSync = false;
|
||||
|
||||
@Parameter(names = {"--drop-table-before-creation"}, description = "Whether drop table before creation")
|
||||
public Boolean dropTableBeforeCreation = false;
|
||||
|
||||
@Parameter(names = {"--help", "-h"}, help = true)
|
||||
public Boolean help = false;
|
||||
public class AdbSyncConfig extends HiveSyncConfig {
|
||||
|
||||
public static final ConfigProperty<String> ADB_SYNC_USER = ConfigProperty
|
||||
.key("hoodie.datasource.adb.sync.username")
|
||||
@@ -152,89 +109,101 @@ public class AdbSyncConfig extends HoodieSyncConfig {
|
||||
.defaultValue(false)
|
||||
.withDocumentation("Whether drop table before creation");
|
||||
|
||||
public AdbSyncConfig() {
|
||||
this(new TypedProperties());
|
||||
}
|
||||
|
||||
public AdbSyncConfig(TypedProperties props) {
|
||||
public AdbSyncConfig(Properties props) {
|
||||
super(props);
|
||||
|
||||
adbUser = getString(ADB_SYNC_USER);
|
||||
adbPass = getString(ADB_SYNC_PASS);
|
||||
jdbcUrl = getString(ADB_SYNC_JDBC_URL);
|
||||
skipROSuffix = getBooleanOrDefault(ADB_SYNC_SKIP_RO_SUFFIX);
|
||||
skipRTSync = getBooleanOrDefault(ADB_SYNC_SKIP_RT_SYNC);
|
||||
useHiveStylePartitioning = getBooleanOrDefault(ADB_SYNC_USE_HIVE_STYLE_PARTITIONING);
|
||||
supportTimestamp = getBooleanOrDefault(ADB_SYNC_SUPPORT_TIMESTAMP);
|
||||
syncAsSparkDataSourceTable = getBooleanOrDefault(ADB_SYNC_SYNC_AS_SPARK_DATA_SOURCE_TABLE);
|
||||
tableProperties = getString(ADB_SYNC_TABLE_PROPERTIES);
|
||||
serdeProperties = getString(ADB_SYNC_SERDE_PROPERTIES);
|
||||
sparkSchemaLengthThreshold = getIntOrDefault(ADB_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD);
|
||||
dbLocation = getString(ADB_SYNC_DB_LOCATION);
|
||||
autoCreateDatabase = getBooleanOrDefault(ADB_SYNC_AUTO_CREATE_DATABASE);
|
||||
skipLastCommitTimeSync = getBooleanOrDefault(ADB_SYNC_SKIP_LAST_COMMIT_TIME_SYNC);
|
||||
dropTableBeforeCreation = getBooleanOrDefault(ADB_SYNC_DROP_TABLE_BEFORE_CREATION);
|
||||
}
|
||||
|
||||
public static TypedProperties toProps(AdbSyncConfig cfg) {
|
||||
TypedProperties properties = new TypedProperties();
|
||||
properties.put(META_SYNC_DATABASE_NAME.key(), cfg.databaseName);
|
||||
properties.put(META_SYNC_TABLE_NAME.key(), cfg.tableName);
|
||||
properties.put(ADB_SYNC_USER.key(), cfg.adbUser);
|
||||
properties.put(ADB_SYNC_PASS.key(), cfg.adbPass);
|
||||
properties.put(ADB_SYNC_JDBC_URL.key(), cfg.jdbcUrl);
|
||||
properties.put(META_SYNC_BASE_PATH.key(), cfg.basePath);
|
||||
properties.put(META_SYNC_PARTITION_FIELDS.key(), String.join(",", cfg.partitionFields));
|
||||
properties.put(META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), cfg.partitionValueExtractorClass);
|
||||
properties.put(META_SYNC_ASSUME_DATE_PARTITION.key(), String.valueOf(cfg.assumeDatePartitioning));
|
||||
properties.put(ADB_SYNC_SKIP_RO_SUFFIX.key(), String.valueOf(cfg.skipROSuffix));
|
||||
properties.put(ADB_SYNC_SKIP_RT_SYNC.key(), String.valueOf(cfg.skipRTSync));
|
||||
properties.put(ADB_SYNC_USE_HIVE_STYLE_PARTITIONING.key(), String.valueOf(cfg.useHiveStylePartitioning));
|
||||
properties.put(META_SYNC_USE_FILE_LISTING_FROM_METADATA.key(), String.valueOf(cfg.useFileListingFromMetadata));
|
||||
properties.put(ADB_SYNC_SUPPORT_TIMESTAMP.key(), String.valueOf(cfg.supportTimestamp));
|
||||
properties.put(ADB_SYNC_TABLE_PROPERTIES.key(), cfg.tableProperties);
|
||||
properties.put(ADB_SYNC_SERDE_PROPERTIES.key(), cfg.serdeProperties);
|
||||
properties.put(ADB_SYNC_SYNC_AS_SPARK_DATA_SOURCE_TABLE.key(), String.valueOf(cfg.syncAsSparkDataSourceTable));
|
||||
properties.put(ADB_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD.key(), String.valueOf(cfg.sparkSchemaLengthThreshold));
|
||||
properties.put(META_SYNC_SPARK_VERSION.key(), cfg.sparkVersion);
|
||||
properties.put(ADB_SYNC_DB_LOCATION.key(), cfg.dbLocation);
|
||||
properties.put(ADB_SYNC_AUTO_CREATE_DATABASE.key(), String.valueOf(cfg.autoCreateDatabase));
|
||||
properties.put(ADB_SYNC_SKIP_LAST_COMMIT_TIME_SYNC.key(), String.valueOf(cfg.skipLastCommitTimeSync));
|
||||
properties.put(ADB_SYNC_DROP_TABLE_BEFORE_CREATION.key(), String.valueOf(cfg.dropTableBeforeCreation));
|
||||
|
||||
return properties;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "AdbSyncConfig{"
|
||||
+ "adbUser='" + adbUser + '\''
|
||||
+ ", adbPass='" + adbPass + '\''
|
||||
+ ", jdbcUrl='" + jdbcUrl + '\''
|
||||
+ ", skipROSuffix=" + skipROSuffix
|
||||
+ ", skipRTSync=" + skipRTSync
|
||||
+ ", useHiveStylePartitioning=" + useHiveStylePartitioning
|
||||
+ ", supportTimestamp=" + supportTimestamp
|
||||
+ ", syncAsSparkDataSourceTable=" + syncAsSparkDataSourceTable
|
||||
+ ", tableProperties='" + tableProperties + '\''
|
||||
+ ", serdeProperties='" + serdeProperties + '\''
|
||||
+ ", sparkSchemaLengthThreshold=" + sparkSchemaLengthThreshold
|
||||
+ ", dbLocation='" + dbLocation + '\''
|
||||
+ ", autoCreateDatabase=" + autoCreateDatabase
|
||||
+ ", skipLastCommitTimeSync=" + skipLastCommitTimeSync
|
||||
+ ", dropTableBeforeCreation=" + dropTableBeforeCreation
|
||||
+ ", help=" + help
|
||||
+ ", databaseName='" + databaseName + '\''
|
||||
+ ", tableName='" + tableName + '\''
|
||||
+ ", basePath='" + basePath + '\''
|
||||
+ ", baseFileFormat='" + baseFileFormat + '\''
|
||||
+ ", partitionFields=" + partitionFields
|
||||
+ ", partitionValueExtractorClass='" + partitionValueExtractorClass + '\''
|
||||
+ ", assumeDatePartitioning=" + assumeDatePartitioning
|
||||
+ ", decodePartition=" + decodePartition
|
||||
+ ", useFileListingFromMetadata=" + useFileListingFromMetadata
|
||||
+ ", isConditionalSync=" + isConditionalSync
|
||||
+ ", sparkVersion='" + sparkVersion + '\''
|
||||
+ '}';
|
||||
public String getAbsoluteBasePath() {
|
||||
return generateAbsolutePathStr(new Path(getString(META_SYNC_BASE_PATH)));
|
||||
}
|
||||
|
||||
public String getDatabasePath() {
|
||||
Path basePath = new Path(getString(META_SYNC_BASE_PATH));
|
||||
Path dbLocationPath;
|
||||
String dbLocation = getString(ADB_SYNC_DB_LOCATION);
|
||||
if (StringUtils.isNullOrEmpty(dbLocation)) {
|
||||
if (basePath.isRoot()) {
|
||||
dbLocationPath = basePath;
|
||||
} else {
|
||||
dbLocationPath = basePath.getParent();
|
||||
}
|
||||
} else {
|
||||
dbLocationPath = new Path(dbLocation);
|
||||
}
|
||||
return generateAbsolutePathStr(dbLocationPath);
|
||||
}
|
||||
|
||||
public String generateAbsolutePathStr(Path path) {
|
||||
String absolutePathStr = path.toString();
|
||||
if (path.toUri().getScheme() == null) {
|
||||
absolutePathStr = getDefaultFs() + absolutePathStr;
|
||||
}
|
||||
return absolutePathStr.endsWith("/") ? absolutePathStr : absolutePathStr + "/";
|
||||
}
|
||||
|
||||
public String getDefaultFs() {
|
||||
return getHadoopConf().get("fs.defaultFS");
|
||||
}
|
||||
|
||||
public static class AdbSyncConfigParams {
|
||||
|
||||
@ParametersDelegate()
|
||||
public HiveSyncConfig.HiveSyncConfigParams hiveSyncConfigParams = new HiveSyncConfig.HiveSyncConfigParams();
|
||||
|
||||
@Parameter(names = {"--support-timestamp"}, description = "If true, converts int64(timestamp_micros) to timestamp type")
|
||||
public Boolean supportTimestamp;
|
||||
@Parameter(names = {"--spark-datasource"}, description = "Whether sync this table as spark data source table")
|
||||
public Boolean syncAsSparkDataSourceTable;
|
||||
@Parameter(names = {"--table-properties"}, description = "Table properties, to support read hoodie table as datasource table", required = true)
|
||||
public String tableProperties;
|
||||
@Parameter(names = {"--serde-properties"}, description = "Serde properties, to support read hoodie table as datasource table", required = true)
|
||||
public String serdeProperties;
|
||||
@Parameter(names = {"--spark-schema-length-threshold"}, description = "The maximum length allowed in a single cell when storing additional schema information in Hive's metastore")
|
||||
public int sparkSchemaLengthThreshold;
|
||||
@Parameter(names = {"--hive-style-partitioning"}, description = "Whether use hive style partitioning, true if like the following style: field1=value1/field2=value2")
|
||||
public Boolean useHiveStylePartitioning;
|
||||
@Parameter(names = {"--skip-rt-sync"}, description = "Whether skip the rt table when syncing")
|
||||
public Boolean skipRTSync;
|
||||
@Parameter(names = {"--db-location"}, description = "Database location")
|
||||
public String dbLocation;
|
||||
@Parameter(names = {"--auto-create-database"}, description = "Whether auto create adb database")
|
||||
public Boolean autoCreateDatabase = true;
|
||||
@Parameter(names = {"--skip-last-commit-time-sync"}, description = "Whether skip last commit time syncing")
|
||||
public Boolean skipLastCommitTimeSync = false;
|
||||
@Parameter(names = {"--drop-table-before-creation"}, description = "Whether drop table before creation")
|
||||
public Boolean dropTableBeforeCreation = false;
|
||||
|
||||
public boolean isHelp() {
|
||||
return hiveSyncConfigParams.isHelp();
|
||||
}
|
||||
|
||||
public TypedProperties toProps() {
|
||||
final TypedProperties props = hiveSyncConfigParams.toProps();
|
||||
props.setPropertyIfNonNull(META_SYNC_DATABASE_NAME.key(), hiveSyncConfigParams.hoodieSyncConfigParams.databaseName);
|
||||
props.setPropertyIfNonNull(META_SYNC_TABLE_NAME.key(), hiveSyncConfigParams.hoodieSyncConfigParams.tableName);
|
||||
props.setPropertyIfNonNull(ADB_SYNC_USER.key(), hiveSyncConfigParams.hiveUser);
|
||||
props.setPropertyIfNonNull(ADB_SYNC_PASS.key(), hiveSyncConfigParams.hivePass);
|
||||
props.setPropertyIfNonNull(ADB_SYNC_JDBC_URL.key(), hiveSyncConfigParams.jdbcUrl);
|
||||
props.setPropertyIfNonNull(META_SYNC_BASE_PATH.key(), hiveSyncConfigParams.hoodieSyncConfigParams.basePath);
|
||||
props.setPropertyIfNonNull(META_SYNC_PARTITION_FIELDS.key(), String.join(",", hiveSyncConfigParams.hoodieSyncConfigParams.partitionFields));
|
||||
props.setPropertyIfNonNull(META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), hiveSyncConfigParams.hoodieSyncConfigParams.partitionValueExtractorClass);
|
||||
props.setPropertyIfNonNull(META_SYNC_ASSUME_DATE_PARTITION.key(), String.valueOf(hiveSyncConfigParams.hoodieSyncConfigParams.assumeDatePartitioning));
|
||||
props.setPropertyIfNonNull(ADB_SYNC_SKIP_RO_SUFFIX.key(), String.valueOf(hiveSyncConfigParams.skipROSuffix));
|
||||
props.setPropertyIfNonNull(ADB_SYNC_SKIP_RT_SYNC.key(), String.valueOf(skipRTSync));
|
||||
props.setPropertyIfNonNull(ADB_SYNC_USE_HIVE_STYLE_PARTITIONING.key(), String.valueOf(useHiveStylePartitioning));
|
||||
props.setPropertyIfNonNull(META_SYNC_USE_FILE_LISTING_FROM_METADATA.key(), String.valueOf(hiveSyncConfigParams.hoodieSyncConfigParams.useFileListingFromMetadata));
|
||||
props.setPropertyIfNonNull(ADB_SYNC_SUPPORT_TIMESTAMP.key(), String.valueOf(supportTimestamp));
|
||||
props.setPropertyIfNonNull(ADB_SYNC_TABLE_PROPERTIES.key(), tableProperties);
|
||||
props.setPropertyIfNonNull(ADB_SYNC_SERDE_PROPERTIES.key(), serdeProperties);
|
||||
props.setPropertyIfNonNull(ADB_SYNC_SYNC_AS_SPARK_DATA_SOURCE_TABLE.key(), String.valueOf(syncAsSparkDataSourceTable));
|
||||
props.setPropertyIfNonNull(ADB_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD.key(), String.valueOf(sparkSchemaLengthThreshold));
|
||||
props.setPropertyIfNonNull(META_SYNC_SPARK_VERSION.key(), hiveSyncConfigParams.hoodieSyncConfigParams.sparkVersion);
|
||||
props.setPropertyIfNonNull(ADB_SYNC_DB_LOCATION.key(), dbLocation);
|
||||
props.setPropertyIfNonNull(ADB_SYNC_AUTO_CREATE_DATABASE.key(), String.valueOf(autoCreateDatabase));
|
||||
props.setPropertyIfNonNull(ADB_SYNC_SKIP_LAST_COMMIT_TIME_SYNC.key(), String.valueOf(skipLastCommitTimeSync));
|
||||
props.setPropertyIfNonNull(ADB_SYNC_DROP_TABLE_BEFORE_CREATION.key(), String.valueOf(dropTableBeforeCreation));
|
||||
return props;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,22 +18,19 @@
|
||||
|
||||
package org.apache.hudi.sync.adb;
|
||||
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.HoodieFileFormat;
|
||||
import org.apache.hudi.common.model.HoodieTableType;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils;
|
||||
import org.apache.hudi.hive.SchemaDifference;
|
||||
import org.apache.hudi.hive.util.HiveSchemaUtil;
|
||||
import org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent;
|
||||
import org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent.PartitionEventType;
|
||||
import org.apache.hudi.sync.common.AbstractSyncTool;
|
||||
import org.apache.hudi.sync.common.HoodieSyncTool;
|
||||
import org.apache.hudi.sync.common.model.PartitionEvent;
|
||||
import org.apache.hudi.sync.common.model.PartitionEvent.PartitionEventType;
|
||||
import org.apache.hudi.sync.common.util.ConfigUtils;
|
||||
import org.apache.hudi.sync.common.util.SparkDataSourceTableUtils;
|
||||
|
||||
import com.beust.jcommander.JCommander;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat;
|
||||
import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe;
|
||||
import org.apache.parquet.schema.MessageType;
|
||||
@@ -43,8 +40,25 @@ import org.slf4j.LoggerFactory;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Properties;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_AUTO_CREATE_DATABASE;
|
||||
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_DROP_TABLE_BEFORE_CREATION;
|
||||
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD;
|
||||
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_SERDE_PROPERTIES;
|
||||
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_SKIP_LAST_COMMIT_TIME_SYNC;
|
||||
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_SKIP_RO_SUFFIX;
|
||||
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_SKIP_RT_SYNC;
|
||||
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_SUPPORT_TIMESTAMP;
|
||||
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_SYNC_AS_SPARK_DATA_SOURCE_TABLE;
|
||||
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_TABLE_PROPERTIES;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_SPARK_VERSION;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME;
|
||||
|
||||
/**
|
||||
* Adb sync tool is mainly used to sync hoodie tables to Alibaba Cloud AnalyticDB(ADB),
|
||||
* it can be used as API `AdbSyncTool.syncHoodieTable(AdbSyncConfig)` or as command
|
||||
@@ -55,45 +69,52 @@ import java.util.stream.Collectors;
|
||||
* incremental partitions will be synced as well.
|
||||
*/
|
||||
@SuppressWarnings("WeakerAccess")
|
||||
public class AdbSyncTool extends AbstractSyncTool {
|
||||
public class AdbSyncTool extends HoodieSyncTool {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(AdbSyncTool.class);
|
||||
|
||||
public static final String SUFFIX_SNAPSHOT_TABLE = "_rt";
|
||||
public static final String SUFFIX_READ_OPTIMIZED_TABLE = "_ro";
|
||||
|
||||
private final AdbSyncConfig adbSyncConfig;
|
||||
private final AbstractAdbSyncHoodieClient hoodieAdbClient;
|
||||
private final AdbSyncConfig config;
|
||||
private final String databaseName;
|
||||
private final String tableName;
|
||||
private final HoodieAdbJdbcClient syncClient;
|
||||
private final String snapshotTableName;
|
||||
private final Option<String> roTableTableName;
|
||||
|
||||
public AdbSyncTool(TypedProperties props, Configuration conf, FileSystem fs) {
|
||||
super(props, conf, fs);
|
||||
this.adbSyncConfig = new AdbSyncConfig(props);
|
||||
this.hoodieAdbClient = getHoodieAdbClient(adbSyncConfig, fs);
|
||||
switch (hoodieAdbClient.getTableType()) {
|
||||
public AdbSyncTool(Properties props) {
|
||||
super(props);
|
||||
this.config = new AdbSyncConfig(props);
|
||||
this.databaseName = config.getString(META_SYNC_DATABASE_NAME);
|
||||
this.tableName = config.getString(META_SYNC_TABLE_NAME);
|
||||
this.syncClient = new HoodieAdbJdbcClient(config);
|
||||
switch (syncClient.getTableType()) {
|
||||
case COPY_ON_WRITE:
|
||||
this.snapshotTableName = adbSyncConfig.tableName;
|
||||
this.snapshotTableName = tableName;
|
||||
this.roTableTableName = Option.empty();
|
||||
break;
|
||||
case MERGE_ON_READ:
|
||||
this.snapshotTableName = adbSyncConfig.tableName + SUFFIX_SNAPSHOT_TABLE;
|
||||
this.roTableTableName = adbSyncConfig.skipROSuffix ? Option.of(adbSyncConfig.tableName)
|
||||
: Option.of(adbSyncConfig.tableName + SUFFIX_READ_OPTIMIZED_TABLE);
|
||||
this.snapshotTableName = tableName + SUFFIX_SNAPSHOT_TABLE;
|
||||
this.roTableTableName = config.getBoolean(ADB_SYNC_SKIP_RO_SUFFIX) ? Option.of(tableName)
|
||||
: Option.of(tableName + SUFFIX_READ_OPTIMIZED_TABLE);
|
||||
break;
|
||||
default:
|
||||
throw new HoodieAdbSyncException("Unknown table type:" + hoodieAdbClient.getTableType()
|
||||
+ ", basePath:" + hoodieAdbClient.getBasePath());
|
||||
throw new HoodieAdbSyncException("Unknown table type:" + syncClient.getTableType()
|
||||
+ ", basePath:" + syncClient.getBasePath());
|
||||
}
|
||||
}
|
||||
|
||||
private AbstractAdbSyncHoodieClient getHoodieAdbClient(AdbSyncConfig adbSyncConfig, FileSystem fs) {
|
||||
return new HoodieAdbJdbcClient(adbSyncConfig, fs);
|
||||
@Override
|
||||
public void close() {
|
||||
if (syncClient != null) {
|
||||
syncClient.close();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void syncHoodieTable() {
|
||||
try {
|
||||
switch (hoodieAdbClient.getTableType()) {
|
||||
switch (syncClient.getTableType()) {
|
||||
case COPY_ON_WRITE:
|
||||
syncHoodieTable(snapshotTableName, false, false);
|
||||
break;
|
||||
@@ -101,39 +122,38 @@ public class AdbSyncTool extends AbstractSyncTool {
|
||||
// Sync a ro table for MOR table
|
||||
syncHoodieTable(roTableTableName.get(), false, true);
|
||||
// Sync a rt table for MOR table
|
||||
if (!adbSyncConfig.skipRTSync) {
|
||||
if (!config.getBoolean(ADB_SYNC_SKIP_RT_SYNC)) {
|
||||
syncHoodieTable(snapshotTableName, true, false);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
throw new HoodieAdbSyncException("Unknown table type:" + hoodieAdbClient.getTableType()
|
||||
+ ", basePath:" + hoodieAdbClient.getBasePath());
|
||||
throw new HoodieAdbSyncException("Unknown table type:" + syncClient.getTableType()
|
||||
+ ", basePath:" + syncClient.getBasePath());
|
||||
}
|
||||
} catch (Exception re) {
|
||||
throw new HoodieAdbSyncException("Sync hoodie table to ADB failed, tableName:" + adbSyncConfig.tableName, re);
|
||||
throw new HoodieAdbSyncException("Sync hoodie table to ADB failed, tableName:" + tableName, re);
|
||||
} finally {
|
||||
hoodieAdbClient.close();
|
||||
syncClient.close();
|
||||
}
|
||||
}
|
||||
|
||||
private void syncHoodieTable(String tableName, boolean useRealtimeInputFormat,
|
||||
boolean readAsOptimized) throws Exception {
|
||||
private void syncHoodieTable(String tableName, boolean useRealtimeInputFormat, boolean readAsOptimized) throws Exception {
|
||||
LOG.info("Try to sync hoodie table, tableName:{}, path:{}, tableType:{}",
|
||||
tableName, hoodieAdbClient.getBasePath(), hoodieAdbClient.getTableType());
|
||||
tableName, syncClient.getBasePath(), syncClient.getTableType());
|
||||
|
||||
if (adbSyncConfig.autoCreateDatabase) {
|
||||
if (config.getBoolean(ADB_SYNC_AUTO_CREATE_DATABASE)) {
|
||||
try {
|
||||
synchronized (AdbSyncTool.class) {
|
||||
if (!hoodieAdbClient.databaseExists(adbSyncConfig.databaseName)) {
|
||||
hoodieAdbClient.createDatabase(adbSyncConfig.databaseName);
|
||||
if (!syncClient.databaseExists(databaseName)) {
|
||||
syncClient.createDatabase(databaseName);
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw new HoodieAdbSyncException("Failed to create database:" + adbSyncConfig.databaseName
|
||||
throw new HoodieAdbSyncException("Failed to create database:" + databaseName
|
||||
+ ", useRealtimeInputFormat = " + useRealtimeInputFormat, e);
|
||||
}
|
||||
} else if (!hoodieAdbClient.databaseExists(adbSyncConfig.databaseName)) {
|
||||
throw new HoodieAdbSyncException("ADB database does not exists:" + adbSyncConfig.databaseName);
|
||||
} else if (!syncClient.databaseExists(databaseName)) {
|
||||
throw new HoodieAdbSyncException("ADB database does not exists:" + databaseName);
|
||||
}
|
||||
|
||||
// Currently HoodieBootstrapRelation does support reading bootstrap MOR rt table,
|
||||
@@ -141,22 +161,22 @@ public class AdbSyncTool extends AbstractSyncTool {
|
||||
// by the data source way (which will use the HoodieBootstrapRelation).
|
||||
// TODO after we support bootstrap MOR rt table in HoodieBootstrapRelation[HUDI-2071],
|
||||
// we can remove this logical.
|
||||
if (hoodieAdbClient.isBootstrap()
|
||||
&& hoodieAdbClient.getTableType() == HoodieTableType.MERGE_ON_READ
|
||||
if (syncClient.isBootstrap()
|
||||
&& syncClient.getTableType() == HoodieTableType.MERGE_ON_READ
|
||||
&& !readAsOptimized) {
|
||||
adbSyncConfig.syncAsSparkDataSourceTable = false;
|
||||
config.setValue(ADB_SYNC_SYNC_AS_SPARK_DATA_SOURCE_TABLE, "false");
|
||||
LOG.info("Disable sync as spark datasource table for mor rt table:{}", tableName);
|
||||
}
|
||||
|
||||
if (adbSyncConfig.dropTableBeforeCreation) {
|
||||
if (config.getBoolean(ADB_SYNC_DROP_TABLE_BEFORE_CREATION)) {
|
||||
LOG.info("Drop table before creation, tableName:{}", tableName);
|
||||
hoodieAdbClient.dropTable(tableName);
|
||||
syncClient.dropTable(tableName);
|
||||
}
|
||||
|
||||
boolean tableExists = hoodieAdbClient.tableExists(tableName);
|
||||
boolean tableExists = syncClient.tableExists(tableName);
|
||||
|
||||
// Get the parquet schema for this table looking at the latest commit
|
||||
MessageType schema = hoodieAdbClient.getDataSchema();
|
||||
MessageType schema = syncClient.getStorageSchema();
|
||||
|
||||
// Sync schema if needed
|
||||
syncSchema(tableName, tableExists, useRealtimeInputFormat, readAsOptimized, schema);
|
||||
@@ -165,16 +185,16 @@ public class AdbSyncTool extends AbstractSyncTool {
|
||||
// Get the last time we successfully synced partitions
|
||||
Option<String> lastCommitTimeSynced = Option.empty();
|
||||
if (tableExists) {
|
||||
lastCommitTimeSynced = hoodieAdbClient.getLastCommitTimeSynced(tableName);
|
||||
lastCommitTimeSynced = syncClient.getLastCommitTimeSynced(tableName);
|
||||
}
|
||||
LOG.info("Last commit time synced was found:{}", lastCommitTimeSynced.orElse("null"));
|
||||
|
||||
// Scan synced partitions
|
||||
List<String> writtenPartitionsSince;
|
||||
if (adbSyncConfig.partitionFields.isEmpty()) {
|
||||
if (config.getSplitStrings(META_SYNC_PARTITION_FIELDS).isEmpty()) {
|
||||
writtenPartitionsSince = new ArrayList<>();
|
||||
} else {
|
||||
writtenPartitionsSince = hoodieAdbClient.getPartitionsWrittenToSince(lastCommitTimeSynced);
|
||||
writtenPartitionsSince = syncClient.getPartitionsWrittenToSince(lastCommitTimeSynced);
|
||||
}
|
||||
LOG.info("Scan partitions complete, partitionNum:{}", writtenPartitionsSince.size());
|
||||
|
||||
@@ -183,8 +203,8 @@ public class AdbSyncTool extends AbstractSyncTool {
|
||||
|
||||
// Update sync commit time
|
||||
// whether to skip syncing commit time stored in tbl properties, since it is time consuming.
|
||||
if (!adbSyncConfig.skipLastCommitTimeSync) {
|
||||
hoodieAdbClient.updateLastCommitTimeSynced(tableName);
|
||||
if (!config.getBoolean(ADB_SYNC_SKIP_LAST_COMMIT_TIME_SYNC)) {
|
||||
syncClient.updateLastCommitTimeSynced(tableName);
|
||||
}
|
||||
LOG.info("Sync complete for table:{}", tableName);
|
||||
}
|
||||
@@ -200,14 +220,14 @@ public class AdbSyncTool extends AbstractSyncTool {
|
||||
* @param schema The extracted schema
|
||||
*/
|
||||
private void syncSchema(String tableName, boolean tableExists, boolean useRealTimeInputFormat,
|
||||
boolean readAsOptimized, MessageType schema) throws Exception {
|
||||
boolean readAsOptimized, MessageType schema) {
|
||||
// Append spark table properties & serde properties
|
||||
Map<String, String> tableProperties = ConfigUtils.toMap(adbSyncConfig.tableProperties);
|
||||
Map<String, String> serdeProperties = ConfigUtils.toMap(adbSyncConfig.serdeProperties);
|
||||
if (adbSyncConfig.syncAsSparkDataSourceTable) {
|
||||
Map<String, String> sparkTableProperties = getSparkTableProperties(adbSyncConfig.partitionFields,
|
||||
adbSyncConfig.sparkVersion, adbSyncConfig.sparkSchemaLengthThreshold, schema);
|
||||
Map<String, String> sparkSerdeProperties = getSparkSerdeProperties(readAsOptimized, adbSyncConfig.basePath);
|
||||
Map<String, String> tableProperties = ConfigUtils.toMap(config.getString(ADB_SYNC_TABLE_PROPERTIES));
|
||||
Map<String, String> serdeProperties = ConfigUtils.toMap(config.getString(ADB_SYNC_SERDE_PROPERTIES));
|
||||
if (config.getBoolean(ADB_SYNC_SYNC_AS_SPARK_DATA_SOURCE_TABLE)) {
|
||||
Map<String, String> sparkTableProperties = SparkDataSourceTableUtils.getSparkTableProperties(config.getSplitStrings(META_SYNC_PARTITION_FIELDS),
|
||||
config.getString(META_SYNC_SPARK_VERSION), config.getInt(ADB_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD), schema);
|
||||
Map<String, String> sparkSerdeProperties = SparkDataSourceTableUtils.getSparkSerdeProperties(readAsOptimized, config.getString(META_SYNC_BASE_PATH));
|
||||
tableProperties.putAll(sparkTableProperties);
|
||||
serdeProperties.putAll(sparkSerdeProperties);
|
||||
LOG.info("Sync as spark datasource table, tableName:{}, tableExists:{}, tableProperties:{}, sederProperties:{}",
|
||||
@@ -222,16 +242,16 @@ public class AdbSyncTool extends AbstractSyncTool {
|
||||
// Custom serde will not work with ALTER TABLE REPLACE COLUMNS
|
||||
// https://github.com/apache/hive/blob/release-1.1.0/ql/src/java/org/apache/hadoop/hive
|
||||
// /ql/exec/DDLTask.java#L3488
|
||||
hoodieAdbClient.createTable(tableName, schema, inputFormatClassName, MapredParquetOutputFormat.class.getName(),
|
||||
syncClient.createTable(tableName, schema, inputFormatClassName, MapredParquetOutputFormat.class.getName(),
|
||||
ParquetHiveSerDe.class.getName(), serdeProperties, tableProperties);
|
||||
} else {
|
||||
// Check if the table schema has evolved
|
||||
Map<String, String> tableSchema = hoodieAdbClient.getTableSchema(tableName);
|
||||
SchemaDifference schemaDiff = HiveSchemaUtil.getSchemaDifference(schema, tableSchema, adbSyncConfig.partitionFields,
|
||||
adbSyncConfig.supportTimestamp);
|
||||
Map<String, String> tableSchema = syncClient.getMetastoreSchema(tableName);
|
||||
SchemaDifference schemaDiff = HiveSchemaUtil.getSchemaDifference(schema, tableSchema, config.getSplitStrings(META_SYNC_PARTITION_FIELDS),
|
||||
config.getBoolean(ADB_SYNC_SUPPORT_TIMESTAMP));
|
||||
if (!schemaDiff.isEmpty()) {
|
||||
LOG.info("Schema difference found for table:{}", tableName);
|
||||
hoodieAdbClient.updateTableDefinition(tableName, schemaDiff);
|
||||
syncClient.updateTableDefinition(tableName, schemaDiff);
|
||||
} else {
|
||||
LOG.info("No Schema difference for table:{}", tableName);
|
||||
}
|
||||
@@ -244,19 +264,19 @@ public class AdbSyncTool extends AbstractSyncTool {
|
||||
*/
|
||||
private void syncPartitions(String tableName, List<String> writtenPartitionsSince) {
|
||||
try {
|
||||
if (adbSyncConfig.partitionFields.isEmpty()) {
|
||||
if (config.getSplitStrings(META_SYNC_PARTITION_FIELDS).isEmpty()) {
|
||||
LOG.info("Not a partitioned table.");
|
||||
return;
|
||||
}
|
||||
|
||||
Map<List<String>, String> partitions = hoodieAdbClient.scanTablePartitions(tableName);
|
||||
List<PartitionEvent> partitionEvents = hoodieAdbClient.getPartitionEvents(partitions, writtenPartitionsSince);
|
||||
Map<List<String>, String> partitions = syncClient.scanTablePartitions(tableName);
|
||||
List<PartitionEvent> partitionEvents = syncClient.getPartitionEvents(partitions, writtenPartitionsSince);
|
||||
List<String> newPartitions = filterPartitions(partitionEvents, PartitionEventType.ADD);
|
||||
LOG.info("New Partitions:{}", newPartitions);
|
||||
hoodieAdbClient.addPartitionsToTable(tableName, newPartitions);
|
||||
syncClient.addPartitionsToTable(tableName, newPartitions);
|
||||
List<String> updatePartitions = filterPartitions(partitionEvents, PartitionEventType.UPDATE);
|
||||
LOG.info("Changed Partitions:{}", updatePartitions);
|
||||
hoodieAdbClient.updatePartitionsToTable(tableName, updatePartitions);
|
||||
syncClient.updatePartitionsToTable(tableName, updatePartitions);
|
||||
} catch (Exception e) {
|
||||
throw new HoodieAdbSyncException("Failed to sync partitions for table:" + tableName, e);
|
||||
}
|
||||
@@ -268,16 +288,13 @@ public class AdbSyncTool extends AbstractSyncTool {
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
// parse the params
|
||||
final AdbSyncConfig cfg = new AdbSyncConfig();
|
||||
JCommander cmd = new JCommander(cfg, null, args);
|
||||
if (cfg.help || args.length == 0) {
|
||||
final AdbSyncConfig.AdbSyncConfigParams params = new AdbSyncConfig.AdbSyncConfigParams();
|
||||
JCommander cmd = JCommander.newBuilder().addObject(params).build();
|
||||
cmd.parse(args);
|
||||
if (params.isHelp()) {
|
||||
cmd.usage();
|
||||
System.exit(1);
|
||||
System.exit(0);
|
||||
}
|
||||
|
||||
Configuration hadoopConf = new Configuration();
|
||||
FileSystem fs = FSUtils.getFs(cfg.basePath, hadoopConf);
|
||||
new AdbSyncTool(AdbSyncConfig.toProps(cfg), hadoopConf, fs).syncHoodieTable();
|
||||
new AdbSyncTool(params.toProps()).syncHoodieTable();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -23,12 +23,12 @@ import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.StringUtils;
|
||||
import org.apache.hudi.common.util.ValidationUtils;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.hive.HiveSyncConfig;
|
||||
import org.apache.hudi.hive.HoodieHiveSyncException;
|
||||
import org.apache.hudi.hive.SchemaDifference;
|
||||
import org.apache.hudi.hive.util.HiveSchemaUtil;
|
||||
import org.apache.hudi.sync.common.HoodieSyncClient;
|
||||
import org.apache.hudi.sync.common.model.PartitionEvent;
|
||||
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.parquet.schema.MessageType;
|
||||
import org.slf4j.Logger;
|
||||
@@ -47,13 +47,21 @@ import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.function.Function;
|
||||
|
||||
public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
|
||||
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_JDBC_URL;
|
||||
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_PASS;
|
||||
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_USER;
|
||||
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_USE_HIVE_STYLE_PARTITIONING;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS;
|
||||
|
||||
public class HoodieAdbJdbcClient extends HoodieSyncClient {
|
||||
|
||||
private static final Logger LOG = LoggerFactory.getLogger(HoodieAdbJdbcClient.class);
|
||||
|
||||
public static final String HOODIE_LAST_COMMIT_TIME_SYNC = "hoodie_last_sync";
|
||||
// Make sure we have the jdbc driver in classpath
|
||||
private static final String DRIVER_NAME = "com.mysql.jdbc.Driver";
|
||||
public static final String ADB_ESCAPE_CHARACTER = "";
|
||||
private static final String TBL_PROPERTIES_STR = "TBLPROPERTIES";
|
||||
|
||||
static {
|
||||
@@ -64,12 +72,16 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
|
||||
}
|
||||
}
|
||||
|
||||
private final AdbSyncConfig config;
|
||||
private final String databaseName;
|
||||
private Connection connection;
|
||||
|
||||
public HoodieAdbJdbcClient(AdbSyncConfig syncConfig, FileSystem fs) {
|
||||
super(syncConfig, fs);
|
||||
public HoodieAdbJdbcClient(AdbSyncConfig config) {
|
||||
super(config);
|
||||
this.config = config;
|
||||
this.databaseName = config.getString(META_SYNC_DATABASE_NAME);
|
||||
createAdbConnection();
|
||||
LOG.info("Init adb jdbc client success, jdbcUrl:{}", syncConfig.jdbcUrl);
|
||||
LOG.info("Init adb jdbc client success, jdbcUrl:{}", config.getString(ADB_SYNC_JDBC_URL));
|
||||
}
|
||||
|
||||
private void createAdbConnection() {
|
||||
@@ -82,7 +94,9 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
|
||||
}
|
||||
try {
|
||||
this.connection = DriverManager.getConnection(
|
||||
adbSyncConfig.jdbcUrl, adbSyncConfig.adbUser, adbSyncConfig.adbPass);
|
||||
config.getString(ADB_SYNC_JDBC_URL),
|
||||
config.getString(ADB_SYNC_USER),
|
||||
config.getString(ADB_SYNC_PASS));
|
||||
} catch (SQLException e) {
|
||||
throw new HoodieException("Cannot create adb connection ", e);
|
||||
}
|
||||
@@ -91,12 +105,12 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
|
||||
|
||||
@Override
|
||||
public void createTable(String tableName, MessageType storageSchema, String inputFormatClass,
|
||||
String outputFormatClass, String serdeClass,
|
||||
Map<String, String> serdeProperties, Map<String, String> tableProperties) {
|
||||
String outputFormatClass, String serdeClass,
|
||||
Map<String, String> serdeProperties, Map<String, String> tableProperties) {
|
||||
try {
|
||||
LOG.info("Creating table:{}", tableName);
|
||||
String createSQLQuery = HiveSchemaUtil.generateCreateDDL(tableName, storageSchema,
|
||||
getHiveSyncConfig(), inputFormatClass, outputFormatClass, serdeClass, serdeProperties, tableProperties);
|
||||
config, inputFormatClass, outputFormatClass, serdeClass, serdeProperties, tableProperties);
|
||||
executeAdbSql(createSQLQuery);
|
||||
} catch (IOException e) {
|
||||
throw new HoodieException("Fail to create table:" + tableName, e);
|
||||
@@ -106,17 +120,18 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
|
||||
@Override
|
||||
public void dropTable(String tableName) {
|
||||
LOG.info("Dropping table:{}", tableName);
|
||||
String dropTable = "drop table if exists `" + adbSyncConfig.databaseName + "`.`" + tableName + "`";
|
||||
String dropTable = "drop table if exists `" + databaseName + "`.`" + tableName + "`";
|
||||
executeAdbSql(dropTable);
|
||||
}
|
||||
|
||||
public Map<String, String> getTableSchema(String tableName) {
|
||||
@Override
|
||||
public Map<String, String> getMetastoreSchema(String tableName) {
|
||||
Map<String, String> schema = new HashMap<>();
|
||||
ResultSet result = null;
|
||||
try {
|
||||
DatabaseMetaData databaseMetaData = connection.getMetaData();
|
||||
result = databaseMetaData.getColumns(adbSyncConfig.databaseName,
|
||||
adbSyncConfig.databaseName, tableName, null);
|
||||
result = databaseMetaData.getColumns(databaseName,
|
||||
databaseName, tableName, null);
|
||||
while (result.next()) {
|
||||
String columnName = result.getString(4);
|
||||
String columnType = result.getString(6);
|
||||
@@ -174,7 +189,7 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
|
||||
}
|
||||
|
||||
public void createDatabase(String databaseName) {
|
||||
String rootPath = getDatabasePath();
|
||||
String rootPath = config.getDatabasePath();
|
||||
LOG.info("Creating database:{}, databaseLocation:{}", databaseName, rootPath);
|
||||
String sql = constructCreateDatabaseSql(rootPath);
|
||||
executeAdbSql(sql);
|
||||
@@ -197,7 +212,7 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean doesTableExist(String tableName) {
|
||||
public boolean tableExists(String tableName) {
|
||||
String sql = constructShowLikeTableSql(tableName);
|
||||
Function<ResultSet, Boolean> transform = resultSet -> {
|
||||
try {
|
||||
@@ -209,11 +224,6 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
|
||||
return executeQuerySQL(sql, transform);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean tableExists(String tableName) {
|
||||
return doesTableExist(tableName);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Option<String> getLastCommitTimeSynced(String tableName) {
|
||||
String sql = constructShowCreateTableSql(tableName);
|
||||
@@ -251,7 +261,7 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
|
||||
@Override
|
||||
public void updateLastCommitTimeSynced(String tableName) {
|
||||
// Set the last commit time from the TBLProperties
|
||||
String lastCommitSynced = activeTimeline.lastInstant().get().getTimestamp();
|
||||
String lastCommitSynced = getActiveTimeline().lastInstant().get().getTimestamp();
|
||||
try {
|
||||
String sql = constructUpdateTblPropertiesSql(tableName, lastCommitSynced);
|
||||
executeAdbSql(sql);
|
||||
@@ -275,6 +285,11 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
|
||||
throw new UnsupportedOperationException("Not support deleteLastReplicatedTimeStamp yet");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void updateTableProperties(String tableName, Map<String, String> tableProperties) {
|
||||
throw new UnsupportedOperationException("Not support updateTableProperties yet");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void updatePartitionsToTable(String tableName, List<String> changedPartitions) {
|
||||
if (changedPartitions.isEmpty()) {
|
||||
@@ -294,6 +309,9 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
|
||||
throw new UnsupportedOperationException("Not support dropPartitions yet.");
|
||||
}
|
||||
|
||||
/**
|
||||
* TODO migrate to implementation of {@link #getAllPartitions(String)}
|
||||
*/
|
||||
public Map<List<String>, String> scanTablePartitions(String tableName) {
|
||||
String sql = constructShowPartitionSql(tableName);
|
||||
Function<ResultSet, Map<List<String>, String>> transform = resultSet -> {
|
||||
@@ -304,7 +322,7 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
|
||||
String str = resultSet.getString(1);
|
||||
if (!StringUtils.isNullOrEmpty(str)) {
|
||||
List<String> values = partitionValueExtractor.extractPartitionValuesInPath(str);
|
||||
Path storagePartitionPath = FSUtils.getPartitionPath(adbSyncConfig.basePath, String.join("/", values));
|
||||
Path storagePartitionPath = FSUtils.getPartitionPath(config.getString(META_SYNC_BASE_PATH), String.join("/", values));
|
||||
String fullStoragePartitionPath = Path.getPathWithoutSchemeAndAuthority(storagePartitionPath).toUri().getPath();
|
||||
partitions.put(values, fullStoragePartitionPath);
|
||||
}
|
||||
@@ -318,6 +336,9 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
|
||||
return executeQuerySQL(sql, transform);
|
||||
}
|
||||
|
||||
/**
|
||||
* TODO align with {@link org.apache.hudi.sync.common.HoodieMetaSyncOperations#updateTableSchema}
|
||||
*/
|
||||
public void updateTableDefinition(String tableName, SchemaDifference schemaDiff) {
|
||||
LOG.info("Adding columns for table:{}", tableName);
|
||||
schemaDiff.getAddColumnTypes().forEach((columnName, columnType) ->
|
||||
@@ -332,12 +353,12 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
|
||||
|
||||
private String constructAddPartitionsSql(String tableName, List<String> partitions) {
|
||||
StringBuilder sqlBuilder = new StringBuilder("alter table `");
|
||||
sqlBuilder.append(adbSyncConfig.databaseName).append("`").append(".`")
|
||||
sqlBuilder.append(databaseName).append("`").append(".`")
|
||||
.append(tableName).append("`").append(" add if not exists ");
|
||||
for (String partition : partitions) {
|
||||
String partitionClause = getPartitionClause(partition);
|
||||
Path partitionPath = FSUtils.getPartitionPath(adbSyncConfig.basePath, partition);
|
||||
String fullPartitionPathStr = generateAbsolutePathStr(partitionPath);
|
||||
Path partitionPath = FSUtils.getPartitionPath(config.getString(META_SYNC_BASE_PATH), partition);
|
||||
String fullPartitionPathStr = config.generateAbsolutePathStr(partitionPath);
|
||||
sqlBuilder.append(" partition (").append(partitionClause).append(") location '")
|
||||
.append(fullPartitionPathStr).append("' ");
|
||||
}
|
||||
@@ -347,14 +368,14 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
|
||||
|
||||
private List<String> constructChangePartitionsSql(String tableName, List<String> partitions) {
|
||||
List<String> changePartitions = new ArrayList<>();
|
||||
String useDatabase = "use `" + adbSyncConfig.databaseName + "`";
|
||||
String useDatabase = "use `" + databaseName + "`";
|
||||
changePartitions.add(useDatabase);
|
||||
|
||||
String alterTable = "alter table `" + tableName + "`";
|
||||
for (String partition : partitions) {
|
||||
String partitionClause = getPartitionClause(partition);
|
||||
Path partitionPath = FSUtils.getPartitionPath(adbSyncConfig.basePath, partition);
|
||||
String fullPartitionPathStr = generateAbsolutePathStr(partitionPath);
|
||||
Path partitionPath = FSUtils.getPartitionPath(config.getString(META_SYNC_BASE_PATH), partition);
|
||||
String fullPartitionPathStr = config.generateAbsolutePathStr(partitionPath);
|
||||
String changePartition = alterTable + " add if not exists partition (" + partitionClause
|
||||
+ ") location '" + fullPartitionPathStr + "'";
|
||||
changePartitions.add(changePartition);
|
||||
@@ -371,32 +392,32 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
|
||||
*/
|
||||
private String getPartitionClause(String partition) {
|
||||
List<String> partitionValues = partitionValueExtractor.extractPartitionValuesInPath(partition);
|
||||
ValidationUtils.checkArgument(adbSyncConfig.partitionFields.size() == partitionValues.size(),
|
||||
"Partition key parts " + adbSyncConfig.partitionFields
|
||||
ValidationUtils.checkArgument(config.getSplitStrings(META_SYNC_PARTITION_FIELDS).size() == partitionValues.size(),
|
||||
"Partition key parts " + config.getSplitStrings(META_SYNC_PARTITION_FIELDS)
|
||||
+ " does not match with partition values " + partitionValues + ". Check partition strategy. ");
|
||||
List<String> partBuilder = new ArrayList<>();
|
||||
for (int i = 0; i < adbSyncConfig.partitionFields.size(); i++) {
|
||||
partBuilder.add(adbSyncConfig.partitionFields.get(i) + "='" + partitionValues.get(i) + "'");
|
||||
for (int i = 0; i < config.getSplitStrings(META_SYNC_PARTITION_FIELDS).size(); i++) {
|
||||
partBuilder.add(config.getSplitStrings(META_SYNC_PARTITION_FIELDS).get(i) + "='" + partitionValues.get(i) + "'");
|
||||
}
|
||||
|
||||
return String.join(",", partBuilder);
|
||||
}
|
||||
|
||||
private String constructShowPartitionSql(String tableName) {
|
||||
return String.format("show partitions `%s`.`%s`", adbSyncConfig.databaseName, tableName);
|
||||
return String.format("show partitions `%s`.`%s`", databaseName, tableName);
|
||||
}
|
||||
|
||||
private String constructShowCreateTableSql(String tableName) {
|
||||
return String.format("show create table `%s`.`%s`", adbSyncConfig.databaseName, tableName);
|
||||
return String.format("show create table `%s`.`%s`", databaseName, tableName);
|
||||
}
|
||||
|
||||
private String constructShowLikeTableSql(String tableName) {
|
||||
return String.format("show tables from `%s` like '%s'", adbSyncConfig.databaseName, tableName);
|
||||
return String.format("show tables from `%s` like '%s'", databaseName, tableName);
|
||||
}
|
||||
|
||||
private String constructCreateDatabaseSql(String rootPath) {
|
||||
return String.format("create database if not exists `%s` with dbproperties(catalog = 'oss', location = '%s')",
|
||||
adbSyncConfig.databaseName, rootPath);
|
||||
databaseName, rootPath);
|
||||
}
|
||||
|
||||
private String constructShowCreateDatabaseSql(String databaseName) {
|
||||
@@ -405,26 +426,69 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
|
||||
|
||||
private String constructUpdateTblPropertiesSql(String tableName, String lastCommitSynced) {
|
||||
return String.format("alter table `%s`.`%s` set tblproperties('%s' = '%s')",
|
||||
adbSyncConfig.databaseName, tableName, HOODIE_LAST_COMMIT_TIME_SYNC, lastCommitSynced);
|
||||
databaseName, tableName, HOODIE_LAST_COMMIT_TIME_SYNC, lastCommitSynced);
|
||||
}
|
||||
|
||||
private String constructAddColumnSql(String tableName, String columnName, String columnType) {
|
||||
return String.format("alter table `%s`.`%s` add columns(`%s` %s)",
|
||||
adbSyncConfig.databaseName, tableName, columnName, columnType);
|
||||
databaseName, tableName, columnName, columnType);
|
||||
}
|
||||
|
||||
private String constructChangeColumnSql(String tableName, String columnName, String columnType) {
|
||||
return String.format("alter table `%s`.`%s` change `%s` `%s` %s",
|
||||
adbSyncConfig.databaseName, tableName, columnName, columnName, columnType);
|
||||
databaseName, tableName, columnName, columnName, columnType);
|
||||
}
|
||||
|
||||
private HiveSyncConfig getHiveSyncConfig() {
|
||||
HiveSyncConfig hiveSyncConfig = new HiveSyncConfig();
|
||||
hiveSyncConfig.partitionFields = adbSyncConfig.partitionFields;
|
||||
hiveSyncConfig.databaseName = adbSyncConfig.databaseName;
|
||||
Path basePath = new Path(adbSyncConfig.basePath);
|
||||
hiveSyncConfig.basePath = generateAbsolutePathStr(basePath);
|
||||
return hiveSyncConfig;
|
||||
/**
|
||||
* TODO align with {@link HoodieSyncClient#getPartitionEvents}
|
||||
*/
|
||||
public List<PartitionEvent> getPartitionEvents(Map<List<String>, String> tablePartitions, List<String> partitionStoragePartitions) {
|
||||
Map<String, String> paths = new HashMap<>();
|
||||
|
||||
for (Map.Entry<List<String>, String> entry : tablePartitions.entrySet()) {
|
||||
List<String> partitionValues = entry.getKey();
|
||||
String fullTablePartitionPath = entry.getValue();
|
||||
paths.put(String.join(", ", partitionValues), fullTablePartitionPath);
|
||||
}
|
||||
List<PartitionEvent> events = new ArrayList<>();
|
||||
for (String storagePartition : partitionStoragePartitions) {
|
||||
Path storagePartitionPath = FSUtils.getPartitionPath(config.getString(META_SYNC_BASE_PATH), storagePartition);
|
||||
String fullStoragePartitionPath = Path.getPathWithoutSchemeAndAuthority(storagePartitionPath).toUri().getPath();
|
||||
// Check if the partition values or if hdfs path is the same
|
||||
List<String> storagePartitionValues = partitionValueExtractor.extractPartitionValuesInPath(storagePartition);
|
||||
if (config.getBoolean(ADB_SYNC_USE_HIVE_STYLE_PARTITIONING)) {
|
||||
String partition = String.join("/", storagePartitionValues);
|
||||
storagePartitionPath = FSUtils.getPartitionPath(config.getString(META_SYNC_BASE_PATH), partition);
|
||||
fullStoragePartitionPath = Path.getPathWithoutSchemeAndAuthority(storagePartitionPath).toUri().getPath();
|
||||
}
|
||||
if (!storagePartitionValues.isEmpty()) {
|
||||
String storageValue = String.join(", ", storagePartitionValues);
|
||||
if (!paths.containsKey(storageValue)) {
|
||||
events.add(PartitionEvent.newPartitionAddEvent(storagePartition));
|
||||
} else if (!paths.get(storageValue).equals(fullStoragePartitionPath)) {
|
||||
events.add(PartitionEvent.newPartitionUpdateEvent(storagePartition));
|
||||
}
|
||||
}
|
||||
}
|
||||
return events;
|
||||
}
|
||||
|
||||
public void closeQuietly(ResultSet resultSet, Statement stmt) {
|
||||
try {
|
||||
if (stmt != null) {
|
||||
stmt.close();
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
LOG.warn("Could not close the statement opened ", e);
|
||||
}
|
||||
|
||||
try {
|
||||
if (resultSet != null) {
|
||||
resultSet.close();
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
LOG.warn("Could not close the resultset opened ", e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
||||
@@ -19,47 +19,72 @@
|
||||
|
||||
package org.apache.hudi.sync.adb;
|
||||
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.sync.common.util.ConfigUtils;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Properties;
|
||||
|
||||
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_DB_LOCATION;
|
||||
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_JDBC_URL;
|
||||
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_PASS;
|
||||
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_SERDE_PROPERTIES;
|
||||
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_SKIP_RO_SUFFIX;
|
||||
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_TABLE_PROPERTIES;
|
||||
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_USER;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_ASSUME_DATE_PARTITION;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
public class TestAdbSyncConfig {
|
||||
|
||||
@Test
|
||||
public void testCopy() {
|
||||
AdbSyncConfig adbSyncConfig = new AdbSyncConfig();
|
||||
adbSyncConfig.partitionFields = Arrays.asList("a", "b");
|
||||
adbSyncConfig.basePath = "/tmp";
|
||||
adbSyncConfig.assumeDatePartitioning = true;
|
||||
adbSyncConfig.databaseName = "test";
|
||||
adbSyncConfig.tableName = "test";
|
||||
adbSyncConfig.adbUser = "adb";
|
||||
adbSyncConfig.adbPass = "adb";
|
||||
adbSyncConfig.jdbcUrl = "jdbc:mysql://localhost:3306";
|
||||
adbSyncConfig.skipROSuffix = false;
|
||||
adbSyncConfig.tableProperties = "spark.sql.sources.provider= 'hudi'\\n"
|
||||
+ "spark.sql.sources.schema.numParts = '1'\\n "
|
||||
+ "spark.sql.sources.schema.part.0 ='xx'\\n "
|
||||
+ "spark.sql.sources.schema.numPartCols = '1'\\n"
|
||||
+ "spark.sql.sources.schema.partCol.0 = 'dt'";
|
||||
adbSyncConfig.serdeProperties = "'path'='/tmp/test_db/tbl'";
|
||||
adbSyncConfig.dbLocation = "file://tmp/test_db";
|
||||
public void testInitConfig() {
|
||||
Properties props = new Properties();
|
||||
props.setProperty(META_SYNC_PARTITION_FIELDS.key(), "a,b");
|
||||
props.setProperty(META_SYNC_BASE_PATH.key(), "/tmp");
|
||||
props.setProperty(META_SYNC_ASSUME_DATE_PARTITION.key(), "true");
|
||||
props.setProperty(META_SYNC_DATABASE_NAME.key(), "test");
|
||||
props.setProperty(META_SYNC_TABLE_NAME.key(), "test");
|
||||
props.setProperty(ADB_SYNC_USER.key(), "adb");
|
||||
props.setProperty(ADB_SYNC_PASS.key(), "adb");
|
||||
props.setProperty(ADB_SYNC_JDBC_URL.key(), "jdbc:mysql://localhost:3306");
|
||||
props.setProperty(ADB_SYNC_SKIP_RO_SUFFIX.key(), "false");
|
||||
String tableProps = "spark.sql.sources.provider=hudi\n"
|
||||
+ "spark.sql.sources.schema.numParts=1\n"
|
||||
+ "spark.sql.sources.schema.part.0=xx\n"
|
||||
+ "spark.sql.sources.schema.numPartCols=1\n"
|
||||
+ "spark.sql.sources.schema.partCol.0=dt";
|
||||
props.setProperty(ADB_SYNC_TABLE_PROPERTIES.key(), tableProps);
|
||||
props.setProperty(ADB_SYNC_SERDE_PROPERTIES.key(), "path=/tmp/test_db/tbl");
|
||||
props.setProperty(ADB_SYNC_DB_LOCATION.key(), "file://tmp/test_db");
|
||||
|
||||
TypedProperties props = AdbSyncConfig.toProps(adbSyncConfig);
|
||||
AdbSyncConfig copied = new AdbSyncConfig(props);
|
||||
|
||||
assertEquals(copied.partitionFields, adbSyncConfig.partitionFields);
|
||||
assertEquals(copied.basePath, adbSyncConfig.basePath);
|
||||
assertEquals(copied.assumeDatePartitioning, adbSyncConfig.assumeDatePartitioning);
|
||||
assertEquals(copied.databaseName, adbSyncConfig.databaseName);
|
||||
assertEquals(copied.tableName, adbSyncConfig.tableName);
|
||||
assertEquals(copied.adbUser, adbSyncConfig.adbUser);
|
||||
assertEquals(copied.adbPass, adbSyncConfig.adbPass);
|
||||
assertEquals(copied.basePath, adbSyncConfig.basePath);
|
||||
assertEquals(copied.jdbcUrl, adbSyncConfig.jdbcUrl);
|
||||
assertEquals(copied.skipROSuffix, adbSyncConfig.skipROSuffix);
|
||||
assertEquals(copied.supportTimestamp, adbSyncConfig.supportTimestamp);
|
||||
AdbSyncConfig config = new AdbSyncConfig(props);
|
||||
assertEquals(Arrays.asList("a", "b"), config.getSplitStrings(META_SYNC_PARTITION_FIELDS));
|
||||
assertEquals("/tmp", config.getString(META_SYNC_BASE_PATH));
|
||||
assertEquals(true, config.getBoolean(META_SYNC_ASSUME_DATE_PARTITION));
|
||||
assertEquals("test", config.getString(META_SYNC_DATABASE_NAME));
|
||||
assertEquals("test", config.getString(META_SYNC_TABLE_NAME));
|
||||
assertEquals("adb", config.getString(ADB_SYNC_USER));
|
||||
assertEquals("adb", config.getString(ADB_SYNC_PASS));
|
||||
assertEquals("jdbc:mysql://localhost:3306", config.getString(ADB_SYNC_JDBC_URL));
|
||||
assertEquals(false, config.getBoolean(ADB_SYNC_SKIP_RO_SUFFIX));
|
||||
Map<String, String> tablePropsMap = new HashMap<>();
|
||||
tablePropsMap.put("spark.sql.sources.provider", "hudi");
|
||||
tablePropsMap.put("spark.sql.sources.schema.numParts", "1");
|
||||
tablePropsMap.put("spark.sql.sources.schema.part.0", "xx");
|
||||
tablePropsMap.put("spark.sql.sources.schema.numPartCols", "1");
|
||||
tablePropsMap.put("spark.sql.sources.schema.partCol.0", "dt");
|
||||
assertEquals(tablePropsMap, ConfigUtils.toMap(config.getString(ADB_SYNC_TABLE_PROPERTIES)));
|
||||
Map<String, String> serdePropsMap = new HashMap<>();
|
||||
serdePropsMap.put("path", "/tmp/test_db/tbl");
|
||||
assertEquals(serdePropsMap, ConfigUtils.toMap(config.getString(ADB_SYNC_SERDE_PROPERTIES)));
|
||||
assertEquals("file://tmp/test_db", config.getString(ADB_SYNC_DB_LOCATION));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -21,9 +21,8 @@ package org.apache.hudi.sync.datahub;
|
||||
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.table.TableSchemaResolver;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.sync.common.AbstractSyncHoodieClient;
|
||||
import org.apache.hudi.sync.common.HoodieSyncClient;
|
||||
import org.apache.hudi.sync.common.HoodieSyncException;
|
||||
import org.apache.hudi.sync.datahub.config.DataHubSyncConfig;
|
||||
|
||||
@@ -51,8 +50,6 @@ import datahub.client.rest.RestEmitter;
|
||||
import datahub.event.MetadataChangeProposalWrapper;
|
||||
import org.apache.avro.AvroTypeException;
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.parquet.schema.MessageType;
|
||||
|
||||
import java.util.Collections;
|
||||
@@ -60,40 +57,15 @@ import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class DataHubSyncClient extends AbstractSyncHoodieClient {
|
||||
public class DataHubSyncClient extends HoodieSyncClient {
|
||||
|
||||
private final HoodieTimeline activeTimeline;
|
||||
private final DataHubSyncConfig syncConfig;
|
||||
private final Configuration hadoopConf;
|
||||
protected final DataHubSyncConfig config;
|
||||
private final DatasetUrn datasetUrn;
|
||||
|
||||
public DataHubSyncClient(DataHubSyncConfig syncConfig, Configuration hadoopConf, FileSystem fs) {
|
||||
super(syncConfig.basePath, syncConfig.assumeDatePartitioning, syncConfig.useFileListingFromMetadata, false, fs);
|
||||
this.syncConfig = syncConfig;
|
||||
this.hadoopConf = hadoopConf;
|
||||
this.datasetUrn = syncConfig.datasetIdentifier.getDatasetUrn();
|
||||
this.activeTimeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void createTable(String tableName,
|
||||
MessageType storageSchema,
|
||||
String inputFormatClass,
|
||||
String outputFormatClass,
|
||||
String serdeClass,
|
||||
Map<String, String> serdeProperties,
|
||||
Map<String, String> tableProperties) {
|
||||
throw new UnsupportedOperationException("Not supported: `createTable`");
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean doesTableExist(String tableName) {
|
||||
return tableExists(tableName);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean tableExists(String tableName) {
|
||||
throw new UnsupportedOperationException("Not supported: `tableExists`");
|
||||
public DataHubSyncClient(DataHubSyncConfig config) {
|
||||
super(config);
|
||||
this.config = config;
|
||||
this.datasetUrn = config.datasetIdentifier.getDatasetUrn();
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -103,37 +75,7 @@ public class DataHubSyncClient extends AbstractSyncHoodieClient {
|
||||
|
||||
@Override
|
||||
public void updateLastCommitTimeSynced(String tableName) {
|
||||
updateTableProperties(tableName, Collections.singletonMap(HOODIE_LAST_COMMIT_TIME_SYNC, activeTimeline.lastInstant().get().getTimestamp()));
|
||||
}
|
||||
|
||||
@Override
|
||||
public Option<String> getLastReplicatedTime(String tableName) {
|
||||
throw new UnsupportedOperationException("Not supported: `getLastReplicatedTime`");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void updateLastReplicatedTimeStamp(String tableName, String timeStamp) {
|
||||
throw new UnsupportedOperationException("Not supported: `updateLastReplicatedTimeStamp`");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void deleteLastReplicatedTimeStamp(String tableName) {
|
||||
throw new UnsupportedOperationException("Not supported: `deleteLastReplicatedTimeStamp`");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void addPartitionsToTable(String tableName, List<String> partitionsToAdd) {
|
||||
throw new UnsupportedOperationException("Not supported: `addPartitionsToTable`");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void updatePartitionsToTable(String tableName, List<String> changedPartitions) {
|
||||
throw new UnsupportedOperationException("Not supported: `updatePartitionsToTable`");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void dropPartitions(String tableName, List<String> partitionsToDrop) {
|
||||
throw new UnsupportedOperationException("Not supported: `dropPartitions`");
|
||||
updateTableProperties(tableName, Collections.singletonMap(HOODIE_LAST_COMMIT_TIME_SYNC, getActiveTimeline().lastInstant().get().getTimestamp()));
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -145,14 +87,15 @@ public class DataHubSyncClient extends AbstractSyncHoodieClient {
|
||||
.aspect(new DatasetProperties().setCustomProperties(new StringMap(tableProperties)))
|
||||
.build();
|
||||
|
||||
try (RestEmitter emitter = syncConfig.getRestEmitter()) {
|
||||
try (RestEmitter emitter = config.getRestEmitter()) {
|
||||
emitter.emit(propertiesChangeProposal, null).get();
|
||||
} catch (Exception e) {
|
||||
throw new HoodieDataHubSyncException("Fail to change properties for Dataset " + datasetUrn + ": " + tableProperties, e);
|
||||
}
|
||||
}
|
||||
|
||||
public void updateTableDefinition(String tableName) {
|
||||
@Override
|
||||
public void updateTableSchema(String tableName, MessageType schema) {
|
||||
Schema avroSchema = getAvroSchemaWithoutMetadataFields(metaClient);
|
||||
List<SchemaField> fields = avroSchema.getFields().stream().map(f -> new SchemaField()
|
||||
.setFieldPath(f.name())
|
||||
@@ -175,7 +118,7 @@ public class DataHubSyncClient extends AbstractSyncHoodieClient {
|
||||
.setFields(new SchemaFieldArray(fields)))
|
||||
.build();
|
||||
|
||||
try (RestEmitter emitter = syncConfig.getRestEmitter()) {
|
||||
try (RestEmitter emitter = config.getRestEmitter()) {
|
||||
emitter.emit(schemaChangeProposal, null).get();
|
||||
} catch (Exception e) {
|
||||
throw new HoodieDataHubSyncException("Fail to change schema for Dataset " + datasetUrn, e);
|
||||
@@ -183,7 +126,7 @@ public class DataHubSyncClient extends AbstractSyncHoodieClient {
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, String> getTableSchema(String tableName) {
|
||||
public Map<String, String> getMetastoreSchema(String tableName) {
|
||||
throw new UnsupportedOperationException("Not supported: `getTableSchema`");
|
||||
}
|
||||
|
||||
|
||||
@@ -19,14 +19,14 @@
|
||||
|
||||
package org.apache.hudi.sync.datahub;
|
||||
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.sync.common.AbstractSyncTool;
|
||||
import org.apache.hudi.sync.common.HoodieSyncTool;
|
||||
import org.apache.hudi.sync.datahub.config.DataHubSyncConfig;
|
||||
|
||||
import com.beust.jcommander.JCommander;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
|
||||
import java.util.Properties;
|
||||
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME;
|
||||
|
||||
/**
|
||||
* To sync with DataHub via REST APIs.
|
||||
@@ -34,17 +34,13 @@ import org.apache.hadoop.fs.FileSystem;
|
||||
* @Experimental
|
||||
* @see <a href="https://datahubproject.io/">https://datahubproject.io/</a>
|
||||
*/
|
||||
public class DataHubSyncTool extends AbstractSyncTool {
|
||||
public class DataHubSyncTool extends HoodieSyncTool {
|
||||
|
||||
private final DataHubSyncConfig config;
|
||||
protected final DataHubSyncConfig config;
|
||||
|
||||
public DataHubSyncTool(TypedProperties props, Configuration conf, FileSystem fs) {
|
||||
this(new DataHubSyncConfig(props), conf, fs);
|
||||
}
|
||||
|
||||
public DataHubSyncTool(DataHubSyncConfig config, Configuration conf, FileSystem fs) {
|
||||
super(config.getProps(), conf, fs);
|
||||
this.config = config;
|
||||
public DataHubSyncTool(Properties props) {
|
||||
super(props);
|
||||
this.config = new DataHubSyncConfig(props);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -55,20 +51,20 @@ public class DataHubSyncTool extends AbstractSyncTool {
|
||||
*/
|
||||
@Override
|
||||
public void syncHoodieTable() {
|
||||
try (DataHubSyncClient syncClient = new DataHubSyncClient(config, conf, fs)) {
|
||||
syncClient.updateTableDefinition(config.tableName);
|
||||
syncClient.updateLastCommitTimeSynced(config.tableName);
|
||||
try (DataHubSyncClient syncClient = new DataHubSyncClient(config)) {
|
||||
syncClient.updateTableSchema(config.getString(META_SYNC_TABLE_NAME), null);
|
||||
syncClient.updateLastCommitTimeSynced(config.getString(META_SYNC_TABLE_NAME));
|
||||
}
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
final DataHubSyncConfig cfg = new DataHubSyncConfig();
|
||||
JCommander cmd = new JCommander(cfg, null, args);
|
||||
if (cfg.help || args.length == 0) {
|
||||
final DataHubSyncConfig.DataHubSyncConfigParams params = new DataHubSyncConfig.DataHubSyncConfigParams();
|
||||
JCommander cmd = JCommander.newBuilder().addObject(params).build();
|
||||
cmd.parse(args);
|
||||
if (params.isHelp()) {
|
||||
cmd.usage();
|
||||
System.exit(1);
|
||||
System.exit(0);
|
||||
}
|
||||
FileSystem fs = FSUtils.getFs(cfg.basePath, new Configuration());
|
||||
new DataHubSyncTool(cfg, fs.getConf(), fs).syncHoodieTable();
|
||||
new DataHubSyncTool(params.toProps()).syncHoodieTable();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -25,8 +25,11 @@ import org.apache.hudi.common.util.ReflectionUtils;
|
||||
import org.apache.hudi.sync.common.HoodieSyncConfig;
|
||||
|
||||
import com.beust.jcommander.Parameter;
|
||||
import com.beust.jcommander.ParametersDelegate;
|
||||
import datahub.client.rest.RestEmitter;
|
||||
|
||||
import java.util.Properties;
|
||||
|
||||
public class DataHubSyncConfig extends HoodieSyncConfig {
|
||||
|
||||
public static final ConfigProperty<String> META_SYNC_DATAHUB_DATASET_IDENTIFIER_CLASS = ConfigProperty
|
||||
@@ -49,45 +52,52 @@ public class DataHubSyncConfig extends HoodieSyncConfig {
|
||||
.noDefaultValue()
|
||||
.withDocumentation("Pluggable class to supply a DataHub REST emitter to connect to the DataHub instance. This overwrites other emitter configs.");
|
||||
|
||||
@Parameter(names = {"--identifier-class"}, description = "Pluggable class to help provide info to identify a DataHub Dataset.")
|
||||
public String identifierClass;
|
||||
|
||||
@Parameter(names = {"--emitter-server"}, description = "Server URL of the DataHub instance.")
|
||||
public String emitterServer;
|
||||
|
||||
@Parameter(names = {"--emitter-token"}, description = "Auth token to connect to the DataHub instance.")
|
||||
public String emitterToken;
|
||||
|
||||
@Parameter(names = {"--emitter-supplier-class"}, description = "Pluggable class to supply a DataHub REST emitter to connect to the DataHub instance. This overwrites other emitter configs.")
|
||||
public String emitterSupplierClass;
|
||||
|
||||
@Parameter(names = {"--help", "-h"}, help = true)
|
||||
public Boolean help = false;
|
||||
|
||||
public final HoodieDataHubDatasetIdentifier datasetIdentifier;
|
||||
|
||||
public DataHubSyncConfig() {
|
||||
this(new TypedProperties());
|
||||
}
|
||||
|
||||
public DataHubSyncConfig(TypedProperties props) {
|
||||
public DataHubSyncConfig(Properties props) {
|
||||
super(props);
|
||||
identifierClass = getStringOrDefault(META_SYNC_DATAHUB_DATASET_IDENTIFIER_CLASS);
|
||||
emitterServer = getStringOrDefault(META_SYNC_DATAHUB_EMITTER_SERVER, null);
|
||||
emitterToken = getStringOrDefault(META_SYNC_DATAHUB_EMITTER_TOKEN, null);
|
||||
emitterSupplierClass = getStringOrDefault(META_SYNC_DATAHUB_EMITTER_SUPPLIER_CLASS, null);
|
||||
|
||||
datasetIdentifier = (HoodieDataHubDatasetIdentifier) ReflectionUtils
|
||||
.loadClass(identifierClass, new Class<?>[] {TypedProperties.class}, props);
|
||||
String identifierClass = getStringOrDefault(META_SYNC_DATAHUB_DATASET_IDENTIFIER_CLASS);
|
||||
datasetIdentifier = (HoodieDataHubDatasetIdentifier) ReflectionUtils.loadClass(identifierClass, new Class<?>[] {Properties.class}, props);
|
||||
}
|
||||
|
||||
public RestEmitter getRestEmitter() {
|
||||
if (emitterSupplierClass != null) {
|
||||
return ((DataHubEmitterSupplier) ReflectionUtils.loadClass(emitterSupplierClass)).get();
|
||||
} else if (emitterServer != null) {
|
||||
return RestEmitter.create(b -> b.server(emitterServer).token(emitterToken));
|
||||
if (contains(META_SYNC_DATAHUB_EMITTER_SUPPLIER_CLASS)) {
|
||||
return ((DataHubEmitterSupplier) ReflectionUtils.loadClass(getString(META_SYNC_DATAHUB_EMITTER_SUPPLIER_CLASS))).get();
|
||||
} else if (contains(META_SYNC_DATAHUB_EMITTER_SERVER)) {
|
||||
return RestEmitter.create(b -> b.server(getString(META_SYNC_DATAHUB_EMITTER_SERVER)).token(getStringOrDefault(META_SYNC_DATAHUB_EMITTER_TOKEN, null)));
|
||||
} else {
|
||||
return RestEmitter.createWithDefaults();
|
||||
}
|
||||
}
|
||||
|
||||
public static class DataHubSyncConfigParams {
|
||||
|
||||
@ParametersDelegate()
|
||||
public final HoodieSyncConfigParams hoodieSyncConfigParams = new HoodieSyncConfigParams();
|
||||
|
||||
@Parameter(names = {"--identifier-class"}, description = "Pluggable class to help provide info to identify a DataHub Dataset.")
|
||||
public String identifierClass;
|
||||
|
||||
@Parameter(names = {"--emitter-server"}, description = "Server URL of the DataHub instance.")
|
||||
public String emitterServer;
|
||||
|
||||
@Parameter(names = {"--emitter-token"}, description = "Auth token to connect to the DataHub instance.")
|
||||
public String emitterToken;
|
||||
|
||||
@Parameter(names = {"--emitter-supplier-class"}, description = "Pluggable class to supply a DataHub REST emitter to connect to the DataHub instance. This overwrites other emitter configs.")
|
||||
public String emitterSupplierClass;
|
||||
|
||||
public boolean isHelp() {
|
||||
return hoodieSyncConfigParams.isHelp();
|
||||
}
|
||||
|
||||
public Properties toProps() {
|
||||
final TypedProperties props = hoodieSyncConfigParams.toProps();
|
||||
props.setPropertyIfNonNull(META_SYNC_DATAHUB_DATASET_IDENTIFIER_CLASS.key(), identifierClass);
|
||||
props.setPropertyIfNonNull(META_SYNC_DATAHUB_EMITTER_SERVER.key(), emitterServer);
|
||||
props.setPropertyIfNonNull(META_SYNC_DATAHUB_EMITTER_TOKEN.key(), emitterToken);
|
||||
props.setPropertyIfNonNull(META_SYNC_DATAHUB_EMITTER_SUPPLIER_CLASS.key(), emitterSupplierClass);
|
||||
return props;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,12 +19,15 @@
|
||||
|
||||
package org.apache.hudi.sync.datahub.config;
|
||||
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
|
||||
import com.linkedin.common.FabricType;
|
||||
import com.linkedin.common.urn.DataPlatformUrn;
|
||||
import com.linkedin.common.urn.DatasetUrn;
|
||||
|
||||
import java.util.Properties;
|
||||
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME;
|
||||
|
||||
/**
|
||||
* Construct and provide the default {@link DatasetUrn} to identify the Dataset on DataHub.
|
||||
* <p>
|
||||
@@ -34,15 +37,15 @@ public class HoodieDataHubDatasetIdentifier {
|
||||
|
||||
public static final String DEFAULT_HOODIE_DATAHUB_PLATFORM_NAME = "hudi";
|
||||
|
||||
protected final TypedProperties props;
|
||||
protected final Properties props;
|
||||
|
||||
public HoodieDataHubDatasetIdentifier(TypedProperties props) {
|
||||
public HoodieDataHubDatasetIdentifier(Properties props) {
|
||||
this.props = props;
|
||||
}
|
||||
|
||||
public DatasetUrn getDatasetUrn() {
|
||||
DataPlatformUrn dataPlatformUrn = new DataPlatformUrn(DEFAULT_HOODIE_DATAHUB_PLATFORM_NAME);
|
||||
DataHubSyncConfig config = new DataHubSyncConfig(props);
|
||||
return new DatasetUrn(dataPlatformUrn, String.format("%s.%s", config.databaseName, config.tableName), FabricType.DEV);
|
||||
return new DatasetUrn(dataPlatformUrn, String.format("%s.%s", config.getString(META_SYNC_DATABASE_NAME), config.getString(META_SYNC_TABLE_NAME)), FabricType.DEV);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,22 +19,32 @@
|
||||
|
||||
package org.apache.hudi.sync.datahub.config;
|
||||
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
|
||||
import com.linkedin.common.FabricType;
|
||||
import com.linkedin.common.urn.DatasetUrn;
|
||||
import datahub.client.rest.RestEmitter;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.net.URISyntaxException;
|
||||
import java.util.Properties;
|
||||
|
||||
import static org.apache.hudi.sync.datahub.config.DataHubSyncConfig.META_SYNC_DATAHUB_DATASET_IDENTIFIER_CLASS;
|
||||
import static org.apache.hudi.sync.datahub.config.DataHubSyncConfig.META_SYNC_DATAHUB_EMITTER_SUPPLIER_CLASS;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
|
||||
class TestDataHubSyncConfig {
|
||||
|
||||
@Test
|
||||
void testGetEmitterFromSupplier() {
|
||||
Properties props = new Properties();
|
||||
props.setProperty(META_SYNC_DATAHUB_EMITTER_SUPPLIER_CLASS.key(), DummySupplier.class.getName());
|
||||
DataHubSyncConfig syncConfig = new DataHubSyncConfig(props);
|
||||
assertNotNull(syncConfig.getRestEmitter());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testInstantiationWithProps() {
|
||||
TypedProperties props = new TypedProperties();
|
||||
Properties props = new Properties();
|
||||
props.setProperty(META_SYNC_DATAHUB_DATASET_IDENTIFIER_CLASS.key(), DummyIdentifier.class.getName());
|
||||
DataHubSyncConfig syncConfig = new DataHubSyncConfig(props);
|
||||
DatasetUrn datasetUrn = syncConfig.datasetIdentifier.getDatasetUrn();
|
||||
@@ -43,9 +53,17 @@ class TestDataHubSyncConfig {
|
||||
assertEquals(FabricType.PROD, datasetUrn.getOriginEntity());
|
||||
}
|
||||
|
||||
public static class DummySupplier implements DataHubEmitterSupplier {
|
||||
|
||||
@Override
|
||||
public RestEmitter get() {
|
||||
return RestEmitter.createWithDefaults();
|
||||
}
|
||||
}
|
||||
|
||||
public static class DummyIdentifier extends HoodieDataHubDatasetIdentifier {
|
||||
|
||||
public DummyIdentifier(TypedProperties props) {
|
||||
public DummyIdentifier(Properties props) {
|
||||
super(props);
|
||||
}
|
||||
|
||||
|
||||
@@ -1,142 +0,0 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.hive;
|
||||
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.table.TableSchemaResolver;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.util.ReflectionUtils;
|
||||
import org.apache.hudi.sync.common.AbstractSyncHoodieClient;
|
||||
import org.apache.hudi.sync.common.HoodieSyncException;
|
||||
import org.apache.hudi.sync.common.model.Partition;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.hive.metastore.api.FieldSchema;
|
||||
import org.apache.parquet.schema.MessageType;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Base class to sync Hudi tables with Hive based metastores, such as Hive server, HMS or managed Hive services.
|
||||
*/
|
||||
public abstract class AbstractHiveSyncHoodieClient extends AbstractSyncHoodieClient {
|
||||
|
||||
protected final HoodieTimeline activeTimeline;
|
||||
protected final HiveSyncConfig syncConfig;
|
||||
protected final Configuration hadoopConf;
|
||||
protected final PartitionValueExtractor partitionValueExtractor;
|
||||
|
||||
public AbstractHiveSyncHoodieClient(HiveSyncConfig syncConfig, Configuration hadoopConf, FileSystem fs) {
|
||||
super(syncConfig.basePath, syncConfig.assumeDatePartitioning, syncConfig.useFileListingFromMetadata, syncConfig.withOperationField, fs);
|
||||
this.syncConfig = syncConfig;
|
||||
this.hadoopConf = hadoopConf;
|
||||
this.partitionValueExtractor = ReflectionUtils.loadClass(syncConfig.partitionValueExtractorClass);
|
||||
this.activeTimeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
|
||||
}
|
||||
|
||||
public HoodieTimeline getActiveTimeline() {
|
||||
return activeTimeline;
|
||||
}
|
||||
|
||||
/**
|
||||
* Iterate over the storage partitions and find if there are any new partitions that need to be added or updated.
|
||||
* Generate a list of PartitionEvent based on the changes required.
|
||||
*/
|
||||
protected List<PartitionEvent> getPartitionEvents(List<Partition> tablePartitions, List<String> partitionStoragePartitions, boolean isDropPartition) {
|
||||
Map<String, String> paths = new HashMap<>();
|
||||
for (Partition tablePartition : tablePartitions) {
|
||||
List<String> hivePartitionValues = tablePartition.getValues();
|
||||
String fullTablePartitionPath =
|
||||
Path.getPathWithoutSchemeAndAuthority(new Path(tablePartition.getStorageLocation())).toUri().getPath();
|
||||
paths.put(String.join(", ", hivePartitionValues), fullTablePartitionPath);
|
||||
}
|
||||
|
||||
List<PartitionEvent> events = new ArrayList<>();
|
||||
for (String storagePartition : partitionStoragePartitions) {
|
||||
Path storagePartitionPath = FSUtils.getPartitionPath(syncConfig.basePath, storagePartition);
|
||||
String fullStoragePartitionPath = Path.getPathWithoutSchemeAndAuthority(storagePartitionPath).toUri().getPath();
|
||||
// Check if the partition values or if hdfs path is the same
|
||||
List<String> storagePartitionValues = partitionValueExtractor.extractPartitionValuesInPath(storagePartition);
|
||||
|
||||
if (isDropPartition) {
|
||||
events.add(PartitionEvent.newPartitionDropEvent(storagePartition));
|
||||
} else {
|
||||
if (!storagePartitionValues.isEmpty()) {
|
||||
String storageValue = String.join(", ", storagePartitionValues);
|
||||
if (!paths.containsKey(storageValue)) {
|
||||
events.add(PartitionEvent.newPartitionAddEvent(storagePartition));
|
||||
} else if (!paths.get(storageValue).equals(fullStoragePartitionPath)) {
|
||||
events.add(PartitionEvent.newPartitionUpdateEvent(storagePartition));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return events;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get all partitions for the table in the metastore.
|
||||
*/
|
||||
public abstract List<Partition> getAllPartitions(String tableName);
|
||||
|
||||
/**
|
||||
* Check if a database already exists in the metastore.
|
||||
*/
|
||||
public abstract boolean databaseExists(String databaseName);
|
||||
|
||||
/**
|
||||
* Create a database in the metastore.
|
||||
*/
|
||||
public abstract void createDatabase(String databaseName);
|
||||
|
||||
/**
|
||||
* Update schema for the table in the metastore.
|
||||
*/
|
||||
public abstract void updateTableDefinition(String tableName, MessageType newSchema);
|
||||
|
||||
/*
|
||||
* APIs below need to be re-worked by modeling field comment in hudi-sync-common,
|
||||
* instead of relying on Avro or Hive schema class.
|
||||
*/
|
||||
|
||||
public Schema getAvroSchemaWithoutMetadataFields() {
|
||||
try {
|
||||
return new TableSchemaResolver(metaClient).getTableAvroSchemaWithoutMetadataFields();
|
||||
} catch (Exception e) {
|
||||
throw new HoodieSyncException("Failed to read avro schema", e);
|
||||
}
|
||||
}
|
||||
|
||||
public abstract List<FieldSchema> getTableCommentUsingMetastoreClient(String tableName);
|
||||
|
||||
public abstract void updateTableComments(String tableName, List<FieldSchema> oldSchema, List<Schema.Field> newSchema);
|
||||
|
||||
public abstract void updateTableComments(String tableName, List<FieldSchema> oldSchema, Map<String, String> newComments);
|
||||
|
||||
/*
|
||||
* APIs above need to be re-worked by modeling field comment in hudi-sync-common,
|
||||
* instead of relying on Avro or Hive schema class.
|
||||
*/
|
||||
}
|
||||
@@ -18,6 +18,8 @@
|
||||
|
||||
package org.apache.hudi.hive;
|
||||
|
||||
import org.apache.hudi.sync.common.model.PartitionValueExtractor;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
@@ -18,269 +18,147 @@
|
||||
|
||||
package org.apache.hudi.hive;
|
||||
|
||||
import org.apache.hudi.common.config.ConfigProperty;
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.sync.common.HoodieSyncConfig;
|
||||
|
||||
import com.beust.jcommander.Parameter;
|
||||
import com.beust.jcommander.ParametersDelegate;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.hive.conf.HiveConf;
|
||||
|
||||
import java.util.Properties;
|
||||
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_AUTO_CREATE_DATABASE;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_BATCH_SYNC_PARTITION_NUM;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_CREATE_MANAGED_TABLE;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_IGNORE_EXCEPTIONS;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_PASS;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_AS_DATA_SOURCE_TABLE;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_BUCKET_SYNC;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_BUCKET_SYNC_SPEC;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_COMMENT;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_MODE;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_TABLE_PROPERTIES;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_TABLE_SERDE_PROPERTIES;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_URL;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USER;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USE_JDBC;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USE_PRE_APACHE_INPUT_FORMAT;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.METASTORE_URIS;
|
||||
|
||||
/**
|
||||
* Configs needed to sync data into the Hive Metastore.
|
||||
*/
|
||||
public class HiveSyncConfig extends HoodieSyncConfig {
|
||||
|
||||
@Parameter(names = {"--user"}, description = "Hive username")
|
||||
public String hiveUser;
|
||||
|
||||
@Parameter(names = {"--pass"}, description = "Hive password")
|
||||
public String hivePass;
|
||||
|
||||
@Parameter(names = {"--jdbc-url"}, description = "Hive jdbc connect url")
|
||||
public String jdbcUrl;
|
||||
|
||||
@Parameter(names = {"--metastore-uris"}, description = "Hive metastore uris")
|
||||
public String metastoreUris;
|
||||
|
||||
@Parameter(names = {"--use-pre-apache-input-format"},
|
||||
description = "Use InputFormat under com.uber.hoodie package "
|
||||
+ "instead of org.apache.hudi package. Use this when you are in the process of migrating from "
|
||||
+ "com.uber.hoodie to org.apache.hudi. Stop using this after you migrated the table definition to "
|
||||
+ "org.apache.hudi input format.")
|
||||
public Boolean usePreApacheInputFormat;
|
||||
|
||||
@Parameter(names = {"--bucket-spec"}, description = "bucket spec stored in metastore", required = false)
|
||||
public String bucketSpec;
|
||||
|
||||
@Deprecated
|
||||
@Parameter(names = {"--use-jdbc"}, description = "Hive jdbc connect url")
|
||||
public Boolean useJdbc;
|
||||
|
||||
@Parameter(names = {"--sync-mode"}, description = "Mode to choose for Hive ops. Valid values are hms,glue,jdbc and hiveql")
|
||||
public String syncMode;
|
||||
|
||||
@Parameter(names = {"--auto-create-database"}, description = "Auto create hive database")
|
||||
public Boolean autoCreateDatabase;
|
||||
|
||||
@Parameter(names = {"--ignore-exceptions"}, description = "Ignore hive exceptions")
|
||||
public Boolean ignoreExceptions;
|
||||
|
||||
@Parameter(names = {"--skip-ro-suffix"}, description = "Skip the `_ro` suffix for Read optimized table, when registering")
|
||||
public Boolean skipROSuffix;
|
||||
|
||||
@Parameter(names = {"--table-properties"}, description = "Table properties to hive table")
|
||||
public String tableProperties;
|
||||
|
||||
@Parameter(names = {"--serde-properties"}, description = "Serde properties to hive table")
|
||||
public String serdeProperties;
|
||||
|
||||
@Parameter(names = {"--help", "-h"}, help = true)
|
||||
public Boolean help = false;
|
||||
|
||||
@Parameter(names = {"--support-timestamp"}, description = "'INT64' with original type TIMESTAMP_MICROS is converted to hive 'timestamp' type."
|
||||
+ "Disabled by default for backward compatibility.")
|
||||
public Boolean supportTimestamp;
|
||||
|
||||
@Parameter(names = {"--managed-table"}, description = "Create a managed table")
|
||||
public Boolean createManagedTable;
|
||||
|
||||
@Parameter(names = {"--batch-sync-num"}, description = "The number of partitions one batch when synchronous partitions to hive")
|
||||
public Integer batchSyncNum;
|
||||
|
||||
@Parameter(names = {"--spark-datasource"}, description = "Whether sync this table as spark data source table.")
|
||||
public Boolean syncAsSparkDataSourceTable;
|
||||
|
||||
@Parameter(names = {"--spark-schema-length-threshold"}, description = "The maximum length allowed in a single cell when storing additional schema information in Hive's metastore.")
|
||||
public int sparkSchemaLengthThreshold;
|
||||
|
||||
@Parameter(names = {"--with-operation-field"}, description = "Whether to include the '_hoodie_operation' field in the metadata fields")
|
||||
public Boolean withOperationField = false;
|
||||
|
||||
@Parameter(names = {"--sync-comment"}, description = "synchronize table comments to hive")
|
||||
public boolean syncComment = false;
|
||||
|
||||
// HIVE SYNC SPECIFIC CONFIGS
|
||||
// NOTE: DO NOT USE uppercase for the keys as they are internally lower-cased. Using upper-cases causes
|
||||
// unexpected issues with config getting reset
|
||||
public static final ConfigProperty<String> HIVE_SYNC_ENABLED = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.enable")
|
||||
.defaultValue("false")
|
||||
.withDocumentation("When set to true, register/sync the table to Apache Hive metastore.");
|
||||
|
||||
public static final ConfigProperty<String> HIVE_USER = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.username")
|
||||
.defaultValue("hive")
|
||||
.withDocumentation("hive user name to use");
|
||||
|
||||
public static final ConfigProperty<String> HIVE_PASS = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.password")
|
||||
.defaultValue("hive")
|
||||
.withDocumentation("hive password to use");
|
||||
|
||||
public static final ConfigProperty<String> HIVE_URL = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.jdbcurl")
|
||||
.defaultValue("jdbc:hive2://localhost:10000")
|
||||
.withDocumentation("Hive metastore url");
|
||||
|
||||
public static final ConfigProperty<String> HIVE_USE_PRE_APACHE_INPUT_FORMAT = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.use_pre_apache_input_format")
|
||||
.defaultValue("false")
|
||||
.withDocumentation("Flag to choose InputFormat under com.uber.hoodie package instead of org.apache.hudi package. "
|
||||
+ "Use this when you are in the process of migrating from "
|
||||
+ "com.uber.hoodie to org.apache.hudi. Stop using this after you migrated the table definition to org.apache.hudi input format");
|
||||
|
||||
/**
|
||||
* @deprecated Use {@link #HIVE_SYNC_MODE} instead of this config from 0.9.0
|
||||
*/
|
||||
@Deprecated
|
||||
public static final ConfigProperty<String> HIVE_USE_JDBC = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.use_jdbc")
|
||||
.defaultValue("true")
|
||||
.deprecatedAfter("0.9.0")
|
||||
.withDocumentation("Use JDBC when hive synchronization is enabled");
|
||||
|
||||
public static final ConfigProperty<String> METASTORE_URIS = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.metastore.uris")
|
||||
.defaultValue("thrift://localhost:9083")
|
||||
.withDocumentation("Hive metastore url");
|
||||
|
||||
public static final ConfigProperty<String> HIVE_AUTO_CREATE_DATABASE = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.auto_create_database")
|
||||
.defaultValue("true")
|
||||
.withDocumentation("Auto create hive database if does not exists");
|
||||
|
||||
public static final ConfigProperty<String> HIVE_IGNORE_EXCEPTIONS = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.ignore_exceptions")
|
||||
.defaultValue("false")
|
||||
.withDocumentation("Ignore exceptions when syncing with Hive.");
|
||||
|
||||
public static final ConfigProperty<String> HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.skip_ro_suffix")
|
||||
.defaultValue("false")
|
||||
.withDocumentation("Skip the _ro suffix for Read optimized table, when registering");
|
||||
|
||||
public static final ConfigProperty<String> HIVE_SUPPORT_TIMESTAMP_TYPE = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.support_timestamp")
|
||||
.defaultValue("false")
|
||||
.withDocumentation("‘INT64’ with original type TIMESTAMP_MICROS is converted to hive ‘timestamp’ type. "
|
||||
+ "Disabled by default for backward compatibility.");
|
||||
|
||||
public static final ConfigProperty<String> HIVE_TABLE_PROPERTIES = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.table_properties")
|
||||
.noDefaultValue()
|
||||
.withDocumentation("Additional properties to store with table.");
|
||||
|
||||
public static final ConfigProperty<String> HIVE_TABLE_SERDE_PROPERTIES = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.serde_properties")
|
||||
.noDefaultValue()
|
||||
.withDocumentation("Serde properties to hive table.");
|
||||
|
||||
public static final ConfigProperty<String> HIVE_SYNC_AS_DATA_SOURCE_TABLE = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.sync_as_datasource")
|
||||
.defaultValue("true")
|
||||
.withDocumentation("");
|
||||
|
||||
public static final ConfigProperty<Integer> HIVE_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.schema_string_length_thresh")
|
||||
.defaultValue(4000)
|
||||
.withDocumentation("");
|
||||
|
||||
// Create table as managed table
|
||||
public static final ConfigProperty<Boolean> HIVE_CREATE_MANAGED_TABLE = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.create_managed_table")
|
||||
.defaultValue(false)
|
||||
.withDocumentation("Whether to sync the table as managed table.");
|
||||
|
||||
public static final ConfigProperty<Integer> HIVE_BATCH_SYNC_PARTITION_NUM = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.batch_num")
|
||||
.defaultValue(1000)
|
||||
.withDocumentation("The number of partitions one batch when synchronous partitions to hive.");
|
||||
|
||||
public static final ConfigProperty<String> HIVE_SYNC_MODE = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.mode")
|
||||
.noDefaultValue()
|
||||
.withDocumentation("Mode to choose for Hive ops. Valid values are hms, jdbc and hiveql.");
|
||||
|
||||
public static final ConfigProperty<Boolean> HIVE_SYNC_BUCKET_SYNC = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.bucket_sync")
|
||||
.defaultValue(false)
|
||||
.withDocumentation("Whether sync hive metastore bucket specification when using bucket index."
|
||||
+ "The specification is 'CLUSTERED BY (trace_id) SORTED BY (trace_id ASC) INTO 65536 BUCKETS'");
|
||||
|
||||
public static final ConfigProperty<String> HIVE_SYNC_BUCKET_SYNC_SPEC = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.bucket_sync_spec")
|
||||
.defaultValue("")
|
||||
.withDocumentation("The hive metastore bucket specification when using bucket index."
|
||||
+ "The specification is 'CLUSTERED BY (trace_id) SORTED BY (trace_id ASC) INTO 65536 BUCKETS'");
|
||||
|
||||
public static final ConfigProperty<String> HIVE_SYNC_COMMENT = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.sync_comment")
|
||||
.defaultValue("false")
|
||||
.withDocumentation("Whether to sync the table column comments while syncing the table.");
|
||||
|
||||
public HiveSyncConfig() {
|
||||
this(new TypedProperties());
|
||||
}
|
||||
|
||||
public HiveSyncConfig(TypedProperties props) {
|
||||
super(props);
|
||||
this.hiveUser = getStringOrDefault(HIVE_USER);
|
||||
this.hivePass = getStringOrDefault(HIVE_PASS);
|
||||
this.jdbcUrl = getStringOrDefault(HIVE_URL);
|
||||
this.usePreApacheInputFormat = getBooleanOrDefault(HIVE_USE_PRE_APACHE_INPUT_FORMAT);
|
||||
this.useJdbc = getBooleanOrDefault(HIVE_USE_JDBC);
|
||||
this.metastoreUris = getStringOrDefault(METASTORE_URIS);
|
||||
this.syncMode = getString(HIVE_SYNC_MODE);
|
||||
this.autoCreateDatabase = getBooleanOrDefault(HIVE_AUTO_CREATE_DATABASE);
|
||||
this.ignoreExceptions = getBooleanOrDefault(HIVE_IGNORE_EXCEPTIONS);
|
||||
this.skipROSuffix = getBooleanOrDefault(HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE);
|
||||
this.tableProperties = getString(HIVE_TABLE_PROPERTIES);
|
||||
this.serdeProperties = getString(HIVE_TABLE_SERDE_PROPERTIES);
|
||||
this.supportTimestamp = getBooleanOrDefault(HIVE_SUPPORT_TIMESTAMP_TYPE);
|
||||
this.batchSyncNum = getIntOrDefault(HIVE_BATCH_SYNC_PARTITION_NUM);
|
||||
this.syncAsSparkDataSourceTable = getBooleanOrDefault(HIVE_SYNC_AS_DATA_SOURCE_TABLE);
|
||||
this.sparkSchemaLengthThreshold = getIntOrDefault(HIVE_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD);
|
||||
this.createManagedTable = getBooleanOrDefault(HIVE_CREATE_MANAGED_TABLE);
|
||||
this.bucketSpec = getStringOrDefault(HIVE_SYNC_BUCKET_SYNC_SPEC);
|
||||
this.syncComment = getBooleanOrDefault(HIVE_SYNC_COMMENT);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "HiveSyncConfig{"
|
||||
+ "databaseName='" + databaseName + '\''
|
||||
+ ", tableName='" + tableName + '\''
|
||||
+ ", bucketSpec='" + bucketSpec + '\''
|
||||
+ ", baseFileFormat='" + baseFileFormat + '\''
|
||||
+ ", hiveUser='" + hiveUser + '\''
|
||||
+ ", hivePass='" + hivePass + '\''
|
||||
+ ", jdbcUrl='" + jdbcUrl + '\''
|
||||
+ ", metastoreUris='" + metastoreUris + '\''
|
||||
+ ", basePath='" + basePath + '\''
|
||||
+ ", partitionFields=" + partitionFields
|
||||
+ ", partitionValueExtractorClass='" + partitionValueExtractorClass + '\''
|
||||
+ ", assumeDatePartitioning=" + assumeDatePartitioning
|
||||
+ ", usePreApacheInputFormat=" + usePreApacheInputFormat
|
||||
+ ", useJdbc=" + useJdbc
|
||||
+ ", autoCreateDatabase=" + autoCreateDatabase
|
||||
+ ", ignoreExceptions=" + ignoreExceptions
|
||||
+ ", skipROSuffix=" + skipROSuffix
|
||||
+ ", useFileListingFromMetadata=" + useFileListingFromMetadata
|
||||
+ ", tableProperties='" + tableProperties + '\''
|
||||
+ ", serdeProperties='" + serdeProperties + '\''
|
||||
+ ", help=" + help
|
||||
+ ", supportTimestamp=" + supportTimestamp
|
||||
+ ", decodePartition=" + decodePartition
|
||||
+ ", createManagedTable=" + createManagedTable
|
||||
+ ", syncAsSparkDataSourceTable=" + syncAsSparkDataSourceTable
|
||||
+ ", sparkSchemaLengthThreshold=" + sparkSchemaLengthThreshold
|
||||
+ ", withOperationField=" + withOperationField
|
||||
+ ", isConditionalSync=" + isConditionalSync
|
||||
+ ", sparkVersion=" + sparkVersion
|
||||
+ ", syncComment=" + syncComment
|
||||
+ '}';
|
||||
}
|
||||
|
||||
public static String getBucketSpec(String bucketCols, int bucketNum) {
|
||||
return "CLUSTERED BY (" + bucketCols + " INTO " + bucketNum + " BUCKETS";
|
||||
}
|
||||
|
||||
public HiveSyncConfig(Properties props) {
|
||||
super(props);
|
||||
}
|
||||
|
||||
public HiveSyncConfig(Properties props, Configuration hadoopConf) {
|
||||
super(props, hadoopConf);
|
||||
HiveConf hiveConf = new HiveConf(hadoopConf, HiveConf.class);
|
||||
// HiveConf needs to load fs conf to allow instantiation via AWSGlueClientFactory
|
||||
hiveConf.addResource(getHadoopFileSystem().getConf());
|
||||
setHadoopConf(hiveConf);
|
||||
}
|
||||
|
||||
public HiveConf getHiveConf() {
|
||||
return (HiveConf) getHadoopConf();
|
||||
}
|
||||
|
||||
public boolean useBucketSync() {
|
||||
return getBooleanOrDefault(HIVE_SYNC_BUCKET_SYNC);
|
||||
}
|
||||
|
||||
public static class HiveSyncConfigParams {
|
||||
|
||||
@ParametersDelegate()
|
||||
public final HoodieSyncConfigParams hoodieSyncConfigParams = new HoodieSyncConfigParams();
|
||||
|
||||
@Parameter(names = {"--user"}, description = "Hive username")
|
||||
public String hiveUser;
|
||||
@Parameter(names = {"--pass"}, description = "Hive password")
|
||||
public String hivePass;
|
||||
@Parameter(names = {"--jdbc-url"}, description = "Hive jdbc connect url")
|
||||
public String jdbcUrl;
|
||||
@Parameter(names = {"--use-pre-apache-input-format"},
|
||||
description = "Use InputFormat under com.uber.hoodie package "
|
||||
+ "instead of org.apache.hudi package. Use this when you are in the process of migrating from "
|
||||
+ "com.uber.hoodie to org.apache.hudi. Stop using this after you migrated the table definition to "
|
||||
+ "org.apache.hudi input format.")
|
||||
public Boolean usePreApacheInputFormat;
|
||||
@Deprecated
|
||||
@Parameter(names = {"--use-jdbc"}, description = "Hive jdbc connect url")
|
||||
public Boolean useJdbc;
|
||||
@Parameter(names = {"--metastore-uris"}, description = "Hive metastore uris")
|
||||
public String metastoreUris;
|
||||
@Parameter(names = {"--sync-mode"}, description = "Mode to choose for Hive ops. Valid values are hms,glue,jdbc and hiveql")
|
||||
public String syncMode;
|
||||
@Parameter(names = {"--auto-create-database"}, description = "Auto create hive database")
|
||||
public Boolean autoCreateDatabase;
|
||||
@Parameter(names = {"--ignore-exceptions"}, description = "Ignore hive exceptions")
|
||||
public Boolean ignoreExceptions;
|
||||
@Parameter(names = {"--skip-ro-suffix"}, description = "Skip the `_ro` suffix for Read optimized table, when registering")
|
||||
public Boolean skipROSuffix;
|
||||
@Parameter(names = {"--table-properties"}, description = "Table properties to hive table")
|
||||
public String tableProperties;
|
||||
@Parameter(names = {"--serde-properties"}, description = "Serde properties to hive table")
|
||||
public String serdeProperties;
|
||||
@Parameter(names = {"--support-timestamp"}, description = "'INT64' with original type TIMESTAMP_MICROS is converted to hive 'timestamp' type."
|
||||
+ "Disabled by default for backward compatibility.")
|
||||
public Boolean supportTimestamp;
|
||||
@Parameter(names = {"--managed-table"}, description = "Create a managed table")
|
||||
public Boolean createManagedTable;
|
||||
@Parameter(names = {"--batch-sync-num"}, description = "The number of partitions one batch when synchronous partitions to hive")
|
||||
public Integer batchSyncNum;
|
||||
@Parameter(names = {"--spark-datasource"}, description = "Whether sync this table as spark data source table.")
|
||||
public Boolean syncAsSparkDataSourceTable;
|
||||
@Parameter(names = {"--spark-schema-length-threshold"}, description = "The maximum length allowed in a single cell when storing additional schema information in Hive's metastore.")
|
||||
public Integer sparkSchemaLengthThreshold;
|
||||
@Parameter(names = {"--bucket-sync"}, description = "use bucket sync")
|
||||
public Boolean bucketSync;
|
||||
@Parameter(names = {"--bucket-spec"}, description = "bucket spec stored in metastore")
|
||||
public String bucketSpec;
|
||||
@Parameter(names = {"--sync-comment"}, description = "synchronize table comments to hive")
|
||||
public Boolean syncComment;
|
||||
@Parameter(names = {"--with-operation-field"}, description = "Whether to include the '_hoodie_operation' field in the metadata fields")
|
||||
public Boolean withOperationField; // TODO remove this as it's not used
|
||||
|
||||
public boolean isHelp() {
|
||||
return hoodieSyncConfigParams.isHelp();
|
||||
}
|
||||
|
||||
public TypedProperties toProps() {
|
||||
final TypedProperties props = hoodieSyncConfigParams.toProps();
|
||||
props.setPropertyIfNonNull(HIVE_USER.key(), hiveUser);
|
||||
props.setPropertyIfNonNull(HIVE_PASS.key(), hivePass);
|
||||
props.setPropertyIfNonNull(HIVE_URL.key(), jdbcUrl);
|
||||
props.setPropertyIfNonNull(HIVE_USE_PRE_APACHE_INPUT_FORMAT.key(), usePreApacheInputFormat);
|
||||
props.setPropertyIfNonNull(HIVE_USE_JDBC.key(), useJdbc);
|
||||
props.setPropertyIfNonNull(HIVE_SYNC_MODE.key(), syncMode);
|
||||
props.setPropertyIfNonNull(METASTORE_URIS.key(), metastoreUris);
|
||||
props.setPropertyIfNonNull(HIVE_AUTO_CREATE_DATABASE.key(), autoCreateDatabase);
|
||||
props.setPropertyIfNonNull(HIVE_IGNORE_EXCEPTIONS.key(), ignoreExceptions);
|
||||
props.setPropertyIfNonNull(HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE.key(), skipROSuffix);
|
||||
props.setPropertyIfNonNull(HIVE_SUPPORT_TIMESTAMP_TYPE.key(), supportTimestamp);
|
||||
props.setPropertyIfNonNull(HIVE_TABLE_PROPERTIES.key(), tableProperties);
|
||||
props.setPropertyIfNonNull(HIVE_TABLE_SERDE_PROPERTIES.key(), serdeProperties);
|
||||
props.setPropertyIfNonNull(HIVE_SYNC_AS_DATA_SOURCE_TABLE.key(), syncAsSparkDataSourceTable);
|
||||
props.setPropertyIfNonNull(HIVE_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD.key(), sparkSchemaLengthThreshold);
|
||||
props.setPropertyIfNonNull(HIVE_CREATE_MANAGED_TABLE.key(), createManagedTable);
|
||||
props.setPropertyIfNonNull(HIVE_BATCH_SYNC_PARTITION_NUM.key(), batchSyncNum);
|
||||
props.setPropertyIfNonNull(HIVE_SYNC_BUCKET_SYNC.key(), bucketSync);
|
||||
props.setPropertyIfNonNull(HIVE_SYNC_BUCKET_SYNC_SPEC.key(), bucketSpec);
|
||||
props.setPropertyIfNonNull(HIVE_SYNC_COMMENT.key(), syncComment);
|
||||
return props;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,128 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.hive;
|
||||
|
||||
import org.apache.hudi.common.config.ConfigProperty;
|
||||
|
||||
public class HiveSyncConfigHolder {
|
||||
/*
|
||||
* NOTE: below are HIVE SYNC SPECIFIC CONFIGS which should be under HiveSyncConfig.java
|
||||
* But since DataSourceOptions.scala references constants to some of these, and HiveSyncConfig.java imports HiveConf,
|
||||
* it causes HiveConf ClassNotFound issue for loading DataSourceOptions.
|
||||
*
|
||||
* NOTE: DO NOT USE uppercase for the keys as they are internally lower-cased. Using upper-cases causes
|
||||
* unexpected issues with config getting reset
|
||||
*/
|
||||
public static final ConfigProperty<String> HIVE_SYNC_ENABLED = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.enable")
|
||||
.defaultValue("false")
|
||||
.withDocumentation("When set to true, register/sync the table to Apache Hive metastore.");
|
||||
public static final ConfigProperty<String> HIVE_USER = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.username")
|
||||
.defaultValue("hive")
|
||||
.withDocumentation("hive user name to use");
|
||||
public static final ConfigProperty<String> HIVE_PASS = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.password")
|
||||
.defaultValue("hive")
|
||||
.withDocumentation("hive password to use");
|
||||
public static final ConfigProperty<String> HIVE_URL = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.jdbcurl")
|
||||
.defaultValue("jdbc:hive2://localhost:10000")
|
||||
.withDocumentation("Hive metastore url");
|
||||
public static final ConfigProperty<String> HIVE_USE_PRE_APACHE_INPUT_FORMAT = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.use_pre_apache_input_format")
|
||||
.defaultValue("false")
|
||||
.withDocumentation("Flag to choose InputFormat under com.uber.hoodie package instead of org.apache.hudi package. "
|
||||
+ "Use this when you are in the process of migrating from "
|
||||
+ "com.uber.hoodie to org.apache.hudi. Stop using this after you migrated the table definition to org.apache.hudi input format");
|
||||
/**
|
||||
* @deprecated Use {@link #HIVE_SYNC_MODE} instead of this config from 0.9.0
|
||||
*/
|
||||
@Deprecated
|
||||
public static final ConfigProperty<String> HIVE_USE_JDBC = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.use_jdbc")
|
||||
.defaultValue("true")
|
||||
.deprecatedAfter("0.9.0")
|
||||
.withDocumentation("Use JDBC when hive synchronization is enabled");
|
||||
public static final ConfigProperty<String> METASTORE_URIS = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.metastore.uris")
|
||||
.defaultValue("thrift://localhost:9083")
|
||||
.withDocumentation("Hive metastore url");
|
||||
public static final ConfigProperty<String> HIVE_AUTO_CREATE_DATABASE = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.auto_create_database")
|
||||
.defaultValue("true")
|
||||
.withDocumentation("Auto create hive database if does not exists");
|
||||
public static final ConfigProperty<String> HIVE_IGNORE_EXCEPTIONS = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.ignore_exceptions")
|
||||
.defaultValue("false")
|
||||
.withDocumentation("Ignore exceptions when syncing with Hive.");
|
||||
public static final ConfigProperty<String> HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.skip_ro_suffix")
|
||||
.defaultValue("false")
|
||||
.withDocumentation("Skip the _ro suffix for Read optimized table, when registering");
|
||||
public static final ConfigProperty<String> HIVE_SUPPORT_TIMESTAMP_TYPE = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.support_timestamp")
|
||||
.defaultValue("false")
|
||||
.withDocumentation("‘INT64’ with original type TIMESTAMP_MICROS is converted to hive ‘timestamp’ type. "
|
||||
+ "Disabled by default for backward compatibility.");
|
||||
public static final ConfigProperty<String> HIVE_TABLE_PROPERTIES = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.table_properties")
|
||||
.noDefaultValue()
|
||||
.withDocumentation("Additional properties to store with table.");
|
||||
public static final ConfigProperty<String> HIVE_TABLE_SERDE_PROPERTIES = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.serde_properties")
|
||||
.noDefaultValue()
|
||||
.withDocumentation("Serde properties to hive table.");
|
||||
public static final ConfigProperty<String> HIVE_SYNC_AS_DATA_SOURCE_TABLE = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.sync_as_datasource")
|
||||
.defaultValue("true")
|
||||
.withDocumentation("");
|
||||
public static final ConfigProperty<Integer> HIVE_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.schema_string_length_thresh")
|
||||
.defaultValue(4000)
|
||||
.withDocumentation("");
|
||||
// Create table as managed table
|
||||
public static final ConfigProperty<Boolean> HIVE_CREATE_MANAGED_TABLE = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.create_managed_table")
|
||||
.defaultValue(false)
|
||||
.withDocumentation("Whether to sync the table as managed table.");
|
||||
public static final ConfigProperty<Integer> HIVE_BATCH_SYNC_PARTITION_NUM = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.batch_num")
|
||||
.defaultValue(1000)
|
||||
.withDocumentation("The number of partitions one batch when synchronous partitions to hive.");
|
||||
public static final ConfigProperty<String> HIVE_SYNC_MODE = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.mode")
|
||||
.noDefaultValue()
|
||||
.withDocumentation("Mode to choose for Hive ops. Valid values are hms, jdbc and hiveql.");
|
||||
public static final ConfigProperty<Boolean> HIVE_SYNC_BUCKET_SYNC = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.bucket_sync")
|
||||
.defaultValue(false)
|
||||
.withDocumentation("Whether sync hive metastore bucket specification when using bucket index."
|
||||
+ "The specification is 'CLUSTERED BY (trace_id) SORTED BY (trace_id ASC) INTO 65536 BUCKETS'");
|
||||
public static final ConfigProperty<String> HIVE_SYNC_BUCKET_SYNC_SPEC = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.bucket_sync_spec")
|
||||
.defaultValue("")
|
||||
.withDocumentation("The hive metastore bucket specification when using bucket index."
|
||||
+ "The specification is 'CLUSTERED BY (trace_id) SORTED BY (trace_id ASC) INTO 65536 BUCKETS'");
|
||||
public static final ConfigProperty<String> HIVE_SYNC_COMMENT = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.sync_comment")
|
||||
.defaultValue("false")
|
||||
.withDocumentation("Whether to sync the table column comments while syncing the table.");
|
||||
}
|
||||
@@ -18,37 +18,53 @@
|
||||
|
||||
package org.apache.hudi.hive;
|
||||
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.HoodieFileFormat;
|
||||
import org.apache.hudi.common.model.HoodieTableType;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.StringUtils;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.exception.InvalidTableException;
|
||||
import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils;
|
||||
import org.apache.hudi.sync.common.util.ConfigUtils;
|
||||
import org.apache.hudi.hive.util.HiveSchemaUtil;
|
||||
import org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent;
|
||||
import org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent.PartitionEventType;
|
||||
import org.apache.hudi.sync.common.AbstractSyncTool;
|
||||
import org.apache.hudi.sync.common.HoodieSyncClient;
|
||||
import org.apache.hudi.sync.common.HoodieSyncTool;
|
||||
import org.apache.hudi.sync.common.model.FieldSchema;
|
||||
import org.apache.hudi.sync.common.model.Partition;
|
||||
import org.apache.hudi.sync.common.model.PartitionEvent;
|
||||
import org.apache.hudi.sync.common.model.PartitionEvent.PartitionEventType;
|
||||
import org.apache.hudi.sync.common.util.ConfigUtils;
|
||||
import org.apache.hudi.sync.common.util.SparkDataSourceTableUtils;
|
||||
|
||||
import com.beust.jcommander.JCommander;
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.hive.conf.HiveConf;
|
||||
import org.apache.hadoop.hive.metastore.api.FieldSchema;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.parquet.schema.MessageType;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Properties;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_AUTO_CREATE_DATABASE;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_IGNORE_EXCEPTIONS;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_AS_DATA_SOURCE_TABLE;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_COMMENT;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_TABLE_PROPERTIES;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_TABLE_SERDE_PROPERTIES;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USE_PRE_APACHE_INPUT_FORMAT;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.METASTORE_URIS;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_CONDITIONAL_SYNC;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_SPARK_VERSION;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME;
|
||||
import static org.apache.hudi.sync.common.util.TableUtils.tableId;
|
||||
|
||||
/**
|
||||
* Tool to sync a hoodie HDFS table with a hive metastore table. Either use it as a api
|
||||
* HiveSyncTool.syncHoodieTable(HiveSyncConfig) or as a command line java -cp hoodie-hive-sync.jar HiveSyncTool [args]
|
||||
@@ -57,38 +73,34 @@ import java.util.stream.Collectors;
|
||||
* partitions incrementally (all the partitions modified since the last commit)
|
||||
*/
|
||||
@SuppressWarnings("WeakerAccess")
|
||||
public class HiveSyncTool extends AbstractSyncTool implements AutoCloseable {
|
||||
public class HiveSyncTool extends HoodieSyncTool implements AutoCloseable {
|
||||
|
||||
private static final Logger LOG = LogManager.getLogger(HiveSyncTool.class);
|
||||
public static final String SUFFIX_SNAPSHOT_TABLE = "_rt";
|
||||
public static final String SUFFIX_READ_OPTIMIZED_TABLE = "_ro";
|
||||
|
||||
protected HiveSyncConfig hiveSyncConfig;
|
||||
protected AbstractHiveSyncHoodieClient hoodieHiveClient;
|
||||
protected String snapshotTableName = null;
|
||||
protected Option<String> roTableName = null;
|
||||
protected final HiveSyncConfig config;
|
||||
protected final String databaseName;
|
||||
protected final String tableName;
|
||||
protected HoodieSyncClient syncClient;
|
||||
protected String snapshotTableName;
|
||||
protected Option<String> roTableName;
|
||||
|
||||
public HiveSyncTool(TypedProperties props, Configuration conf, FileSystem fs) {
|
||||
this(new HiveSyncConfig(props), new HiveConf(conf, HiveConf.class), fs);
|
||||
public HiveSyncTool(Properties props, Configuration hadoopConf) {
|
||||
super(props, hadoopConf);
|
||||
HiveSyncConfig config = new HiveSyncConfig(props, hadoopConf);
|
||||
this.config = config;
|
||||
this.databaseName = config.getStringOrDefault(META_SYNC_DATABASE_NAME);
|
||||
this.tableName = config.getString(META_SYNC_TABLE_NAME);
|
||||
initSyncClient(config);
|
||||
initTableNameVars(config);
|
||||
}
|
||||
|
||||
public HiveSyncTool(HiveSyncConfig hiveSyncConfig, HiveConf hiveConf, FileSystem fs) {
|
||||
super(hiveSyncConfig.getProps(), hiveConf, fs);
|
||||
// TODO: reconcile the way to set METASTOREURIS
|
||||
if (StringUtils.isNullOrEmpty(hiveConf.get(HiveConf.ConfVars.METASTOREURIS.varname))) {
|
||||
hiveConf.set(HiveConf.ConfVars.METASTOREURIS.varname, hiveSyncConfig.metastoreUris);
|
||||
}
|
||||
// HiveConf needs to load fs conf to allow instantiation via AWSGlueClientFactory
|
||||
hiveConf.addResource(fs.getConf());
|
||||
initClient(hiveSyncConfig, hiveConf);
|
||||
initConfig(hiveSyncConfig);
|
||||
}
|
||||
|
||||
protected void initClient(HiveSyncConfig hiveSyncConfig, HiveConf hiveConf) {
|
||||
protected void initSyncClient(HiveSyncConfig config) {
|
||||
try {
|
||||
this.hoodieHiveClient = new HoodieHiveClient(hiveSyncConfig, hiveConf, fs);
|
||||
this.syncClient = new HoodieHiveSyncClient(config);
|
||||
} catch (RuntimeException e) {
|
||||
if (hiveSyncConfig.ignoreExceptions) {
|
||||
if (config.getBoolean(HIVE_IGNORE_EXCEPTIONS)) {
|
||||
LOG.error("Got runtime exception when hive syncing, but continuing as ignoreExceptions config is set ", e);
|
||||
} else {
|
||||
throw new HoodieHiveSyncException("Got runtime exception when hive syncing", e);
|
||||
@@ -96,28 +108,22 @@ public class HiveSyncTool extends AbstractSyncTool implements AutoCloseable {
|
||||
}
|
||||
}
|
||||
|
||||
private void initConfig(HiveSyncConfig hiveSyncConfig) {
|
||||
// Set partitionFields to empty, when the NonPartitionedExtractor is used
|
||||
// TODO: HiveSyncConfig should be responsible for inferring config value
|
||||
if (NonPartitionedExtractor.class.getName().equals(hiveSyncConfig.partitionValueExtractorClass)) {
|
||||
LOG.warn("Set partitionFields to empty, since the NonPartitionedExtractor is used");
|
||||
hiveSyncConfig.partitionFields = new ArrayList<>();
|
||||
}
|
||||
this.hiveSyncConfig = hiveSyncConfig;
|
||||
if (hoodieHiveClient != null) {
|
||||
switch (hoodieHiveClient.getTableType()) {
|
||||
private void initTableNameVars(HiveSyncConfig config) {
|
||||
if (syncClient != null) {
|
||||
switch (syncClient.getTableType()) {
|
||||
case COPY_ON_WRITE:
|
||||
this.snapshotTableName = hiveSyncConfig.tableName;
|
||||
this.snapshotTableName = tableName;
|
||||
this.roTableName = Option.empty();
|
||||
break;
|
||||
case MERGE_ON_READ:
|
||||
this.snapshotTableName = hiveSyncConfig.tableName + SUFFIX_SNAPSHOT_TABLE;
|
||||
this.roTableName = hiveSyncConfig.skipROSuffix ? Option.of(hiveSyncConfig.tableName) :
|
||||
Option.of(hiveSyncConfig.tableName + SUFFIX_READ_OPTIMIZED_TABLE);
|
||||
this.snapshotTableName = tableName + SUFFIX_SNAPSHOT_TABLE;
|
||||
this.roTableName = config.getBoolean(HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE)
|
||||
? Option.of(tableName)
|
||||
: Option.of(tableName + SUFFIX_READ_OPTIMIZED_TABLE);
|
||||
break;
|
||||
default:
|
||||
LOG.error("Unknown table type " + hoodieHiveClient.getTableType());
|
||||
throw new InvalidTableException(hoodieHiveClient.getBasePath());
|
||||
LOG.error("Unknown table type " + syncClient.getTableType());
|
||||
throw new InvalidTableException(syncClient.getBasePath());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -125,21 +131,23 @@ public class HiveSyncTool extends AbstractSyncTool implements AutoCloseable {
|
||||
@Override
|
||||
public void syncHoodieTable() {
|
||||
try {
|
||||
if (hoodieHiveClient != null) {
|
||||
LOG.info("Syncing target hoodie table with hive table(" + hiveSyncConfig.tableName + "). Hive metastore URL :"
|
||||
+ hiveSyncConfig.jdbcUrl + ", basePath :" + hiveSyncConfig.basePath);
|
||||
if (syncClient != null) {
|
||||
LOG.info("Syncing target hoodie table with hive table("
|
||||
+ tableId(databaseName, tableName) + "). Hive metastore URL :"
|
||||
+ config.getString(METASTORE_URIS) + ", basePath :"
|
||||
+ config.getString(META_SYNC_BASE_PATH));
|
||||
|
||||
doSync();
|
||||
}
|
||||
} catch (RuntimeException re) {
|
||||
throw new HoodieException("Got runtime exception when hive syncing " + hiveSyncConfig.tableName, re);
|
||||
throw new HoodieException("Got runtime exception when hive syncing " + tableName, re);
|
||||
} finally {
|
||||
close();
|
||||
}
|
||||
}
|
||||
|
||||
protected void doSync() {
|
||||
switch (hoodieHiveClient.getTableType()) {
|
||||
switch (syncClient.getTableType()) {
|
||||
case COPY_ON_WRITE:
|
||||
syncHoodieTable(snapshotTableName, false, false);
|
||||
break;
|
||||
@@ -150,61 +158,60 @@ public class HiveSyncTool extends AbstractSyncTool implements AutoCloseable {
|
||||
syncHoodieTable(snapshotTableName, true, false);
|
||||
break;
|
||||
default:
|
||||
LOG.error("Unknown table type " + hoodieHiveClient.getTableType());
|
||||
throw new InvalidTableException(hoodieHiveClient.getBasePath());
|
||||
LOG.error("Unknown table type " + syncClient.getTableType());
|
||||
throw new InvalidTableException(syncClient.getBasePath());
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
if (hoodieHiveClient != null) {
|
||||
if (syncClient != null) {
|
||||
try {
|
||||
hoodieHiveClient.close();
|
||||
syncClient.close();
|
||||
} catch (Exception e) {
|
||||
throw new HoodieHiveSyncException("Fail to close sync client.", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
protected void syncHoodieTable(String tableName, boolean useRealtimeInputFormat,
|
||||
boolean readAsOptimized) {
|
||||
LOG.info("Trying to sync hoodie table " + tableName + " with base path " + hoodieHiveClient.getBasePath()
|
||||
+ " of type " + hoodieHiveClient.getTableType());
|
||||
protected void syncHoodieTable(String tableName, boolean useRealtimeInputFormat, boolean readAsOptimized) {
|
||||
LOG.info("Trying to sync hoodie table " + tableName + " with base path " + syncClient.getBasePath()
|
||||
+ " of type " + syncClient.getTableType());
|
||||
|
||||
// check if the database exists else create it
|
||||
if (hiveSyncConfig.autoCreateDatabase) {
|
||||
if (config.getBoolean(HIVE_AUTO_CREATE_DATABASE)) {
|
||||
try {
|
||||
if (!hoodieHiveClient.databaseExists(hiveSyncConfig.databaseName)) {
|
||||
hoodieHiveClient.createDatabase(hiveSyncConfig.databaseName);
|
||||
if (!syncClient.databaseExists(databaseName)) {
|
||||
syncClient.createDatabase(databaseName);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
// this is harmless since table creation will fail anyways, creation of DB is needed for in-memory testing
|
||||
LOG.warn("Unable to create database", e);
|
||||
}
|
||||
} else {
|
||||
if (!hoodieHiveClient.databaseExists(hiveSyncConfig.databaseName)) {
|
||||
LOG.error("Hive database does not exist " + hiveSyncConfig.databaseName);
|
||||
throw new HoodieHiveSyncException("hive database does not exist " + hiveSyncConfig.databaseName);
|
||||
if (!syncClient.databaseExists(databaseName)) {
|
||||
LOG.error("Hive database does not exist " + databaseName);
|
||||
throw new HoodieHiveSyncException("hive database does not exist " + databaseName);
|
||||
}
|
||||
}
|
||||
|
||||
// Check if the necessary table exists
|
||||
boolean tableExists = hoodieHiveClient.tableExists(tableName);
|
||||
boolean tableExists = syncClient.tableExists(tableName);
|
||||
|
||||
// check if isDropPartition
|
||||
boolean isDropPartition = hoodieHiveClient.isDropPartition();
|
||||
boolean isDropPartition = syncClient.isDropPartition();
|
||||
|
||||
// Get the parquet schema for this table looking at the latest commit
|
||||
MessageType schema = hoodieHiveClient.getDataSchema();
|
||||
MessageType schema = syncClient.getStorageSchema();
|
||||
|
||||
// Currently HoodieBootstrapRelation does support reading bootstrap MOR rt table,
|
||||
// so we disable the syncAsSparkDataSourceTable here to avoid read such kind table
|
||||
// by the data source way (which will use the HoodieBootstrapRelation).
|
||||
// TODO after we support bootstrap MOR rt table in HoodieBootstrapRelation[HUDI-2071], we can remove this logical.
|
||||
if (hoodieHiveClient.isBootstrap()
|
||||
&& hoodieHiveClient.getTableType() == HoodieTableType.MERGE_ON_READ
|
||||
&& !readAsOptimized) {
|
||||
hiveSyncConfig.syncAsSparkDataSourceTable = false;
|
||||
if (syncClient.isBootstrap()
|
||||
&& syncClient.getTableType() == HoodieTableType.MERGE_ON_READ
|
||||
&& !readAsOptimized) {
|
||||
config.setValue(HIVE_SYNC_AS_DATA_SOURCE_TABLE, "false");
|
||||
}
|
||||
|
||||
// Sync schema if needed
|
||||
@@ -214,17 +221,17 @@ public class HiveSyncTool extends AbstractSyncTool implements AutoCloseable {
|
||||
// Get the last time we successfully synced partitions
|
||||
Option<String> lastCommitTimeSynced = Option.empty();
|
||||
if (tableExists) {
|
||||
lastCommitTimeSynced = hoodieHiveClient.getLastCommitTimeSynced(tableName);
|
||||
lastCommitTimeSynced = syncClient.getLastCommitTimeSynced(tableName);
|
||||
}
|
||||
LOG.info("Last commit time synced was found to be " + lastCommitTimeSynced.orElse("null"));
|
||||
List<String> writtenPartitionsSince = hoodieHiveClient.getPartitionsWrittenToSince(lastCommitTimeSynced);
|
||||
List<String> writtenPartitionsSince = syncClient.getPartitionsWrittenToSince(lastCommitTimeSynced);
|
||||
LOG.info("Storage partitions scan complete. Found " + writtenPartitionsSince.size());
|
||||
|
||||
// Sync the partitions if needed
|
||||
boolean partitionsChanged = syncPartitions(tableName, writtenPartitionsSince, isDropPartition);
|
||||
boolean meetSyncConditions = schemaChanged || partitionsChanged;
|
||||
if (!hiveSyncConfig.isConditionalSync || meetSyncConditions) {
|
||||
hoodieHiveClient.updateLastCommitTimeSynced(tableName);
|
||||
if (!config.getBoolean(META_SYNC_CONDITIONAL_SYNC) || meetSyncConditions) {
|
||||
syncClient.updateLastCommitTimeSynced(tableName);
|
||||
}
|
||||
LOG.info("Sync complete for " + tableName);
|
||||
}
|
||||
@@ -233,18 +240,18 @@ public class HiveSyncTool extends AbstractSyncTool implements AutoCloseable {
|
||||
* Get the latest schema from the last commit and check if its in sync with the hive table schema. If not, evolves the
|
||||
* table schema.
|
||||
*
|
||||
* @param tableExists - does table exist
|
||||
* @param schema - extracted schema
|
||||
* @param tableExists does table exist
|
||||
* @param schema extracted schema
|
||||
*/
|
||||
private boolean syncSchema(String tableName, boolean tableExists, boolean useRealTimeInputFormat,
|
||||
boolean readAsOptimized, MessageType schema) {
|
||||
boolean readAsOptimized, MessageType schema) {
|
||||
// Append spark table properties & serde properties
|
||||
Map<String, String> tableProperties = ConfigUtils.toMap(hiveSyncConfig.tableProperties);
|
||||
Map<String, String> serdeProperties = ConfigUtils.toMap(hiveSyncConfig.serdeProperties);
|
||||
if (hiveSyncConfig.syncAsSparkDataSourceTable) {
|
||||
Map<String, String> sparkTableProperties = getSparkTableProperties(hiveSyncConfig.partitionFields,
|
||||
hiveSyncConfig.sparkVersion, hiveSyncConfig.sparkSchemaLengthThreshold, schema);
|
||||
Map<String, String> sparkSerdeProperties = getSparkSerdeProperties(readAsOptimized, hiveSyncConfig.basePath);
|
||||
Map<String, String> tableProperties = ConfigUtils.toMap(config.getString(HIVE_TABLE_PROPERTIES));
|
||||
Map<String, String> serdeProperties = ConfigUtils.toMap(config.getString(HIVE_TABLE_SERDE_PROPERTIES));
|
||||
if (config.getBoolean(HIVE_SYNC_AS_DATA_SOURCE_TABLE)) {
|
||||
Map<String, String> sparkTableProperties = SparkDataSourceTableUtils.getSparkTableProperties(config.getSplitStrings(META_SYNC_PARTITION_FIELDS),
|
||||
config.getStringOrDefault(META_SYNC_SPARK_VERSION), config.getIntOrDefault(HIVE_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD), schema);
|
||||
Map<String, String> sparkSerdeProperties = SparkDataSourceTableUtils.getSparkSerdeProperties(readAsOptimized, config.getString(META_SYNC_BASE_PATH));
|
||||
tableProperties.putAll(sparkTableProperties);
|
||||
serdeProperties.putAll(sparkSerdeProperties);
|
||||
}
|
||||
@@ -252,10 +259,10 @@ public class HiveSyncTool extends AbstractSyncTool implements AutoCloseable {
|
||||
// Check and sync schema
|
||||
if (!tableExists) {
|
||||
LOG.info("Hive table " + tableName + " is not found. Creating it");
|
||||
HoodieFileFormat baseFileFormat = HoodieFileFormat.valueOf(hiveSyncConfig.baseFileFormat.toUpperCase());
|
||||
HoodieFileFormat baseFileFormat = HoodieFileFormat.valueOf(config.getStringOrDefault(META_SYNC_BASE_FILE_FORMAT).toUpperCase());
|
||||
String inputFormatClassName = HoodieInputFormatUtils.getInputFormatClassName(baseFileFormat, useRealTimeInputFormat);
|
||||
|
||||
if (baseFileFormat.equals(HoodieFileFormat.PARQUET) && hiveSyncConfig.usePreApacheInputFormat) {
|
||||
if (baseFileFormat.equals(HoodieFileFormat.PARQUET) && config.getBooleanOrDefault(HIVE_USE_PRE_APACHE_INPUT_FORMAT)) {
|
||||
// Parquet input format had an InputFormat class visible under the old naming scheme.
|
||||
inputFormatClassName = useRealTimeInputFormat
|
||||
? com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat.class.getName()
|
||||
@@ -268,19 +275,20 @@ public class HiveSyncTool extends AbstractSyncTool implements AutoCloseable {
|
||||
// Custom serde will not work with ALTER TABLE REPLACE COLUMNS
|
||||
// https://github.com/apache/hive/blob/release-1.1.0/ql/src/java/org/apache/hadoop/hive
|
||||
// /ql/exec/DDLTask.java#L3488
|
||||
hoodieHiveClient.createTable(tableName, schema, inputFormatClassName,
|
||||
syncClient.createTable(tableName, schema, inputFormatClassName,
|
||||
outputFormatClassName, serDeFormatClassName, serdeProperties, tableProperties);
|
||||
schemaChanged = true;
|
||||
} else {
|
||||
// Check if the table schema has evolved
|
||||
Map<String, String> tableSchema = hoodieHiveClient.getTableSchema(tableName);
|
||||
SchemaDifference schemaDiff = HiveSchemaUtil.getSchemaDifference(schema, tableSchema, hiveSyncConfig.partitionFields, hiveSyncConfig.supportTimestamp);
|
||||
Map<String, String> tableSchema = syncClient.getMetastoreSchema(tableName);
|
||||
SchemaDifference schemaDiff = HiveSchemaUtil.getSchemaDifference(schema, tableSchema, config.getSplitStrings(META_SYNC_PARTITION_FIELDS),
|
||||
config.getBooleanOrDefault(HIVE_SUPPORT_TIMESTAMP_TYPE));
|
||||
if (!schemaDiff.isEmpty()) {
|
||||
LOG.info("Schema difference found for " + tableName);
|
||||
hoodieHiveClient.updateTableDefinition(tableName, schema);
|
||||
syncClient.updateTableSchema(tableName, schema);
|
||||
// Sync the table properties if the schema has changed
|
||||
if (hiveSyncConfig.tableProperties != null || hiveSyncConfig.syncAsSparkDataSourceTable) {
|
||||
hoodieHiveClient.updateTableProperties(tableName, tableProperties);
|
||||
if (config.getString(HIVE_TABLE_PROPERTIES) != null || config.getBoolean(HIVE_SYNC_AS_DATA_SOURCE_TABLE)) {
|
||||
syncClient.updateTableProperties(tableName, tableProperties);
|
||||
LOG.info("Sync table properties for " + tableName + ", table properties is: " + tableProperties);
|
||||
}
|
||||
schemaChanged = true;
|
||||
@@ -289,17 +297,10 @@ public class HiveSyncTool extends AbstractSyncTool implements AutoCloseable {
|
||||
}
|
||||
}
|
||||
|
||||
if (hiveSyncConfig.syncComment) {
|
||||
Schema avroSchemaWithoutMetadataFields = hoodieHiveClient.getAvroSchemaWithoutMetadataFields();
|
||||
Map<String, String> newComments = avroSchemaWithoutMetadataFields.getFields()
|
||||
.stream().collect(Collectors.toMap(Schema.Field::name, field -> StringUtils.isNullOrEmpty(field.doc()) ? "" : field.doc()));
|
||||
boolean allEmpty = newComments.values().stream().allMatch(StringUtils::isNullOrEmpty);
|
||||
if (!allEmpty) {
|
||||
List<FieldSchema> hiveSchema = hoodieHiveClient.getTableCommentUsingMetastoreClient(tableName);
|
||||
hoodieHiveClient.updateTableComments(tableName, hiveSchema, avroSchemaWithoutMetadataFields.getFields());
|
||||
} else {
|
||||
LOG.info(String.format("No comment %s need to add", tableName));
|
||||
}
|
||||
if (config.getBoolean(HIVE_SYNC_COMMENT)) {
|
||||
List<FieldSchema> fromMetastore = syncClient.getMetastoreFieldSchemas(tableName);
|
||||
List<FieldSchema> fromStorage = syncClient.getStorageFieldSchemas();
|
||||
syncClient.updateTableComments(tableName, fromMetastore, fromStorage);
|
||||
}
|
||||
return schemaChanged;
|
||||
}
|
||||
@@ -311,26 +312,26 @@ public class HiveSyncTool extends AbstractSyncTool implements AutoCloseable {
|
||||
private boolean syncPartitions(String tableName, List<String> writtenPartitionsSince, boolean isDropPartition) {
|
||||
boolean partitionsChanged;
|
||||
try {
|
||||
List<Partition> hivePartitions = hoodieHiveClient.getAllPartitions(tableName);
|
||||
List<Partition> hivePartitions = syncClient.getAllPartitions(tableName);
|
||||
List<PartitionEvent> partitionEvents =
|
||||
hoodieHiveClient.getPartitionEvents(hivePartitions, writtenPartitionsSince, isDropPartition);
|
||||
syncClient.getPartitionEvents(hivePartitions, writtenPartitionsSince, isDropPartition);
|
||||
|
||||
List<String> newPartitions = filterPartitions(partitionEvents, PartitionEventType.ADD);
|
||||
if (!newPartitions.isEmpty()) {
|
||||
LOG.info("New Partitions " + newPartitions);
|
||||
hoodieHiveClient.addPartitionsToTable(tableName, newPartitions);
|
||||
syncClient.addPartitionsToTable(tableName, newPartitions);
|
||||
}
|
||||
|
||||
List<String> updatePartitions = filterPartitions(partitionEvents, PartitionEventType.UPDATE);
|
||||
if (!updatePartitions.isEmpty()) {
|
||||
LOG.info("Changed Partitions " + updatePartitions);
|
||||
hoodieHiveClient.updatePartitionsToTable(tableName, updatePartitions);
|
||||
syncClient.updatePartitionsToTable(tableName, updatePartitions);
|
||||
}
|
||||
|
||||
List<String> dropPartitions = filterPartitions(partitionEvents, PartitionEventType.DROP);
|
||||
if (!dropPartitions.isEmpty()) {
|
||||
LOG.info("Drop Partitions " + dropPartitions);
|
||||
hoodieHiveClient.dropPartitions(tableName, dropPartitions);
|
||||
syncClient.dropPartitions(tableName, dropPartitions);
|
||||
}
|
||||
|
||||
partitionsChanged = !updatePartitions.isEmpty() || !newPartitions.isEmpty() || !dropPartitions.isEmpty();
|
||||
@@ -346,16 +347,13 @@ public class HiveSyncTool extends AbstractSyncTool implements AutoCloseable {
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
// parse the params
|
||||
final HiveSyncConfig cfg = new HiveSyncConfig();
|
||||
JCommander cmd = new JCommander(cfg, null, args);
|
||||
if (cfg.help || args.length == 0) {
|
||||
final HiveSyncConfig.HiveSyncConfigParams params = new HiveSyncConfig.HiveSyncConfigParams();
|
||||
JCommander cmd = JCommander.newBuilder().addObject(params).build();
|
||||
cmd.parse(args);
|
||||
if (params.isHelp()) {
|
||||
cmd.usage();
|
||||
System.exit(1);
|
||||
System.exit(0);
|
||||
}
|
||||
FileSystem fs = FSUtils.getFs(cfg.basePath, new Configuration());
|
||||
HiveConf hiveConf = new HiveConf();
|
||||
hiveConf.addResource(fs.getConf());
|
||||
new HiveSyncTool(cfg, hiveConf, fs).syncHoodieTable();
|
||||
new HiveSyncTool(params.toProps(), new Configuration()).syncHoodieTable();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,22 +18,21 @@
|
||||
|
||||
package org.apache.hudi.hive;
|
||||
|
||||
import org.apache.hudi.common.table.TableSchemaResolver;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.StringUtils;
|
||||
import org.apache.hudi.common.util.collection.ImmutablePair;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.hive.ddl.DDLExecutor;
|
||||
import org.apache.hudi.hive.ddl.HMSDDLExecutor;
|
||||
import org.apache.hudi.hive.ddl.HiveQueryDDLExecutor;
|
||||
import org.apache.hudi.hive.ddl.HiveSyncMode;
|
||||
import org.apache.hudi.hive.ddl.JDBCExecutor;
|
||||
import org.apache.hudi.sync.common.HoodieSyncClient;
|
||||
import org.apache.hudi.sync.common.model.FieldSchema;
|
||||
import org.apache.hudi.sync.common.model.Partition;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.hive.conf.HiveConf;
|
||||
import org.apache.hadoop.hive.metastore.IMetaStoreClient;
|
||||
import org.apache.hadoop.hive.metastore.api.FieldSchema;
|
||||
import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
|
||||
import org.apache.hadoop.hive.metastore.api.Table;
|
||||
import org.apache.hadoop.hive.ql.metadata.Hive;
|
||||
@@ -49,115 +48,100 @@ import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static org.apache.hudi.hadoop.utils.HoodieHiveUtils.GLOBALLY_CONSISTENT_READ_TIMESTAMP;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_MODE;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USE_JDBC;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
|
||||
import static org.apache.hudi.sync.common.util.TableUtils.tableId;
|
||||
|
||||
/**
|
||||
* This class implements logic to sync a Hudi table with either the Hive server or the Hive Metastore.
|
||||
*/
|
||||
public class HoodieHiveClient extends AbstractHiveSyncHoodieClient {
|
||||
public class HoodieHiveSyncClient extends HoodieSyncClient {
|
||||
|
||||
private static final Logger LOG = LogManager.getLogger(HoodieHiveClient.class);
|
||||
private static final Logger LOG = LogManager.getLogger(HoodieHiveSyncClient.class);
|
||||
protected final HiveSyncConfig config;
|
||||
private final String databaseName;
|
||||
DDLExecutor ddlExecutor;
|
||||
private IMetaStoreClient client;
|
||||
|
||||
public HoodieHiveClient(HiveSyncConfig cfg, HiveConf configuration, FileSystem fs) {
|
||||
super(cfg, configuration, fs);
|
||||
public HoodieHiveSyncClient(HiveSyncConfig config) {
|
||||
super(config);
|
||||
this.config = config;
|
||||
this.databaseName = config.getStringOrDefault(META_SYNC_DATABASE_NAME);
|
||||
|
||||
// Support JDBC, HiveQL and metastore based implementations for backwards compatibility. Future users should
|
||||
// disable jdbc and depend on metastore client for all hive registrations
|
||||
try {
|
||||
if (!StringUtils.isNullOrEmpty(cfg.syncMode)) {
|
||||
HiveSyncMode syncMode = HiveSyncMode.of(cfg.syncMode);
|
||||
if (!StringUtils.isNullOrEmpty(config.getString(HIVE_SYNC_MODE))) {
|
||||
HiveSyncMode syncMode = HiveSyncMode.of(config.getString(HIVE_SYNC_MODE));
|
||||
switch (syncMode) {
|
||||
case HMS:
|
||||
ddlExecutor = new HMSDDLExecutor(configuration, cfg, fs);
|
||||
ddlExecutor = new HMSDDLExecutor(config);
|
||||
break;
|
||||
case HIVEQL:
|
||||
ddlExecutor = new HiveQueryDDLExecutor(cfg, fs, configuration);
|
||||
ddlExecutor = new HiveQueryDDLExecutor(config);
|
||||
break;
|
||||
case JDBC:
|
||||
ddlExecutor = new JDBCExecutor(cfg, fs);
|
||||
ddlExecutor = new JDBCExecutor(config);
|
||||
break;
|
||||
default:
|
||||
throw new HoodieHiveSyncException("Invalid sync mode given " + cfg.syncMode);
|
||||
throw new HoodieHiveSyncException("Invalid sync mode given " + config.getString(HIVE_SYNC_MODE));
|
||||
}
|
||||
} else {
|
||||
ddlExecutor = cfg.useJdbc ? new JDBCExecutor(cfg, fs) : new HiveQueryDDLExecutor(cfg, fs, configuration);
|
||||
ddlExecutor = config.getBoolean(HIVE_USE_JDBC) ? new JDBCExecutor(config) : new HiveQueryDDLExecutor(config);
|
||||
}
|
||||
this.client = Hive.get(configuration).getMSC();
|
||||
this.client = Hive.get(config.getHiveConf()).getMSC();
|
||||
} catch (Exception e) {
|
||||
throw new HoodieHiveSyncException("Failed to create HiveMetaStoreClient", e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add the (NEW) partitions to the table.
|
||||
*/
|
||||
@Override
|
||||
public void addPartitionsToTable(String tableName, List<String> partitionsToAdd) {
|
||||
ddlExecutor.addPartitionsToTable(tableName, partitionsToAdd);
|
||||
}
|
||||
|
||||
/**
|
||||
* Partition path has changed - update the path for te following partitions.
|
||||
*/
|
||||
@Override
|
||||
public void updatePartitionsToTable(String tableName, List<String> changedPartitions) {
|
||||
ddlExecutor.updatePartitionsToTable(tableName, changedPartitions);
|
||||
}
|
||||
|
||||
/**
|
||||
* Partition path has changed - drop the following partitions.
|
||||
*/
|
||||
@Override
|
||||
public void dropPartitions(String tableName, List<String> partitionsToDrop) {
|
||||
ddlExecutor.dropPartitionsToTable(tableName, partitionsToDrop);
|
||||
}
|
||||
|
||||
/**
|
||||
* Update the table properties to the table.
|
||||
*/
|
||||
@Override
|
||||
public void updateTableProperties(String tableName, Map<String, String> tableProperties) {
|
||||
if (tableProperties == null || tableProperties.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
try {
|
||||
Table table = client.getTable(syncConfig.databaseName, tableName);
|
||||
Table table = client.getTable(databaseName, tableName);
|
||||
for (Map.Entry<String, String> entry : tableProperties.entrySet()) {
|
||||
table.putToParameters(entry.getKey(), entry.getValue());
|
||||
}
|
||||
client.alter_table(syncConfig.databaseName, tableName, table);
|
||||
client.alter_table(databaseName, tableName, table);
|
||||
} catch (Exception e) {
|
||||
throw new HoodieHiveSyncException("Failed to update table properties for table: "
|
||||
+ tableName, e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Scan table partitions.
|
||||
*
|
||||
* @deprecated Use {@link #getAllPartitions} instead.
|
||||
*/
|
||||
@Deprecated
|
||||
public List<org.apache.hadoop.hive.metastore.api.Partition> scanTablePartitions(String tableName) throws TException {
|
||||
return client.listPartitions(syncConfig.databaseName, tableName, (short) -1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void updateTableDefinition(String tableName, MessageType newSchema) {
|
||||
public void updateTableSchema(String tableName, MessageType newSchema) {
|
||||
ddlExecutor.updateTableDefinition(tableName, newSchema);
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<Partition> getAllPartitions(String tableName) {
|
||||
try {
|
||||
return client.listPartitions(syncConfig.databaseName, tableName, (short) -1)
|
||||
return client.listPartitions(databaseName, tableName, (short) -1)
|
||||
.stream()
|
||||
.map(p -> new Partition(p.getValues(), p.getSd().getLocation()))
|
||||
.collect(Collectors.toList());
|
||||
} catch (TException e) {
|
||||
throw new HoodieHiveSyncException("Failed to get all partitions for table " + tableId(syncConfig.databaseName, tableName), e);
|
||||
throw new HoodieHiveSyncException("Failed to get all partitions for table " + tableId(databaseName, tableName), e);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -168,11 +152,8 @@ public class HoodieHiveClient extends AbstractHiveSyncHoodieClient {
|
||||
ddlExecutor.createTable(tableName, storageSchema, inputFormatClass, outputFormatClass, serdeClass, serdeProperties, tableProperties);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the table schema.
|
||||
*/
|
||||
@Override
|
||||
public Map<String, String> getTableSchema(String tableName) {
|
||||
public Map<String, String> getMetastoreSchema(String tableName) {
|
||||
if (!tableExists(tableName)) {
|
||||
throw new IllegalArgumentException(
|
||||
"Failed to get schema for table " + tableName + " does not exist");
|
||||
@@ -180,26 +161,15 @@ public class HoodieHiveClient extends AbstractHiveSyncHoodieClient {
|
||||
return ddlExecutor.getTableSchema(tableName);
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
@Override
|
||||
public boolean doesTableExist(String tableName) {
|
||||
return tableExists(tableName);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean tableExists(String tableName) {
|
||||
try {
|
||||
return client.tableExists(syncConfig.databaseName, tableName);
|
||||
return client.tableExists(databaseName, tableName);
|
||||
} catch (TException e) {
|
||||
throw new HoodieHiveSyncException("Failed to check if table exists " + tableName, e);
|
||||
}
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
public boolean doesDataBaseExist(String databaseName) {
|
||||
return databaseExists(databaseName);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean databaseExists(String databaseName) {
|
||||
try {
|
||||
@@ -222,7 +192,7 @@ public class HoodieHiveClient extends AbstractHiveSyncHoodieClient {
|
||||
public Option<String> getLastCommitTimeSynced(String tableName) {
|
||||
// Get the last commit time from the TBLproperties
|
||||
try {
|
||||
Table table = client.getTable(syncConfig.databaseName, tableName);
|
||||
Table table = client.getTable(databaseName, tableName);
|
||||
return Option.ofNullable(table.getParameters().getOrDefault(HOODIE_LAST_COMMIT_TIME_SYNC, null));
|
||||
} catch (Exception e) {
|
||||
throw new HoodieHiveSyncException("Failed to get the last commit time synced from the table " + tableName, e);
|
||||
@@ -232,10 +202,10 @@ public class HoodieHiveClient extends AbstractHiveSyncHoodieClient {
|
||||
public Option<String> getLastReplicatedTime(String tableName) {
|
||||
// Get the last replicated time from the TBLproperties
|
||||
try {
|
||||
Table table = client.getTable(syncConfig.databaseName, tableName);
|
||||
Table table = client.getTable(databaseName, tableName);
|
||||
return Option.ofNullable(table.getParameters().getOrDefault(GLOBALLY_CONSISTENT_READ_TIMESTAMP, null));
|
||||
} catch (NoSuchObjectException e) {
|
||||
LOG.warn("the said table not found in hms " + syncConfig.databaseName + "." + tableName);
|
||||
LOG.warn("the said table not found in hms " + tableId(databaseName, tableName));
|
||||
return Option.empty();
|
||||
} catch (Exception e) {
|
||||
throw new HoodieHiveSyncException("Failed to get the last replicated time from the table " + tableName, e);
|
||||
@@ -243,15 +213,14 @@ public class HoodieHiveClient extends AbstractHiveSyncHoodieClient {
|
||||
}
|
||||
|
||||
public void updateLastReplicatedTimeStamp(String tableName, String timeStamp) {
|
||||
if (!activeTimeline.filterCompletedInstants().getInstants()
|
||||
.anyMatch(i -> i.getTimestamp().equals(timeStamp))) {
|
||||
if (getActiveTimeline().getInstants().noneMatch(i -> i.getTimestamp().equals(timeStamp))) {
|
||||
throw new HoodieHiveSyncException(
|
||||
"Not a valid completed timestamp " + timeStamp + " for table " + tableName);
|
||||
}
|
||||
try {
|
||||
Table table = client.getTable(syncConfig.databaseName, tableName);
|
||||
Table table = client.getTable(databaseName, tableName);
|
||||
table.putToParameters(GLOBALLY_CONSISTENT_READ_TIMESTAMP, timeStamp);
|
||||
client.alter_table(syncConfig.databaseName, tableName, table);
|
||||
client.alter_table(databaseName, tableName, table);
|
||||
} catch (Exception e) {
|
||||
throw new HoodieHiveSyncException(
|
||||
"Failed to update last replicated time to " + timeStamp + " for " + tableName, e);
|
||||
@@ -260,9 +229,9 @@ public class HoodieHiveClient extends AbstractHiveSyncHoodieClient {
|
||||
|
||||
public void deleteLastReplicatedTimeStamp(String tableName) {
|
||||
try {
|
||||
Table table = client.getTable(syncConfig.databaseName, tableName);
|
||||
Table table = client.getTable(databaseName, tableName);
|
||||
String timestamp = table.getParameters().remove(GLOBALLY_CONSISTENT_READ_TIMESTAMP);
|
||||
client.alter_table(syncConfig.databaseName, tableName, table);
|
||||
client.alter_table(databaseName, tableName, table);
|
||||
if (timestamp != null) {
|
||||
LOG.info("deleted last replicated timestamp " + timestamp + " for table " + tableName);
|
||||
}
|
||||
@@ -290,12 +259,12 @@ public class HoodieHiveClient extends AbstractHiveSyncHoodieClient {
|
||||
@Override
|
||||
public void updateLastCommitTimeSynced(String tableName) {
|
||||
// Set the last commit time from the TBLproperties
|
||||
Option<String> lastCommitSynced = activeTimeline.lastInstant().map(HoodieInstant::getTimestamp);
|
||||
Option<String> lastCommitSynced = getActiveTimeline().lastInstant().map(HoodieInstant::getTimestamp);
|
||||
if (lastCommitSynced.isPresent()) {
|
||||
try {
|
||||
Table table = client.getTable(syncConfig.databaseName, tableName);
|
||||
Table table = client.getTable(databaseName, tableName);
|
||||
table.putToParameters(HOODIE_LAST_COMMIT_TIME_SYNC, lastCommitSynced.get());
|
||||
client.alter_table(syncConfig.databaseName, tableName, table);
|
||||
client.alter_table(databaseName, tableName, table);
|
||||
} catch (Exception e) {
|
||||
throw new HoodieHiveSyncException("Failed to get update last commit time synced to " + lastCommitSynced, e);
|
||||
}
|
||||
@@ -303,36 +272,48 @@ public class HoodieHiveClient extends AbstractHiveSyncHoodieClient {
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<FieldSchema> getTableCommentUsingMetastoreClient(String tableName) {
|
||||
public List<FieldSchema> getMetastoreFieldSchemas(String tableName) {
|
||||
try {
|
||||
return client.getSchema(syncConfig.databaseName, tableName);
|
||||
return client.getSchema(databaseName, tableName)
|
||||
.stream()
|
||||
.map(f -> new FieldSchema(f.getName(), f.getType(), f.getComment()))
|
||||
.collect(Collectors.toList());
|
||||
} catch (Exception e) {
|
||||
throw new HoodieHiveSyncException("Failed to get table comments for : " + tableName, e);
|
||||
throw new HoodieHiveSyncException("Failed to get field schemas from metastore for table : " + tableName, e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void updateTableComments(String tableName, List<FieldSchema> oldSchema, List<Schema.Field> newSchema) {
|
||||
Map<String,String> newComments = newSchema.stream().collect(Collectors.toMap(field -> field.name().toLowerCase(Locale.ROOT), field -> StringUtils.isNullOrEmpty(field.doc()) ? "" : field.doc()));
|
||||
updateTableComments(tableName,oldSchema,newComments);
|
||||
public List<FieldSchema> getStorageFieldSchemas() {
|
||||
try {
|
||||
return new TableSchemaResolver(metaClient).getTableAvroSchema(false)
|
||||
.getFields()
|
||||
.stream()
|
||||
.map(f -> new FieldSchema(f.name(), f.schema().getType().getName(), f.doc()))
|
||||
.collect(Collectors.toList());
|
||||
} catch (Exception e) {
|
||||
throw new HoodieHiveSyncException("Failed to get field schemas from storage : ", e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void updateTableComments(String tableName, List<FieldSchema> oldSchema, Map<String,String> newComments) {
|
||||
Map<String,String> oldComments = oldSchema.stream().collect(Collectors.toMap(fieldSchema -> fieldSchema.getName().toLowerCase(Locale.ROOT),
|
||||
fieldSchema -> StringUtils.isNullOrEmpty(fieldSchema.getComment()) ? "" : fieldSchema.getComment()));
|
||||
Map<String,String> types = oldSchema.stream().collect(Collectors.toMap(FieldSchema::getName, FieldSchema::getType));
|
||||
Map<String, ImmutablePair<String,String>> alterComments = new HashMap<>();
|
||||
oldComments.forEach((name,comment) -> {
|
||||
String newComment = newComments.getOrDefault(name,"");
|
||||
if (!newComment.equals(comment)) {
|
||||
alterComments.put(name,new ImmutablePair<>(types.get(name),newComment));
|
||||
public void updateTableComments(String tableName, List<FieldSchema> fromMetastore, List<FieldSchema> fromStorage) {
|
||||
Map<String, FieldSchema> metastoreMap = fromMetastore.stream().collect(Collectors.toMap(f -> f.getName().toLowerCase(Locale.ROOT), f -> f));
|
||||
Map<String, FieldSchema> storageMap = fromStorage.stream().collect(Collectors.toMap(f -> f.getName().toLowerCase(Locale.ROOT), f -> f));
|
||||
Map<String, Pair<String, String>> alterComments = new HashMap<>();
|
||||
metastoreMap.forEach((name, metastoreFieldSchema) -> {
|
||||
if (storageMap.containsKey(name)) {
|
||||
boolean updated = metastoreFieldSchema.updateComment(storageMap.get(name));
|
||||
if (updated) {
|
||||
alterComments.put(name, Pair.of(metastoreFieldSchema.getType(), metastoreFieldSchema.getCommentOrEmpty()));
|
||||
}
|
||||
}
|
||||
});
|
||||
if (alterComments.size() > 0) {
|
||||
ddlExecutor.updateTableComments(tableName, alterComments);
|
||||
if (alterComments.isEmpty()) {
|
||||
LOG.info(String.format("No comment difference of %s ", tableName));
|
||||
} else {
|
||||
LOG.info(String.format("No comment difference of %s ",tableName));
|
||||
ddlExecutor.updateTableComments(tableName, alterComments);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -7,19 +7,21 @@
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.hive;
|
||||
|
||||
import java.util.Collections;
|
||||
import org.apache.hudi.common.util.ValidationUtils;
|
||||
import org.apache.hudi.sync.common.model.PartitionValueExtractor;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
@@ -7,17 +7,20 @@
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.hive;
|
||||
|
||||
import org.apache.hudi.sync.common.model.PartitionValueExtractor;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
@@ -7,17 +7,20 @@
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.hive;
|
||||
|
||||
import org.apache.hudi.sync.common.model.PartitionValueExtractor;
|
||||
|
||||
import java.time.LocalDateTime;
|
||||
import java.time.ZoneId;
|
||||
import java.time.ZonedDateTime;
|
||||
|
||||
@@ -7,17 +7,20 @@
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.hive;
|
||||
|
||||
import org.apache.hudi.sync.common.model.PartitionValueExtractor;
|
||||
|
||||
import java.time.LocalDateTime;
|
||||
import java.time.ZoneId;
|
||||
import java.time.ZonedDateTime;
|
||||
|
||||
@@ -18,7 +18,7 @@
|
||||
|
||||
package org.apache.hudi.hive.ddl;
|
||||
|
||||
import org.apache.hudi.common.util.collection.ImmutablePair;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
|
||||
import org.apache.parquet.schema.MessageType;
|
||||
|
||||
@@ -98,5 +98,5 @@ public interface DDLExecutor extends AutoCloseable {
|
||||
* @param tableName
|
||||
* @param newSchema Map key: field name, Map value: [field type, field comment]
|
||||
*/
|
||||
void updateTableComments(String tableName, Map<String, ImmutablePair<String, String>> newSchema);
|
||||
void updateTableComments(String tableName, Map<String, Pair<String, String>> newSchema);
|
||||
}
|
||||
|
||||
@@ -20,17 +20,15 @@ package org.apache.hudi.hive.ddl;
|
||||
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.fs.StorageSchemes;
|
||||
import org.apache.hudi.common.util.collection.ImmutablePair;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.hive.HiveSyncConfig;
|
||||
import org.apache.hudi.hive.HoodieHiveSyncException;
|
||||
import org.apache.hudi.hive.PartitionValueExtractor;
|
||||
import org.apache.hudi.hive.util.HivePartitionUtil;
|
||||
import org.apache.hudi.hive.util.HiveSchemaUtil;
|
||||
import org.apache.hudi.sync.common.model.PartitionValueExtractor;
|
||||
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.hive.common.StatsSetupConst;
|
||||
import org.apache.hadoop.hive.conf.HiveConf;
|
||||
import org.apache.hadoop.hive.metastore.IMetaStoreClient;
|
||||
import org.apache.hadoop.hive.metastore.TableType;
|
||||
import org.apache.hadoop.hive.metastore.api.Database;
|
||||
@@ -55,26 +53,35 @@ import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_CREATE_MANAGED_TABLE;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS;
|
||||
|
||||
/**
|
||||
* DDLExecutor impl based on HMS which use HMS apis directly for all DDL tasks.
|
||||
*/
|
||||
public class HMSDDLExecutor implements DDLExecutor {
|
||||
private static final Logger LOG = LogManager.getLogger(HMSDDLExecutor.class);
|
||||
private final HiveSyncConfig syncConfig;
|
||||
private final PartitionValueExtractor partitionValueExtractor;
|
||||
private final FileSystem fs;
|
||||
private final IMetaStoreClient client;
|
||||
|
||||
public HMSDDLExecutor(HiveConf conf, HiveSyncConfig syncConfig, FileSystem fs) throws HiveException, MetaException {
|
||||
this.client = Hive.get(conf).getMSC();
|
||||
private static final Logger LOG = LogManager.getLogger(HMSDDLExecutor.class);
|
||||
|
||||
private final HiveSyncConfig syncConfig;
|
||||
private final String databaseName;
|
||||
private final IMetaStoreClient client;
|
||||
private final PartitionValueExtractor partitionValueExtractor;
|
||||
|
||||
public HMSDDLExecutor(HiveSyncConfig syncConfig) throws HiveException, MetaException {
|
||||
this.syncConfig = syncConfig;
|
||||
this.fs = fs;
|
||||
this.databaseName = syncConfig.getStringOrDefault(META_SYNC_DATABASE_NAME);
|
||||
this.client = Hive.get(syncConfig.getHiveConf()).getMSC();
|
||||
try {
|
||||
this.partitionValueExtractor =
|
||||
(PartitionValueExtractor) Class.forName(syncConfig.partitionValueExtractorClass).newInstance();
|
||||
(PartitionValueExtractor) Class.forName(syncConfig.getStringOrDefault(META_SYNC_PARTITION_EXTRACTOR_CLASS)).newInstance();
|
||||
} catch (Exception e) {
|
||||
throw new HoodieHiveSyncException(
|
||||
"Failed to initialize PartitionValueExtractor class " + syncConfig.partitionValueExtractorClass, e);
|
||||
"Failed to initialize PartitionValueExtractor class " + syncConfig.getStringOrDefault(META_SYNC_PARTITION_EXTRACTOR_CLASS), e);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -93,16 +100,16 @@ public class HMSDDLExecutor implements DDLExecutor {
|
||||
public void createTable(String tableName, MessageType storageSchema, String inputFormatClass, String outputFormatClass, String serdeClass, Map<String, String> serdeProperties,
|
||||
Map<String, String> tableProperties) {
|
||||
try {
|
||||
LinkedHashMap<String, String> mapSchema = HiveSchemaUtil.parquetSchemaToMapSchema(storageSchema, syncConfig.supportTimestamp, false);
|
||||
LinkedHashMap<String, String> mapSchema = HiveSchemaUtil.parquetSchemaToMapSchema(storageSchema, syncConfig.getBoolean(HIVE_SUPPORT_TIMESTAMP_TYPE), false);
|
||||
|
||||
List<FieldSchema> fieldSchema = HiveSchemaUtil.convertMapSchemaToHiveFieldSchema(mapSchema, syncConfig);
|
||||
|
||||
List<FieldSchema> partitionSchema = syncConfig.partitionFields.stream().map(partitionKey -> {
|
||||
List<FieldSchema> partitionSchema = syncConfig.getSplitStrings(META_SYNC_PARTITION_FIELDS).stream().map(partitionKey -> {
|
||||
String partitionKeyType = HiveSchemaUtil.getPartitionKeyType(mapSchema, partitionKey);
|
||||
return new FieldSchema(partitionKey, partitionKeyType.toLowerCase(), "");
|
||||
}).collect(Collectors.toList());
|
||||
Table newTb = new Table();
|
||||
newTb.setDbName(syncConfig.databaseName);
|
||||
newTb.setDbName(databaseName);
|
||||
newTb.setTableName(tableName);
|
||||
newTb.setOwner(UserGroupInformation.getCurrentUser().getShortUserName());
|
||||
newTb.setCreateTime((int) System.currentTimeMillis());
|
||||
@@ -110,13 +117,13 @@ public class HMSDDLExecutor implements DDLExecutor {
|
||||
storageDescriptor.setCols(fieldSchema);
|
||||
storageDescriptor.setInputFormat(inputFormatClass);
|
||||
storageDescriptor.setOutputFormat(outputFormatClass);
|
||||
storageDescriptor.setLocation(syncConfig.basePath);
|
||||
storageDescriptor.setLocation(syncConfig.getString(META_SYNC_BASE_PATH));
|
||||
serdeProperties.put("serialization.format", "1");
|
||||
storageDescriptor.setSerdeInfo(new SerDeInfo(null, serdeClass, serdeProperties));
|
||||
newTb.setSd(storageDescriptor);
|
||||
newTb.setPartitionKeys(partitionSchema);
|
||||
|
||||
if (!syncConfig.createManagedTable) {
|
||||
if (!syncConfig.getBoolean(HIVE_CREATE_MANAGED_TABLE)) {
|
||||
newTb.putToParameters("EXTERNAL", "TRUE");
|
||||
}
|
||||
|
||||
@@ -134,9 +141,9 @@ public class HMSDDLExecutor implements DDLExecutor {
|
||||
@Override
|
||||
public void updateTableDefinition(String tableName, MessageType newSchema) {
|
||||
try {
|
||||
boolean cascade = syncConfig.partitionFields.size() > 0;
|
||||
boolean cascade = syncConfig.getSplitStrings(META_SYNC_PARTITION_FIELDS).size() > 0;
|
||||
List<FieldSchema> fieldSchema = HiveSchemaUtil.convertParquetSchemaToHiveFieldSchema(newSchema, syncConfig);
|
||||
Table table = client.getTable(syncConfig.databaseName, tableName);
|
||||
Table table = client.getTable(databaseName, tableName);
|
||||
StorageDescriptor sd = table.getSd();
|
||||
sd.setCols(fieldSchema);
|
||||
table.setSd(sd);
|
||||
@@ -145,7 +152,7 @@ public class HMSDDLExecutor implements DDLExecutor {
|
||||
LOG.info("partition table,need cascade");
|
||||
environmentContext.putToProperties(StatsSetupConst.CASCADE, StatsSetupConst.TRUE);
|
||||
}
|
||||
client.alter_table_with_environmentContext(syncConfig.databaseName, tableName, table, environmentContext);
|
||||
client.alter_table_with_environmentContext(databaseName, tableName, table, environmentContext);
|
||||
} catch (Exception e) {
|
||||
LOG.error("Failed to update table for " + tableName, e);
|
||||
throw new HoodieHiveSyncException("Failed to update table for " + tableName, e);
|
||||
@@ -158,7 +165,7 @@ public class HMSDDLExecutor implements DDLExecutor {
|
||||
// HiveMetastoreClient returns partition keys separate from Columns, hence get both and merge to
|
||||
// get the Schema of the table.
|
||||
final long start = System.currentTimeMillis();
|
||||
Table table = this.client.getTable(syncConfig.databaseName, tableName);
|
||||
Table table = this.client.getTable(databaseName, tableName);
|
||||
Map<String, String> partitionKeysMap =
|
||||
table.getPartitionKeys().stream().collect(Collectors.toMap(FieldSchema::getName, f -> f.getType().toUpperCase()));
|
||||
|
||||
@@ -184,22 +191,22 @@ public class HMSDDLExecutor implements DDLExecutor {
|
||||
}
|
||||
LOG.info("Adding partitions " + partitionsToAdd.size() + " to table " + tableName);
|
||||
try {
|
||||
StorageDescriptor sd = client.getTable(syncConfig.databaseName, tableName).getSd();
|
||||
StorageDescriptor sd = client.getTable(databaseName, tableName).getSd();
|
||||
List<Partition> partitionList = partitionsToAdd.stream().map(partition -> {
|
||||
StorageDescriptor partitionSd = new StorageDescriptor();
|
||||
partitionSd.setCols(sd.getCols());
|
||||
partitionSd.setInputFormat(sd.getInputFormat());
|
||||
partitionSd.setOutputFormat(sd.getOutputFormat());
|
||||
partitionSd.setSerdeInfo(sd.getSerdeInfo());
|
||||
String fullPartitionPath = FSUtils.getPartitionPath(syncConfig.basePath, partition).toString();
|
||||
String fullPartitionPath = FSUtils.getPartitionPath(syncConfig.getString(META_SYNC_BASE_PATH), partition).toString();
|
||||
List<String> partitionValues = partitionValueExtractor.extractPartitionValuesInPath(partition);
|
||||
partitionSd.setLocation(fullPartitionPath);
|
||||
return new Partition(partitionValues, syncConfig.databaseName, tableName, 0, 0, partitionSd, null);
|
||||
return new Partition(partitionValues, databaseName, tableName, 0, 0, partitionSd, null);
|
||||
}).collect(Collectors.toList());
|
||||
client.add_partitions(partitionList, true, false);
|
||||
} catch (TException e) {
|
||||
LOG.error(syncConfig.databaseName + "." + tableName + " add partition failed", e);
|
||||
throw new HoodieHiveSyncException(syncConfig.databaseName + "." + tableName + " add partition failed", e);
|
||||
LOG.error(databaseName + "." + tableName + " add partition failed", e);
|
||||
throw new HoodieHiveSyncException(databaseName + "." + tableName + " add partition failed", e);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -211,20 +218,20 @@ public class HMSDDLExecutor implements DDLExecutor {
|
||||
}
|
||||
LOG.info("Changing partitions " + changedPartitions.size() + " on " + tableName);
|
||||
try {
|
||||
StorageDescriptor sd = client.getTable(syncConfig.databaseName, tableName).getSd();
|
||||
StorageDescriptor sd = client.getTable(databaseName, tableName).getSd();
|
||||
List<Partition> partitionList = changedPartitions.stream().map(partition -> {
|
||||
Path partitionPath = FSUtils.getPartitionPath(syncConfig.basePath, partition);
|
||||
Path partitionPath = FSUtils.getPartitionPath(syncConfig.getString(META_SYNC_BASE_PATH), partition);
|
||||
String partitionScheme = partitionPath.toUri().getScheme();
|
||||
String fullPartitionPath = StorageSchemes.HDFS.getScheme().equals(partitionScheme)
|
||||
? FSUtils.getDFSFullPartitionPath(fs, partitionPath) : partitionPath.toString();
|
||||
? FSUtils.getDFSFullPartitionPath(syncConfig.getHadoopFileSystem(), partitionPath) : partitionPath.toString();
|
||||
List<String> partitionValues = partitionValueExtractor.extractPartitionValuesInPath(partition);
|
||||
sd.setLocation(fullPartitionPath);
|
||||
return new Partition(partitionValues, syncConfig.databaseName, tableName, 0, 0, sd, null);
|
||||
return new Partition(partitionValues, databaseName, tableName, 0, 0, sd, null);
|
||||
}).collect(Collectors.toList());
|
||||
client.alter_partitions(syncConfig.databaseName, tableName, partitionList, null);
|
||||
client.alter_partitions(databaseName, tableName, partitionList, null);
|
||||
} catch (TException e) {
|
||||
LOG.error(syncConfig.databaseName + "." + tableName + " update partition failed", e);
|
||||
throw new HoodieHiveSyncException(syncConfig.databaseName + "." + tableName + " update partition failed", e);
|
||||
LOG.error(databaseName + "." + tableName + " update partition failed", e);
|
||||
throw new HoodieHiveSyncException(databaseName + "." + tableName + " update partition failed", e);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -241,20 +248,20 @@ public class HMSDDLExecutor implements DDLExecutor {
|
||||
if (HivePartitionUtil.partitionExists(client, tableName, dropPartition, partitionValueExtractor, syncConfig)) {
|
||||
String partitionClause =
|
||||
HivePartitionUtil.getPartitionClauseForDrop(dropPartition, partitionValueExtractor, syncConfig);
|
||||
client.dropPartition(syncConfig.databaseName, tableName, partitionClause, false);
|
||||
client.dropPartition(databaseName, tableName, partitionClause, false);
|
||||
}
|
||||
LOG.info("Drop partition " + dropPartition + " on " + tableName);
|
||||
}
|
||||
} catch (TException e) {
|
||||
LOG.error(syncConfig.databaseName + "." + tableName + " drop partition failed", e);
|
||||
throw new HoodieHiveSyncException(syncConfig.databaseName + "." + tableName + " drop partition failed", e);
|
||||
LOG.error(databaseName + "." + tableName + " drop partition failed", e);
|
||||
throw new HoodieHiveSyncException(databaseName + "." + tableName + " drop partition failed", e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void updateTableComments(String tableName, Map<String, ImmutablePair<String,String>> alterSchema) {
|
||||
public void updateTableComments(String tableName, Map<String, Pair<String, String>> alterSchema) {
|
||||
try {
|
||||
Table table = client.getTable(syncConfig.databaseName, tableName);
|
||||
Table table = client.getTable(databaseName, tableName);
|
||||
StorageDescriptor sd = new StorageDescriptor(table.getSd());
|
||||
for (FieldSchema fieldSchema : sd.getCols()) {
|
||||
if (alterSchema.containsKey(fieldSchema.getName())) {
|
||||
@@ -264,7 +271,7 @@ public class HMSDDLExecutor implements DDLExecutor {
|
||||
}
|
||||
table.setSd(sd);
|
||||
EnvironmentContext environmentContext = new EnvironmentContext();
|
||||
client.alter_table_with_environmentContext(syncConfig.databaseName, tableName, table, environmentContext);
|
||||
client.alter_table_with_environmentContext(databaseName, tableName, table, environmentContext);
|
||||
sd.clear();
|
||||
} catch (Exception e) {
|
||||
LOG.error("Failed to update table comments for " + tableName, e);
|
||||
|
||||
@@ -21,9 +21,8 @@ package org.apache.hudi.hive.ddl;
|
||||
import org.apache.hudi.common.util.HoodieTimer;
|
||||
import org.apache.hudi.hive.HiveSyncConfig;
|
||||
import org.apache.hudi.hive.HoodieHiveSyncException;
|
||||
import org.apache.hudi.hive.util.HivePartitionUtil;
|
||||
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.hive.conf.HiveConf;
|
||||
import org.apache.hadoop.hive.metastore.IMetaStoreClient;
|
||||
import org.apache.hadoop.hive.metastore.api.FieldSchema;
|
||||
import org.apache.hadoop.hive.metastore.api.MetaException;
|
||||
@@ -34,7 +33,6 @@ import org.apache.hadoop.hive.ql.metadata.HiveException;
|
||||
import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse;
|
||||
import org.apache.hadoop.hive.ql.session.SessionState;
|
||||
import org.apache.hadoop.security.UserGroupInformation;
|
||||
import org.apache.hudi.hive.util.HivePartitionUtil;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
@@ -46,26 +44,28 @@ import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static org.apache.hudi.sync.common.util.TableUtils.tableId;
|
||||
|
||||
/**
|
||||
* This class offers DDL executor backed by the hive.ql Driver This class preserves the old useJDBC = false way of doing things.
|
||||
*/
|
||||
public class HiveQueryDDLExecutor extends QueryBasedDDLExecutor {
|
||||
private static final Logger LOG = LogManager.getLogger(HiveQueryDDLExecutor.class);
|
||||
private final HiveSyncConfig config;
|
||||
private final IMetaStoreClient metaStoreClient;
|
||||
private SessionState sessionState = null;
|
||||
private Driver hiveDriver = null;
|
||||
|
||||
public HiveQueryDDLExecutor(HiveSyncConfig config, FileSystem fs, HiveConf configuration) throws HiveException, MetaException {
|
||||
super(config, fs);
|
||||
this.config = config;
|
||||
this.metaStoreClient = Hive.get(configuration).getMSC();
|
||||
private static final Logger LOG = LogManager.getLogger(HiveQueryDDLExecutor.class);
|
||||
|
||||
private final IMetaStoreClient metaStoreClient;
|
||||
private SessionState sessionState;
|
||||
private Driver hiveDriver;
|
||||
|
||||
public HiveQueryDDLExecutor(HiveSyncConfig config) throws HiveException, MetaException {
|
||||
super(config);
|
||||
this.metaStoreClient = Hive.get(config.getHiveConf()).getMSC();
|
||||
try {
|
||||
this.sessionState = new SessionState(configuration,
|
||||
this.sessionState = new SessionState(config.getHiveConf(),
|
||||
UserGroupInformation.getCurrentUser().getShortUserName());
|
||||
SessionState.start(this.sessionState);
|
||||
this.sessionState.setCurrentDatabase(config.databaseName);
|
||||
hiveDriver = new org.apache.hadoop.hive.ql.Driver(configuration);
|
||||
this.sessionState.setCurrentDatabase(databaseName);
|
||||
this.hiveDriver = new org.apache.hadoop.hive.ql.Driver(config.getHiveConf());
|
||||
} catch (Exception e) {
|
||||
if (sessionState != null) {
|
||||
try {
|
||||
@@ -109,7 +109,7 @@ public class HiveQueryDDLExecutor extends QueryBasedDDLExecutor {
|
||||
// HiveMetastoreClient returns partition keys separate from Columns, hence get both and merge to
|
||||
// get the Schema of the table.
|
||||
final long start = System.currentTimeMillis();
|
||||
Table table = metaStoreClient.getTable(config.databaseName, tableName);
|
||||
Table table = metaStoreClient.getTable(databaseName, tableName);
|
||||
Map<String, String> partitionKeysMap =
|
||||
table.getPartitionKeys().stream().collect(Collectors.toMap(FieldSchema::getName, f -> f.getType().toUpperCase()));
|
||||
|
||||
@@ -141,13 +141,13 @@ public class HiveQueryDDLExecutor extends QueryBasedDDLExecutor {
|
||||
config)) {
|
||||
String partitionClause =
|
||||
HivePartitionUtil.getPartitionClauseForDrop(dropPartition, partitionValueExtractor, config);
|
||||
metaStoreClient.dropPartition(config.databaseName, tableName, partitionClause, false);
|
||||
metaStoreClient.dropPartition(databaseName, tableName, partitionClause, false);
|
||||
}
|
||||
LOG.info("Drop partition " + dropPartition + " on " + tableName);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
LOG.error(config.databaseName + "." + tableName + " drop partition failed", e);
|
||||
throw new HoodieHiveSyncException(config.databaseName + "." + tableName + " drop partition failed", e);
|
||||
LOG.error(tableId(databaseName, tableName) + " drop partition failed", e);
|
||||
throw new HoodieHiveSyncException(tableId(databaseName, tableName) + " drop partition failed", e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -18,12 +18,9 @@
|
||||
|
||||
package org.apache.hudi.hive.ddl;
|
||||
|
||||
import static org.apache.hudi.hive.util.HiveSchemaUtil.HIVE_ESCAPE_CHARACTER;
|
||||
|
||||
import org.apache.hudi.hive.HiveSyncConfig;
|
||||
import org.apache.hudi.hive.HoodieHiveSyncException;
|
||||
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
@@ -39,21 +36,27 @@ import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_BATCH_SYNC_PARTITION_NUM;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_PASS;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_URL;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USER;
|
||||
import static org.apache.hudi.hive.util.HiveSchemaUtil.HIVE_ESCAPE_CHARACTER;
|
||||
|
||||
/**
|
||||
* This class offers DDL executor backed by the jdbc This class preserves the old useJDBC = true way of doing things.
|
||||
*/
|
||||
public class JDBCExecutor extends QueryBasedDDLExecutor {
|
||||
|
||||
private static final Logger LOG = LogManager.getLogger(QueryBasedDDLExecutor.class);
|
||||
private final HiveSyncConfig config;
|
||||
|
||||
private Connection connection;
|
||||
|
||||
public JDBCExecutor(HiveSyncConfig config, FileSystem fs) {
|
||||
super(config, fs);
|
||||
Objects.requireNonNull(config.jdbcUrl, "--jdbc-url option is required for jdbc sync mode");
|
||||
Objects.requireNonNull(config.hiveUser, "--user option is required for jdbc sync mode");
|
||||
Objects.requireNonNull(config.hivePass, "--pass option is required for jdbc sync mode");
|
||||
this.config = config;
|
||||
createHiveConnection(config.jdbcUrl, config.hiveUser, config.hivePass);
|
||||
public JDBCExecutor(HiveSyncConfig config) {
|
||||
super(config);
|
||||
Objects.requireNonNull(config.getStringOrDefault(HIVE_URL), "--jdbc-url option is required for jdbc sync mode");
|
||||
Objects.requireNonNull(config.getStringOrDefault(HIVE_USER), "--user option is required for jdbc sync mode");
|
||||
Objects.requireNonNull(config.getStringOrDefault(HIVE_PASS), "--pass option is required for jdbc sync mode");
|
||||
createHiveConnection(config.getStringOrDefault(HIVE_URL), config.getStringOrDefault(HIVE_USER), config.getStringOrDefault(HIVE_PASS));
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -126,7 +129,7 @@ public class JDBCExecutor extends QueryBasedDDLExecutor {
|
||||
ResultSet result = null;
|
||||
try {
|
||||
DatabaseMetaData databaseMetaData = connection.getMetaData();
|
||||
result = databaseMetaData.getColumns(null, config.databaseName, tableName, null);
|
||||
result = databaseMetaData.getColumns(null, databaseName, tableName, null);
|
||||
while (result.next()) {
|
||||
String columnName = result.getString(4);
|
||||
String columnType = result.getString(6);
|
||||
@@ -157,11 +160,11 @@ public class JDBCExecutor extends QueryBasedDDLExecutor {
|
||||
}
|
||||
|
||||
private List<String> constructDropPartitions(String tableName, List<String> partitions) {
|
||||
if (config.batchSyncNum <= 0) {
|
||||
if (config.getIntOrDefault(HIVE_BATCH_SYNC_PARTITION_NUM) <= 0) {
|
||||
throw new HoodieHiveSyncException("batch-sync-num for sync hive table must be greater than 0, pls check your parameter");
|
||||
}
|
||||
List<String> result = new ArrayList<>();
|
||||
int batchSyncPartitionNum = config.batchSyncNum;
|
||||
int batchSyncPartitionNum = config.getIntOrDefault(HIVE_BATCH_SYNC_PARTITION_NUM);
|
||||
StringBuilder alterSQL = getAlterTableDropPrefix(tableName);
|
||||
|
||||
for (int i = 0; i < partitions.size(); i++) {
|
||||
@@ -186,7 +189,7 @@ public class JDBCExecutor extends QueryBasedDDLExecutor {
|
||||
|
||||
public StringBuilder getAlterTableDropPrefix(String tableName) {
|
||||
StringBuilder alterSQL = new StringBuilder("ALTER TABLE ");
|
||||
alterSQL.append(HIVE_ESCAPE_CHARACTER).append(config.databaseName)
|
||||
alterSQL.append(HIVE_ESCAPE_CHARACTER).append(databaseName)
|
||||
.append(HIVE_ESCAPE_CHARACTER).append(".").append(HIVE_ESCAPE_CHARACTER)
|
||||
.append(tableName).append(HIVE_ESCAPE_CHARACTER).append(" DROP IF EXISTS ");
|
||||
return alterSQL;
|
||||
@@ -202,4 +205,4 @@ public class JDBCExecutor extends QueryBasedDDLExecutor {
|
||||
LOG.error("Could not close connection ", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -22,13 +22,12 @@ import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.fs.StorageSchemes;
|
||||
import org.apache.hudi.common.util.PartitionPathEncodeUtils;
|
||||
import org.apache.hudi.common.util.ValidationUtils;
|
||||
import org.apache.hudi.common.util.collection.ImmutablePair;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.hive.HiveSyncConfig;
|
||||
import org.apache.hudi.hive.HoodieHiveSyncException;
|
||||
import org.apache.hudi.hive.PartitionValueExtractor;
|
||||
import org.apache.hudi.hive.util.HiveSchemaUtil;
|
||||
import org.apache.hudi.sync.common.model.PartitionValueExtractor;
|
||||
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
@@ -39,26 +38,35 @@ import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_BATCH_SYNC_PARTITION_NUM;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE;
|
||||
import static org.apache.hudi.hive.util.HiveSchemaUtil.HIVE_ESCAPE_CHARACTER;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DECODE_PARTITION;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS;
|
||||
|
||||
/**
|
||||
* This class adds functionality for all query based DDLExecutors. The classes extending it only have to provide runSQL(sql) functions.
|
||||
*/
|
||||
public abstract class QueryBasedDDLExecutor implements DDLExecutor {
|
||||
private static final Logger LOG = LogManager.getLogger(QueryBasedDDLExecutor.class);
|
||||
private final HiveSyncConfig config;
|
||||
public final PartitionValueExtractor partitionValueExtractor;
|
||||
private final FileSystem fs;
|
||||
|
||||
public QueryBasedDDLExecutor(HiveSyncConfig config, FileSystem fs) {
|
||||
this.fs = fs;
|
||||
private static final Logger LOG = LogManager.getLogger(QueryBasedDDLExecutor.class);
|
||||
|
||||
protected final HiveSyncConfig config;
|
||||
protected final String databaseName;
|
||||
protected final PartitionValueExtractor partitionValueExtractor;
|
||||
|
||||
public QueryBasedDDLExecutor(HiveSyncConfig config) {
|
||||
this.config = config;
|
||||
this.databaseName = config.getStringOrDefault(META_SYNC_DATABASE_NAME);
|
||||
try {
|
||||
this.partitionValueExtractor =
|
||||
(PartitionValueExtractor) Class.forName(config.partitionValueExtractorClass).newInstance();
|
||||
(PartitionValueExtractor) Class.forName(config.getStringOrDefault(META_SYNC_PARTITION_EXTRACTOR_CLASS)).newInstance();
|
||||
} catch (Exception e) {
|
||||
throw new HoodieHiveSyncException(
|
||||
"Failed to initialize PartitionValueExtractor class " + config.partitionValueExtractorClass, e);
|
||||
"Failed to initialize PartitionValueExtractor class " + config.getStringOrDefault(META_SYNC_PARTITION_EXTRACTOR_CLASS), e);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -90,11 +98,11 @@ public abstract class QueryBasedDDLExecutor implements DDLExecutor {
|
||||
@Override
|
||||
public void updateTableDefinition(String tableName, MessageType newSchema) {
|
||||
try {
|
||||
String newSchemaStr = HiveSchemaUtil.generateSchemaString(newSchema, config.partitionFields, config.supportTimestamp);
|
||||
String newSchemaStr = HiveSchemaUtil.generateSchemaString(newSchema, config.getSplitStrings(META_SYNC_PARTITION_FIELDS), config.getBoolean(HIVE_SUPPORT_TIMESTAMP_TYPE));
|
||||
// Cascade clause should not be present for non-partitioned tables
|
||||
String cascadeClause = config.partitionFields.size() > 0 ? " cascade" : "";
|
||||
String cascadeClause = config.getSplitStrings(HIVE_SUPPORT_TIMESTAMP_TYPE).size() > 0 ? " cascade" : "";
|
||||
StringBuilder sqlBuilder = new StringBuilder("ALTER TABLE ").append(HIVE_ESCAPE_CHARACTER)
|
||||
.append(config.databaseName).append(HIVE_ESCAPE_CHARACTER).append(".")
|
||||
.append(databaseName).append(HIVE_ESCAPE_CHARACTER).append(".")
|
||||
.append(HIVE_ESCAPE_CHARACTER).append(tableName)
|
||||
.append(HIVE_ESCAPE_CHARACTER).append(" REPLACE COLUMNS(")
|
||||
.append(newSchemaStr).append(" )").append(cascadeClause);
|
||||
@@ -130,15 +138,15 @@ public abstract class QueryBasedDDLExecutor implements DDLExecutor {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void updateTableComments(String tableName, Map<String, ImmutablePair<String,String>> newSchema) {
|
||||
for (Map.Entry<String, ImmutablePair<String,String>> field : newSchema.entrySet()) {
|
||||
public void updateTableComments(String tableName, Map<String, Pair<String, String>> newSchema) {
|
||||
for (Map.Entry<String, Pair<String,String>> field : newSchema.entrySet()) {
|
||||
String name = field.getKey();
|
||||
StringBuilder sql = new StringBuilder();
|
||||
String type = field.getValue().getLeft();
|
||||
String comment = field.getValue().getRight();
|
||||
comment = comment.replace("'","");
|
||||
sql.append("ALTER TABLE ").append(HIVE_ESCAPE_CHARACTER)
|
||||
.append(config.databaseName).append(HIVE_ESCAPE_CHARACTER).append(".")
|
||||
.append(databaseName).append(HIVE_ESCAPE_CHARACTER).append(".")
|
||||
.append(HIVE_ESCAPE_CHARACTER).append(tableName)
|
||||
.append(HIVE_ESCAPE_CHARACTER)
|
||||
.append(" CHANGE COLUMN `").append(name).append("` `").append(name)
|
||||
@@ -148,15 +156,15 @@ public abstract class QueryBasedDDLExecutor implements DDLExecutor {
|
||||
}
|
||||
|
||||
private List<String> constructAddPartitions(String tableName, List<String> partitions) {
|
||||
if (config.batchSyncNum <= 0) {
|
||||
if (config.getIntOrDefault(HIVE_BATCH_SYNC_PARTITION_NUM) <= 0) {
|
||||
throw new HoodieHiveSyncException("batch-sync-num for sync hive table must be greater than 0, pls check your parameter");
|
||||
}
|
||||
List<String> result = new ArrayList<>();
|
||||
int batchSyncPartitionNum = config.batchSyncNum;
|
||||
int batchSyncPartitionNum = config.getIntOrDefault(HIVE_BATCH_SYNC_PARTITION_NUM);
|
||||
StringBuilder alterSQL = getAlterTablePrefix(tableName);
|
||||
for (int i = 0; i < partitions.size(); i++) {
|
||||
String partitionClause = getPartitionClause(partitions.get(i));
|
||||
String fullPartitionPath = FSUtils.getPartitionPath(config.basePath, partitions.get(i)).toString();
|
||||
String fullPartitionPath = FSUtils.getPartitionPath(config.getString(META_SYNC_BASE_PATH), partitions.get(i)).toString();
|
||||
alterSQL.append(" PARTITION (").append(partitionClause).append(") LOCATION '").append(fullPartitionPath)
|
||||
.append("' ");
|
||||
if ((i + 1) % batchSyncPartitionNum == 0) {
|
||||
@@ -173,7 +181,7 @@ public abstract class QueryBasedDDLExecutor implements DDLExecutor {
|
||||
|
||||
private StringBuilder getAlterTablePrefix(String tableName) {
|
||||
StringBuilder alterSQL = new StringBuilder("ALTER TABLE ");
|
||||
alterSQL.append(HIVE_ESCAPE_CHARACTER).append(config.databaseName)
|
||||
alterSQL.append(HIVE_ESCAPE_CHARACTER).append(databaseName)
|
||||
.append(HIVE_ESCAPE_CHARACTER).append(".").append(HIVE_ESCAPE_CHARACTER)
|
||||
.append(tableName).append(HIVE_ESCAPE_CHARACTER).append(" ADD IF NOT EXISTS ");
|
||||
return alterSQL;
|
||||
@@ -181,18 +189,18 @@ public abstract class QueryBasedDDLExecutor implements DDLExecutor {
|
||||
|
||||
public String getPartitionClause(String partition) {
|
||||
List<String> partitionValues = partitionValueExtractor.extractPartitionValuesInPath(partition);
|
||||
ValidationUtils.checkArgument(config.partitionFields.size() == partitionValues.size(),
|
||||
"Partition key parts " + config.partitionFields + " does not match with partition values " + partitionValues
|
||||
ValidationUtils.checkArgument(config.getSplitStrings(META_SYNC_PARTITION_FIELDS).size() == partitionValues.size(),
|
||||
"Partition key parts " + config.getSplitStrings(META_SYNC_PARTITION_FIELDS) + " does not match with partition values " + partitionValues
|
||||
+ ". Check partition strategy. ");
|
||||
List<String> partBuilder = new ArrayList<>();
|
||||
for (int i = 0; i < config.partitionFields.size(); i++) {
|
||||
for (int i = 0; i < config.getSplitStrings(META_SYNC_PARTITION_FIELDS).size(); i++) {
|
||||
String partitionValue = partitionValues.get(i);
|
||||
// decode the partition before sync to hive to prevent multiple escapes of HIVE
|
||||
if (config.decodePartition) {
|
||||
if (config.getBoolean(META_SYNC_DECODE_PARTITION)) {
|
||||
// This is a decode operator for encode in KeyGenUtils#getRecordPartitionPath
|
||||
partitionValue = PartitionPathEncodeUtils.unescapePathName(partitionValue);
|
||||
}
|
||||
partBuilder.add("`" + config.partitionFields.get(i) + "`='" + partitionValue + "'");
|
||||
partBuilder.add("`" + config.getSplitStrings(META_SYNC_PARTITION_FIELDS).get(i) + "`='" + partitionValue + "'");
|
||||
}
|
||||
return String.join(",", partBuilder);
|
||||
}
|
||||
@@ -200,15 +208,15 @@ public abstract class QueryBasedDDLExecutor implements DDLExecutor {
|
||||
private List<String> constructChangePartitions(String tableName, List<String> partitions) {
|
||||
List<String> changePartitions = new ArrayList<>();
|
||||
// Hive 2.x doesn't like db.table name for operations, hence we need to change to using the database first
|
||||
String useDatabase = "USE " + HIVE_ESCAPE_CHARACTER + config.databaseName + HIVE_ESCAPE_CHARACTER;
|
||||
String useDatabase = "USE " + HIVE_ESCAPE_CHARACTER + databaseName + HIVE_ESCAPE_CHARACTER;
|
||||
changePartitions.add(useDatabase);
|
||||
String alterTable = "ALTER TABLE " + HIVE_ESCAPE_CHARACTER + tableName + HIVE_ESCAPE_CHARACTER;
|
||||
for (String partition : partitions) {
|
||||
String partitionClause = getPartitionClause(partition);
|
||||
Path partitionPath = FSUtils.getPartitionPath(config.basePath, partition);
|
||||
Path partitionPath = FSUtils.getPartitionPath(config.getString(META_SYNC_BASE_PATH), partition);
|
||||
String partitionScheme = partitionPath.toUri().getScheme();
|
||||
String fullPartitionPath = StorageSchemes.HDFS.getScheme().equals(partitionScheme)
|
||||
? FSUtils.getDFSFullPartitionPath(fs, partitionPath) : partitionPath.toString();
|
||||
? FSUtils.getDFSFullPartitionPath(config.getHadoopFileSystem(), partitionPath) : partitionPath.toString();
|
||||
String changePartition =
|
||||
alterTable + " PARTITION (" + partitionClause + ") SET LOCATION '" + fullPartitionPath + "'";
|
||||
changePartitions.add(changePartition);
|
||||
|
||||
@@ -18,46 +18,44 @@
|
||||
|
||||
package org.apache.hudi.hive.replication;
|
||||
|
||||
import org.apache.hudi.common.config.ConfigProperty;
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.hive.HiveSyncConfig;
|
||||
|
||||
import com.beust.jcommander.Parameter;
|
||||
import com.beust.jcommander.ParametersDelegate;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
|
||||
import java.util.Properties;
|
||||
|
||||
public class GlobalHiveSyncConfig extends HiveSyncConfig {
|
||||
@Parameter(names = {"--replicated-timestamp"}, description = "Add globally replicated timestamp to enable consistent reads across clusters")
|
||||
public String globallyReplicatedTimeStamp;
|
||||
|
||||
public GlobalHiveSyncConfig() {
|
||||
public static final ConfigProperty<String> META_SYNC_GLOBAL_REPLICATE_TIMESTAMP = ConfigProperty
|
||||
.key("hoodie.meta_sync.global.replicate.timestamp")
|
||||
.noDefaultValue()
|
||||
.withDocumentation("");
|
||||
|
||||
public GlobalHiveSyncConfig(Properties props, Configuration hadoopConf) {
|
||||
super(props, hadoopConf);
|
||||
}
|
||||
|
||||
public GlobalHiveSyncConfig(TypedProperties props) {
|
||||
super(props);
|
||||
}
|
||||
public static class GlobalHiveSyncConfigParams {
|
||||
|
||||
public static GlobalHiveSyncConfig copy(GlobalHiveSyncConfig cfg) {
|
||||
GlobalHiveSyncConfig newConfig = new GlobalHiveSyncConfig(cfg.getProps());
|
||||
newConfig.basePath = cfg.basePath;
|
||||
newConfig.assumeDatePartitioning = cfg.assumeDatePartitioning;
|
||||
newConfig.databaseName = cfg.databaseName;
|
||||
newConfig.hivePass = cfg.hivePass;
|
||||
newConfig.hiveUser = cfg.hiveUser;
|
||||
newConfig.partitionFields = cfg.partitionFields;
|
||||
newConfig.partitionValueExtractorClass = cfg.partitionValueExtractorClass;
|
||||
newConfig.jdbcUrl = cfg.jdbcUrl;
|
||||
newConfig.tableName = cfg.tableName;
|
||||
newConfig.usePreApacheInputFormat = cfg.usePreApacheInputFormat;
|
||||
newConfig.useFileListingFromMetadata = cfg.useFileListingFromMetadata;
|
||||
newConfig.supportTimestamp = cfg.supportTimestamp;
|
||||
newConfig.decodePartition = cfg.decodePartition;
|
||||
newConfig.batchSyncNum = cfg.batchSyncNum;
|
||||
newConfig.globallyReplicatedTimeStamp = cfg.globallyReplicatedTimeStamp;
|
||||
return newConfig;
|
||||
}
|
||||
@ParametersDelegate()
|
||||
public final HiveSyncConfigParams hiveSyncConfigParams = new HiveSyncConfigParams();
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "GlobalHiveSyncConfig{" + super.toString()
|
||||
+ " globallyReplicatedTimeStamp=" + globallyReplicatedTimeStamp + "}";
|
||||
@Parameter(names = {"--replicated-timestamp"}, description = "Add globally replicated timestamp to enable consistent reads across clusters")
|
||||
public String globallyReplicatedTimeStamp;
|
||||
|
||||
public boolean isHelp() {
|
||||
return hiveSyncConfigParams.isHelp();
|
||||
}
|
||||
|
||||
public TypedProperties toProps() {
|
||||
final TypedProperties props = hiveSyncConfigParams.toProps();
|
||||
props.setPropertyIfNonNull(META_SYNC_GLOBAL_REPLICATE_TIMESTAMP.key(), globallyReplicatedTimeStamp);
|
||||
return props;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -18,26 +18,28 @@
|
||||
|
||||
package org.apache.hudi.hive.replication;
|
||||
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.HoodieTableType;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.hive.HiveSyncTool;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.hive.conf.HiveConf;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Properties;
|
||||
|
||||
import static org.apache.hudi.hive.replication.GlobalHiveSyncConfig.META_SYNC_GLOBAL_REPLICATE_TIMESTAMP;
|
||||
|
||||
public class GlobalHiveSyncTool extends HiveSyncTool {
|
||||
|
||||
private static final Logger LOG = LogManager.getLogger(HiveSyncTool.class);
|
||||
private static final Logger LOG = LogManager.getLogger(GlobalHiveSyncTool.class);
|
||||
protected final GlobalHiveSyncConfig config;
|
||||
|
||||
public GlobalHiveSyncTool(GlobalHiveSyncConfig cfg, HiveConf configuration, FileSystem fs) {
|
||||
super(cfg, configuration, fs);
|
||||
public GlobalHiveSyncTool(Properties props, Configuration hadoopConf) {
|
||||
super(props, hadoopConf);
|
||||
this.config = new GlobalHiveSyncConfig(props, hadoopConf);
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -48,19 +50,21 @@ public class GlobalHiveSyncTool extends HiveSyncTool {
|
||||
@Override
|
||||
protected void syncHoodieTable(String tableName, boolean useRealtimeInputFormat, boolean readAsOptimized) {
|
||||
super.syncHoodieTable(tableName, useRealtimeInputFormat, readAsOptimized);
|
||||
if (((GlobalHiveSyncConfig) hiveSyncConfig).globallyReplicatedTimeStamp != null) {
|
||||
hoodieHiveClient.updateLastReplicatedTimeStamp(tableName,
|
||||
((GlobalHiveSyncConfig) hiveSyncConfig).globallyReplicatedTimeStamp);
|
||||
Option<String> timestamp = Option.ofNullable(config.getString(META_SYNC_GLOBAL_REPLICATE_TIMESTAMP));
|
||||
if (timestamp.isPresent()) {
|
||||
syncClient.updateLastReplicatedTimeStamp(tableName, timestamp.get());
|
||||
LOG.info("Sync complete for " + tableName);
|
||||
} else {
|
||||
LOG.warn("Sync skipped: " + META_SYNC_GLOBAL_REPLICATE_TIMESTAMP.key() + " is not set.");
|
||||
}
|
||||
LOG.info("Sync complete for " + tableName);
|
||||
}
|
||||
|
||||
public Map<String, Option<String>> getLastReplicatedTimeStampMap() {
|
||||
Map<String, Option<String>> timeStampMap = new HashMap<>();
|
||||
Option<String> timeStamp = hoodieHiveClient.getLastReplicatedTime(snapshotTableName);
|
||||
Option<String> timeStamp = syncClient.getLastReplicatedTime(snapshotTableName);
|
||||
timeStampMap.put(snapshotTableName, timeStamp);
|
||||
if (HoodieTableType.MERGE_ON_READ.equals(hoodieHiveClient.getTableType())) {
|
||||
Option<String> roTimeStamp = hoodieHiveClient.getLastReplicatedTime(roTableName.get());
|
||||
if (HoodieTableType.MERGE_ON_READ.equals(syncClient.getTableType())) {
|
||||
Option<String> roTimeStamp = syncClient.getLastReplicatedTime(roTableName.get());
|
||||
timeStampMap.put(roTableName.get(), roTimeStamp);
|
||||
}
|
||||
return timeStampMap;
|
||||
@@ -70,18 +74,12 @@ public class GlobalHiveSyncTool extends HiveSyncTool {
|
||||
for (String tableName : timeStampMap.keySet()) {
|
||||
Option<String> timestamp = timeStampMap.get(tableName);
|
||||
if (timestamp.isPresent()) {
|
||||
hoodieHiveClient.updateLastReplicatedTimeStamp(tableName, timestamp.get());
|
||||
syncClient.updateLastReplicatedTimeStamp(tableName, timestamp.get());
|
||||
LOG.info("updated timestamp for " + tableName + " to: " + timestamp.get());
|
||||
} else {
|
||||
hoodieHiveClient.deleteLastReplicatedTimeStamp(tableName);
|
||||
syncClient.deleteLastReplicatedTimeStamp(tableName);
|
||||
LOG.info("deleted timestamp for " + tableName);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static GlobalHiveSyncTool buildGlobalHiveSyncTool(GlobalHiveSyncConfig cfg, HiveConf hiveConf) {
|
||||
FileSystem fs = FSUtils.getFs(cfg.basePath, new Configuration());
|
||||
hiveConf.addResource(fs.getConf());
|
||||
return new GlobalHiveSyncTool(cfg, hiveConf, fs);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,18 +18,22 @@
|
||||
|
||||
package org.apache.hudi.hive.replication;
|
||||
|
||||
import com.beust.jcommander.Parameter;
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.common.util.StringUtils;
|
||||
|
||||
import com.beust.jcommander.Parameter;
|
||||
import com.beust.jcommander.Parameters;
|
||||
import java.io.File;
|
||||
import com.beust.jcommander.ParametersDelegate;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.Properties;
|
||||
import org.apache.hudi.common.util.StringUtils;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_URL;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH;
|
||||
|
||||
// TODO: stop extending HiveSyncConfig and take all the variables needed from config file
|
||||
@Parameters(commandDescription = "A tool to sync the hudi table to hive from different clusters. Similar to HiveSyncTool but syncs it to more"
|
||||
@@ -40,9 +44,9 @@ import org.apache.log4j.Logger;
|
||||
+ " The tool tries to be transactional but does not guarantee it. If the sync fails midway in one cluster it will try to roll back the committed "
|
||||
+ " timestamp from already successful sync on other clusters but that can also fail."
|
||||
+ " The tool does not roll back any synced partitions but only the timestamp.")
|
||||
public class HiveSyncGlobalCommitConfig extends GlobalHiveSyncConfig {
|
||||
public class HiveSyncGlobalCommitParams {
|
||||
|
||||
private static final Logger LOG = LogManager.getLogger(HiveSyncGlobalCommitConfig.class);
|
||||
private static final Logger LOG = LogManager.getLogger(HiveSyncGlobalCommitParams.class);
|
||||
|
||||
public static String LOCAL_HIVE_SITE_URI = "hivesyncglobal.local_hive_site_uri";
|
||||
public static String REMOTE_HIVE_SITE_URI = "hivesyncglobal.remote_hive_site_uri";
|
||||
@@ -55,7 +59,14 @@ public class HiveSyncGlobalCommitConfig extends GlobalHiveSyncConfig {
|
||||
"--config-xml-file"}, description = "path to the config file in Hive", required = true)
|
||||
public String configFile;
|
||||
|
||||
public Properties properties = new Properties();
|
||||
@ParametersDelegate()
|
||||
public final GlobalHiveSyncConfig.GlobalHiveSyncConfigParams globalHiveSyncConfigParams = new GlobalHiveSyncConfig.GlobalHiveSyncConfigParams();
|
||||
|
||||
public boolean isHelp() {
|
||||
return globalHiveSyncConfigParams.isHelp();
|
||||
}
|
||||
|
||||
public Properties loadedProps = new Properties();
|
||||
|
||||
private boolean finalize = false;
|
||||
|
||||
@@ -64,33 +75,33 @@ public class HiveSyncGlobalCommitConfig extends GlobalHiveSyncConfig {
|
||||
throw new RuntimeException("trying to modify finalized config");
|
||||
}
|
||||
finalize = true;
|
||||
try (InputStream configStream = new FileInputStream(new File(configFile))) {
|
||||
properties.loadFromXML(configStream);
|
||||
try (InputStream configStream = new FileInputStream(configFile)) {
|
||||
loadedProps.loadFromXML(configStream);
|
||||
}
|
||||
if (StringUtils.isNullOrEmpty(globallyReplicatedTimeStamp)) {
|
||||
if (StringUtils.isNullOrEmpty(globalHiveSyncConfigParams.globallyReplicatedTimeStamp)) {
|
||||
throw new RuntimeException("globally replicated timestamp not set");
|
||||
}
|
||||
}
|
||||
|
||||
GlobalHiveSyncConfig mkGlobalHiveSyncConfig(boolean forRemote) {
|
||||
GlobalHiveSyncConfig cfg = GlobalHiveSyncConfig.copy(this);
|
||||
cfg.basePath = forRemote ? properties.getProperty(REMOTE_BASE_PATH)
|
||||
: properties.getProperty(LOCAL_BASE_PATH, cfg.basePath);
|
||||
cfg.jdbcUrl = forRemote ? properties.getProperty(REMOTE_HIVE_SERVER_JDBC_URLS)
|
||||
: properties.getProperty(LOCAL_HIVE_SERVER_JDBC_URLS, cfg.jdbcUrl);
|
||||
LOG.info("building hivesync config forRemote: " + forRemote + " " + cfg.jdbcUrl + " "
|
||||
+ cfg.basePath);
|
||||
return cfg;
|
||||
Properties mkGlobalHiveSyncProps(boolean forRemote) {
|
||||
TypedProperties props = new TypedProperties(loadedProps);
|
||||
props.putAll(globalHiveSyncConfigParams.toProps());
|
||||
String basePath = forRemote ? loadedProps.getProperty(REMOTE_BASE_PATH)
|
||||
: loadedProps.getProperty(LOCAL_BASE_PATH, loadedProps.getProperty(META_SYNC_BASE_PATH.key()));
|
||||
props.setPropertyIfNonNull(META_SYNC_BASE_PATH.key(), basePath);
|
||||
String jdbcUrl = forRemote ? loadedProps.getProperty(REMOTE_HIVE_SERVER_JDBC_URLS)
|
||||
: loadedProps.getProperty(LOCAL_HIVE_SERVER_JDBC_URLS, loadedProps.getProperty(HIVE_URL.key()));
|
||||
props.setPropertyIfNonNull(HIVE_URL.key(), jdbcUrl);
|
||||
LOG.info("building hivesync config forRemote: " + forRemote + " " + jdbcUrl + " "
|
||||
+ basePath);
|
||||
return props;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "HiveSyncGlobalCommitConfig{ " + "configFile=" + configFile + ", properties="
|
||||
+ properties + ", " + super.toString()
|
||||
return "HiveSyncGlobalCommitParams{ " + "configFile=" + configFile + ", properties="
|
||||
+ loadedProps + ", " + super.toString()
|
||||
+ " }";
|
||||
}
|
||||
|
||||
public void storeToXML(OutputStream configStream) throws IOException {
|
||||
this.properties.storeToXML(configStream, "hivesync global config");
|
||||
}
|
||||
}
|
||||
@@ -18,36 +18,37 @@
|
||||
|
||||
package org.apache.hudi.hive.replication;
|
||||
|
||||
import static org.apache.hudi.hive.replication.HiveSyncGlobalCommitConfig.LOCAL_HIVE_SITE_URI;
|
||||
import static org.apache.hudi.hive.replication.HiveSyncGlobalCommitConfig.REMOTE_HIVE_SITE_URI;
|
||||
import org.apache.hudi.hive.HoodieHiveSyncException;
|
||||
|
||||
import com.beust.jcommander.JCommander;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.hive.conf.HiveConf;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.hive.conf.HiveConf;
|
||||
|
||||
import org.apache.hudi.hive.HoodieHiveSyncException;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import static org.apache.hudi.hive.replication.HiveSyncGlobalCommitParams.LOCAL_HIVE_SITE_URI;
|
||||
import static org.apache.hudi.hive.replication.HiveSyncGlobalCommitParams.REMOTE_HIVE_SITE_URI;
|
||||
|
||||
public class HiveSyncGlobalCommitTool implements HiveSyncGlobalCommit, AutoCloseable {
|
||||
|
||||
private static final Logger LOG = LogManager.getLogger(HiveSyncGlobalCommitTool.class);
|
||||
private final HiveSyncGlobalCommitConfig config;
|
||||
private List<ReplicationStateSync> replicationStateSyncList;
|
||||
private final HiveSyncGlobalCommitParams params;
|
||||
private final List<ReplicationStateSync> replicationStateSyncList;
|
||||
|
||||
private ReplicationStateSync getReplicatedState(boolean forRemote) {
|
||||
ReplicationStateSync getReplicatedState(boolean forRemote) {
|
||||
HiveConf hiveConf = new HiveConf();
|
||||
// we probably just need to set the metastore URIs
|
||||
// TODO: figure out how to integrate this in production
|
||||
// how to load balance between piper HMS,HS2
|
||||
// if we have list of uris, we can do something similar to createHiveConf in reairsync
|
||||
hiveConf.addResource(new Path(config.properties.getProperty(
|
||||
hiveConf.addResource(new Path(params.loadedProps.getProperty(
|
||||
forRemote ? REMOTE_HIVE_SITE_URI : LOCAL_HIVE_SITE_URI)));
|
||||
// TODO: get clusterId as input parameters
|
||||
ReplicationStateSync state = new ReplicationStateSync(config.mkGlobalHiveSyncConfig(forRemote),
|
||||
ReplicationStateSync state = new ReplicationStateSync(params.mkGlobalHiveSyncProps(forRemote),
|
||||
hiveConf, forRemote ? "REMOTESYNC" : "LOCALSYNC");
|
||||
return state;
|
||||
}
|
||||
@@ -93,23 +94,24 @@ public class HiveSyncGlobalCommitTool implements HiveSyncGlobalCommit, AutoClose
|
||||
return true;
|
||||
}
|
||||
|
||||
public HiveSyncGlobalCommitTool(HiveSyncGlobalCommitConfig config) {
|
||||
this.config = config;
|
||||
public HiveSyncGlobalCommitTool(HiveSyncGlobalCommitParams params) {
|
||||
this.params = params;
|
||||
this.replicationStateSyncList = new ArrayList<>(2);
|
||||
this.replicationStateSyncList.add(getReplicatedState(false));
|
||||
this.replicationStateSyncList.add(getReplicatedState(true));
|
||||
}
|
||||
|
||||
private static HiveSyncGlobalCommitConfig getHiveSyncGlobalCommitConfig(String[] args)
|
||||
private static HiveSyncGlobalCommitParams loadParams(String[] args)
|
||||
throws IOException {
|
||||
HiveSyncGlobalCommitConfig cfg = new HiveSyncGlobalCommitConfig();
|
||||
JCommander cmd = new JCommander(cfg, null, args);
|
||||
if (cfg.help || args.length == 0) {
|
||||
final HiveSyncGlobalCommitParams params = new HiveSyncGlobalCommitParams();
|
||||
JCommander cmd = JCommander.newBuilder().addObject(params).build();
|
||||
cmd.parse(args);
|
||||
if (params.isHelp()) {
|
||||
cmd.usage();
|
||||
System.exit(1);
|
||||
System.exit(0);
|
||||
}
|
||||
cfg.load();
|
||||
return cfg;
|
||||
params.load();
|
||||
return params;
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -120,8 +122,8 @@ public class HiveSyncGlobalCommitTool implements HiveSyncGlobalCommit, AutoClose
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws IOException, HoodieHiveSyncException {
|
||||
final HiveSyncGlobalCommitConfig cfg = getHiveSyncGlobalCommitConfig(args);
|
||||
try (final HiveSyncGlobalCommitTool globalCommitTool = new HiveSyncGlobalCommitTool(cfg)) {
|
||||
final HiveSyncGlobalCommitParams params = loadParams(args);
|
||||
try (final HiveSyncGlobalCommitTool globalCommitTool = new HiveSyncGlobalCommitTool(params)) {
|
||||
boolean success = globalCommitTool.commit();
|
||||
if (!success) {
|
||||
if (!globalCommitTool.rollback()) {
|
||||
|
||||
@@ -18,31 +18,26 @@
|
||||
|
||||
package org.apache.hudi.hive.replication;
|
||||
|
||||
import java.util.Map;
|
||||
import org.apache.hadoop.hive.conf.HiveConf;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
|
||||
public class ReplicationStateSync {
|
||||
import org.apache.hadoop.hive.conf.HiveConf;
|
||||
|
||||
private GlobalHiveSyncTool globalHiveSyncTool;
|
||||
private final GlobalHiveSyncConfig globalHiveSyncConfig;
|
||||
private final HiveConf hiveConf;
|
||||
import java.util.Map;
|
||||
import java.util.Properties;
|
||||
|
||||
public class ReplicationStateSync implements AutoCloseable {
|
||||
|
||||
protected GlobalHiveSyncTool globalHiveSyncTool;
|
||||
private Map<String, Option<String>> replicatedTimeStampMap;
|
||||
private Map<String, Option<String>> oldReplicatedTimeStampMap;
|
||||
private final String clusterId;
|
||||
|
||||
ReplicationStateSync(GlobalHiveSyncConfig conf, HiveConf hiveConf, String uid) {
|
||||
this.globalHiveSyncConfig = conf;
|
||||
this.hiveConf = hiveConf;
|
||||
initGlobalHiveSyncTool();
|
||||
ReplicationStateSync(Properties props, HiveConf hiveConf, String uid) {
|
||||
globalHiveSyncTool = new GlobalHiveSyncTool(props, hiveConf);
|
||||
replicatedTimeStampMap = globalHiveSyncTool.getLastReplicatedTimeStampMap();
|
||||
clusterId = uid;
|
||||
}
|
||||
|
||||
private void initGlobalHiveSyncTool() {
|
||||
globalHiveSyncTool = GlobalHiveSyncTool.buildGlobalHiveSyncTool(globalHiveSyncConfig, hiveConf);
|
||||
}
|
||||
|
||||
public void sync() throws Exception {
|
||||
// the cluster maybe down by the time we reach here so we refresh our replication
|
||||
// state right before we set the oldReplicatedTimeStamp to narrow this window. this is a
|
||||
@@ -80,6 +75,7 @@ public class ReplicationStateSync {
|
||||
return clusterId;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
if (globalHiveSyncTool != null) {
|
||||
globalHiveSyncTool.close();
|
||||
|
||||
@@ -7,16 +7,17 @@
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.hive;
|
||||
package org.apache.hudi.hive.transaction.lock;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.hive.conf.HiveConf;
|
||||
@@ -18,20 +18,26 @@
|
||||
|
||||
package org.apache.hudi.hive.util;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import org.apache.hadoop.hive.metastore.IMetaStoreClient;
|
||||
import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
|
||||
import org.apache.hadoop.hive.metastore.api.Partition;
|
||||
import org.apache.hudi.common.util.PartitionPathEncodeUtils;
|
||||
import org.apache.hudi.common.util.ValidationUtils;
|
||||
import org.apache.hudi.hive.HiveSyncConfig;
|
||||
import org.apache.hudi.hive.HoodieHiveSyncException;
|
||||
import org.apache.hudi.hive.PartitionValueExtractor;
|
||||
import org.apache.hudi.sync.common.model.PartitionValueExtractor;
|
||||
|
||||
import org.apache.hadoop.hive.metastore.IMetaStoreClient;
|
||||
import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
|
||||
import org.apache.hadoop.hive.metastore.api.Partition;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.thrift.TException;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DECODE_PARTITION;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS;
|
||||
|
||||
public class HivePartitionUtil {
|
||||
private static final Logger LOG = LogManager.getLogger(HivePartitionUtil.class);
|
||||
|
||||
@@ -40,18 +46,18 @@ public class HivePartitionUtil {
|
||||
*/
|
||||
public static String getPartitionClauseForDrop(String partition, PartitionValueExtractor partitionValueExtractor, HiveSyncConfig config) {
|
||||
List<String> partitionValues = partitionValueExtractor.extractPartitionValuesInPath(partition);
|
||||
ValidationUtils.checkArgument(config.partitionFields.size() == partitionValues.size(),
|
||||
"Partition key parts " + config.partitionFields + " does not match with partition values " + partitionValues
|
||||
ValidationUtils.checkArgument(config.getSplitStrings(META_SYNC_PARTITION_FIELDS).size() == partitionValues.size(),
|
||||
"Partition key parts " + config.getSplitStrings(META_SYNC_PARTITION_FIELDS) + " does not match with partition values " + partitionValues
|
||||
+ ". Check partition strategy. ");
|
||||
List<String> partBuilder = new ArrayList<>();
|
||||
for (int i = 0; i < config.partitionFields.size(); i++) {
|
||||
for (int i = 0; i < config.getSplitStrings(META_SYNC_PARTITION_FIELDS).size(); i++) {
|
||||
String partitionValue = partitionValues.get(i);
|
||||
// decode the partition before sync to hive to prevent multiple escapes of HIVE
|
||||
if (config.decodePartition) {
|
||||
if (config.getBoolean(META_SYNC_DECODE_PARTITION)) {
|
||||
// This is a decode operator for encode in KeyGenUtils#getRecordPartitionPath
|
||||
partitionValue = PartitionPathEncodeUtils.unescapePathName(partitionValue);
|
||||
}
|
||||
partBuilder.add(config.partitionFields.get(i) + "=" + partitionValue);
|
||||
partBuilder.add(config.getSplitStrings(META_SYNC_PARTITION_FIELDS).get(i) + "=" + partitionValue);
|
||||
}
|
||||
return String.join("/", partBuilder);
|
||||
}
|
||||
@@ -61,7 +67,7 @@ public class HivePartitionUtil {
|
||||
Partition newPartition;
|
||||
try {
|
||||
List<String> partitionValues = partitionValueExtractor.extractPartitionValuesInPath(partitionPath);
|
||||
newPartition = client.getPartition(config.databaseName, tableName, partitionValues);
|
||||
newPartition = client.getPartition(config.getStringOrDefault(META_SYNC_DATABASE_NAME), tableName, partitionValues);
|
||||
} catch (NoSuchObjectException ignored) {
|
||||
newPartition = null;
|
||||
} catch (TException e) {
|
||||
|
||||
@@ -42,6 +42,12 @@ import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_CREATE_MANAGED_TABLE;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_BUCKET_SYNC_SPEC;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS;
|
||||
|
||||
/**
|
||||
* Schema Utilities.
|
||||
*/
|
||||
@@ -156,7 +162,7 @@ public class HiveSchemaUtil {
|
||||
* @return : Hive Table schema read from parquet file List[FieldSchema] without partitionField
|
||||
*/
|
||||
public static List<FieldSchema> convertParquetSchemaToHiveFieldSchema(MessageType messageType, HiveSyncConfig syncConfig) throws IOException {
|
||||
return convertMapSchemaToHiveFieldSchema(parquetSchemaToMapSchema(messageType, syncConfig.supportTimestamp, false), syncConfig);
|
||||
return convertMapSchemaToHiveFieldSchema(parquetSchemaToMapSchema(messageType, syncConfig.getBoolean(HIVE_SUPPORT_TIMESTAMP_TYPE), false), syncConfig);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -202,7 +208,7 @@ public class HiveSchemaUtil {
|
||||
public static List<FieldSchema> convertMapSchemaToHiveFieldSchema(LinkedHashMap<String, String> schema, HiveSyncConfig syncConfig) throws IOException {
|
||||
return schema.keySet().stream()
|
||||
.map(key -> new FieldSchema(key, schema.get(key).toLowerCase(), ""))
|
||||
.filter(field -> !syncConfig.partitionFields.contains(field.getName()))
|
||||
.filter(field -> !syncConfig.getSplitStrings(META_SYNC_PARTITION_FIELDS).contains(field.getName()))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
@@ -448,11 +454,11 @@ public class HiveSchemaUtil {
|
||||
public static String generateCreateDDL(String tableName, MessageType storageSchema, HiveSyncConfig config, String inputFormatClass,
|
||||
String outputFormatClass, String serdeClass, Map<String, String> serdeProperties,
|
||||
Map<String, String> tableProperties) throws IOException {
|
||||
Map<String, String> hiveSchema = convertParquetSchemaToHiveSchema(storageSchema, config.supportTimestamp);
|
||||
String columns = generateSchemaString(storageSchema, config.partitionFields, config.supportTimestamp);
|
||||
Map<String, String> hiveSchema = convertParquetSchemaToHiveSchema(storageSchema, config.getBoolean(HIVE_SUPPORT_TIMESTAMP_TYPE));
|
||||
String columns = generateSchemaString(storageSchema, config.getSplitStrings(META_SYNC_PARTITION_FIELDS), config.getBoolean(HIVE_SUPPORT_TIMESTAMP_TYPE));
|
||||
|
||||
List<String> partitionFields = new ArrayList<>();
|
||||
for (String partitionKey : config.partitionFields) {
|
||||
for (String partitionKey : config.getSplitStrings(META_SYNC_PARTITION_FIELDS)) {
|
||||
String partitionKeyWithTicks = tickSurround(partitionKey);
|
||||
partitionFields.add(new StringBuilder().append(partitionKeyWithTicks).append(" ")
|
||||
.append(getPartitionKeyType(hiveSchema, partitionKeyWithTicks)).toString());
|
||||
@@ -460,26 +466,26 @@ public class HiveSchemaUtil {
|
||||
|
||||
String partitionsStr = String.join(",", partitionFields);
|
||||
StringBuilder sb = new StringBuilder();
|
||||
if (config.createManagedTable) {
|
||||
if (config.getBoolean(HIVE_CREATE_MANAGED_TABLE)) {
|
||||
sb.append("CREATE TABLE IF NOT EXISTS ");
|
||||
} else {
|
||||
sb.append("CREATE EXTERNAL TABLE IF NOT EXISTS ");
|
||||
}
|
||||
sb.append(HIVE_ESCAPE_CHARACTER).append(config.databaseName).append(HIVE_ESCAPE_CHARACTER)
|
||||
sb.append(HIVE_ESCAPE_CHARACTER).append(config.getStringOrDefault(META_SYNC_DATABASE_NAME)).append(HIVE_ESCAPE_CHARACTER)
|
||||
.append(".").append(HIVE_ESCAPE_CHARACTER).append(tableName).append(HIVE_ESCAPE_CHARACTER);
|
||||
sb.append("( ").append(columns).append(")");
|
||||
if (!config.partitionFields.isEmpty()) {
|
||||
if (!config.getSplitStrings(META_SYNC_PARTITION_FIELDS).isEmpty()) {
|
||||
sb.append(" PARTITIONED BY (").append(partitionsStr).append(")");
|
||||
}
|
||||
if (config.bucketSpec != null) {
|
||||
sb.append(' ' + config.bucketSpec + ' ');
|
||||
if (config.getString(HIVE_SYNC_BUCKET_SYNC_SPEC) != null) {
|
||||
sb.append(' ' + config.getString(HIVE_SYNC_BUCKET_SYNC_SPEC) + ' ');
|
||||
}
|
||||
sb.append(" ROW FORMAT SERDE '").append(serdeClass).append("'");
|
||||
if (serdeProperties != null && !serdeProperties.isEmpty()) {
|
||||
sb.append(" WITH SERDEPROPERTIES (").append(propertyToString(serdeProperties)).append(")");
|
||||
}
|
||||
sb.append(" STORED AS INPUTFORMAT '").append(inputFormatClass).append("'");
|
||||
sb.append(" OUTPUTFORMAT '").append(outputFormatClass).append("' LOCATION '").append(config.basePath).append("'");
|
||||
sb.append(" OUTPUTFORMAT '").append(outputFormatClass).append("' LOCATION '").append(config.getAbsoluteBasePath()).append("'");
|
||||
|
||||
if (tableProperties != null && !tableProperties.isEmpty()) {
|
||||
sb.append(" TBLPROPERTIES(").append(propertyToString(tableProperties)).append(")");
|
||||
|
||||
@@ -1,128 +0,0 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.hive;
|
||||
|
||||
import static org.apache.hudi.hadoop.utils.HoodieHiveUtils.GLOBALLY_CONSISTENT_READ_TIMESTAMP;
|
||||
import static org.apache.hudi.hive.replication.HiveSyncGlobalCommitConfig.LOCAL_BASE_PATH;
|
||||
import static org.apache.hudi.hive.replication.HiveSyncGlobalCommitConfig.LOCAL_HIVE_SERVER_JDBC_URLS;
|
||||
import static org.apache.hudi.hive.replication.HiveSyncGlobalCommitConfig.LOCAL_HIVE_SITE_URI;
|
||||
import static org.apache.hudi.hive.replication.HiveSyncGlobalCommitConfig.REMOTE_BASE_PATH;
|
||||
import static org.apache.hudi.hive.replication.HiveSyncGlobalCommitConfig.REMOTE_HIVE_SERVER_JDBC_URLS;
|
||||
import static org.apache.hudi.hive.replication.HiveSyncGlobalCommitConfig.REMOTE_HIVE_SITE_URI;
|
||||
|
||||
import java.util.Collections;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hudi.hive.replication.HiveSyncGlobalCommitConfig;
|
||||
import org.apache.hudi.hive.replication.HiveSyncGlobalCommitTool;
|
||||
import org.apache.hudi.hive.testutils.TestCluster;
|
||||
import org.junit.jupiter.api.extension.RegisterExtension;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
public class TestHiveSyncGlobalCommitTool {
|
||||
|
||||
@RegisterExtension
|
||||
public static TestCluster localCluster = new TestCluster();
|
||||
@RegisterExtension
|
||||
public static TestCluster remoteCluster = new TestCluster();
|
||||
|
||||
private static String DB_NAME = "foo";
|
||||
private static String TBL_NAME = "bar";
|
||||
|
||||
private HiveSyncGlobalCommitConfig getGlobalCommitConfig(
|
||||
String commitTime, String dbName, String tblName) throws Exception {
|
||||
HiveSyncGlobalCommitConfig config = new HiveSyncGlobalCommitConfig();
|
||||
config.properties.setProperty(LOCAL_HIVE_SITE_URI, localCluster.getHiveSiteXmlLocation());
|
||||
config.properties.setProperty(REMOTE_HIVE_SITE_URI, remoteCluster.getHiveSiteXmlLocation());
|
||||
config.properties.setProperty(LOCAL_HIVE_SERVER_JDBC_URLS, localCluster.getHiveJdBcUrl());
|
||||
config.properties.setProperty(REMOTE_HIVE_SERVER_JDBC_URLS, remoteCluster.getHiveJdBcUrl());
|
||||
config.properties.setProperty(LOCAL_BASE_PATH, localCluster.tablePath(dbName, tblName));
|
||||
config.properties.setProperty(REMOTE_BASE_PATH, remoteCluster.tablePath(dbName, tblName));
|
||||
config.globallyReplicatedTimeStamp = commitTime;
|
||||
config.hiveUser = System.getProperty("user.name");
|
||||
config.hivePass = "";
|
||||
config.databaseName = dbName;
|
||||
config.tableName = tblName;
|
||||
config.basePath = localCluster.tablePath(dbName, tblName);
|
||||
config.assumeDatePartitioning = true;
|
||||
config.usePreApacheInputFormat = false;
|
||||
config.partitionFields = Collections.singletonList("datestr");
|
||||
return config;
|
||||
}
|
||||
|
||||
private void compareEqualLastReplicatedTimeStamp(HiveSyncGlobalCommitConfig config) throws Exception {
|
||||
Assertions.assertEquals(localCluster.getHMSClient()
|
||||
.getTable(config.databaseName, config.tableName).getParameters()
|
||||
.get(GLOBALLY_CONSISTENT_READ_TIMESTAMP), remoteCluster.getHMSClient()
|
||||
.getTable(config.databaseName, config.tableName).getParameters()
|
||||
.get(GLOBALLY_CONSISTENT_READ_TIMESTAMP), "compare replicated timestamps");
|
||||
}
|
||||
|
||||
@BeforeEach
|
||||
public void setUp() throws Exception {
|
||||
localCluster.forceCreateDb(DB_NAME);
|
||||
remoteCluster.forceCreateDb(DB_NAME);
|
||||
localCluster.dfsCluster.getFileSystem().delete(new Path(localCluster.tablePath(DB_NAME, TBL_NAME)), true);
|
||||
remoteCluster.dfsCluster.getFileSystem().delete(new Path(remoteCluster.tablePath(DB_NAME, TBL_NAME)), true);
|
||||
}
|
||||
|
||||
@AfterEach
|
||||
public void clear() throws Exception {
|
||||
localCluster.getHMSClient().dropTable(DB_NAME, TBL_NAME);
|
||||
remoteCluster.getHMSClient().dropTable(DB_NAME, TBL_NAME);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBasicGlobalCommit() throws Exception {
|
||||
String commitTime = "100";
|
||||
localCluster.createCOWTable(commitTime, 5, DB_NAME, TBL_NAME);
|
||||
// simulate drs
|
||||
remoteCluster.createCOWTable(commitTime, 5, DB_NAME, TBL_NAME);
|
||||
HiveSyncGlobalCommitConfig config = getGlobalCommitConfig(commitTime, DB_NAME, TBL_NAME);
|
||||
HiveSyncGlobalCommitTool tool = new HiveSyncGlobalCommitTool(config);
|
||||
Assertions.assertTrue(tool.commit());
|
||||
compareEqualLastReplicatedTimeStamp(config);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBasicRollback() throws Exception {
|
||||
String commitTime = "100";
|
||||
localCluster.createCOWTable(commitTime, 5, DB_NAME, TBL_NAME);
|
||||
// simulate drs
|
||||
remoteCluster.createCOWTable(commitTime, 5, DB_NAME, TBL_NAME);
|
||||
HiveSyncGlobalCommitConfig config = getGlobalCommitConfig(commitTime, DB_NAME, TBL_NAME);
|
||||
HiveSyncGlobalCommitTool tool = new HiveSyncGlobalCommitTool(config);
|
||||
Assertions.assertFalse(localCluster.getHMSClient().tableExists(DB_NAME, TBL_NAME));
|
||||
Assertions.assertFalse(remoteCluster.getHMSClient().tableExists(DB_NAME, TBL_NAME));
|
||||
// stop the remote cluster hive server to simulate cluster going down
|
||||
remoteCluster.stopHiveServer2();
|
||||
Assertions.assertFalse(tool.commit());
|
||||
Assertions.assertEquals(commitTime, localCluster.getHMSClient()
|
||||
.getTable(config.databaseName, config.tableName).getParameters()
|
||||
.get(GLOBALLY_CONSISTENT_READ_TIMESTAMP));
|
||||
Assertions.assertTrue(tool.rollback()); // do a rollback
|
||||
Assertions.assertNotEquals(commitTime, localCluster.getHMSClient()
|
||||
.getTable(config.databaseName, config.tableName).getParameters()
|
||||
.get(GLOBALLY_CONSISTENT_READ_TIMESTAMP));
|
||||
Assertions.assertFalse(remoteCluster.getHMSClient().tableExists(DB_NAME, TBL_NAME));
|
||||
remoteCluster.startHiveServer2();
|
||||
}
|
||||
}
|
||||
@@ -27,19 +27,18 @@ import org.apache.hudi.common.testutils.SchemaTestUtil;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.StringUtils;
|
||||
import org.apache.hudi.common.util.collection.ImmutablePair;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.hive.testutils.HiveTestUtil;
|
||||
import org.apache.hudi.sync.common.model.FieldSchema;
|
||||
import org.apache.hudi.sync.common.model.Partition;
|
||||
import org.apache.hudi.sync.common.model.PartitionEvent;
|
||||
import org.apache.hudi.sync.common.model.PartitionEvent.PartitionEventType;
|
||||
import org.apache.hudi.sync.common.util.ConfigUtils;
|
||||
import org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent;
|
||||
import org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent.PartitionEventType;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.Schema.Field;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.hive.metastore.api.FieldSchema;
|
||||
import org.apache.hadoop.hive.metastore.api.MetaException;
|
||||
import org.apache.hadoop.hive.metastore.api.Partition;
|
||||
import org.apache.hadoop.hive.ql.Driver;
|
||||
import org.apache.hadoop.hive.ql.metadata.HiveException;
|
||||
import org.apache.hadoop.hive.ql.session.SessionState;
|
||||
import org.junit.jupiter.api.AfterAll;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
@@ -53,17 +52,30 @@ import java.net.URISyntaxException;
|
||||
import java.time.ZonedDateTime;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_AUTO_CREATE_DATABASE;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_CREATE_MANAGED_TABLE;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_IGNORE_EXCEPTIONS;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_AS_DATA_SOURCE_TABLE;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_COMMENT;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_MODE;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_TABLE_PROPERTIES;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_TABLE_SERDE_PROPERTIES;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_URL;
|
||||
import static org.apache.hudi.hive.testutils.HiveTestUtil.basePath;
|
||||
import static org.apache.hudi.hive.testutils.HiveTestUtil.ddlExecutor;
|
||||
import static org.apache.hudi.hive.testutils.HiveTestUtil.fileSystem;
|
||||
import static org.apache.hudi.hive.testutils.HiveTestUtil.getHiveConf;
|
||||
import static org.apache.hudi.hive.testutils.HiveTestUtil.hiveSyncProps;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_CONDITIONAL_SYNC;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS;
|
||||
import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||
@@ -92,7 +104,7 @@ public class TestHiveSyncTool {
|
||||
}
|
||||
|
||||
private HiveSyncTool hiveSyncTool;
|
||||
private HoodieHiveClient hiveClient;
|
||||
private HoodieHiveSyncClient hiveClient;
|
||||
|
||||
@AfterAll
|
||||
public static void cleanUpClass() {
|
||||
@@ -131,7 +143,7 @@ public class TestHiveSyncTool {
|
||||
@ParameterizedTest
|
||||
@MethodSource({"syncModeAndSchemaFromCommitMetadata"})
|
||||
public void testBasicSync(boolean useSchemaFromCommitMetadata, String syncMode) throws Exception {
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode);
|
||||
hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
|
||||
|
||||
String instantTime = "100";
|
||||
HiveTestUtil.createCOWTable(instantTime, 5, useSchemaFromCommitMetadata);
|
||||
@@ -144,29 +156,29 @@ public class TestHiveSyncTool {
|
||||
|
||||
assertTrue(hiveClient.tableExists(HiveTestUtil.TABLE_NAME),
|
||||
"Table " + HiveTestUtil.TABLE_NAME + " should exist after sync completes");
|
||||
assertEquals(hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).size(),
|
||||
hiveClient.getDataSchema().getColumns().size() + 1,
|
||||
assertEquals(hiveClient.getMetastoreSchema(HiveTestUtil.TABLE_NAME).size(),
|
||||
hiveClient.getStorageSchema().getColumns().size() + 1,
|
||||
"Hive Schema should match the table schema + partition field");
|
||||
assertEquals(5, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(),
|
||||
assertEquals(5, hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME).size(),
|
||||
"Table partitions should match the number of partitions we wrote");
|
||||
assertEquals(instantTime, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(),
|
||||
"The last commit that was synced should be updated in the TBLPROPERTIES");
|
||||
|
||||
// Adding of new partitions
|
||||
List<String> newPartition = Arrays.asList("2050/01/01");
|
||||
hiveClient.addPartitionsToTable(HiveTestUtil.TABLE_NAME, Arrays.asList());
|
||||
assertEquals(5, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(),
|
||||
List<String> newPartition = Collections.singletonList("2050/01/01");
|
||||
hiveClient.addPartitionsToTable(HiveTestUtil.TABLE_NAME, Collections.emptyList());
|
||||
assertEquals(5, hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME).size(),
|
||||
"No new partition should be added");
|
||||
hiveClient.addPartitionsToTable(HiveTestUtil.TABLE_NAME, newPartition);
|
||||
assertEquals(6, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(),
|
||||
assertEquals(6, hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME).size(),
|
||||
"New partition should be added");
|
||||
|
||||
// Update partitions
|
||||
hiveClient.updatePartitionsToTable(HiveTestUtil.TABLE_NAME, Arrays.asList());
|
||||
assertEquals(6, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(),
|
||||
hiveClient.updatePartitionsToTable(HiveTestUtil.TABLE_NAME, Collections.emptyList());
|
||||
assertEquals(6, hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME).size(),
|
||||
"Partition count should remain the same");
|
||||
hiveClient.updatePartitionsToTable(HiveTestUtil.TABLE_NAME, newPartition);
|
||||
assertEquals(6, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(),
|
||||
assertEquals(6, hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME).size(),
|
||||
"Partition count should remain the same");
|
||||
|
||||
// Alter partitions
|
||||
@@ -175,7 +187,7 @@ public class TestHiveSyncTool {
|
||||
ddlExecutor.runSQL("ALTER TABLE `" + HiveTestUtil.TABLE_NAME
|
||||
+ "` PARTITION (`datestr`='2050-01-01') SET LOCATION '/some/new/location'");
|
||||
|
||||
List<org.apache.hudi.sync.common.model.Partition> hivePartitions = hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME);
|
||||
List<Partition> hivePartitions = hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME);
|
||||
List<String> writtenPartitionsSince = hiveClient.getPartitionsWrittenToSince(Option.empty());
|
||||
List<PartitionEvent> partitionEvents = hiveClient.getPartitionEvents(hivePartitions, writtenPartitionsSince, false);
|
||||
assertEquals(1, partitionEvents.size(), "There should be only one partition event");
|
||||
@@ -186,7 +198,7 @@ public class TestHiveSyncTool {
|
||||
reSyncHiveTable();
|
||||
|
||||
// Sync should update the changed partition to correct path
|
||||
List<Partition> tablePartitions = hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME);
|
||||
List<Partition> tablePartitions = hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME);
|
||||
assertEquals(6, tablePartitions.size(), "The one partition we wrote should be added to hive");
|
||||
assertEquals(instantTime, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(),
|
||||
"The last commit that was synced should be 100");
|
||||
@@ -195,33 +207,33 @@ public class TestHiveSyncTool {
|
||||
@ParameterizedTest
|
||||
@MethodSource({"syncMode"})
|
||||
public void testSyncDataBase(String syncMode) throws Exception {
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode);
|
||||
hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
|
||||
String instantTime = "100";
|
||||
HiveTestUtil.createCOWTable(instantTime, 5, true);
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_DATABASE_NAME.key(), HiveTestUtil.DB_NAME);
|
||||
hiveSyncProps.setProperty(META_SYNC_DATABASE_NAME.key(), HiveTestUtil.DB_NAME);
|
||||
|
||||
// while autoCreateDatabase is false and database not exists;
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_AUTO_CREATE_DATABASE.key(), "false");
|
||||
hiveSyncProps.setProperty(HIVE_AUTO_CREATE_DATABASE.key(), "false");
|
||||
reinitHiveSyncClient();
|
||||
// Lets do the sync
|
||||
assertThrows(Exception.class, (this::reSyncHiveTable));
|
||||
|
||||
// while autoCreateDatabase is true and database not exists;
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_AUTO_CREATE_DATABASE.key(), "true");
|
||||
hiveSyncProps.setProperty(HIVE_AUTO_CREATE_DATABASE.key(), "true");
|
||||
reinitHiveSyncClient();
|
||||
assertDoesNotThrow((this::reSyncHiveTable));
|
||||
assertTrue(hiveClient.databaseExists(HiveTestUtil.DB_NAME),
|
||||
"DataBases " + HiveTestUtil.DB_NAME + " should exist after sync completes");
|
||||
|
||||
// while autoCreateDatabase is false and database exists;
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_AUTO_CREATE_DATABASE.key(), "false");
|
||||
hiveSyncProps.setProperty(HIVE_AUTO_CREATE_DATABASE.key(), "false");
|
||||
reinitHiveSyncClient();
|
||||
assertDoesNotThrow((this::reSyncHiveTable));
|
||||
assertTrue(hiveClient.databaseExists(HiveTestUtil.DB_NAME),
|
||||
"DataBases " + HiveTestUtil.DB_NAME + " should exist after sync completes");
|
||||
|
||||
// while autoCreateDatabase is true and database exists;
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_AUTO_CREATE_DATABASE.key(), "true");
|
||||
hiveSyncProps.setProperty(HIVE_AUTO_CREATE_DATABASE.key(), "true");
|
||||
assertDoesNotThrow((this::reSyncHiveTable));
|
||||
assertTrue(hiveClient.databaseExists(HiveTestUtil.DB_NAME),
|
||||
"DataBases " + HiveTestUtil.DB_NAME + " should exist after sync completes");
|
||||
@@ -244,10 +256,10 @@ public class TestHiveSyncTool {
|
||||
put("tp_1", "p1");
|
||||
}
|
||||
};
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode);
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_AS_DATA_SOURCE_TABLE.key(), String.valueOf(syncAsDataSourceTable));
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_TABLE_SERDE_PROPERTIES.key(), ConfigUtils.configToString(serdeProperties));
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_TABLE_PROPERTIES.key(), ConfigUtils.configToString(tableProperties));
|
||||
hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
|
||||
hiveSyncProps.setProperty(HIVE_SYNC_AS_DATA_SOURCE_TABLE.key(), String.valueOf(syncAsDataSourceTable));
|
||||
hiveSyncProps.setProperty(HIVE_TABLE_SERDE_PROPERTIES.key(), ConfigUtils.configToString(serdeProperties));
|
||||
hiveSyncProps.setProperty(HIVE_TABLE_PROPERTIES.key(), ConfigUtils.configToString(tableProperties));
|
||||
|
||||
String instantTime = "100";
|
||||
HiveTestUtil.createCOWTable(instantTime, 5, useSchemaFromCommitMetadata);
|
||||
@@ -335,10 +347,10 @@ public class TestHiveSyncTool {
|
||||
put("tp_1", "p1");
|
||||
}
|
||||
};
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode);
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_AS_DATA_SOURCE_TABLE.key(), String.valueOf(syncAsDataSourceTable));
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_TABLE_SERDE_PROPERTIES.key(), ConfigUtils.configToString(serdeProperties));
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_TABLE_PROPERTIES.key(), ConfigUtils.configToString(tableProperties));
|
||||
hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
|
||||
hiveSyncProps.setProperty(HIVE_SYNC_AS_DATA_SOURCE_TABLE.key(), String.valueOf(syncAsDataSourceTable));
|
||||
hiveSyncProps.setProperty(HIVE_TABLE_SERDE_PROPERTIES.key(), ConfigUtils.configToString(serdeProperties));
|
||||
hiveSyncProps.setProperty(HIVE_TABLE_PROPERTIES.key(), ConfigUtils.configToString(tableProperties));
|
||||
|
||||
String instantTime = "100";
|
||||
String deltaCommitTime = "101";
|
||||
@@ -394,8 +406,8 @@ public class TestHiveSyncTool {
|
||||
public void testSyncManagedTable(boolean useSchemaFromCommitMetadata,
|
||||
boolean isManagedTable,
|
||||
String syncMode) throws Exception {
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode);
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_CREATE_MANAGED_TABLE.key(), String.valueOf(isManagedTable));
|
||||
hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
|
||||
hiveSyncProps.setProperty(HIVE_CREATE_MANAGED_TABLE.key(), String.valueOf(isManagedTable));
|
||||
|
||||
String instantTime = "100";
|
||||
HiveTestUtil.createCOWTable(instantTime, 5, useSchemaFromCommitMetadata);
|
||||
@@ -422,13 +434,13 @@ public class TestHiveSyncTool {
|
||||
@ParameterizedTest
|
||||
@MethodSource("syncMode")
|
||||
public void testSyncWithSchema(String syncMode) throws Exception {
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode);
|
||||
hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
|
||||
String commitTime = "100";
|
||||
HiveTestUtil.createCOWTableWithSchema(commitTime, "/complex.schema.avsc");
|
||||
|
||||
reinitHiveSyncClient();
|
||||
reSyncHiveTable();
|
||||
assertEquals(1, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(),
|
||||
assertEquals(1, hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME).size(),
|
||||
"Table partitions should match the number of partitions we wrote");
|
||||
assertEquals(commitTime, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(),
|
||||
"The last commit that was synced should be updated in the TBLPROPERTIES");
|
||||
@@ -437,12 +449,12 @@ public class TestHiveSyncTool {
|
||||
@ParameterizedTest
|
||||
@MethodSource("syncMode")
|
||||
public void testSyncIncremental(String syncMode) throws Exception {
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode);
|
||||
hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
|
||||
String commitTime1 = "100";
|
||||
HiveTestUtil.createCOWTable(commitTime1, 5, true);
|
||||
reinitHiveSyncClient();
|
||||
reSyncHiveTable();
|
||||
assertEquals(5, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(),
|
||||
assertEquals(5, hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME).size(),
|
||||
"Table partitions should match the number of partitions we wrote");
|
||||
assertEquals(commitTime1, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(),
|
||||
"The last commit that was synced should be updated in the TBLPROPERTIES");
|
||||
@@ -463,7 +475,7 @@ public class TestHiveSyncTool {
|
||||
|
||||
// Sync should add the one partition
|
||||
reSyncHiveTable();
|
||||
assertEquals(6, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(),
|
||||
assertEquals(6, hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME).size(),
|
||||
"The one partition we wrote should be added to hive");
|
||||
assertEquals(commitTime2, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(),
|
||||
"The last commit that was synced should be 101");
|
||||
@@ -472,13 +484,13 @@ public class TestHiveSyncTool {
|
||||
@ParameterizedTest
|
||||
@MethodSource("syncMode")
|
||||
public void testSyncIncrementalWithSchemaEvolution(String syncMode) throws Exception {
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode);
|
||||
hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
|
||||
String commitTime1 = "100";
|
||||
HiveTestUtil.createCOWTable(commitTime1, 5, true);
|
||||
reinitHiveSyncClient();
|
||||
reSyncHiveTable();
|
||||
|
||||
int fields = hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).size();
|
||||
int fields = hiveClient.getMetastoreSchema(HiveTestUtil.TABLE_NAME).size();
|
||||
|
||||
// Now lets create more partitions and these are the only ones which needs to be synced
|
||||
ZonedDateTime dateTime = ZonedDateTime.now().plusDays(6);
|
||||
@@ -488,15 +500,15 @@ public class TestHiveSyncTool {
|
||||
// Lets do the sync
|
||||
reinitHiveSyncClient();
|
||||
reSyncHiveTable();
|
||||
assertEquals(fields + 3, hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).size(),
|
||||
assertEquals(fields + 3, hiveClient.getMetastoreSchema(HiveTestUtil.TABLE_NAME).size(),
|
||||
"Hive Schema has evolved and should not be 3 more field");
|
||||
assertEquals("BIGINT", hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).get("favorite_number"),
|
||||
assertEquals("BIGINT", hiveClient.getMetastoreSchema(HiveTestUtil.TABLE_NAME).get("favorite_number"),
|
||||
"Hive Schema has evolved - Field favorite_number has evolved from int to long");
|
||||
assertTrue(hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).containsKey("favorite_movie"),
|
||||
assertTrue(hiveClient.getMetastoreSchema(HiveTestUtil.TABLE_NAME).containsKey("favorite_movie"),
|
||||
"Hive Schema has evolved - Field favorite_movie was added");
|
||||
|
||||
// Sync should add the one partition
|
||||
assertEquals(6, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(),
|
||||
assertEquals(6, hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME).size(),
|
||||
"The one partition we wrote should be added to hive");
|
||||
assertEquals(commitTime2, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(),
|
||||
"The last commit that was synced should be 101");
|
||||
@@ -505,13 +517,13 @@ public class TestHiveSyncTool {
|
||||
@ParameterizedTest
|
||||
@MethodSource("syncMode")
|
||||
public void testUpdateTableComments(String syncMode) throws Exception {
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode);
|
||||
hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
|
||||
String commitTime = "100";
|
||||
HiveTestUtil.createCOWTableWithSchema(commitTime, "/simple-test.avsc");
|
||||
reinitHiveSyncClient();
|
||||
reSyncHiveTable();
|
||||
|
||||
Map<String, ImmutablePair<String,String>> alterCommentSchema = new HashMap<>();
|
||||
Map<String, Pair<String, String>> alterCommentSchema = new HashMap<>();
|
||||
//generate commented schema field
|
||||
Schema schema = SchemaTestUtil.getSchemaFromResource(HiveTestUtil.class, "/simple-test.avsc");
|
||||
Schema commentedSchema = SchemaTestUtil.getSchemaFromResource(HiveTestUtil.class, "/simple-test-doced.avsc");
|
||||
@@ -521,16 +533,16 @@ public class TestHiveSyncTool {
|
||||
String name = field.name().toLowerCase(Locale.ROOT);
|
||||
String comment = fieldsNameAndDoc.get(name);
|
||||
if (fieldsNameAndDoc.containsKey(name) && !comment.equals(field.doc())) {
|
||||
alterCommentSchema.put(name, new ImmutablePair<>(field.schema().getType().name(),comment));
|
||||
alterCommentSchema.put(name, new ImmutablePair<>(field.schema().getType().name(), comment));
|
||||
}
|
||||
}
|
||||
|
||||
ddlExecutor.updateTableComments(HiveTestUtil.TABLE_NAME, alterCommentSchema);
|
||||
|
||||
List<FieldSchema> fieldSchemas = hiveClient.getTableCommentUsingMetastoreClient(HiveTestUtil.TABLE_NAME);
|
||||
List<FieldSchema> fieldSchemas = hiveClient.getMetastoreFieldSchemas(HiveTestUtil.TABLE_NAME);
|
||||
int commentCnt = 0;
|
||||
for (FieldSchema fieldSchema : fieldSchemas) {
|
||||
if (!StringUtils.isNullOrEmpty(fieldSchema.getComment())) {
|
||||
if (StringUtils.nonEmpty(fieldSchema.getCommentOrEmpty())) {
|
||||
commentCnt++;
|
||||
}
|
||||
}
|
||||
@@ -540,29 +552,29 @@ public class TestHiveSyncTool {
|
||||
@ParameterizedTest
|
||||
@MethodSource("syncMode")
|
||||
public void testSyncWithCommentedSchema(String syncMode) throws Exception {
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode);
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_COMMENT.key(), "false");
|
||||
hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
|
||||
hiveSyncProps.setProperty(HIVE_SYNC_COMMENT.key(), "false");
|
||||
String commitTime = "100";
|
||||
HiveTestUtil.createCOWTableWithSchema(commitTime, "/simple-test-doced.avsc");
|
||||
|
||||
reinitHiveSyncClient();
|
||||
reSyncHiveTable();
|
||||
List<FieldSchema> fieldSchemas = hiveClient.getTableCommentUsingMetastoreClient(HiveTestUtil.TABLE_NAME);
|
||||
List<FieldSchema> fieldSchemas = hiveClient.getMetastoreFieldSchemas(HiveTestUtil.TABLE_NAME);
|
||||
int commentCnt = 0;
|
||||
for (FieldSchema fieldSchema : fieldSchemas) {
|
||||
if (!StringUtils.isNullOrEmpty(fieldSchema.getComment())) {
|
||||
if (StringUtils.nonEmpty(fieldSchema.getCommentOrEmpty())) {
|
||||
commentCnt++;
|
||||
}
|
||||
}
|
||||
assertEquals(0, commentCnt, "hive schema field comment numbers should match the avro schema field doc numbers");
|
||||
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_COMMENT.key(), "true");
|
||||
hiveSyncProps.setProperty(HIVE_SYNC_COMMENT.key(), "true");
|
||||
reinitHiveSyncClient();
|
||||
reSyncHiveTable();
|
||||
fieldSchemas = hiveClient.getTableCommentUsingMetastoreClient(HiveTestUtil.TABLE_NAME);
|
||||
fieldSchemas = hiveClient.getMetastoreFieldSchemas(HiveTestUtil.TABLE_NAME);
|
||||
commentCnt = 0;
|
||||
for (FieldSchema fieldSchema : fieldSchemas) {
|
||||
if (!StringUtils.isNullOrEmpty(fieldSchema.getComment())) {
|
||||
if (StringUtils.nonEmpty(fieldSchema.getCommentOrEmpty())) {
|
||||
commentCnt++;
|
||||
}
|
||||
}
|
||||
@@ -572,7 +584,7 @@ public class TestHiveSyncTool {
|
||||
@ParameterizedTest
|
||||
@MethodSource("syncModeAndSchemaFromCommitMetadata")
|
||||
public void testSyncMergeOnRead(boolean useSchemaFromCommitMetadata, String syncMode) throws Exception {
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode);
|
||||
hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
|
||||
String instantTime = "100";
|
||||
String deltaCommitTime = "101";
|
||||
HiveTestUtil.createMORTable(instantTime, deltaCommitTime, 5, true,
|
||||
@@ -587,18 +599,18 @@ public class TestHiveSyncTool {
|
||||
assertTrue(hiveClient.tableExists(roTableName), "Table " + roTableName + " should exist after sync completes");
|
||||
|
||||
if (useSchemaFromCommitMetadata) {
|
||||
assertEquals(hiveClient.getTableSchema(roTableName).size(),
|
||||
assertEquals(hiveClient.getMetastoreSchema(roTableName).size(),
|
||||
SchemaTestUtil.getSimpleSchema().getFields().size() + getPartitionFieldSize()
|
||||
+ HoodieRecord.HOODIE_META_COLUMNS.size(),
|
||||
"Hive Schema should match the table schema + partition field");
|
||||
} else {
|
||||
// The data generated and schema in the data file do not have metadata columns, so we need a separate check.
|
||||
assertEquals(hiveClient.getTableSchema(roTableName).size(),
|
||||
assertEquals(hiveClient.getMetastoreSchema(roTableName).size(),
|
||||
SchemaTestUtil.getSimpleSchema().getFields().size() + getPartitionFieldSize(),
|
||||
"Hive Schema should match the table schema + partition field");
|
||||
}
|
||||
|
||||
assertEquals(5, hiveClient.scanTablePartitions(roTableName).size(),
|
||||
assertEquals(5, hiveClient.getAllPartitions(roTableName).size(),
|
||||
"Table partitions should match the number of partitions we wrote");
|
||||
assertEquals(deltaCommitTime, hiveClient.getLastCommitTimeSynced(roTableName).get(),
|
||||
"The last commit that was synced should be updated in the TBLPROPERTIES");
|
||||
@@ -616,18 +628,18 @@ public class TestHiveSyncTool {
|
||||
reSyncHiveTable();
|
||||
|
||||
if (useSchemaFromCommitMetadata) {
|
||||
assertEquals(hiveClient.getTableSchema(roTableName).size(),
|
||||
assertEquals(hiveClient.getMetastoreSchema(roTableName).size(),
|
||||
SchemaTestUtil.getEvolvedSchema().getFields().size() + getPartitionFieldSize()
|
||||
+ HoodieRecord.HOODIE_META_COLUMNS.size(),
|
||||
"Hive Schema should match the evolved table schema + partition field");
|
||||
} else {
|
||||
// The data generated and schema in the data file do not have metadata columns, so we need a separate check.
|
||||
assertEquals(hiveClient.getTableSchema(roTableName).size(),
|
||||
assertEquals(hiveClient.getMetastoreSchema(roTableName).size(),
|
||||
SchemaTestUtil.getEvolvedSchema().getFields().size() + getPartitionFieldSize(),
|
||||
"Hive Schema should match the evolved table schema + partition field");
|
||||
}
|
||||
// Sync should add the one partition
|
||||
assertEquals(6, hiveClient.scanTablePartitions(roTableName).size(),
|
||||
assertEquals(6, hiveClient.getAllPartitions(roTableName).size(),
|
||||
"The 2 partitions we wrote should be added to hive");
|
||||
assertEquals(deltaCommitTime2, hiveClient.getLastCommitTimeSynced(roTableName).get(),
|
||||
"The last commit that was synced should be 103");
|
||||
@@ -636,7 +648,7 @@ public class TestHiveSyncTool {
|
||||
@ParameterizedTest
|
||||
@MethodSource("syncModeAndSchemaFromCommitMetadata")
|
||||
public void testSyncMergeOnReadRT(boolean useSchemaFromCommitMetadata, String syncMode) throws Exception {
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode);
|
||||
hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
|
||||
String instantTime = "100";
|
||||
String deltaCommitTime = "101";
|
||||
String snapshotTableName = HiveTestUtil.TABLE_NAME + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE;
|
||||
@@ -654,18 +666,18 @@ public class TestHiveSyncTool {
|
||||
+ " should exist after sync completes");
|
||||
|
||||
if (useSchemaFromCommitMetadata) {
|
||||
assertEquals(hiveClient.getTableSchema(snapshotTableName).size(),
|
||||
assertEquals(hiveClient.getMetastoreSchema(snapshotTableName).size(),
|
||||
SchemaTestUtil.getSimpleSchema().getFields().size() + getPartitionFieldSize()
|
||||
+ HoodieRecord.HOODIE_META_COLUMNS.size(),
|
||||
"Hive Schema should match the table schema + partition field");
|
||||
} else {
|
||||
// The data generated and schema in the data file do not have metadata columns, so we need a separate check.
|
||||
assertEquals(hiveClient.getTableSchema(snapshotTableName).size(),
|
||||
assertEquals(hiveClient.getMetastoreSchema(snapshotTableName).size(),
|
||||
SchemaTestUtil.getSimpleSchema().getFields().size() + getPartitionFieldSize(),
|
||||
"Hive Schema should match the table schema + partition field");
|
||||
}
|
||||
|
||||
assertEquals(5, hiveClient.scanTablePartitions(snapshotTableName).size(),
|
||||
assertEquals(5, hiveClient.getAllPartitions(snapshotTableName).size(),
|
||||
"Table partitions should match the number of partitions we wrote");
|
||||
assertEquals(deltaCommitTime, hiveClient.getLastCommitTimeSynced(snapshotTableName).get(),
|
||||
"The last commit that was synced should be updated in the TBLPROPERTIES");
|
||||
@@ -682,18 +694,18 @@ public class TestHiveSyncTool {
|
||||
reSyncHiveTable();
|
||||
|
||||
if (useSchemaFromCommitMetadata) {
|
||||
assertEquals(hiveClient.getTableSchema(snapshotTableName).size(),
|
||||
assertEquals(hiveClient.getMetastoreSchema(snapshotTableName).size(),
|
||||
SchemaTestUtil.getEvolvedSchema().getFields().size() + getPartitionFieldSize()
|
||||
+ HoodieRecord.HOODIE_META_COLUMNS.size(),
|
||||
"Hive Schema should match the evolved table schema + partition field");
|
||||
} else {
|
||||
// The data generated and schema in the data file do not have metadata columns, so we need a separate check.
|
||||
assertEquals(hiveClient.getTableSchema(snapshotTableName).size(),
|
||||
assertEquals(hiveClient.getMetastoreSchema(snapshotTableName).size(),
|
||||
SchemaTestUtil.getEvolvedSchema().getFields().size() + getPartitionFieldSize(),
|
||||
"Hive Schema should match the evolved table schema + partition field");
|
||||
}
|
||||
// Sync should add the one partition
|
||||
assertEquals(6, hiveClient.scanTablePartitions(snapshotTableName).size(),
|
||||
assertEquals(6, hiveClient.getAllPartitions(snapshotTableName).size(),
|
||||
"The 2 partitions we wrote should be added to hive");
|
||||
assertEquals(deltaCommitTime2, hiveClient.getLastCommitTimeSynced(snapshotTableName).get(),
|
||||
"The last commit that was synced should be 103");
|
||||
@@ -702,12 +714,12 @@ public class TestHiveSyncTool {
|
||||
@ParameterizedTest
|
||||
@MethodSource("syncMode")
|
||||
public void testMultiPartitionKeySync(String syncMode) throws Exception {
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode);
|
||||
hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
|
||||
String instantTime = "100";
|
||||
HiveTestUtil.createCOWTable(instantTime, 5, true);
|
||||
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), MultiPartKeysValueExtractor.class.getCanonicalName());
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "year,month,day");
|
||||
hiveSyncProps.setProperty(META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), MultiPartKeysValueExtractor.class.getCanonicalName());
|
||||
hiveSyncProps.setProperty(META_SYNC_PARTITION_FIELDS.key(), "year,month,day");
|
||||
|
||||
HiveTestUtil.getCreatedTablesSet().add(HiveTestUtil.DB_NAME + "." + HiveTestUtil.TABLE_NAME);
|
||||
|
||||
@@ -718,15 +730,15 @@ public class TestHiveSyncTool {
|
||||
reSyncHiveTable();
|
||||
assertTrue(hiveClient.tableExists(HiveTestUtil.TABLE_NAME),
|
||||
"Table " + HiveTestUtil.TABLE_NAME + " should exist after sync completes");
|
||||
assertEquals(hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).size(),
|
||||
hiveClient.getDataSchema().getColumns().size() + 3,
|
||||
assertEquals(hiveClient.getMetastoreSchema(HiveTestUtil.TABLE_NAME).size(),
|
||||
hiveClient.getStorageSchema().getColumns().size() + 3,
|
||||
"Hive Schema should match the table schema + partition fields");
|
||||
assertEquals(5, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(),
|
||||
assertEquals(5, hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME).size(),
|
||||
"Table partitions should match the number of partitions we wrote");
|
||||
assertEquals(instantTime, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(),
|
||||
"The last commit that was synced should be updated in the TBLPROPERTIES");
|
||||
|
||||
// HoodieHiveClient had a bug where partition vals were sorted
|
||||
// HoodieHiveSyncClient had a bug where partition vals were sorted
|
||||
// and stored as keys in a map. The following tests this particular case.
|
||||
// Now lets create partition "2010/01/02" and followed by "2010/02/01".
|
||||
String commitTime2 = "101";
|
||||
@@ -742,7 +754,7 @@ public class TestHiveSyncTool {
|
||||
|
||||
reSyncHiveTable();
|
||||
// Sync should add the one partition
|
||||
assertEquals(6, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(),
|
||||
assertEquals(6, hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME).size(),
|
||||
"Table partitions should match the number of partitions we wrote");
|
||||
assertEquals(commitTime2, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(),
|
||||
"The last commit that was synced should be 101");
|
||||
@@ -756,10 +768,10 @@ public class TestHiveSyncTool {
|
||||
reSyncHiveTable();
|
||||
assertTrue(hiveClient.tableExists(HiveTestUtil.TABLE_NAME),
|
||||
"Table " + HiveTestUtil.TABLE_NAME + " should exist after sync completes");
|
||||
assertEquals(hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).size(),
|
||||
hiveClient.getDataSchema().getColumns().size() + 3,
|
||||
assertEquals(hiveClient.getMetastoreSchema(HiveTestUtil.TABLE_NAME).size(),
|
||||
hiveClient.getStorageSchema().getColumns().size() + 3,
|
||||
"Hive Schema should match the table schema + partition fields");
|
||||
assertEquals(7, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(),
|
||||
assertEquals(7, hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME).size(),
|
||||
"Table partitions should match the number of partitions we wrote");
|
||||
assertEquals(commitTime3, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(),
|
||||
"The last commit that was synced should be updated in the TBLPROPERTIES");
|
||||
@@ -769,7 +781,7 @@ public class TestHiveSyncTool {
|
||||
@ParameterizedTest
|
||||
@MethodSource("syncMode")
|
||||
public void testDropPartitionKeySync(String syncMode) throws Exception {
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode);
|
||||
hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
|
||||
|
||||
String instantTime = "100";
|
||||
HiveTestUtil.createCOWTable(instantTime, 1, true);
|
||||
@@ -782,21 +794,21 @@ public class TestHiveSyncTool {
|
||||
|
||||
assertTrue(hiveClient.tableExists(HiveTestUtil.TABLE_NAME),
|
||||
"Table " + HiveTestUtil.TABLE_NAME + " should exist after sync completes");
|
||||
assertEquals(hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).size(),
|
||||
hiveClient.getDataSchema().getColumns().size() + 1,
|
||||
assertEquals(hiveClient.getMetastoreSchema(HiveTestUtil.TABLE_NAME).size(),
|
||||
hiveClient.getStorageSchema().getColumns().size() + 1,
|
||||
"Hive Schema should match the table schema + partition field");
|
||||
assertEquals(1, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(),
|
||||
assertEquals(1, hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME).size(),
|
||||
"Table partitions should match the number of partitions we wrote");
|
||||
assertEquals(instantTime, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(),
|
||||
"The last commit that was synced should be updated in the TBLPROPERTIES");
|
||||
|
||||
// Adding of new partitions
|
||||
List<String> newPartition = Arrays.asList("2050/01/01");
|
||||
hiveClient.addPartitionsToTable(HiveTestUtil.TABLE_NAME, Arrays.asList());
|
||||
assertEquals(1, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(),
|
||||
List<String> newPartition = Collections.singletonList("2050/01/01");
|
||||
hiveClient.addPartitionsToTable(HiveTestUtil.TABLE_NAME, Collections.emptyList());
|
||||
assertEquals(1, hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME).size(),
|
||||
"No new partition should be added");
|
||||
hiveClient.addPartitionsToTable(HiveTestUtil.TABLE_NAME, newPartition);
|
||||
assertEquals(2, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(),
|
||||
assertEquals(2, hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME).size(),
|
||||
"New partition should be added");
|
||||
|
||||
reSyncHiveTable();
|
||||
@@ -805,7 +817,7 @@ public class TestHiveSyncTool {
|
||||
ddlExecutor.runSQL("ALTER TABLE `" + HiveTestUtil.TABLE_NAME
|
||||
+ "` DROP PARTITION (`datestr`='2050-01-01')");
|
||||
|
||||
List<Partition> hivePartitions = hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME);
|
||||
List<Partition> hivePartitions = hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME);
|
||||
assertEquals(1, hivePartitions.size(),
|
||||
"Table should have 1 partition because of the drop 1 partition");
|
||||
}
|
||||
@@ -813,7 +825,7 @@ public class TestHiveSyncTool {
|
||||
@ParameterizedTest
|
||||
@MethodSource("syncMode")
|
||||
public void testDropPartition(String syncMode) throws Exception {
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode);
|
||||
hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
|
||||
|
||||
String instantTime = "100";
|
||||
HiveTestUtil.createCOWTable(instantTime, 1, true);
|
||||
@@ -825,15 +837,15 @@ public class TestHiveSyncTool {
|
||||
reSyncHiveTable();
|
||||
assertTrue(hiveClient.tableExists(HiveTestUtil.TABLE_NAME),
|
||||
"Table " + HiveTestUtil.TABLE_NAME + " should exist after sync completes");
|
||||
assertEquals(hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).size(),
|
||||
hiveClient.getDataSchema().getColumns().size() + 1,
|
||||
assertEquals(hiveClient.getMetastoreSchema(HiveTestUtil.TABLE_NAME).size(),
|
||||
hiveClient.getStorageSchema().getColumns().size() + 1,
|
||||
"Hive Schema should match the table schema + partition field");
|
||||
List<Partition> partitions = hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME);
|
||||
List<Partition> partitions = hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME);
|
||||
assertEquals(1, partitions.size(),
|
||||
"Table partitions should match the number of partitions we wrote");
|
||||
assertEquals(instantTime, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(),
|
||||
"The last commit that was synced should be updated in the TBLPROPERTIES");
|
||||
String partitiontoDelete = partitions.get(0).getValues().get(0).replace("-","/");
|
||||
String partitiontoDelete = partitions.get(0).getValues().get(0).replace("-", "/");
|
||||
// create a replace commit to delete current partitions+
|
||||
HiveTestUtil.createReplaceCommit("101", partitiontoDelete, WriteOperationType.DELETE_PARTITION, true, true);
|
||||
|
||||
@@ -841,7 +853,7 @@ public class TestHiveSyncTool {
|
||||
reinitHiveSyncClient();
|
||||
reSyncHiveTable();
|
||||
|
||||
List<Partition> hivePartitions = hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME);
|
||||
List<Partition> hivePartitions = hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME);
|
||||
assertEquals(0, hivePartitions.size(),
|
||||
"Table should have 0 partition because of the drop the only one partition");
|
||||
}
|
||||
@@ -849,12 +861,12 @@ public class TestHiveSyncTool {
|
||||
@ParameterizedTest
|
||||
@MethodSource("syncMode")
|
||||
public void testNonPartitionedSync(String syncMode) throws Exception {
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode);
|
||||
hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
|
||||
String instantTime = "100";
|
||||
HiveTestUtil.createCOWTable(instantTime, 5, true);
|
||||
// Set partition value extractor to NonPartitionedExtractor
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), NonPartitionedExtractor.class.getCanonicalName());
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "year, month, day");
|
||||
hiveSyncProps.setProperty(META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), NonPartitionedExtractor.class.getCanonicalName());
|
||||
hiveSyncProps.setProperty(META_SYNC_PARTITION_FIELDS.key(), "");
|
||||
|
||||
HiveTestUtil.getCreatedTablesSet().add(HiveTestUtil.DB_NAME + "." + HiveTestUtil.TABLE_NAME);
|
||||
|
||||
@@ -865,17 +877,17 @@ public class TestHiveSyncTool {
|
||||
reSyncHiveTable();
|
||||
assertTrue(hiveClient.tableExists(HiveTestUtil.TABLE_NAME),
|
||||
"Table " + HiveTestUtil.TABLE_NAME + " should exist after sync completes");
|
||||
assertEquals(hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).size(),
|
||||
hiveClient.getDataSchema().getColumns().size(),
|
||||
assertEquals(hiveClient.getMetastoreSchema(HiveTestUtil.TABLE_NAME).size(),
|
||||
hiveClient.getStorageSchema().getColumns().size(),
|
||||
"Hive Schema should match the table schema,ignoring the partition fields");
|
||||
assertEquals(0, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(),
|
||||
assertEquals(0, hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME).size(),
|
||||
"Table should not have partitions because of the NonPartitionedExtractor");
|
||||
}
|
||||
|
||||
@ParameterizedTest
|
||||
@MethodSource("syncMode")
|
||||
public void testReadSchemaForMOR(String syncMode) throws Exception {
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode);
|
||||
hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
|
||||
String commitTime = "100";
|
||||
String snapshotTableName = HiveTestUtil.TABLE_NAME + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE;
|
||||
HiveTestUtil.createMORTable(commitTime, "", 5, false, true);
|
||||
@@ -891,11 +903,11 @@ public class TestHiveSyncTool {
|
||||
+ " should exist after sync completes");
|
||||
|
||||
// Schema being read from compacted base files
|
||||
assertEquals(hiveClient.getTableSchema(snapshotTableName).size(),
|
||||
assertEquals(hiveClient.getMetastoreSchema(snapshotTableName).size(),
|
||||
SchemaTestUtil.getSimpleSchema().getFields().size() + getPartitionFieldSize()
|
||||
+ HoodieRecord.HOODIE_META_COLUMNS.size(),
|
||||
"Hive Schema should match the table schema + partition field");
|
||||
assertEquals(5, hiveClient.scanTablePartitions(snapshotTableName).size(), "Table partitions should match the number of partitions we wrote");
|
||||
assertEquals(5, hiveClient.getAllPartitions(snapshotTableName).size(), "Table partitions should match the number of partitions we wrote");
|
||||
|
||||
// Now lets create more partitions and these are the only ones which needs to be synced
|
||||
ZonedDateTime dateTime = ZonedDateTime.now().plusDays(6);
|
||||
@@ -908,28 +920,28 @@ public class TestHiveSyncTool {
|
||||
reSyncHiveTable();
|
||||
|
||||
// Schema being read from the log filesTestHiveSyncTool
|
||||
assertEquals(hiveClient.getTableSchema(snapshotTableName).size(),
|
||||
assertEquals(hiveClient.getMetastoreSchema(snapshotTableName).size(),
|
||||
SchemaTestUtil.getEvolvedSchema().getFields().size() + getPartitionFieldSize()
|
||||
+ HoodieRecord.HOODIE_META_COLUMNS.size(),
|
||||
"Hive Schema should match the evolved table schema + partition field");
|
||||
// Sync should add the one partition
|
||||
assertEquals(6, hiveClient.scanTablePartitions(snapshotTableName).size(), "The 1 partition we wrote should be added to hive");
|
||||
assertEquals(6, hiveClient.getAllPartitions(snapshotTableName).size(), "The 1 partition we wrote should be added to hive");
|
||||
assertEquals(deltaCommitTime2, hiveClient.getLastCommitTimeSynced(snapshotTableName).get(),
|
||||
"The last commit that was synced should be 103");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testConnectExceptionIgnoreConfigSet() throws IOException, URISyntaxException, HiveException, MetaException {
|
||||
public void testConnectExceptionIgnoreConfigSet() throws IOException, URISyntaxException {
|
||||
String instantTime = "100";
|
||||
HiveTestUtil.createCOWTable(instantTime, 5, false);
|
||||
reinitHiveSyncClient();
|
||||
HoodieHiveClient prevHiveClient = hiveClient;
|
||||
HoodieHiveSyncClient prevHiveClient = hiveClient;
|
||||
assertFalse(hiveClient.tableExists(HiveTestUtil.TABLE_NAME),
|
||||
"Table " + HiveTestUtil.TABLE_NAME + " should not exist initially");
|
||||
|
||||
// Lets do the sync
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_IGNORE_EXCEPTIONS.key(), "true");
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_URL.key(), hiveSyncProps.getString(HiveSyncConfig.HIVE_URL.key())
|
||||
hiveSyncProps.setProperty(HIVE_IGNORE_EXCEPTIONS.key(), "true");
|
||||
hiveSyncProps.setProperty(HIVE_URL.key(), hiveSyncProps.getString(HIVE_URL.key())
|
||||
.replace(String.valueOf(HiveTestUtil.hiveTestService.getHiveServerPort()), String.valueOf(NetworkTestUtils.nextFreePort())));
|
||||
reinitHiveSyncClient();
|
||||
reSyncHiveTable();
|
||||
@@ -939,12 +951,12 @@ public class TestHiveSyncTool {
|
||||
"Table " + HiveTestUtil.TABLE_NAME + " should not exist initially");
|
||||
}
|
||||
|
||||
private void verifyOldParquetFileTest(HoodieHiveClient hiveClient, String emptyCommitTime) throws Exception {
|
||||
private void verifyOldParquetFileTest(HoodieHiveSyncClient hiveClient, String emptyCommitTime) throws Exception {
|
||||
assertTrue(hiveClient.tableExists(HiveTestUtil.TABLE_NAME), "Table " + HiveTestUtil.TABLE_NAME + " should exist after sync completes");
|
||||
assertEquals(hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).size(),
|
||||
hiveClient.getDataSchema().getColumns().size() + 1,
|
||||
assertEquals(hiveClient.getMetastoreSchema(HiveTestUtil.TABLE_NAME).size(),
|
||||
hiveClient.getStorageSchema().getColumns().size() + 1,
|
||||
"Hive Schema should match the table schema + partition field");
|
||||
assertEquals(1, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(), "Table partitions should match the number of partitions we wrote");
|
||||
assertEquals(1, hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME).size(), "Table partitions should match the number of partitions we wrote");
|
||||
assertEquals(emptyCommitTime,
|
||||
hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(), "The last commit that was synced should be updated in the TBLPROPERTIES");
|
||||
|
||||
@@ -952,19 +964,19 @@ public class TestHiveSyncTool {
|
||||
Schema schema = SchemaTestUtil.getSimpleSchema();
|
||||
for (Field field : schema.getFields()) {
|
||||
assertEquals(field.schema().getType().getName(),
|
||||
hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).get(field.name()).toLowerCase(),
|
||||
hiveClient.getMetastoreSchema(HiveTestUtil.TABLE_NAME).get(field.name()).toLowerCase(),
|
||||
String.format("Hive Schema Field %s was added", field));
|
||||
}
|
||||
assertEquals("string",
|
||||
hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).get("datestr").toLowerCase(), "Hive Schema Field datestr was added");
|
||||
hiveClient.getMetastoreSchema(HiveTestUtil.TABLE_NAME).get("datestr").toLowerCase(), "Hive Schema Field datestr was added");
|
||||
assertEquals(schema.getFields().size() + 1 + HoodieRecord.HOODIE_META_COLUMNS.size(),
|
||||
hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).size(), "Hive Schema fields size");
|
||||
hiveClient.getMetastoreSchema(HiveTestUtil.TABLE_NAME).size(), "Hive Schema fields size");
|
||||
}
|
||||
|
||||
@ParameterizedTest
|
||||
@MethodSource("syncMode")
|
||||
public void testPickingOlderParquetFileIfLatestIsEmptyCommit(String syncMode) throws Exception {
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode);
|
||||
hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
|
||||
final String commitTime = "100";
|
||||
HiveTestUtil.createCOWTable(commitTime, 1, true);
|
||||
HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
|
||||
@@ -983,7 +995,7 @@ public class TestHiveSyncTool {
|
||||
@ParameterizedTest
|
||||
@MethodSource("syncMode")
|
||||
public void testNotPickingOlderParquetFileWhenLatestCommitReadFails(String syncMode) throws Exception {
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode);
|
||||
hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
|
||||
final String commitTime = "100";
|
||||
HiveTestUtil.createCOWTable(commitTime, 1, true);
|
||||
HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
|
||||
@@ -1001,7 +1013,7 @@ public class TestHiveSyncTool {
|
||||
assertFalse(
|
||||
hiveClient.tableExists(HiveTestUtil.TABLE_NAME), "Table " + HiveTestUtil.TABLE_NAME + " should not exist initially");
|
||||
|
||||
HiveSyncTool tool = new HiveSyncTool(hiveSyncProps, getHiveConf(), fileSystem);
|
||||
HiveSyncTool tool = new HiveSyncTool(hiveSyncProps, getHiveConf());
|
||||
// now delete the evolved commit instant
|
||||
Path fullPath = new Path(HiveTestUtil.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/"
|
||||
+ hiveClient.getActiveTimeline().getInstants()
|
||||
@@ -1022,7 +1034,7 @@ public class TestHiveSyncTool {
|
||||
@ParameterizedTest
|
||||
@MethodSource("syncMode")
|
||||
public void testNotPickingOlderParquetFileWhenLatestCommitReadFailsForExistingTable(String syncMode) throws Exception {
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode);
|
||||
hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
|
||||
final String commitTime = "100";
|
||||
HiveTestUtil.createCOWTable(commitTime, 1, true);
|
||||
HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
|
||||
@@ -1067,7 +1079,7 @@ public class TestHiveSyncTool {
|
||||
@ParameterizedTest
|
||||
@MethodSource("syncMode")
|
||||
public void testTypeConverter(String syncMode) throws Exception {
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode);
|
||||
hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
|
||||
HiveTestUtil.createCOWTable("100", 5, true);
|
||||
// create database.
|
||||
ddlExecutor.runSQL("create database " + HiveTestUtil.DB_NAME);
|
||||
@@ -1082,24 +1094,24 @@ public class TestHiveSyncTool {
|
||||
// test one column in DECIMAL
|
||||
String oneTargetColumnSql = createTableSqlPrefix + "(`decimal_col` DECIMAL(9,8), `bigint_col` BIGINT)";
|
||||
ddlExecutor.runSQL(oneTargetColumnSql);
|
||||
System.out.println(hiveClient.getTableSchema(tableName));
|
||||
assertTrue(hiveClient.getTableSchema(tableName).containsValue("DECIMAL(9,8)"), errorMsg);
|
||||
System.out.println(hiveClient.getMetastoreSchema(tableName));
|
||||
assertTrue(hiveClient.getMetastoreSchema(tableName).containsValue("DECIMAL(9,8)"), errorMsg);
|
||||
ddlExecutor.runSQL(dropTableSql);
|
||||
|
||||
// test multiple columns in DECIMAL
|
||||
String multipleTargetColumnSql =
|
||||
createTableSqlPrefix + "(`decimal_col1` DECIMAL(9,8), `bigint_col` BIGINT, `decimal_col2` DECIMAL(7,4))";
|
||||
ddlExecutor.runSQL(multipleTargetColumnSql);
|
||||
System.out.println(hiveClient.getTableSchema(tableName));
|
||||
assertTrue(hiveClient.getTableSchema(tableName).containsValue("DECIMAL(9,8)")
|
||||
&& hiveClient.getTableSchema(tableName).containsValue("DECIMAL(7,4)"), errorMsg);
|
||||
System.out.println(hiveClient.getMetastoreSchema(tableName));
|
||||
assertTrue(hiveClient.getMetastoreSchema(tableName).containsValue("DECIMAL(9,8)")
|
||||
&& hiveClient.getMetastoreSchema(tableName).containsValue("DECIMAL(7,4)"), errorMsg);
|
||||
ddlExecutor.runSQL(dropTableSql);
|
||||
|
||||
// test no columns in DECIMAL
|
||||
String noTargetColumnsSql = createTableSqlPrefix + "(`bigint_col` BIGINT)";
|
||||
ddlExecutor.runSQL(noTargetColumnsSql);
|
||||
System.out.println(hiveClient.getTableSchema(tableName));
|
||||
assertTrue(hiveClient.getTableSchema(tableName).size() == 1 && hiveClient.getTableSchema(tableName)
|
||||
System.out.println(hiveClient.getMetastoreSchema(tableName));
|
||||
assertTrue(hiveClient.getMetastoreSchema(tableName).size() == 1 && hiveClient.getMetastoreSchema(tableName)
|
||||
.containsValue("BIGINT"), errorMsg);
|
||||
ddlExecutor.runSQL(dropTableSql);
|
||||
}
|
||||
@@ -1108,8 +1120,8 @@ public class TestHiveSyncTool {
|
||||
@MethodSource("syncMode")
|
||||
public void testSyncWithoutDiffs(String syncMode) throws Exception {
|
||||
String tableName = HiveTestUtil.TABLE_NAME + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE;
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode);
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_CONDITIONAL_SYNC.key(), "true");
|
||||
hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
|
||||
hiveSyncProps.setProperty(META_SYNC_CONDITIONAL_SYNC.key(), "true");
|
||||
|
||||
String commitTime0 = "100";
|
||||
String commitTime1 = "101";
|
||||
@@ -1136,11 +1148,11 @@ public class TestHiveSyncTool {
|
||||
}
|
||||
|
||||
private void reinitHiveSyncClient() {
|
||||
hiveSyncTool = new HiveSyncTool(hiveSyncProps, HiveTestUtil.getHiveConf(), fileSystem);
|
||||
hiveClient = (HoodieHiveClient) hiveSyncTool.hoodieHiveClient;
|
||||
hiveSyncTool = new HiveSyncTool(hiveSyncProps, HiveTestUtil.getHiveConf());
|
||||
hiveClient = (HoodieHiveSyncClient) hiveSyncTool.syncClient;
|
||||
}
|
||||
|
||||
private int getPartitionFieldSize() {
|
||||
return hiveSyncProps.getString(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key()).split(",").length;
|
||||
return hiveSyncProps.getString(META_SYNC_PARTITION_FIELDS.key()).split(",").length;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -21,7 +21,7 @@ package org.apache.hudi.hive.functional;
|
||||
|
||||
import org.apache.hudi.common.config.LockConfiguration;
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.hive.HiveMetastoreBasedLockProvider;
|
||||
import org.apache.hudi.hive.transaction.lock.HiveMetastoreBasedLockProvider;
|
||||
import org.apache.hudi.hive.testutils.HiveSyncFunctionalTestHarness;
|
||||
|
||||
import org.apache.hadoop.hive.metastore.api.DataOperationType;
|
||||
|
||||
@@ -0,0 +1,154 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.hive.replication;
|
||||
|
||||
import org.apache.hudi.hive.testutils.TestCluster;
|
||||
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.extension.RegisterExtension;
|
||||
|
||||
import static org.apache.hudi.hadoop.utils.HoodieHiveUtils.GLOBALLY_CONSISTENT_READ_TIMESTAMP;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_PASS;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USER;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USE_PRE_APACHE_INPUT_FORMAT;
|
||||
import static org.apache.hudi.hive.replication.GlobalHiveSyncConfig.META_SYNC_GLOBAL_REPLICATE_TIMESTAMP;
|
||||
import static org.apache.hudi.hive.replication.HiveSyncGlobalCommitParams.LOCAL_BASE_PATH;
|
||||
import static org.apache.hudi.hive.replication.HiveSyncGlobalCommitParams.LOCAL_HIVE_SERVER_JDBC_URLS;
|
||||
import static org.apache.hudi.hive.replication.HiveSyncGlobalCommitParams.LOCAL_HIVE_SITE_URI;
|
||||
import static org.apache.hudi.hive.replication.HiveSyncGlobalCommitParams.REMOTE_BASE_PATH;
|
||||
import static org.apache.hudi.hive.replication.HiveSyncGlobalCommitParams.REMOTE_HIVE_SERVER_JDBC_URLS;
|
||||
import static org.apache.hudi.hive.replication.HiveSyncGlobalCommitParams.REMOTE_HIVE_SITE_URI;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_ASSUME_DATE_PARTITION;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
public class TestHiveSyncGlobalCommitTool {
|
||||
|
||||
@RegisterExtension
|
||||
public static TestCluster localCluster = new TestCluster();
|
||||
@RegisterExtension
|
||||
public static TestCluster remoteCluster = new TestCluster();
|
||||
|
||||
private static final String DB_NAME = "foo";
|
||||
private static final String TBL_NAME = "bar";
|
||||
|
||||
private HiveSyncGlobalCommitParams getGlobalCommitConfig(String commitTime) throws Exception {
|
||||
HiveSyncGlobalCommitParams params = new HiveSyncGlobalCommitParams();
|
||||
params.loadedProps.setProperty(LOCAL_HIVE_SITE_URI, localCluster.getHiveSiteXmlLocation());
|
||||
params.loadedProps.setProperty(REMOTE_HIVE_SITE_URI, remoteCluster.getHiveSiteXmlLocation());
|
||||
params.loadedProps.setProperty(LOCAL_HIVE_SERVER_JDBC_URLS, localCluster.getHiveJdBcUrl());
|
||||
params.loadedProps.setProperty(REMOTE_HIVE_SERVER_JDBC_URLS, remoteCluster.getHiveJdBcUrl());
|
||||
params.loadedProps.setProperty(LOCAL_BASE_PATH, localCluster.tablePath(DB_NAME, TBL_NAME));
|
||||
params.loadedProps.setProperty(REMOTE_BASE_PATH, remoteCluster.tablePath(DB_NAME, TBL_NAME));
|
||||
params.loadedProps.setProperty(META_SYNC_GLOBAL_REPLICATE_TIMESTAMP.key(), commitTime);
|
||||
params.loadedProps.setProperty(HIVE_USER.key(), System.getProperty("user.name"));
|
||||
params.loadedProps.setProperty(HIVE_PASS.key(), "");
|
||||
params.loadedProps.setProperty(META_SYNC_DATABASE_NAME.key(), DB_NAME);
|
||||
params.loadedProps.setProperty(META_SYNC_TABLE_NAME.key(), TBL_NAME);
|
||||
params.loadedProps.setProperty(META_SYNC_BASE_PATH.key(), localCluster.tablePath(DB_NAME, TBL_NAME));
|
||||
params.loadedProps.setProperty(META_SYNC_ASSUME_DATE_PARTITION.key(), "true");
|
||||
params.loadedProps.setProperty(HIVE_USE_PRE_APACHE_INPUT_FORMAT.key(), "false");
|
||||
params.loadedProps.setProperty(META_SYNC_PARTITION_FIELDS.key(), "datestr");
|
||||
return params;
|
||||
}
|
||||
|
||||
private void compareEqualLastReplicatedTimeStamp(HiveSyncGlobalCommitParams config) throws Exception {
|
||||
assertEquals(localCluster.getHMSClient()
|
||||
.getTable(DB_NAME, TBL_NAME).getParameters()
|
||||
.get(GLOBALLY_CONSISTENT_READ_TIMESTAMP), remoteCluster.getHMSClient()
|
||||
.getTable(DB_NAME, TBL_NAME).getParameters()
|
||||
.get(GLOBALLY_CONSISTENT_READ_TIMESTAMP), "compare replicated timestamps");
|
||||
}
|
||||
|
||||
@BeforeEach
|
||||
public void setUp() throws Exception {
|
||||
localCluster.forceCreateDb(DB_NAME);
|
||||
remoteCluster.forceCreateDb(DB_NAME);
|
||||
localCluster.dfsCluster.getFileSystem().delete(new Path(localCluster.tablePath(DB_NAME, TBL_NAME)), true);
|
||||
remoteCluster.dfsCluster.getFileSystem().delete(new Path(remoteCluster.tablePath(DB_NAME, TBL_NAME)), true);
|
||||
}
|
||||
|
||||
@AfterEach
|
||||
public void clear() throws Exception {
|
||||
localCluster.getHMSClient().dropTable(DB_NAME, TBL_NAME);
|
||||
remoteCluster.getHMSClient().dropTable(DB_NAME, TBL_NAME);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHiveConfigShouldMatchClusterConf() throws Exception {
|
||||
String commitTime = "100";
|
||||
localCluster.createCOWTable(commitTime, 5, DB_NAME, TBL_NAME);
|
||||
// simulate drs
|
||||
remoteCluster.createCOWTable(commitTime, 5, DB_NAME, TBL_NAME);
|
||||
HiveSyncGlobalCommitParams params = getGlobalCommitConfig(commitTime);
|
||||
HiveSyncGlobalCommitTool tool = new HiveSyncGlobalCommitTool(params);
|
||||
ReplicationStateSync localReplicationStateSync = tool.getReplicatedState(false);
|
||||
ReplicationStateSync remoteReplicationStateSync = tool.getReplicatedState(true);
|
||||
assertEquals(localReplicationStateSync.globalHiveSyncTool.config.getHiveConf().get("hive.metastore.uris"),
|
||||
localCluster.getHiveConf().get("hive.metastore.uris"));
|
||||
assertEquals(remoteReplicationStateSync.globalHiveSyncTool.config.getHiveConf().get("hive.metastore.uris"),
|
||||
remoteCluster.getHiveConf().get("hive.metastore.uris"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBasicGlobalCommit() throws Exception {
|
||||
String commitTime = "100";
|
||||
localCluster.createCOWTable(commitTime, 5, DB_NAME, TBL_NAME);
|
||||
// simulate drs
|
||||
remoteCluster.createCOWTable(commitTime, 5, DB_NAME, TBL_NAME);
|
||||
HiveSyncGlobalCommitParams params = getGlobalCommitConfig(commitTime);
|
||||
HiveSyncGlobalCommitTool tool = new HiveSyncGlobalCommitTool(params);
|
||||
assertTrue(tool.commit());
|
||||
compareEqualLastReplicatedTimeStamp(params);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBasicRollback() throws Exception {
|
||||
String commitTime = "100";
|
||||
localCluster.createCOWTable(commitTime, 5, DB_NAME, TBL_NAME);
|
||||
// simulate drs
|
||||
remoteCluster.createCOWTable(commitTime, 5, DB_NAME, TBL_NAME);
|
||||
HiveSyncGlobalCommitParams params = getGlobalCommitConfig(commitTime);
|
||||
HiveSyncGlobalCommitTool tool = new HiveSyncGlobalCommitTool(params);
|
||||
assertFalse(localCluster.getHMSClient().tableExists(DB_NAME, TBL_NAME));
|
||||
assertFalse(remoteCluster.getHMSClient().tableExists(DB_NAME, TBL_NAME));
|
||||
// stop the remote cluster hive server to simulate cluster going down
|
||||
remoteCluster.stopHiveServer2();
|
||||
assertFalse(tool.commit());
|
||||
assertEquals(commitTime, localCluster.getHMSClient()
|
||||
.getTable(DB_NAME, TBL_NAME).getParameters()
|
||||
.get(GLOBALLY_CONSISTENT_READ_TIMESTAMP));
|
||||
assertTrue(tool.rollback()); // do a rollback
|
||||
assertNotEquals(commitTime, localCluster.getHMSClient()
|
||||
.getTable(DB_NAME, TBL_NAME).getParameters()
|
||||
.get(GLOBALLY_CONSISTENT_READ_TIMESTAMP));
|
||||
assertFalse(remoteCluster.getHMSClient().tableExists(DB_NAME, TBL_NAME));
|
||||
remoteCluster.startHiveServer2();
|
||||
}
|
||||
}
|
||||
@@ -24,7 +24,7 @@ import org.apache.hudi.common.model.HoodieTableType;
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.testutils.minicluster.ZookeeperTestService;
|
||||
import org.apache.hudi.hive.HiveSyncConfig;
|
||||
import org.apache.hudi.hive.HoodieHiveClient;
|
||||
import org.apache.hudi.hive.HoodieHiveSyncClient;
|
||||
import org.apache.hudi.hive.ddl.HiveQueryDDLExecutor;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
@@ -39,7 +39,17 @@ import org.junit.jupiter.api.io.TempDir;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.time.Instant;
|
||||
import java.util.Collections;
|
||||
import java.util.Properties;
|
||||
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_PASS;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_URL;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USER;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USE_PRE_APACHE_INPUT_FORMAT;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_ASSUME_DATE_PARTITION;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME;
|
||||
|
||||
public class HiveSyncFunctionalTestHarness {
|
||||
|
||||
@@ -79,42 +89,42 @@ public class HiveSyncFunctionalTestHarness {
|
||||
}
|
||||
|
||||
public HiveSyncConfig hiveSyncConf() throws IOException {
|
||||
HiveSyncConfig conf = new HiveSyncConfig();
|
||||
conf.jdbcUrl = hiveTestService.getJdbcHive2Url();
|
||||
conf.hiveUser = "";
|
||||
conf.hivePass = "";
|
||||
conf.databaseName = "hivesynctestdb";
|
||||
conf.tableName = "hivesynctesttable";
|
||||
conf.basePath = Files.createDirectories(tempDir.resolve("hivesynctestcase-" + Instant.now().toEpochMilli())).toUri().toString();
|
||||
conf.assumeDatePartitioning = true;
|
||||
conf.usePreApacheInputFormat = false;
|
||||
conf.partitionFields = Collections.singletonList("datestr");
|
||||
return conf;
|
||||
Properties props = new Properties();
|
||||
props.setProperty(HIVE_URL.key(), hiveTestService.getJdbcHive2Url());
|
||||
props.setProperty(HIVE_USER.key(), "");
|
||||
props.setProperty(HIVE_PASS.key(), "");
|
||||
props.setProperty(META_SYNC_DATABASE_NAME.key(), "hivesynctestdb");
|
||||
props.setProperty(META_SYNC_TABLE_NAME.key(), "hivesynctesttable");
|
||||
props.setProperty(META_SYNC_BASE_PATH.key(), Files.createDirectories(tempDir.resolve("hivesynctestcase-" + Instant.now().toEpochMilli())).toUri().toString());
|
||||
props.setProperty(META_SYNC_ASSUME_DATE_PARTITION.key(), "true");
|
||||
props.setProperty(HIVE_USE_PRE_APACHE_INPUT_FORMAT.key(), "false");
|
||||
props.setProperty(META_SYNC_PARTITION_FIELDS.key(), "datestr");
|
||||
return new HiveSyncConfig(props, hiveConf());
|
||||
}
|
||||
|
||||
public HoodieHiveClient hiveClient(HiveSyncConfig hiveSyncConfig) throws IOException {
|
||||
public HoodieHiveSyncClient hiveClient(HiveSyncConfig hiveSyncConfig) throws IOException {
|
||||
HoodieTableMetaClient.withPropertyBuilder()
|
||||
.setTableType(HoodieTableType.COPY_ON_WRITE)
|
||||
.setTableName(hiveSyncConfig.tableName)
|
||||
.setTableName(hiveSyncConfig.getString(META_SYNC_TABLE_NAME))
|
||||
.setPayloadClass(HoodieAvroPayload.class)
|
||||
.initTable(hadoopConf, hiveSyncConfig.basePath);
|
||||
return new HoodieHiveClient(hiveSyncConfig, hiveConf(), fs());
|
||||
.initTable(hadoopConf, hiveSyncConfig.getString(META_SYNC_BASE_PATH));
|
||||
return new HoodieHiveSyncClient(hiveSyncConfig);
|
||||
}
|
||||
|
||||
public void dropTables(String database, String... tables) throws IOException, HiveException, MetaException {
|
||||
HiveSyncConfig hiveSyncConfig = hiveSyncConf();
|
||||
hiveSyncConfig.databaseName = database;
|
||||
hiveSyncConfig.setValue(META_SYNC_DATABASE_NAME, database);
|
||||
for (String table : tables) {
|
||||
hiveSyncConfig.tableName = table;
|
||||
new HiveQueryDDLExecutor(hiveSyncConfig, fs(), hiveConf()).runSQL("drop table if exists " + table);
|
||||
hiveSyncConfig.setValue(META_SYNC_TABLE_NAME, table);
|
||||
new HiveQueryDDLExecutor(hiveSyncConfig).runSQL("drop table if exists " + table);
|
||||
}
|
||||
}
|
||||
|
||||
public void dropDatabases(String... databases) throws IOException, HiveException, MetaException {
|
||||
HiveSyncConfig hiveSyncConfig = hiveSyncConf();
|
||||
for (String database : databases) {
|
||||
hiveSyncConfig.databaseName = database;
|
||||
new HiveQueryDDLExecutor(hiveSyncConfig, fs(), hiveConf()).runSQL("drop database if exists " + database);
|
||||
hiveSyncConfig.setValue(META_SYNC_DATABASE_NAME, database);
|
||||
new HiveQueryDDLExecutor(hiveSyncConfig).runSQL("drop database if exists " + database);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -84,6 +84,16 @@ import java.util.Map.Entry;
|
||||
import java.util.Set;
|
||||
import java.util.UUID;
|
||||
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_BATCH_SYNC_PARTITION_NUM;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_PASS;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_URL;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USER;
|
||||
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USE_PRE_APACHE_INPUT_FORMAT;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_ASSUME_DATE_PARTITION;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME;
|
||||
import static org.junit.jupiter.api.Assertions.fail;
|
||||
|
||||
@SuppressWarnings("SameParameterValue")
|
||||
@@ -120,21 +130,21 @@ public class HiveTestUtil {
|
||||
basePath = Files.createTempDirectory("hivesynctest" + Instant.now().toEpochMilli()).toUri().toString();
|
||||
|
||||
hiveSyncProps = new TypedProperties();
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_URL.key(), hiveTestService.getJdbcHive2Url());
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_USER.key(), "");
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_PASS.key(), "");
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_DATABASE_NAME.key(), DB_NAME);
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_TABLE_NAME.key(), TABLE_NAME);
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_BASE_PATH.key(), basePath);
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_ASSUME_DATE_PARTITION.key(), "true");
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_USE_PRE_APACHE_INPUT_FORMAT.key(), "false");
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "datestr");
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_BATCH_SYNC_PARTITION_NUM.key(), "3");
|
||||
hiveSyncProps.setProperty(HIVE_URL.key(), hiveTestService.getJdbcHive2Url());
|
||||
hiveSyncProps.setProperty(HIVE_USER.key(), "");
|
||||
hiveSyncProps.setProperty(HIVE_PASS.key(), "");
|
||||
hiveSyncProps.setProperty(META_SYNC_DATABASE_NAME.key(), DB_NAME);
|
||||
hiveSyncProps.setProperty(META_SYNC_TABLE_NAME.key(), TABLE_NAME);
|
||||
hiveSyncProps.setProperty(META_SYNC_BASE_PATH.key(), basePath);
|
||||
hiveSyncProps.setProperty(META_SYNC_ASSUME_DATE_PARTITION.key(), "true");
|
||||
hiveSyncProps.setProperty(HIVE_USE_PRE_APACHE_INPUT_FORMAT.key(), "false");
|
||||
hiveSyncProps.setProperty(META_SYNC_PARTITION_FIELDS.key(), "datestr");
|
||||
hiveSyncProps.setProperty(HIVE_BATCH_SYNC_PARTITION_NUM.key(), "3");
|
||||
|
||||
hiveSyncConfig = new HiveSyncConfig(hiveSyncProps);
|
||||
hiveSyncConfig = new HiveSyncConfig(hiveSyncProps, configuration);
|
||||
|
||||
dtfOut = DateTimeFormatter.ofPattern("yyyy/MM/dd");
|
||||
ddlExecutor = new HiveQueryDDLExecutor(hiveSyncConfig, fileSystem, getHiveConf());
|
||||
ddlExecutor = new HiveQueryDDLExecutor(hiveSyncConfig);
|
||||
|
||||
clear();
|
||||
}
|
||||
|
||||
@@ -1,276 +0,0 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.sync.common;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.sql.Statement;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hudi.common.engine.HoodieLocalEngineContext;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
||||
import org.apache.hudi.common.model.HoodieTableType;
|
||||
import org.apache.hudi.common.model.WriteOperationType;
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.table.TableSchemaResolver;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.common.table.timeline.TimelineUtils;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.ValidationUtils;
|
||||
import org.apache.hudi.metadata.HoodieTableMetadataUtil;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.parquet.schema.MessageType;
|
||||
|
||||
public abstract class AbstractSyncHoodieClient implements AutoCloseable {
|
||||
|
||||
private static final Logger LOG = LogManager.getLogger(AbstractSyncHoodieClient.class);
|
||||
|
||||
public static final String HOODIE_LAST_COMMIT_TIME_SYNC = "last_commit_time_sync";
|
||||
public static final TypeConverter TYPE_CONVERTOR = new TypeConverter() {};
|
||||
|
||||
protected final HoodieTableMetaClient metaClient;
|
||||
protected final HoodieTableType tableType;
|
||||
protected final FileSystem fs;
|
||||
private final String basePath;
|
||||
private final boolean assumeDatePartitioning;
|
||||
private final boolean useFileListingFromMetadata;
|
||||
private final boolean withOperationField;
|
||||
|
||||
@Deprecated
|
||||
public AbstractSyncHoodieClient(String basePath, boolean assumeDatePartitioning, boolean useFileListingFromMetadata,
|
||||
boolean verifyMetadataFileListing, boolean withOperationField, FileSystem fs) {
|
||||
this(basePath, assumeDatePartitioning, useFileListingFromMetadata, withOperationField, fs);
|
||||
}
|
||||
|
||||
public AbstractSyncHoodieClient(String basePath, boolean assumeDatePartitioning, boolean useFileListingFromMetadata,
|
||||
boolean withOperationField, FileSystem fs) {
|
||||
this.metaClient = HoodieTableMetaClient.builder().setConf(fs.getConf()).setBasePath(basePath).setLoadActiveTimelineOnLoad(true).build();
|
||||
this.tableType = metaClient.getTableType();
|
||||
this.basePath = basePath;
|
||||
this.assumeDatePartitioning = assumeDatePartitioning;
|
||||
this.useFileListingFromMetadata = useFileListingFromMetadata;
|
||||
this.withOperationField = withOperationField;
|
||||
this.fs = fs;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create the table.
|
||||
* @param tableName The table name.
|
||||
* @param storageSchema The table schema.
|
||||
* @param inputFormatClass The input format class of this table.
|
||||
* @param outputFormatClass The output format class of this table.
|
||||
* @param serdeClass The serde class of this table.
|
||||
* @param serdeProperties The serde properties of this table.
|
||||
* @param tableProperties The table properties for this table.
|
||||
*/
|
||||
public abstract void createTable(String tableName, MessageType storageSchema,
|
||||
String inputFormatClass, String outputFormatClass,
|
||||
String serdeClass, Map<String, String> serdeProperties,
|
||||
Map<String, String> tableProperties);
|
||||
|
||||
/**
|
||||
* @deprecated Use {@link #tableExists} instead.
|
||||
*/
|
||||
@Deprecated
|
||||
public abstract boolean doesTableExist(String tableName);
|
||||
|
||||
public abstract boolean tableExists(String tableName);
|
||||
|
||||
public abstract Option<String> getLastCommitTimeSynced(String tableName);
|
||||
|
||||
public abstract void updateLastCommitTimeSynced(String tableName);
|
||||
|
||||
public abstract Option<String> getLastReplicatedTime(String tableName);
|
||||
|
||||
public abstract void updateLastReplicatedTimeStamp(String tableName, String timeStamp);
|
||||
|
||||
public abstract void deleteLastReplicatedTimeStamp(String tableName);
|
||||
|
||||
public abstract void addPartitionsToTable(String tableName, List<String> partitionsToAdd);
|
||||
|
||||
public abstract void updatePartitionsToTable(String tableName, List<String> changedPartitions);
|
||||
|
||||
public abstract void dropPartitions(String tableName, List<String> partitionsToDrop);
|
||||
|
||||
public void updateTableProperties(String tableName, Map<String, String> tableProperties) {}
|
||||
|
||||
public abstract Map<String, String> getTableSchema(String tableName);
|
||||
|
||||
public HoodieTableType getTableType() {
|
||||
return tableType;
|
||||
}
|
||||
|
||||
public String getBasePath() {
|
||||
return metaClient.getBasePath();
|
||||
}
|
||||
|
||||
public FileSystem getFs() {
|
||||
return fs;
|
||||
}
|
||||
|
||||
public boolean isBootstrap() {
|
||||
return metaClient.getTableConfig().getBootstrapBasePath().isPresent();
|
||||
}
|
||||
|
||||
public void closeQuietly(ResultSet resultSet, Statement stmt) {
|
||||
try {
|
||||
if (stmt != null) {
|
||||
stmt.close();
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
LOG.warn("Could not close the statement opened ", e);
|
||||
}
|
||||
|
||||
try {
|
||||
if (resultSet != null) {
|
||||
resultSet.close();
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
LOG.warn("Could not close the resultset opened ", e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the schema for a hoodie table. Depending on the type of table, try to read schema from commit metadata if
|
||||
* present, else fallback to reading from any file written in the latest commit. We will assume that the schema has
|
||||
* not changed within a single atomic write.
|
||||
*
|
||||
* @return Parquet schema for this table
|
||||
*/
|
||||
public MessageType getDataSchema() {
|
||||
try {
|
||||
return new TableSchemaResolver(metaClient).getTableParquetSchema();
|
||||
} catch (Exception e) {
|
||||
throw new HoodieSyncException("Failed to read data schema", e);
|
||||
}
|
||||
}
|
||||
|
||||
public boolean isDropPartition() {
|
||||
try {
|
||||
Option<HoodieCommitMetadata> hoodieCommitMetadata = HoodieTableMetadataUtil.getLatestCommitMetadata(metaClient);
|
||||
|
||||
if (hoodieCommitMetadata.isPresent()
|
||||
&& WriteOperationType.DELETE_PARTITION.equals(hoodieCommitMetadata.get().getOperationType())) {
|
||||
return true;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw new HoodieSyncException("Failed to get commit metadata", e);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
@SuppressWarnings("OptionalUsedAsFieldOrParameterType")
|
||||
public List<String> getPartitionsWrittenToSince(Option<String> lastCommitTimeSynced) {
|
||||
if (!lastCommitTimeSynced.isPresent()) {
|
||||
LOG.info("Last commit time synced is not known, listing all partitions in " + basePath + ",FS :" + fs);
|
||||
HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf());
|
||||
return FSUtils.getAllPartitionPaths(engineContext, basePath, useFileListingFromMetadata, assumeDatePartitioning);
|
||||
} else {
|
||||
LOG.info("Last commit time synced is " + lastCommitTimeSynced.get() + ", Getting commits since then");
|
||||
return TimelineUtils.getPartitionsWritten(metaClient.getActiveTimeline().getCommitsTimeline()
|
||||
.findInstantsAfter(lastCommitTimeSynced.get(), Integer.MAX_VALUE));
|
||||
}
|
||||
}
|
||||
|
||||
public abstract static class TypeConverter implements Serializable {
|
||||
|
||||
static final String DEFAULT_TARGET_TYPE = "DECIMAL";
|
||||
|
||||
protected String targetType;
|
||||
|
||||
public TypeConverter() {
|
||||
this.targetType = DEFAULT_TARGET_TYPE;
|
||||
}
|
||||
|
||||
public TypeConverter(String targetType) {
|
||||
ValidationUtils.checkArgument(Objects.nonNull(targetType));
|
||||
this.targetType = targetType;
|
||||
}
|
||||
|
||||
public void doConvert(ResultSet resultSet, Map<String, String> schema) throws SQLException {
|
||||
schema.put(getColumnName(resultSet), targetType.equalsIgnoreCase(getColumnType(resultSet))
|
||||
? convert(resultSet) : getColumnType(resultSet));
|
||||
}
|
||||
|
||||
public String convert(ResultSet resultSet) throws SQLException {
|
||||
String columnType = getColumnType(resultSet);
|
||||
int columnSize = resultSet.getInt("COLUMN_SIZE");
|
||||
int decimalDigits = resultSet.getInt("DECIMAL_DIGITS");
|
||||
return columnType + String.format("(%s,%s)", columnSize, decimalDigits);
|
||||
}
|
||||
|
||||
public String getColumnName(ResultSet resultSet) throws SQLException {
|
||||
return resultSet.getString(4);
|
||||
}
|
||||
|
||||
public String getColumnType(ResultSet resultSet) throws SQLException {
|
||||
return resultSet.getString(6);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Read the schema from the log file on path.
|
||||
*/
|
||||
@SuppressWarnings("OptionalUsedAsFieldOrParameterType")
|
||||
private MessageType readSchemaFromLogFile(Option<HoodieInstant> lastCompactionCommitOpt, Path path) throws Exception {
|
||||
MessageType messageType = TableSchemaResolver.readSchemaFromLogFile(fs, path);
|
||||
// Fall back to read the schema from last compaction
|
||||
if (messageType == null) {
|
||||
LOG.info("Falling back to read the schema from last compaction " + lastCompactionCommitOpt);
|
||||
return new TableSchemaResolver(this.metaClient).readSchemaFromLastCompaction(lastCompactionCommitOpt);
|
||||
}
|
||||
return messageType;
|
||||
}
|
||||
|
||||
/**
|
||||
* Partition Event captures any partition that needs to be added or updated.
|
||||
*/
|
||||
public static class PartitionEvent {
|
||||
|
||||
public enum PartitionEventType {
|
||||
ADD, UPDATE, DROP
|
||||
}
|
||||
|
||||
public PartitionEventType eventType;
|
||||
public String storagePartition;
|
||||
|
||||
PartitionEvent(PartitionEventType eventType, String storagePartition) {
|
||||
this.eventType = eventType;
|
||||
this.storagePartition = storagePartition;
|
||||
}
|
||||
|
||||
public static PartitionEvent newPartitionAddEvent(String storagePartition) {
|
||||
return new PartitionEvent(PartitionEventType.ADD, storagePartition);
|
||||
}
|
||||
|
||||
public static PartitionEvent newPartitionUpdateEvent(String storagePartition) {
|
||||
return new PartitionEvent(PartitionEventType.UPDATE, storagePartition);
|
||||
}
|
||||
|
||||
public static PartitionEvent newPartitionDropEvent(String storagePartition) {
|
||||
return new PartitionEvent(PartitionEventType.DROP, storagePartition);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,196 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.sync.common;
|
||||
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.sync.common.model.FieldSchema;
|
||||
import org.apache.hudi.sync.common.model.Partition;
|
||||
|
||||
import org.apache.parquet.schema.MessageType;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public interface HoodieMetaSyncOperations {
|
||||
|
||||
String HOODIE_LAST_COMMIT_TIME_SYNC = "last_commit_time_sync";
|
||||
|
||||
/**
|
||||
* Create the table.
|
||||
*
|
||||
* @param tableName The table name.
|
||||
* @param storageSchema The table schema.
|
||||
* @param inputFormatClass The input format class of this table.
|
||||
* @param outputFormatClass The output format class of this table.
|
||||
* @param serdeClass The serde class of this table.
|
||||
* @param serdeProperties The serde properties of this table.
|
||||
* @param tableProperties The table properties for this table.
|
||||
*/
|
||||
default void createTable(String tableName,
|
||||
MessageType storageSchema,
|
||||
String inputFormatClass,
|
||||
String outputFormatClass,
|
||||
String serdeClass,
|
||||
Map<String, String> serdeProperties,
|
||||
Map<String, String> tableProperties) {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if table exists in metastore.
|
||||
*/
|
||||
default boolean tableExists(String tableName) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Drop table from metastore.
|
||||
*/
|
||||
default void dropTable(String tableName) {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Add partitions to the table in metastore.
|
||||
*/
|
||||
default void addPartitionsToTable(String tableName, List<String> partitionsToAdd) {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Update partitions to the table in metastore.
|
||||
*/
|
||||
default void updatePartitionsToTable(String tableName, List<String> changedPartitions) {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Drop partitions from the table in metastore.
|
||||
*/
|
||||
default void dropPartitions(String tableName, List<String> partitionsToDrop) {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Get all partitions for the table in the metastore.
|
||||
*/
|
||||
default List<Partition> getAllPartitions(String tableName) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if a database already exists in the metastore.
|
||||
*/
|
||||
default boolean databaseExists(String databaseName) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a database in the metastore.
|
||||
*/
|
||||
default void createDatabase(String databaseName) {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the schema from metastore.
|
||||
*/
|
||||
default Map<String, String> getMetastoreSchema(String tableName) {
|
||||
return Collections.emptyMap();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the schema from the Hudi table on storage.
|
||||
*/
|
||||
default MessageType getStorageSchema() {
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Update schema for the table in the metastore.
|
||||
*/
|
||||
default void updateTableSchema(String tableName, MessageType newSchema) {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the list of field schemas from metastore.
|
||||
*/
|
||||
default List<FieldSchema> getMetastoreFieldSchemas(String tableName) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the list of field schema from the Hudi table on storage.
|
||||
*/
|
||||
default List<FieldSchema> getStorageFieldSchemas() {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
/**
|
||||
* Update the field comments for table in metastore, by using the ones from storage.
|
||||
*/
|
||||
default void updateTableComments(String tableName, List<FieldSchema> fromMetastore, List<FieldSchema> fromStorage) {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the timestamp of last sync.
|
||||
*/
|
||||
default Option<String> getLastCommitTimeSynced(String tableName) {
|
||||
return Option.empty();
|
||||
}
|
||||
|
||||
/**
|
||||
* Update the timestamp of last sync.
|
||||
*/
|
||||
default void updateLastCommitTimeSynced(String tableName) {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Update the table properties in metastore.
|
||||
*/
|
||||
default void updateTableProperties(String tableName, Map<String, String> tableProperties) {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the timestamp of last replication.
|
||||
*/
|
||||
default Option<String> getLastReplicatedTime(String tableName) {
|
||||
return Option.empty();
|
||||
}
|
||||
|
||||
/**
|
||||
* Update the timestamp of last replication.
|
||||
*/
|
||||
default void updateLastReplicatedTimeStamp(String tableName, String timeStamp) {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete the timestamp of last replication.
|
||||
*/
|
||||
default void deleteLastReplicatedTimeStamp(String tableName) {
|
||||
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,161 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.sync.common;
|
||||
|
||||
import org.apache.hudi.common.engine.HoodieLocalEngineContext;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
||||
import org.apache.hudi.common.model.HoodieTableType;
|
||||
import org.apache.hudi.common.model.WriteOperationType;
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.table.TableSchemaResolver;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.table.timeline.TimelineUtils;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.ReflectionUtils;
|
||||
import org.apache.hudi.metadata.HoodieTableMetadataUtil;
|
||||
import org.apache.hudi.sync.common.model.Partition;
|
||||
import org.apache.hudi.sync.common.model.PartitionEvent;
|
||||
import org.apache.hudi.sync.common.model.PartitionValueExtractor;
|
||||
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.parquet.schema.MessageType;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_ASSUME_DATE_PARTITION;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_USE_FILE_LISTING_FROM_METADATA;
|
||||
|
||||
public abstract class HoodieSyncClient implements HoodieMetaSyncOperations, AutoCloseable {
|
||||
|
||||
private static final Logger LOG = LogManager.getLogger(HoodieSyncClient.class);
|
||||
|
||||
protected final HoodieSyncConfig config;
|
||||
protected final PartitionValueExtractor partitionValueExtractor;
|
||||
protected final HoodieTableMetaClient metaClient;
|
||||
|
||||
public HoodieSyncClient(HoodieSyncConfig config) {
|
||||
this.config = config;
|
||||
this.partitionValueExtractor = ReflectionUtils.loadClass(config.getStringOrDefault(META_SYNC_PARTITION_EXTRACTOR_CLASS));
|
||||
this.metaClient = HoodieTableMetaClient.builder()
|
||||
.setConf(config.getHadoopConf())
|
||||
.setBasePath(config.getString(META_SYNC_BASE_PATH))
|
||||
.setLoadActiveTimelineOnLoad(true)
|
||||
.build();
|
||||
}
|
||||
|
||||
public HoodieTimeline getActiveTimeline() {
|
||||
return metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
|
||||
}
|
||||
|
||||
public HoodieTableType getTableType() {
|
||||
return metaClient.getTableType();
|
||||
}
|
||||
|
||||
public String getBasePath() {
|
||||
return metaClient.getBasePathV2().toString();
|
||||
}
|
||||
|
||||
public boolean isBootstrap() {
|
||||
return metaClient.getTableConfig().getBootstrapBasePath().isPresent();
|
||||
}
|
||||
|
||||
public boolean isDropPartition() {
|
||||
try {
|
||||
Option<HoodieCommitMetadata> hoodieCommitMetadata = HoodieTableMetadataUtil.getLatestCommitMetadata(metaClient);
|
||||
|
||||
if (hoodieCommitMetadata.isPresent()
|
||||
&& WriteOperationType.DELETE_PARTITION.equals(hoodieCommitMetadata.get().getOperationType())) {
|
||||
return true;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw new HoodieSyncException("Failed to get commit metadata", e);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public MessageType getStorageSchema() {
|
||||
try {
|
||||
return new TableSchemaResolver(metaClient).getTableParquetSchema();
|
||||
} catch (Exception e) {
|
||||
throw new HoodieSyncException("Failed to read schema from storage.", e);
|
||||
}
|
||||
}
|
||||
|
||||
public List<String> getPartitionsWrittenToSince(Option<String> lastCommitTimeSynced) {
|
||||
if (!lastCommitTimeSynced.isPresent()) {
|
||||
LOG.info("Last commit time synced is not known, listing all partitions in "
|
||||
+ config.getString(META_SYNC_BASE_PATH)
|
||||
+ ",FS :" + config.getHadoopFileSystem());
|
||||
HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf());
|
||||
return FSUtils.getAllPartitionPaths(engineContext,
|
||||
config.getString(META_SYNC_BASE_PATH),
|
||||
config.getBoolean(META_SYNC_USE_FILE_LISTING_FROM_METADATA),
|
||||
config.getBoolean(META_SYNC_ASSUME_DATE_PARTITION));
|
||||
} else {
|
||||
LOG.info("Last commit time synced is " + lastCommitTimeSynced.get() + ", Getting commits since then");
|
||||
return TimelineUtils.getPartitionsWritten(metaClient.getActiveTimeline().getCommitsTimeline()
|
||||
.findInstantsAfter(lastCommitTimeSynced.get(), Integer.MAX_VALUE));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Iterate over the storage partitions and find if there are any new partitions that need to be added or updated.
|
||||
* Generate a list of PartitionEvent based on the changes required.
|
||||
*/
|
||||
public List<PartitionEvent> getPartitionEvents(List<Partition> tablePartitions, List<String> partitionStoragePartitions, boolean isDropPartition) {
|
||||
Map<String, String> paths = new HashMap<>();
|
||||
for (Partition tablePartition : tablePartitions) {
|
||||
List<String> hivePartitionValues = tablePartition.getValues();
|
||||
String fullTablePartitionPath =
|
||||
Path.getPathWithoutSchemeAndAuthority(new Path(tablePartition.getStorageLocation())).toUri().getPath();
|
||||
paths.put(String.join(", ", hivePartitionValues), fullTablePartitionPath);
|
||||
}
|
||||
|
||||
List<PartitionEvent> events = new ArrayList<>();
|
||||
for (String storagePartition : partitionStoragePartitions) {
|
||||
Path storagePartitionPath = FSUtils.getPartitionPath(config.getString(META_SYNC_BASE_PATH), storagePartition);
|
||||
String fullStoragePartitionPath = Path.getPathWithoutSchemeAndAuthority(storagePartitionPath).toUri().getPath();
|
||||
// Check if the partition values or if hdfs path is the same
|
||||
List<String> storagePartitionValues = partitionValueExtractor.extractPartitionValuesInPath(storagePartition);
|
||||
|
||||
if (isDropPartition) {
|
||||
events.add(PartitionEvent.newPartitionDropEvent(storagePartition));
|
||||
} else {
|
||||
if (!storagePartitionValues.isEmpty()) {
|
||||
String storageValue = String.join(", ", storagePartitionValues);
|
||||
if (!paths.containsKey(storageValue)) {
|
||||
events.add(PartitionEvent.newPartitionAddEvent(storagePartition));
|
||||
} else if (!paths.get(storageValue).equals(fullStoragePartitionPath)) {
|
||||
events.add(PartitionEvent.newPartitionUpdateEvent(storagePartition));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return events;
|
||||
}
|
||||
}
|
||||
@@ -22,14 +22,19 @@ import org.apache.hudi.common.config.ConfigProperty;
|
||||
import org.apache.hudi.common.config.HoodieConfig;
|
||||
import org.apache.hudi.common.config.HoodieMetadataConfig;
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.table.HoodieTableConfig;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.StringUtils;
|
||||
import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
|
||||
import org.apache.hudi.sync.common.util.ConfigUtils;
|
||||
|
||||
import com.beust.jcommander.Parameter;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Properties;
|
||||
import java.util.function.Function;
|
||||
|
||||
/**
|
||||
@@ -37,41 +42,6 @@ import java.util.function.Function;
|
||||
*/
|
||||
public class HoodieSyncConfig extends HoodieConfig {
|
||||
|
||||
@Parameter(names = {"--database"}, description = "name of the target database in meta store", required = true)
|
||||
public String databaseName;
|
||||
|
||||
@Parameter(names = {"--table"}, description = "name of the target table in meta store", required = true)
|
||||
public String tableName;
|
||||
|
||||
@Parameter(names = {"--base-path"}, description = "Base path of the hoodie table to sync", required = true)
|
||||
public String basePath;
|
||||
|
||||
@Parameter(names = {"--base-file-format"}, description = "Format of the base files (PARQUET (or) HFILE)")
|
||||
public String baseFileFormat;
|
||||
|
||||
@Parameter(names = "--partitioned-by", description = "Fields in the schema partitioned by")
|
||||
public List<String> partitionFields;
|
||||
|
||||
@Parameter(names = "--partition-value-extractor", description = "Class which implements PartitionValueExtractor "
|
||||
+ "to extract the partition values from HDFS path")
|
||||
public String partitionValueExtractorClass;
|
||||
|
||||
@Parameter(names = {"--assume-date-partitioning"}, description = "Assume standard yyyy/mm/dd partitioning, this"
|
||||
+ " exists to support backward compatibility. If you use hoodie 0.3.x, do not set this parameter")
|
||||
public Boolean assumeDatePartitioning;
|
||||
|
||||
@Parameter(names = {"--decode-partition"}, description = "Decode the partition value if the partition has encoded during writing")
|
||||
public Boolean decodePartition;
|
||||
|
||||
@Parameter(names = {"--use-file-listing-from-metadata"}, description = "Fetch file listing from Hudi's metadata")
|
||||
public Boolean useFileListingFromMetadata;
|
||||
|
||||
@Parameter(names = {"--conditional-sync"}, description = "If true, only sync on conditions like schema change or partition change.")
|
||||
public Boolean isConditionalSync;
|
||||
|
||||
@Parameter(names = {"--spark-version"}, description = "The spark version")
|
||||
public String sparkVersion;
|
||||
|
||||
public static final ConfigProperty<String> META_SYNC_BASE_PATH = ConfigProperty
|
||||
.key("hoodie.datasource.meta.sync.base.path")
|
||||
.defaultValue("")
|
||||
@@ -150,6 +120,11 @@ public class HoodieSyncConfig extends HoodieConfig {
|
||||
.defaultValue("false")
|
||||
.withDocumentation("Assume partitioning is yyyy/mm/dd");
|
||||
|
||||
public static final ConfigProperty<Boolean> META_SYNC_DECODE_PARTITION = ConfigProperty
|
||||
.key("hoodie.meta.sync.decode_partition")
|
||||
.defaultValue(false) // TODO infer from url encode option
|
||||
.withDocumentation("");
|
||||
|
||||
public static final ConfigProperty<Boolean> META_SYNC_USE_FILE_LISTING_FROM_METADATA = ConfigProperty
|
||||
.key("hoodie.meta.sync.metadata_file_listing")
|
||||
.defaultValue(HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS)
|
||||
@@ -165,24 +140,85 @@ public class HoodieSyncConfig extends HoodieConfig {
|
||||
.defaultValue("")
|
||||
.withDocumentation("The spark version used when syncing with a metastore.");
|
||||
|
||||
public HoodieSyncConfig(TypedProperties props) {
|
||||
super(props);
|
||||
setDefaults();
|
||||
private Configuration hadoopConf;
|
||||
|
||||
this.basePath = getStringOrDefault(META_SYNC_BASE_PATH);
|
||||
this.databaseName = getStringOrDefault(META_SYNC_DATABASE_NAME);
|
||||
this.tableName = getStringOrDefault(META_SYNC_TABLE_NAME);
|
||||
this.baseFileFormat = getStringOrDefault(META_SYNC_BASE_FILE_FORMAT);
|
||||
this.partitionFields = props.getStringList(META_SYNC_PARTITION_FIELDS.key(), ",", Collections.emptyList());
|
||||
this.partitionValueExtractorClass = getStringOrDefault(META_SYNC_PARTITION_EXTRACTOR_CLASS);
|
||||
this.assumeDatePartitioning = getBooleanOrDefault(META_SYNC_ASSUME_DATE_PARTITION);
|
||||
this.decodePartition = getBooleanOrDefault(KeyGeneratorOptions.URL_ENCODE_PARTITIONING);
|
||||
this.useFileListingFromMetadata = getBooleanOrDefault(META_SYNC_USE_FILE_LISTING_FROM_METADATA);
|
||||
this.isConditionalSync = getBooleanOrDefault(META_SYNC_CONDITIONAL_SYNC);
|
||||
this.sparkVersion = getStringOrDefault(META_SYNC_SPARK_VERSION);
|
||||
public HoodieSyncConfig(Properties props) {
|
||||
this(props, ConfigUtils.createHadoopConf(props));
|
||||
}
|
||||
|
||||
protected void setDefaults() {
|
||||
this.setDefaultValue(META_SYNC_TABLE_NAME);
|
||||
public HoodieSyncConfig(Properties props, Configuration hadoopConf) {
|
||||
super(props);
|
||||
this.hadoopConf = hadoopConf;
|
||||
}
|
||||
|
||||
public void setHadoopConf(Configuration hadoopConf) {
|
||||
this.hadoopConf = hadoopConf;
|
||||
}
|
||||
|
||||
public Configuration getHadoopConf() {
|
||||
return hadoopConf;
|
||||
}
|
||||
|
||||
public FileSystem getHadoopFileSystem() {
|
||||
return FSUtils.getFs(getString(META_SYNC_BASE_PATH), getHadoopConf());
|
||||
}
|
||||
|
||||
public String getAbsoluteBasePath() {
|
||||
return getString(META_SYNC_BASE_PATH);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return props.toString();
|
||||
}
|
||||
|
||||
/**
 * Command-line parameter holder for meta sync (parsed by jcommander).
 * Converted into {@code TypedProperties} via {@link #toProps()} so the sync
 * tools consume config keys rather than raw CLI fields.
 */
public static class HoodieSyncConfigParams {
  @Parameter(names = {"--database"}, description = "name of the target database in meta store", required = true)
  public String databaseName;
  @Parameter(names = {"--table"}, description = "name of the target table in meta store", required = true)
  public String tableName;
  @Parameter(names = {"--base-path"}, description = "Base path of the hoodie table to sync", required = true)
  public String basePath;
  @Parameter(names = {"--base-file-format"}, description = "Format of the base files (PARQUET (or) HFILE)")
  public String baseFileFormat;
  @Parameter(names = "--partitioned-by", description = "Fields in the schema partitioned by")
  public List<String> partitionFields;
  @Parameter(names = "--partition-value-extractor", description = "Class which implements PartitionValueExtractor "
      + "to extract the partition values from HDFS path")
  public String partitionValueExtractorClass;
  @Parameter(names = {"--assume-date-partitioning"}, description = "Assume standard yyyy/mm/dd partitioning, this"
      + " exists to support backward compatibility. If you use hoodie 0.3.x, do not set this parameter")
  public Boolean assumeDatePartitioning;
  @Parameter(names = {"--decode-partition"}, description = "Decode the partition value if the partition has encoded during writing")
  public Boolean decodePartition;
  @Parameter(names = {"--use-file-listing-from-metadata"}, description = "Fetch file listing from Hudi's metadata")
  public Boolean useFileListingFromMetadata;
  @Parameter(names = {"--conditional-sync"}, description = "If true, only sync on conditions like schema change or partition change.")
  public Boolean isConditionalSync;
  @Parameter(names = {"--spark-version"}, description = "The spark version")
  public String sparkVersion;

  @Parameter(names = {"--help", "-h"}, help = true)
  public boolean help = false;

  /** True when the user asked for usage help ({@code --help}/{@code -h}). */
  public boolean isHelp() {
    return help;
  }

  /**
   * Maps the parsed CLI fields onto their META_SYNC_* config keys.
   * Null fields are skipped so config defaults still apply downstream.
   */
  public TypedProperties toProps() {
    final TypedProperties props = new TypedProperties();
    props.setPropertyIfNonNull(META_SYNC_BASE_PATH.key(), basePath);
    props.setPropertyIfNonNull(META_SYNC_DATABASE_NAME.key(), databaseName);
    props.setPropertyIfNonNull(META_SYNC_TABLE_NAME.key(), tableName);
    props.setPropertyIfNonNull(META_SYNC_BASE_FILE_FORMAT.key(), baseFileFormat);
    props.setPropertyIfNonNull(META_SYNC_PARTITION_FIELDS.key(), StringUtils.join(",", partitionFields));
    props.setPropertyIfNonNull(META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), partitionValueExtractorClass);
    props.setPropertyIfNonNull(META_SYNC_ASSUME_DATE_PARTITION.key(), assumeDatePartitioning);
    props.setPropertyIfNonNull(META_SYNC_DECODE_PARTITION.key(), decodePartition);
    props.setPropertyIfNonNull(META_SYNC_USE_FILE_LISTING_FROM_METADATA.key(), useFileListingFromMetadata);
    props.setPropertyIfNonNull(META_SYNC_CONDITIONAL_SYNC.key(), isConditionalSync);
    props.setPropertyIfNonNull(META_SYNC_SPARK_VERSION.key(), sparkVersion);
    return props;
  }
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,62 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.sync.common;
|
||||
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.sync.common.util.ConfigUtils;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
|
||||
import java.util.Properties;
|
||||
|
||||
/**
|
||||
* Base class to sync metadata with metastores to make
|
||||
* Hudi table queryable through external systems.
|
||||
*/
|
||||
public abstract class HoodieSyncTool implements AutoCloseable {
|
||||
|
||||
protected Properties props;
|
||||
protected Configuration hadoopConf;
|
||||
|
||||
public HoodieSyncTool(Properties props) {
|
||||
this(props, ConfigUtils.createHadoopConf(props));
|
||||
}
|
||||
|
||||
public HoodieSyncTool(Properties props, Configuration hadoopConf) {
|
||||
this.props = props;
|
||||
this.hadoopConf = hadoopConf;
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
public HoodieSyncTool(TypedProperties props, Configuration conf, FileSystem fs) {
|
||||
this(props, conf);
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
public HoodieSyncTool(Properties props, FileSystem fileSystem) {
|
||||
this(props, fileSystem.getConf());
|
||||
}
|
||||
|
||||
public abstract void syncHoodieTable();
|
||||
|
||||
@Override
|
||||
public void close() throws Exception {
|
||||
// no op
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,83 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.sync.common.model;
|
||||
|
||||
import org.apache.hudi.common.util.Option;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
public class FieldSchema {
|
||||
|
||||
private final String name;
|
||||
private String type;
|
||||
private Option<String> comment;
|
||||
|
||||
public FieldSchema(String name, String type) {
|
||||
this(name, type, Option.empty());
|
||||
}
|
||||
|
||||
public FieldSchema(String name, String type, String comment) {
|
||||
this(name, type, Option.ofNullable(comment));
|
||||
}
|
||||
|
||||
public FieldSchema(String name, String type, Option<String> comment) {
|
||||
this.name = name;
|
||||
this.type = type;
|
||||
this.comment = comment;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public String getType() {
|
||||
return type;
|
||||
}
|
||||
|
||||
public Option<String> getComment() {
|
||||
return comment;
|
||||
}
|
||||
|
||||
public String getCommentOrEmpty() {
|
||||
return comment.orElse("");
|
||||
}
|
||||
|
||||
public void setType(String type) {
|
||||
this.type = type;
|
||||
}
|
||||
|
||||
public void setComment(Option<String> comment) {
|
||||
this.comment = comment;
|
||||
}
|
||||
|
||||
public void setComment(String comment) {
|
||||
this.comment = Option.ofNullable(comment);
|
||||
}
|
||||
|
||||
public boolean updateComment(FieldSchema another) {
|
||||
if (Objects.equals(name, another.getName())
|
||||
&& !Objects.equals(getCommentOrEmpty(), another.getCommentOrEmpty())) {
|
||||
setComment(another.getComment());
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,50 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.sync.common.model;
|
||||
|
||||
/**
|
||||
* Partition Event captures any partition that needs to be added or updated.
|
||||
*/
|
||||
/**
 * Partition Event captures any partition that needs to be added or updated.
 */
public class PartitionEvent {

  /** The kind of metastore change this event carries. */
  public enum PartitionEventType {
    ADD, UPDATE, DROP
  }

  public PartitionEventType eventType;
  public String storagePartition;

  PartitionEvent(PartitionEventType eventType, String storagePartition) {
    this.eventType = eventType;
    this.storagePartition = storagePartition;
  }

  /** Event registering a new partition in the metastore. */
  public static PartitionEvent newPartitionAddEvent(String storagePartition) {
    return of(PartitionEventType.ADD, storagePartition);
  }

  /** Event refreshing the metastore entry of an existing partition. */
  public static PartitionEvent newPartitionUpdateEvent(String storagePartition) {
    return of(PartitionEventType.UPDATE, storagePartition);
  }

  /** Event removing a partition from the metastore. */
  public static PartitionEvent newPartitionDropEvent(String storagePartition) {
    return of(PartitionEventType.DROP, storagePartition);
  }

  // Single construction point shared by the factories above.
  private static PartitionEvent of(PartitionEventType type, String partition) {
    return new PartitionEvent(type, partition);
  }
}
|
||||
@@ -7,16 +7,17 @@
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.hive;
|
||||
package org.apache.hudi.sync.common.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
@@ -18,9 +18,13 @@
|
||||
|
||||
package org.apache.hudi.sync.common.util;
|
||||
|
||||
import org.apache.hudi.common.util.StringUtils;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import org.apache.hudi.common.util.StringUtils;
|
||||
import java.util.Properties;
|
||||
|
||||
public class ConfigUtils {
|
||||
/**
|
||||
@@ -32,6 +36,7 @@ public class ConfigUtils {
|
||||
/**
|
||||
* Convert the key-value config to a map.The format of the config
|
||||
* is a key-value pair just like "k1=v1\nk2=v2\nk3=v3".
|
||||
*
|
||||
* @param keyValueConfig
|
||||
* @return
|
||||
*/
|
||||
@@ -49,7 +54,7 @@ public class ConfigUtils {
|
||||
tableProperties.put(key, value);
|
||||
} else {
|
||||
throw new IllegalArgumentException("Bad key-value config: " + keyValue + ", must be the"
|
||||
+ " format 'key = value'");
|
||||
+ " format 'key = value'");
|
||||
}
|
||||
}
|
||||
return tableProperties;
|
||||
@@ -58,6 +63,7 @@ public class ConfigUtils {
|
||||
/**
|
||||
* Convert map config to key-value string.The format of the config
|
||||
* is a key-value pair just like "k1=v1\nk2=v2\nk3=v3".
|
||||
*
|
||||
* @param config
|
||||
* @return
|
||||
*/
|
||||
@@ -75,4 +81,10 @@ public class ConfigUtils {
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public static Configuration createHadoopConf(Properties props) {
|
||||
Configuration hadoopConf = new Configuration();
|
||||
props.stringPropertyNames().forEach(k -> hadoopConf.set(k, props.getProperty(k)));
|
||||
return hadoopConf;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,12 +1,13 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
@@ -15,15 +16,10 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.sync.common;
|
||||
package org.apache.hudi.sync.common.util;
|
||||
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.common.util.StringUtils;
|
||||
import org.apache.hudi.sync.common.util.ConfigUtils;
|
||||
import org.apache.hudi.sync.common.util.Parquet2SparkSchemaUtils;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.parquet.schema.GroupType;
|
||||
import org.apache.parquet.schema.MessageType;
|
||||
import org.apache.parquet.schema.PrimitiveType;
|
||||
@@ -33,40 +29,18 @@ import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Properties;
|
||||
|
||||
import static org.apache.parquet.schema.OriginalType.UTF8;
|
||||
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
|
||||
|
||||
/**
|
||||
* Base class to sync Hudi meta data with Metastores to make
|
||||
* Hudi table queryable through external systems.
|
||||
*/
|
||||
public abstract class AbstractSyncTool {
|
||||
protected final Configuration conf;
|
||||
protected final FileSystem fs;
|
||||
protected TypedProperties props;
|
||||
|
||||
public AbstractSyncTool(TypedProperties props, Configuration conf, FileSystem fs) {
|
||||
this.props = props;
|
||||
this.conf = conf;
|
||||
this.fs = fs;
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
public AbstractSyncTool(Properties props, FileSystem fileSystem) {
|
||||
this(new TypedProperties(props), fileSystem.getConf(), fileSystem);
|
||||
}
|
||||
|
||||
public abstract void syncHoodieTable();
|
||||
|
||||
public class SparkDataSourceTableUtils {
|
||||
/**
|
||||
* Get Spark Sql related table properties. This is used for spark datasource table.
|
||||
* @param schema The schema to write to the table.
|
||||
* @return A new parameters added the spark's table properties.
|
||||
*/
|
||||
protected Map<String, String> getSparkTableProperties(List<String> partitionNames, String sparkVersion,
|
||||
int schemaLengthThreshold, MessageType schema) {
|
||||
public static Map<String, String> getSparkTableProperties(List<String> partitionNames, String sparkVersion,
|
||||
int schemaLengthThreshold, MessageType schema) {
|
||||
// Convert the schema and partition info used by spark sql to hive table properties.
|
||||
// The following code refers to the spark code in
|
||||
// https://github.com/apache/spark/blob/master/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
|
||||
@@ -122,7 +96,7 @@ public abstract class AbstractSyncTool {
|
||||
return sparkProperties;
|
||||
}
|
||||
|
||||
protected Map<String, String> getSparkSerdeProperties(boolean readAsOptimized, String basePath) {
|
||||
public static Map<String, String> getSparkSerdeProperties(boolean readAsOptimized, String basePath) {
|
||||
Map<String, String> sparkSerdeProperties = new HashMap<>();
|
||||
sparkSerdeProperties.put("path", basePath);
|
||||
sparkSerdeProperties.put(ConfigUtils.IS_QUERY_AS_RO_TABLE, String.valueOf(readAsOptimized));
|
||||
@@ -22,13 +22,11 @@ package org.apache.hudi.sync.common.util;
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.common.util.ReflectionUtils;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.sync.common.AbstractSyncTool;
|
||||
import org.apache.hudi.sync.common.HoodieSyncConfig;
|
||||
import org.apache.hudi.sync.common.HoodieSyncTool;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import java.util.Properties;
|
||||
|
||||
@@ -36,13 +34,12 @@ import java.util.Properties;
|
||||
* Helper class for syncing Hudi commit data with external metastores.
|
||||
*/
|
||||
public class SyncUtilHelpers {
|
||||
private static final Logger LOG = LogManager.getLogger(SyncUtilHelpers.class);
|
||||
|
||||
/**
|
||||
* Create an instance of an implementation of {@link AbstractSyncTool} that will sync all the relevant meta information
|
||||
* Create an instance of an implementation of {@link HoodieSyncTool} that will sync all the relevant meta information
|
||||
* with an external metastore such as Hive etc. to ensure Hoodie tables can be queried or read via external systems.
|
||||
*
|
||||
* @param metaSyncFQCN The class that implements the sync of the metadata.
|
||||
* @param metaSyncFQCN The class that implements the sync of the metadata.
|
||||
* @param props property map.
|
||||
* @param hadoopConfig Hadoop confs.
|
||||
* @param fs Filesystem used.
|
||||
@@ -62,30 +59,40 @@ public class SyncUtilHelpers {
|
||||
}
|
||||
}
|
||||
|
||||
static AbstractSyncTool instantiateMetaSyncTool(String metaSyncFQCN,
|
||||
TypedProperties props,
|
||||
Configuration hadoopConfig,
|
||||
FileSystem fs,
|
||||
String targetBasePath,
|
||||
String baseFileFormat) {
|
||||
static HoodieSyncTool instantiateMetaSyncTool(String metaSyncFQCN,
|
||||
TypedProperties props,
|
||||
Configuration hadoopConfig,
|
||||
FileSystem fs,
|
||||
String targetBasePath,
|
||||
String baseFileFormat) {
|
||||
TypedProperties properties = new TypedProperties();
|
||||
properties.putAll(props);
|
||||
properties.put(HoodieSyncConfig.META_SYNC_BASE_PATH.key(), targetBasePath);
|
||||
properties.put(HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT.key(), baseFileFormat);
|
||||
|
||||
if (ReflectionUtils.hasConstructor(metaSyncFQCN,
|
||||
new Class<?>[] {Properties.class, Configuration.class})) {
|
||||
return ((HoodieSyncTool) ReflectionUtils.loadClass(metaSyncFQCN,
|
||||
new Class<?>[] {Properties.class, Configuration.class},
|
||||
properties, hadoopConfig));
|
||||
} else if (ReflectionUtils.hasConstructor(metaSyncFQCN,
|
||||
new Class<?>[] {Properties.class})) {
|
||||
return ((HoodieSyncTool) ReflectionUtils.loadClass(metaSyncFQCN,
|
||||
new Class<?>[] {Properties.class},
|
||||
properties));
|
||||
} else if (ReflectionUtils.hasConstructor(metaSyncFQCN,
|
||||
new Class<?>[] {TypedProperties.class, Configuration.class, FileSystem.class})) {
|
||||
return ((AbstractSyncTool) ReflectionUtils.loadClass(metaSyncFQCN,
|
||||
return ((HoodieSyncTool) ReflectionUtils.loadClass(metaSyncFQCN,
|
||||
new Class<?>[] {TypedProperties.class, Configuration.class, FileSystem.class},
|
||||
properties, hadoopConfig, fs));
|
||||
} else if (ReflectionUtils.hasConstructor(metaSyncFQCN,
|
||||
new Class<?>[] {Properties.class, FileSystem.class})) {
|
||||
return ((HoodieSyncTool) ReflectionUtils.loadClass(metaSyncFQCN,
|
||||
new Class<?>[] {Properties.class, FileSystem.class},
|
||||
properties, fs));
|
||||
} else {
|
||||
LOG.warn("Falling back to deprecated constructor for class: " + metaSyncFQCN);
|
||||
try {
|
||||
return ((AbstractSyncTool) ReflectionUtils.loadClass(metaSyncFQCN,
|
||||
new Class<?>[] {Properties.class, FileSystem.class}, properties, fs));
|
||||
} catch (Throwable t) {
|
||||
throw new HoodieException("Could not load meta sync class " + metaSyncFQCN, t);
|
||||
}
|
||||
throw new HoodieException("Could not load meta sync class " + metaSyncFQCN
|
||||
+ ": no valid constructor found.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -20,16 +20,19 @@ package org.apache.hudi.sync.common.util;
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.sync.common.AbstractSyncTool;
|
||||
import org.apache.hudi.sync.common.HoodieSyncTool;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.params.ParameterizedTest;
|
||||
import org.junit.jupiter.params.provider.ValueSource;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Properties;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
@@ -46,42 +49,44 @@ public class TestSyncUtilHelpers {
|
||||
hadoopConf = fileSystem.getConf();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCreateValidSyncClass() {
|
||||
AbstractSyncTool metaSyncTool = SyncUtilHelpers.instantiateMetaSyncTool(
|
||||
ValidMetaSyncClass.class.getName(),
|
||||
@ParameterizedTest
|
||||
@ValueSource(classes = {DummySyncTool1.class, DummySyncTool2.class})
|
||||
public void testCreateValidSyncClass(Class<?> clazz) {
|
||||
HoodieSyncTool syncTool = SyncUtilHelpers.instantiateMetaSyncTool(
|
||||
clazz.getName(),
|
||||
new TypedProperties(),
|
||||
hadoopConf,
|
||||
fileSystem,
|
||||
BASE_PATH,
|
||||
BASE_FORMAT
|
||||
);
|
||||
assertTrue(metaSyncTool instanceof ValidMetaSyncClass);
|
||||
assertTrue(clazz.isAssignableFrom(syncTool.getClass()));
|
||||
}
|
||||
|
||||
/**
|
||||
* Ensure it still works for the deprecated constructor of {@link AbstractSyncTool}
|
||||
* Ensure it still works for the deprecated constructor of {@link HoodieSyncTool}
|
||||
* as we implemented the fallback.
|
||||
*/
|
||||
@Test
|
||||
public void testCreateDeprecatedSyncClass() {
|
||||
@ParameterizedTest
|
||||
@ValueSource(classes = {DeprecatedSyncTool1.class, DeprecatedSyncTool2.class})
|
||||
public void testCreateDeprecatedSyncClass(Class<?> clazz) {
|
||||
Properties properties = new Properties();
|
||||
AbstractSyncTool deprecatedMetaSyncClass = SyncUtilHelpers.instantiateMetaSyncTool(
|
||||
DeprecatedMetaSyncClass.class.getName(),
|
||||
HoodieSyncTool syncTool = SyncUtilHelpers.instantiateMetaSyncTool(
|
||||
clazz.getName(),
|
||||
new TypedProperties(properties),
|
||||
hadoopConf,
|
||||
fileSystem,
|
||||
BASE_PATH,
|
||||
BASE_FORMAT
|
||||
);
|
||||
assertTrue(deprecatedMetaSyncClass instanceof DeprecatedMetaSyncClass);
|
||||
assertTrue(clazz.isAssignableFrom(syncTool.getClass()));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCreateInvalidSyncClass() {
|
||||
Exception exception = assertThrows(HoodieException.class, () -> {
|
||||
Throwable t = assertThrows(HoodieException.class, () -> {
|
||||
SyncUtilHelpers.instantiateMetaSyncTool(
|
||||
InvalidSyncClass.class.getName(),
|
||||
InvalidSyncTool.class.getName(),
|
||||
new TypedProperties(),
|
||||
hadoopConf,
|
||||
fileSystem,
|
||||
@@ -90,14 +95,14 @@ public class TestSyncUtilHelpers {
|
||||
);
|
||||
});
|
||||
|
||||
String expectedMessage = "Could not load meta sync class " + InvalidSyncClass.class.getName();
|
||||
assertTrue(exception.getMessage().contains(expectedMessage));
|
||||
|
||||
String expectedMessage = "Could not load meta sync class " + InvalidSyncTool.class.getName()
|
||||
+ ": no valid constructor found.";
|
||||
assertEquals(expectedMessage, t.getMessage());
|
||||
}
|
||||
|
||||
public static class ValidMetaSyncClass extends AbstractSyncTool {
|
||||
public ValidMetaSyncClass(TypedProperties props, Configuration conf, FileSystem fs) {
|
||||
super(props, conf, fs);
|
||||
public static class DummySyncTool1 extends HoodieSyncTool {
|
||||
public DummySyncTool1(Properties props, Configuration hadoopConf) {
|
||||
super(props, hadoopConf);
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -106,9 +111,9 @@ public class TestSyncUtilHelpers {
|
||||
}
|
||||
}
|
||||
|
||||
public static class DeprecatedMetaSyncClass extends AbstractSyncTool {
|
||||
public DeprecatedMetaSyncClass(Properties props, FileSystem fileSystem) {
|
||||
super(props, fileSystem);
|
||||
public static class DummySyncTool2 extends HoodieSyncTool {
|
||||
public DummySyncTool2(Properties props, Configuration hadoopConf) {
|
||||
super(props, hadoopConf);
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -117,8 +122,30 @@ public class TestSyncUtilHelpers {
|
||||
}
|
||||
}
|
||||
|
||||
public static class InvalidSyncClass {
|
||||
public InvalidSyncClass(Properties props) {
|
||||
public static class DeprecatedSyncTool1 extends HoodieSyncTool {
|
||||
public DeprecatedSyncTool1(TypedProperties props, Configuration hadoopConf, FileSystem fs) {
|
||||
super(props, hadoopConf, fs);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void syncHoodieTable() {
|
||||
throw new HoodieException("Method unimplemented as its a test class");
|
||||
}
|
||||
}
|
||||
|
||||
public static class DeprecatedSyncTool2 extends HoodieSyncTool {
|
||||
public DeprecatedSyncTool2(Properties props, FileSystem fs) {
|
||||
super(props, fs);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void syncHoodieTable() {
|
||||
throw new HoodieException("Method unimplemented as its a test class");
|
||||
}
|
||||
}
|
||||
|
||||
public static class InvalidSyncTool {
|
||||
public InvalidSyncTool(Properties props, FileSystem fs, Configuration hadoopConf) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user