1
0

[HUDI-3730] Improve meta sync class design and hierarchies (#5854)

* [HUDI-3730] Improve meta sync class design and hierarchies (#5754)
* Implements class design proposed in RFC-55

Co-authored-by: jian.feng <fengjian428@gmial.com>
Co-authored-by: jian.feng <jian.feng@shopee.com>
This commit is contained in:
Shiyan Xu
2022-07-03 04:17:25 -05:00
committed by GitHub
parent c00ea84985
commit c0e1587966
86 changed files with 2977 additions and 2877 deletions

View File

@@ -1,128 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.sync.adb;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.hive.PartitionValueExtractor;
import org.apache.hudi.hive.SchemaDifference;
import org.apache.hudi.sync.common.AbstractSyncHoodieClient;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Base class for clients that sync a Hudi table's metadata into ADB.
 *
 * <p>It instantiates the configured {@link PartitionValueExtractor}, keeps the completed
 * commits timeline of the table, and provides helpers to diff storage partitions against
 * the partitions already registered in the target table and to resolve database locations
 * into absolute path strings.
 */
public abstract class AbstractAdbSyncHoodieClient extends AbstractSyncHoodieClient {
  protected AdbSyncConfig adbSyncConfig;
  protected PartitionValueExtractor partitionValueExtractor;
  protected HoodieTimeline activeTimeline;

  /**
   * Builds the client from the given sync config and file system.
   *
   * @param syncConfig sync configuration; its base path, partition extractor class and
   *                   listing flags are read here
   * @param fs         file system handed to the parent client
   * @throws HoodieException if the configured partition value extractor class cannot be
   *                         loaded or instantiated through its no-arg constructor
   */
  public AbstractAdbSyncHoodieClient(AdbSyncConfig syncConfig, FileSystem fs) {
    super(syncConfig.basePath, syncConfig.assumeDatePartitioning,
        syncConfig.useFileListingFromMetadata, false, fs);
    this.adbSyncConfig = syncConfig;
    final String clazz = adbSyncConfig.partitionValueExtractorClass;
    try {
      // Class#newInstance is deprecated and rethrows checked constructor exceptions
      // unchecked; going through the Constructor wraps them in InvocationTargetException.
      this.partitionValueExtractor =
          (PartitionValueExtractor) Class.forName(clazz).getDeclaredConstructor().newInstance();
    } catch (Exception e) {
      throw new HoodieException("Fail to init PartitionValueExtractor class " + clazz, e);
    }
    activeTimeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
  }

  /**
   * Diffs the partitions found on storage against the partitions registered in the table.
   *
   * @param tablePartitions            registered partitions: partition values mapped to the
   *                                   partition path stored for them
   * @param partitionStoragePartitions relative partition paths found on storage
   * @return an ADD event for every storage partition whose values are not registered and an
   *         UPDATE event for every one whose registered path differs from the storage path;
   *         partitions with no extracted values are ignored
   */
  public List<PartitionEvent> getPartitionEvents(Map<List<String>, String> tablePartitions,
                                                 List<String> partitionStoragePartitions) {
    // Index registered partitions by their joined values for lookup below.
    Map<String, String> paths = new HashMap<>();
    for (Map.Entry<List<String>, String> entry : tablePartitions.entrySet()) {
      List<String> partitionValues = entry.getKey();
      String fullTablePartitionPath = entry.getValue();
      paths.put(String.join(", ", partitionValues), fullTablePartitionPath);
    }

    List<PartitionEvent> events = new ArrayList<>();
    for (String storagePartition : partitionStoragePartitions) {
      Path storagePartitionPath = FSUtils.getPartitionPath(adbSyncConfig.basePath, storagePartition);
      String fullStoragePartitionPath = Path.getPathWithoutSchemeAndAuthority(storagePartitionPath).toUri().getPath();
      // Check if the partition values or if hdfs path is the same
      List<String> storagePartitionValues = partitionValueExtractor.extractPartitionValuesInPath(storagePartition);
      if (adbSyncConfig.useHiveStylePartitioning) {
        // With hive-style partitioning the comparison path is rebuilt from the extracted values.
        String partition = String.join("/", storagePartitionValues);
        storagePartitionPath = FSUtils.getPartitionPath(adbSyncConfig.basePath, partition);
        fullStoragePartitionPath = Path.getPathWithoutSchemeAndAuthority(storagePartitionPath).toUri().getPath();
      }
      if (!storagePartitionValues.isEmpty()) {
        String storageValue = String.join(", ", storagePartitionValues);
        if (!paths.containsKey(storageValue)) {
          events.add(PartitionEvent.newPartitionAddEvent(storagePartition));
        } else if (!paths.get(storageValue).equals(fullStoragePartitionPath)) {
          events.add(PartitionEvent.newPartitionUpdateEvent(storagePartition));
        }
      }
    }
    return events;
  }

  /** No-op in this base class. */
  public void close() {
  }

  public abstract Map<List<String>, String> scanTablePartitions(String tableName) throws Exception;

  public abstract void updateTableDefinition(String tableName, SchemaDifference schemaDiff) throws Exception;

  public abstract boolean databaseExists(String databaseName) throws Exception;

  public abstract void createDatabase(String databaseName) throws Exception;

  public abstract void dropTable(String tableName);

  /**
   * Resolves the database location as an absolute path string.
   *
   * <p>Uses the configured {@code dbLocation} when it is set; otherwise falls back to the
   * parent of the table base path, or the base path itself when it is the root.
   *
   * @return the location, always ending with {@code "/"}
   */
  protected String getDatabasePath() {
    String dbLocation = adbSyncConfig.dbLocation;
    Path dbLocationPath;
    if (StringUtils.isNullOrEmpty(dbLocation)) {
      if (new Path(adbSyncConfig.basePath).isRoot()) {
        dbLocationPath = new Path(adbSyncConfig.basePath);
      } else {
        dbLocationPath = new Path(adbSyncConfig.basePath).getParent();
      }
    } else {
      dbLocationPath = new Path(dbLocation);
    }
    return generateAbsolutePathStr(dbLocationPath);
  }

  /**
   * Renders the path as a string, prefixing {@link #getDefaultFs()} when the path carries
   * no URI scheme and appending a trailing {@code "/"} when one is missing.
   *
   * @param path path to render
   * @return the rendered path string, ending with {@code "/"}
   */
  protected String generateAbsolutePathStr(Path path) {
    String absolutePathStr = path.toString();
    if (path.toUri().getScheme() == null) {
      absolutePathStr = getDefaultFs() + absolutePathStr;
    }
    return absolutePathStr.endsWith("/") ? absolutePathStr : absolutePathStr + "/";
  }

  /**
   * @return the {@code fs.defaultFS} value read from the file system's configuration
   */
  protected String getDefaultFs() {
    return fs.getConf().get("fs.defaultFS");
  }
}

View File

@@ -20,62 +20,19 @@ package org.apache.hudi.sync.adb;
import org.apache.hudi.common.config.ConfigProperty;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.sync.common.HoodieSyncConfig;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.hive.HiveSyncConfig;
import com.beust.jcommander.Parameter;
import com.beust.jcommander.ParametersDelegate;
import org.apache.hadoop.fs.Path;
import java.util.Properties;
/**
* Configs needed to sync data into Alibaba Cloud AnalyticDB(ADB).
*/
public class AdbSyncConfig extends HoodieSyncConfig {
@Parameter(names = {"--user"}, description = "Adb username", required = true)
public String adbUser;
@Parameter(names = {"--pass"}, description = "Adb password", required = true)
public String adbPass;
@Parameter(names = {"--jdbc-url"}, description = "Adb jdbc connect url", required = true)
public String jdbcUrl;
@Parameter(names = {"--skip-ro-suffix"}, description = "Whether skip the `_ro` suffix for read optimized table when syncing")
public Boolean skipROSuffix;
@Parameter(names = {"--skip-rt-sync"}, description = "Whether skip the rt table when syncing")
public Boolean skipRTSync;
@Parameter(names = {"--hive-style-partitioning"}, description = "Whether use hive style partitioning, true if like the following style: field1=value1/field2=value2")
public Boolean useHiveStylePartitioning;
@Parameter(names = {"--support-timestamp"}, description = "If true, converts int64(timestamp_micros) to timestamp type")
public Boolean supportTimestamp;
@Parameter(names = {"--spark-datasource"}, description = "Whether sync this table as spark data source table")
public Boolean syncAsSparkDataSourceTable;
@Parameter(names = {"--table-properties"}, description = "Table properties, to support read hoodie table as datasource table", required = true)
public String tableProperties;
@Parameter(names = {"--serde-properties"}, description = "Serde properties, to support read hoodie table as datasource table", required = true)
public String serdeProperties;
@Parameter(names = {"--spark-schema-length-threshold"}, description = "The maximum length allowed in a single cell when storing additional schema information in Hive's metastore")
public int sparkSchemaLengthThreshold;
@Parameter(names = {"--db-location"}, description = "Database location")
public String dbLocation;
@Parameter(names = {"--auto-create-database"}, description = "Whether auto create adb database")
public Boolean autoCreateDatabase = true;
@Parameter(names = {"--skip-last-commit-time-sync"}, description = "Whether skip last commit time syncing")
public Boolean skipLastCommitTimeSync = false;
@Parameter(names = {"--drop-table-before-creation"}, description = "Whether drop table before creation")
public Boolean dropTableBeforeCreation = false;
@Parameter(names = {"--help", "-h"}, help = true)
public Boolean help = false;
public class AdbSyncConfig extends HiveSyncConfig {
public static final ConfigProperty<String> ADB_SYNC_USER = ConfigProperty
.key("hoodie.datasource.adb.sync.username")
@@ -152,89 +109,101 @@ public class AdbSyncConfig extends HoodieSyncConfig {
.defaultValue(false)
.withDocumentation("Whether drop table before creation");
/**
 * Creates a config backed by a fresh, empty {@link TypedProperties} by delegating to the
 * properties-based constructor.
 */
public AdbSyncConfig() {
this(new TypedProperties());
}
public AdbSyncConfig(TypedProperties props) {
public AdbSyncConfig(Properties props) {
super(props);
adbUser = getString(ADB_SYNC_USER);
adbPass = getString(ADB_SYNC_PASS);
jdbcUrl = getString(ADB_SYNC_JDBC_URL);
skipROSuffix = getBooleanOrDefault(ADB_SYNC_SKIP_RO_SUFFIX);
skipRTSync = getBooleanOrDefault(ADB_SYNC_SKIP_RT_SYNC);
useHiveStylePartitioning = getBooleanOrDefault(ADB_SYNC_USE_HIVE_STYLE_PARTITIONING);
supportTimestamp = getBooleanOrDefault(ADB_SYNC_SUPPORT_TIMESTAMP);
syncAsSparkDataSourceTable = getBooleanOrDefault(ADB_SYNC_SYNC_AS_SPARK_DATA_SOURCE_TABLE);
tableProperties = getString(ADB_SYNC_TABLE_PROPERTIES);
serdeProperties = getString(ADB_SYNC_SERDE_PROPERTIES);
sparkSchemaLengthThreshold = getIntOrDefault(ADB_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD);
dbLocation = getString(ADB_SYNC_DB_LOCATION);
autoCreateDatabase = getBooleanOrDefault(ADB_SYNC_AUTO_CREATE_DATABASE);
skipLastCommitTimeSync = getBooleanOrDefault(ADB_SYNC_SKIP_LAST_COMMIT_TIME_SYNC);
dropTableBeforeCreation = getBooleanOrDefault(ADB_SYNC_DROP_TABLE_BEFORE_CREATION);
}
/**
 * Copies the fields of the given config into a new {@link TypedProperties}, keyed by the
 * matching {@code ConfigProperty} keys.
 *
 * <p>Partition fields are joined with {@code ","}. Boolean and int fields are written via
 * {@code String.valueOf}, so a {@code Boolean} field that is {@code null} is stored as the
 * text {@code "null"}. String fields are put as-is.
 *
 * @param cfg config whose fields are copied
 * @return the populated properties
 */
public static TypedProperties toProps(AdbSyncConfig cfg) {
TypedProperties properties = new TypedProperties();
properties.put(META_SYNC_DATABASE_NAME.key(), cfg.databaseName);
properties.put(META_SYNC_TABLE_NAME.key(), cfg.tableName);
properties.put(ADB_SYNC_USER.key(), cfg.adbUser);
properties.put(ADB_SYNC_PASS.key(), cfg.adbPass);
properties.put(ADB_SYNC_JDBC_URL.key(), cfg.jdbcUrl);
properties.put(META_SYNC_BASE_PATH.key(), cfg.basePath);
properties.put(META_SYNC_PARTITION_FIELDS.key(), String.join(",", cfg.partitionFields));
properties.put(META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), cfg.partitionValueExtractorClass);
properties.put(META_SYNC_ASSUME_DATE_PARTITION.key(), String.valueOf(cfg.assumeDatePartitioning));
properties.put(ADB_SYNC_SKIP_RO_SUFFIX.key(), String.valueOf(cfg.skipROSuffix));
properties.put(ADB_SYNC_SKIP_RT_SYNC.key(), String.valueOf(cfg.skipRTSync));
properties.put(ADB_SYNC_USE_HIVE_STYLE_PARTITIONING.key(), String.valueOf(cfg.useHiveStylePartitioning));
properties.put(META_SYNC_USE_FILE_LISTING_FROM_METADATA.key(), String.valueOf(cfg.useFileListingFromMetadata));
properties.put(ADB_SYNC_SUPPORT_TIMESTAMP.key(), String.valueOf(cfg.supportTimestamp));
properties.put(ADB_SYNC_TABLE_PROPERTIES.key(), cfg.tableProperties);
properties.put(ADB_SYNC_SERDE_PROPERTIES.key(), cfg.serdeProperties);
properties.put(ADB_SYNC_SYNC_AS_SPARK_DATA_SOURCE_TABLE.key(), String.valueOf(cfg.syncAsSparkDataSourceTable));
properties.put(ADB_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD.key(), String.valueOf(cfg.sparkSchemaLengthThreshold));
properties.put(META_SYNC_SPARK_VERSION.key(), cfg.sparkVersion);
properties.put(ADB_SYNC_DB_LOCATION.key(), cfg.dbLocation);
properties.put(ADB_SYNC_AUTO_CREATE_DATABASE.key(), String.valueOf(cfg.autoCreateDatabase));
properties.put(ADB_SYNC_SKIP_LAST_COMMIT_TIME_SYNC.key(), String.valueOf(cfg.skipLastCommitTimeSync));
properties.put(ADB_SYNC_DROP_TABLE_BEFORE_CREATION.key(), String.valueOf(cfg.dropTableBeforeCreation));
return properties;
}
@Override
public String toString() {
return "AdbSyncConfig{"
+ "adbUser='" + adbUser + '\''
+ ", adbPass='" + adbPass + '\''
+ ", jdbcUrl='" + jdbcUrl + '\''
+ ", skipROSuffix=" + skipROSuffix
+ ", skipRTSync=" + skipRTSync
+ ", useHiveStylePartitioning=" + useHiveStylePartitioning
+ ", supportTimestamp=" + supportTimestamp
+ ", syncAsSparkDataSourceTable=" + syncAsSparkDataSourceTable
+ ", tableProperties='" + tableProperties + '\''
+ ", serdeProperties='" + serdeProperties + '\''
+ ", sparkSchemaLengthThreshold=" + sparkSchemaLengthThreshold
+ ", dbLocation='" + dbLocation + '\''
+ ", autoCreateDatabase=" + autoCreateDatabase
+ ", skipLastCommitTimeSync=" + skipLastCommitTimeSync
+ ", dropTableBeforeCreation=" + dropTableBeforeCreation
+ ", help=" + help
+ ", databaseName='" + databaseName + '\''
+ ", tableName='" + tableName + '\''
+ ", basePath='" + basePath + '\''
+ ", baseFileFormat='" + baseFileFormat + '\''
+ ", partitionFields=" + partitionFields
+ ", partitionValueExtractorClass='" + partitionValueExtractorClass + '\''
+ ", assumeDatePartitioning=" + assumeDatePartitioning
+ ", decodePartition=" + decodePartition
+ ", useFileListingFromMetadata=" + useFileListingFromMetadata
+ ", isConditionalSync=" + isConditionalSync
+ ", sparkVersion='" + sparkVersion + '\''
+ '}';
/**
 * Renders the configured table base path ({@code META_SYNC_BASE_PATH}) through
 * {@link #generateAbsolutePathStr(Path)}.
 *
 * @return the base path string produced by {@code generateAbsolutePathStr}
 */
public String getAbsoluteBasePath() {
  final String configuredBasePath = getString(META_SYNC_BASE_PATH);
  final Path tableBasePath = new Path(configuredBasePath);
  return generateAbsolutePathStr(tableBasePath);
}
/**
 * Resolves the database location as an absolute path string.
 *
 * <p>An explicitly configured {@code ADB_SYNC_DB_LOCATION} wins. Otherwise the location is
 * the parent of the table base path, or the base path itself when it is the root.
 *
 * @return the location rendered by {@link #generateAbsolutePathStr(Path)}
 */
public String getDatabasePath() {
  // The base path is parsed first, before the db location is consulted.
  final Path tableBasePath = new Path(getString(META_SYNC_BASE_PATH));
  final String configuredLocation = getString(ADB_SYNC_DB_LOCATION);

  final Path resolvedLocation;
  if (!StringUtils.isNullOrEmpty(configuredLocation)) {
    resolvedLocation = new Path(configuredLocation);
  } else {
    resolvedLocation = tableBasePath.isRoot() ? tableBasePath : tableBasePath.getParent();
  }
  return generateAbsolutePathStr(resolvedLocation);
}
/**
 * Renders the path as a string, prefixing {@link #getDefaultFs()} when the path has no URI
 * scheme, and guarantees a trailing {@code "/"}.
 *
 * @param path path to render
 * @return the rendered path string, ending with {@code "/"}
 */
public String generateAbsolutePathStr(Path path) {
  final String rawPath = path.toString();
  final boolean schemeMissing = path.toUri().getScheme() == null;
  final String qualifiedPath = schemeMissing ? getDefaultFs() + rawPath : rawPath;
  if (qualifiedPath.endsWith("/")) {
    return qualifiedPath;
  }
  return qualifiedPath + "/";
}
/**
 * Looks up the {@code fs.defaultFS} key in the configuration returned by
 * {@code getHadoopConf()}.
 *
 * @return the configured value; NOTE(review): presumably {@code null} when the key is
 *         unset — confirm against the configuration type returned by {@code getHadoopConf()}
 */
public String getDefaultFs() {
return getHadoopConf().get("fs.defaultFS");
}
/**
 * Command-line parameters for the ADB sync tool.
 *
 * <p>Reuses the Hive sync parameters through a JCommander delegate and adds the ADB-specific
 * flags declared below. {@link #toProps()} converts the parsed values into properties keyed
 * by the matching {@code ConfigProperty} keys.
 */
public static class AdbSyncConfigParams {

  @ParametersDelegate()
  public HiveSyncConfig.HiveSyncConfigParams hiveSyncConfigParams = new HiveSyncConfig.HiveSyncConfigParams();

  @Parameter(names = {"--support-timestamp"}, description = "If true, converts int64(timestamp_micros) to timestamp type")
  public Boolean supportTimestamp;
  @Parameter(names = {"--spark-datasource"}, description = "Whether sync this table as spark data source table")
  public Boolean syncAsSparkDataSourceTable;
  @Parameter(names = {"--table-properties"}, description = "Table properties, to support read hoodie table as datasource table", required = true)
  public String tableProperties;
  @Parameter(names = {"--serde-properties"}, description = "Serde properties, to support read hoodie table as datasource table", required = true)
  public String serdeProperties;
  @Parameter(names = {"--spark-schema-length-threshold"}, description = "The maximum length allowed in a single cell when storing additional schema information in Hive's metastore")
  public int sparkSchemaLengthThreshold;
  @Parameter(names = {"--hive-style-partitioning"}, description = "Whether use hive style partitioning, true if like the following style: field1=value1/field2=value2")
  public Boolean useHiveStylePartitioning;
  @Parameter(names = {"--skip-rt-sync"}, description = "Whether skip the rt table when syncing")
  public Boolean skipRTSync;
  @Parameter(names = {"--db-location"}, description = "Database location")
  public String dbLocation;
  @Parameter(names = {"--auto-create-database"}, description = "Whether auto create adb database")
  public Boolean autoCreateDatabase = true;
  @Parameter(names = {"--skip-last-commit-time-sync"}, description = "Whether skip last commit time syncing")
  public Boolean skipLastCommitTimeSync = false;
  @Parameter(names = {"--drop-table-before-creation"}, description = "Whether drop table before creation")
  public Boolean dropTableBeforeCreation = false;

  /**
   * @return whether help was requested, as reported by the delegated Hive sync parameters
   */
  public boolean isHelp() {
    return hiveSyncConfigParams.isHelp();
  }

  /**
   * Converts the parsed parameters into properties.
   *
   * <p>Starts from the properties produced by the delegated Hive sync parameters and then
   * sets the ADB keys. Values that were not supplied (still {@code null}) are passed to
   * {@code setPropertyIfNonNull} as {@code null} rather than as the text {@code "null"},
   * so they are left out of the result.
   *
   * @return the populated properties
   */
  public TypedProperties toProps() {
    final TypedProperties props = hiveSyncConfigParams.toProps();
    props.setPropertyIfNonNull(META_SYNC_DATABASE_NAME.key(), hiveSyncConfigParams.hoodieSyncConfigParams.databaseName);
    props.setPropertyIfNonNull(META_SYNC_TABLE_NAME.key(), hiveSyncConfigParams.hoodieSyncConfigParams.tableName);
    props.setPropertyIfNonNull(ADB_SYNC_USER.key(), hiveSyncConfigParams.hiveUser);
    props.setPropertyIfNonNull(ADB_SYNC_PASS.key(), hiveSyncConfigParams.hivePass);
    props.setPropertyIfNonNull(ADB_SYNC_JDBC_URL.key(), hiveSyncConfigParams.jdbcUrl);
    props.setPropertyIfNonNull(META_SYNC_BASE_PATH.key(), hiveSyncConfigParams.hoodieSyncConfigParams.basePath);
    props.setPropertyIfNonNull(META_SYNC_PARTITION_FIELDS.key(), String.join(",", hiveSyncConfigParams.hoodieSyncConfigParams.partitionFields));
    props.setPropertyIfNonNull(META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), hiveSyncConfigParams.hoodieSyncConfigParams.partitionValueExtractorClass);
    props.setPropertyIfNonNull(META_SYNC_ASSUME_DATE_PARTITION.key(), toStringOrNull(hiveSyncConfigParams.hoodieSyncConfigParams.assumeDatePartitioning));
    props.setPropertyIfNonNull(ADB_SYNC_SKIP_RO_SUFFIX.key(), toStringOrNull(hiveSyncConfigParams.skipROSuffix));
    props.setPropertyIfNonNull(ADB_SYNC_SKIP_RT_SYNC.key(), toStringOrNull(skipRTSync));
    props.setPropertyIfNonNull(ADB_SYNC_USE_HIVE_STYLE_PARTITIONING.key(), toStringOrNull(useHiveStylePartitioning));
    props.setPropertyIfNonNull(META_SYNC_USE_FILE_LISTING_FROM_METADATA.key(), toStringOrNull(hiveSyncConfigParams.hoodieSyncConfigParams.useFileListingFromMetadata));
    props.setPropertyIfNonNull(ADB_SYNC_SUPPORT_TIMESTAMP.key(), toStringOrNull(supportTimestamp));
    props.setPropertyIfNonNull(ADB_SYNC_TABLE_PROPERTIES.key(), tableProperties);
    props.setPropertyIfNonNull(ADB_SYNC_SERDE_PROPERTIES.key(), serdeProperties);
    props.setPropertyIfNonNull(ADB_SYNC_SYNC_AS_SPARK_DATA_SOURCE_TABLE.key(), toStringOrNull(syncAsSparkDataSourceTable));
    // NOTE(review): this is a primitive int, so it is always written (as "0" when the field
    // was never assigned) — confirm whether that should override a configured default.
    props.setPropertyIfNonNull(ADB_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD.key(), String.valueOf(sparkSchemaLengthThreshold));
    props.setPropertyIfNonNull(META_SYNC_SPARK_VERSION.key(), hiveSyncConfigParams.hoodieSyncConfigParams.sparkVersion);
    props.setPropertyIfNonNull(ADB_SYNC_DB_LOCATION.key(), dbLocation);
    props.setPropertyIfNonNull(ADB_SYNC_AUTO_CREATE_DATABASE.key(), toStringOrNull(autoCreateDatabase));
    props.setPropertyIfNonNull(ADB_SYNC_SKIP_LAST_COMMIT_TIME_SYNC.key(), toStringOrNull(skipLastCommitTimeSync));
    props.setPropertyIfNonNull(ADB_SYNC_DROP_TABLE_BEFORE_CREATION.key(), toStringOrNull(dropTableBeforeCreation));
    return props;
  }

  /**
   * Stringifies a value while keeping {@code null} as {@code null}; plain
   * {@code String.valueOf(Object)} would turn it into the text {@code "null"}.
   */
  private static String toStringOrNull(Object value) {
    return value == null ? null : String.valueOf(value);
  }
}
}

View File

@@ -18,22 +18,19 @@
package org.apache.hudi.sync.adb;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils;
import org.apache.hudi.hive.SchemaDifference;
import org.apache.hudi.hive.util.HiveSchemaUtil;
import org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent;
import org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent.PartitionEventType;
import org.apache.hudi.sync.common.AbstractSyncTool;
import org.apache.hudi.sync.common.HoodieSyncTool;
import org.apache.hudi.sync.common.model.PartitionEvent;
import org.apache.hudi.sync.common.model.PartitionEvent.PartitionEventType;
import org.apache.hudi.sync.common.util.ConfigUtils;
import org.apache.hudi.sync.common.util.SparkDataSourceTableUtils;
import com.beust.jcommander.JCommander;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat;
import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe;
import org.apache.parquet.schema.MessageType;
@@ -43,8 +40,25 @@ import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.stream.Collectors;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_AUTO_CREATE_DATABASE;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_DROP_TABLE_BEFORE_CREATION;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_SERDE_PROPERTIES;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_SKIP_LAST_COMMIT_TIME_SYNC;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_SKIP_RO_SUFFIX;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_SKIP_RT_SYNC;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_SUPPORT_TIMESTAMP;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_SYNC_AS_SPARK_DATA_SOURCE_TABLE;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_TABLE_PROPERTIES;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_SPARK_VERSION;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME;
/**
* Adb sync tool is mainly used to sync hoodie tables to Alibaba Cloud AnalyticDB(ADB),
* it can be used as API `AdbSyncTool.syncHoodieTable(AdbSyncConfig)` or as command
@@ -55,45 +69,52 @@ import java.util.stream.Collectors;
* incremental partitions will be synced as well.
*/
@SuppressWarnings("WeakerAccess")
public class AdbSyncTool extends AbstractSyncTool {
public class AdbSyncTool extends HoodieSyncTool {
private static final Logger LOG = LoggerFactory.getLogger(AdbSyncTool.class);
public static final String SUFFIX_SNAPSHOT_TABLE = "_rt";
public static final String SUFFIX_READ_OPTIMIZED_TABLE = "_ro";
private final AdbSyncConfig adbSyncConfig;
private final AbstractAdbSyncHoodieClient hoodieAdbClient;
private final AdbSyncConfig config;
private final String databaseName;
private final String tableName;
private final HoodieAdbJdbcClient syncClient;
private final String snapshotTableName;
private final Option<String> roTableTableName;
public AdbSyncTool(TypedProperties props, Configuration conf, FileSystem fs) {
super(props, conf, fs);
this.adbSyncConfig = new AdbSyncConfig(props);
this.hoodieAdbClient = getHoodieAdbClient(adbSyncConfig, fs);
switch (hoodieAdbClient.getTableType()) {
public AdbSyncTool(Properties props) {
super(props);
this.config = new AdbSyncConfig(props);
this.databaseName = config.getString(META_SYNC_DATABASE_NAME);
this.tableName = config.getString(META_SYNC_TABLE_NAME);
this.syncClient = new HoodieAdbJdbcClient(config);
switch (syncClient.getTableType()) {
case COPY_ON_WRITE:
this.snapshotTableName = adbSyncConfig.tableName;
this.snapshotTableName = tableName;
this.roTableTableName = Option.empty();
break;
case MERGE_ON_READ:
this.snapshotTableName = adbSyncConfig.tableName + SUFFIX_SNAPSHOT_TABLE;
this.roTableTableName = adbSyncConfig.skipROSuffix ? Option.of(adbSyncConfig.tableName)
: Option.of(adbSyncConfig.tableName + SUFFIX_READ_OPTIMIZED_TABLE);
this.snapshotTableName = tableName + SUFFIX_SNAPSHOT_TABLE;
this.roTableTableName = config.getBoolean(ADB_SYNC_SKIP_RO_SUFFIX) ? Option.of(tableName)
: Option.of(tableName + SUFFIX_READ_OPTIMIZED_TABLE);
break;
default:
throw new HoodieAdbSyncException("Unknown table type:" + hoodieAdbClient.getTableType()
+ ", basePath:" + hoodieAdbClient.getBasePath());
throw new HoodieAdbSyncException("Unknown table type:" + syncClient.getTableType()
+ ", basePath:" + syncClient.getBasePath());
}
}
private AbstractAdbSyncHoodieClient getHoodieAdbClient(AdbSyncConfig adbSyncConfig, FileSystem fs) {
return new HoodieAdbJdbcClient(adbSyncConfig, fs);
/**
 * Closes the underlying sync client, if one is set.
 */
@Override
public void close() {
  if (syncClient == null) {
    return;
  }
  syncClient.close();
}
@Override
public void syncHoodieTable() {
try {
switch (hoodieAdbClient.getTableType()) {
switch (syncClient.getTableType()) {
case COPY_ON_WRITE:
syncHoodieTable(snapshotTableName, false, false);
break;
@@ -101,39 +122,38 @@ public class AdbSyncTool extends AbstractSyncTool {
// Sync a ro table for MOR table
syncHoodieTable(roTableTableName.get(), false, true);
// Sync a rt table for MOR table
if (!adbSyncConfig.skipRTSync) {
if (!config.getBoolean(ADB_SYNC_SKIP_RT_SYNC)) {
syncHoodieTable(snapshotTableName, true, false);
}
break;
default:
throw new HoodieAdbSyncException("Unknown table type:" + hoodieAdbClient.getTableType()
+ ", basePath:" + hoodieAdbClient.getBasePath());
throw new HoodieAdbSyncException("Unknown table type:" + syncClient.getTableType()
+ ", basePath:" + syncClient.getBasePath());
}
} catch (Exception re) {
throw new HoodieAdbSyncException("Sync hoodie table to ADB failed, tableName:" + adbSyncConfig.tableName, re);
throw new HoodieAdbSyncException("Sync hoodie table to ADB failed, tableName:" + tableName, re);
} finally {
hoodieAdbClient.close();
syncClient.close();
}
}
private void syncHoodieTable(String tableName, boolean useRealtimeInputFormat,
boolean readAsOptimized) throws Exception {
private void syncHoodieTable(String tableName, boolean useRealtimeInputFormat, boolean readAsOptimized) throws Exception {
LOG.info("Try to sync hoodie table, tableName:{}, path:{}, tableType:{}",
tableName, hoodieAdbClient.getBasePath(), hoodieAdbClient.getTableType());
tableName, syncClient.getBasePath(), syncClient.getTableType());
if (adbSyncConfig.autoCreateDatabase) {
if (config.getBoolean(ADB_SYNC_AUTO_CREATE_DATABASE)) {
try {
synchronized (AdbSyncTool.class) {
if (!hoodieAdbClient.databaseExists(adbSyncConfig.databaseName)) {
hoodieAdbClient.createDatabase(adbSyncConfig.databaseName);
if (!syncClient.databaseExists(databaseName)) {
syncClient.createDatabase(databaseName);
}
}
} catch (Exception e) {
throw new HoodieAdbSyncException("Failed to create database:" + adbSyncConfig.databaseName
throw new HoodieAdbSyncException("Failed to create database:" + databaseName
+ ", useRealtimeInputFormat = " + useRealtimeInputFormat, e);
}
} else if (!hoodieAdbClient.databaseExists(adbSyncConfig.databaseName)) {
throw new HoodieAdbSyncException("ADB database does not exists:" + adbSyncConfig.databaseName);
} else if (!syncClient.databaseExists(databaseName)) {
throw new HoodieAdbSyncException("ADB database does not exists:" + databaseName);
}
// Currently HoodieBootstrapRelation does not support reading bootstrap MOR rt table,
@@ -141,22 +161,22 @@ public class AdbSyncTool extends AbstractSyncTool {
// by the data source way (which will use the HoodieBootstrapRelation).
// TODO after we support bootstrap MOR rt table in HoodieBootstrapRelation[HUDI-2071],
// we can remove this logic.
if (hoodieAdbClient.isBootstrap()
&& hoodieAdbClient.getTableType() == HoodieTableType.MERGE_ON_READ
if (syncClient.isBootstrap()
&& syncClient.getTableType() == HoodieTableType.MERGE_ON_READ
&& !readAsOptimized) {
adbSyncConfig.syncAsSparkDataSourceTable = false;
config.setValue(ADB_SYNC_SYNC_AS_SPARK_DATA_SOURCE_TABLE, "false");
LOG.info("Disable sync as spark datasource table for mor rt table:{}", tableName);
}
if (adbSyncConfig.dropTableBeforeCreation) {
if (config.getBoolean(ADB_SYNC_DROP_TABLE_BEFORE_CREATION)) {
LOG.info("Drop table before creation, tableName:{}", tableName);
hoodieAdbClient.dropTable(tableName);
syncClient.dropTable(tableName);
}
boolean tableExists = hoodieAdbClient.tableExists(tableName);
boolean tableExists = syncClient.tableExists(tableName);
// Get the parquet schema for this table looking at the latest commit
MessageType schema = hoodieAdbClient.getDataSchema();
MessageType schema = syncClient.getStorageSchema();
// Sync schema if needed
syncSchema(tableName, tableExists, useRealtimeInputFormat, readAsOptimized, schema);
@@ -165,16 +185,16 @@ public class AdbSyncTool extends AbstractSyncTool {
// Get the last time we successfully synced partitions
Option<String> lastCommitTimeSynced = Option.empty();
if (tableExists) {
lastCommitTimeSynced = hoodieAdbClient.getLastCommitTimeSynced(tableName);
lastCommitTimeSynced = syncClient.getLastCommitTimeSynced(tableName);
}
LOG.info("Last commit time synced was found:{}", lastCommitTimeSynced.orElse("null"));
// Scan synced partitions
List<String> writtenPartitionsSince;
if (adbSyncConfig.partitionFields.isEmpty()) {
if (config.getSplitStrings(META_SYNC_PARTITION_FIELDS).isEmpty()) {
writtenPartitionsSince = new ArrayList<>();
} else {
writtenPartitionsSince = hoodieAdbClient.getPartitionsWrittenToSince(lastCommitTimeSynced);
writtenPartitionsSince = syncClient.getPartitionsWrittenToSince(lastCommitTimeSynced);
}
LOG.info("Scan partitions complete, partitionNum:{}", writtenPartitionsSince.size());
@@ -183,8 +203,8 @@ public class AdbSyncTool extends AbstractSyncTool {
// Update sync commit time
// whether to skip syncing commit time stored in tbl properties, since it is time consuming.
if (!adbSyncConfig.skipLastCommitTimeSync) {
hoodieAdbClient.updateLastCommitTimeSynced(tableName);
if (!config.getBoolean(ADB_SYNC_SKIP_LAST_COMMIT_TIME_SYNC)) {
syncClient.updateLastCommitTimeSynced(tableName);
}
LOG.info("Sync complete for table:{}", tableName);
}
@@ -200,14 +220,14 @@ public class AdbSyncTool extends AbstractSyncTool {
* @param schema The extracted schema
*/
private void syncSchema(String tableName, boolean tableExists, boolean useRealTimeInputFormat,
boolean readAsOptimized, MessageType schema) throws Exception {
boolean readAsOptimized, MessageType schema) {
// Append spark table properties & serde properties
Map<String, String> tableProperties = ConfigUtils.toMap(adbSyncConfig.tableProperties);
Map<String, String> serdeProperties = ConfigUtils.toMap(adbSyncConfig.serdeProperties);
if (adbSyncConfig.syncAsSparkDataSourceTable) {
Map<String, String> sparkTableProperties = getSparkTableProperties(adbSyncConfig.partitionFields,
adbSyncConfig.sparkVersion, adbSyncConfig.sparkSchemaLengthThreshold, schema);
Map<String, String> sparkSerdeProperties = getSparkSerdeProperties(readAsOptimized, adbSyncConfig.basePath);
Map<String, String> tableProperties = ConfigUtils.toMap(config.getString(ADB_SYNC_TABLE_PROPERTIES));
Map<String, String> serdeProperties = ConfigUtils.toMap(config.getString(ADB_SYNC_SERDE_PROPERTIES));
if (config.getBoolean(ADB_SYNC_SYNC_AS_SPARK_DATA_SOURCE_TABLE)) {
Map<String, String> sparkTableProperties = SparkDataSourceTableUtils.getSparkTableProperties(config.getSplitStrings(META_SYNC_PARTITION_FIELDS),
config.getString(META_SYNC_SPARK_VERSION), config.getInt(ADB_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD), schema);
Map<String, String> sparkSerdeProperties = SparkDataSourceTableUtils.getSparkSerdeProperties(readAsOptimized, config.getString(META_SYNC_BASE_PATH));
tableProperties.putAll(sparkTableProperties);
serdeProperties.putAll(sparkSerdeProperties);
LOG.info("Sync as spark datasource table, tableName:{}, tableExists:{}, tableProperties:{}, sederProperties:{}",
@@ -222,16 +242,16 @@ public class AdbSyncTool extends AbstractSyncTool {
// Custom serde will not work with ALTER TABLE REPLACE COLUMNS
// https://github.com/apache/hive/blob/release-1.1.0/ql/src/java/org/apache/hadoop/hive
// /ql/exec/DDLTask.java#L3488
hoodieAdbClient.createTable(tableName, schema, inputFormatClassName, MapredParquetOutputFormat.class.getName(),
syncClient.createTable(tableName, schema, inputFormatClassName, MapredParquetOutputFormat.class.getName(),
ParquetHiveSerDe.class.getName(), serdeProperties, tableProperties);
} else {
// Check if the table schema has evolved
Map<String, String> tableSchema = hoodieAdbClient.getTableSchema(tableName);
SchemaDifference schemaDiff = HiveSchemaUtil.getSchemaDifference(schema, tableSchema, adbSyncConfig.partitionFields,
adbSyncConfig.supportTimestamp);
Map<String, String> tableSchema = syncClient.getMetastoreSchema(tableName);
SchemaDifference schemaDiff = HiveSchemaUtil.getSchemaDifference(schema, tableSchema, config.getSplitStrings(META_SYNC_PARTITION_FIELDS),
config.getBoolean(ADB_SYNC_SUPPORT_TIMESTAMP));
if (!schemaDiff.isEmpty()) {
LOG.info("Schema difference found for table:{}", tableName);
hoodieAdbClient.updateTableDefinition(tableName, schemaDiff);
syncClient.updateTableDefinition(tableName, schemaDiff);
} else {
LOG.info("No Schema difference for table:{}", tableName);
}
@@ -244,19 +264,19 @@ public class AdbSyncTool extends AbstractSyncTool {
*/
private void syncPartitions(String tableName, List<String> writtenPartitionsSince) {
try {
if (adbSyncConfig.partitionFields.isEmpty()) {
if (config.getSplitStrings(META_SYNC_PARTITION_FIELDS).isEmpty()) {
LOG.info("Not a partitioned table.");
return;
}
Map<List<String>, String> partitions = hoodieAdbClient.scanTablePartitions(tableName);
List<PartitionEvent> partitionEvents = hoodieAdbClient.getPartitionEvents(partitions, writtenPartitionsSince);
Map<List<String>, String> partitions = syncClient.scanTablePartitions(tableName);
List<PartitionEvent> partitionEvents = syncClient.getPartitionEvents(partitions, writtenPartitionsSince);
List<String> newPartitions = filterPartitions(partitionEvents, PartitionEventType.ADD);
LOG.info("New Partitions:{}", newPartitions);
hoodieAdbClient.addPartitionsToTable(tableName, newPartitions);
syncClient.addPartitionsToTable(tableName, newPartitions);
List<String> updatePartitions = filterPartitions(partitionEvents, PartitionEventType.UPDATE);
LOG.info("Changed Partitions:{}", updatePartitions);
hoodieAdbClient.updatePartitionsToTable(tableName, updatePartitions);
syncClient.updatePartitionsToTable(tableName, updatePartitions);
} catch (Exception e) {
throw new HoodieAdbSyncException("Failed to sync partitions for table:" + tableName, e);
}
@@ -268,16 +288,13 @@ public class AdbSyncTool extends AbstractSyncTool {
}
public static void main(String[] args) {
// parse the params
final AdbSyncConfig cfg = new AdbSyncConfig();
JCommander cmd = new JCommander(cfg, null, args);
if (cfg.help || args.length == 0) {
final AdbSyncConfig.AdbSyncConfigParams params = new AdbSyncConfig.AdbSyncConfigParams();
JCommander cmd = JCommander.newBuilder().addObject(params).build();
cmd.parse(args);
if (params.isHelp()) {
cmd.usage();
System.exit(1);
System.exit(0);
}
Configuration hadoopConf = new Configuration();
FileSystem fs = FSUtils.getFs(cfg.basePath, hadoopConf);
new AdbSyncTool(AdbSyncConfig.toProps(cfg), hadoopConf, fs).syncHoodieTable();
new AdbSyncTool(params.toProps()).syncHoodieTable();
}
}

View File

@@ -23,12 +23,12 @@ import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.hudi.hive.HoodieHiveSyncException;
import org.apache.hudi.hive.SchemaDifference;
import org.apache.hudi.hive.util.HiveSchemaUtil;
import org.apache.hudi.sync.common.HoodieSyncClient;
import org.apache.hudi.sync.common.model.PartitionEvent;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.schema.MessageType;
import org.slf4j.Logger;
@@ -47,13 +47,21 @@ import java.util.List;
import java.util.Map;
import java.util.function.Function;
public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_JDBC_URL;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_PASS;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_USER;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_USE_HIVE_STYLE_PARTITIONING;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS;
public class HoodieAdbJdbcClient extends HoodieSyncClient {
private static final Logger LOG = LoggerFactory.getLogger(HoodieAdbJdbcClient.class);
public static final String HOODIE_LAST_COMMIT_TIME_SYNC = "hoodie_last_sync";
// Make sure we have the jdbc driver in classpath
private static final String DRIVER_NAME = "com.mysql.jdbc.Driver";
public static final String ADB_ESCAPE_CHARACTER = "";
private static final String TBL_PROPERTIES_STR = "TBLPROPERTIES";
static {
@@ -64,12 +72,16 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
}
}
private final AdbSyncConfig config;
private final String databaseName;
private Connection connection;
public HoodieAdbJdbcClient(AdbSyncConfig syncConfig, FileSystem fs) {
super(syncConfig, fs);
public HoodieAdbJdbcClient(AdbSyncConfig config) {
super(config);
this.config = config;
this.databaseName = config.getString(META_SYNC_DATABASE_NAME);
createAdbConnection();
LOG.info("Init adb jdbc client success, jdbcUrl:{}", syncConfig.jdbcUrl);
LOG.info("Init adb jdbc client success, jdbcUrl:{}", config.getString(ADB_SYNC_JDBC_URL));
}
private void createAdbConnection() {
@@ -82,7 +94,9 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
}
try {
this.connection = DriverManager.getConnection(
adbSyncConfig.jdbcUrl, adbSyncConfig.adbUser, adbSyncConfig.adbPass);
config.getString(ADB_SYNC_JDBC_URL),
config.getString(ADB_SYNC_USER),
config.getString(ADB_SYNC_PASS));
} catch (SQLException e) {
throw new HoodieException("Cannot create adb connection ", e);
}
@@ -91,12 +105,12 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
@Override
public void createTable(String tableName, MessageType storageSchema, String inputFormatClass,
String outputFormatClass, String serdeClass,
Map<String, String> serdeProperties, Map<String, String> tableProperties) {
String outputFormatClass, String serdeClass,
Map<String, String> serdeProperties, Map<String, String> tableProperties) {
try {
LOG.info("Creating table:{}", tableName);
String createSQLQuery = HiveSchemaUtil.generateCreateDDL(tableName, storageSchema,
getHiveSyncConfig(), inputFormatClass, outputFormatClass, serdeClass, serdeProperties, tableProperties);
config, inputFormatClass, outputFormatClass, serdeClass, serdeProperties, tableProperties);
executeAdbSql(createSQLQuery);
} catch (IOException e) {
throw new HoodieException("Fail to create table:" + tableName, e);
@@ -106,17 +120,18 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
@Override
public void dropTable(String tableName) {
LOG.info("Dropping table:{}", tableName);
String dropTable = "drop table if exists `" + adbSyncConfig.databaseName + "`.`" + tableName + "`";
String dropTable = "drop table if exists `" + databaseName + "`.`" + tableName + "`";
executeAdbSql(dropTable);
}
public Map<String, String> getTableSchema(String tableName) {
@Override
public Map<String, String> getMetastoreSchema(String tableName) {
Map<String, String> schema = new HashMap<>();
ResultSet result = null;
try {
DatabaseMetaData databaseMetaData = connection.getMetaData();
result = databaseMetaData.getColumns(adbSyncConfig.databaseName,
adbSyncConfig.databaseName, tableName, null);
result = databaseMetaData.getColumns(databaseName,
databaseName, tableName, null);
while (result.next()) {
String columnName = result.getString(4);
String columnType = result.getString(6);
@@ -174,7 +189,7 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
}
public void createDatabase(String databaseName) {
String rootPath = getDatabasePath();
String rootPath = config.getDatabasePath();
LOG.info("Creating database:{}, databaseLocation:{}", databaseName, rootPath);
String sql = constructCreateDatabaseSql(rootPath);
executeAdbSql(sql);
@@ -197,7 +212,7 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
}
@Override
public boolean doesTableExist(String tableName) {
public boolean tableExists(String tableName) {
String sql = constructShowLikeTableSql(tableName);
Function<ResultSet, Boolean> transform = resultSet -> {
try {
@@ -209,11 +224,6 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
return executeQuerySQL(sql, transform);
}
@Override
public boolean tableExists(String tableName) {
return doesTableExist(tableName);
}
@Override
public Option<String> getLastCommitTimeSynced(String tableName) {
String sql = constructShowCreateTableSql(tableName);
@@ -251,7 +261,7 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
@Override
public void updateLastCommitTimeSynced(String tableName) {
// Set the last commit time from the TBLProperties
String lastCommitSynced = activeTimeline.lastInstant().get().getTimestamp();
String lastCommitSynced = getActiveTimeline().lastInstant().get().getTimestamp();
try {
String sql = constructUpdateTblPropertiesSql(tableName, lastCommitSynced);
executeAdbSql(sql);
@@ -275,6 +285,11 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
throw new UnsupportedOperationException("Not support deleteLastReplicatedTimeStamp yet");
}
/**
 * Not supported by this client.
 *
 * @param tableName       name of the table whose properties would be updated (unused)
 * @param tableProperties properties that would be applied (unused)
 * @throws UnsupportedOperationException always
 */
@Override
public void updateTableProperties(String tableName, Map<String, String> tableProperties) {
throw new UnsupportedOperationException("Not support updateTableProperties yet");
}
@Override
public void updatePartitionsToTable(String tableName, List<String> changedPartitions) {
if (changedPartitions.isEmpty()) {
@@ -294,6 +309,9 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
throw new UnsupportedOperationException("Not support dropPartitions yet.");
}
/**
* TODO migrate to implementation of {@link #getAllPartitions(String)}
*/
public Map<List<String>, String> scanTablePartitions(String tableName) {
String sql = constructShowPartitionSql(tableName);
Function<ResultSet, Map<List<String>, String>> transform = resultSet -> {
@@ -304,7 +322,7 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
String str = resultSet.getString(1);
if (!StringUtils.isNullOrEmpty(str)) {
List<String> values = partitionValueExtractor.extractPartitionValuesInPath(str);
Path storagePartitionPath = FSUtils.getPartitionPath(adbSyncConfig.basePath, String.join("/", values));
Path storagePartitionPath = FSUtils.getPartitionPath(config.getString(META_SYNC_BASE_PATH), String.join("/", values));
String fullStoragePartitionPath = Path.getPathWithoutSchemeAndAuthority(storagePartitionPath).toUri().getPath();
partitions.put(values, fullStoragePartitionPath);
}
@@ -318,6 +336,9 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
return executeQuerySQL(sql, transform);
}
/**
* TODO align with {@link org.apache.hudi.sync.common.HoodieMetaSyncOperations#updateTableSchema}
*/
public void updateTableDefinition(String tableName, SchemaDifference schemaDiff) {
LOG.info("Adding columns for table:{}", tableName);
schemaDiff.getAddColumnTypes().forEach((columnName, columnType) ->
@@ -332,12 +353,12 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
private String constructAddPartitionsSql(String tableName, List<String> partitions) {
StringBuilder sqlBuilder = new StringBuilder("alter table `");
sqlBuilder.append(adbSyncConfig.databaseName).append("`").append(".`")
sqlBuilder.append(databaseName).append("`").append(".`")
.append(tableName).append("`").append(" add if not exists ");
for (String partition : partitions) {
String partitionClause = getPartitionClause(partition);
Path partitionPath = FSUtils.getPartitionPath(adbSyncConfig.basePath, partition);
String fullPartitionPathStr = generateAbsolutePathStr(partitionPath);
Path partitionPath = FSUtils.getPartitionPath(config.getString(META_SYNC_BASE_PATH), partition);
String fullPartitionPathStr = config.generateAbsolutePathStr(partitionPath);
sqlBuilder.append(" partition (").append(partitionClause).append(") location '")
.append(fullPartitionPathStr).append("' ");
}
@@ -347,14 +368,14 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
private List<String> constructChangePartitionsSql(String tableName, List<String> partitions) {
List<String> changePartitions = new ArrayList<>();
String useDatabase = "use `" + adbSyncConfig.databaseName + "`";
String useDatabase = "use `" + databaseName + "`";
changePartitions.add(useDatabase);
String alterTable = "alter table `" + tableName + "`";
for (String partition : partitions) {
String partitionClause = getPartitionClause(partition);
Path partitionPath = FSUtils.getPartitionPath(adbSyncConfig.basePath, partition);
String fullPartitionPathStr = generateAbsolutePathStr(partitionPath);
Path partitionPath = FSUtils.getPartitionPath(config.getString(META_SYNC_BASE_PATH), partition);
String fullPartitionPathStr = config.generateAbsolutePathStr(partitionPath);
String changePartition = alterTable + " add if not exists partition (" + partitionClause
+ ") location '" + fullPartitionPathStr + "'";
changePartitions.add(changePartition);
@@ -371,32 +392,32 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
*/
private String getPartitionClause(String partition) {
List<String> partitionValues = partitionValueExtractor.extractPartitionValuesInPath(partition);
ValidationUtils.checkArgument(adbSyncConfig.partitionFields.size() == partitionValues.size(),
"Partition key parts " + adbSyncConfig.partitionFields
ValidationUtils.checkArgument(config.getSplitStrings(META_SYNC_PARTITION_FIELDS).size() == partitionValues.size(),
"Partition key parts " + config.getSplitStrings(META_SYNC_PARTITION_FIELDS)
+ " does not match with partition values " + partitionValues + ". Check partition strategy. ");
List<String> partBuilder = new ArrayList<>();
for (int i = 0; i < adbSyncConfig.partitionFields.size(); i++) {
partBuilder.add(adbSyncConfig.partitionFields.get(i) + "='" + partitionValues.get(i) + "'");
for (int i = 0; i < config.getSplitStrings(META_SYNC_PARTITION_FIELDS).size(); i++) {
partBuilder.add(config.getSplitStrings(META_SYNC_PARTITION_FIELDS).get(i) + "='" + partitionValues.get(i) + "'");
}
return String.join(",", partBuilder);
}
private String constructShowPartitionSql(String tableName) {
return String.format("show partitions `%s`.`%s`", adbSyncConfig.databaseName, tableName);
return String.format("show partitions `%s`.`%s`", databaseName, tableName);
}
private String constructShowCreateTableSql(String tableName) {
return String.format("show create table `%s`.`%s`", adbSyncConfig.databaseName, tableName);
return String.format("show create table `%s`.`%s`", databaseName, tableName);
}
private String constructShowLikeTableSql(String tableName) {
return String.format("show tables from `%s` like '%s'", adbSyncConfig.databaseName, tableName);
return String.format("show tables from `%s` like '%s'", databaseName, tableName);
}
private String constructCreateDatabaseSql(String rootPath) {
return String.format("create database if not exists `%s` with dbproperties(catalog = 'oss', location = '%s')",
adbSyncConfig.databaseName, rootPath);
databaseName, rootPath);
}
private String constructShowCreateDatabaseSql(String databaseName) {
@@ -405,26 +426,69 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
private String constructUpdateTblPropertiesSql(String tableName, String lastCommitSynced) {
return String.format("alter table `%s`.`%s` set tblproperties('%s' = '%s')",
adbSyncConfig.databaseName, tableName, HOODIE_LAST_COMMIT_TIME_SYNC, lastCommitSynced);
databaseName, tableName, HOODIE_LAST_COMMIT_TIME_SYNC, lastCommitSynced);
}
private String constructAddColumnSql(String tableName, String columnName, String columnType) {
return String.format("alter table `%s`.`%s` add columns(`%s` %s)",
adbSyncConfig.databaseName, tableName, columnName, columnType);
databaseName, tableName, columnName, columnType);
}
private String constructChangeColumnSql(String tableName, String columnName, String columnType) {
return String.format("alter table `%s`.`%s` change `%s` `%s` %s",
adbSyncConfig.databaseName, tableName, columnName, columnName, columnType);
databaseName, tableName, columnName, columnName, columnType);
}
private HiveSyncConfig getHiveSyncConfig() {
HiveSyncConfig hiveSyncConfig = new HiveSyncConfig();
hiveSyncConfig.partitionFields = adbSyncConfig.partitionFields;
hiveSyncConfig.databaseName = adbSyncConfig.databaseName;
Path basePath = new Path(adbSyncConfig.basePath);
hiveSyncConfig.basePath = generateAbsolutePathStr(basePath);
return hiveSyncConfig;
/**
 * Compares the partitions already known for the table with the partitions found on storage
 * and reports which storage partitions need to be added or updated.
 *
 * <p>Table partitions are indexed by their values joined with {@code ", "}. For each storage
 * partition whose extracted values are non-empty, an ADD event is produced when no table partition
 * has the same joined values, and an UPDATE event when one exists but its path differs from the
 * storage partition path (scheme and authority stripped). Storage partitions with no extracted
 * values produce no event.
 *
 * <p>TODO align with {@link HoodieSyncClient#getPartitionEvents}
 *
 * @param tablePartitions            partition values mapped to the full partition path of the table
 * @param partitionStoragePartitions partition paths found on storage, relative to the base path
 * @return the add/update events, in the iteration order of {@code partitionStoragePartitions}
 */
public List<PartitionEvent> getPartitionEvents(Map<List<String>, String> tablePartitions, List<String> partitionStoragePartitions) {
  // Index the table partitions by their comma-joined values for lookup below.
  Map<String, String> tablePathsByValues = new HashMap<>();
  tablePartitions.forEach((values, tablePath) -> tablePathsByValues.put(String.join(", ", values), tablePath));

  List<PartitionEvent> partitionEvents = new ArrayList<>();
  for (String storagePartition : partitionStoragePartitions) {
    String storagePath = schemelessPartitionPath(storagePartition);
    List<String> storageValues = partitionValueExtractor.extractPartitionValuesInPath(storagePartition);
    if (config.getBoolean(ADB_SYNC_USE_HIVE_STYLE_PARTITIONING)) {
      // With hive-style partitioning the comparison path is rebuilt from the extracted values only.
      storagePath = schemelessPartitionPath(String.join("/", storageValues));
    }
    if (storageValues.isEmpty()) {
      continue;
    }
    String joinedValues = String.join(", ", storageValues);
    if (!tablePathsByValues.containsKey(joinedValues)) {
      partitionEvents.add(PartitionEvent.newPartitionAddEvent(storagePartition));
    } else if (!tablePathsByValues.get(joinedValues).equals(storagePath)) {
      partitionEvents.add(PartitionEvent.newPartitionUpdateEvent(storagePartition));
    }
  }
  return partitionEvents;
}

/**
 * Resolves {@code partition} against the configured base path and returns the path component
 * with scheme and authority removed.
 */
private String schemelessPartitionPath(String partition) {
  Path partitionPath = FSUtils.getPartitionPath(config.getString(META_SYNC_BASE_PATH), partition);
  return Path.getPathWithoutSchemeAndAuthority(partitionPath).toUri().getPath();
}
/**
 * Closes the given statement and then the given result set, skipping whichever is {@code null}.
 * A {@link SQLException} raised by either close is logged as a warning and not rethrown.
 *
 * @param resultSet result set to close, may be {@code null}
 * @param stmt      statement to close, may be {@code null}
 */
public void closeQuietly(ResultSet resultSet, Statement stmt) {
  if (stmt != null) {
    try {
      stmt.close();
    } catch (SQLException closeFailure) {
      LOG.warn("Could not close the statement opened ", closeFailure);
    }
  }
  if (resultSet != null) {
    try {
      resultSet.close();
    } catch (SQLException closeFailure) {
      LOG.warn("Could not close the resultset opened ", closeFailure);
    }
  }
}
@Override

View File

@@ -19,47 +19,72 @@
package org.apache.hudi.sync.adb;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.sync.common.util.ConfigUtils;
import org.junit.jupiter.api.Test;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_DB_LOCATION;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_JDBC_URL;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_PASS;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_SERDE_PROPERTIES;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_SKIP_RO_SUFFIX;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_TABLE_PROPERTIES;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_USER;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_ASSUME_DATE_PARTITION;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME;
import static org.junit.jupiter.api.Assertions.assertEquals;
public class TestAdbSyncConfig {
@Test
public void testCopy() {
AdbSyncConfig adbSyncConfig = new AdbSyncConfig();
adbSyncConfig.partitionFields = Arrays.asList("a", "b");
adbSyncConfig.basePath = "/tmp";
adbSyncConfig.assumeDatePartitioning = true;
adbSyncConfig.databaseName = "test";
adbSyncConfig.tableName = "test";
adbSyncConfig.adbUser = "adb";
adbSyncConfig.adbPass = "adb";
adbSyncConfig.jdbcUrl = "jdbc:mysql://localhost:3306";
adbSyncConfig.skipROSuffix = false;
adbSyncConfig.tableProperties = "spark.sql.sources.provider= 'hudi'\\n"
+ "spark.sql.sources.schema.numParts = '1'\\n "
+ "spark.sql.sources.schema.part.0 ='xx'\\n "
+ "spark.sql.sources.schema.numPartCols = '1'\\n"
+ "spark.sql.sources.schema.partCol.0 = 'dt'";
adbSyncConfig.serdeProperties = "'path'='/tmp/test_db/tbl'";
adbSyncConfig.dbLocation = "file://tmp/test_db";
public void testInitConfig() {
Properties props = new Properties();
props.setProperty(META_SYNC_PARTITION_FIELDS.key(), "a,b");
props.setProperty(META_SYNC_BASE_PATH.key(), "/tmp");
props.setProperty(META_SYNC_ASSUME_DATE_PARTITION.key(), "true");
props.setProperty(META_SYNC_DATABASE_NAME.key(), "test");
props.setProperty(META_SYNC_TABLE_NAME.key(), "test");
props.setProperty(ADB_SYNC_USER.key(), "adb");
props.setProperty(ADB_SYNC_PASS.key(), "adb");
props.setProperty(ADB_SYNC_JDBC_URL.key(), "jdbc:mysql://localhost:3306");
props.setProperty(ADB_SYNC_SKIP_RO_SUFFIX.key(), "false");
String tableProps = "spark.sql.sources.provider=hudi\n"
+ "spark.sql.sources.schema.numParts=1\n"
+ "spark.sql.sources.schema.part.0=xx\n"
+ "spark.sql.sources.schema.numPartCols=1\n"
+ "spark.sql.sources.schema.partCol.0=dt";
props.setProperty(ADB_SYNC_TABLE_PROPERTIES.key(), tableProps);
props.setProperty(ADB_SYNC_SERDE_PROPERTIES.key(), "path=/tmp/test_db/tbl");
props.setProperty(ADB_SYNC_DB_LOCATION.key(), "file://tmp/test_db");
TypedProperties props = AdbSyncConfig.toProps(adbSyncConfig);
AdbSyncConfig copied = new AdbSyncConfig(props);
assertEquals(copied.partitionFields, adbSyncConfig.partitionFields);
assertEquals(copied.basePath, adbSyncConfig.basePath);
assertEquals(copied.assumeDatePartitioning, adbSyncConfig.assumeDatePartitioning);
assertEquals(copied.databaseName, adbSyncConfig.databaseName);
assertEquals(copied.tableName, adbSyncConfig.tableName);
assertEquals(copied.adbUser, adbSyncConfig.adbUser);
assertEquals(copied.adbPass, adbSyncConfig.adbPass);
assertEquals(copied.basePath, adbSyncConfig.basePath);
assertEquals(copied.jdbcUrl, adbSyncConfig.jdbcUrl);
assertEquals(copied.skipROSuffix, adbSyncConfig.skipROSuffix);
assertEquals(copied.supportTimestamp, adbSyncConfig.supportTimestamp);
AdbSyncConfig config = new AdbSyncConfig(props);
assertEquals(Arrays.asList("a", "b"), config.getSplitStrings(META_SYNC_PARTITION_FIELDS));
assertEquals("/tmp", config.getString(META_SYNC_BASE_PATH));
assertEquals(true, config.getBoolean(META_SYNC_ASSUME_DATE_PARTITION));
assertEquals("test", config.getString(META_SYNC_DATABASE_NAME));
assertEquals("test", config.getString(META_SYNC_TABLE_NAME));
assertEquals("adb", config.getString(ADB_SYNC_USER));
assertEquals("adb", config.getString(ADB_SYNC_PASS));
assertEquals("jdbc:mysql://localhost:3306", config.getString(ADB_SYNC_JDBC_URL));
assertEquals(false, config.getBoolean(ADB_SYNC_SKIP_RO_SUFFIX));
Map<String, String> tablePropsMap = new HashMap<>();
tablePropsMap.put("spark.sql.sources.provider", "hudi");
tablePropsMap.put("spark.sql.sources.schema.numParts", "1");
tablePropsMap.put("spark.sql.sources.schema.part.0", "xx");
tablePropsMap.put("spark.sql.sources.schema.numPartCols", "1");
tablePropsMap.put("spark.sql.sources.schema.partCol.0", "dt");
assertEquals(tablePropsMap, ConfigUtils.toMap(config.getString(ADB_SYNC_TABLE_PROPERTIES)));
Map<String, String> serdePropsMap = new HashMap<>();
serdePropsMap.put("path", "/tmp/test_db/tbl");
assertEquals(serdePropsMap, ConfigUtils.toMap(config.getString(ADB_SYNC_SERDE_PROPERTIES)));
assertEquals("file://tmp/test_db", config.getString(ADB_SYNC_DB_LOCATION));
}
}