[HUDI-3985] Refactor DLASyncTool to support read hoodie table as spark datasource table (#5532)
This commit is contained in:
46
hudi-sync/hudi-adb-sync/src/assembly/src.xml
Normal file
46
hudi-sync/hudi-adb-sync/src/assembly/src.xml
Normal file
@@ -0,0 +1,46 @@
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one
|
||||
or more contributor license agreements. See the NOTICE file
|
||||
distributed with this work for additional information
|
||||
regarding copyright ownership. The ASF licenses this file
|
||||
to you under the Apache License, Version 2.0 (the
|
||||
"License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<assembly xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.3"
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.3 http://maven.apache.org/xsd/assembly-1.1.3.xsd">
  <!-- Assembly id; the only output format produced is a jar. -->
  <id>jar-with-dependencies</id>
  <formats>
    <format>jar</format>
  </formats>

  <!-- Do not nest the archive contents under a base directory. -->
  <includeBaseDirectory>false</includeBaseDirectory>
  <dependencySets>

    <!-- Runtime-scoped dependencies: unpacked into the archive root, minus the excluded artifacts. -->
    <dependencySet>
      <outputDirectory>/</outputDirectory>
      <unpack>true</unpack>
      <scope>runtime</scope>
      <excludes>
        <exclude>junit:junit</exclude>
        <exclude>com.google.code.findbugs:*</exclude>
        <exclude>org.apache.hbase:*</exclude>
      </excludes>
    </dependencySet>

    <!-- Provided-scoped dependencies: also unpacked into the archive. -->
    <dependencySet>
      <unpack>true</unpack>
      <scope>provided</scope>
    </dependencySet>
  </dependencySets>
</assembly>
|
||||
@@ -0,0 +1,128 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.sync.adb;
|
||||
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.util.StringUtils;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.hive.PartitionValueExtractor;
|
||||
import org.apache.hudi.hive.SchemaDifference;
|
||||
import org.apache.hudi.sync.common.AbstractSyncHoodieClient;
|
||||
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public abstract class AbstractAdbSyncHoodieClient extends AbstractSyncHoodieClient {
|
||||
protected AdbSyncConfig adbSyncConfig;
|
||||
protected PartitionValueExtractor partitionValueExtractor;
|
||||
protected HoodieTimeline activeTimeline;
|
||||
|
||||
public AbstractAdbSyncHoodieClient(AdbSyncConfig syncConfig, FileSystem fs) {
|
||||
super(syncConfig.basePath, syncConfig.assumeDatePartitioning,
|
||||
syncConfig.useFileListingFromMetadata, false, fs);
|
||||
this.adbSyncConfig = syncConfig;
|
||||
final String clazz = adbSyncConfig.partitionValueExtractorClass;
|
||||
try {
|
||||
this.partitionValueExtractor = (PartitionValueExtractor) Class.forName(clazz).newInstance();
|
||||
} catch (Exception e) {
|
||||
throw new HoodieException("Fail to init PartitionValueExtractor class " + clazz, e);
|
||||
}
|
||||
|
||||
activeTimeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
|
||||
}
|
||||
|
||||
public List<PartitionEvent> getPartitionEvents(Map<List<String>, String> tablePartitions,
|
||||
List<String> partitionStoragePartitions) {
|
||||
Map<String, String> paths = new HashMap<>();
|
||||
|
||||
for (Map.Entry<List<String>, String> entry : tablePartitions.entrySet()) {
|
||||
List<String> partitionValues = entry.getKey();
|
||||
String fullTablePartitionPath = entry.getValue();
|
||||
paths.put(String.join(", ", partitionValues), fullTablePartitionPath);
|
||||
}
|
||||
List<PartitionEvent> events = new ArrayList<>();
|
||||
for (String storagePartition : partitionStoragePartitions) {
|
||||
Path storagePartitionPath = FSUtils.getPartitionPath(adbSyncConfig.basePath, storagePartition);
|
||||
String fullStoragePartitionPath = Path.getPathWithoutSchemeAndAuthority(storagePartitionPath).toUri().getPath();
|
||||
// Check if the partition values or if hdfs path is the same
|
||||
List<String> storagePartitionValues = partitionValueExtractor.extractPartitionValuesInPath(storagePartition);
|
||||
if (adbSyncConfig.useHiveStylePartitioning) {
|
||||
String partition = String.join("/", storagePartitionValues);
|
||||
storagePartitionPath = FSUtils.getPartitionPath(adbSyncConfig.basePath, partition);
|
||||
fullStoragePartitionPath = Path.getPathWithoutSchemeAndAuthority(storagePartitionPath).toUri().getPath();
|
||||
}
|
||||
if (!storagePartitionValues.isEmpty()) {
|
||||
String storageValue = String.join(", ", storagePartitionValues);
|
||||
if (!paths.containsKey(storageValue)) {
|
||||
events.add(PartitionEvent.newPartitionAddEvent(storagePartition));
|
||||
} else if (!paths.get(storageValue).equals(fullStoragePartitionPath)) {
|
||||
events.add(PartitionEvent.newPartitionUpdateEvent(storagePartition));
|
||||
}
|
||||
}
|
||||
}
|
||||
return events;
|
||||
}
|
||||
|
||||
public void close() {
|
||||
|
||||
}
|
||||
|
||||
public abstract Map<List<String>, String> scanTablePartitions(String tableName) throws Exception;
|
||||
|
||||
public abstract void updateTableDefinition(String tableName, SchemaDifference schemaDiff) throws Exception;
|
||||
|
||||
public abstract boolean databaseExists(String databaseName) throws Exception;
|
||||
|
||||
public abstract void createDatabase(String databaseName) throws Exception;
|
||||
|
||||
public abstract void dropTable(String tableName);
|
||||
|
||||
protected String getDatabasePath() {
|
||||
String dbLocation = adbSyncConfig.dbLocation;
|
||||
Path dbLocationPath;
|
||||
if (StringUtils.isNullOrEmpty(dbLocation)) {
|
||||
if (new Path(adbSyncConfig.basePath).isRoot()) {
|
||||
dbLocationPath = new Path(adbSyncConfig.basePath);
|
||||
} else {
|
||||
dbLocationPath = new Path(adbSyncConfig.basePath).getParent();
|
||||
}
|
||||
} else {
|
||||
dbLocationPath = new Path(dbLocation);
|
||||
}
|
||||
return generateAbsolutePathStr(dbLocationPath);
|
||||
}
|
||||
|
||||
protected String generateAbsolutePathStr(Path path) {
|
||||
String absolutePathStr = path.toString();
|
||||
if (path.toUri().getScheme() == null) {
|
||||
absolutePathStr = getDefaultFs() + absolutePathStr;
|
||||
}
|
||||
return absolutePathStr.endsWith("/") ? absolutePathStr : absolutePathStr + "/";
|
||||
}
|
||||
|
||||
protected String getDefaultFs() {
|
||||
return fs.getConf().get("fs.defaultFS");
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,240 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.sync.adb;
|
||||
|
||||
import org.apache.hudi.common.config.ConfigProperty;
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.sync.common.HoodieSyncConfig;
|
||||
|
||||
import com.beust.jcommander.Parameter;
|
||||
|
||||
/**
|
||||
* Configs needed to sync data into Alibaba Cloud AnalyticDB(ADB).
|
||||
*/
|
||||
public class AdbSyncConfig extends HoodieSyncConfig {
|
||||
|
||||
@Parameter(names = {"--user"}, description = "Adb username", required = true)
|
||||
public String adbUser;
|
||||
|
||||
@Parameter(names = {"--pass"}, description = "Adb password", required = true)
|
||||
public String adbPass;
|
||||
|
||||
@Parameter(names = {"--jdbc-url"}, description = "Adb jdbc connect url", required = true)
|
||||
public String jdbcUrl;
|
||||
|
||||
@Parameter(names = {"--skip-ro-suffix"}, description = "Whether skip the `_ro` suffix for read optimized table when syncing")
|
||||
public Boolean skipROSuffix;
|
||||
|
||||
@Parameter(names = {"--skip-rt-sync"}, description = "Whether skip the rt table when syncing")
|
||||
public Boolean skipRTSync;
|
||||
|
||||
@Parameter(names = {"--hive-style-partitioning"}, description = "Whether use hive style partitioning, true if like the following style: field1=value1/field2=value2")
|
||||
public Boolean useHiveStylePartitioning;
|
||||
|
||||
@Parameter(names = {"--support-timestamp"}, description = "If true, converts int64(timestamp_micros) to timestamp type")
|
||||
public Boolean supportTimestamp;
|
||||
|
||||
@Parameter(names = {"--spark-datasource"}, description = "Whether sync this table as spark data source table")
|
||||
public Boolean syncAsSparkDataSourceTable;
|
||||
|
||||
@Parameter(names = {"--table-properties"}, description = "Table properties, to support read hoodie table as datasource table", required = true)
|
||||
public String tableProperties;
|
||||
|
||||
@Parameter(names = {"--serde-properties"}, description = "Serde properties, to support read hoodie table as datasource table", required = true)
|
||||
public String serdeProperties;
|
||||
|
||||
@Parameter(names = {"--spark-schema-length-threshold"}, description = "The maximum length allowed in a single cell when storing additional schema information in Hive's metastore")
|
||||
public int sparkSchemaLengthThreshold;
|
||||
|
||||
@Parameter(names = {"--db-location"}, description = "Database location")
|
||||
public String dbLocation;
|
||||
|
||||
@Parameter(names = {"--auto-create-database"}, description = "Whether auto create adb database")
|
||||
public Boolean autoCreateDatabase = true;
|
||||
|
||||
@Parameter(names = {"--skip-last-commit-time-sync"}, description = "Whether skip last commit time syncing")
|
||||
public Boolean skipLastCommitTimeSync = false;
|
||||
|
||||
@Parameter(names = {"--drop-table-before-creation"}, description = "Whether drop table before creation")
|
||||
public Boolean dropTableBeforeCreation = false;
|
||||
|
||||
@Parameter(names = {"--help", "-h"}, help = true)
|
||||
public Boolean help = false;
|
||||
|
||||
public static final ConfigProperty<String> ADB_SYNC_USER = ConfigProperty
|
||||
.key("hoodie.datasource.adb.sync.username")
|
||||
.noDefaultValue()
|
||||
.withDocumentation("ADB username");
|
||||
|
||||
public static final ConfigProperty<String> ADB_SYNC_PASS = ConfigProperty
|
||||
.key("hoodie.datasource.adb.sync.password")
|
||||
.noDefaultValue()
|
||||
.withDocumentation("ADB user password");
|
||||
|
||||
public static final ConfigProperty<String> ADB_SYNC_JDBC_URL = ConfigProperty
|
||||
.key("hoodie.datasource.adb.sync.jdbc_url")
|
||||
.noDefaultValue()
|
||||
.withDocumentation("Adb jdbc connect url");
|
||||
|
||||
public static final ConfigProperty<Boolean> ADB_SYNC_SKIP_RO_SUFFIX = ConfigProperty
|
||||
.key("hoodie.datasource.adb.sync.skip_ro_suffix")
|
||||
.defaultValue(true)
|
||||
.withDocumentation("Whether skip the `_ro` suffix for read optimized table when syncing");
|
||||
|
||||
public static final ConfigProperty<Boolean> ADB_SYNC_SKIP_RT_SYNC = ConfigProperty
|
||||
.key("hoodie.datasource.adb.sync.skip_rt_sync")
|
||||
.defaultValue(true)
|
||||
.withDocumentation("Whether skip the rt table when syncing");
|
||||
|
||||
public static final ConfigProperty<Boolean> ADB_SYNC_USE_HIVE_STYLE_PARTITIONING = ConfigProperty
|
||||
.key("hoodie.datasource.adb.sync.hive_style_partitioning")
|
||||
.defaultValue(false)
|
||||
.withDocumentation("Whether use hive style partitioning, true if like the following style: field1=value1/field2=value2");
|
||||
|
||||
public static final ConfigProperty<Boolean> ADB_SYNC_SUPPORT_TIMESTAMP = ConfigProperty
|
||||
.key("hoodie.datasource.adb.sync.support_timestamp")
|
||||
.defaultValue(false)
|
||||
.withDocumentation("If true, converts int64(timestamp_micros) to timestamp type");
|
||||
|
||||
public static final ConfigProperty<Boolean> ADB_SYNC_SYNC_AS_SPARK_DATA_SOURCE_TABLE = ConfigProperty
|
||||
.key("hoodie.datasource.adb.sync.sync_as_spark_datasource")
|
||||
.defaultValue(true)
|
||||
.withDocumentation("Whether sync this table as spark data source table");
|
||||
|
||||
public static final ConfigProperty<String> ADB_SYNC_TABLE_PROPERTIES = ConfigProperty
|
||||
.key("hoodie.datasource.adb.sync.table_properties")
|
||||
.noDefaultValue()
|
||||
.withDocumentation("Table properties, to support read hoodie table as datasource table");
|
||||
|
||||
public static final ConfigProperty<String> ADB_SYNC_SERDE_PROPERTIES = ConfigProperty
|
||||
.key("hoodie.datasource.adb.sync.serde_properties")
|
||||
.noDefaultValue()
|
||||
.withDocumentation("Serde properties, to support read hoodie table as datasource table");
|
||||
|
||||
public static final ConfigProperty<Integer> ADB_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD = ConfigProperty
|
||||
.key("hoodie.datasource.adb.sync.schema_string_length_threshold")
|
||||
.defaultValue(4000)
|
||||
.withDocumentation("The maximum length allowed in a single cell when storing additional schema information in Hive's metastore");
|
||||
|
||||
public static final ConfigProperty<String> ADB_SYNC_DB_LOCATION = ConfigProperty
|
||||
.key("hoodie.datasource.adb.sync.db_location")
|
||||
.noDefaultValue()
|
||||
.withDocumentation("Database location");
|
||||
|
||||
public static final ConfigProperty<Boolean> ADB_SYNC_AUTO_CREATE_DATABASE = ConfigProperty
|
||||
.key("hoodie.datasource.adb.sync.auto_create_database")
|
||||
.defaultValue(true)
|
||||
.withDocumentation("Whether auto create adb database");
|
||||
|
||||
public static final ConfigProperty<Boolean> ADB_SYNC_SKIP_LAST_COMMIT_TIME_SYNC = ConfigProperty
|
||||
.key("hoodie.datasource.adb.sync.skip_last_commit_time_sync")
|
||||
.defaultValue(false)
|
||||
.withDocumentation("Whether skip last commit time syncing");
|
||||
|
||||
public static final ConfigProperty<Boolean> ADB_SYNC_DROP_TABLE_BEFORE_CREATION = ConfigProperty
|
||||
.key("hoodie.datasource.adb.sync.drop_table_before_creation")
|
||||
.defaultValue(false)
|
||||
.withDocumentation("Whether drop table before creation");
|
||||
|
||||
public AdbSyncConfig() {
|
||||
this(new TypedProperties());
|
||||
}
|
||||
|
||||
public AdbSyncConfig(TypedProperties props) {
|
||||
super(props);
|
||||
|
||||
adbUser = getString(ADB_SYNC_USER);
|
||||
adbPass = getString(ADB_SYNC_PASS);
|
||||
jdbcUrl = getString(ADB_SYNC_JDBC_URL);
|
||||
skipROSuffix = getBooleanOrDefault(ADB_SYNC_SKIP_RO_SUFFIX);
|
||||
skipRTSync = getBooleanOrDefault(ADB_SYNC_SKIP_RT_SYNC);
|
||||
useHiveStylePartitioning = getBooleanOrDefault(ADB_SYNC_USE_HIVE_STYLE_PARTITIONING);
|
||||
supportTimestamp = getBooleanOrDefault(ADB_SYNC_SUPPORT_TIMESTAMP);
|
||||
syncAsSparkDataSourceTable = getBooleanOrDefault(ADB_SYNC_SYNC_AS_SPARK_DATA_SOURCE_TABLE);
|
||||
tableProperties = getString(ADB_SYNC_TABLE_PROPERTIES);
|
||||
serdeProperties = getString(ADB_SYNC_SERDE_PROPERTIES);
|
||||
sparkSchemaLengthThreshold = getIntOrDefault(ADB_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD);
|
||||
dbLocation = getString(ADB_SYNC_DB_LOCATION);
|
||||
autoCreateDatabase = getBooleanOrDefault(ADB_SYNC_AUTO_CREATE_DATABASE);
|
||||
skipLastCommitTimeSync = getBooleanOrDefault(ADB_SYNC_SKIP_LAST_COMMIT_TIME_SYNC);
|
||||
dropTableBeforeCreation = getBooleanOrDefault(ADB_SYNC_DROP_TABLE_BEFORE_CREATION);
|
||||
}
|
||||
|
||||
public static TypedProperties toProps(AdbSyncConfig cfg) {
|
||||
TypedProperties properties = new TypedProperties();
|
||||
properties.put(META_SYNC_DATABASE_NAME.key(), cfg.databaseName);
|
||||
properties.put(META_SYNC_TABLE_NAME.key(), cfg.tableName);
|
||||
properties.put(ADB_SYNC_USER.key(), cfg.adbUser);
|
||||
properties.put(ADB_SYNC_PASS.key(), cfg.adbPass);
|
||||
properties.put(ADB_SYNC_JDBC_URL.key(), cfg.jdbcUrl);
|
||||
properties.put(META_SYNC_BASE_PATH.key(), cfg.basePath);
|
||||
properties.put(META_SYNC_PARTITION_FIELDS.key(), String.join(",", cfg.partitionFields));
|
||||
properties.put(META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), cfg.partitionValueExtractorClass);
|
||||
properties.put(META_SYNC_ASSUME_DATE_PARTITION.key(), String.valueOf(cfg.assumeDatePartitioning));
|
||||
properties.put(ADB_SYNC_SKIP_RO_SUFFIX.key(), String.valueOf(cfg.skipROSuffix));
|
||||
properties.put(ADB_SYNC_SKIP_RT_SYNC.key(), String.valueOf(cfg.skipRTSync));
|
||||
properties.put(ADB_SYNC_USE_HIVE_STYLE_PARTITIONING.key(), String.valueOf(cfg.useHiveStylePartitioning));
|
||||
properties.put(META_SYNC_USE_FILE_LISTING_FROM_METADATA.key(), String.valueOf(cfg.useFileListingFromMetadata));
|
||||
properties.put(ADB_SYNC_SUPPORT_TIMESTAMP.key(), String.valueOf(cfg.supportTimestamp));
|
||||
properties.put(ADB_SYNC_TABLE_PROPERTIES.key(), cfg.tableProperties);
|
||||
properties.put(ADB_SYNC_SERDE_PROPERTIES.key(), cfg.serdeProperties);
|
||||
properties.put(ADB_SYNC_SYNC_AS_SPARK_DATA_SOURCE_TABLE.key(), String.valueOf(cfg.syncAsSparkDataSourceTable));
|
||||
properties.put(ADB_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD.key(), String.valueOf(cfg.sparkSchemaLengthThreshold));
|
||||
properties.put(META_SYNC_SPARK_VERSION.key(), cfg.sparkVersion);
|
||||
properties.put(ADB_SYNC_DB_LOCATION.key(), cfg.dbLocation);
|
||||
properties.put(ADB_SYNC_AUTO_CREATE_DATABASE.key(), String.valueOf(cfg.autoCreateDatabase));
|
||||
properties.put(ADB_SYNC_SKIP_LAST_COMMIT_TIME_SYNC.key(), String.valueOf(cfg.skipLastCommitTimeSync));
|
||||
properties.put(ADB_SYNC_DROP_TABLE_BEFORE_CREATION.key(), String.valueOf(cfg.dropTableBeforeCreation));
|
||||
|
||||
return properties;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "AdbSyncConfig{"
|
||||
+ "adbUser='" + adbUser + '\''
|
||||
+ ", adbPass='" + adbPass + '\''
|
||||
+ ", jdbcUrl='" + jdbcUrl + '\''
|
||||
+ ", skipROSuffix=" + skipROSuffix
|
||||
+ ", skipRTSync=" + skipRTSync
|
||||
+ ", useHiveStylePartitioning=" + useHiveStylePartitioning
|
||||
+ ", supportTimestamp=" + supportTimestamp
|
||||
+ ", syncAsSparkDataSourceTable=" + syncAsSparkDataSourceTable
|
||||
+ ", tableProperties='" + tableProperties + '\''
|
||||
+ ", serdeProperties='" + serdeProperties + '\''
|
||||
+ ", sparkSchemaLengthThreshold=" + sparkSchemaLengthThreshold
|
||||
+ ", dbLocation='" + dbLocation + '\''
|
||||
+ ", autoCreateDatabase=" + autoCreateDatabase
|
||||
+ ", skipLastCommitTimeSync=" + skipLastCommitTimeSync
|
||||
+ ", dropTableBeforeCreation=" + dropTableBeforeCreation
|
||||
+ ", help=" + help
|
||||
+ ", databaseName='" + databaseName + '\''
|
||||
+ ", tableName='" + tableName + '\''
|
||||
+ ", basePath='" + basePath + '\''
|
||||
+ ", baseFileFormat='" + baseFileFormat + '\''
|
||||
+ ", partitionFields=" + partitionFields
|
||||
+ ", partitionValueExtractorClass='" + partitionValueExtractorClass + '\''
|
||||
+ ", assumeDatePartitioning=" + assumeDatePartitioning
|
||||
+ ", decodePartition=" + decodePartition
|
||||
+ ", useFileListingFromMetadata=" + useFileListingFromMetadata
|
||||
+ ", isConditionalSync=" + isConditionalSync
|
||||
+ ", sparkVersion='" + sparkVersion + '\''
|
||||
+ '}';
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,283 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.sync.adb;
|
||||
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.HoodieFileFormat;
|
||||
import org.apache.hudi.common.model.HoodieTableType;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils;
|
||||
import org.apache.hudi.hive.SchemaDifference;
|
||||
import org.apache.hudi.hive.util.HiveSchemaUtil;
|
||||
import org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent;
|
||||
import org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent.PartitionEventType;
|
||||
import org.apache.hudi.sync.common.AbstractSyncTool;
|
||||
import org.apache.hudi.sync.common.util.ConfigUtils;
|
||||
|
||||
import com.beust.jcommander.JCommander;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat;
|
||||
import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe;
|
||||
import org.apache.parquet.schema.MessageType;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* Adb sync tool is mainly used to sync hoodie tables to Alibaba Cloud AnalyticDB(ADB),
|
||||
* it can be used as API `AdbSyncTool.syncHoodieTable(AdbSyncConfig)` or as command
|
||||
* line `java -cp hoodie-hive.jar AdbSyncTool [args]`
|
||||
*
|
||||
* <p>
|
||||
* This utility will get the schema from the latest commit and will sync ADB table schema,
|
||||
* incremental partitions will be synced as well.
|
||||
*/
|
||||
@SuppressWarnings("WeakerAccess")
|
||||
public class AdbSyncTool extends AbstractSyncTool {
|
||||
// Logger for this tool.
private static final Logger LOG = LoggerFactory.getLogger(AdbSyncTool.class);
|
||||
|
||||
// Suffix the constructor appends to the configured table name to form the snapshot table name
// of a MERGE_ON_READ table.
public static final String SUFFIX_SNAPSHOT_TABLE = "_rt";
|
||||
// Suffix the constructor appends to the configured table name to form the read optimized table
// name of a MERGE_ON_READ table, unless skipROSuffix is set.
public static final String SUFFIX_READ_OPTIMIZED_TABLE = "_ro";
|
||||
|
||||
// Sync configuration, built from the properties passed to the constructor.
private final AdbSyncConfig adbSyncConfig;
|
||||
// Client used for all ADB operations; closed at the end of syncHoodieTable().
private final AbstractAdbSyncHoodieClient hoodieAdbClient;
|
||||
// Name of the snapshot table; chosen in the constructor according to the table type.
private final String snapshotTableName;
|
||||
// Name of the read optimized table; the constructor leaves it empty for COPY_ON_WRITE tables.
private final Option<String> roTableTableName;
|
||||
|
||||
public AdbSyncTool(TypedProperties props, Configuration conf, FileSystem fs) {
|
||||
super(props, conf, fs);
|
||||
this.adbSyncConfig = new AdbSyncConfig(props);
|
||||
this.hoodieAdbClient = getHoodieAdbClient(adbSyncConfig, fs);
|
||||
switch (hoodieAdbClient.getTableType()) {
|
||||
case COPY_ON_WRITE:
|
||||
this.snapshotTableName = adbSyncConfig.tableName;
|
||||
this.roTableTableName = Option.empty();
|
||||
break;
|
||||
case MERGE_ON_READ:
|
||||
this.snapshotTableName = adbSyncConfig.tableName + SUFFIX_SNAPSHOT_TABLE;
|
||||
this.roTableTableName = adbSyncConfig.skipROSuffix ? Option.of(adbSyncConfig.tableName)
|
||||
: Option.of(adbSyncConfig.tableName + SUFFIX_READ_OPTIMIZED_TABLE);
|
||||
break;
|
||||
default:
|
||||
throw new HoodieAdbSyncException("Unknown table type:" + hoodieAdbClient.getTableType()
|
||||
+ ", basePath:" + hoodieAdbClient.getBasePath());
|
||||
}
|
||||
}
|
||||
|
||||
private AbstractAdbSyncHoodieClient getHoodieAdbClient(AdbSyncConfig adbSyncConfig, FileSystem fs) {
|
||||
return new HoodieAdbJdbcClient(adbSyncConfig, fs);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void syncHoodieTable() {
|
||||
try {
|
||||
switch (hoodieAdbClient.getTableType()) {
|
||||
case COPY_ON_WRITE:
|
||||
syncHoodieTable(snapshotTableName, false, false);
|
||||
break;
|
||||
case MERGE_ON_READ:
|
||||
// Sync a ro table for MOR table
|
||||
syncHoodieTable(roTableTableName.get(), false, true);
|
||||
// Sync a rt table for MOR table
|
||||
if (!adbSyncConfig.skipRTSync) {
|
||||
syncHoodieTable(snapshotTableName, true, false);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
throw new HoodieAdbSyncException("Unknown table type:" + hoodieAdbClient.getTableType()
|
||||
+ ", basePath:" + hoodieAdbClient.getBasePath());
|
||||
}
|
||||
} catch (Exception re) {
|
||||
throw new HoodieAdbSyncException("Sync hoodie table to ADB failed, tableName:" + adbSyncConfig.tableName, re);
|
||||
} finally {
|
||||
hoodieAdbClient.close();
|
||||
}
|
||||
}
|
||||
|
||||
private void syncHoodieTable(String tableName, boolean useRealtimeInputFormat,
|
||||
boolean readAsOptimized) throws Exception {
|
||||
LOG.info("Try to sync hoodie table, tableName:{}, path:{}, tableType:{}",
|
||||
tableName, hoodieAdbClient.getBasePath(), hoodieAdbClient.getTableType());
|
||||
|
||||
if (adbSyncConfig.autoCreateDatabase) {
|
||||
try {
|
||||
synchronized (AdbSyncTool.class) {
|
||||
if (!hoodieAdbClient.databaseExists(adbSyncConfig.databaseName)) {
|
||||
hoodieAdbClient.createDatabase(adbSyncConfig.databaseName);
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw new HoodieAdbSyncException("Failed to create database:" + adbSyncConfig.databaseName
|
||||
+ ", useRealtimeInputFormat = " + useRealtimeInputFormat, e);
|
||||
}
|
||||
} else if (!hoodieAdbClient.databaseExists(adbSyncConfig.databaseName)) {
|
||||
throw new HoodieAdbSyncException("ADB database does not exists:" + adbSyncConfig.databaseName);
|
||||
}
|
||||
|
||||
// Currently HoodieBootstrapRelation does support reading bootstrap MOR rt table,
|
||||
// so we disable the syncAsSparkDataSourceTable here to avoid read such kind table
|
||||
// by the data source way (which will use the HoodieBootstrapRelation).
|
||||
// TODO after we support bootstrap MOR rt table in HoodieBootstrapRelation[HUDI-2071],
|
||||
// we can remove this logical.
|
||||
if (hoodieAdbClient.isBootstrap()
|
||||
&& hoodieAdbClient.getTableType() == HoodieTableType.MERGE_ON_READ
|
||||
&& !readAsOptimized) {
|
||||
adbSyncConfig.syncAsSparkDataSourceTable = false;
|
||||
LOG.info("Disable sync as spark datasource table for mor rt table:{}", tableName);
|
||||
}
|
||||
|
||||
if (adbSyncConfig.dropTableBeforeCreation) {
|
||||
LOG.info("Drop table before creation, tableName:{}", tableName);
|
||||
hoodieAdbClient.dropTable(tableName);
|
||||
}
|
||||
|
||||
boolean tableExists = hoodieAdbClient.tableExists(tableName);
|
||||
|
||||
// Get the parquet schema for this table looking at the latest commit
|
||||
MessageType schema = hoodieAdbClient.getDataSchema();
|
||||
|
||||
// Sync schema if needed
|
||||
syncSchema(tableName, tableExists, useRealtimeInputFormat, readAsOptimized, schema);
|
||||
LOG.info("Sync schema complete, start syncing partitions for table:{}", tableName);
|
||||
|
||||
// Get the last time we successfully synced partitions
|
||||
Option<String> lastCommitTimeSynced = Option.empty();
|
||||
if (tableExists) {
|
||||
lastCommitTimeSynced = hoodieAdbClient.getLastCommitTimeSynced(tableName);
|
||||
}
|
||||
LOG.info("Last commit time synced was found:{}", lastCommitTimeSynced.orElse("null"));
|
||||
|
||||
// Scan synced partitions
|
||||
List<String> writtenPartitionsSince;
|
||||
if (adbSyncConfig.partitionFields.isEmpty()) {
|
||||
writtenPartitionsSince = new ArrayList<>();
|
||||
} else {
|
||||
writtenPartitionsSince = hoodieAdbClient.getPartitionsWrittenToSince(lastCommitTimeSynced);
|
||||
}
|
||||
LOG.info("Scan partitions complete, partitionNum:{}", writtenPartitionsSince.size());
|
||||
|
||||
// Sync the partitions if needed
|
||||
syncPartitions(tableName, writtenPartitionsSince);
|
||||
|
||||
// Update sync commit time
|
||||
// whether to skip syncing commit time stored in tbl properties, since it is time consuming.
|
||||
if (!adbSyncConfig.skipLastCommitTimeSync) {
|
||||
hoodieAdbClient.updateLastCommitTimeSynced(tableName);
|
||||
}
|
||||
LOG.info("Sync complete for table:{}", tableName);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the latest schema from the last commit and check if its in sync with the ADB
|
||||
* table schema. If not, evolves the table schema.
|
||||
*
|
||||
* @param tableName The table to be synced
|
||||
* @param tableExists Whether target table exists
|
||||
* @param useRealTimeInputFormat Whether using realtime input format
|
||||
* @param readAsOptimized Whether read as optimized table
|
||||
* @param schema The extracted schema
|
||||
*/
|
||||
private void syncSchema(String tableName, boolean tableExists, boolean useRealTimeInputFormat,
|
||||
boolean readAsOptimized, MessageType schema) throws Exception {
|
||||
// Append spark table properties & serde properties
|
||||
Map<String, String> tableProperties = ConfigUtils.toMap(adbSyncConfig.tableProperties);
|
||||
Map<String, String> serdeProperties = ConfigUtils.toMap(adbSyncConfig.serdeProperties);
|
||||
if (adbSyncConfig.syncAsSparkDataSourceTable) {
|
||||
Map<String, String> sparkTableProperties = getSparkTableProperties(adbSyncConfig.partitionFields,
|
||||
adbSyncConfig.sparkVersion, adbSyncConfig.sparkSchemaLengthThreshold, schema);
|
||||
Map<String, String> sparkSerdeProperties = getSparkSerdeProperties(readAsOptimized, adbSyncConfig.basePath);
|
||||
tableProperties.putAll(sparkTableProperties);
|
||||
serdeProperties.putAll(sparkSerdeProperties);
|
||||
LOG.info("Sync as spark datasource table, tableName:{}, tableExists:{}, tableProperties:{}, sederProperties:{}",
|
||||
tableName, tableExists, tableProperties, serdeProperties);
|
||||
}
|
||||
|
||||
// Check and sync schema
|
||||
if (!tableExists) {
|
||||
LOG.info("ADB table [{}] is not found, creating it", tableName);
|
||||
String inputFormatClassName = HoodieInputFormatUtils.getInputFormatClassName(HoodieFileFormat.PARQUET, useRealTimeInputFormat);
|
||||
|
||||
// Custom serde will not work with ALTER TABLE REPLACE COLUMNS
|
||||
// https://github.com/apache/hive/blob/release-1.1.0/ql/src/java/org/apache/hadoop/hive
|
||||
// /ql/exec/DDLTask.java#L3488
|
||||
hoodieAdbClient.createTable(tableName, schema, inputFormatClassName, MapredParquetOutputFormat.class.getName(),
|
||||
ParquetHiveSerDe.class.getName(), serdeProperties, tableProperties);
|
||||
} else {
|
||||
// Check if the table schema has evolved
|
||||
Map<String, String> tableSchema = hoodieAdbClient.getTableSchema(tableName);
|
||||
SchemaDifference schemaDiff = HiveSchemaUtil.getSchemaDifference(schema, tableSchema, adbSyncConfig.partitionFields,
|
||||
adbSyncConfig.supportTimestamp);
|
||||
if (!schemaDiff.isEmpty()) {
|
||||
LOG.info("Schema difference found for table:{}", tableName);
|
||||
hoodieAdbClient.updateTableDefinition(tableName, schemaDiff);
|
||||
} else {
|
||||
LOG.info("No Schema difference for table:{}", tableName);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Syncs the list of storage partitions passed in (checks if the partition is in adb, if not adds it or if the
|
||||
* partition path does not match, it updates the partition path).
|
||||
*/
|
||||
private void syncPartitions(String tableName, List<String> writtenPartitionsSince) {
|
||||
try {
|
||||
if (adbSyncConfig.partitionFields.isEmpty()) {
|
||||
LOG.info("Not a partitioned table.");
|
||||
return;
|
||||
}
|
||||
|
||||
Map<List<String>, String> partitions = hoodieAdbClient.scanTablePartitions(tableName);
|
||||
List<PartitionEvent> partitionEvents = hoodieAdbClient.getPartitionEvents(partitions, writtenPartitionsSince);
|
||||
List<String> newPartitions = filterPartitions(partitionEvents, PartitionEventType.ADD);
|
||||
LOG.info("New Partitions:{}", newPartitions);
|
||||
hoodieAdbClient.addPartitionsToTable(tableName, newPartitions);
|
||||
List<String> updatePartitions = filterPartitions(partitionEvents, PartitionEventType.UPDATE);
|
||||
LOG.info("Changed Partitions:{}", updatePartitions);
|
||||
hoodieAdbClient.updatePartitionsToTable(tableName, updatePartitions);
|
||||
} catch (Exception e) {
|
||||
throw new HoodieAdbSyncException("Failed to sync partitions for table:" + tableName, e);
|
||||
}
|
||||
}
|
||||
|
||||
private List<String> filterPartitions(List<PartitionEvent> events, PartitionEventType eventType) {
|
||||
return events.stream().filter(s -> s.eventType == eventType)
|
||||
.map(s -> s.storagePartition).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
// parse the params
|
||||
final AdbSyncConfig cfg = new AdbSyncConfig();
|
||||
JCommander cmd = new JCommander(cfg, null, args);
|
||||
if (cfg.help || args.length == 0) {
|
||||
cmd.usage();
|
||||
System.exit(1);
|
||||
}
|
||||
|
||||
Configuration hadoopConf = new Configuration();
|
||||
FileSystem fs = FSUtils.getFs(cfg.basePath, hadoopConf);
|
||||
new AdbSyncTool(AdbSyncConfig.toProps(cfg), hadoopConf, fs).syncHoodieTable();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,440 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.sync.adb;
|
||||
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.StringUtils;
|
||||
import org.apache.hudi.common.util.ValidationUtils;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.hive.HiveSyncConfig;
|
||||
import org.apache.hudi.hive.HoodieHiveSyncException;
|
||||
import org.apache.hudi.hive.SchemaDifference;
|
||||
import org.apache.hudi.hive.util.HiveSchemaUtil;
|
||||
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.parquet.schema.MessageType;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.sql.Connection;
|
||||
import java.sql.DatabaseMetaData;
|
||||
import java.sql.DriverManager;
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.sql.Statement;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.function.Function;
|
||||
|
||||
public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
|
||||
/** Logger used by this client. */
private static final Logger LOG = LoggerFactory.getLogger(HoodieAdbJdbcClient.class);

/** Table property key under which the last synced commit time is stored and looked up. */
public static final String HOODIE_LAST_COMMIT_TIME_SYNC = "hoodie_last_sync";
/** JDBC driver class; it has to be available on the classpath. */
private static final String DRIVER_NAME = "com.mysql.jdbc.Driver";
/** Empty escape character; not referenced inside this class. */
public static final String ADB_ESCAPE_CHARACTER = "";
/** Marker searched for in the "show create table" output to locate the table properties. */
private static final String TBL_PROPERTIES_STR = "TBLPROPERTIES";

// Load the JDBC driver when the class is initialized; a missing driver aborts class initialization.
static {
  try {
    Class.forName(DRIVER_NAME);
  } catch (ClassNotFoundException driverMissing) {
    throw new IllegalStateException("Could not find " + DRIVER_NAME + " in classpath. ", driverMissing);
  }
}

/** JDBC connection to ADB, opened by the constructor and released by {@code close()}. */
private Connection connection;
|
||||
|
||||
/**
 * Creates the client, opens the JDBC connection to ADB and logs the JDBC url that was used.
 *
 * @param syncConfig ADB sync configuration, forwarded to the parent client
 * @param fs File system, forwarded to the parent client
 */
public HoodieAdbJdbcClient(AdbSyncConfig syncConfig, FileSystem fs) {
  super(syncConfig, fs);

  // Open the connection eagerly so that the other methods can rely on it.
  createAdbConnection();

  LOG.info("Init adb jdbc client success, jdbcUrl:{}", syncConfig.jdbcUrl);
}
|
||||
|
||||
private void createAdbConnection() {
|
||||
if (connection == null) {
|
||||
try {
|
||||
Class.forName(DRIVER_NAME);
|
||||
} catch (ClassNotFoundException e) {
|
||||
LOG.error("Unable to load jdbc driver class", e);
|
||||
return;
|
||||
}
|
||||
try {
|
||||
this.connection = DriverManager.getConnection(
|
||||
adbSyncConfig.jdbcUrl, adbSyncConfig.adbUser, adbSyncConfig.adbPass);
|
||||
} catch (SQLException e) {
|
||||
throw new HoodieException("Cannot create adb connection ", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void createTable(String tableName, MessageType storageSchema, String inputFormatClass,
|
||||
String outputFormatClass, String serdeClass,
|
||||
Map<String, String> serdeProperties, Map<String, String> tableProperties) {
|
||||
try {
|
||||
LOG.info("Creating table:{}", tableName);
|
||||
String createSQLQuery = HiveSchemaUtil.generateCreateDDL(tableName, storageSchema,
|
||||
getHiveSyncConfig(), inputFormatClass, outputFormatClass, serdeClass, serdeProperties, tableProperties);
|
||||
executeAdbSql(createSQLQuery);
|
||||
} catch (IOException e) {
|
||||
throw new HoodieException("Fail to create table:" + tableName, e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void dropTable(String tableName) {
|
||||
LOG.info("Dropping table:{}", tableName);
|
||||
String dropTable = "drop table if exists `" + adbSyncConfig.databaseName + "`.`" + tableName + "`";
|
||||
executeAdbSql(dropTable);
|
||||
}
|
||||
|
||||
public Map<String, String> getTableSchema(String tableName) {
|
||||
Map<String, String> schema = new HashMap<>();
|
||||
ResultSet result = null;
|
||||
try {
|
||||
DatabaseMetaData databaseMetaData = connection.getMetaData();
|
||||
result = databaseMetaData.getColumns(adbSyncConfig.databaseName,
|
||||
adbSyncConfig.databaseName, tableName, null);
|
||||
while (result.next()) {
|
||||
String columnName = result.getString(4);
|
||||
String columnType = result.getString(6);
|
||||
if ("DECIMAL".equals(columnType)) {
|
||||
int columnSize = result.getInt("COLUMN_SIZE");
|
||||
int decimalDigits = result.getInt("DECIMAL_DIGITS");
|
||||
columnType += String.format("(%s,%s)", columnSize, decimalDigits);
|
||||
}
|
||||
schema.put(columnName, columnType);
|
||||
}
|
||||
return schema;
|
||||
} catch (SQLException e) {
|
||||
throw new HoodieException("Fail to get table schema:" + tableName, e);
|
||||
} finally {
|
||||
closeQuietly(result, null);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void addPartitionsToTable(String tableName, List<String> partitionsToAdd) {
|
||||
if (partitionsToAdd.isEmpty()) {
|
||||
LOG.info("No partitions to add for table:{}", tableName);
|
||||
return;
|
||||
}
|
||||
|
||||
LOG.info("Adding partitions to table:{}, partitionNum:{}", tableName, partitionsToAdd.size());
|
||||
String sql = constructAddPartitionsSql(tableName, partitionsToAdd);
|
||||
executeAdbSql(sql);
|
||||
}
|
||||
|
||||
private void executeAdbSql(String sql) {
|
||||
Statement stmt = null;
|
||||
try {
|
||||
stmt = connection.createStatement();
|
||||
LOG.info("Executing sql:{}", sql);
|
||||
stmt.execute(sql);
|
||||
} catch (SQLException e) {
|
||||
throw new HoodieException("Fail to execute sql:" + sql, e);
|
||||
} finally {
|
||||
closeQuietly(null, stmt);
|
||||
}
|
||||
}
|
||||
|
||||
private <T> T executeQuerySQL(String sql, Function<ResultSet, T> function) {
|
||||
Statement stmt = null;
|
||||
try {
|
||||
stmt = connection.createStatement();
|
||||
LOG.info("Executing sql:{}", sql);
|
||||
return function.apply(stmt.executeQuery(sql));
|
||||
} catch (SQLException e) {
|
||||
throw new HoodieException("Fail to execute sql:" + sql, e);
|
||||
} finally {
|
||||
closeQuietly(null, stmt);
|
||||
}
|
||||
}
|
||||
|
||||
public void createDatabase(String databaseName) {
|
||||
String rootPath = getDatabasePath();
|
||||
LOG.info("Creating database:{}, databaseLocation:{}", databaseName, rootPath);
|
||||
String sql = constructCreateDatabaseSql(rootPath);
|
||||
executeAdbSql(sql);
|
||||
}
|
||||
|
||||
public boolean databaseExists(String databaseName) {
|
||||
String sql = constructShowCreateDatabaseSql(databaseName);
|
||||
Function<ResultSet, Boolean> transform = resultSet -> {
|
||||
try {
|
||||
return resultSet.next();
|
||||
} catch (Exception e) {
|
||||
if (e.getMessage().contains("Unknown database `" + databaseName + "`")) {
|
||||
return false;
|
||||
} else {
|
||||
throw new HoodieException("Fail to execute sql:" + sql, e);
|
||||
}
|
||||
}
|
||||
};
|
||||
return executeQuerySQL(sql, transform);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean doesTableExist(String tableName) {
|
||||
String sql = constructShowLikeTableSql(tableName);
|
||||
Function<ResultSet, Boolean> transform = resultSet -> {
|
||||
try {
|
||||
return resultSet.next();
|
||||
} catch (Exception e) {
|
||||
throw new HoodieException("Fail to execute sql:" + sql, e);
|
||||
}
|
||||
};
|
||||
return executeQuerySQL(sql, transform);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean tableExists(String tableName) {
|
||||
return doesTableExist(tableName);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Option<String> getLastCommitTimeSynced(String tableName) {
|
||||
String sql = constructShowCreateTableSql(tableName);
|
||||
|
||||
Function<ResultSet, Option<String>> transform = resultSet -> {
|
||||
try {
|
||||
if (resultSet.next()) {
|
||||
String table = resultSet.getString(2);
|
||||
Map<String, String> attr = new HashMap<>();
|
||||
int index = table.indexOf(TBL_PROPERTIES_STR);
|
||||
if (index != -1) {
|
||||
String sub = table.substring(index + TBL_PROPERTIES_STR.length());
|
||||
sub = sub
|
||||
.replaceAll("\\(", "")
|
||||
.replaceAll("\\)", "")
|
||||
.replaceAll("'", "");
|
||||
String[] str = sub.split(",");
|
||||
|
||||
for (String s : str) {
|
||||
String key = s.split("=")[0].trim();
|
||||
String value = s.split("=")[1].trim();
|
||||
attr.put(key, value);
|
||||
}
|
||||
}
|
||||
return Option.ofNullable(attr.getOrDefault(HOODIE_LAST_COMMIT_TIME_SYNC, null));
|
||||
}
|
||||
return Option.empty();
|
||||
} catch (Exception e) {
|
||||
throw new HoodieException("Fail to execute sql:" + sql, e);
|
||||
}
|
||||
};
|
||||
return executeQuerySQL(sql, transform);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void updateLastCommitTimeSynced(String tableName) {
|
||||
// Set the last commit time from the TBLProperties
|
||||
String lastCommitSynced = activeTimeline.lastInstant().get().getTimestamp();
|
||||
try {
|
||||
String sql = constructUpdateTblPropertiesSql(tableName, lastCommitSynced);
|
||||
executeAdbSql(sql);
|
||||
} catch (Exception e) {
|
||||
throw new HoodieHiveSyncException("Fail to get update last commit time synced:" + lastCommitSynced, e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Option<String> getLastReplicatedTime(String tableName) {
|
||||
throw new UnsupportedOperationException("Not support getLastReplicatedTime yet");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void updateLastReplicatedTimeStamp(String tableName, String timeStamp) {
|
||||
throw new UnsupportedOperationException("Not support updateLastReplicatedTimeStamp yet");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void deleteLastReplicatedTimeStamp(String tableName) {
|
||||
throw new UnsupportedOperationException("Not support deleteLastReplicatedTimeStamp yet");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void updatePartitionsToTable(String tableName, List<String> changedPartitions) {
|
||||
if (changedPartitions.isEmpty()) {
|
||||
LOG.info("No partitions to change for table:{}", tableName);
|
||||
return;
|
||||
}
|
||||
|
||||
LOG.info("Changing partitions on table:{}, changedPartitionNum:{}", tableName, changedPartitions.size());
|
||||
List<String> sqlList = constructChangePartitionsSql(tableName, changedPartitions);
|
||||
for (String sql : sqlList) {
|
||||
executeAdbSql(sql);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void dropPartitions(String tableName, List<String> partitionsToDrop) {
|
||||
throw new UnsupportedOperationException("Not support dropPartitions yet.");
|
||||
}
|
||||
|
||||
public Map<List<String>, String> scanTablePartitions(String tableName) {
|
||||
String sql = constructShowPartitionSql(tableName);
|
||||
Function<ResultSet, Map<List<String>, String>> transform = resultSet -> {
|
||||
Map<List<String>, String> partitions = new HashMap<>();
|
||||
try {
|
||||
while (resultSet.next()) {
|
||||
if (resultSet.getMetaData().getColumnCount() > 0) {
|
||||
String str = resultSet.getString(1);
|
||||
if (!StringUtils.isNullOrEmpty(str)) {
|
||||
List<String> values = partitionValueExtractor.extractPartitionValuesInPath(str);
|
||||
Path storagePartitionPath = FSUtils.getPartitionPath(adbSyncConfig.basePath, String.join("/", values));
|
||||
String fullStoragePartitionPath = Path.getPathWithoutSchemeAndAuthority(storagePartitionPath).toUri().getPath();
|
||||
partitions.put(values, fullStoragePartitionPath);
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw new HoodieException("Fail to execute sql:" + sql, e);
|
||||
}
|
||||
return partitions;
|
||||
};
|
||||
return executeQuerySQL(sql, transform);
|
||||
}
|
||||
|
||||
public void updateTableDefinition(String tableName, SchemaDifference schemaDiff) {
|
||||
LOG.info("Adding columns for table:{}", tableName);
|
||||
schemaDiff.getAddColumnTypes().forEach((columnName, columnType) ->
|
||||
executeAdbSql(constructAddColumnSql(tableName, columnName, columnType))
|
||||
);
|
||||
|
||||
LOG.info("Updating columns' definition for table:{}", tableName);
|
||||
schemaDiff.getUpdateColumnTypes().forEach((columnName, columnType) ->
|
||||
executeAdbSql(constructChangeColumnSql(tableName, columnName, columnType))
|
||||
);
|
||||
}
|
||||
|
||||
private String constructAddPartitionsSql(String tableName, List<String> partitions) {
|
||||
StringBuilder sqlBuilder = new StringBuilder("alter table `");
|
||||
sqlBuilder.append(adbSyncConfig.databaseName).append("`").append(".`")
|
||||
.append(tableName).append("`").append(" add if not exists ");
|
||||
for (String partition : partitions) {
|
||||
String partitionClause = getPartitionClause(partition);
|
||||
Path partitionPath = FSUtils.getPartitionPath(adbSyncConfig.basePath, partition);
|
||||
String fullPartitionPathStr = generateAbsolutePathStr(partitionPath);
|
||||
sqlBuilder.append(" partition (").append(partitionClause).append(") location '")
|
||||
.append(fullPartitionPathStr).append("' ");
|
||||
}
|
||||
|
||||
return sqlBuilder.toString();
|
||||
}
|
||||
|
||||
private List<String> constructChangePartitionsSql(String tableName, List<String> partitions) {
|
||||
List<String> changePartitions = new ArrayList<>();
|
||||
String useDatabase = "use `" + adbSyncConfig.databaseName + "`";
|
||||
changePartitions.add(useDatabase);
|
||||
|
||||
String alterTable = "alter table `" + tableName + "`";
|
||||
for (String partition : partitions) {
|
||||
String partitionClause = getPartitionClause(partition);
|
||||
Path partitionPath = FSUtils.getPartitionPath(adbSyncConfig.basePath, partition);
|
||||
String fullPartitionPathStr = generateAbsolutePathStr(partitionPath);
|
||||
String changePartition = alterTable + " add if not exists partition (" + partitionClause
|
||||
+ ") location '" + fullPartitionPathStr + "'";
|
||||
changePartitions.add(changePartition);
|
||||
}
|
||||
|
||||
return changePartitions;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate Hive Partition from partition values.
|
||||
*
|
||||
* @param partition Partition path
|
||||
* @return partition clause
|
||||
*/
|
||||
private String getPartitionClause(String partition) {
|
||||
List<String> partitionValues = partitionValueExtractor.extractPartitionValuesInPath(partition);
|
||||
ValidationUtils.checkArgument(adbSyncConfig.partitionFields.size() == partitionValues.size(),
|
||||
"Partition key parts " + adbSyncConfig.partitionFields
|
||||
+ " does not match with partition values " + partitionValues + ". Check partition strategy. ");
|
||||
List<String> partBuilder = new ArrayList<>();
|
||||
for (int i = 0; i < adbSyncConfig.partitionFields.size(); i++) {
|
||||
partBuilder.add(adbSyncConfig.partitionFields.get(i) + "='" + partitionValues.get(i) + "'");
|
||||
}
|
||||
|
||||
return String.join(",", partBuilder);
|
||||
}
|
||||
|
||||
private String constructShowPartitionSql(String tableName) {
|
||||
return String.format("show partitions `%s`.`%s`", adbSyncConfig.databaseName, tableName);
|
||||
}
|
||||
|
||||
private String constructShowCreateTableSql(String tableName) {
|
||||
return String.format("show create table `%s`.`%s`", adbSyncConfig.databaseName, tableName);
|
||||
}
|
||||
|
||||
private String constructShowLikeTableSql(String tableName) {
|
||||
return String.format("show tables from `%s` like '%s'", adbSyncConfig.databaseName, tableName);
|
||||
}
|
||||
|
||||
private String constructCreateDatabaseSql(String rootPath) {
|
||||
return String.format("create database if not exists `%s` with dbproperties(catalog = 'oss', location = '%s')",
|
||||
adbSyncConfig.databaseName, rootPath);
|
||||
}
|
||||
|
||||
private String constructShowCreateDatabaseSql(String databaseName) {
|
||||
return String.format("show create database `%s`", databaseName);
|
||||
}
|
||||
|
||||
private String constructUpdateTblPropertiesSql(String tableName, String lastCommitSynced) {
|
||||
return String.format("alter table `%s`.`%s` set tblproperties('%s' = '%s')",
|
||||
adbSyncConfig.databaseName, tableName, HOODIE_LAST_COMMIT_TIME_SYNC, lastCommitSynced);
|
||||
}
|
||||
|
||||
private String constructAddColumnSql(String tableName, String columnName, String columnType) {
|
||||
return String.format("alter table `%s`.`%s` add columns(`%s` %s)",
|
||||
adbSyncConfig.databaseName, tableName, columnName, columnType);
|
||||
}
|
||||
|
||||
private String constructChangeColumnSql(String tableName, String columnName, String columnType) {
|
||||
return String.format("alter table `%s`.`%s` change `%s` `%s` %s",
|
||||
adbSyncConfig.databaseName, tableName, columnName, columnName, columnType);
|
||||
}
|
||||
|
||||
private HiveSyncConfig getHiveSyncConfig() {
|
||||
HiveSyncConfig hiveSyncConfig = new HiveSyncConfig();
|
||||
hiveSyncConfig.partitionFields = adbSyncConfig.partitionFields;
|
||||
hiveSyncConfig.databaseName = adbSyncConfig.databaseName;
|
||||
Path basePath = new Path(adbSyncConfig.basePath);
|
||||
hiveSyncConfig.basePath = generateAbsolutePathStr(basePath);
|
||||
return hiveSyncConfig;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
try {
|
||||
if (connection != null) {
|
||||
connection.close();
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
LOG.error("Fail to close connection", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,29 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.sync.adb;
|
||||
|
||||
/**
 * Unchecked exception for failures during ADB sync.
 */
public class HoodieAdbSyncException extends RuntimeException {

  /**
   * Creates an exception with a detail message and no cause.
   *
   * @param message Detail message
   */
  public HoodieAdbSyncException(String message) {
    super(message);
  }

  /**
   * Creates an exception with a detail message and the underlying cause.
   *
   * @param message Detail message
   * @param t Cause of the failure
   */
  public HoodieAdbSyncException(String message, Throwable t) {
    super(message, t);
  }
}
|
||||
@@ -0,0 +1,65 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.sync.adb;
|
||||
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
public class TestAdbSyncConfig {
|
||||
@Test
|
||||
public void testCopy() {
|
||||
AdbSyncConfig adbSyncConfig = new AdbSyncConfig();
|
||||
adbSyncConfig.partitionFields = Arrays.asList("a", "b");
|
||||
adbSyncConfig.basePath = "/tmp";
|
||||
adbSyncConfig.assumeDatePartitioning = true;
|
||||
adbSyncConfig.databaseName = "test";
|
||||
adbSyncConfig.tableName = "test";
|
||||
adbSyncConfig.adbUser = "adb";
|
||||
adbSyncConfig.adbPass = "adb";
|
||||
adbSyncConfig.jdbcUrl = "jdbc:mysql://localhost:3306";
|
||||
adbSyncConfig.skipROSuffix = false;
|
||||
adbSyncConfig.tableProperties = "spark.sql.sources.provider= 'hudi'\\n"
|
||||
+ "spark.sql.sources.schema.numParts = '1'\\n "
|
||||
+ "spark.sql.sources.schema.part.0 ='xx'\\n "
|
||||
+ "spark.sql.sources.schema.numPartCols = '1'\\n"
|
||||
+ "spark.sql.sources.schema.partCol.0 = 'dt'";
|
||||
adbSyncConfig.serdeProperties = "'path'='/tmp/test_db/tbl'";
|
||||
adbSyncConfig.dbLocation = "file://tmp/test_db";
|
||||
|
||||
TypedProperties props = AdbSyncConfig.toProps(adbSyncConfig);
|
||||
AdbSyncConfig copied = new AdbSyncConfig(props);
|
||||
|
||||
assertEquals(copied.partitionFields, adbSyncConfig.partitionFields);
|
||||
assertEquals(copied.basePath, adbSyncConfig.basePath);
|
||||
assertEquals(copied.assumeDatePartitioning, adbSyncConfig.assumeDatePartitioning);
|
||||
assertEquals(copied.databaseName, adbSyncConfig.databaseName);
|
||||
assertEquals(copied.tableName, adbSyncConfig.tableName);
|
||||
assertEquals(copied.adbUser, adbSyncConfig.adbUser);
|
||||
assertEquals(copied.adbPass, adbSyncConfig.adbPass);
|
||||
assertEquals(copied.basePath, adbSyncConfig.basePath);
|
||||
assertEquals(copied.jdbcUrl, adbSyncConfig.jdbcUrl);
|
||||
assertEquals(copied.skipROSuffix, adbSyncConfig.skipROSuffix);
|
||||
assertEquals(copied.supportTimestamp, adbSyncConfig.supportTimestamp);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,29 @@
|
||||
###
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
###
|
||||
log4j.rootLogger=WARN, CONSOLE
|
||||
log4j.logger.org.apache.hudi=DEBUG
|
||||
|
||||
# CONSOLE is set to be a ConsoleAppender.
|
||||
log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender
|
||||
# CONSOLE uses PatternLayout.
|
||||
log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout
|
||||
log4j.appender.CONSOLE.layout.ConversionPattern=[%-5p] %d %c %x - %m%n
|
||||
log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter
|
||||
log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true
|
||||
log4j.appender.CONSOLE.filter.a.LevelMin=WARN
|
||||
log4j.appender.CONSOLE.filter.a.LevelMax=FATAL
|
||||
@@ -0,0 +1,30 @@
|
||||
###
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
###
|
||||
log4j.rootLogger=WARN, CONSOLE
|
||||
log4j.logger.org.apache=INFO
|
||||
log4j.logger.org.apache.hudi=DEBUG
|
||||
|
||||
# CONSOLE is set to be a ConsoleAppender.
|
||||
log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender
|
||||
# CONSOLE uses PatternLayout.
|
||||
log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout
|
||||
log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
|
||||
log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter
|
||||
log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true
|
||||
log4j.appender.CONSOLE.filter.a.LevelMin=WARN
|
||||
log4j.appender.CONSOLE.filter.a.LevelMax=FATAL
|
||||
Reference in New Issue
Block a user