[HUDI-2883] Refactor hive sync tool / config to use reflection and standardize configs (#4175)
- Refactor hive sync tool / config to use reflection and standardize configs

Co-authored-by: sivabalan <n.siva.b@gmail.com>
Co-authored-by: Rajesh Mahindra <rmahindra@Rajeshs-MacBook-Pro.local>
Co-authored-by: Raymond Xu <2701446+xushiyan@users.noreply.github.com>
This commit is contained in:
@@ -18,7 +18,6 @@
|
||||
|
||||
package org.apache.hudi.utilities.deltastreamer;
|
||||
|
||||
import org.apache.hudi.DataSourceUtils;
|
||||
import org.apache.hudi.DataSourceWriteOptions;
|
||||
import org.apache.hudi.client.SparkRDDWriteClient;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
@@ -34,13 +33,13 @@ import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.hive.HiveSyncConfig;
|
||||
import org.apache.hudi.hive.HiveSyncTool;
|
||||
import org.apache.hudi.index.HoodieIndex;
|
||||
import org.apache.hudi.sync.common.HoodieSyncConfig;
|
||||
import org.apache.hudi.utilities.UtilHelpers;
|
||||
import org.apache.hudi.utilities.schema.SchemaProvider;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.hive.conf.HiveConf;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
@@ -161,12 +160,16 @@ public class BootstrapExecutor implements Serializable {
|
||||
*/
|
||||
private void syncHive() {
|
||||
if (cfg.enableHiveSync || cfg.enableMetaSync) {
|
||||
HiveSyncConfig hiveSyncConfig = DataSourceUtils.buildHiveSyncConfig(props, cfg.targetBasePath, cfg.baseFileFormat);
|
||||
HiveConf hiveConf = new HiveConf(fs.getConf(), HiveConf.class);
|
||||
hiveConf.set(HiveConf.ConfVars.METASTOREURIS.varname,hiveSyncConfig.metastoreUris);
|
||||
LOG.info("Hive Conf => " + hiveConf.getAllProperties().toString());
|
||||
LOG.info("Hive Sync Conf => " + hiveSyncConfig);
|
||||
new HiveSyncTool(hiveSyncConfig, new HiveConf(configuration, HiveConf.class), fs).syncHoodieTable();
|
||||
TypedProperties metaProps = new TypedProperties();
|
||||
metaProps.putAll(props);
|
||||
metaProps.put(HoodieSyncConfig.META_SYNC_BASE_PATH.key(), cfg.targetBasePath);
|
||||
metaProps.put(HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT.key(), cfg.baseFileFormat);
|
||||
if (props.getBoolean(HiveSyncConfig.HIVE_SYNC_BUCKET_SYNC.key(), HiveSyncConfig.HIVE_SYNC_BUCKET_SYNC.defaultValue())) {
|
||||
metaProps.put(HiveSyncConfig.HIVE_SYNC_BUCKET_SYNC_SPEC.key(), HiveSyncConfig.getBucketSpec(props.getString(HoodieIndexConfig.BUCKET_INDEX_HASH_FIELD.key()),
|
||||
props.getInteger(HoodieIndexConfig.BUCKET_INDEX_NUM_BUCKETS.key())));
|
||||
}
|
||||
|
||||
new HiveSyncTool(metaProps, configuration, fs).syncHoodieTable();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -195,4 +198,4 @@ public class BootstrapExecutor implements Serializable {
|
||||
public HoodieWriteConfig getBootstrapConfig() {
|
||||
return bootstrapConfig;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -42,12 +42,12 @@ import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.util.CommitUtils;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.ReflectionUtils;
|
||||
import org.apache.hudi.common.util.StringUtils;
|
||||
import org.apache.hudi.common.util.ValidationUtils;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.config.HoodieClusteringConfig;
|
||||
import org.apache.hudi.config.HoodieCompactionConfig;
|
||||
import org.apache.hudi.config.HoodieIndexConfig;
|
||||
import org.apache.hudi.config.HoodiePayloadConfig;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
@@ -59,7 +59,7 @@ import org.apache.hudi.keygen.SimpleKeyGenerator;
|
||||
import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
|
||||
import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory;
|
||||
import org.apache.hudi.metrics.HoodieMetrics;
|
||||
import org.apache.hudi.sync.common.AbstractSyncTool;
|
||||
import org.apache.hudi.sync.common.util.SyncUtilHelpers;
|
||||
import org.apache.hudi.utilities.UtilHelpers;
|
||||
import org.apache.hudi.utilities.callback.kafka.HoodieWriteCommitKafkaCallback;
|
||||
import org.apache.hudi.utilities.callback.kafka.HoodieWriteCommitKafkaCallbackConfig;
|
||||
@@ -81,7 +81,6 @@ import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.hive.conf.HiveConf;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
@@ -99,7 +98,6 @@ import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.Properties;
|
||||
import java.util.Set;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
@@ -692,44 +690,24 @@ public class DeltaSync implements Serializable {
|
||||
LOG.info("When set --enable-hive-sync will use HiveSyncTool for backward compatibility");
|
||||
}
|
||||
if (cfg.enableMetaSync) {
|
||||
FileSystem fs = FSUtils.getFs(cfg.targetBasePath, jssc.hadoopConfiguration());
|
||||
|
||||
TypedProperties metaProps = new TypedProperties();
|
||||
metaProps.putAll(props);
|
||||
if (props.getBoolean(HiveSyncConfig.HIVE_SYNC_BUCKET_SYNC.key(), HiveSyncConfig.HIVE_SYNC_BUCKET_SYNC.defaultValue())) {
|
||||
metaProps.put(HiveSyncConfig.HIVE_SYNC_BUCKET_SYNC_SPEC.key(), HiveSyncConfig.getBucketSpec(props.getString(HoodieIndexConfig.BUCKET_INDEX_HASH_FIELD.key()),
|
||||
props.getInteger(HoodieIndexConfig.BUCKET_INDEX_NUM_BUCKETS.key())));
|
||||
}
|
||||
|
||||
for (String impl : syncClientToolClasses) {
|
||||
Timer.Context syncContext = metrics.getMetaSyncTimerContext();
|
||||
impl = impl.trim();
|
||||
switch (impl) {
|
||||
case "org.apache.hudi.hive.HiveSyncTool":
|
||||
syncHive();
|
||||
break;
|
||||
default:
|
||||
FileSystem fs = FSUtils.getFs(cfg.targetBasePath, jssc.hadoopConfiguration());
|
||||
Properties properties = new Properties();
|
||||
properties.putAll(props);
|
||||
properties.put("basePath", cfg.targetBasePath);
|
||||
properties.put("baseFileFormat", cfg.baseFileFormat);
|
||||
AbstractSyncTool syncTool = (AbstractSyncTool) ReflectionUtils.loadClass(impl, new Class[]{Properties.class, FileSystem.class}, properties, fs);
|
||||
syncTool.syncHoodieTable();
|
||||
}
|
||||
SyncUtilHelpers.runHoodieMetaSync(impl.trim(), metaProps, conf, fs, cfg.targetBasePath, cfg.baseFileFormat);
|
||||
long metaSyncTimeMs = syncContext != null ? syncContext.stop() : 0;
|
||||
metrics.updateDeltaStreamerMetaSyncMetrics(getSyncClassShortName(impl), metaSyncTimeMs);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void syncHive() {
|
||||
HiveSyncConfig hiveSyncConfig = DataSourceUtils.buildHiveSyncConfig(props, cfg.targetBasePath, cfg.baseFileFormat);
|
||||
HiveConf hiveConf = new HiveConf(conf, HiveConf.class);
|
||||
if (StringUtils.isNullOrEmpty(hiveConf.get(HiveConf.ConfVars.METASTOREURIS.varname))) {
|
||||
hiveConf.set(HiveConf.ConfVars.METASTOREURIS.varname, hiveSyncConfig.metastoreUris);
|
||||
}
|
||||
LOG.info("Hive Conf => " + hiveConf.getAllProperties().toString());
|
||||
LOG.info("Hive Sync Conf => " + hiveSyncConfig.toString());
|
||||
new HiveSyncTool(hiveSyncConfig, hiveConf, fs).syncHoodieTable();
|
||||
}
|
||||
|
||||
public void syncHive(HiveConf conf) {
|
||||
this.conf = conf;
|
||||
syncHive();
|
||||
}
|
||||
|
||||
/**
|
||||
* Note that depending on configs and source-type, schemaProvider could either be eagerly or lazily created.
|
||||
* SchemaProvider creation is a precursor to HoodieWriteClient and AsyncCompactor creation. This method takes care of
|
||||
|
||||
@@ -19,7 +19,6 @@
|
||||
package org.apache.hudi.utilities.deltastreamer;
|
||||
|
||||
import com.beust.jcommander.Parameter;
|
||||
import org.apache.hudi.DataSourceWriteOptions;
|
||||
import org.apache.hudi.client.utils.OperationConverter;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload;
|
||||
@@ -28,6 +27,7 @@ import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.StringUtils;
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.common.util.ValidationUtils;
|
||||
import org.apache.hudi.sync.common.HoodieSyncConfig;
|
||||
import org.apache.hudi.utilities.IdentitySplitter;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.utilities.UtilHelpers;
|
||||
@@ -131,7 +131,7 @@ public class HoodieMultiTableDeltaStreamer {
|
||||
Helpers.deepCopyConfigs(config, cfg);
|
||||
String overriddenTargetBasePath = tableProperties.getString(Constants.TARGET_BASE_PATH_PROP, "");
|
||||
cfg.targetBasePath = StringUtils.isNullOrEmpty(overriddenTargetBasePath) ? targetBasePath : overriddenTargetBasePath;
|
||||
if (cfg.enableMetaSync && StringUtils.isNullOrEmpty(tableProperties.getString(DataSourceWriteOptions.HIVE_TABLE().key(), ""))) {
|
||||
if (cfg.enableMetaSync && StringUtils.isNullOrEmpty(tableProperties.getString(HoodieSyncConfig.META_SYNC_TABLE_NAME.key(), ""))) {
|
||||
throw new HoodieException("Meta sync table field not provided!");
|
||||
}
|
||||
populateSchemaProviderProps(cfg, tableProperties);
|
||||
|
||||
@@ -20,10 +20,13 @@ package org.apache.hudi.utilities;
|
||||
|
||||
import org.apache.hadoop.hive.metastore.api.MetaException;
|
||||
import org.apache.hadoop.hive.ql.metadata.HiveException;
|
||||
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.hive.HiveSyncConfig;
|
||||
import org.apache.hudi.hive.HiveSyncTool;
|
||||
import org.apache.hudi.hive.HoodieHiveClient;
|
||||
import org.apache.hudi.hive.testutils.HiveTestUtil;
|
||||
import org.apache.hudi.sync.common.HoodieSyncConfig;
|
||||
import org.apache.hudi.utilities.exception.HoodieIncrementalPullSQLException;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
@@ -38,7 +41,7 @@ import java.nio.file.Paths;
|
||||
import java.time.Instant;
|
||||
|
||||
import static org.apache.hudi.hive.testutils.HiveTestUtil.fileSystem;
|
||||
import static org.apache.hudi.hive.testutils.HiveTestUtil.hiveSyncConfig;
|
||||
import static org.apache.hudi.hive.testutils.HiveTestUtil.hiveSyncProps;
|
||||
import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
|
||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
@@ -69,12 +72,12 @@ public class TestHiveIncrementalPuller {
|
||||
}
|
||||
|
||||
private HiveIncrementalPuller.Config getHivePullerConfig(String incrementalSql) throws IOException {
|
||||
config.hiveJDBCUrl = hiveSyncConfig.jdbcUrl;
|
||||
config.hiveUsername = hiveSyncConfig.hiveUser;
|
||||
config.hivePassword = hiveSyncConfig.hivePass;
|
||||
config.hiveJDBCUrl = hiveSyncProps.getString(HiveSyncConfig.HIVE_URL.key());
|
||||
config.hiveUsername = hiveSyncProps.getString(HiveSyncConfig.HIVE_USER.key());
|
||||
config.hivePassword = hiveSyncProps.getString(HiveSyncConfig.HIVE_PASS.key());
|
||||
config.hoodieTmpDir = Files.createTempDirectory("hivePullerTest").toUri().toString();
|
||||
config.sourceDb = hiveSyncConfig.databaseName;
|
||||
config.sourceTable = hiveSyncConfig.tableName;
|
||||
config.sourceDb = hiveSyncProps.getString(HoodieSyncConfig.META_SYNC_DATABASE_NAME.key());
|
||||
config.sourceTable = hiveSyncProps.getString(HoodieSyncConfig.META_SYNC_TABLE_NAME.key());
|
||||
config.targetDb = "tgtdb";
|
||||
config.targetTable = "test2";
|
||||
config.tmpDb = "tmp_db";
|
||||
@@ -98,9 +101,9 @@ public class TestHiveIncrementalPuller {
|
||||
private void createSourceTable() throws IOException, URISyntaxException {
|
||||
String instantTime = "101";
|
||||
HiveTestUtil.createCOWTable(instantTime, 5, true);
|
||||
hiveSyncConfig.syncMode = "jdbc";
|
||||
HiveTestUtil.hiveSyncConfig.batchSyncNum = 3;
|
||||
HiveSyncTool tool = new HiveSyncTool(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem);
|
||||
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), "jdbc");
|
||||
|
||||
HiveSyncTool tool = new HiveSyncTool(hiveSyncProps, HiveTestUtil.getHiveConf(), fileSystem);
|
||||
tool.syncHoodieTable();
|
||||
}
|
||||
|
||||
@@ -113,20 +116,20 @@ public class TestHiveIncrementalPuller {
|
||||
tool.syncHoodieTable();
|
||||
}
|
||||
|
||||
private HiveSyncConfig getTargetHiveSyncConfig(String basePath) {
|
||||
HiveSyncConfig config = HiveSyncConfig.copy(hiveSyncConfig);
|
||||
config.databaseName = "tgtdb";
|
||||
config.tableName = "test2";
|
||||
config.basePath = basePath;
|
||||
config.batchSyncNum = 3;
|
||||
config.syncMode = "jdbc";
|
||||
return config;
|
||||
private TypedProperties getTargetHiveSyncConfig(String basePath) {
|
||||
TypedProperties targetHiveSyncProps = new TypedProperties(hiveSyncProps);
|
||||
targetHiveSyncProps.setProperty(HoodieSyncConfig.META_SYNC_DATABASE_NAME.key(), "tgtdb");
|
||||
targetHiveSyncProps.setProperty(HoodieSyncConfig.META_SYNC_TABLE_NAME.key(), "test2");
|
||||
targetHiveSyncProps.setProperty(HoodieSyncConfig.META_SYNC_BASE_PATH.key(), basePath);
|
||||
targetHiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), "jdbc");
|
||||
|
||||
return targetHiveSyncProps;
|
||||
}
|
||||
|
||||
private HiveSyncConfig getAssertionSyncConfig(String databaseName) {
|
||||
HiveSyncConfig config = HiveSyncConfig.copy(hiveSyncConfig);
|
||||
config.databaseName = databaseName;
|
||||
return config;
|
||||
private TypedProperties getAssertionSyncConfig(String databaseName) {
|
||||
TypedProperties assertHiveSyncProps = new TypedProperties(hiveSyncProps);
|
||||
assertHiveSyncProps.setProperty(HoodieSyncConfig.META_SYNC_DATABASE_NAME.key(), databaseName);
|
||||
return assertHiveSyncProps;
|
||||
}
|
||||
|
||||
private void createTables() throws IOException, URISyntaxException {
|
||||
@@ -158,12 +161,11 @@ public class TestHiveIncrementalPuller {
|
||||
public void testPuller() throws IOException, URISyntaxException {
|
||||
createTables();
|
||||
HiveIncrementalPuller.Config cfg = getHivePullerConfig("select name from testdb.test1 where `_hoodie_commit_time` > '%s'");
|
||||
HoodieHiveClient hiveClient = new HoodieHiveClient(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem);
|
||||
HoodieHiveClient hiveClient = new HoodieHiveClient(new HiveSyncConfig(hiveSyncProps), HiveTestUtil.getHiveConf(), fileSystem);
|
||||
hiveClient.createDatabase(cfg.tmpDb);
|
||||
HiveIncrementalPuller puller = new HiveIncrementalPuller(cfg);
|
||||
puller.saveDelta();
|
||||
HiveSyncConfig assertingConfig = getAssertionSyncConfig(cfg.tmpDb);
|
||||
HoodieHiveClient assertingClient = new HoodieHiveClient(assertingConfig, HiveTestUtil.getHiveConf(), fileSystem);
|
||||
HoodieHiveClient assertingClient = new HoodieHiveClient(new HiveSyncConfig(getAssertionSyncConfig(cfg.tmpDb)), HiveTestUtil.getHiveConf(), fileSystem);
|
||||
String tmpTable = cfg.targetTable + "__" + cfg.sourceTable;
|
||||
assertTrue(assertingClient.doesTableExist(tmpTable));
|
||||
}
|
||||
|
||||
@@ -18,7 +18,6 @@
|
||||
|
||||
package org.apache.hudi.utilities.functional;
|
||||
|
||||
import org.apache.hudi.DataSourceWriteOptions;
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
||||
import org.apache.hudi.common.model.WriteOperationType;
|
||||
@@ -28,6 +27,7 @@ import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.hive.HiveSyncConfig;
|
||||
import org.apache.hudi.hive.MultiPartKeysValueExtractor;
|
||||
import org.apache.hudi.utilities.schema.FilebasedSchemaProvider;
|
||||
import org.apache.hudi.utilities.testutils.UtilitiesTestBase;
|
||||
@@ -178,11 +178,11 @@ public class HoodieDeltaStreamerTestBase extends UtilitiesTestBase {
|
||||
props.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/target.avsc");
|
||||
|
||||
// Hive Configs
|
||||
props.setProperty(DataSourceWriteOptions.HIVE_URL().key(), "jdbc:hive2://127.0.0.1:9999/");
|
||||
props.setProperty(DataSourceWriteOptions.HIVE_DATABASE().key(), "testdb1");
|
||||
props.setProperty(DataSourceWriteOptions.HIVE_TABLE().key(), "hive_trips");
|
||||
props.setProperty(DataSourceWriteOptions.HIVE_PARTITION_FIELDS().key(), "datestr");
|
||||
props.setProperty(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS().key(),
|
||||
props.setProperty(HiveSyncConfig.HIVE_URL.key(), "jdbc:hive2://127.0.0.1:9999/");
|
||||
props.setProperty(HiveSyncConfig.META_SYNC_DATABASE_NAME.key(), "testdb1");
|
||||
props.setProperty(HiveSyncConfig.META_SYNC_TABLE_NAME.key(), "hive_trips");
|
||||
props.setProperty(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "datestr");
|
||||
props.setProperty(HiveSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(),
|
||||
MultiPartKeysValueExtractor.class.getName());
|
||||
UtilitiesTestBase.Helpers.savePropsToDFS(props, dfs, dfsBasePath + "/" + PROPS_FILENAME_TEST_SOURCE);
|
||||
}
|
||||
@@ -237,11 +237,11 @@ public class HoodieDeltaStreamerTestBase extends UtilitiesTestBase {
|
||||
|
||||
protected static void populateCommonHiveProps(TypedProperties props) {
|
||||
// Hive Configs
|
||||
props.setProperty(DataSourceWriteOptions.HIVE_URL().key(), "jdbc:hive2://127.0.0.1:9999/");
|
||||
props.setProperty(DataSourceWriteOptions.HIVE_DATABASE().key(), "testdb2");
|
||||
props.setProperty(DataSourceWriteOptions.HIVE_ASSUME_DATE_PARTITION().key(), "false");
|
||||
props.setProperty(DataSourceWriteOptions.HIVE_PARTITION_FIELDS().key(), "datestr");
|
||||
props.setProperty(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS().key(),
|
||||
props.setProperty(HiveSyncConfig.HIVE_URL.key(), "jdbc:hive2://127.0.0.1:9999/");
|
||||
props.setProperty(HiveSyncConfig.META_SYNC_DATABASE_NAME.key(), "testdb2");
|
||||
props.setProperty(HiveSyncConfig.META_SYNC_ASSUME_DATE_PARTITION.key(), "false");
|
||||
props.setProperty(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "datestr");
|
||||
props.setProperty(HiveSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(),
|
||||
MultiPartKeysValueExtractor.class.getName());
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user