
[HUDI-3730] Improve meta sync class design and hierarchies (#5854)

* [HUDI-3730] Improve meta sync class design and hierarchies (#5754)
* Implements class design proposed in RFC-55

Co-authored-by: jian.feng <fengjian428@gmial.com>
Co-authored-by: jian.feng <jian.feng@shopee.com>
Shiyan Xu authored on 2022-07-03 04:17:25 -05:00, committed by GitHub
parent c00ea84985, commit c0e1587966
86 changed files with 2977 additions and 2877 deletions
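
For orientation before the file-by-file diff: after this change, every meta sync tool is constructed from a plain java.util.Properties bag keyed by ConfigProperty entries, instead of a mutable config POJO plus Hadoop FileSystem handles. A minimal sketch of the new calling pattern, pieced together from the main() methods in the diffs below (the base path, database, and table name are hypothetical placeholders; the HiveSyncTool(Properties, Configuration) constructor is implied by the AWSGlueCatalogSyncTool diff):

    import java.util.Properties;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hudi.hive.HiveSyncTool;
    import org.apache.hudi.sync.common.HoodieSyncConfig;

    public class SyncExample {
      public static void main(String[] args) {
        Properties props = new Properties();
        // Keys come from the new HoodieSyncConfig ConfigProperty constants.
        props.setProperty(HoodieSyncConfig.META_SYNC_BASE_PATH.key(), "/tmp/hudi/stock_ticks");
        props.setProperty(HoodieSyncConfig.META_SYNC_DATABASE_NAME.key(), "default");
        props.setProperty(HoodieSyncConfig.META_SYNC_TABLE_NAME.key(), "stock_ticks");
        // Swap in new BigQuerySyncTool(props) or new AWSGlueCatalogSyncTool(props, conf)
        // for the other catalogs shown below.
        new HiveSyncTool(props, new Configuration()).syncHoodieTable();
      }
    }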

File: (Docker demo Scala script)

@@ -21,7 +21,7 @@ import org.apache.hudi.DataSourceWriteOptions;
 import org.apache.spark.sql.SaveMode;
 import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.hudi.HoodieDataSourceHelpers;
-import org.apache.hudi.hive.HiveSyncConfig;
+import org.apache.hudi.hive.HiveSyncConfigHolder;
 import org.apache.hudi.sync.common.HoodieSyncConfig;
 import org.apache.hudi.hive.MultiPartKeysValueExtractor;
 import org.apache.hadoop.fs.FileSystem;

@@ -47,10 +47,10 @@ spark.sql("select key, `_hoodie_partition_path` as datestr, symbol, ts, open, cl
   option(HoodieWriteConfig.TBL_NAME.key(), "stock_ticks_derived_mor").
   option(HoodieSyncConfig.META_SYNC_TABLE_NAME.key(), "stock_ticks_derived_mor").
   option(HoodieSyncConfig.META_SYNC_DATABASE_NAME.key(), "default").
-  option(HiveSyncConfig.HIVE_URL.key(), "jdbc:hive2://hiveserver:10000").
-  option(HiveSyncConfig.HIVE_USER.key(), "hive").
-  option(HiveSyncConfig.HIVE_PASS.key(), "hive").
-  option(HiveSyncConfig.HIVE_SYNC_ENABLED.key(), "true").
+  option(HiveSyncConfigHolder.HIVE_URL.key(), "jdbc:hive2://hiveserver:10000").
+  option(HiveSyncConfigHolder.HIVE_USER.key(), "hive").
+  option(HiveSyncConfigHolder.HIVE_PASS.key(), "hive").
+  option(HiveSyncConfigHolder.HIVE_SYNC_ENABLED.key(), "true").
   option(HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "datestr").
   option(HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), classOf[MultiPartKeysValueExtractor].getCanonicalName).
   option(DataSourceWriteOptions.URL_ENCODE_PARTITIONING.key(), "true").

@@ -79,10 +79,10 @@ spark.sql("select key, `_hoodie_partition_path` as datestr, symbol, ts, open, cl
   option(HoodieWriteConfig.TBL_NAME.key(), "stock_ticks_derived_mor_bs").
   option(HoodieSyncConfig.META_SYNC_TABLE_NAME.key(), "stock_ticks_derived_mor_bs").
   option(HoodieSyncConfig.META_SYNC_DATABASE_NAME.key(), "default").
-  option(HiveSyncConfig.HIVE_URL.key(), "jdbc:hive2://hiveserver:10000").
-  option(HiveSyncConfig.HIVE_USER.key(), "hive").
-  option(HiveSyncConfig.HIVE_PASS.key(), "hive").
-  option(HiveSyncConfig.HIVE_SYNC_ENABLED.key(), "true").
+  option(HiveSyncConfigHolder.HIVE_URL.key(), "jdbc:hive2://hiveserver:10000").
+  option(HiveSyncConfigHolder.HIVE_USER.key(), "hive").
+  option(HiveSyncConfigHolder.HIVE_PASS.key(), "hive").
+  option(HiveSyncConfigHolder.HIVE_SYNC_ENABLED.key(), "true").
   option(HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "datestr").
   option(HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), classOf[MultiPartKeysValueExtractor].getCanonicalName).
   option(DataSourceWriteOptions.URL_ENCODE_PARTITIONING.key(), "true").

File: AWSGlueCatalogSyncClient.java

@@ -21,8 +21,8 @@ package org.apache.hudi.aws.sync;
 import org.apache.hudi.common.fs.FSUtils;
 import org.apache.hudi.common.util.CollectionUtils;
 import org.apache.hudi.common.util.Option;
-import org.apache.hudi.hive.AbstractHiveSyncHoodieClient;
 import org.apache.hudi.hive.HiveSyncConfig;
+import org.apache.hudi.sync.common.HoodieSyncClient;
 import org.apache.hudi.sync.common.model.Partition;
 
 import com.amazonaws.services.glue.AWSGlue;

@@ -50,10 +50,6 @@ import com.amazonaws.services.glue.model.StorageDescriptor;
 import com.amazonaws.services.glue.model.Table;
 import com.amazonaws.services.glue.model.TableInput;
 import com.amazonaws.services.glue.model.UpdateTableRequest;
-import org.apache.avro.Schema;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.hive.metastore.api.FieldSchema;
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
 import org.apache.parquet.schema.MessageType;

@@ -69,8 +65,12 @@ import java.util.stream.Collectors;
 import static org.apache.hudi.aws.utils.S3Utils.s3aToS3;
 import static org.apache.hudi.common.util.MapUtils.nonEmpty;
+import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_CREATE_MANAGED_TABLE;
+import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE;
 import static org.apache.hudi.hive.util.HiveSchemaUtil.getPartitionKeyType;
 import static org.apache.hudi.hive.util.HiveSchemaUtil.parquetSchemaToMapSchema;
+import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
+import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS;
 import static org.apache.hudi.sync.common.util.TableUtils.tableId;
 
 /**

@@ -79,7 +79,7 @@ import static org.apache.hudi.sync.common.util.TableUtils.tableId;
  *
  * @Experimental
  */
-public class AWSGlueCatalogSyncClient extends AbstractHiveSyncHoodieClient {
+public class AWSGlueCatalogSyncClient extends HoodieSyncClient {
 
   private static final Logger LOG = LogManager.getLogger(AWSGlueCatalogSyncClient.class);
   private static final int MAX_PARTITIONS_PER_REQUEST = 100;

@@ -87,10 +87,10 @@ public class AWSGlueCatalogSyncClient extends AbstractHiveSyncHoodieClient {
   private final AWSGlue awsGlue;
   private final String databaseName;
 
-  public AWSGlueCatalogSyncClient(HiveSyncConfig syncConfig, Configuration hadoopConf, FileSystem fs) {
-    super(syncConfig, hadoopConf, fs);
+  public AWSGlueCatalogSyncClient(HiveSyncConfig config) {
+    super(config);
     this.awsGlue = AWSGlueClientBuilder.standard().build();
-    this.databaseName = syncConfig.databaseName;
+    this.databaseName = config.getStringOrDefault(META_SYNC_DATABASE_NAME);
   }
 
   @Override

@@ -126,7 +126,7 @@ public class AWSGlueCatalogSyncClient extends AbstractHiveSyncHoodieClient {
       StorageDescriptor sd = table.getStorageDescriptor();
       List<PartitionInput> partitionInputs = partitionsToAdd.stream().map(partition -> {
         StorageDescriptor partitionSd = sd.clone();
-        String fullPartitionPath = FSUtils.getPartitionPath(syncConfig.basePath, partition).toString();
+        String fullPartitionPath = FSUtils.getPartitionPath(getBasePath(), partition).toString();
         List<String> partitionValues = partitionValueExtractor.extractPartitionValuesInPath(partition);
         partitionSd.setLocation(fullPartitionPath);
         return new PartitionInput().withValues(partitionValues).withStorageDescriptor(partitionSd);

@@ -160,7 +160,7 @@ public class AWSGlueCatalogSyncClient extends AbstractHiveSyncHoodieClient {
       StorageDescriptor sd = table.getStorageDescriptor();
       List<BatchUpdatePartitionRequestEntry> updatePartitionEntries = changedPartitions.stream().map(partition -> {
         StorageDescriptor partitionSd = sd.clone();
-        String fullPartitionPath = FSUtils.getPartitionPath(syncConfig.basePath, partition).toString();
+        String fullPartitionPath = FSUtils.getPartitionPath(getBasePath(), partition).toString();
         List<String> partitionValues = partitionValueExtractor.extractPartitionValuesInPath(partition);
         sd.setLocation(fullPartitionPath);
         PartitionInput partitionInput = new PartitionInput().withValues(partitionValues).withStorageDescriptor(partitionSd);

@@ -204,12 +204,12 @@ public class AWSGlueCatalogSyncClient extends AbstractHiveSyncHoodieClient {
   }
 
   @Override
-  public void updateTableDefinition(String tableName, MessageType newSchema) {
+  public void updateTableSchema(String tableName, MessageType newSchema) {
     // ToDo Cascade is set in Hive meta sync, but need to investigate how to configure it for Glue meta
-    boolean cascade = syncConfig.partitionFields.size() > 0;
+    boolean cascade = config.getSplitStrings(META_SYNC_PARTITION_FIELDS).size() > 0;
     try {
       Table table = getTable(awsGlue, databaseName, tableName);
-      Map<String, String> newSchemaMap = parquetSchemaToMapSchema(newSchema, syncConfig.supportTimestamp, false);
+      Map<String, String> newSchemaMap = parquetSchemaToMapSchema(newSchema, config.getBoolean(HIVE_SUPPORT_TIMESTAMP_TYPE), false);
       List<Column> newColumns = newSchemaMap.keySet().stream().map(key -> {
         String keyType = getPartitionKeyType(newSchemaMap, key);
         return new Column().withName(key).withType(keyType.toLowerCase()).withComment("");

@@ -237,21 +237,6 @@ public class AWSGlueCatalogSyncClient extends AbstractHiveSyncHoodieClient {
     }
   }
 
-  @Override
-  public List<FieldSchema> getTableCommentUsingMetastoreClient(String tableName) {
-    throw new UnsupportedOperationException("Not supported: `getTableCommentUsingMetastoreClient`");
-  }
-
-  @Override
-  public void updateTableComments(String tableName, List<FieldSchema> oldSchema, List<Schema.Field> newSchema) {
-    throw new UnsupportedOperationException("Not supported: `updateTableComments`");
-  }
-
-  @Override
-  public void updateTableComments(String tableName, List<FieldSchema> oldSchema, Map<String, String> newComments) {
-    throw new UnsupportedOperationException("Not supported: `updateTableComments`");
-  }
-
   @Override
   public void createTable(String tableName,
                           MessageType storageSchema,

@@ -265,26 +250,26 @@ public class AWSGlueCatalogSyncClient extends AbstractHiveSyncHoodieClient {
     }
     CreateTableRequest request = new CreateTableRequest();
     Map<String, String> params = new HashMap<>();
-    if (!syncConfig.createManagedTable) {
+    if (!config.getBoolean(HIVE_CREATE_MANAGED_TABLE)) {
       params.put("EXTERNAL", "TRUE");
     }
     params.putAll(tableProperties);
     try {
-      Map<String, String> mapSchema = parquetSchemaToMapSchema(storageSchema, syncConfig.supportTimestamp, false);
+      Map<String, String> mapSchema = parquetSchemaToMapSchema(storageSchema, config.getBoolean(HIVE_SUPPORT_TIMESTAMP_TYPE), false);
       List<Column> schemaWithoutPartitionKeys = new ArrayList<>();
       for (String key : mapSchema.keySet()) {
         String keyType = getPartitionKeyType(mapSchema, key);
         Column column = new Column().withName(key).withType(keyType.toLowerCase()).withComment("");
         // In Glue, the full schema should exclude the partition keys
-        if (!syncConfig.partitionFields.contains(key)) {
+        if (!config.getSplitStrings(META_SYNC_PARTITION_FIELDS).contains(key)) {
           schemaWithoutPartitionKeys.add(column);
         }
       }
       // now create the schema partition
-      List<Column> schemaPartitionKeys = syncConfig.partitionFields.stream().map(partitionKey -> {
+      List<Column> schemaPartitionKeys = config.getSplitStrings(META_SYNC_PARTITION_FIELDS).stream().map(partitionKey -> {
         String keyType = getPartitionKeyType(mapSchema, partitionKey);
         return new Column().withName(partitionKey).withType(keyType.toLowerCase()).withComment("");
       }).collect(Collectors.toList());

@@ -293,7 +278,7 @@ public class AWSGlueCatalogSyncClient extends AbstractHiveSyncHoodieClient {
       serdeProperties.put("serialization.format", "1");
       storageDescriptor
           .withSerdeInfo(new SerDeInfo().withSerializationLibrary(serdeClass).withParameters(serdeProperties))
-          .withLocation(s3aToS3(syncConfig.basePath))
+          .withLocation(s3aToS3(getBasePath()))
           .withInputFormat(inputFormatClass)
           .withOutputFormat(outputFormatClass)
           .withColumns(schemaWithoutPartitionKeys);

@@ -320,7 +305,7 @@ public class AWSGlueCatalogSyncClient extends AbstractHiveSyncHoodieClient {
   }
 
   @Override
-  public Map<String, String> getTableSchema(String tableName) {
+  public Map<String, String> getMetastoreSchema(String tableName) {
     try {
       // GlueMetastoreClient returns partition keys separate from Columns, hence get both and merge to
       // get the Schema of the table.

@@ -340,11 +325,6 @@ public class AWSGlueCatalogSyncClient extends AbstractHiveSyncHoodieClient {
     }
   }
 
-  @Override
-  public boolean doesTableExist(String tableName) {
-    return tableExists(tableName);
-  }
-
   @Override
   public boolean tableExists(String tableName) {
     GetTableRequest request = new GetTableRequest()

@@ -412,11 +392,11 @@ public class AWSGlueCatalogSyncClient extends AbstractHiveSyncHoodieClient {
   @Override
   public void updateLastCommitTimeSynced(String tableName) {
-    if (!activeTimeline.lastInstant().isPresent()) {
+    if (!getActiveTimeline().lastInstant().isPresent()) {
       LOG.warn("No commit in active timeline.");
       return;
     }
-    final String lastCommitTimestamp = activeTimeline.lastInstant().get().getTimestamp();
+    final String lastCommitTimestamp = getActiveTimeline().lastInstant().get().getTimestamp();
     try {
       updateTableParameters(awsGlue, databaseName, tableName, Collections.singletonMap(HOODIE_LAST_COMMIT_TIME_SYNC, lastCommitTimestamp), false);
     } catch (Exception e) {

File: AWSGlueCatalogSyncTool.java (renamed from AwsGlueCatalogSyncTool.java)

@@ -18,53 +18,44 @@
 package org.apache.hudi.aws.sync;
 
-import org.apache.hudi.common.config.TypedProperties;
-import org.apache.hudi.common.fs.FSUtils;
 import org.apache.hudi.hive.HiveSyncConfig;
 import org.apache.hudi.hive.HiveSyncTool;
 
 import com.beust.jcommander.JCommander;
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.hive.conf.HiveConf;
+
+import java.util.Properties;
 
 /**
  * Currently Experimental. Utility class that implements syncing a Hudi Table with the
  * AWS Glue Data Catalog (https://docs.aws.amazon.com/glue/latest/dg/populate-data-catalog.html)
  * to enable querying via Glue ETLs, Athena etc.
- *
+ * <p>
  * Extends HiveSyncTool since most logic is similar to Hive syncing,
  * expect using a different client {@link AWSGlueCatalogSyncClient} that implements
  * the necessary functionality using Glue APIs.
  *
  * @Experimental
  */
-public class AwsGlueCatalogSyncTool extends HiveSyncTool {
+public class AWSGlueCatalogSyncTool extends HiveSyncTool {
 
-  public AwsGlueCatalogSyncTool(TypedProperties props, Configuration conf, FileSystem fs) {
-    super(props, new HiveConf(conf, HiveConf.class), fs);
-  }
-
-  public AwsGlueCatalogSyncTool(HiveSyncConfig hiveSyncConfig, HiveConf hiveConf, FileSystem fs) {
-    super(hiveSyncConfig, hiveConf, fs);
+  public AWSGlueCatalogSyncTool(Properties props, Configuration hadoopConf) {
+    super(props, hadoopConf);
   }
 
   @Override
-  protected void initClient(HiveSyncConfig hiveSyncConfig, HiveConf hiveConf) {
-    hoodieHiveClient = new AWSGlueCatalogSyncClient(hiveSyncConfig, hiveConf, fs);
+  protected void initSyncClient(HiveSyncConfig hiveSyncConfig) {
+    syncClient = new AWSGlueCatalogSyncClient(hiveSyncConfig);
   }
 
   public static void main(String[] args) {
-    // parse the params
-    final HiveSyncConfig cfg = new HiveSyncConfig();
-    JCommander cmd = new JCommander(cfg, null, args);
-    if (cfg.help || args.length == 0) {
+    final HiveSyncConfig.HiveSyncConfigParams params = new HiveSyncConfig.HiveSyncConfigParams();
+    JCommander cmd = JCommander.newBuilder().addObject(params).build();
+    cmd.parse(args);
+    if (params.isHelp()) {
       cmd.usage();
-      System.exit(1);
+      System.exit(0);
     }
-    FileSystem fs = FSUtils.getFs(cfg.basePath, new Configuration());
-    HiveConf hiveConf = new HiveConf();
-    hiveConf.addResource(fs.getConf());
-    new AwsGlueCatalogSyncTool(cfg, hiveConf, fs).syncHoodieTable();
+    new AWSGlueCatalogSyncTool(params.toProps(), new Configuration()).syncHoodieTable();
   }
 }

File: HoodieConfig.java

@@ -21,6 +21,7 @@ package org.apache.hudi.common.config;
 import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hudi.common.util.Option;
 import org.apache.hudi.common.util.ReflectionUtils;
+import org.apache.hudi.common.util.StringUtils;
 import org.apache.hudi.exception.HoodieException;
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;

@@ -29,6 +30,7 @@ import java.io.IOException;
 import java.io.Serializable;
 import java.lang.reflect.Modifier;
 import java.util.Arrays;
+import java.util.List;
 import java.util.Properties;
 
 /**

@@ -133,6 +135,14 @@ public class HoodieConfig implements Serializable {
     return rawValue.map(Object::toString).orElse(null);
   }
 
+  public <T> List<String> getSplitStrings(ConfigProperty<T> configProperty) {
+    return getSplitStrings(configProperty, ",");
+  }
+
+  public <T> List<String> getSplitStrings(ConfigProperty<T> configProperty, String delimiter) {
+    return StringUtils.split(getString(configProperty), delimiter);
+  }
+
   public String getString(String key) {
     return props.getProperty(key);
   }
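
The new getSplitStrings helper centralizes the comma-splitting that callers such as AWSGlueCatalogSyncClient above now use for META_SYNC_PARTITION_FIELDS. A brief sketch of the intended usage, assuming the Properties-based HoodieSyncConfig constructor introduced in this PR and that StringUtils.split returns an empty list for an unset value (implied by the unguarded size() > 0 call in the Glue client):

    TypedProperties props = new TypedProperties();
    props.setProperty(HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "year,month,day");
    HoodieSyncConfig config = new HoodieSyncConfig(props);
    List<String> fields = config.getSplitStrings(HoodieSyncConfig.META_SYNC_PARTITION_FIELDS);
    // fields -> [year, month, day]; a missing key yields an empty list, never null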

File: TypedProperties.java

@@ -49,6 +49,12 @@ public class TypedProperties extends Properties implements Serializable {
     }
   }
 
+  public void setPropertyIfNonNull(String key, Object value) {
+    if (value != null) {
+      setProperty(key, value.toString());
+    }
+  }
+
   @Override
   public String getProperty(String key) {
     Object oval = super.get(key);
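
setPropertyIfNonNull is what lets HiveSyncContext.buildSyncConfig (further down) copy Flink options wholesale without per-option null checks: absent values are simply skipped, so ConfigProperty defaults can apply later. A small illustration (the key string is hypothetical):

    TypedProperties props = new TypedProperties();
    props.setPropertyIfNonNull("hoodie.example.optional.key", null);  // no-op, key stays absent
    props.setPropertyIfNonNull("hoodie.example.optional.key", "hms"); // stored as "hms"
    // Plain Properties.setProperty(key, null) would instead throw a NullPointerException.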

File: StringUtils.java

@@ -19,6 +19,7 @@
 package org.apache.hudi.common.util;
 
 import javax.annotation.Nullable;
+
 import java.nio.ByteBuffer;
 import java.util.Collections;
 import java.util.List;

@@ -65,6 +66,18 @@ public class StringUtils {
     return org.apache.hadoop.util.StringUtils.join(separator, array);
   }
 
+  /**
+   * Wrapper of {@link java.lang.String#join(CharSequence, Iterable)}.
+   *
+   * Allow return {@code null} when {@code Iterable} is {@code null}.
+   */
+  public static String join(CharSequence delimiter, Iterable<? extends CharSequence> elements) {
+    if (elements == null) {
+      return null;
+    }
+    return String.join(delimiter, elements);
+  }
+
   public static String toHexString(byte[] bytes) {
     StringBuilder sb = new StringBuilder(bytes.length * 2);
     for (byte b : bytes) {

@@ -77,6 +90,9 @@
     return str == null || str.length() == 0;
   }
 
+  public static boolean nonEmpty(String str) {
+    return !isNullOrEmpty(str);
+  }
+
   /**
    * Returns the given string if it is non-null; the empty string otherwise.
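
The null-tolerant join and the nonEmpty shorthand exist so sync code can pass possibly-absent partition-field lists straight through; the new test in the next file pins the edge cases. Briefly:

    StringUtils.join(",", Arrays.asList("year", "month"));  // "year,month"
    StringUtils.join(",", (Iterable<String>) null);         // null, where String.join would throw NPE
    StringUtils.nonEmpty("");                               // false
    StringUtils.nonEmpty("datestr");                        // true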

File: TestStringUtils.java

@@ -23,6 +23,7 @@ import org.junit.jupiter.api.Test;
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collections;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertNotEquals;

@@ -45,6 +46,14 @@ public class TestStringUtils {
     assertNotEquals(null, StringUtils.join(STRINGS));
   }
 
+  @Test
+  public void testStringJoinWithJavaImpl() {
+    assertNull(StringUtils.join(",", null));
+    assertEquals("", String.join(",", Collections.singletonList("")));
+    assertEquals(",", String.join(",", Arrays.asList("", "")));
+    assertEquals("a,", String.join(",", Arrays.asList("a", "")));
+  }
+
   @Test
   public void testStringNullToEmpty() {
     String str = "This is a test";

File: HiveSyncContext.java

@@ -18,22 +18,43 @@
 package org.apache.hudi.sink.utils;
 
-import org.apache.hudi.aws.sync.AwsGlueCatalogSyncTool;
+import org.apache.hudi.aws.sync.AWSGlueCatalogSyncTool;
 import org.apache.hudi.common.config.SerializableConfiguration;
-import org.apache.hudi.common.fs.FSUtils;
+import org.apache.hudi.common.config.TypedProperties;
 import org.apache.hudi.configuration.FlinkOptions;
 import org.apache.hudi.configuration.HadoopConfigurations;
-import org.apache.hudi.hive.HiveSyncConfig;
 import org.apache.hudi.hive.HiveSyncTool;
 import org.apache.hudi.hive.ddl.HiveSyncMode;
 import org.apache.hudi.table.format.FilePathUtils;
 
 import org.apache.flink.annotation.VisibleForTesting;
 import org.apache.flink.configuration.Configuration;
-import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.hive.conf.HiveConf;
 
-import java.util.Arrays;
+import java.util.Properties;
+
+import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_AUTO_CREATE_DATABASE;
+import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_IGNORE_EXCEPTIONS;
+import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_PASS;
+import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE;
+import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE;
+import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_MODE;
+import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_TABLE_PROPERTIES;
+import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_TABLE_SERDE_PROPERTIES;
+import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_URL;
+import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USER;
+import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USE_JDBC;
+import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USE_PRE_APACHE_INPUT_FORMAT;
+import static org.apache.hudi.hive.HiveSyncConfigHolder.METASTORE_URIS;
+import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_ASSUME_DATE_PARTITION;
+import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT;
+import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH;
+import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
+import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DECODE_PARTITION;
+import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS;
+import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS;
+import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME;
+import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_USE_FILE_LISTING_FROM_METADATA;
 
 /**
  * Hive synchronization context.

@@ -41,63 +62,57 @@
  * <p>Use this context to create the {@link HiveSyncTool} for synchronization.
  */
 public class HiveSyncContext {
-  private final HiveSyncConfig syncConfig;
+  private final Properties props;
   private final HiveConf hiveConf;
-  private final FileSystem fs;
 
-  private HiveSyncContext(HiveSyncConfig syncConfig, HiveConf hiveConf, FileSystem fs) {
-    this.syncConfig = syncConfig;
+  private HiveSyncContext(Properties props, HiveConf hiveConf) {
+    this.props = props;
     this.hiveConf = hiveConf;
-    this.fs = fs;
   }
 
   public HiveSyncTool hiveSyncTool() {
-    HiveSyncMode syncMode = HiveSyncMode.of(syncConfig.syncMode);
+    HiveSyncMode syncMode = HiveSyncMode.of(props.getProperty(HIVE_SYNC_MODE.key()));
     if (syncMode == HiveSyncMode.GLUE) {
-      return new AwsGlueCatalogSyncTool(this.syncConfig, this.hiveConf, this.fs);
+      return new AWSGlueCatalogSyncTool(props, hiveConf);
     }
-    return new HiveSyncTool(this.syncConfig, this.hiveConf, this.fs);
+    return new HiveSyncTool(props, hiveConf);
   }
 
   public static HiveSyncContext create(Configuration conf, SerializableConfiguration serConf) {
-    HiveSyncConfig syncConfig = buildSyncConfig(conf);
+    Properties props = buildSyncConfig(conf);
     org.apache.hadoop.conf.Configuration hadoopConf = HadoopConfigurations.getHadoopConf(conf);
-    String path = conf.getString(FlinkOptions.PATH);
-    FileSystem fs = FSUtils.getFs(path, hadoopConf);
     HiveConf hiveConf = new HiveConf();
-    if (!FlinkOptions.isDefaultValueDefined(conf, FlinkOptions.HIVE_SYNC_METASTORE_URIS)) {
-      hadoopConf.set(HiveConf.ConfVars.METASTOREURIS.varname, conf.getString(FlinkOptions.HIVE_SYNC_METASTORE_URIS));
-    }
     hiveConf.addResource(serConf.get());
     hiveConf.addResource(hadoopConf);
-    return new HiveSyncContext(syncConfig, hiveConf, fs);
+    return new HiveSyncContext(props, hiveConf);
   }
 
   @VisibleForTesting
-  public static HiveSyncConfig buildSyncConfig(Configuration conf) {
-    HiveSyncConfig hiveSyncConfig = new HiveSyncConfig();
-    hiveSyncConfig.basePath = conf.getString(FlinkOptions.PATH);
-    hiveSyncConfig.baseFileFormat = conf.getString(FlinkOptions.HIVE_SYNC_FILE_FORMAT);
-    hiveSyncConfig.usePreApacheInputFormat = false;
-    hiveSyncConfig.databaseName = conf.getString(FlinkOptions.HIVE_SYNC_DB);
-    hiveSyncConfig.tableName = conf.getString(FlinkOptions.HIVE_SYNC_TABLE);
-    hiveSyncConfig.syncMode = conf.getString(FlinkOptions.HIVE_SYNC_MODE);
-    hiveSyncConfig.hiveUser = conf.getString(FlinkOptions.HIVE_SYNC_USERNAME);
-    hiveSyncConfig.hivePass = conf.getString(FlinkOptions.HIVE_SYNC_PASSWORD);
-    hiveSyncConfig.tableProperties = conf.getString(FlinkOptions.HIVE_SYNC_TABLE_PROPERTIES);
-    hiveSyncConfig.serdeProperties = conf.getString(FlinkOptions.HIVE_SYNC_TABLE_SERDE_PROPERTIES);
-    hiveSyncConfig.jdbcUrl = conf.getString(FlinkOptions.HIVE_SYNC_JDBC_URL);
-    hiveSyncConfig.partitionFields = Arrays.asList(FilePathUtils.extractHivePartitionFields(conf));
-    hiveSyncConfig.partitionValueExtractorClass = conf.getString(FlinkOptions.HIVE_SYNC_PARTITION_EXTRACTOR_CLASS_NAME);
-    hiveSyncConfig.useJdbc = conf.getBoolean(FlinkOptions.HIVE_SYNC_USE_JDBC);
-    hiveSyncConfig.useFileListingFromMetadata = conf.getBoolean(FlinkOptions.METADATA_ENABLED);
-    hiveSyncConfig.ignoreExceptions = conf.getBoolean(FlinkOptions.HIVE_SYNC_IGNORE_EXCEPTIONS);
-    hiveSyncConfig.supportTimestamp = conf.getBoolean(FlinkOptions.HIVE_SYNC_SUPPORT_TIMESTAMP);
-    hiveSyncConfig.autoCreateDatabase = conf.getBoolean(FlinkOptions.HIVE_SYNC_AUTO_CREATE_DB);
-    hiveSyncConfig.decodePartition = conf.getBoolean(FlinkOptions.URL_ENCODE_PARTITIONING);
-    hiveSyncConfig.skipROSuffix = conf.getBoolean(FlinkOptions.HIVE_SYNC_SKIP_RO_SUFFIX);
-    hiveSyncConfig.assumeDatePartitioning = conf.getBoolean(FlinkOptions.HIVE_SYNC_ASSUME_DATE_PARTITION);
-    hiveSyncConfig.withOperationField = conf.getBoolean(FlinkOptions.CHANGELOG_ENABLED);
-    return hiveSyncConfig;
+  public static Properties buildSyncConfig(Configuration conf) {
+    TypedProperties props = new TypedProperties();
+    props.setPropertyIfNonNull(META_SYNC_BASE_PATH.key(), conf.getString(FlinkOptions.PATH));
+    props.setPropertyIfNonNull(META_SYNC_BASE_FILE_FORMAT.key(), conf.getString(FlinkOptions.HIVE_SYNC_FILE_FORMAT));
+    props.setPropertyIfNonNull(HIVE_USE_PRE_APACHE_INPUT_FORMAT.key(), "false");
+    props.setPropertyIfNonNull(META_SYNC_DATABASE_NAME.key(), conf.getString(FlinkOptions.HIVE_SYNC_DB));
+    props.setPropertyIfNonNull(META_SYNC_TABLE_NAME.key(), conf.getString(FlinkOptions.HIVE_SYNC_TABLE));
+    props.setPropertyIfNonNull(HIVE_SYNC_MODE.key(), conf.getString(FlinkOptions.HIVE_SYNC_MODE));
+    props.setPropertyIfNonNull(HIVE_USER.key(), conf.getString(FlinkOptions.HIVE_SYNC_USERNAME));
+    props.setPropertyIfNonNull(HIVE_PASS.key(), conf.getString(FlinkOptions.HIVE_SYNC_PASSWORD));
+    props.setPropertyIfNonNull(HIVE_URL.key(), conf.getString(FlinkOptions.HIVE_SYNC_JDBC_URL));
+    props.setPropertyIfNonNull(METASTORE_URIS.key(), conf.getString(FlinkOptions.HIVE_SYNC_METASTORE_URIS));
+    props.setPropertyIfNonNull(HIVE_TABLE_PROPERTIES.key(), conf.getString(FlinkOptions.HIVE_SYNC_TABLE_PROPERTIES));
+    props.setPropertyIfNonNull(HIVE_TABLE_SERDE_PROPERTIES.key(), conf.getString(FlinkOptions.HIVE_SYNC_TABLE_SERDE_PROPERTIES));
+    props.setPropertyIfNonNull(META_SYNC_PARTITION_FIELDS.key(), String.join(",", FilePathUtils.extractHivePartitionFields(conf)));
+    props.setPropertyIfNonNull(META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), conf.getString(FlinkOptions.HIVE_SYNC_PARTITION_EXTRACTOR_CLASS_NAME));
+    props.setPropertyIfNonNull(HIVE_USE_JDBC.key(), String.valueOf(conf.getBoolean(FlinkOptions.HIVE_SYNC_USE_JDBC)));
+    props.setPropertyIfNonNull(META_SYNC_USE_FILE_LISTING_FROM_METADATA.key(), String.valueOf(conf.getBoolean(FlinkOptions.METADATA_ENABLED)));
+    props.setPropertyIfNonNull(HIVE_IGNORE_EXCEPTIONS.key(), String.valueOf(conf.getBoolean(FlinkOptions.HIVE_SYNC_IGNORE_EXCEPTIONS)));
+    props.setPropertyIfNonNull(HIVE_SUPPORT_TIMESTAMP_TYPE.key(), String.valueOf(conf.getBoolean(FlinkOptions.HIVE_SYNC_SUPPORT_TIMESTAMP)));
+    props.setPropertyIfNonNull(HIVE_AUTO_CREATE_DATABASE.key(), String.valueOf(conf.getBoolean(FlinkOptions.HIVE_SYNC_AUTO_CREATE_DB)));
+    props.setPropertyIfNonNull(META_SYNC_DECODE_PARTITION.key(), String.valueOf(conf.getBoolean(FlinkOptions.URL_ENCODE_PARTITIONING)));
+    props.setPropertyIfNonNull(HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE.key(), String.valueOf(conf.getBoolean(FlinkOptions.HIVE_SYNC_SKIP_RO_SUFFIX)));
+    props.setPropertyIfNonNull(META_SYNC_ASSUME_DATE_PARTITION.key(), String.valueOf(conf.getBoolean(FlinkOptions.HIVE_SYNC_ASSUME_DATE_PARTITION)));
+    return props;
   }
 }

File: TestHiveSyncContext.java

@@ -19,14 +19,15 @@
 package org.apache.hudi.sink.utils;
 
 import org.apache.hudi.configuration.FlinkOptions;
-import org.apache.hudi.hive.HiveSyncConfig;
 
 import org.apache.flink.configuration.Configuration;
 import org.junit.jupiter.api.Test;
 
 import java.lang.reflect.Method;
+import java.util.Properties;
 
-import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS;
+import static org.junit.jupiter.api.Assertions.assertEquals;
 
 /**
  * Test cases for {@link HiveSyncContext}.

@@ -51,11 +52,11 @@ public class TestHiveSyncContext {
     Method buildSyncConfigMethod = threadClazz.getDeclaredMethod("buildSyncConfig", Configuration.class);
     buildSyncConfigMethod.setAccessible(true);
 
-    HiveSyncConfig hiveSyncConfig1 = HiveSyncContext.buildSyncConfig(configuration1);
-    HiveSyncConfig hiveSyncConfig2 = HiveSyncContext.buildSyncConfig(configuration2);
+    Properties props1 = HiveSyncContext.buildSyncConfig(configuration1);
+    Properties props2 = HiveSyncContext.buildSyncConfig(configuration2);
 
-    assertTrue(hiveSyncConfig1.partitionFields.get(0).equals(hiveSyncPartitionField));
-    assertTrue(hiveSyncConfig2.partitionFields.get(0).equals(partitionPathField));
+    assertEquals(hiveSyncPartitionField, props1.getProperty(META_SYNC_PARTITION_FIELDS.key()));
+    assertEquals(partitionPathField, props2.getProperty(META_SYNC_PARTITION_FIELDS.key()));
   }
 }

File: BigQuerySyncConfig.java

@@ -19,113 +19,121 @@
 package org.apache.hudi.gcp.bigquery;
 
-import org.apache.hudi.common.config.TypedProperties;
+import org.apache.hudi.common.config.ConfigProperty;
+import org.apache.hudi.sync.common.HoodieSyncConfig;
 
 import com.beust.jcommander.Parameter;
+import com.beust.jcommander.ParametersDelegate;
 
 import java.io.Serializable;
 import java.util.ArrayList;
-import java.util.Collections;
 import java.util.List;
+import java.util.Properties;
 
 /**
  * Configs needed to sync data into BigQuery.
  */
-public class BigQuerySyncConfig implements Serializable {
+public class BigQuerySyncConfig extends HoodieSyncConfig implements Serializable {
 
-  public static String BIGQUERY_SYNC_PROJECT_ID = "hoodie.gcp.bigquery.sync.project_id";
-  public static String BIGQUERY_SYNC_DATASET_NAME = "hoodie.gcp.bigquery.sync.dataset_name";
-  public static String BIGQUERY_SYNC_DATASET_LOCATION = "hoodie.gcp.bigquery.sync.dataset_location";
-  public static String BIGQUERY_SYNC_TABLE_NAME = "hoodie.gcp.bigquery.sync.table_name";
-  public static String BIGQUERY_SYNC_SOURCE_URI = "hoodie.gcp.bigquery.sync.source_uri";
-  public static String BIGQUERY_SYNC_SOURCE_URI_PREFIX = "hoodie.gcp.bigquery.sync.source_uri_prefix";
-  public static String BIGQUERY_SYNC_SYNC_BASE_PATH = "hoodie.gcp.bigquery.sync.base_path";
-  public static String BIGQUERY_SYNC_PARTITION_FIELDS = "hoodie.gcp.bigquery.sync.partition_fields";
-  public static String BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA = "hoodie.gcp.bigquery.sync.use_file_listing_from_metadata";
-  public static String BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING = "hoodie.gcp.bigquery.sync.assume_date_partitioning";
+  public static final ConfigProperty<String> BIGQUERY_SYNC_PROJECT_ID = ConfigProperty
+      .key("hoodie.gcp.bigquery.sync.project_id")
+      .noDefaultValue()
+      .withDocumentation("Name of the target project in BigQuery");
+
+  public static final ConfigProperty<String> BIGQUERY_SYNC_DATASET_NAME = ConfigProperty
+      .key("hoodie.gcp.bigquery.sync.dataset_name")
+      .noDefaultValue()
+      .withDocumentation("Name of the target dataset in BigQuery");
+
+  public static final ConfigProperty<String> BIGQUERY_SYNC_DATASET_LOCATION = ConfigProperty
+      .key("hoodie.gcp.bigquery.sync.dataset_location")
+      .noDefaultValue()
+      .withDocumentation("Location of the target dataset in BigQuery");
+
+  public static final ConfigProperty<String> BIGQUERY_SYNC_TABLE_NAME = ConfigProperty
+      .key("hoodie.gcp.bigquery.sync.table_name")
+      .noDefaultValue()
+      .withDocumentation("Name of the target table in BigQuery");
+
+  public static final ConfigProperty<String> BIGQUERY_SYNC_SOURCE_URI = ConfigProperty
+      .key("hoodie.gcp.bigquery.sync.source_uri")
+      .noDefaultValue()
+      .withDocumentation("Name of the source uri gcs path of the table");
+
+  public static final ConfigProperty<String> BIGQUERY_SYNC_SOURCE_URI_PREFIX = ConfigProperty
+      .key("hoodie.gcp.bigquery.sync.source_uri_prefix")
+      .noDefaultValue()
+      .withDocumentation("Name of the source uri gcs path prefix of the table");
+
+  public static final ConfigProperty<String> BIGQUERY_SYNC_SYNC_BASE_PATH = ConfigProperty
+      .key("hoodie.gcp.bigquery.sync.base_path")
+      .noDefaultValue()
+      .withDocumentation("Base path of the hoodie table to sync");
+
+  public static final ConfigProperty<String> BIGQUERY_SYNC_PARTITION_FIELDS = ConfigProperty
+      .key("hoodie.gcp.bigquery.sync.partition_fields")
+      .noDefaultValue()
+      .withDocumentation("Comma-delimited partition fields. Default to non-partitioned.");
+
+  public static final ConfigProperty<Boolean> BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA = ConfigProperty
+      .key("hoodie.gcp.bigquery.sync.use_file_listing_from_metadata")
+      .defaultValue(false)
+      .withDocumentation("Fetch file listing from Hudi's metadata");
+
+  public static final ConfigProperty<Boolean> BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING = ConfigProperty
+      .key("hoodie.gcp.bigquery.sync.assume_date_partitioning")
+      .defaultValue(false)
+      .withDocumentation("Assume standard yyyy/mm/dd partitioning, this"
+          + " exists to support backward compatibility. If you use hoodie 0.3.x, do not set this parameter");
+
+  public BigQuerySyncConfig(Properties props) {
+    super(props);
+  }
+
+  public static class BigQuerySyncConfigParams {
+
+    @ParametersDelegate()
+    public final HoodieSyncConfigParams hoodieSyncConfigParams = new HoodieSyncConfigParams();
 
-  @Parameter(names = {"--project-id"}, description = "name of the target project in BigQuery", required = true)
-  public String projectId;
-  @Parameter(names = {"--dataset-name"}, description = "name of the target dataset in BigQuery", required = true)
-  public String datasetName;
-  @Parameter(names = {"--dataset-location"}, description = "location of the target dataset in BigQuery", required = true)
-  public String datasetLocation;
-  @Parameter(names = {"--table-name"}, description = "name of the target table in BigQuery", required = true)
-  public String tableName;
-  @Parameter(names = {"--source-uri"}, description = "name of the source uri gcs path of the table", required = true)
-  public String sourceUri;
-  @Parameter(names = {"--source-uri-prefix"}, description = "name of the source uri gcs path prefix of the table", required = true)
-  public String sourceUriPrefix;
-  @Parameter(names = {"--base-path"}, description = "Base path of the hoodie table to sync", required = true)
-  public String basePath;
-  @Parameter(names = {"--partitioned-by"}, description = "Comma-delimited partition fields. Default to non-partitioned.")
-  public List<String> partitionFields = new ArrayList<>();
-  @Parameter(names = {"--use-file-listing-from-metadata"}, description = "Fetch file listing from Hudi's metadata")
-  public Boolean useFileListingFromMetadata = false;
-  @Parameter(names = {"--assume-date-partitioning"}, description = "Assume standard yyyy/mm/dd partitioning, this"
-      + " exists to support backward compatibility. If you use hoodie 0.3.x, do not set this parameter")
-  public Boolean assumeDatePartitioning = false;
-  @Parameter(names = {"--help", "-h"}, help = true)
-  public Boolean help = false;
+    @Parameter(names = {"--project-id"}, description = "Name of the target project in BigQuery", required = true)
+    public String projectId;
+    @Parameter(names = {"--dataset-name"}, description = "Name of the target dataset in BigQuery", required = true)
+    public String datasetName;
+    @Parameter(names = {"--dataset-location"}, description = "Location of the target dataset in BigQuery", required = true)
+    public String datasetLocation;
+    @Parameter(names = {"--table-name"}, description = "Name of the target table in BigQuery", required = true)
+    public String tableName;
+    @Parameter(names = {"--source-uri"}, description = "Name of the source uri gcs path of the table", required = true)
+    public String sourceUri;
+    @Parameter(names = {"--source-uri-prefix"}, description = "Name of the source uri gcs path prefix of the table", required = true)
+    public String sourceUriPrefix;
+    @Parameter(names = {"--base-path"}, description = "Base path of the hoodie table to sync", required = true)
+    public String basePath;
+    @Parameter(names = {"--partitioned-by"}, description = "Comma-delimited partition fields. Default to non-partitioned.")
+    public List<String> partitionFields = new ArrayList<>();
+    @Parameter(names = {"--use-file-listing-from-metadata"}, description = "Fetch file listing from Hudi's metadata")
+    public boolean useFileListingFromMetadata = false;
+    @Parameter(names = {"--assume-date-partitioning"}, description = "Assume standard yyyy/mm/dd partitioning, this"
+        + " exists to support backward compatibility. If you use hoodie 0.3.x, do not set this parameter")
+    public boolean assumeDatePartitioning = false;
 
-  public static BigQuerySyncConfig copy(BigQuerySyncConfig cfg) {
-    BigQuerySyncConfig newConfig = new BigQuerySyncConfig();
-    newConfig.projectId = cfg.projectId;
-    newConfig.datasetName = cfg.datasetName;
-    newConfig.datasetLocation = cfg.datasetLocation;
-    newConfig.tableName = cfg.tableName;
-    newConfig.sourceUri = cfg.sourceUri;
-    newConfig.sourceUriPrefix = cfg.sourceUriPrefix;
-    newConfig.basePath = cfg.basePath;
-    newConfig.partitionFields = cfg.partitionFields;
-    newConfig.useFileListingFromMetadata = cfg.useFileListingFromMetadata;
-    newConfig.assumeDatePartitioning = cfg.assumeDatePartitioning;
-    newConfig.help = cfg.help;
-    return newConfig;
-  }
+    public boolean isHelp() {
+      return hoodieSyncConfigParams.isHelp();
+    }
 
-  public TypedProperties toProps() {
-    TypedProperties properties = new TypedProperties();
-    properties.put(BIGQUERY_SYNC_PROJECT_ID, projectId);
-    properties.put(BIGQUERY_SYNC_DATASET_NAME, datasetName);
-    properties.put(BIGQUERY_SYNC_DATASET_LOCATION, datasetLocation);
-    properties.put(BIGQUERY_SYNC_TABLE_NAME, tableName);
-    properties.put(BIGQUERY_SYNC_SOURCE_URI, sourceUri);
-    properties.put(BIGQUERY_SYNC_SOURCE_URI_PREFIX, sourceUriPrefix);
-    properties.put(BIGQUERY_SYNC_SYNC_BASE_PATH, basePath);
-    properties.put(BIGQUERY_SYNC_PARTITION_FIELDS, String.join(",", partitionFields));
-    properties.put(BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA, useFileListingFromMetadata);
-    properties.put(BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING, assumeDatePartitioning);
-    return properties;
-  }
+    public Properties toProps() {
+      final Properties props = hoodieSyncConfigParams.toProps();
+      props.setProperty(BIGQUERY_SYNC_PROJECT_ID.key(), projectId);
+      props.setProperty(BIGQUERY_SYNC_DATASET_NAME.key(), datasetName);
+      props.setProperty(BIGQUERY_SYNC_DATASET_LOCATION.key(), datasetLocation);
+      props.setProperty(BIGQUERY_SYNC_TABLE_NAME.key(), tableName);
+      props.setProperty(BIGQUERY_SYNC_SOURCE_URI.key(), sourceUri);
+      props.setProperty(BIGQUERY_SYNC_SOURCE_URI_PREFIX.key(), sourceUriPrefix);
+      props.setProperty(BIGQUERY_SYNC_SYNC_BASE_PATH.key(), basePath);
+      props.setProperty(BIGQUERY_SYNC_PARTITION_FIELDS.key(), String.join(",", partitionFields));
+      props.setProperty(BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA.key(), String.valueOf(useFileListingFromMetadata));
+      props.setProperty(BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING.key(), String.valueOf(assumeDatePartitioning));
+      return props;
+    }
+  }
 
-  public static BigQuerySyncConfig fromProps(TypedProperties props) {
-    BigQuerySyncConfig config = new BigQuerySyncConfig();
-    config.projectId = props.getString(BIGQUERY_SYNC_PROJECT_ID);
-    config.datasetName = props.getString(BIGQUERY_SYNC_DATASET_NAME);
-    config.datasetLocation = props.getString(BIGQUERY_SYNC_DATASET_LOCATION);
-    config.tableName = props.getString(BIGQUERY_SYNC_TABLE_NAME);
-    config.sourceUri = props.getString(BIGQUERY_SYNC_SOURCE_URI);
-    config.sourceUriPrefix = props.getString(BIGQUERY_SYNC_SOURCE_URI_PREFIX);
-    config.basePath = props.getString(BIGQUERY_SYNC_SYNC_BASE_PATH);
-    config.partitionFields = props.getStringList(BIGQUERY_SYNC_PARTITION_FIELDS, ",", Collections.emptyList());
-    config.useFileListingFromMetadata = props.getBoolean(BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA, false);
-    config.assumeDatePartitioning = props.getBoolean(BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING, false);
-    return config;
-  }
-
-  @Override
-  public String toString() {
-    return "BigQuerySyncConfig{projectId='" + projectId
-        + "', datasetName='" + datasetName
-        + "', datasetLocation='" + datasetLocation
-        + "', tableName='" + tableName
-        + "', sourceUri='" + sourceUri
-        + "', sourceUriPrefix='" + sourceUriPrefix
-        + "', basePath='" + basePath + "'"
-        + ", partitionFields=" + partitionFields
-        + "', useFileListingFromMetadata='" + useFileListingFromMetadata
-        + "', assumeDataPartitioning='" + assumeDatePartitioning
-        + "', help=" + help + "}";
-  }
 }
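
With BigQuerySyncConfig now extending HoodieSyncConfig, the hand-written copy/fromProps/toString plumbing removed above becomes unnecessary: values are read back through the inherited HoodieConfig getters, and declared defaults should apply when a key is absent. A small sketch (the project id is a placeholder):

    Properties props = new Properties();
    props.setProperty(BigQuerySyncConfig.BIGQUERY_SYNC_PROJECT_ID.key(), "my-gcp-project");
    BigQuerySyncConfig config = new BigQuerySyncConfig(props);
    config.getString(BigQuerySyncConfig.BIGQUERY_SYNC_PROJECT_ID);                      // "my-gcp-project"
    config.getBoolean(BigQuerySyncConfig.BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA); // false, the declared default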

File: BigQuerySyncTool.java

@@ -19,19 +19,26 @@
 package org.apache.hudi.gcp.bigquery;
 
-import org.apache.hudi.common.config.TypedProperties;
-import org.apache.hudi.common.fs.FSUtils;
 import org.apache.hudi.common.model.HoodieTableType;
 import org.apache.hudi.common.util.ValidationUtils;
-import org.apache.hudi.sync.common.AbstractSyncTool;
+import org.apache.hudi.sync.common.HoodieSyncTool;
 import org.apache.hudi.sync.common.util.ManifestFileWriter;
 
 import com.beust.jcommander.JCommander;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
 
+import java.util.Properties;
+
+import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING;
+import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_DATASET_NAME;
+import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_PARTITION_FIELDS;
+import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_SOURCE_URI;
+import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_SOURCE_URI_PREFIX;
+import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_SYNC_BASE_PATH;
+import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_TABLE_NAME;
+import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA;
 
 /**
  * Tool to sync a hoodie table with a big query table. Either use it as an api
  * BigQuerySyncTool.syncHoodieTable(BigQuerySyncConfig) or as a command line java -cp hoodie-hive.jar BigQuerySyncTool [args]

@@ -40,26 +47,28 @@
  *
  * @Experimental
  */
-public class BigQuerySyncTool extends AbstractSyncTool {
+public class BigQuerySyncTool extends HoodieSyncTool {
 
   private static final Logger LOG = LogManager.getLogger(BigQuerySyncTool.class);
 
-  public final BigQuerySyncConfig cfg;
+  public final BigQuerySyncConfig config;
+  public final String tableName;
   public final String manifestTableName;
   public final String versionsTableName;
   public final String snapshotViewName;
 
-  public BigQuerySyncTool(TypedProperties properties, Configuration conf, FileSystem fs) {
-    super(properties, conf, fs);
-    cfg = BigQuerySyncConfig.fromProps(properties);
-    manifestTableName = cfg.tableName + "_manifest";
-    versionsTableName = cfg.tableName + "_versions";
-    snapshotViewName = cfg.tableName;
+  public BigQuerySyncTool(Properties props) {
+    super(props);
+    this.config = new BigQuerySyncConfig(props);
+    this.tableName = config.getString(BIGQUERY_SYNC_TABLE_NAME);
+    this.manifestTableName = tableName + "_manifest";
+    this.versionsTableName = tableName + "_versions";
+    this.snapshotViewName = tableName;
   }
 
   @Override
   public void syncHoodieTable() {
-    try (HoodieBigQuerySyncClient bqSyncClient = new HoodieBigQuerySyncClient(BigQuerySyncConfig.fromProps(props), fs)) {
+    try (HoodieBigQuerySyncClient bqSyncClient = new HoodieBigQuerySyncClient(config)) {
       switch (bqSyncClient.getTableType()) {
         case COPY_ON_WRITE:
           syncCoWTable(bqSyncClient);

@@ -69,7 +78,7 @@ public class BigQuerySyncTool extends AbstractSyncTool {
           throw new UnsupportedOperationException(bqSyncClient.getTableType() + " table type is not supported yet.");
       }
     } catch (Exception e) {
-      throw new HoodieBigQuerySyncException("Got runtime exception when big query syncing " + cfg.tableName, e);
+      throw new HoodieBigQuerySyncException("Failed to sync BigQuery for table:" + tableName, e);
     }
   }

@@ -78,14 +87,14 @@ public class BigQuerySyncTool extends AbstractSyncTool {
     LOG.info("Sync hoodie table " + snapshotViewName + " at base path " + bqSyncClient.getBasePath());
 
     if (!bqSyncClient.datasetExists()) {
-      throw new HoodieBigQuerySyncException("Dataset not found: " + cfg);
+      throw new HoodieBigQuerySyncException("Dataset not found: " + config.getString(BIGQUERY_SYNC_DATASET_NAME));
     }
 
     ManifestFileWriter manifestFileWriter = ManifestFileWriter.builder()
-        .setConf(conf)
-        .setBasePath(cfg.basePath)
-        .setUseFileListingFromMetadata(cfg.useFileListingFromMetadata)
-        .setAssumeDatePartitioning(cfg.assumeDatePartitioning)
+        .setConf(config.getHadoopConf())
+        .setBasePath(config.getString(BIGQUERY_SYNC_SYNC_BASE_PATH))
+        .setUseFileListingFromMetadata(config.getBoolean(BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA))
+        .setAssumeDatePartitioning(config.getBoolean(BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING))
         .build();
     manifestFileWriter.writeManifestFile();

@@ -94,7 +103,11 @@ public class BigQuerySyncTool extends AbstractSyncTool {
       LOG.info("Manifest table creation complete for " + manifestTableName);
     }
     if (!bqSyncClient.tableExists(versionsTableName)) {
-      bqSyncClient.createVersionsTable(versionsTableName, cfg.sourceUri, cfg.sourceUriPrefix, cfg.partitionFields);
+      bqSyncClient.createVersionsTable(
+          versionsTableName,
+          config.getString(BIGQUERY_SYNC_SOURCE_URI),
+          config.getString(BIGQUERY_SYNC_SOURCE_URI_PREFIX),
+          config.getSplitStrings(BIGQUERY_SYNC_PARTITION_FIELDS));
       LOG.info("Versions table creation complete for " + versionsTableName);
     }
     if (!bqSyncClient.tableExists(snapshotViewName)) {

@@ -107,13 +120,13 @@ public class BigQuerySyncTool extends AbstractSyncTool {
   }
 
   public static void main(String[] args) {
-    BigQuerySyncConfig cfg = new BigQuerySyncConfig();
-    JCommander cmd = new JCommander(cfg, null, args);
-    if (cfg.help || args.length == 0) {
+    final BigQuerySyncConfig.BigQuerySyncConfigParams params = new BigQuerySyncConfig.BigQuerySyncConfigParams();
+    JCommander cmd = JCommander.newBuilder().addObject(params).build();
+    cmd.parse(args);
+    if (params.isHelp()) {
       cmd.usage();
-      System.exit(1);
+      System.exit(0);
    }
-    FileSystem fs = FSUtils.getFs(cfg.basePath, new Configuration());
-    new BigQuerySyncTool(cfg.toProps(), fs.getConf(), fs).syncHoodieTable();
+    new BigQuerySyncTool(params.toProps()).syncHoodieTable();
   }
 }
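
The CLI flags themselves are unchanged by this refactor; only the parsing target moved into BigQuerySyncConfigParams. A hypothetical invocation, with the bundle jar name and all values as placeholders (the required flags come from the @Parameter declarations above):

    java -cp hudi-gcp-bundle.jar org.apache.hudi.gcp.bigquery.BigQuerySyncTool \
      --project-id my-gcp-project --dataset-name my_dataset --dataset-location us-west1 \
      --table-name stock_ticks --base-path gs://my-bucket/stock_ticks \
      --source-uri "gs://my-bucket/stock_ticks/dt=*" --source-uri-prefix gs://my-bucket/stock_ticks/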

File: HoodieBigQuerySyncClient.java

@@ -19,8 +19,7 @@
 package org.apache.hudi.gcp.bigquery;
 
-import org.apache.hudi.common.util.Option;
-import org.apache.hudi.sync.common.AbstractSyncHoodieClient;
+import org.apache.hudi.sync.common.HoodieSyncClient;
 
 import com.google.cloud.bigquery.BigQuery;
 import com.google.cloud.bigquery.BigQueryException;

@@ -38,25 +37,31 @@ import com.google.cloud.bigquery.Table;
 import com.google.cloud.bigquery.TableId;
 import com.google.cloud.bigquery.TableInfo;
 import com.google.cloud.bigquery.ViewDefinition;
-import org.apache.hadoop.fs.FileSystem;
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
-import org.apache.parquet.schema.MessageType;
 
 import java.util.Collections;
 import java.util.List;
 import java.util.Map;
 
+import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_DATASET_LOCATION;
+import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_DATASET_NAME;
+import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_PROJECT_ID;
+
-public class HoodieBigQuerySyncClient extends AbstractSyncHoodieClient {
+public class HoodieBigQuerySyncClient extends HoodieSyncClient {
 
   private static final Logger LOG = LogManager.getLogger(HoodieBigQuerySyncClient.class);
 
-  private final BigQuerySyncConfig syncConfig;
+  protected final BigQuerySyncConfig config;
+  private final String projectId;
+  private final String datasetName;
   private transient BigQuery bigquery;
 
-  public HoodieBigQuerySyncClient(final BigQuerySyncConfig syncConfig, final FileSystem fs) {
-    super(syncConfig.basePath, syncConfig.assumeDatePartitioning, syncConfig.useFileListingFromMetadata,
-        false, fs);
-    this.syncConfig = syncConfig;
+  public HoodieBigQuerySyncClient(final BigQuerySyncConfig config) {
+    super(config);
+    this.config = config;
+    this.projectId = config.getString(BIGQUERY_SYNC_PROJECT_ID);
+    this.datasetName = config.getString(BIGQUERY_SYNC_DATASET_NAME);
     this.createBigQueryConnection();
   }

@@ -65,7 +70,7 @@ public class HoodieBigQuerySyncClient extends AbstractSyncHoodieClient {
     try {
       // Initialize client that will be used to send requests. This client only needs to be created
       // once, and can be reused for multiple requests.
-      bigquery = BigQueryOptions.newBuilder().setLocation(syncConfig.datasetLocation).build().getService();
+      bigquery = BigQueryOptions.newBuilder().setLocation(config.getString(BIGQUERY_SYNC_DATASET_LOCATION)).build().getService();
       LOG.info("Successfully established BigQuery connection.");
     } catch (BigQueryException e) {
       throw new HoodieBigQuerySyncException("Cannot create bigQuery connection ", e);

@@ -73,16 +78,9 @@ public class HoodieBigQuerySyncClient extends AbstractSyncHoodieClient {
     }
   }
 
-  @Override
-  public void createTable(final String tableName, final MessageType storageSchema, final String inputFormatClass,
-                          final String outputFormatClass, final String serdeClass,
-                          final Map<String, String> serdeProperties, final Map<String, String> tableProperties) {
-    // bigQuery create table arguments are different, so do nothing.
-  }
-
   public void createManifestTable(String tableName, String sourceUri) {
     try {
-      TableId tableId = TableId.of(syncConfig.projectId, syncConfig.datasetName, tableName);
+      TableId tableId = TableId.of(projectId, datasetName, tableName);
       CsvOptions csvOptions = CsvOptions.newBuilder()
.setFieldDelimiter(",") .setFieldDelimiter(",")
.setAllowJaggedRows(false) .setAllowJaggedRows(false)
@@ -108,7 +106,7 @@ public class HoodieBigQuerySyncClient extends AbstractSyncHoodieClient {
public void createVersionsTable(String tableName, String sourceUri, String sourceUriPrefix, List<String> partitionFields) { public void createVersionsTable(String tableName, String sourceUri, String sourceUriPrefix, List<String> partitionFields) {
try { try {
ExternalTableDefinition customTable; ExternalTableDefinition customTable;
TableId tableId = TableId.of(syncConfig.projectId, syncConfig.datasetName, tableName); TableId tableId = TableId.of(projectId, datasetName, tableName);
if (partitionFields.isEmpty()) { if (partitionFields.isEmpty()) {
customTable = customTable =
@@ -143,16 +141,16 @@ public class HoodieBigQuerySyncClient extends AbstractSyncHoodieClient {
public void createSnapshotView(String viewName, String versionsTableName, String manifestTableName) { public void createSnapshotView(String viewName, String versionsTableName, String manifestTableName) {
try { try {
TableId tableId = TableId.of(syncConfig.projectId, syncConfig.datasetName, viewName); TableId tableId = TableId.of(projectId, datasetName, viewName);
String query = String query =
String.format( String.format(
"SELECT * FROM `%s.%s.%s` WHERE _hoodie_file_name IN " "SELECT * FROM `%s.%s.%s` WHERE _hoodie_file_name IN "
+ "(SELECT filename FROM `%s.%s.%s`)", + "(SELECT filename FROM `%s.%s.%s`)",
syncConfig.projectId, projectId,
syncConfig.datasetName, datasetName,
versionsTableName, versionsTableName,
syncConfig.projectId, projectId,
syncConfig.datasetName, datasetName,
manifestTableName); manifestTableName);
ViewDefinition viewDefinition = ViewDefinition viewDefinition =
@@ -166,78 +164,25 @@ public class HoodieBigQuerySyncClient extends AbstractSyncHoodieClient {
} }
@Override @Override
public Map<String, String> getTableSchema(String tableName) { public Map<String, String> getMetastoreSchema(String tableName) {
// TODO: Implement automatic schema evolution when you add a new column. // TODO: Implement automatic schema evolution when you add a new column.
return Collections.emptyMap(); return Collections.emptyMap();
} }
@Override
public void addPartitionsToTable(final String tableName, final List<String> partitionsToAdd) {
// bigQuery discovers the new partitions automatically, so do nothing.
throw new UnsupportedOperationException("No support for addPartitionsToTable yet.");
}
public boolean datasetExists() { public boolean datasetExists() {
Dataset dataset = bigquery.getDataset(DatasetId.of(syncConfig.projectId, syncConfig.datasetName)); Dataset dataset = bigquery.getDataset(DatasetId.of(projectId, datasetName));
return dataset != null; return dataset != null;
} }
@Override
public boolean doesTableExist(final String tableName) {
return tableExists(tableName);
}
@Override @Override
public boolean tableExists(String tableName) { public boolean tableExists(String tableName) {
TableId tableId = TableId.of(syncConfig.projectId, syncConfig.datasetName, tableName); TableId tableId = TableId.of(projectId, datasetName, tableName);
Table table = bigquery.getTable(tableId, BigQuery.TableOption.fields()); Table table = bigquery.getTable(tableId, BigQuery.TableOption.fields());
return table != null && table.exists(); return table != null && table.exists();
} }
@Override
public Option<String> getLastCommitTimeSynced(final String tableName) {
// bigQuery doesn't support tblproperties, so do nothing.
throw new UnsupportedOperationException("Not support getLastCommitTimeSynced yet.");
}
@Override
public void updateLastCommitTimeSynced(final String tableName) {
// bigQuery doesn't support tblproperties, so do nothing.
throw new UnsupportedOperationException("No support for updateLastCommitTimeSynced yet.");
}
@Override
public Option<String> getLastReplicatedTime(String tableName) {
// bigQuery doesn't support tblproperties, so do nothing.
throw new UnsupportedOperationException("Not support getLastReplicatedTime yet.");
}
@Override
public void updateLastReplicatedTimeStamp(String tableName, String timeStamp) {
// bigQuery doesn't support tblproperties, so do nothing.
throw new UnsupportedOperationException("No support for updateLastReplicatedTimeStamp yet.");
}
@Override
public void deleteLastReplicatedTimeStamp(String tableName) {
// bigQuery doesn't support tblproperties, so do nothing.
throw new UnsupportedOperationException("No support for deleteLastReplicatedTimeStamp yet.");
}
@Override
public void updatePartitionsToTable(final String tableName, final List<String> changedPartitions) {
// bigQuery updates the partitions automatically, so do nothing.
throw new UnsupportedOperationException("No support for updatePartitionsToTable yet.");
}
@Override
public void dropPartitions(String tableName, List<String> partitionsToDrop) {
// bigQuery discovers the new partitions automatically, so do nothing.
throw new UnsupportedOperationException("No support for dropPartitions yet.");
}
@Override @Override
public void close() { public void close() {
// bigQuery has no connection close method, so do nothing. bigquery = null;
} }
} }

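Note on usage: the client now carries only the typed config instead of a config plus FileSystem. A minimal sketch of constructing it directly, assuming the Properties-based BigQuerySyncConfig constructor exercised in the test below; values are placeholders, and the HoodieSyncClient parent may require additional keys (e.g. the sync base path).

import java.util.Properties;

import org.apache.hudi.gcp.bigquery.BigQuerySyncConfig;
import org.apache.hudi.gcp.bigquery.HoodieBigQuerySyncClient;

public class BigQueryClientExample {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty(BigQuerySyncConfig.BIGQUERY_SYNC_PROJECT_ID.key(), "my-project");   // placeholder
    props.setProperty(BigQuerySyncConfig.BIGQUERY_SYNC_DATASET_NAME.key(), "my_dataset"); // placeholder
    props.setProperty(BigQuerySyncConfig.BIGQUERY_SYNC_DATASET_LOCATION.key(), "US");
    HoodieBigQuerySyncClient client = new HoodieBigQuerySyncClient(new BigQuerySyncConfig(props));
    try {
      System.out.println("dataset exists: " + client.datasetExists());
    } finally {
      client.close(); // drops the cached BigQuery handle, per the new close() above
    }
  }
}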

@@ -19,12 +19,11 @@
package org.apache.hudi.gcp.bigquery; package org.apache.hudi.gcp.bigquery;
import org.apache.hudi.common.config.TypedProperties;
import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import java.util.Arrays; import java.util.Arrays;
import java.util.Properties;
import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING; import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING;
import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_DATASET_LOCATION; import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_DATASET_LOCATION;
@@ -44,75 +43,32 @@ public class TestBigQuerySyncConfig {
@BeforeEach @BeforeEach
void setUp() { void setUp() {
syncConfig = new BigQuerySyncConfig(); Properties props = new Properties();
syncConfig.projectId = "fooproject"; props.setProperty(BIGQUERY_SYNC_PROJECT_ID.key(), "fooproject");
syncConfig.datasetName = "foodataset"; props.setProperty(BIGQUERY_SYNC_DATASET_NAME.key(), "foodataset");
syncConfig.datasetLocation = "US"; props.setProperty(BIGQUERY_SYNC_DATASET_LOCATION.key(), "US");
syncConfig.tableName = "footable"; props.setProperty(BIGQUERY_SYNC_TABLE_NAME.key(), "footable");
syncConfig.sourceUri = "gs://test-bucket/dwh/table_name/dt=*"; props.setProperty(BIGQUERY_SYNC_SOURCE_URI.key(), "gs://test-bucket/dwh/table_name/dt=*");
syncConfig.sourceUriPrefix = "gs://test-bucket/dwh/table_name/"; props.setProperty(BIGQUERY_SYNC_SOURCE_URI_PREFIX.key(), "gs://test-bucket/dwh/table_name/");
syncConfig.basePath = "gs://test-bucket/dwh/table_name"; props.setProperty(BIGQUERY_SYNC_SYNC_BASE_PATH.key(), "gs://test-bucket/dwh/table_name");
syncConfig.partitionFields = Arrays.asList("a", "b"); props.setProperty(BIGQUERY_SYNC_PARTITION_FIELDS.key(), "a,b");
syncConfig.useFileListingFromMetadata = true; props.setProperty(BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA.key(), "true");
syncConfig.assumeDatePartitioning = true; props.setProperty(BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING.key(), "true");
syncConfig.help = true; syncConfig = new BigQuerySyncConfig(props);
} }
@Test @Test
public void testCopy() { public void testGetConfigs() {
BigQuerySyncConfig copied = BigQuerySyncConfig.copy(syncConfig); assertEquals("fooproject", syncConfig.getString(BIGQUERY_SYNC_PROJECT_ID));
assertEquals(copied.partitionFields, syncConfig.partitionFields); assertEquals("foodataset", syncConfig.getString(BIGQUERY_SYNC_DATASET_NAME));
assertEquals(copied.basePath, syncConfig.basePath); assertEquals("US", syncConfig.getString(BIGQUERY_SYNC_DATASET_LOCATION));
assertEquals(copied.projectId, syncConfig.projectId); assertEquals("footable", syncConfig.getString(BIGQUERY_SYNC_TABLE_NAME));
assertEquals(copied.datasetName, syncConfig.datasetName); assertEquals("gs://test-bucket/dwh/table_name/dt=*", syncConfig.getString(BIGQUERY_SYNC_SOURCE_URI));
assertEquals(copied.datasetLocation, syncConfig.datasetLocation); assertEquals("gs://test-bucket/dwh/table_name/", syncConfig.getString(BIGQUERY_SYNC_SOURCE_URI_PREFIX));
assertEquals(copied.tableName, syncConfig.tableName); assertEquals("gs://test-bucket/dwh/table_name", syncConfig.getString(BIGQUERY_SYNC_SYNC_BASE_PATH));
assertEquals(copied.sourceUri, syncConfig.sourceUri); assertEquals(Arrays.asList("a", "b"), syncConfig.getSplitStrings(BIGQUERY_SYNC_PARTITION_FIELDS));
assertEquals(copied.sourceUriPrefix, syncConfig.sourceUriPrefix); assertEquals(true, syncConfig.getBoolean(BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA));
assertEquals(copied.useFileListingFromMetadata, syncConfig.useFileListingFromMetadata); assertEquals(true, syncConfig.getBoolean(BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING));
assertEquals(copied.assumeDatePartitioning, syncConfig.assumeDatePartitioning);
assertEquals(copied.help, syncConfig.help);
} }
@Test
public void testToProps() {
TypedProperties props = syncConfig.toProps();
assertEquals("fooproject", props.getString(BIGQUERY_SYNC_PROJECT_ID));
assertEquals("foodataset", props.getString(BIGQUERY_SYNC_DATASET_NAME));
assertEquals("US", props.getString(BIGQUERY_SYNC_DATASET_LOCATION));
assertEquals("footable", props.getString(BIGQUERY_SYNC_TABLE_NAME));
assertEquals("gs://test-bucket/dwh/table_name/dt=*", props.getString(BIGQUERY_SYNC_SOURCE_URI));
assertEquals("gs://test-bucket/dwh/table_name/", props.getString(BIGQUERY_SYNC_SOURCE_URI_PREFIX));
assertEquals("gs://test-bucket/dwh/table_name", props.getString(BIGQUERY_SYNC_SYNC_BASE_PATH));
assertEquals("a,b", props.getString(BIGQUERY_SYNC_PARTITION_FIELDS));
assertEquals("true", props.getString(BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA));
assertEquals("true", props.getString(BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING));
}
@Test
public void fromProps() {
TypedProperties props = new TypedProperties();
props.put(BIGQUERY_SYNC_PROJECT_ID, "fooproject");
props.put(BIGQUERY_SYNC_DATASET_NAME, "foodataset");
props.put(BIGQUERY_SYNC_DATASET_LOCATION, "US");
props.put(BIGQUERY_SYNC_TABLE_NAME, "footable");
props.put(BIGQUERY_SYNC_SOURCE_URI, "gs://test-bucket/dwh/table_name/dt=*");
props.put(BIGQUERY_SYNC_SOURCE_URI_PREFIX, "gs://test-bucket/dwh/table_name/");
props.put(BIGQUERY_SYNC_SYNC_BASE_PATH, "gs://test-bucket/dwh/table_name");
props.put(BIGQUERY_SYNC_PARTITION_FIELDS, "a,b");
props.put(BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA, true);
props.put(BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING, true);
BigQuerySyncConfig cfg = BigQuerySyncConfig.fromProps(props);
assertEquals(syncConfig.projectId, cfg.projectId);
assertEquals(syncConfig.datasetName, cfg.datasetName);
assertEquals(syncConfig.datasetLocation, cfg.datasetLocation);
assertEquals(syncConfig.tableName, cfg.tableName);
assertEquals(syncConfig.sourceUri, cfg.sourceUri);
assertEquals(syncConfig.sourceUriPrefix, cfg.sourceUriPrefix);
assertEquals(syncConfig.basePath, cfg.basePath);
assertEquals(syncConfig.partitionFields, cfg.partitionFields);
assertEquals(syncConfig.useFileListingFromMetadata, cfg.useFileListingFromMetadata);
assertEquals(syncConfig.assumeDatePartitioning, cfg.assumeDatePartitioning);
}
} }


@@ -32,6 +32,10 @@ import java.sql.ResultSet;
import java.sql.SQLException; import java.sql.SQLException;
import java.sql.Statement; import java.sql.Statement;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_PASS;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_URL;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USER;
/** /**
* A hive query node in the DAG of operations for a workflow. used to perform a hive query with given config. * A hive query node in the DAG of operations for a workflow. used to perform a hive query with given config.
*/ */
@@ -57,8 +61,8 @@ public class HiveQueryNode extends DagNode<Boolean> {
.getDeltaSyncService().getDeltaSync().getCfg().baseFileFormat); .getDeltaSyncService().getDeltaSync().getCfg().baseFileFormat);
HiveSyncConfig hiveSyncConfig = new HiveSyncConfig(properties); HiveSyncConfig hiveSyncConfig = new HiveSyncConfig(properties);
this.hiveServiceProvider.syncToLocalHiveIfNeeded(executionContext.getHoodieTestSuiteWriter()); this.hiveServiceProvider.syncToLocalHiveIfNeeded(executionContext.getHoodieTestSuiteWriter());
Connection con = DriverManager.getConnection(hiveSyncConfig.jdbcUrl, hiveSyncConfig.hiveUser, Connection con = DriverManager.getConnection(hiveSyncConfig.getString(HIVE_URL),
hiveSyncConfig.hivePass); hiveSyncConfig.getString(HIVE_USER), hiveSyncConfig.getString(HIVE_PASS));
Statement stmt = con.createStatement(); Statement stmt = con.createStatement();
stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat"); stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat");
for (String hiveProperty : this.config.getHiveProperties()) { for (String hiveProperty : this.config.getHiveProperties()) {

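Note on usage: connection settings are now read through typed accessors rather than public fields. A minimal standalone sketch of the same pattern, assuming a reachable HiveServer2; the URL and credentials below are placeholders.

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Statement;

import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.hive.HiveSyncConfig;

import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_PASS;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_URL;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USER;

public class HiveQueryExample {
  public static void main(String[] args) throws SQLException {
    TypedProperties props = new TypedProperties();
    props.setProperty(HIVE_URL.key(), "jdbc:hive2://localhost:10000"); // placeholder endpoint
    props.setProperty(HIVE_USER.key(), "hive");                        // placeholder credentials
    props.setProperty(HIVE_PASS.key(), "hive");
    HiveSyncConfig syncConfig = new HiveSyncConfig(props);
    // Same accessor pattern as HiveQueryNode above: typed getters instead of public fields.
    try (Connection con = DriverManager.getConnection(
            syncConfig.getString(HIVE_URL),
            syncConfig.getString(HIVE_USER),
            syncConfig.getString(HIVE_PASS));
         Statement stmt = con.createStatement()) {
      stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat");
    }
  }
}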

@@ -18,16 +18,16 @@
package org.apache.hudi.integ.testsuite.helpers; package org.apache.hudi.integ.testsuite.helpers;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hive.service.server.HiveServer2;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.hive.HiveSyncTool; import org.apache.hudi.hive.HiveSyncTool;
import org.apache.hudi.hive.testutils.HiveTestService; import org.apache.hudi.hive.testutils.HiveTestService;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter; import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config; import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hadoop.conf.Configuration;
import org.apache.hive.service.server.HiveServer2;
import java.io.IOException;
/** /**
* Hive Service provider. * Hive Service provider.
*/ */
@@ -52,12 +52,10 @@ public class HiveServiceProvider {
HiveSyncTool hiveSyncTool; HiveSyncTool hiveSyncTool;
if (this.config.isHiveLocal()) { if (this.config.isHiveLocal()) {
hiveSyncTool = new HiveSyncTool(writer.getWriteConfig().getProps(), hiveSyncTool = new HiveSyncTool(writer.getWriteConfig().getProps(),
getLocalHiveServer().getHiveConf(), getLocalHiveServer().getHiveConf());
FSUtils.getFs(writer.getWriteConfig().getBasePath(), getLocalHiveServer().getHiveConf()));
} else { } else {
hiveSyncTool = new HiveSyncTool(writer.getWriteConfig().getProps(), hiveSyncTool = new HiveSyncTool(writer.getWriteConfig().getProps(),
getLocalHiveServer().getHiveConf(), writer.getConfiguration());
FSUtils.getFs(writer.getWriteConfig().getBasePath(), writer.getConfiguration()));
} }
hiveSyncTool.syncHoodieTable(); hiveSyncTool.syncHoodieTable();
} }

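Note on usage: the HiveSyncTool constructor drops the FileSystem argument and takes just properties plus a Hadoop configuration (the diff passes a HiveConf, which extends it). A minimal standalone sketch under that assumption; the base path and table names are placeholders.

import org.apache.hadoop.conf.Configuration;

import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.hive.HiveSyncTool;
import org.apache.hudi.sync.common.HoodieSyncConfig;

public class HiveSyncExample {
  public static void main(String[] args) {
    TypedProperties props = new TypedProperties();
    props.setProperty(HoodieSyncConfig.META_SYNC_BASE_PATH.key(), "hdfs:///tmp/hoodie/table1"); // placeholder
    props.setProperty(HoodieSyncConfig.META_SYNC_DATABASE_NAME.key(), "default");
    props.setProperty(HoodieSyncConfig.META_SYNC_TABLE_NAME.key(), "table1");                   // placeholder
    // The FileSystem previously passed in is now resolved from the base path and this conf.
    new HiveSyncTool(props, new Configuration()).syncHoodieTable();
  }
}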

@@ -22,7 +22,6 @@ import org.apache.hudi.DataSourceWriteOptions;
import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteJob; import org.apache.hudi.integ.testsuite.HoodieTestSuiteJob;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteJob.HoodieTestSuiteConfig; import org.apache.hudi.integ.testsuite.HoodieTestSuiteJob.HoodieTestSuiteConfig;
@@ -55,6 +54,10 @@ import org.junit.jupiter.params.provider.MethodSource;
import java.util.UUID; import java.util.UUID;
import java.util.stream.Stream; import java.util.stream.Stream;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_URL;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME;
import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertEquals;
/** /**
@@ -177,10 +180,10 @@ public class TestHoodieTestSuiteJob extends UtilitiesTestBase {
// Make path selection test suite specific // Make path selection test suite specific
props.setProperty("hoodie.deltastreamer.source.input.selector", DFSTestSuitePathSelector.class.getName()); props.setProperty("hoodie.deltastreamer.source.input.selector", DFSTestSuitePathSelector.class.getName());
// Hive Configs // Hive Configs
props.setProperty(HiveSyncConfig.HIVE_URL.key(), "jdbc:hive2://127.0.0.1:9999/"); props.setProperty(HIVE_URL.key(), "jdbc:hive2://127.0.0.1:9999/");
props.setProperty(HiveSyncConfig.META_SYNC_DATABASE_NAME.key(), "testdb1"); props.setProperty(META_SYNC_DATABASE_NAME.key(), "testdb1");
props.setProperty(HiveSyncConfig.META_SYNC_TABLE_NAME.key(), "table1"); props.setProperty(META_SYNC_TABLE_NAME.key(), "table1");
props.setProperty(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "datestr"); props.setProperty(META_SYNC_PARTITION_FIELDS.key(), "datestr");
props.setProperty(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key(), TimestampBasedKeyGenerator.class.getName()); props.setProperty(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key(), TimestampBasedKeyGenerator.class.getName());
props.setProperty("hoodie.write.lock.provider", "org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider"); props.setProperty("hoodie.write.lock.provider", "org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider");


@@ -18,9 +18,6 @@
package org.apache.hudi; package org.apache.hudi;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.client.HoodieReadClient; import org.apache.hudi.client.HoodieReadClient;
import org.apache.hudi.client.HoodieWriteResult; import org.apache.hudi.client.HoodieWriteResult;
import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.SparkRDDWriteClient;
@@ -36,30 +33,27 @@ import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.common.util.TablePathUtils; import org.apache.hudi.common.util.TablePathUtils;
import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieCompactionConfig;
import org.apache.hudi.config.HoodieIndexConfig;
import org.apache.hudi.config.HoodiePayloadConfig; import org.apache.hudi.config.HoodiePayloadConfig;
import org.apache.hudi.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieStorageConfig;
import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieNotSupportedException; import org.apache.hudi.exception.HoodieNotSupportedException;
import org.apache.hudi.exception.TableNotFoundException; import org.apache.hudi.exception.TableNotFoundException;
import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor;
import org.apache.hudi.sync.common.HoodieSyncConfig;
import org.apache.hudi.table.BulkInsertPartitioner; import org.apache.hudi.table.BulkInsertPartitioner;
import org.apache.hudi.util.DataTypeUtils; import org.apache.hudi.util.DataTypeUtils;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager; import org.apache.log4j.LogManager;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row; import org.apache.spark.sql.Row;
import org.apache.spark.sql.hive.HiveExternalCatalog;
import org.apache.spark.sql.types.StructType; import org.apache.spark.sql.types.StructType;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
@@ -270,63 +264,6 @@ public class DataSourceUtils {
return dropDuplicates(jssc, incomingHoodieRecords, writeConfig); return dropDuplicates(jssc, incomingHoodieRecords, writeConfig);
} }
/**
* @deprecated Use {@link HiveSyncConfig} constructor directly and provide the props,
* and set {@link HoodieSyncConfig#META_SYNC_BASE_PATH} and {@link HoodieSyncConfig#META_SYNC_BASE_FILE_FORMAT} instead.
*/
@Deprecated
public static HiveSyncConfig buildHiveSyncConfig(TypedProperties props, String basePath, String baseFileFormat) {
checkRequiredProperties(props, Collections.singletonList(DataSourceWriteOptions.HIVE_TABLE().key()));
HiveSyncConfig hiveSyncConfig = new HiveSyncConfig();
hiveSyncConfig.basePath = basePath;
hiveSyncConfig.usePreApacheInputFormat =
props.getBoolean(DataSourceWriteOptions.HIVE_USE_PRE_APACHE_INPUT_FORMAT().key(),
Boolean.parseBoolean(DataSourceWriteOptions.HIVE_USE_PRE_APACHE_INPUT_FORMAT().defaultValue()));
hiveSyncConfig.databaseName = props.getString(DataSourceWriteOptions.HIVE_DATABASE().key(),
DataSourceWriteOptions.HIVE_DATABASE().defaultValue());
hiveSyncConfig.tableName = props.getString(DataSourceWriteOptions.HIVE_TABLE().key());
hiveSyncConfig.baseFileFormat = baseFileFormat;
hiveSyncConfig.hiveUser =
props.getString(DataSourceWriteOptions.HIVE_USER().key(), DataSourceWriteOptions.HIVE_USER().defaultValue());
hiveSyncConfig.hivePass =
props.getString(DataSourceWriteOptions.HIVE_PASS().key(), DataSourceWriteOptions.HIVE_PASS().defaultValue());
hiveSyncConfig.jdbcUrl =
props.getString(DataSourceWriteOptions.HIVE_URL().key(), DataSourceWriteOptions.HIVE_URL().defaultValue());
hiveSyncConfig.metastoreUris =
props.getString(DataSourceWriteOptions.METASTORE_URIS().key(), DataSourceWriteOptions.METASTORE_URIS().defaultValue());
hiveSyncConfig.partitionFields =
props.getStringList(DataSourceWriteOptions.HIVE_PARTITION_FIELDS().key(), ",", new ArrayList<>());
hiveSyncConfig.partitionValueExtractorClass =
props.getString(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS().key(),
SlashEncodedDayPartitionValueExtractor.class.getName());
hiveSyncConfig.useJdbc = Boolean.valueOf(props.getString(DataSourceWriteOptions.HIVE_USE_JDBC().key(),
DataSourceWriteOptions.HIVE_USE_JDBC().defaultValue()));
if (props.containsKey(DataSourceWriteOptions.HIVE_SYNC_MODE().key())) {
hiveSyncConfig.syncMode = props.getString(DataSourceWriteOptions.HIVE_SYNC_MODE().key());
}
hiveSyncConfig.autoCreateDatabase = Boolean.valueOf(props.getString(DataSourceWriteOptions.HIVE_AUTO_CREATE_DATABASE().key(),
DataSourceWriteOptions.HIVE_AUTO_CREATE_DATABASE().defaultValue()));
hiveSyncConfig.ignoreExceptions = Boolean.valueOf(props.getString(DataSourceWriteOptions.HIVE_IGNORE_EXCEPTIONS().key(),
DataSourceWriteOptions.HIVE_IGNORE_EXCEPTIONS().defaultValue()));
hiveSyncConfig.skipROSuffix = Boolean.valueOf(props.getString(DataSourceWriteOptions.HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE().key(),
DataSourceWriteOptions.HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE().defaultValue()));
hiveSyncConfig.supportTimestamp = Boolean.valueOf(props.getString(DataSourceWriteOptions.HIVE_SUPPORT_TIMESTAMP_TYPE().key(),
DataSourceWriteOptions.HIVE_SUPPORT_TIMESTAMP_TYPE().defaultValue()));
hiveSyncConfig.isConditionalSync = Boolean.valueOf(props.getString(DataSourceWriteOptions.HIVE_CONDITIONAL_SYNC().key(),
DataSourceWriteOptions.HIVE_CONDITIONAL_SYNC().defaultValue()));
hiveSyncConfig.bucketSpec = props.getBoolean(DataSourceWriteOptions.HIVE_SYNC_BUCKET_SYNC().key(),
DataSourceWriteOptions.HIVE_SYNC_BUCKET_SYNC().defaultValue())
? HiveSyncConfig.getBucketSpec(props.getString(HoodieIndexConfig.BUCKET_INDEX_HASH_FIELD.key()),
props.getInteger(HoodieIndexConfig.BUCKET_INDEX_NUM_BUCKETS.key())) : null;
if (props.containsKey(HiveExternalCatalog.CREATED_SPARK_VERSION())) {
hiveSyncConfig.sparkVersion = props.getString(HiveExternalCatalog.CREATED_SPARK_VERSION());
}
hiveSyncConfig.syncComment = Boolean.valueOf(props.getString(DataSourceWriteOptions.HIVE_SYNC_COMMENT().key(),
DataSourceWriteOptions.HIVE_SYNC_COMMENT().defaultValue()));
return hiveSyncConfig;
}
/** /**
* Checks whether default value (false) of "hoodie.parquet.writelegacyformat.enabled" should be * Checks whether default value (false) of "hoodie.parquet.writelegacyformat.enabled" should be
* overridden in case: * overridden in case:

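Note on migration: the deprecated buildHiveSyncConfig removed above is replaced by the direct constructor named in its Javadoc. A minimal sketch of that replacement; the table name is a placeholder.

import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.hudi.sync.common.HoodieSyncConfig;

public class HiveSyncConfigExample {
  static HiveSyncConfig build(String basePath, String baseFileFormat) {
    TypedProperties props = new TypedProperties();
    // The two keys the removed Javadoc says to set explicitly:
    props.setProperty(HoodieSyncConfig.META_SYNC_BASE_PATH.key(), basePath);
    props.setProperty(HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT.key(), baseFileFormat);
    props.setProperty(HoodieSyncConfig.META_SYNC_TABLE_NAME.key(), "my_table"); // placeholder
    return new HiveSyncConfig(props);
  }
}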

@@ -26,7 +26,7 @@ import org.apache.hudi.common.table.HoodieTableConfig
import org.apache.hudi.common.util.Option import org.apache.hudi.common.util.Option
import org.apache.hudi.common.util.ValidationUtils.checkState import org.apache.hudi.common.util.ValidationUtils.checkState
import org.apache.hudi.config.{HoodieClusteringConfig, HoodieWriteConfig} import org.apache.hudi.config.{HoodieClusteringConfig, HoodieWriteConfig}
import org.apache.hudi.hive.{HiveSyncConfig, HiveSyncTool} import org.apache.hudi.hive.{HiveSyncConfig, HiveSyncConfigHolder, HiveSyncTool}
import org.apache.hudi.keygen.constant.KeyGeneratorOptions import org.apache.hudi.keygen.constant.KeyGeneratorOptions
import org.apache.hudi.keygen.{ComplexKeyGenerator, CustomKeyGenerator, NonpartitionedKeyGenerator, SimpleKeyGenerator} import org.apache.hudi.keygen.{ComplexKeyGenerator, CustomKeyGenerator, NonpartitionedKeyGenerator, SimpleKeyGenerator}
import org.apache.hudi.sync.common.HoodieSyncConfig import org.apache.hudi.sync.common.HoodieSyncConfig
@@ -413,7 +413,7 @@ object DataSourceWriteOptions {
* @deprecated Hive Specific Configs are moved to {@link HiveSyncConfig} * @deprecated Hive Specific Configs are moved to {@link HiveSyncConfig}
*/ */
@Deprecated @Deprecated
val HIVE_SYNC_ENABLED: ConfigProperty[String] = HiveSyncConfig.HIVE_SYNC_ENABLED val HIVE_SYNC_ENABLED: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_SYNC_ENABLED
@Deprecated @Deprecated
val META_SYNC_ENABLED: ConfigProperty[String] = HoodieSyncConfig.META_SYNC_ENABLED val META_SYNC_ENABLED: ConfigProperty[String] = HoodieSyncConfig.META_SYNC_ENABLED
@Deprecated @Deprecated
@@ -425,13 +425,13 @@ object DataSourceWriteOptions {
@Deprecated @Deprecated
val HIVE_BASE_FILE_FORMAT: ConfigProperty[String] = HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT val HIVE_BASE_FILE_FORMAT: ConfigProperty[String] = HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT
@Deprecated @Deprecated
val HIVE_USER: ConfigProperty[String] = HiveSyncConfig.HIVE_USER val HIVE_USER: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_USER
@Deprecated @Deprecated
val HIVE_PASS: ConfigProperty[String] = HiveSyncConfig.HIVE_PASS val HIVE_PASS: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_PASS
@Deprecated @Deprecated
val HIVE_URL: ConfigProperty[String] = HiveSyncConfig.HIVE_URL val HIVE_URL: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_URL
@Deprecated @Deprecated
val METASTORE_URIS: ConfigProperty[String] = HiveSyncConfig.METASTORE_URIS val METASTORE_URIS: ConfigProperty[String] = HiveSyncConfigHolder.METASTORE_URIS
@Deprecated @Deprecated
val hivePartitionFieldsInferFunc: JavaFunction[HoodieConfig, Option[String]] = HoodieSyncConfig.PARTITION_FIELDS_INFERENCE_FUNCTION val hivePartitionFieldsInferFunc: JavaFunction[HoodieConfig, Option[String]] = HoodieSyncConfig.PARTITION_FIELDS_INFERENCE_FUNCTION
@Deprecated @Deprecated
@@ -443,19 +443,19 @@ object DataSourceWriteOptions {
@Deprecated @Deprecated
val HIVE_ASSUME_DATE_PARTITION: ConfigProperty[String] = HoodieSyncConfig.META_SYNC_ASSUME_DATE_PARTITION val HIVE_ASSUME_DATE_PARTITION: ConfigProperty[String] = HoodieSyncConfig.META_SYNC_ASSUME_DATE_PARTITION
@Deprecated @Deprecated
val HIVE_USE_PRE_APACHE_INPUT_FORMAT: ConfigProperty[String] = HiveSyncConfig.HIVE_USE_PRE_APACHE_INPUT_FORMAT val HIVE_USE_PRE_APACHE_INPUT_FORMAT: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_USE_PRE_APACHE_INPUT_FORMAT
/** @deprecated Use {@link HIVE_SYNC_MODE} instead of this config from 0.9.0 */ /** @deprecated Use {@link HIVE_SYNC_MODE} instead of this config from 0.9.0 */
@Deprecated @Deprecated
val HIVE_USE_JDBC: ConfigProperty[String] = HiveSyncConfig.HIVE_USE_JDBC val HIVE_USE_JDBC: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_USE_JDBC
@Deprecated @Deprecated
val HIVE_AUTO_CREATE_DATABASE: ConfigProperty[String] = HiveSyncConfig.HIVE_AUTO_CREATE_DATABASE val HIVE_AUTO_CREATE_DATABASE: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_AUTO_CREATE_DATABASE
@Deprecated @Deprecated
val HIVE_IGNORE_EXCEPTIONS: ConfigProperty[String] = HiveSyncConfig.HIVE_IGNORE_EXCEPTIONS val HIVE_IGNORE_EXCEPTIONS: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_IGNORE_EXCEPTIONS
@Deprecated @Deprecated
val HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE: ConfigProperty[String] = HiveSyncConfig.HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE val HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE
@Deprecated @Deprecated
val HIVE_SUPPORT_TIMESTAMP_TYPE: ConfigProperty[String] = HiveSyncConfig.HIVE_SUPPORT_TIMESTAMP_TYPE val HIVE_SUPPORT_TIMESTAMP_TYPE: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE
/** /**
* Flag to indicate whether to use conditional syncing in HiveSync. * Flag to indicate whether to use conditional syncing in HiveSync.
@@ -465,23 +465,23 @@ object DataSourceWriteOptions {
@Deprecated @Deprecated
val HIVE_CONDITIONAL_SYNC: ConfigProperty[String] = HoodieSyncConfig.META_SYNC_CONDITIONAL_SYNC val HIVE_CONDITIONAL_SYNC: ConfigProperty[String] = HoodieSyncConfig.META_SYNC_CONDITIONAL_SYNC
@Deprecated @Deprecated
val HIVE_TABLE_PROPERTIES: ConfigProperty[String] = HiveSyncConfig.HIVE_TABLE_PROPERTIES val HIVE_TABLE_PROPERTIES: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_TABLE_PROPERTIES
@Deprecated @Deprecated
val HIVE_TABLE_SERDE_PROPERTIES: ConfigProperty[String] = HiveSyncConfig.HIVE_TABLE_SERDE_PROPERTIES val HIVE_TABLE_SERDE_PROPERTIES: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_TABLE_SERDE_PROPERTIES
@Deprecated @Deprecated
val HIVE_SYNC_AS_DATA_SOURCE_TABLE: ConfigProperty[String] = HiveSyncConfig.HIVE_SYNC_AS_DATA_SOURCE_TABLE val HIVE_SYNC_AS_DATA_SOURCE_TABLE: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_SYNC_AS_DATA_SOURCE_TABLE
// Create table as managed table // Create table as managed table
@Deprecated @Deprecated
val HIVE_CREATE_MANAGED_TABLE: ConfigProperty[java.lang.Boolean] = HiveSyncConfig.HIVE_CREATE_MANAGED_TABLE val HIVE_CREATE_MANAGED_TABLE: ConfigProperty[java.lang.Boolean] = HiveSyncConfigHolder.HIVE_CREATE_MANAGED_TABLE
@Deprecated @Deprecated
val HIVE_BATCH_SYNC_PARTITION_NUM: ConfigProperty[java.lang.Integer] = HiveSyncConfig.HIVE_BATCH_SYNC_PARTITION_NUM val HIVE_BATCH_SYNC_PARTITION_NUM: ConfigProperty[java.lang.Integer] = HiveSyncConfigHolder.HIVE_BATCH_SYNC_PARTITION_NUM
@Deprecated @Deprecated
val HIVE_SYNC_MODE: ConfigProperty[String] = HiveSyncConfig.HIVE_SYNC_MODE val HIVE_SYNC_MODE: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_SYNC_MODE
@Deprecated @Deprecated
val HIVE_SYNC_BUCKET_SYNC: ConfigProperty[java.lang.Boolean] = HiveSyncConfig.HIVE_SYNC_BUCKET_SYNC val HIVE_SYNC_BUCKET_SYNC: ConfigProperty[java.lang.Boolean] = HiveSyncConfigHolder.HIVE_SYNC_BUCKET_SYNC
@Deprecated @Deprecated
val HIVE_SYNC_COMMENT: ConfigProperty[String] = HiveSyncConfig.HIVE_SYNC_COMMENT; val HIVE_SYNC_COMMENT: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_SYNC_COMMENT;
// Async Compaction - Enabled by default for MOR // Async Compaction - Enabled by default for MOR
val ASYNC_COMPACT_ENABLE: ConfigProperty[String] = ConfigProperty val ASYNC_COMPACT_ENABLE: ConfigProperty[String] = ConfigProperty
@@ -506,16 +506,16 @@ object DataSourceWriteOptions {
val HIVE_ASSUME_DATE_PARTITION_OPT_KEY = HoodieSyncConfig.META_SYNC_ASSUME_DATE_PARTITION.key() val HIVE_ASSUME_DATE_PARTITION_OPT_KEY = HoodieSyncConfig.META_SYNC_ASSUME_DATE_PARTITION.key()
/** @deprecated Use {@link HIVE_USE_PRE_APACHE_INPUT_FORMAT} and its methods instead */ /** @deprecated Use {@link HIVE_USE_PRE_APACHE_INPUT_FORMAT} and its methods instead */
@Deprecated @Deprecated
val HIVE_USE_PRE_APACHE_INPUT_FORMAT_OPT_KEY = HiveSyncConfig.HIVE_USE_PRE_APACHE_INPUT_FORMAT.key() val HIVE_USE_PRE_APACHE_INPUT_FORMAT_OPT_KEY = HiveSyncConfigHolder.HIVE_USE_PRE_APACHE_INPUT_FORMAT.key()
/** @deprecated Use {@link HIVE_USE_JDBC} and its methods instead */ /** @deprecated Use {@link HIVE_USE_JDBC} and its methods instead */
@Deprecated @Deprecated
val HIVE_USE_JDBC_OPT_KEY = HiveSyncConfig.HIVE_USE_JDBC.key() val HIVE_USE_JDBC_OPT_KEY = HiveSyncConfigHolder.HIVE_USE_JDBC.key()
/** @deprecated Use {@link HIVE_AUTO_CREATE_DATABASE} and its methods instead */ /** @deprecated Use {@link HIVE_AUTO_CREATE_DATABASE} and its methods instead */
@Deprecated @Deprecated
val HIVE_AUTO_CREATE_DATABASE_OPT_KEY = HiveSyncConfig.HIVE_AUTO_CREATE_DATABASE.key() val HIVE_AUTO_CREATE_DATABASE_OPT_KEY = HiveSyncConfigHolder.HIVE_AUTO_CREATE_DATABASE.key()
/** @deprecated Use {@link HIVE_IGNORE_EXCEPTIONS} and its methods instead */ /** @deprecated Use {@link HIVE_IGNORE_EXCEPTIONS} and its methods instead */
@Deprecated @Deprecated
val HIVE_IGNORE_EXCEPTIONS_OPT_KEY = HiveSyncConfig.HIVE_IGNORE_EXCEPTIONS.key() val HIVE_IGNORE_EXCEPTIONS_OPT_KEY = HiveSyncConfigHolder.HIVE_IGNORE_EXCEPTIONS.key()
/** @deprecated Use {@link STREAMING_IGNORE_FAILED_BATCH} and its methods instead */ /** @deprecated Use {@link STREAMING_IGNORE_FAILED_BATCH} and its methods instead */
@Deprecated @Deprecated
val STREAMING_IGNORE_FAILED_BATCH_OPT_KEY = STREAMING_IGNORE_FAILED_BATCH.key() val STREAMING_IGNORE_FAILED_BATCH_OPT_KEY = STREAMING_IGNORE_FAILED_BATCH.key()
@@ -530,7 +530,7 @@ object DataSourceWriteOptions {
val DEFAULT_META_SYNC_CLIENT_TOOL_CLASS = META_SYNC_CLIENT_TOOL_CLASS_NAME.defaultValue() val DEFAULT_META_SYNC_CLIENT_TOOL_CLASS = META_SYNC_CLIENT_TOOL_CLASS_NAME.defaultValue()
/** @deprecated Use {@link HIVE_SYNC_ENABLED} and its methods instead */ /** @deprecated Use {@link HIVE_SYNC_ENABLED} and its methods instead */
@Deprecated @Deprecated
val HIVE_SYNC_ENABLED_OPT_KEY = HiveSyncConfig.HIVE_SYNC_ENABLED.key() val HIVE_SYNC_ENABLED_OPT_KEY = HiveSyncConfigHolder.HIVE_SYNC_ENABLED.key()
/** @deprecated Use {@link META_SYNC_ENABLED} and its methods instead */ /** @deprecated Use {@link META_SYNC_ENABLED} and its methods instead */
@Deprecated @Deprecated
val META_SYNC_ENABLED_OPT_KEY = HoodieSyncConfig.META_SYNC_ENABLED.key() val META_SYNC_ENABLED_OPT_KEY = HoodieSyncConfig.META_SYNC_ENABLED.key()
@@ -545,13 +545,13 @@ object DataSourceWriteOptions {
val HIVE_BASE_FILE_FORMAT_OPT_KEY = HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT.key() val HIVE_BASE_FILE_FORMAT_OPT_KEY = HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT.key()
/** @deprecated Use {@link HIVE_USER} and its methods instead */ /** @deprecated Use {@link HIVE_USER} and its methods instead */
@Deprecated @Deprecated
val HIVE_USER_OPT_KEY = HiveSyncConfig.HIVE_USER.key() val HIVE_USER_OPT_KEY = HiveSyncConfigHolder.HIVE_USER.key()
/** @deprecated Use {@link HIVE_PASS} and its methods instead */ /** @deprecated Use {@link HIVE_PASS} and its methods instead */
@Deprecated @Deprecated
val HIVE_PASS_OPT_KEY = HiveSyncConfig.HIVE_PASS.key() val HIVE_PASS_OPT_KEY = HiveSyncConfigHolder.HIVE_PASS.key()
/** @deprecated Use {@link HIVE_URL} and its methods instead */ /** @deprecated Use {@link HIVE_URL} and its methods instead */
@Deprecated @Deprecated
val HIVE_URL_OPT_KEY = HiveSyncConfig.HIVE_URL.key() val HIVE_URL_OPT_KEY = HiveSyncConfigHolder.HIVE_URL.key()
/** @deprecated Use {@link HIVE_PARTITION_FIELDS} and its methods instead */ /** @deprecated Use {@link HIVE_PARTITION_FIELDS} and its methods instead */
@Deprecated @Deprecated
val HIVE_PARTITION_FIELDS_OPT_KEY = HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key() val HIVE_PARTITION_FIELDS_OPT_KEY = HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key()
@@ -667,7 +667,7 @@ object DataSourceWriteOptions {
/** @deprecated Use {@link HIVE_SYNC_ENABLED} and its methods instead */ /** @deprecated Use {@link HIVE_SYNC_ENABLED} and its methods instead */
@Deprecated @Deprecated
val DEFAULT_HIVE_SYNC_ENABLED_OPT_VAL = HiveSyncConfig.HIVE_SYNC_ENABLED.defaultValue() val DEFAULT_HIVE_SYNC_ENABLED_OPT_VAL = HiveSyncConfigHolder.HIVE_SYNC_ENABLED.defaultValue()
/** @deprecated Use {@link META_SYNC_ENABLED} and its methods instead */ /** @deprecated Use {@link META_SYNC_ENABLED} and its methods instead */
@Deprecated @Deprecated
val DEFAULT_META_SYNC_ENABLED_OPT_VAL = HoodieSyncConfig.META_SYNC_ENABLED.defaultValue() val DEFAULT_META_SYNC_ENABLED_OPT_VAL = HoodieSyncConfig.META_SYNC_ENABLED.defaultValue()
@@ -682,13 +682,13 @@ object DataSourceWriteOptions {
val DEFAULT_HIVE_BASE_FILE_FORMAT_OPT_VAL = HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT.defaultValue() val DEFAULT_HIVE_BASE_FILE_FORMAT_OPT_VAL = HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT.defaultValue()
/** @deprecated Use {@link HIVE_USER} and its methods instead */ /** @deprecated Use {@link HIVE_USER} and its methods instead */
@Deprecated @Deprecated
val DEFAULT_HIVE_USER_OPT_VAL = HiveSyncConfig.HIVE_USER.defaultValue() val DEFAULT_HIVE_USER_OPT_VAL = HiveSyncConfigHolder.HIVE_USER.defaultValue()
/** @deprecated Use {@link HIVE_PASS} and its methods instead */ /** @deprecated Use {@link HIVE_PASS} and its methods instead */
@Deprecated @Deprecated
val DEFAULT_HIVE_PASS_OPT_VAL = HiveSyncConfig.HIVE_PASS.defaultValue() val DEFAULT_HIVE_PASS_OPT_VAL = HiveSyncConfigHolder.HIVE_PASS.defaultValue()
/** @deprecated Use {@link HIVE_URL} and its methods instead */ /** @deprecated Use {@link HIVE_URL} and its methods instead */
@Deprecated @Deprecated
val DEFAULT_HIVE_URL_OPT_VAL = HiveSyncConfig.HIVE_URL.defaultValue() val DEFAULT_HIVE_URL_OPT_VAL = HiveSyncConfigHolder.HIVE_URL.defaultValue()
/** @deprecated Use {@link HIVE_PARTITION_FIELDS} and its methods instead */ /** @deprecated Use {@link HIVE_PARTITION_FIELDS} and its methods instead */
@Deprecated @Deprecated
val DEFAULT_HIVE_PARTITION_FIELDS_OPT_VAL = HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.defaultValue() val DEFAULT_HIVE_PARTITION_FIELDS_OPT_VAL = HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.defaultValue()
@@ -702,25 +702,25 @@ object DataSourceWriteOptions {
val DEFAULT_USE_PRE_APACHE_INPUT_FORMAT_OPT_VAL = "false" val DEFAULT_USE_PRE_APACHE_INPUT_FORMAT_OPT_VAL = "false"
/** @deprecated Use {@link HIVE_USE_JDBC} and its methods instead */ /** @deprecated Use {@link HIVE_USE_JDBC} and its methods instead */
@Deprecated @Deprecated
val DEFAULT_HIVE_USE_JDBC_OPT_VAL = HiveSyncConfig.HIVE_USE_JDBC.defaultValue() val DEFAULT_HIVE_USE_JDBC_OPT_VAL = HiveSyncConfigHolder.HIVE_USE_JDBC.defaultValue()
/** @deprecated Use {@link HIVE_AUTO_CREATE_DATABASE} and its methods instead */ /** @deprecated Use {@link HIVE_AUTO_CREATE_DATABASE} and its methods instead */
@Deprecated @Deprecated
val DEFAULT_HIVE_AUTO_CREATE_DATABASE_OPT_KEY = HiveSyncConfig.HIVE_AUTO_CREATE_DATABASE.defaultValue() val DEFAULT_HIVE_AUTO_CREATE_DATABASE_OPT_KEY = HiveSyncConfigHolder.HIVE_AUTO_CREATE_DATABASE.defaultValue()
/** @deprecated Use {@link HIVE_IGNORE_EXCEPTIONS} and its methods instead */ /** @deprecated Use {@link HIVE_IGNORE_EXCEPTIONS} and its methods instead */
@Deprecated @Deprecated
val DEFAULT_HIVE_IGNORE_EXCEPTIONS_OPT_KEY = HiveSyncConfig.HIVE_IGNORE_EXCEPTIONS.defaultValue() val DEFAULT_HIVE_IGNORE_EXCEPTIONS_OPT_KEY = HiveSyncConfigHolder.HIVE_IGNORE_EXCEPTIONS.defaultValue()
/** @deprecated Use {@link HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE} and its methods instead */ /** @deprecated Use {@link HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE} and its methods instead */
@Deprecated @Deprecated
val HIVE_SKIP_RO_SUFFIX = HiveSyncConfig.HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE.key() val HIVE_SKIP_RO_SUFFIX = HiveSyncConfigHolder.HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE.key()
/** @deprecated Use {@link HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE} and its methods instead */ /** @deprecated Use {@link HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE} and its methods instead */
@Deprecated @Deprecated
val DEFAULT_HIVE_SKIP_RO_SUFFIX_VAL = HiveSyncConfig.HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE.defaultValue() val DEFAULT_HIVE_SKIP_RO_SUFFIX_VAL = HiveSyncConfigHolder.HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE.defaultValue()
/** @deprecated Use {@link HIVE_SUPPORT_TIMESTAMP_TYPE} and its methods instead */ /** @deprecated Use {@link HIVE_SUPPORT_TIMESTAMP_TYPE} and its methods instead */
@Deprecated @Deprecated
val HIVE_SUPPORT_TIMESTAMP = HiveSyncConfig.HIVE_SUPPORT_TIMESTAMP_TYPE.key() val HIVE_SUPPORT_TIMESTAMP = HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE.key()
/** @deprecated Use {@link HIVE_SUPPORT_TIMESTAMP_TYPE} and its methods instead */ /** @deprecated Use {@link HIVE_SUPPORT_TIMESTAMP_TYPE} and its methods instead */
@Deprecated @Deprecated
val DEFAULT_HIVE_SUPPORT_TIMESTAMP = HiveSyncConfig.HIVE_SUPPORT_TIMESTAMP_TYPE.defaultValue() val DEFAULT_HIVE_SUPPORT_TIMESTAMP = HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE.defaultValue()
/** @deprecated Use {@link ASYNC_COMPACT_ENABLE} and its methods instead */ /** @deprecated Use {@link ASYNC_COMPACT_ENABLE} and its methods instead */
@Deprecated @Deprecated
val ASYNC_COMPACT_ENABLE_OPT_KEY = ASYNC_COMPACT_ENABLE.key() val ASYNC_COMPACT_ENABLE_OPT_KEY = ASYNC_COMPACT_ENABLE.key()

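Note on migration: every deprecated alias above now points at HiveSyncConfigHolder, so callers referencing the constants directly only change the holder class name. A minimal sketch of the before/after, with illustrative values.

import java.util.Properties;

import org.apache.hudi.hive.HiveSyncConfigHolder;
import org.apache.hudi.hive.ddl.HiveSyncMode;
import org.apache.hudi.sync.common.HoodieSyncConfig;

public class SyncOptionsExample {
  public static void main(String[] args) {
    Properties props = new Properties();
    // Previously referenced as HiveSyncConfig.HIVE_SYNC_ENABLED / HIVE_SYNC_MODE.
    props.setProperty(HiveSyncConfigHolder.HIVE_SYNC_ENABLED.key(), "true");
    props.setProperty(HiveSyncConfigHolder.HIVE_SYNC_MODE.key(), HiveSyncMode.HMS.name());
    // Table-level settings stayed on HoodieSyncConfig.
    props.setProperty(HoodieSyncConfig.META_SYNC_DATABASE_NAME.key(), "default");
    props.setProperty(HoodieSyncConfig.META_SYNC_TABLE_NAME.key(), "my_table"); // placeholder
  }
}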

@@ -36,7 +36,7 @@ import org.apache.hudi.config.HoodieBootstrapConfig.{BASE_PATH, INDEX_CLASS_NAME
import org.apache.hudi.config.{HoodieInternalConfig, HoodieWriteConfig} import org.apache.hudi.config.{HoodieInternalConfig, HoodieWriteConfig}
import org.apache.hudi.exception.HoodieException import org.apache.hudi.exception.HoodieException
import org.apache.hudi.execution.bulkinsert.{BulkInsertInternalPartitionerWithRowsFactory, NonSortPartitionerWithRows} import org.apache.hudi.execution.bulkinsert.{BulkInsertInternalPartitionerWithRowsFactory, NonSortPartitionerWithRows}
import org.apache.hudi.hive.{HiveSyncConfig, HiveSyncTool} import org.apache.hudi.hive.{HiveSyncConfigHolder, HiveSyncTool}
import org.apache.hudi.index.SparkHoodieIndexFactory import org.apache.hudi.index.SparkHoodieIndexFactory
import org.apache.hudi.internal.DataSourceInternalWriterHelper import org.apache.hudi.internal.DataSourceInternalWriterHelper
import org.apache.hudi.internal.schema.InternalSchema import org.apache.hudi.internal.schema.InternalSchema
@@ -600,7 +600,7 @@ object HoodieSparkSqlWriter {
private def metaSync(spark: SparkSession, hoodieConfig: HoodieConfig, basePath: Path, private def metaSync(spark: SparkSession, hoodieConfig: HoodieConfig, basePath: Path,
schema: StructType): Boolean = { schema: StructType): Boolean = {
val hiveSyncEnabled = hoodieConfig.getStringOrDefault(HiveSyncConfig.HIVE_SYNC_ENABLED).toBoolean val hiveSyncEnabled = hoodieConfig.getStringOrDefault(HiveSyncConfigHolder.HIVE_SYNC_ENABLED).toBoolean
var metaSyncEnabled = hoodieConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_ENABLED).toBoolean var metaSyncEnabled = hoodieConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_ENABLED).toBoolean
var syncClientToolClassSet = scala.collection.mutable.Set[String]() var syncClientToolClassSet = scala.collection.mutable.Set[String]()
hoodieConfig.getString(META_SYNC_CLIENT_TOOL_CLASS_NAME).split(",").foreach(syncClass => syncClientToolClassSet += syncClass) hoodieConfig.getString(META_SYNC_CLIENT_TOOL_CLASS_NAME).split(",").foreach(syncClass => syncClientToolClassSet += syncClass)
@@ -616,7 +616,7 @@ object HoodieSparkSqlWriter {
val baseFileFormat = hoodieConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT); val baseFileFormat = hoodieConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT);
val properties = new TypedProperties() val properties = new TypedProperties()
properties.putAll(hoodieConfig.getProps) properties.putAll(hoodieConfig.getProps)
properties.put(HiveSyncConfig.HIVE_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD.key, spark.sessionState.conf.getConf(StaticSQLConf.SCHEMA_STRING_LENGTH_THRESHOLD).toString) properties.put(HiveSyncConfigHolder.HIVE_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD.key, spark.sessionState.conf.getConf(StaticSQLConf.SCHEMA_STRING_LENGTH_THRESHOLD).toString)
properties.put(HoodieSyncConfig.META_SYNC_SPARK_VERSION.key, SPARK_VERSION) properties.put(HoodieSyncConfig.META_SYNC_SPARK_VERSION.key, SPARK_VERSION)
properties.put(HoodieSyncConfig.META_SYNC_USE_FILE_LISTING_FROM_METADATA.key, hoodieConfig.getBoolean(HoodieMetadataConfig.ENABLE)) properties.put(HoodieSyncConfig.META_SYNC_USE_FILE_LISTING_FROM_METADATA.key, hoodieConfig.getBoolean(HoodieMetadataConfig.ENABLE))

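Note on usage: metaSync() above splits META_SYNC_CLIENT_TOOL_CLASS_NAME on commas, so several sync tools can run from a single write. A minimal sketch of wiring two tools, using class names that appear in this commit; the Scala val is accessed from Java as a method, matching the KEYGENERATOR_CLASS_NAME() pattern seen earlier.

import java.util.Properties;

import org.apache.hudi.DataSourceWriteOptions;
import org.apache.hudi.sync.common.HoodieSyncConfig;

public class MultiSyncExample {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty(HoodieSyncConfig.META_SYNC_ENABLED.key(), "true");
    // Comma-separated list, split by metaSync() above.
    props.setProperty(DataSourceWriteOptions.META_SYNC_CLIENT_TOOL_CLASS_NAME().key(),
        "org.apache.hudi.hive.HiveSyncTool,org.apache.hudi.gcp.bigquery.BigQuerySyncTool");
  }
}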

@@ -17,20 +17,19 @@
package org.apache.hudi package org.apache.hudi
import java.util.Properties
import org.apache.hudi.DataSourceOptionsHelper.allAlternatives import org.apache.hudi.DataSourceOptionsHelper.allAlternatives
import org.apache.hudi.DataSourceWriteOptions._ import org.apache.hudi.DataSourceWriteOptions._
import org.apache.hudi.common.config.HoodieMetadataConfig.ENABLE import org.apache.hudi.common.config.HoodieMetadataConfig.ENABLE
import org.apache.hudi.common.config.{DFSPropertiesConfiguration, HoodieCommonConfig, HoodieConfig, TypedProperties} import org.apache.hudi.common.config.{DFSPropertiesConfiguration, HoodieCommonConfig, HoodieConfig}
import org.apache.hudi.common.table.HoodieTableConfig import org.apache.hudi.common.table.HoodieTableConfig
import org.apache.hudi.config.HoodieWriteConfig
import org.apache.hudi.exception.HoodieException import org.apache.hudi.exception.HoodieException
import org.apache.hudi.hive.HiveSyncConfig import org.apache.hudi.hive.HiveSyncConfigHolder
import org.apache.hudi.keygen.{NonpartitionedKeyGenerator, SimpleKeyGenerator} import org.apache.hudi.keygen.{NonpartitionedKeyGenerator, SimpleKeyGenerator}
import org.apache.hudi.sync.common.HoodieSyncConfig import org.apache.hudi.sync.common.HoodieSyncConfig
import org.apache.spark.sql.SparkSession import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.hudi.command.SqlKeyGenerator import org.apache.spark.sql.hudi.command.SqlKeyGenerator
import java.util.Properties
import scala.collection.JavaConversions.mapAsJavaMap import scala.collection.JavaConversions.mapAsJavaMap
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
@@ -67,21 +66,21 @@ object HoodieWriterUtils {
hoodieConfig.setDefaultValue(STREAMING_RETRY_INTERVAL_MS) hoodieConfig.setDefaultValue(STREAMING_RETRY_INTERVAL_MS)
hoodieConfig.setDefaultValue(STREAMING_IGNORE_FAILED_BATCH) hoodieConfig.setDefaultValue(STREAMING_IGNORE_FAILED_BATCH)
hoodieConfig.setDefaultValue(META_SYNC_CLIENT_TOOL_CLASS_NAME) hoodieConfig.setDefaultValue(META_SYNC_CLIENT_TOOL_CLASS_NAME)
hoodieConfig.setDefaultValue(HiveSyncConfig.HIVE_SYNC_ENABLED) hoodieConfig.setDefaultValue(HiveSyncConfigHolder.HIVE_SYNC_ENABLED)
hoodieConfig.setDefaultValue(HoodieSyncConfig.META_SYNC_ENABLED) hoodieConfig.setDefaultValue(HoodieSyncConfig.META_SYNC_ENABLED)
hoodieConfig.setDefaultValue(HoodieSyncConfig.META_SYNC_DATABASE_NAME) hoodieConfig.setDefaultValue(HoodieSyncConfig.META_SYNC_DATABASE_NAME)
hoodieConfig.setDefaultValue(HoodieSyncConfig.META_SYNC_TABLE_NAME) hoodieConfig.setDefaultValue(HoodieSyncConfig.META_SYNC_TABLE_NAME)
hoodieConfig.setDefaultValue(HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT) hoodieConfig.setDefaultValue(HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT)
hoodieConfig.setDefaultValue(HiveSyncConfig.METASTORE_URIS) hoodieConfig.setDefaultValue(HiveSyncConfigHolder.METASTORE_URIS)
hoodieConfig.setDefaultValue(HiveSyncConfig.HIVE_USER) hoodieConfig.setDefaultValue(HiveSyncConfigHolder.HIVE_USER)
hoodieConfig.setDefaultValue(HiveSyncConfig.HIVE_PASS) hoodieConfig.setDefaultValue(HiveSyncConfigHolder.HIVE_PASS)
hoodieConfig.setDefaultValue(HiveSyncConfig.HIVE_URL) hoodieConfig.setDefaultValue(HiveSyncConfigHolder.HIVE_URL)
hoodieConfig.setDefaultValue(HoodieSyncConfig.META_SYNC_PARTITION_FIELDS) hoodieConfig.setDefaultValue(HoodieSyncConfig.META_SYNC_PARTITION_FIELDS)
hoodieConfig.setDefaultValue(HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS) hoodieConfig.setDefaultValue(HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS)
hoodieConfig.setDefaultValue(HIVE_STYLE_PARTITIONING) hoodieConfig.setDefaultValue(HIVE_STYLE_PARTITIONING)
hoodieConfig.setDefaultValue(HiveSyncConfig.HIVE_USE_JDBC) hoodieConfig.setDefaultValue(HiveSyncConfigHolder.HIVE_USE_JDBC)
hoodieConfig.setDefaultValue(HiveSyncConfig.HIVE_CREATE_MANAGED_TABLE) hoodieConfig.setDefaultValue(HiveSyncConfigHolder.HIVE_CREATE_MANAGED_TABLE)
hoodieConfig.setDefaultValue(HiveSyncConfig.HIVE_SYNC_AS_DATA_SOURCE_TABLE) hoodieConfig.setDefaultValue(HiveSyncConfigHolder.HIVE_SYNC_AS_DATA_SOURCE_TABLE)
hoodieConfig.setDefaultValue(ASYNC_COMPACT_ENABLE) hoodieConfig.setDefaultValue(ASYNC_COMPACT_ENABLE)
hoodieConfig.setDefaultValue(INLINE_CLUSTERING_ENABLE) hoodieConfig.setDefaultValue(INLINE_CLUSTERING_ENABLE)
hoodieConfig.setDefaultValue(ASYNC_CLUSTERING_ENABLE) hoodieConfig.setDefaultValue(ASYNC_CLUSTERING_ENABLE)


@@ -24,7 +24,7 @@ import org.apache.hudi.common.table.HoodieTableConfig
import org.apache.hudi.config.HoodieWriteConfig.TBL_NAME import org.apache.hudi.config.HoodieWriteConfig.TBL_NAME
import org.apache.hudi.config.{HoodieIndexConfig, HoodieWriteConfig} import org.apache.hudi.config.{HoodieIndexConfig, HoodieWriteConfig}
import org.apache.hudi.hive.ddl.HiveSyncMode import org.apache.hudi.hive.ddl.HiveSyncMode
import org.apache.hudi.hive.{HiveSyncConfig, MultiPartKeysValueExtractor} import org.apache.hudi.hive.{HiveSyncConfig, HiveSyncConfigHolder, MultiPartKeysValueExtractor}
import org.apache.hudi.keygen.ComplexKeyGenerator import org.apache.hudi.keygen.ComplexKeyGenerator
import org.apache.hudi.sql.InsertMode import org.apache.hudi.sql.InsertMode
import org.apache.hudi.sync.common.HoodieSyncConfig import org.apache.hudi.sync.common.HoodieSyncConfig
@@ -38,9 +38,7 @@ import org.apache.spark.sql.hudi.command.{SqlKeyGenerator, ValidateDuplicateKeyP
import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.StructType import org.apache.spark.sql.types.StructType
import java.util
import java.util.Locale import java.util.Locale
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
trait ProvidesHoodieConfig extends Logging { trait ProvidesHoodieConfig extends Logging {
@@ -76,13 +74,13 @@ trait ProvidesHoodieConfig extends Logging {
OPERATION.key -> UPSERT_OPERATION_OPT_VAL, OPERATION.key -> UPSERT_OPERATION_OPT_VAL,
PARTITIONPATH_FIELD.key -> tableConfig.getPartitionFieldProp, PARTITIONPATH_FIELD.key -> tableConfig.getPartitionFieldProp,
HoodieSyncConfig.META_SYNC_ENABLED.key -> enableHive.toString, HoodieSyncConfig.META_SYNC_ENABLED.key -> enableHive.toString,
HiveSyncConfig.HIVE_SYNC_ENABLED.key -> enableHive.toString, HiveSyncConfigHolder.HIVE_SYNC_ENABLED.key -> enableHive.toString,
HiveSyncConfig.HIVE_SYNC_MODE.key -> hiveSyncConfig.syncMode, HiveSyncConfigHolder.HIVE_SYNC_MODE.key -> hiveSyncConfig.getStringOrDefault(HiveSyncConfigHolder.HIVE_SYNC_MODE, HiveSyncMode.HMS.name()),
HoodieSyncConfig.META_SYNC_DATABASE_NAME.key -> hiveSyncConfig.databaseName, HoodieSyncConfig.META_SYNC_DATABASE_NAME.key -> hiveSyncConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_DATABASE_NAME),
HoodieSyncConfig.META_SYNC_TABLE_NAME.key -> hiveSyncConfig.tableName, HoodieSyncConfig.META_SYNC_TABLE_NAME.key -> hiveSyncConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_TABLE_NAME),
HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key -> tableConfig.getPartitionFieldProp, HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key -> tableConfig.getPartitionFieldProp,
HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key -> hiveSyncConfig.partitionValueExtractorClass, HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key -> hiveSyncConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS),
HiveSyncConfig.HIVE_SUPPORT_TIMESTAMP_TYPE.key -> hiveSyncConfig.supportTimestamp.toString, HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE.key -> hiveSyncConfig.getBoolean(HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE).toString,
HoodieWriteConfig.UPSERT_PARALLELISM_VALUE.key -> hoodieProps.getString(HoodieWriteConfig.UPSERT_PARALLELISM_VALUE.key, "200"), HoodieWriteConfig.UPSERT_PARALLELISM_VALUE.key -> hoodieProps.getString(HoodieWriteConfig.UPSERT_PARALLELISM_VALUE.key, "200"),
SqlKeyGenerator.PARTITION_SCHEMA -> hoodieCatalogTable.partitionSchema.toDDL SqlKeyGenerator.PARTITION_SCHEMA -> hoodieCatalogTable.partitionSchema.toDDL
) )
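
The pattern above repeats for each DML path: option values are now resolved through the props-backed config accessors (getStringOrDefault, getBoolean) rather than public fields. A minimal Java sketch of that lookup pattern, using only the accessors visible in this diff (the helper class name and values are illustrative):

import java.util.HashMap;
import java.util.Map;
import java.util.Properties;

import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.hudi.hive.HiveSyncConfigHolder;
import org.apache.hudi.hive.ddl.HiveSyncMode;
import org.apache.hudi.sync.common.HoodieSyncConfig;

public class SyncOptionsSketch {
  // Build the subset of write options shown in the hunk above.
  public static Map<String, String> hiveSyncOptions(Properties props) {
    HiveSyncConfig cfg = new HiveSyncConfig(props);
    Map<String, String> opts = new HashMap<>();
    // Falls back to HMS when no sync mode was supplied.
    opts.put(HiveSyncConfigHolder.HIVE_SYNC_MODE.key(),
        cfg.getStringOrDefault(HiveSyncConfigHolder.HIVE_SYNC_MODE, HiveSyncMode.HMS.name()));
    // Falls back to the ConfigProperty's own default value.
    opts.put(HoodieSyncConfig.META_SYNC_DATABASE_NAME.key(),
        cfg.getStringOrDefault(HoodieSyncConfig.META_SYNC_DATABASE_NAME));
    opts.put(HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE.key(),
        String.valueOf(cfg.getBoolean(HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE)));
    return opts;
  }
}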
@@ -194,12 +192,12 @@ trait ProvidesHoodieConfig extends Logging {
HoodieWriteConfig.COMBINE_BEFORE_INSERT.key -> String.valueOf(hasPrecombineColumn), HoodieWriteConfig.COMBINE_BEFORE_INSERT.key -> String.valueOf(hasPrecombineColumn),
HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key -> partitionFieldsStr, HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key -> partitionFieldsStr,
HoodieSyncConfig.META_SYNC_ENABLED.key -> enableHive.toString, HoodieSyncConfig.META_SYNC_ENABLED.key -> enableHive.toString,
HiveSyncConfig.HIVE_SYNC_ENABLED.key -> enableHive.toString, HiveSyncConfigHolder.HIVE_SYNC_ENABLED.key -> enableHive.toString,
HiveSyncConfig.HIVE_SYNC_MODE.key -> hiveSyncConfig.syncMode, HiveSyncConfigHolder.HIVE_SYNC_MODE.key -> hiveSyncConfig.getStringOrDefault(HiveSyncConfigHolder.HIVE_SYNC_MODE, HiveSyncMode.HMS.name()),
HoodieSyncConfig.META_SYNC_DATABASE_NAME.key -> hiveSyncConfig.databaseName, HoodieSyncConfig.META_SYNC_DATABASE_NAME.key -> hiveSyncConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_DATABASE_NAME),
HoodieSyncConfig.META_SYNC_TABLE_NAME.key -> hiveSyncConfig.tableName, HoodieSyncConfig.META_SYNC_TABLE_NAME.key -> hiveSyncConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_TABLE_NAME),
HiveSyncConfig.HIVE_SUPPORT_TIMESTAMP_TYPE.key -> hiveSyncConfig.supportTimestamp.toString, HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE.key -> hiveSyncConfig.getBoolean(HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE).toString,
HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key -> hiveSyncConfig.partitionValueExtractorClass, HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key -> hiveSyncConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS),
HoodieWriteConfig.INSERT_PARALLELISM_VALUE.key -> hoodieProps.getString(HoodieWriteConfig.INSERT_PARALLELISM_VALUE.key, "200"), HoodieWriteConfig.INSERT_PARALLELISM_VALUE.key -> hoodieProps.getString(HoodieWriteConfig.INSERT_PARALLELISM_VALUE.key, "200"),
HoodieWriteConfig.UPSERT_PARALLELISM_VALUE.key -> hoodieProps.getString(HoodieWriteConfig.UPSERT_PARALLELISM_VALUE.key, "200"), HoodieWriteConfig.UPSERT_PARALLELISM_VALUE.key -> hoodieProps.getString(HoodieWriteConfig.UPSERT_PARALLELISM_VALUE.key, "200"),
SqlKeyGenerator.PARTITION_SCHEMA -> hoodieCatalogTable.partitionSchema.toDDL SqlKeyGenerator.PARTITION_SCHEMA -> hoodieCatalogTable.partitionSchema.toDDL
@@ -231,13 +229,13 @@ trait ProvidesHoodieConfig extends Logging {
PRECOMBINE_FIELD.key -> hoodieCatalogTable.preCombineKey.getOrElse(""), PRECOMBINE_FIELD.key -> hoodieCatalogTable.preCombineKey.getOrElse(""),
PARTITIONPATH_FIELD.key -> partitionFields, PARTITIONPATH_FIELD.key -> partitionFields,
HoodieSyncConfig.META_SYNC_ENABLED.key -> enableHive.toString, HoodieSyncConfig.META_SYNC_ENABLED.key -> enableHive.toString,
HiveSyncConfig.HIVE_SYNC_ENABLED.key -> enableHive.toString, HiveSyncConfigHolder.HIVE_SYNC_ENABLED.key -> enableHive.toString,
HiveSyncConfig.HIVE_SYNC_MODE.key -> hiveSyncConfig.syncMode, HiveSyncConfigHolder.HIVE_SYNC_MODE.key -> hiveSyncConfig.getStringOrDefault(HiveSyncConfigHolder.HIVE_SYNC_MODE, HiveSyncMode.HMS.name()),
HoodieSyncConfig.META_SYNC_DATABASE_NAME.key -> hiveSyncConfig.databaseName, HoodieSyncConfig.META_SYNC_DATABASE_NAME.key -> hiveSyncConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_DATABASE_NAME),
HoodieSyncConfig.META_SYNC_TABLE_NAME.key -> hiveSyncConfig.tableName, HoodieSyncConfig.META_SYNC_TABLE_NAME.key -> hiveSyncConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_TABLE_NAME),
HiveSyncConfig.HIVE_SUPPORT_TIMESTAMP_TYPE.key -> hiveSyncConfig.supportTimestamp.toString, HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE.key -> hiveSyncConfig.getBoolean(HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE).toString,
HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key -> partitionFields, HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key -> partitionFields,
HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key -> hiveSyncConfig.partitionValueExtractorClass HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key -> hiveSyncConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS)
) )
.filter { case (_, v) => v != null } .filter { case (_, v) => v != null }
} }
@@ -273,9 +271,9 @@ trait ProvidesHoodieConfig extends Logging {
OPERATION.key -> DataSourceWriteOptions.DELETE_OPERATION_OPT_VAL, OPERATION.key -> DataSourceWriteOptions.DELETE_OPERATION_OPT_VAL,
PARTITIONPATH_FIELD.key -> tableConfig.getPartitionFieldProp, PARTITIONPATH_FIELD.key -> tableConfig.getPartitionFieldProp,
HoodieSyncConfig.META_SYNC_ENABLED.key -> enableHive.toString, HoodieSyncConfig.META_SYNC_ENABLED.key -> enableHive.toString,
HiveSyncConfig.HIVE_SYNC_ENABLED.key -> enableHive.toString, HiveSyncConfigHolder.HIVE_SYNC_ENABLED.key -> enableHive.toString,
HiveSyncConfig.HIVE_SYNC_MODE.key -> hiveSyncConfig.syncMode, HiveSyncConfigHolder.HIVE_SYNC_MODE.key -> hiveSyncConfig.getStringOrDefault(HiveSyncConfigHolder.HIVE_SYNC_MODE, HiveSyncMode.HMS.name()),
HiveSyncConfig.HIVE_SUPPORT_TIMESTAMP_TYPE.key -> hiveSyncConfig.supportTimestamp.toString, HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE.key -> hiveSyncConfig.getBoolean(HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE).toString,
HoodieWriteConfig.DELETE_PARALLELISM_VALUE.key -> hoodieProps.getString(HoodieWriteConfig.DELETE_PARALLELISM_VALUE.key, "200"), HoodieWriteConfig.DELETE_PARALLELISM_VALUE.key -> hoodieProps.getString(HoodieWriteConfig.DELETE_PARALLELISM_VALUE.key, "200"),
SqlKeyGenerator.PARTITION_SCHEMA -> partitionSchema.toDDL SqlKeyGenerator.PARTITION_SCHEMA -> partitionSchema.toDDL
) )
@@ -289,33 +287,20 @@ trait ProvidesHoodieConfig extends Logging {
} }
  def buildHiveSyncConfig(props: TypedProperties, hoodieCatalogTable: HoodieCatalogTable): HiveSyncConfig = {
-   val hiveSyncConfig: HiveSyncConfig = new HiveSyncConfig
-   hiveSyncConfig.basePath = hoodieCatalogTable.tableLocation
-   hiveSyncConfig.baseFileFormat = hoodieCatalogTable.baseFileFormat
-   hiveSyncConfig.usePreApacheInputFormat = props.getBoolean(HiveSyncConfig.HIVE_USE_PRE_APACHE_INPUT_FORMAT.key, HiveSyncConfig.HIVE_USE_PRE_APACHE_INPUT_FORMAT.defaultValue.toBoolean)
-   hiveSyncConfig.databaseName = hoodieCatalogTable.table.identifier.database.getOrElse("default")
-   if (props.containsKey(HoodieSyncConfig.META_SYNC_TABLE_NAME.key)) {
-     hiveSyncConfig.tableName = props.getString(HoodieSyncConfig.META_SYNC_TABLE_NAME.key)
-   } else {
-     hiveSyncConfig.tableName = hoodieCatalogTable.table.identifier.table
-   }
-   hiveSyncConfig.syncMode = props.getString(HiveSyncConfig.HIVE_SYNC_MODE.key, HiveSyncMode.HMS.name())
-   hiveSyncConfig.hiveUser = props.getString(HiveSyncConfig.HIVE_USER.key, HiveSyncConfig.HIVE_USER.defaultValue)
-   hiveSyncConfig.hivePass = props.getString(HiveSyncConfig.HIVE_PASS.key, HiveSyncConfig.HIVE_PASS.defaultValue)
-   hiveSyncConfig.jdbcUrl = props.getString(HiveSyncConfig.HIVE_URL.key, HiveSyncConfig.HIVE_URL.defaultValue)
-   hiveSyncConfig.metastoreUris = props.getString(HiveSyncConfig.METASTORE_URIS.key, HiveSyncConfig.METASTORE_URIS.defaultValue)
-   hiveSyncConfig.partitionFields = props.getStringList(HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key, ",", new util.ArrayList[String])
-   hiveSyncConfig.partitionValueExtractorClass = props.getString(HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key, classOf[MultiPartKeysValueExtractor].getName)
-   if (props.containsKey(HiveSyncConfig.HIVE_SYNC_MODE.key)) hiveSyncConfig.syncMode = props.getString(HiveSyncConfig.HIVE_SYNC_MODE.key)
-   hiveSyncConfig.autoCreateDatabase = props.getString(HiveSyncConfig.HIVE_AUTO_CREATE_DATABASE.key, HiveSyncConfig.HIVE_AUTO_CREATE_DATABASE.defaultValue).toBoolean
-   hiveSyncConfig.ignoreExceptions = props.getString(HiveSyncConfig.HIVE_IGNORE_EXCEPTIONS.key, HiveSyncConfig.HIVE_IGNORE_EXCEPTIONS.defaultValue).toBoolean
-   hiveSyncConfig.skipROSuffix = props.getString(HiveSyncConfig.HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE.key, HiveSyncConfig.HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE.defaultValue).toBoolean
-   hiveSyncConfig.supportTimestamp = props.getString(HiveSyncConfig.HIVE_SUPPORT_TIMESTAMP_TYPE.key, "true").toBoolean
-   hiveSyncConfig.isConditionalSync = props.getString(HoodieSyncConfig.META_SYNC_CONDITIONAL_SYNC.key, HoodieSyncConfig.META_SYNC_CONDITIONAL_SYNC.defaultValue).toBoolean
-   hiveSyncConfig.bucketSpec = if (props.getBoolean(HiveSyncConfig.HIVE_SYNC_BUCKET_SYNC.key, HiveSyncConfig.HIVE_SYNC_BUCKET_SYNC.defaultValue)) HiveSyncConfig.getBucketSpec(props.getString(HoodieIndexConfig.BUCKET_INDEX_HASH_FIELD.key), props.getInteger(HoodieIndexConfig.BUCKET_INDEX_NUM_BUCKETS.key))
-   else null
-   if (props.containsKey(HiveExternalCatalog.CREATED_SPARK_VERSION)) hiveSyncConfig.sparkVersion = props.getString(HiveExternalCatalog.CREATED_SPARK_VERSION)
-   hiveSyncConfig.syncComment = props.getString(DataSourceWriteOptions.HIVE_SYNC_COMMENT.key, DataSourceWriteOptions.HIVE_SYNC_COMMENT.defaultValue).toBoolean
+   val hiveSyncConfig: HiveSyncConfig = new HiveSyncConfig(props)
+   hiveSyncConfig.setValue(HoodieSyncConfig.META_SYNC_BASE_PATH, hoodieCatalogTable.tableLocation)
+   hiveSyncConfig.setValue(HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT, hoodieCatalogTable.baseFileFormat)
+   hiveSyncConfig.setValue(HoodieSyncConfig.META_SYNC_DATABASE_NAME, hoodieCatalogTable.table.identifier.database.getOrElse("default"))
+   hiveSyncConfig.setDefaultValue(HoodieSyncConfig.META_SYNC_TABLE_NAME, hoodieCatalogTable.table.identifier.table)
+   hiveSyncConfig.setDefaultValue(HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS, classOf[MultiPartKeysValueExtractor].getName)
+   hiveSyncConfig.setDefaultValue(HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE, "true")
+   if (hiveSyncConfig.useBucketSync())
+     hiveSyncConfig.setValue(HiveSyncConfigHolder.HIVE_SYNC_BUCKET_SYNC_SPEC,
+       HiveSyncConfig.getBucketSpec(props.getString(HoodieIndexConfig.BUCKET_INDEX_HASH_FIELD.key),
+         props.getInteger(HoodieIndexConfig.BUCKET_INDEX_NUM_BUCKETS.key)))
+   if (props.containsKey(HiveExternalCatalog.CREATED_SPARK_VERSION))
+     hiveSyncConfig.setValue(HoodieSyncConfig.META_SYNC_SPARK_VERSION,
+       props.getString(HiveExternalCatalog.CREATED_SPARK_VERSION))
    hiveSyncConfig
  }
} }
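
The split between setValue and setDefaultValue above is what lets user-supplied props win over catalog-derived values. A short Java sketch of the intended precedence, assuming setDefaultValue only takes effect when the key is absent (as its use above implies):

import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.hudi.sync.common.HoodieSyncConfig;

public class SyncConfigPrecedenceSketch {
  public static void main(String[] args) {
    TypedProperties props = new TypedProperties();
    props.setProperty(HoodieSyncConfig.META_SYNC_TABLE_NAME.key(), "from_props");

    HiveSyncConfig cfg = new HiveSyncConfig(props);
    cfg.setValue(HoodieSyncConfig.META_SYNC_BASE_PATH, "/tmp/hoodie/tbl");      // always overrides
    cfg.setDefaultValue(HoodieSyncConfig.META_SYNC_TABLE_NAME, "from_catalog"); // no-op: key already set

    // Prints "from_props": the explicit property beats the catalog fallback.
    System.out.println(cfg.getString(HoodieSyncConfig.META_SYNC_TABLE_NAME));
  }
}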


@@ -18,9 +18,6 @@
package org.apache.hudi.cli; package org.apache.hudi.cli;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.DataSourceWriteOptions; import org.apache.hudi.DataSourceWriteOptions;
import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.client.common.HoodieSparkEngineContext; import org.apache.hudi.client.common.HoodieSparkEngineContext;
@@ -38,7 +35,10 @@ import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.hive.HiveSyncConfig; import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.hudi.hive.HiveSyncTool; import org.apache.hudi.hive.HiveSyncTool;
import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.sync.common.HoodieSyncConfig;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager; import org.apache.log4j.LogManager;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
@@ -48,6 +48,12 @@ import java.io.Serializable;
import java.util.HashMap; import java.util.HashMap;
import static org.apache.hudi.common.table.HoodieTableConfig.ARCHIVELOG_FOLDER; import static org.apache.hudi.common.table.HoodieTableConfig.ARCHIVELOG_FOLDER;
import static org.apache.hudi.config.HoodieIndexConfig.BUCKET_INDEX_HASH_FIELD;
import static org.apache.hudi.config.HoodieIndexConfig.BUCKET_INDEX_NUM_BUCKETS;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_BUCKET_SYNC;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_BUCKET_SYNC_SPEC;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH;
/** /**
* Performs bootstrap from a non-hudi source. * Performs bootstrap from a non-hudi source.
@@ -115,7 +121,7 @@ public class BootstrapExecutorUtils implements Serializable {
// Add more defaults if full bootstrap requested // Add more defaults if full bootstrap requested
this.props.putIfAbsent(DataSourceWriteOptions.PAYLOAD_CLASS_NAME().key(), this.props.putIfAbsent(DataSourceWriteOptions.PAYLOAD_CLASS_NAME().key(),
DataSourceWriteOptions.PAYLOAD_CLASS_NAME().defaultValue()); DataSourceWriteOptions.PAYLOAD_CLASS_NAME().defaultValue());
/** /*
* Schema provider that supplies the command for reading the input and writing out the target table. * Schema provider that supplies the command for reading the input and writing out the target table.
*/ */
SchemaProvider schemaProvider = createSchemaProvider(cfg.schemaProviderClass, props, jssc); SchemaProvider schemaProvider = createSchemaProvider(cfg.schemaProviderClass, props, jssc);
@@ -165,14 +171,14 @@ public class BootstrapExecutorUtils implements Serializable {
if (cfg.enableHiveSync) { if (cfg.enableHiveSync) {
TypedProperties metaProps = new TypedProperties(); TypedProperties metaProps = new TypedProperties();
metaProps.putAll(props); metaProps.putAll(props);
metaProps.put(HoodieSyncConfig.META_SYNC_BASE_PATH.key(), cfg.basePath); metaProps.put(META_SYNC_BASE_PATH.key(), cfg.basePath);
metaProps.put(HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT.key(), cfg.baseFileFormat); metaProps.put(META_SYNC_BASE_FILE_FORMAT.key(), cfg.baseFileFormat);
if (props.getBoolean(HiveSyncConfig.HIVE_SYNC_BUCKET_SYNC.key(), HiveSyncConfig.HIVE_SYNC_BUCKET_SYNC.defaultValue())) { if (props.getBoolean(HIVE_SYNC_BUCKET_SYNC.key(), HIVE_SYNC_BUCKET_SYNC.defaultValue())) {
metaProps.put(HiveSyncConfig.HIVE_SYNC_BUCKET_SYNC_SPEC.key(), HiveSyncConfig.getBucketSpec(props.getString(HoodieIndexConfig.BUCKET_INDEX_HASH_FIELD.key()), metaProps.put(HIVE_SYNC_BUCKET_SYNC_SPEC.key(), HiveSyncConfig.getBucketSpec(props.getString(BUCKET_INDEX_HASH_FIELD.key()),
props.getInteger(HoodieIndexConfig.BUCKET_INDEX_NUM_BUCKETS.key()))); props.getInteger(BUCKET_INDEX_NUM_BUCKETS.key())));
} }
new HiveSyncTool(metaProps, configuration, fs).syncHoodieTable(); new HiveSyncTool(metaProps, configuration).syncHoodieTable();
} }
} }
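
Note the entry-point change in the last line of the hunk: HiveSyncTool now takes (Properties, Configuration) and resolves the FileSystem internally instead of receiving it. A hedged Java sketch of the resulting call site, with illustrative paths:

import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.hive.HiveSyncTool;

import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH;

public class SyncToolSketch {
  public static void main(String[] args) {
    TypedProperties metaProps = new TypedProperties();
    metaProps.put(META_SYNC_BASE_PATH.key(), "/data/hoodie/stock_ticks");  // illustrative base path
    metaProps.put(META_SYNC_BASE_FILE_FORMAT.key(), "PARQUET");
    // The FileSystem argument from the old signature is gone; the tool derives it from the base path.
    new HiveSyncTool(metaProps, new Configuration()).syncHoodieTable();
  }
}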


@@ -20,11 +20,11 @@ package org.apache.spark.sql.hudi.command
import org.apache.hadoop.conf.Configuration import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path import org.apache.hadoop.fs.Path
import org.apache.hudi.DataSourceWriteOptions import org.apache.hudi.DataSourceWriteOptions
import org.apache.hudi.hive.HiveSyncConfig import org.apache.hudi.hive.HiveSyncConfigHolder
import org.apache.hudi.sql.InsertMode import org.apache.hudi.sql.InsertMode
import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType, HoodieCatalogTable}
import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable.needFilterProps
import org.apache.hudi.sync.common.util.ConfigUtils import org.apache.hudi.sync.common.util.ConfigUtils
import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable.needFilterProps
import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType, HoodieCatalogTable}
import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.QueryPlan
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project}
import org.apache.spark.sql.hudi.HoodieSqlCommonUtils import org.apache.spark.sql.hudi.HoodieSqlCommonUtils
@@ -94,9 +94,9 @@ case class CreateHoodieTableAsSelectCommand(
val tblProperties = hoodieCatalogTable.catalogProperties val tblProperties = hoodieCatalogTable.catalogProperties
val options = Map( val options = Map(
HiveSyncConfig.HIVE_CREATE_MANAGED_TABLE.key -> (table.tableType == CatalogTableType.MANAGED).toString, HiveSyncConfigHolder.HIVE_CREATE_MANAGED_TABLE.key -> (table.tableType == CatalogTableType.MANAGED).toString,
HiveSyncConfig.HIVE_TABLE_SERDE_PROPERTIES.key -> ConfigUtils.configToString(tblProperties.asJava), HiveSyncConfigHolder.HIVE_TABLE_SERDE_PROPERTIES.key -> ConfigUtils.configToString(tblProperties.asJava),
HiveSyncConfig.HIVE_TABLE_PROPERTIES.key -> ConfigUtils.configToString(newTable.properties.asJava), HiveSyncConfigHolder.HIVE_TABLE_PROPERTIES.key -> ConfigUtils.configToString(newTable.properties.asJava),
DataSourceWriteOptions.SQL_INSERT_MODE.key -> InsertMode.NON_STRICT.value(), DataSourceWriteOptions.SQL_INSERT_MODE.key -> InsertMode.NON_STRICT.value(),
DataSourceWriteOptions.SQL_ENABLE_BULK_INSERT.key -> "true" DataSourceWriteOptions.SQL_ENABLE_BULK_INSERT.key -> "true"
) )
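
The CTAS path above packs table and serde properties into single string-valued options via ConfigUtils.configToString. A small hedged sketch of that packing step (the exact serialized form is an implementation detail; only the intent is shown, and the property values are illustrative):

import java.util.HashMap;
import java.util.Map;

import org.apache.hudi.sync.common.util.ConfigUtils;

public class SerdePropsSketch {
  public static String packTableProps() {
    Map<String, String> tblProps = new HashMap<>();
    tblProps.put("type", "mor");        // illustrative catalog property
    tblProps.put("primaryKey", "id");
    // Serialize the map so it can travel as one writer option value.
    return ConfigUtils.configToString(tblProps);
  }
}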


@@ -22,7 +22,7 @@ import org.apache.hudi.DataSourceWriteOptions._
import org.apache.hudi.common.util.StringUtils import org.apache.hudi.common.util.StringUtils
import org.apache.hudi.config.HoodieWriteConfig import org.apache.hudi.config.HoodieWriteConfig
import org.apache.hudi.config.HoodieWriteConfig.TBL_NAME import org.apache.hudi.config.HoodieWriteConfig.TBL_NAME
import org.apache.hudi.hive.HiveSyncConfig import org.apache.hudi.hive.HiveSyncConfigHolder
import org.apache.hudi.sync.common.HoodieSyncConfig import org.apache.hudi.sync.common.HoodieSyncConfig
import org.apache.hudi.{AvroConversionUtils, DataSourceWriteOptions, HoodieSparkSqlWriter, SparkAdapterSupport} import org.apache.hudi.{AvroConversionUtils, DataSourceWriteOptions, HoodieSparkSqlWriter, SparkAdapterSupport}
import org.apache.spark.sql._ import org.apache.spark.sql._
@@ -467,13 +467,13 @@ case class MergeIntoHoodieTableCommand(mergeInto: MergeIntoTable) extends Hoodie
KEYGENERATOR_CLASS_NAME.key -> classOf[SqlKeyGenerator].getCanonicalName, KEYGENERATOR_CLASS_NAME.key -> classOf[SqlKeyGenerator].getCanonicalName,
SqlKeyGenerator.ORIGIN_KEYGEN_CLASS_NAME -> tableConfig.getKeyGeneratorClassName, SqlKeyGenerator.ORIGIN_KEYGEN_CLASS_NAME -> tableConfig.getKeyGeneratorClassName,
HoodieSyncConfig.META_SYNC_ENABLED.key -> enableHive.toString, HoodieSyncConfig.META_SYNC_ENABLED.key -> enableHive.toString,
HiveSyncConfig.HIVE_SYNC_ENABLED.key -> enableHive.toString, HiveSyncConfigHolder.HIVE_SYNC_MODE.key -> hiveSyncConfig.getString(HiveSyncConfigHolder.HIVE_SYNC_MODE),
HiveSyncConfig.HIVE_SYNC_MODE.key -> hiveSyncConfig.syncMode, HiveSyncConfigHolder.HIVE_SYNC_ENABLED.key -> enableHive.toString,
HoodieSyncConfig.META_SYNC_DATABASE_NAME.key -> targetTableDb, HoodieSyncConfig.META_SYNC_DATABASE_NAME.key -> targetTableDb,
HoodieSyncConfig.META_SYNC_TABLE_NAME.key -> targetTableName, HoodieSyncConfig.META_SYNC_TABLE_NAME.key -> targetTableName,
HiveSyncConfig.HIVE_SUPPORT_TIMESTAMP_TYPE.key -> hiveSyncConfig.supportTimestamp.toString, HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE.key -> hiveSyncConfig.getBoolean(HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE).toString,
HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key -> tableConfig.getPartitionFieldProp, HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key -> tableConfig.getPartitionFieldProp,
HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key -> hiveSyncConfig.partitionValueExtractorClass, HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key -> hiveSyncConfig.getString(HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS),
HoodieWriteConfig.INSERT_PARALLELISM_VALUE.key -> hoodieProps.getString(HoodieWriteConfig.INSERT_PARALLELISM_VALUE.key, "200"), // set the default parallelism to 200 for sql HoodieWriteConfig.INSERT_PARALLELISM_VALUE.key -> hoodieProps.getString(HoodieWriteConfig.INSERT_PARALLELISM_VALUE.key, "200"), // set the default parallelism to 200 for sql
HoodieWriteConfig.UPSERT_PARALLELISM_VALUE.key -> hoodieProps.getString(HoodieWriteConfig.UPSERT_PARALLELISM_VALUE.key, "200"), HoodieWriteConfig.UPSERT_PARALLELISM_VALUE.key -> hoodieProps.getString(HoodieWriteConfig.UPSERT_PARALLELISM_VALUE.key, "200"),
HoodieWriteConfig.DELETE_PARALLELISM_VALUE.key -> hoodieProps.getString(HoodieWriteConfig.DELETE_PARALLELISM_VALUE.key, "200"), HoodieWriteConfig.DELETE_PARALLELISM_VALUE.key -> hoodieProps.getString(HoodieWriteConfig.DELETE_PARALLELISM_VALUE.key, "200"),


@@ -50,6 +50,12 @@ import java.util.stream.Collectors;
import static org.apache.hudi.common.testutils.RawTripTestPayload.recordsToStrings; import static org.apache.hudi.common.testutils.RawTripTestPayload.recordsToStrings;
import static org.apache.hudi.common.testutils.Transformations.randomSelectAsHoodieKeys; import static org.apache.hudi.common.testutils.Transformations.randomSelectAsHoodieKeys;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_PASS;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_ENABLED;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_URL;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USER;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME;
/** /**
* Sample program that writes & reads hoodie tables via the Spark datasource. * Sample program that writes & reads hoodie tables via the Spark datasource.
@@ -256,12 +262,12 @@ public class HoodieJavaApp {
private DataFrameWriter<Row> updateHiveSyncConfig(DataFrameWriter<Row> writer) { private DataFrameWriter<Row> updateHiveSyncConfig(DataFrameWriter<Row> writer) {
if (enableHiveSync) { if (enableHiveSync) {
LOG.info("Enabling Hive sync to " + hiveJdbcUrl); LOG.info("Enabling Hive sync to " + hiveJdbcUrl);
writer = writer.option(HiveSyncConfig.META_SYNC_TABLE_NAME.key(), hiveTable) writer = writer.option(META_SYNC_TABLE_NAME.key(), hiveTable)
.option(HiveSyncConfig.META_SYNC_DATABASE_NAME.key(), hiveDB) .option(META_SYNC_DATABASE_NAME.key(), hiveDB)
.option(HiveSyncConfig.HIVE_URL.key(), hiveJdbcUrl) .option(HIVE_URL.key(), hiveJdbcUrl)
.option(HiveSyncConfig.HIVE_USER.key(), hiveUser) .option(HIVE_USER.key(), hiveUser)
.option(HiveSyncConfig.HIVE_PASS.key(), hivePass) .option(HIVE_PASS.key(), hivePass)
.option(HiveSyncConfig.HIVE_SYNC_ENABLED.key(), "true"); .option(HIVE_SYNC_ENABLED.key(), "true");
if (nonPartitionedTable) { if (nonPartitionedTable) {
writer = writer writer = writer
.option(HiveSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), .option(HiveSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(),
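
With the static imports above, the writer wiring reduces to plain option keys. A condensed, hedged sketch of the same wiring as a Java helper (names and values illustrative):

import org.apache.spark.sql.DataFrameWriter;
import org.apache.spark.sql.Row;

import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_ENABLED;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_URL;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME;

public class HiveSyncWriterOptions {
  // Attach the minimal hive-sync options to a Spark DataFrameWriter.
  static DataFrameWriter<Row> withHiveSync(DataFrameWriter<Row> writer, String db, String table, String jdbcUrl) {
    return writer
        .option(META_SYNC_DATABASE_NAME.key(), db)
        .option(META_SYNC_TABLE_NAME.key(), table)
        .option(HIVE_URL.key(), jdbcUrl)
        .option(HIVE_SYNC_ENABLED.key(), "true");
  }
}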


@@ -46,6 +46,12 @@ import java.util.ArrayList;
import java.util.List; import java.util.List;
import static org.apache.hudi.common.testutils.RawTripTestPayload.recordsToStrings; import static org.apache.hudi.common.testutils.RawTripTestPayload.recordsToStrings;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_PASS;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_ENABLED;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_URL;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USER;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME;
public class HoodieJavaGenerateApp { public class HoodieJavaGenerateApp {
@Parameter(names = {"--table-path", "-p"}, description = "Path for Hoodie sample table") @Parameter(names = {"--table-path", "-p"}, description = "Path for Hoodie sample table")
@@ -126,12 +132,12 @@ public class HoodieJavaGenerateApp {
private DataFrameWriter<Row> updateHiveSyncConfig(DataFrameWriter<Row> writer) { private DataFrameWriter<Row> updateHiveSyncConfig(DataFrameWriter<Row> writer) {
if (enableHiveSync) { if (enableHiveSync) {
LOG.info("Enabling Hive sync to " + hiveJdbcUrl); LOG.info("Enabling Hive sync to " + hiveJdbcUrl);
writer = writer.option(HiveSyncConfig.META_SYNC_TABLE_NAME.key(), hiveTable) writer = writer.option(META_SYNC_TABLE_NAME.key(), hiveTable)
.option(HiveSyncConfig.META_SYNC_DATABASE_NAME.key(), hiveDB) .option(META_SYNC_DATABASE_NAME.key(), hiveDB)
.option(HiveSyncConfig.HIVE_URL.key(), hiveJdbcUrl) .option(HIVE_URL.key(), hiveJdbcUrl)
.option(HiveSyncConfig.HIVE_USER.key(), hiveUser) .option(HIVE_USER.key(), hiveUser)
.option(HiveSyncConfig.HIVE_PASS.key(), hivePass) .option(HIVE_PASS.key(), hivePass)
.option(HiveSyncConfig.HIVE_SYNC_ENABLED.key(), "true"); .option(HIVE_SYNC_ENABLED.key(), "true");
if (nonPartitionedTable) { if (nonPartitionedTable) {
writer = writer writer = writer
.option(HiveSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), .option(HiveSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(),


@@ -54,6 +54,12 @@ import java.util.concurrent.Future;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import static org.apache.hudi.common.testutils.RawTripTestPayload.recordsToStrings; import static org.apache.hudi.common.testutils.RawTripTestPayload.recordsToStrings;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_PASS;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_ENABLED;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_URL;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USER;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME;
/** /**
* Sample program that writes & reads hoodie tables via the Spark datasource streaming. * Sample program that writes & reads hoodie tables via the Spark datasource streaming.
@@ -383,12 +389,12 @@ public class HoodieJavaStreamingApp {
private DataStreamWriter<Row> updateHiveSyncConfig(DataStreamWriter<Row> writer) { private DataStreamWriter<Row> updateHiveSyncConfig(DataStreamWriter<Row> writer) {
if (enableHiveSync) { if (enableHiveSync) {
LOG.info("Enabling Hive sync to " + hiveJdbcUrl); LOG.info("Enabling Hive sync to " + hiveJdbcUrl);
writer = writer.option(HiveSyncConfig.META_SYNC_TABLE_NAME.key(), hiveTable) writer = writer.option(META_SYNC_TABLE_NAME.key(), hiveTable)
.option(HiveSyncConfig.META_SYNC_DATABASE_NAME.key(), hiveDB) .option(META_SYNC_DATABASE_NAME.key(), hiveDB)
.option(HiveSyncConfig.HIVE_URL.key(), hiveJdbcUrl) .option(HIVE_URL.key(), hiveJdbcUrl)
.option(HiveSyncConfig.HIVE_USER.key(), hiveUser) .option(HIVE_USER.key(), hiveUser)
.option(HiveSyncConfig.HIVE_PASS.key(), hivePass) .option(HIVE_PASS.key(), hivePass)
.option(HiveSyncConfig.HIVE_SYNC_ENABLED.key(), "true"); .option(HIVE_SYNC_ENABLED.key(), "true");
if (useMultiPartitionKeys) { if (useMultiPartitionKeys) {
writer = writer.option(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "year,month,day").option( writer = writer.option(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "year,month,day").option(
HiveSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), HiveSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(),


@@ -18,12 +18,6 @@
package org.apache.hudi; package org.apache.hudi;
import org.apache.avro.Conversions;
import org.apache.avro.LogicalTypes;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericFixed;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.config.TypedProperties;
@@ -37,8 +31,14 @@ import org.apache.hudi.config.HoodieStorageConfig;
import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.execution.bulkinsert.RDDCustomColumnsSortPartitioner; import org.apache.hudi.execution.bulkinsert.RDDCustomColumnsSortPartitioner;
import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.hudi.table.BulkInsertPartitioner; import org.apache.hudi.table.BulkInsertPartitioner;
import org.apache.avro.Conversions;
import org.apache.avro.LogicalTypes;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericFixed;
import org.apache.avro.generic.GenericRecord;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row; import org.apache.spark.sql.Row;
@@ -54,7 +54,6 @@ import org.junit.jupiter.api.extension.ExtendWith;
import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource; import org.junit.jupiter.params.provider.MethodSource;
import org.junit.jupiter.params.provider.ValueSource;
import org.mockito.ArgumentCaptor; import org.mockito.ArgumentCaptor;
import org.mockito.Captor; import org.mockito.Captor;
import org.mockito.Mock; import org.mockito.Mock;
@@ -70,18 +69,13 @@ import java.util.Map;
import java.util.stream.Stream; import java.util.stream.Stream;
import static org.apache.hudi.DataSourceUtils.tryOverrideParquetWriteLegacyFormatProperty; import static org.apache.hudi.DataSourceUtils.tryOverrideParquetWriteLegacyFormatProperty;
import static org.apache.hudi.common.model.HoodieFileFormat.PARQUET;
import static org.apache.hudi.hive.ddl.HiveSyncMode.HMS;
import static org.hamcrest.CoreMatchers.containsString; import static org.hamcrest.CoreMatchers.containsString;
import static org.hamcrest.CoreMatchers.equalTo; import static org.hamcrest.CoreMatchers.equalTo;
import static org.hamcrest.CoreMatchers.instanceOf; import static org.hamcrest.CoreMatchers.instanceOf;
import static org.hamcrest.CoreMatchers.is; import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.MatcherAssert.assertThat;
import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.anyString; import static org.mockito.ArgumentMatchers.anyString;
import static org.mockito.Mockito.times; import static org.mockito.Mockito.times;
@@ -252,29 +246,6 @@ public class TestDataSourceUtils {
}); });
} }
@ParameterizedTest
@ValueSource(booleans = {true, false})
public void testBuildHiveSyncConfig(boolean useSyncMode) {
TypedProperties props = new TypedProperties();
if (useSyncMode) {
props.setProperty(DataSourceWriteOptions.HIVE_SYNC_MODE().key(), HMS.name());
props.setProperty(DataSourceWriteOptions.HIVE_USE_JDBC().key(), String.valueOf(false));
}
props.setProperty(DataSourceWriteOptions.HIVE_DATABASE().key(), HIVE_DATABASE);
props.setProperty(DataSourceWriteOptions.HIVE_TABLE().key(), HIVE_TABLE);
HiveSyncConfig hiveSyncConfig = DataSourceUtils.buildHiveSyncConfig(props, config.getBasePath(), PARQUET.name());
if (useSyncMode) {
assertFalse(hiveSyncConfig.useJdbc);
assertEquals(HMS.name(), hiveSyncConfig.syncMode);
} else {
assertTrue(hiveSyncConfig.useJdbc);
assertNull(hiveSyncConfig.syncMode);
}
assertEquals(HIVE_DATABASE, hiveSyncConfig.databaseName);
assertEquals(HIVE_TABLE, hiveSyncConfig.tableName);
}
private void setAndVerifyHoodieWriteClientWith(final String partitionerClassName) { private void setAndVerifyHoodieWriteClientWith(final String partitionerClassName) {
config = HoodieWriteConfig.newBuilder().withPath(config.getBasePath()) config = HoodieWriteConfig.newBuilder().withPath(config.getBasePath())
.withUserDefinedBulkInsertPartitionerClass(partitionerClassName) .withUserDefinedBulkInsertPartitionerClass(partitionerClassName)
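
The deleted test exercised the old field-based HiveSyncConfig; under the new design the equivalent check goes through the key-based accessors. A hedged Java sketch of that translation (not a restored test, just the accessor-style assertion):

import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.hudi.hive.HiveSyncConfigHolder;

import static org.apache.hudi.hive.ddl.HiveSyncMode.HMS;
import static org.junit.jupiter.api.Assertions.assertEquals;

public class SyncModeAccessorSketch {
  void checkSyncMode() {
    TypedProperties props = new TypedProperties();
    props.setProperty(HiveSyncConfigHolder.HIVE_SYNC_MODE.key(), HMS.name());
    HiveSyncConfig cfg = new HiveSyncConfig(props);
    // hiveSyncConfig.syncMode is gone; read the value back through the ConfigProperty key.
    assertEquals(HMS.name(), cfg.getString(HiveSyncConfigHolder.HIVE_SYNC_MODE));
  }
}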


@@ -1,128 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.sync.adb;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.hive.PartitionValueExtractor;
import org.apache.hudi.hive.SchemaDifference;
import org.apache.hudi.sync.common.AbstractSyncHoodieClient;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public abstract class AbstractAdbSyncHoodieClient extends AbstractSyncHoodieClient {
protected AdbSyncConfig adbSyncConfig;
protected PartitionValueExtractor partitionValueExtractor;
protected HoodieTimeline activeTimeline;
public AbstractAdbSyncHoodieClient(AdbSyncConfig syncConfig, FileSystem fs) {
super(syncConfig.basePath, syncConfig.assumeDatePartitioning,
syncConfig.useFileListingFromMetadata, false, fs);
this.adbSyncConfig = syncConfig;
final String clazz = adbSyncConfig.partitionValueExtractorClass;
try {
this.partitionValueExtractor = (PartitionValueExtractor) Class.forName(clazz).newInstance();
} catch (Exception e) {
throw new HoodieException("Fail to init PartitionValueExtractor class " + clazz, e);
}
activeTimeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
}
public List<PartitionEvent> getPartitionEvents(Map<List<String>, String> tablePartitions,
List<String> partitionStoragePartitions) {
Map<String, String> paths = new HashMap<>();
for (Map.Entry<List<String>, String> entry : tablePartitions.entrySet()) {
List<String> partitionValues = entry.getKey();
String fullTablePartitionPath = entry.getValue();
paths.put(String.join(", ", partitionValues), fullTablePartitionPath);
}
List<PartitionEvent> events = new ArrayList<>();
for (String storagePartition : partitionStoragePartitions) {
Path storagePartitionPath = FSUtils.getPartitionPath(adbSyncConfig.basePath, storagePartition);
String fullStoragePartitionPath = Path.getPathWithoutSchemeAndAuthority(storagePartitionPath).toUri().getPath();
// Check if the partition values or if hdfs path is the same
List<String> storagePartitionValues = partitionValueExtractor.extractPartitionValuesInPath(storagePartition);
if (adbSyncConfig.useHiveStylePartitioning) {
String partition = String.join("/", storagePartitionValues);
storagePartitionPath = FSUtils.getPartitionPath(adbSyncConfig.basePath, partition);
fullStoragePartitionPath = Path.getPathWithoutSchemeAndAuthority(storagePartitionPath).toUri().getPath();
}
if (!storagePartitionValues.isEmpty()) {
String storageValue = String.join(", ", storagePartitionValues);
if (!paths.containsKey(storageValue)) {
events.add(PartitionEvent.newPartitionAddEvent(storagePartition));
} else if (!paths.get(storageValue).equals(fullStoragePartitionPath)) {
events.add(PartitionEvent.newPartitionUpdateEvent(storagePartition));
}
}
}
return events;
}
public void close() {
}
public abstract Map<List<String>, String> scanTablePartitions(String tableName) throws Exception;
public abstract void updateTableDefinition(String tableName, SchemaDifference schemaDiff) throws Exception;
public abstract boolean databaseExists(String databaseName) throws Exception;
public abstract void createDatabase(String databaseName) throws Exception;
public abstract void dropTable(String tableName);
protected String getDatabasePath() {
String dbLocation = adbSyncConfig.dbLocation;
Path dbLocationPath;
if (StringUtils.isNullOrEmpty(dbLocation)) {
if (new Path(adbSyncConfig.basePath).isRoot()) {
dbLocationPath = new Path(adbSyncConfig.basePath);
} else {
dbLocationPath = new Path(adbSyncConfig.basePath).getParent();
}
} else {
dbLocationPath = new Path(dbLocation);
}
return generateAbsolutePathStr(dbLocationPath);
}
protected String generateAbsolutePathStr(Path path) {
String absolutePathStr = path.toString();
if (path.toUri().getScheme() == null) {
absolutePathStr = getDefaultFs() + absolutePathStr;
}
return absolutePathStr.endsWith("/") ? absolutePathStr : absolutePathStr + "/";
}
protected String getDefaultFs() {
return fs.getConf().get("fs.defaultFS");
}
}
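
The deleted client's getPartitionEvents carries the core diffing idea: a storage partition absent from the synced map is an add, and one whose resolved path changed is an update. A standalone, simplified Java sketch of just that comparison (the real code also handles hive-style partitioning and path normalization):

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

public class PartitionDiffSketch {
  // synced: partition values -> full path known to the catalog
  // onStorage: partition values -> full path found on storage
  static List<String> diffPartitions(Map<String, String> synced, Map<String, String> onStorage) {
    List<String> events = new ArrayList<>();
    for (Map.Entry<String, String> e : onStorage.entrySet()) {
      if (!synced.containsKey(e.getKey())) {
        events.add("ADD " + e.getKey());
      } else if (!synced.get(e.getKey()).equals(e.getValue())) {
        events.add("UPDATE " + e.getKey());
      }
    }
    return events;
  }
}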


@@ -20,62 +20,19 @@ package org.apache.hudi.sync.adb;
import org.apache.hudi.common.config.ConfigProperty; import org.apache.hudi.common.config.ConfigProperty;
import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.sync.common.HoodieSyncConfig; import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.hive.HiveSyncConfig;
import com.beust.jcommander.Parameter; import com.beust.jcommander.Parameter;
import com.beust.jcommander.ParametersDelegate;
import org.apache.hadoop.fs.Path;
import java.util.Properties;
/** /**
* Configs needed to sync data into Alibaba Cloud AnalyticDB(ADB). * Configs needed to sync data into Alibaba Cloud AnalyticDB(ADB).
*/ */
public class AdbSyncConfig extends HoodieSyncConfig { public class AdbSyncConfig extends HiveSyncConfig {
@Parameter(names = {"--user"}, description = "Adb username", required = true)
public String adbUser;
@Parameter(names = {"--pass"}, description = "Adb password", required = true)
public String adbPass;
@Parameter(names = {"--jdbc-url"}, description = "Adb jdbc connect url", required = true)
public String jdbcUrl;
@Parameter(names = {"--skip-ro-suffix"}, description = "Whether skip the `_ro` suffix for read optimized table when syncing")
public Boolean skipROSuffix;
@Parameter(names = {"--skip-rt-sync"}, description = "Whether skip the rt table when syncing")
public Boolean skipRTSync;
@Parameter(names = {"--hive-style-partitioning"}, description = "Whether use hive style partitioning, true if like the following style: field1=value1/field2=value2")
public Boolean useHiveStylePartitioning;
@Parameter(names = {"--support-timestamp"}, description = "If true, converts int64(timestamp_micros) to timestamp type")
public Boolean supportTimestamp;
@Parameter(names = {"--spark-datasource"}, description = "Whether sync this table as spark data source table")
public Boolean syncAsSparkDataSourceTable;
@Parameter(names = {"--table-properties"}, description = "Table properties, to support read hoodie table as datasource table", required = true)
public String tableProperties;
@Parameter(names = {"--serde-properties"}, description = "Serde properties, to support read hoodie table as datasource table", required = true)
public String serdeProperties;
@Parameter(names = {"--spark-schema-length-threshold"}, description = "The maximum length allowed in a single cell when storing additional schema information in Hive's metastore")
public int sparkSchemaLengthThreshold;
@Parameter(names = {"--db-location"}, description = "Database location")
public String dbLocation;
@Parameter(names = {"--auto-create-database"}, description = "Whether auto create adb database")
public Boolean autoCreateDatabase = true;
@Parameter(names = {"--skip-last-commit-time-sync"}, description = "Whether skip last commit time syncing")
public Boolean skipLastCommitTimeSync = false;
@Parameter(names = {"--drop-table-before-creation"}, description = "Whether drop table before creation")
public Boolean dropTableBeforeCreation = false;
@Parameter(names = {"--help", "-h"}, help = true)
public Boolean help = false;
public static final ConfigProperty<String> ADB_SYNC_USER = ConfigProperty public static final ConfigProperty<String> ADB_SYNC_USER = ConfigProperty
.key("hoodie.datasource.adb.sync.username") .key("hoodie.datasource.adb.sync.username")
@@ -152,89 +109,101 @@ public class AdbSyncConfig extends HoodieSyncConfig {
.defaultValue(false) .defaultValue(false)
.withDocumentation("Whether drop table before creation"); .withDocumentation("Whether drop table before creation");
public AdbSyncConfig() { public AdbSyncConfig(Properties props) {
this(new TypedProperties());
}
public AdbSyncConfig(TypedProperties props) {
super(props); super(props);
adbUser = getString(ADB_SYNC_USER);
adbPass = getString(ADB_SYNC_PASS);
jdbcUrl = getString(ADB_SYNC_JDBC_URL);
skipROSuffix = getBooleanOrDefault(ADB_SYNC_SKIP_RO_SUFFIX);
skipRTSync = getBooleanOrDefault(ADB_SYNC_SKIP_RT_SYNC);
useHiveStylePartitioning = getBooleanOrDefault(ADB_SYNC_USE_HIVE_STYLE_PARTITIONING);
supportTimestamp = getBooleanOrDefault(ADB_SYNC_SUPPORT_TIMESTAMP);
syncAsSparkDataSourceTable = getBooleanOrDefault(ADB_SYNC_SYNC_AS_SPARK_DATA_SOURCE_TABLE);
tableProperties = getString(ADB_SYNC_TABLE_PROPERTIES);
serdeProperties = getString(ADB_SYNC_SERDE_PROPERTIES);
sparkSchemaLengthThreshold = getIntOrDefault(ADB_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD);
dbLocation = getString(ADB_SYNC_DB_LOCATION);
autoCreateDatabase = getBooleanOrDefault(ADB_SYNC_AUTO_CREATE_DATABASE);
skipLastCommitTimeSync = getBooleanOrDefault(ADB_SYNC_SKIP_LAST_COMMIT_TIME_SYNC);
dropTableBeforeCreation = getBooleanOrDefault(ADB_SYNC_DROP_TABLE_BEFORE_CREATION);
}
public static TypedProperties toProps(AdbSyncConfig cfg) {
TypedProperties properties = new TypedProperties();
properties.put(META_SYNC_DATABASE_NAME.key(), cfg.databaseName);
properties.put(META_SYNC_TABLE_NAME.key(), cfg.tableName);
properties.put(ADB_SYNC_USER.key(), cfg.adbUser);
properties.put(ADB_SYNC_PASS.key(), cfg.adbPass);
properties.put(ADB_SYNC_JDBC_URL.key(), cfg.jdbcUrl);
properties.put(META_SYNC_BASE_PATH.key(), cfg.basePath);
properties.put(META_SYNC_PARTITION_FIELDS.key(), String.join(",", cfg.partitionFields));
properties.put(META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), cfg.partitionValueExtractorClass);
properties.put(META_SYNC_ASSUME_DATE_PARTITION.key(), String.valueOf(cfg.assumeDatePartitioning));
properties.put(ADB_SYNC_SKIP_RO_SUFFIX.key(), String.valueOf(cfg.skipROSuffix));
properties.put(ADB_SYNC_SKIP_RT_SYNC.key(), String.valueOf(cfg.skipRTSync));
properties.put(ADB_SYNC_USE_HIVE_STYLE_PARTITIONING.key(), String.valueOf(cfg.useHiveStylePartitioning));
properties.put(META_SYNC_USE_FILE_LISTING_FROM_METADATA.key(), String.valueOf(cfg.useFileListingFromMetadata));
properties.put(ADB_SYNC_SUPPORT_TIMESTAMP.key(), String.valueOf(cfg.supportTimestamp));
properties.put(ADB_SYNC_TABLE_PROPERTIES.key(), cfg.tableProperties);
properties.put(ADB_SYNC_SERDE_PROPERTIES.key(), cfg.serdeProperties);
properties.put(ADB_SYNC_SYNC_AS_SPARK_DATA_SOURCE_TABLE.key(), String.valueOf(cfg.syncAsSparkDataSourceTable));
properties.put(ADB_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD.key(), String.valueOf(cfg.sparkSchemaLengthThreshold));
properties.put(META_SYNC_SPARK_VERSION.key(), cfg.sparkVersion);
properties.put(ADB_SYNC_DB_LOCATION.key(), cfg.dbLocation);
properties.put(ADB_SYNC_AUTO_CREATE_DATABASE.key(), String.valueOf(cfg.autoCreateDatabase));
properties.put(ADB_SYNC_SKIP_LAST_COMMIT_TIME_SYNC.key(), String.valueOf(cfg.skipLastCommitTimeSync));
properties.put(ADB_SYNC_DROP_TABLE_BEFORE_CREATION.key(), String.valueOf(cfg.dropTableBeforeCreation));
return properties;
} }
-   @Override
-   public String toString() {
-     return "AdbSyncConfig{"
-         + "adbUser='" + adbUser + '\''
-         + ", adbPass='" + adbPass + '\''
-         + ", jdbcUrl='" + jdbcUrl + '\''
-         + ", skipROSuffix=" + skipROSuffix
-         + ", skipRTSync=" + skipRTSync
-         + ", useHiveStylePartitioning=" + useHiveStylePartitioning
-         + ", supportTimestamp=" + supportTimestamp
-         + ", syncAsSparkDataSourceTable=" + syncAsSparkDataSourceTable
-         + ", tableProperties='" + tableProperties + '\''
-         + ", serdeProperties='" + serdeProperties + '\''
-         + ", sparkSchemaLengthThreshold=" + sparkSchemaLengthThreshold
-         + ", dbLocation='" + dbLocation + '\''
-         + ", autoCreateDatabase=" + autoCreateDatabase
-         + ", skipLastCommitTimeSync=" + skipLastCommitTimeSync
-         + ", dropTableBeforeCreation=" + dropTableBeforeCreation
-         + ", help=" + help
-         + ", databaseName='" + databaseName + '\''
-         + ", tableName='" + tableName + '\''
-         + ", basePath='" + basePath + '\''
-         + ", baseFileFormat='" + baseFileFormat + '\''
-         + ", partitionFields=" + partitionFields
-         + ", partitionValueExtractorClass='" + partitionValueExtractorClass + '\''
-         + ", assumeDatePartitioning=" + assumeDatePartitioning
-         + ", decodePartition=" + decodePartition
-         + ", useFileListingFromMetadata=" + useFileListingFromMetadata
-         + ", isConditionalSync=" + isConditionalSync
-         + ", sparkVersion='" + sparkVersion + '\''
-         + '}';
-   }
+   @Override
+   public String getAbsoluteBasePath() {
+     return generateAbsolutePathStr(new Path(getString(META_SYNC_BASE_PATH)));
+   }
+
+   public String getDatabasePath() {
+     Path basePath = new Path(getString(META_SYNC_BASE_PATH));
+     Path dbLocationPath;
+     String dbLocation = getString(ADB_SYNC_DB_LOCATION);
+     if (StringUtils.isNullOrEmpty(dbLocation)) {
+       if (basePath.isRoot()) {
+         dbLocationPath = basePath;
+       } else {
+         dbLocationPath = basePath.getParent();
+       }
+     } else {
+       dbLocationPath = new Path(dbLocation);
+     }
+     return generateAbsolutePathStr(dbLocationPath);
+   }
+
+   public String generateAbsolutePathStr(Path path) {
+     String absolutePathStr = path.toString();
+     if (path.toUri().getScheme() == null) {
+       absolutePathStr = getDefaultFs() + absolutePathStr;
+     }
+     return absolutePathStr.endsWith("/") ? absolutePathStr : absolutePathStr + "/";
+   }
+
+   public String getDefaultFs() {
+     return getHadoopConf().get("fs.defaultFS");
+   }
public static class AdbSyncConfigParams {
@ParametersDelegate()
public HiveSyncConfig.HiveSyncConfigParams hiveSyncConfigParams = new HiveSyncConfig.HiveSyncConfigParams();
@Parameter(names = {"--support-timestamp"}, description = "If true, converts int64(timestamp_micros) to timestamp type")
public Boolean supportTimestamp;
@Parameter(names = {"--spark-datasource"}, description = "Whether sync this table as spark data source table")
public Boolean syncAsSparkDataSourceTable;
@Parameter(names = {"--table-properties"}, description = "Table properties, to support read hoodie table as datasource table", required = true)
public String tableProperties;
@Parameter(names = {"--serde-properties"}, description = "Serde properties, to support read hoodie table as datasource table", required = true)
public String serdeProperties;
@Parameter(names = {"--spark-schema-length-threshold"}, description = "The maximum length allowed in a single cell when storing additional schema information in Hive's metastore")
public int sparkSchemaLengthThreshold;
@Parameter(names = {"--hive-style-partitioning"}, description = "Whether use hive style partitioning, true if like the following style: field1=value1/field2=value2")
public Boolean useHiveStylePartitioning;
@Parameter(names = {"--skip-rt-sync"}, description = "Whether skip the rt table when syncing")
public Boolean skipRTSync;
@Parameter(names = {"--db-location"}, description = "Database location")
public String dbLocation;
@Parameter(names = {"--auto-create-database"}, description = "Whether auto create adb database")
public Boolean autoCreateDatabase = true;
@Parameter(names = {"--skip-last-commit-time-sync"}, description = "Whether skip last commit time syncing")
public Boolean skipLastCommitTimeSync = false;
@Parameter(names = {"--drop-table-before-creation"}, description = "Whether drop table before creation")
public Boolean dropTableBeforeCreation = false;
public boolean isHelp() {
return hiveSyncConfigParams.isHelp();
}
public TypedProperties toProps() {
final TypedProperties props = hiveSyncConfigParams.toProps();
props.setPropertyIfNonNull(META_SYNC_DATABASE_NAME.key(), hiveSyncConfigParams.hoodieSyncConfigParams.databaseName);
props.setPropertyIfNonNull(META_SYNC_TABLE_NAME.key(), hiveSyncConfigParams.hoodieSyncConfigParams.tableName);
props.setPropertyIfNonNull(ADB_SYNC_USER.key(), hiveSyncConfigParams.hiveUser);
props.setPropertyIfNonNull(ADB_SYNC_PASS.key(), hiveSyncConfigParams.hivePass);
props.setPropertyIfNonNull(ADB_SYNC_JDBC_URL.key(), hiveSyncConfigParams.jdbcUrl);
props.setPropertyIfNonNull(META_SYNC_BASE_PATH.key(), hiveSyncConfigParams.hoodieSyncConfigParams.basePath);
props.setPropertyIfNonNull(META_SYNC_PARTITION_FIELDS.key(), String.join(",", hiveSyncConfigParams.hoodieSyncConfigParams.partitionFields));
props.setPropertyIfNonNull(META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), hiveSyncConfigParams.hoodieSyncConfigParams.partitionValueExtractorClass);
props.setPropertyIfNonNull(META_SYNC_ASSUME_DATE_PARTITION.key(), String.valueOf(hiveSyncConfigParams.hoodieSyncConfigParams.assumeDatePartitioning));
props.setPropertyIfNonNull(ADB_SYNC_SKIP_RO_SUFFIX.key(), String.valueOf(hiveSyncConfigParams.skipROSuffix));
props.setPropertyIfNonNull(ADB_SYNC_SKIP_RT_SYNC.key(), String.valueOf(skipRTSync));
props.setPropertyIfNonNull(ADB_SYNC_USE_HIVE_STYLE_PARTITIONING.key(), String.valueOf(useHiveStylePartitioning));
props.setPropertyIfNonNull(META_SYNC_USE_FILE_LISTING_FROM_METADATA.key(), String.valueOf(hiveSyncConfigParams.hoodieSyncConfigParams.useFileListingFromMetadata));
props.setPropertyIfNonNull(ADB_SYNC_SUPPORT_TIMESTAMP.key(), String.valueOf(supportTimestamp));
props.setPropertyIfNonNull(ADB_SYNC_TABLE_PROPERTIES.key(), tableProperties);
props.setPropertyIfNonNull(ADB_SYNC_SERDE_PROPERTIES.key(), serdeProperties);
props.setPropertyIfNonNull(ADB_SYNC_SYNC_AS_SPARK_DATA_SOURCE_TABLE.key(), String.valueOf(syncAsSparkDataSourceTable));
props.setPropertyIfNonNull(ADB_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD.key(), String.valueOf(sparkSchemaLengthThreshold));
props.setPropertyIfNonNull(META_SYNC_SPARK_VERSION.key(), hiveSyncConfigParams.hoodieSyncConfigParams.sparkVersion);
props.setPropertyIfNonNull(ADB_SYNC_DB_LOCATION.key(), dbLocation);
props.setPropertyIfNonNull(ADB_SYNC_AUTO_CREATE_DATABASE.key(), String.valueOf(autoCreateDatabase));
props.setPropertyIfNonNull(ADB_SYNC_SKIP_LAST_COMMIT_TIME_SYNC.key(), String.valueOf(skipLastCommitTimeSync));
props.setPropertyIfNonNull(ADB_SYNC_DROP_TABLE_BEFORE_CREATION.key(), String.valueOf(dropTableBeforeCreation));
return props;
}
} }
} }
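
The params-delegate pattern above means the CLI layer only produces TypedProperties, and every config object is built from them. A hedged sketch of the resulting wiring (argument handling trimmed; assumes the standard JCommander builder API):

import com.beust.jcommander.JCommander;
import org.apache.hudi.sync.adb.AdbSyncConfig;

public class AdbCliSketch {
  public static void main(String[] args) {
    AdbSyncConfig.AdbSyncConfigParams params = new AdbSyncConfig.AdbSyncConfigParams();
    JCommander cmd = JCommander.newBuilder().addObject(params).build();
    cmd.parse(args);
    if (params.isHelp()) {
      cmd.usage();
      return;
    }
    // CLI flags flow into properties; the config object is just a typed view over them.
    AdbSyncConfig config = new AdbSyncConfig(params.toProps());
  }
}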


@@ -18,22 +18,19 @@
package org.apache.hudi.sync.adb; package org.apache.hudi.sync.adb;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.Option;
import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils;
import org.apache.hudi.hive.SchemaDifference; import org.apache.hudi.hive.SchemaDifference;
import org.apache.hudi.hive.util.HiveSchemaUtil; import org.apache.hudi.hive.util.HiveSchemaUtil;
import org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent; import org.apache.hudi.sync.common.HoodieSyncTool;
import org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent.PartitionEventType; import org.apache.hudi.sync.common.model.PartitionEvent;
import org.apache.hudi.sync.common.AbstractSyncTool; import org.apache.hudi.sync.common.model.PartitionEvent.PartitionEventType;
import org.apache.hudi.sync.common.util.ConfigUtils; import org.apache.hudi.sync.common.util.ConfigUtils;
import org.apache.hudi.sync.common.util.SparkDataSourceTableUtils;
import com.beust.jcommander.JCommander; import com.beust.jcommander.JCommander;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat; import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat;
import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe; import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe;
import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.MessageType;
@@ -43,8 +40,25 @@ import org.slf4j.LoggerFactory;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Properties;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_AUTO_CREATE_DATABASE;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_DROP_TABLE_BEFORE_CREATION;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_SERDE_PROPERTIES;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_SKIP_LAST_COMMIT_TIME_SYNC;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_SKIP_RO_SUFFIX;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_SKIP_RT_SYNC;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_SUPPORT_TIMESTAMP;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_SYNC_AS_SPARK_DATA_SOURCE_TABLE;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_TABLE_PROPERTIES;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_SPARK_VERSION;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME;
/** /**
* Adb sync tool is mainly used to sync hoodie tables to Alibaba Cloud AnalyticDB(ADB), * Adb sync tool is mainly used to sync hoodie tables to Alibaba Cloud AnalyticDB(ADB),
* it can be used as API `AdbSyncTool.syncHoodieTable(AdbSyncConfig)` or as command * it can be used as API `AdbSyncTool.syncHoodieTable(AdbSyncConfig)` or as command
@@ -55,45 +69,52 @@ import java.util.stream.Collectors;
* incremental partitions will be synced as well. * incremental partitions will be synced as well.
*/ */
@SuppressWarnings("WeakerAccess") @SuppressWarnings("WeakerAccess")
public class AdbSyncTool extends AbstractSyncTool { public class AdbSyncTool extends HoodieSyncTool {
private static final Logger LOG = LoggerFactory.getLogger(AdbSyncTool.class); private static final Logger LOG = LoggerFactory.getLogger(AdbSyncTool.class);
public static final String SUFFIX_SNAPSHOT_TABLE = "_rt"; public static final String SUFFIX_SNAPSHOT_TABLE = "_rt";
public static final String SUFFIX_READ_OPTIMIZED_TABLE = "_ro"; public static final String SUFFIX_READ_OPTIMIZED_TABLE = "_ro";
private final AdbSyncConfig adbSyncConfig; private final AdbSyncConfig config;
private final AbstractAdbSyncHoodieClient hoodieAdbClient; private final String databaseName;
private final String tableName;
private final HoodieAdbJdbcClient syncClient;
private final String snapshotTableName; private final String snapshotTableName;
private final Option<String> roTableTableName; private final Option<String> roTableTableName;
public AdbSyncTool(TypedProperties props, Configuration conf, FileSystem fs) { public AdbSyncTool(Properties props) {
super(props, conf, fs); super(props);
this.adbSyncConfig = new AdbSyncConfig(props); this.config = new AdbSyncConfig(props);
this.hoodieAdbClient = getHoodieAdbClient(adbSyncConfig, fs); this.databaseName = config.getString(META_SYNC_DATABASE_NAME);
switch (hoodieAdbClient.getTableType()) { this.tableName = config.getString(META_SYNC_TABLE_NAME);
this.syncClient = new HoodieAdbJdbcClient(config);
switch (syncClient.getTableType()) {
case COPY_ON_WRITE: case COPY_ON_WRITE:
this.snapshotTableName = adbSyncConfig.tableName; this.snapshotTableName = tableName;
this.roTableTableName = Option.empty(); this.roTableTableName = Option.empty();
break; break;
case MERGE_ON_READ: case MERGE_ON_READ:
this.snapshotTableName = adbSyncConfig.tableName + SUFFIX_SNAPSHOT_TABLE; this.snapshotTableName = tableName + SUFFIX_SNAPSHOT_TABLE;
this.roTableTableName = adbSyncConfig.skipROSuffix ? Option.of(adbSyncConfig.tableName) this.roTableTableName = config.getBoolean(ADB_SYNC_SKIP_RO_SUFFIX) ? Option.of(tableName)
: Option.of(adbSyncConfig.tableName + SUFFIX_READ_OPTIMIZED_TABLE); : Option.of(tableName + SUFFIX_READ_OPTIMIZED_TABLE);
break; break;
default: default:
throw new HoodieAdbSyncException("Unknown table type:" + hoodieAdbClient.getTableType() throw new HoodieAdbSyncException("Unknown table type:" + syncClient.getTableType()
+ ", basePath:" + hoodieAdbClient.getBasePath()); + ", basePath:" + syncClient.getBasePath());
} }
} }
private AbstractAdbSyncHoodieClient getHoodieAdbClient(AdbSyncConfig adbSyncConfig, FileSystem fs) { @Override
return new HoodieAdbJdbcClient(adbSyncConfig, fs); public void close() {
if (syncClient != null) {
syncClient.close();
}
} }
@Override @Override
public void syncHoodieTable() { public void syncHoodieTable() {
try { try {
switch (hoodieAdbClient.getTableType()) { switch (syncClient.getTableType()) {
case COPY_ON_WRITE: case COPY_ON_WRITE:
syncHoodieTable(snapshotTableName, false, false); syncHoodieTable(snapshotTableName, false, false);
break; break;
@@ -101,39 +122,38 @@ public class AdbSyncTool extends AbstractSyncTool {
// Sync a ro table for MOR table // Sync a ro table for MOR table
syncHoodieTable(roTableTableName.get(), false, true); syncHoodieTable(roTableTableName.get(), false, true);
// Sync a rt table for MOR table // Sync a rt table for MOR table
if (!adbSyncConfig.skipRTSync) { if (!config.getBoolean(ADB_SYNC_SKIP_RT_SYNC)) {
syncHoodieTable(snapshotTableName, true, false); syncHoodieTable(snapshotTableName, true, false);
} }
break; break;
default: default:
throw new HoodieAdbSyncException("Unknown table type:" + hoodieAdbClient.getTableType() throw new HoodieAdbSyncException("Unknown table type:" + syncClient.getTableType()
+ ", basePath:" + hoodieAdbClient.getBasePath()); + ", basePath:" + syncClient.getBasePath());
} }
} catch (Exception re) { } catch (Exception re) {
throw new HoodieAdbSyncException("Sync hoodie table to ADB failed, tableName:" + adbSyncConfig.tableName, re); throw new HoodieAdbSyncException("Sync hoodie table to ADB failed, tableName:" + tableName, re);
} finally { } finally {
hoodieAdbClient.close(); syncClient.close();
} }
} }
private void syncHoodieTable(String tableName, boolean useRealtimeInputFormat, private void syncHoodieTable(String tableName, boolean useRealtimeInputFormat, boolean readAsOptimized) throws Exception {
boolean readAsOptimized) throws Exception {
LOG.info("Try to sync hoodie table, tableName:{}, path:{}, tableType:{}", LOG.info("Try to sync hoodie table, tableName:{}, path:{}, tableType:{}",
tableName, hoodieAdbClient.getBasePath(), hoodieAdbClient.getTableType()); tableName, syncClient.getBasePath(), syncClient.getTableType());
if (adbSyncConfig.autoCreateDatabase) { if (config.getBoolean(ADB_SYNC_AUTO_CREATE_DATABASE)) {
try { try {
synchronized (AdbSyncTool.class) { synchronized (AdbSyncTool.class) {
if (!hoodieAdbClient.databaseExists(adbSyncConfig.databaseName)) { if (!syncClient.databaseExists(databaseName)) {
hoodieAdbClient.createDatabase(adbSyncConfig.databaseName); syncClient.createDatabase(databaseName);
} }
} }
} catch (Exception e) { } catch (Exception e) {
throw new HoodieAdbSyncException("Failed to create database:" + adbSyncConfig.databaseName throw new HoodieAdbSyncException("Failed to create database:" + databaseName
+ ", useRealtimeInputFormat = " + useRealtimeInputFormat, e); + ", useRealtimeInputFormat = " + useRealtimeInputFormat, e);
} }
} else if (!hoodieAdbClient.databaseExists(adbSyncConfig.databaseName)) { } else if (!syncClient.databaseExists(databaseName)) {
throw new HoodieAdbSyncException("ADB database does not exists:" + adbSyncConfig.databaseName); throw new HoodieAdbSyncException("ADB database does not exists:" + databaseName);
} }
// Currently HoodieBootstrapRelation does not support reading bootstrap MOR rt table, // Currently HoodieBootstrapRelation does not support reading bootstrap MOR rt table,
@@ -141,22 +161,22 @@ public class AdbSyncTool extends AbstractSyncTool {
// by the data source way (which will use the HoodieBootstrapRelation). // by the data source way (which will use the HoodieBootstrapRelation).
// TODO after we support bootstrap MOR rt table in HoodieBootstrapRelation[HUDI-2071], // TODO after we support bootstrap MOR rt table in HoodieBootstrapRelation[HUDI-2071],
// we can remove this logic. // we can remove this logic.
if (hoodieAdbClient.isBootstrap() if (syncClient.isBootstrap()
&& hoodieAdbClient.getTableType() == HoodieTableType.MERGE_ON_READ && syncClient.getTableType() == HoodieTableType.MERGE_ON_READ
&& !readAsOptimized) { && !readAsOptimized) {
adbSyncConfig.syncAsSparkDataSourceTable = false; config.setValue(ADB_SYNC_SYNC_AS_SPARK_DATA_SOURCE_TABLE, "false");
LOG.info("Disable sync as spark datasource table for mor rt table:{}", tableName); LOG.info("Disable sync as spark datasource table for mor rt table:{}", tableName);
} }
if (adbSyncConfig.dropTableBeforeCreation) { if (config.getBoolean(ADB_SYNC_DROP_TABLE_BEFORE_CREATION)) {
LOG.info("Drop table before creation, tableName:{}", tableName); LOG.info("Drop table before creation, tableName:{}", tableName);
hoodieAdbClient.dropTable(tableName); syncClient.dropTable(tableName);
} }
boolean tableExists = hoodieAdbClient.tableExists(tableName); boolean tableExists = syncClient.tableExists(tableName);
// Get the parquet schema for this table looking at the latest commit // Get the parquet schema for this table looking at the latest commit
MessageType schema = hoodieAdbClient.getDataSchema(); MessageType schema = syncClient.getStorageSchema();
// Sync schema if needed // Sync schema if needed
syncSchema(tableName, tableExists, useRealtimeInputFormat, readAsOptimized, schema); syncSchema(tableName, tableExists, useRealtimeInputFormat, readAsOptimized, schema);
@@ -165,16 +185,16 @@ public class AdbSyncTool extends AbstractSyncTool {
// Get the last time we successfully synced partitions // Get the last time we successfully synced partitions
Option<String> lastCommitTimeSynced = Option.empty(); Option<String> lastCommitTimeSynced = Option.empty();
if (tableExists) { if (tableExists) {
lastCommitTimeSynced = hoodieAdbClient.getLastCommitTimeSynced(tableName); lastCommitTimeSynced = syncClient.getLastCommitTimeSynced(tableName);
} }
LOG.info("Last commit time synced was found:{}", lastCommitTimeSynced.orElse("null")); LOG.info("Last commit time synced was found:{}", lastCommitTimeSynced.orElse("null"));
// Scan synced partitions // Scan synced partitions
List<String> writtenPartitionsSince; List<String> writtenPartitionsSince;
if (adbSyncConfig.partitionFields.isEmpty()) { if (config.getSplitStrings(META_SYNC_PARTITION_FIELDS).isEmpty()) {
writtenPartitionsSince = new ArrayList<>(); writtenPartitionsSince = new ArrayList<>();
} else { } else {
writtenPartitionsSince = hoodieAdbClient.getPartitionsWrittenToSince(lastCommitTimeSynced); writtenPartitionsSince = syncClient.getPartitionsWrittenToSince(lastCommitTimeSynced);
} }
LOG.info("Scan partitions complete, partitionNum:{}", writtenPartitionsSince.size()); LOG.info("Scan partitions complete, partitionNum:{}", writtenPartitionsSince.size());
@@ -183,8 +203,8 @@ public class AdbSyncTool extends AbstractSyncTool {
// Update sync commit time // Update sync commit time
// whether to skip syncing commit time stored in tbl properties, since it is time-consuming. // whether to skip syncing commit time stored in tbl properties, since it is time-consuming.
if (!adbSyncConfig.skipLastCommitTimeSync) { if (!config.getBoolean(ADB_SYNC_SKIP_LAST_COMMIT_TIME_SYNC)) {
hoodieAdbClient.updateLastCommitTimeSynced(tableName); syncClient.updateLastCommitTimeSynced(tableName);
} }
LOG.info("Sync complete for table:{}", tableName); LOG.info("Sync complete for table:{}", tableName);
} }
@@ -200,14 +220,14 @@ public class AdbSyncTool extends AbstractSyncTool {
* @param schema The extracted schema * @param schema The extracted schema
*/ */
private void syncSchema(String tableName, boolean tableExists, boolean useRealTimeInputFormat, private void syncSchema(String tableName, boolean tableExists, boolean useRealTimeInputFormat,
boolean readAsOptimized, MessageType schema) throws Exception { boolean readAsOptimized, MessageType schema) {
// Append spark table properties & serde properties // Append spark table properties & serde properties
Map<String, String> tableProperties = ConfigUtils.toMap(adbSyncConfig.tableProperties); Map<String, String> tableProperties = ConfigUtils.toMap(config.getString(ADB_SYNC_TABLE_PROPERTIES));
Map<String, String> serdeProperties = ConfigUtils.toMap(adbSyncConfig.serdeProperties); Map<String, String> serdeProperties = ConfigUtils.toMap(config.getString(ADB_SYNC_SERDE_PROPERTIES));
if (adbSyncConfig.syncAsSparkDataSourceTable) { if (config.getBoolean(ADB_SYNC_SYNC_AS_SPARK_DATA_SOURCE_TABLE)) {
Map<String, String> sparkTableProperties = getSparkTableProperties(adbSyncConfig.partitionFields, Map<String, String> sparkTableProperties = SparkDataSourceTableUtils.getSparkTableProperties(config.getSplitStrings(META_SYNC_PARTITION_FIELDS),
adbSyncConfig.sparkVersion, adbSyncConfig.sparkSchemaLengthThreshold, schema); config.getString(META_SYNC_SPARK_VERSION), config.getInt(ADB_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD), schema);
Map<String, String> sparkSerdeProperties = getSparkSerdeProperties(readAsOptimized, adbSyncConfig.basePath); Map<String, String> sparkSerdeProperties = SparkDataSourceTableUtils.getSparkSerdeProperties(readAsOptimized, config.getString(META_SYNC_BASE_PATH));
tableProperties.putAll(sparkTableProperties); tableProperties.putAll(sparkTableProperties);
serdeProperties.putAll(sparkSerdeProperties); serdeProperties.putAll(sparkSerdeProperties);
LOG.info("Sync as spark datasource table, tableName:{}, tableExists:{}, tableProperties:{}, sederProperties:{}", LOG.info("Sync as spark datasource table, tableName:{}, tableExists:{}, tableProperties:{}, sederProperties:{}",
@@ -222,16 +242,16 @@ public class AdbSyncTool extends AbstractSyncTool {
// Custom serde will not work with ALTER TABLE REPLACE COLUMNS // Custom serde will not work with ALTER TABLE REPLACE COLUMNS
// https://github.com/apache/hive/blob/release-1.1.0/ql/src/java/org/apache/hadoop/hive // https://github.com/apache/hive/blob/release-1.1.0/ql/src/java/org/apache/hadoop/hive
// /ql/exec/DDLTask.java#L3488 // /ql/exec/DDLTask.java#L3488
hoodieAdbClient.createTable(tableName, schema, inputFormatClassName, MapredParquetOutputFormat.class.getName(), syncClient.createTable(tableName, schema, inputFormatClassName, MapredParquetOutputFormat.class.getName(),
ParquetHiveSerDe.class.getName(), serdeProperties, tableProperties); ParquetHiveSerDe.class.getName(), serdeProperties, tableProperties);
} else { } else {
// Check if the table schema has evolved // Check if the table schema has evolved
Map<String, String> tableSchema = hoodieAdbClient.getTableSchema(tableName); Map<String, String> tableSchema = syncClient.getMetastoreSchema(tableName);
SchemaDifference schemaDiff = HiveSchemaUtil.getSchemaDifference(schema, tableSchema, adbSyncConfig.partitionFields, SchemaDifference schemaDiff = HiveSchemaUtil.getSchemaDifference(schema, tableSchema, config.getSplitStrings(META_SYNC_PARTITION_FIELDS),
adbSyncConfig.supportTimestamp); config.getBoolean(ADB_SYNC_SUPPORT_TIMESTAMP));
if (!schemaDiff.isEmpty()) { if (!schemaDiff.isEmpty()) {
LOG.info("Schema difference found for table:{}", tableName); LOG.info("Schema difference found for table:{}", tableName);
hoodieAdbClient.updateTableDefinition(tableName, schemaDiff); syncClient.updateTableDefinition(tableName, schemaDiff);
} else { } else {
LOG.info("No Schema difference for table:{}", tableName); LOG.info("No Schema difference for table:{}", tableName);
} }
@@ -244,19 +264,19 @@ public class AdbSyncTool extends AbstractSyncTool {
*/ */
private void syncPartitions(String tableName, List<String> writtenPartitionsSince) { private void syncPartitions(String tableName, List<String> writtenPartitionsSince) {
try { try {
if (adbSyncConfig.partitionFields.isEmpty()) { if (config.getSplitStrings(META_SYNC_PARTITION_FIELDS).isEmpty()) {
LOG.info("Not a partitioned table."); LOG.info("Not a partitioned table.");
return; return;
} }
Map<List<String>, String> partitions = hoodieAdbClient.scanTablePartitions(tableName); Map<List<String>, String> partitions = syncClient.scanTablePartitions(tableName);
List<PartitionEvent> partitionEvents = hoodieAdbClient.getPartitionEvents(partitions, writtenPartitionsSince); List<PartitionEvent> partitionEvents = syncClient.getPartitionEvents(partitions, writtenPartitionsSince);
List<String> newPartitions = filterPartitions(partitionEvents, PartitionEventType.ADD); List<String> newPartitions = filterPartitions(partitionEvents, PartitionEventType.ADD);
LOG.info("New Partitions:{}", newPartitions); LOG.info("New Partitions:{}", newPartitions);
hoodieAdbClient.addPartitionsToTable(tableName, newPartitions); syncClient.addPartitionsToTable(tableName, newPartitions);
List<String> updatePartitions = filterPartitions(partitionEvents, PartitionEventType.UPDATE); List<String> updatePartitions = filterPartitions(partitionEvents, PartitionEventType.UPDATE);
LOG.info("Changed Partitions:{}", updatePartitions); LOG.info("Changed Partitions:{}", updatePartitions);
hoodieAdbClient.updatePartitionsToTable(tableName, updatePartitions); syncClient.updatePartitionsToTable(tableName, updatePartitions);
} catch (Exception e) { } catch (Exception e) {
throw new HoodieAdbSyncException("Failed to sync partitions for table:" + tableName, e); throw new HoodieAdbSyncException("Failed to sync partitions for table:" + tableName, e);
} }
@@ -268,16 +288,13 @@ public class AdbSyncTool extends AbstractSyncTool {
} }
public static void main(String[] args) { public static void main(String[] args) {
// parse the params final AdbSyncConfig.AdbSyncConfigParams params = new AdbSyncConfig.AdbSyncConfigParams();
final AdbSyncConfig cfg = new AdbSyncConfig(); JCommander cmd = JCommander.newBuilder().addObject(params).build();
JCommander cmd = new JCommander(cfg, null, args); cmd.parse(args);
if (cfg.help || args.length == 0) { if (params.isHelp()) {
cmd.usage(); cmd.usage();
System.exit(1); System.exit(0);
} }
new AdbSyncTool(params.toProps()).syncHoodieTable();
Configuration hadoopConf = new Configuration();
FileSystem fs = FSUtils.getFs(cfg.basePath, hadoopConf);
new AdbSyncTool(AdbSyncConfig.toProps(cfg), hadoopConf, fs).syncHoodieTable();
} }
} }
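With this change the ADB sync tool is constructed from a plain `Properties` object and creates its own client internally, instead of taking `TypedProperties`, a Hadoop `Configuration`, and a `FileSystem`. A minimal sketch of the new programmatic entry point, assuming illustrative values for the base path and JDBC endpoint:

import java.util.Properties;

import org.apache.hudi.sync.adb.AdbSyncTool;

import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_JDBC_URL;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_PASS;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_USER;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME;

public class AdbSyncExample {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty(META_SYNC_BASE_PATH.key(), "/tmp/hudi/stock_ticks");     // illustrative base path
    props.setProperty(META_SYNC_DATABASE_NAME.key(), "default");
    props.setProperty(META_SYNC_TABLE_NAME.key(), "stock_ticks");
    props.setProperty(ADB_SYNC_JDBC_URL.key(), "jdbc:mysql://localhost:3306"); // illustrative endpoint
    props.setProperty(ADB_SYNC_USER.key(), "adb");
    props.setProperty(ADB_SYNC_PASS.key(), "adb");
    // The tool now resolves Configuration/FileSystem itself.
    new AdbSyncTool(props).syncHoodieTable();
  }
}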
@@ -23,12 +23,12 @@ import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.hudi.hive.HoodieHiveSyncException; import org.apache.hudi.hive.HoodieHiveSyncException;
import org.apache.hudi.hive.SchemaDifference; import org.apache.hudi.hive.SchemaDifference;
import org.apache.hudi.hive.util.HiveSchemaUtil; import org.apache.hudi.hive.util.HiveSchemaUtil;
import org.apache.hudi.sync.common.HoodieSyncClient;
import org.apache.hudi.sync.common.model.PartitionEvent;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.MessageType;
import org.slf4j.Logger; import org.slf4j.Logger;
@@ -47,13 +47,21 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.function.Function; import java.util.function.Function;
public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient { import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_JDBC_URL;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_PASS;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_USER;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_USE_HIVE_STYLE_PARTITIONING;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS;
public class HoodieAdbJdbcClient extends HoodieSyncClient {
private static final Logger LOG = LoggerFactory.getLogger(HoodieAdbJdbcClient.class); private static final Logger LOG = LoggerFactory.getLogger(HoodieAdbJdbcClient.class);
public static final String HOODIE_LAST_COMMIT_TIME_SYNC = "hoodie_last_sync"; public static final String HOODIE_LAST_COMMIT_TIME_SYNC = "hoodie_last_sync";
// Make sure we have the jdbc driver in classpath // Make sure we have the jdbc driver in classpath
private static final String DRIVER_NAME = "com.mysql.jdbc.Driver"; private static final String DRIVER_NAME = "com.mysql.jdbc.Driver";
public static final String ADB_ESCAPE_CHARACTER = "";
private static final String TBL_PROPERTIES_STR = "TBLPROPERTIES"; private static final String TBL_PROPERTIES_STR = "TBLPROPERTIES";
static { static {
@@ -64,12 +72,16 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
} }
} }
private final AdbSyncConfig config;
private final String databaseName;
private Connection connection; private Connection connection;
public HoodieAdbJdbcClient(AdbSyncConfig syncConfig, FileSystem fs) { public HoodieAdbJdbcClient(AdbSyncConfig config) {
super(syncConfig, fs); super(config);
this.config = config;
this.databaseName = config.getString(META_SYNC_DATABASE_NAME);
createAdbConnection(); createAdbConnection();
LOG.info("Init adb jdbc client success, jdbcUrl:{}", syncConfig.jdbcUrl); LOG.info("Init adb jdbc client success, jdbcUrl:{}", config.getString(ADB_SYNC_JDBC_URL));
} }
private void createAdbConnection() { private void createAdbConnection() {
@@ -82,7 +94,9 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
} }
try { try {
this.connection = DriverManager.getConnection( this.connection = DriverManager.getConnection(
adbSyncConfig.jdbcUrl, adbSyncConfig.adbUser, adbSyncConfig.adbPass); config.getString(ADB_SYNC_JDBC_URL),
config.getString(ADB_SYNC_USER),
config.getString(ADB_SYNC_PASS));
} catch (SQLException e) { } catch (SQLException e) {
throw new HoodieException("Cannot create adb connection ", e); throw new HoodieException("Cannot create adb connection ", e);
} }
@@ -96,7 +110,7 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
try { try {
LOG.info("Creating table:{}", tableName); LOG.info("Creating table:{}", tableName);
String createSQLQuery = HiveSchemaUtil.generateCreateDDL(tableName, storageSchema, String createSQLQuery = HiveSchemaUtil.generateCreateDDL(tableName, storageSchema,
getHiveSyncConfig(), inputFormatClass, outputFormatClass, serdeClass, serdeProperties, tableProperties); config, inputFormatClass, outputFormatClass, serdeClass, serdeProperties, tableProperties);
executeAdbSql(createSQLQuery); executeAdbSql(createSQLQuery);
} catch (IOException e) { } catch (IOException e) {
throw new HoodieException("Fail to create table:" + tableName, e); throw new HoodieException("Fail to create table:" + tableName, e);
@@ -106,17 +120,18 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
@Override @Override
public void dropTable(String tableName) { public void dropTable(String tableName) {
LOG.info("Dropping table:{}", tableName); LOG.info("Dropping table:{}", tableName);
String dropTable = "drop table if exists `" + adbSyncConfig.databaseName + "`.`" + tableName + "`"; String dropTable = "drop table if exists `" + databaseName + "`.`" + tableName + "`";
executeAdbSql(dropTable); executeAdbSql(dropTable);
} }
public Map<String, String> getTableSchema(String tableName) { @Override
public Map<String, String> getMetastoreSchema(String tableName) {
Map<String, String> schema = new HashMap<>(); Map<String, String> schema = new HashMap<>();
ResultSet result = null; ResultSet result = null;
try { try {
DatabaseMetaData databaseMetaData = connection.getMetaData(); DatabaseMetaData databaseMetaData = connection.getMetaData();
result = databaseMetaData.getColumns(adbSyncConfig.databaseName, result = databaseMetaData.getColumns(databaseName,
adbSyncConfig.databaseName, tableName, null); databaseName, tableName, null);
while (result.next()) { while (result.next()) {
String columnName = result.getString(4); String columnName = result.getString(4);
String columnType = result.getString(6); String columnType = result.getString(6);
@@ -174,7 +189,7 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
} }
public void createDatabase(String databaseName) { public void createDatabase(String databaseName) {
String rootPath = getDatabasePath(); String rootPath = config.getDatabasePath();
LOG.info("Creating database:{}, databaseLocation:{}", databaseName, rootPath); LOG.info("Creating database:{}, databaseLocation:{}", databaseName, rootPath);
String sql = constructCreateDatabaseSql(rootPath); String sql = constructCreateDatabaseSql(rootPath);
executeAdbSql(sql); executeAdbSql(sql);
@@ -197,7 +212,7 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
} }
@Override @Override
public boolean doesTableExist(String tableName) { public boolean tableExists(String tableName) {
String sql = constructShowLikeTableSql(tableName); String sql = constructShowLikeTableSql(tableName);
Function<ResultSet, Boolean> transform = resultSet -> { Function<ResultSet, Boolean> transform = resultSet -> {
try { try {
@@ -209,11 +224,6 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
return executeQuerySQL(sql, transform); return executeQuerySQL(sql, transform);
} }
@Override
public boolean tableExists(String tableName) {
return doesTableExist(tableName);
}
@Override @Override
public Option<String> getLastCommitTimeSynced(String tableName) { public Option<String> getLastCommitTimeSynced(String tableName) {
String sql = constructShowCreateTableSql(tableName); String sql = constructShowCreateTableSql(tableName);
@@ -251,7 +261,7 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
@Override @Override
public void updateLastCommitTimeSynced(String tableName) { public void updateLastCommitTimeSynced(String tableName) {
// Set the last commit time from the TBLProperties // Set the last commit time from the TBLProperties
String lastCommitSynced = activeTimeline.lastInstant().get().getTimestamp(); String lastCommitSynced = getActiveTimeline().lastInstant().get().getTimestamp();
try { try {
String sql = constructUpdateTblPropertiesSql(tableName, lastCommitSynced); String sql = constructUpdateTblPropertiesSql(tableName, lastCommitSynced);
executeAdbSql(sql); executeAdbSql(sql);
@@ -275,6 +285,11 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
throw new UnsupportedOperationException("Not support deleteLastReplicatedTimeStamp yet"); throw new UnsupportedOperationException("Not support deleteLastReplicatedTimeStamp yet");
} }
@Override
public void updateTableProperties(String tableName, Map<String, String> tableProperties) {
throw new UnsupportedOperationException("Not support updateTableProperties yet");
}
@Override @Override
public void updatePartitionsToTable(String tableName, List<String> changedPartitions) { public void updatePartitionsToTable(String tableName, List<String> changedPartitions) {
if (changedPartitions.isEmpty()) { if (changedPartitions.isEmpty()) {
@@ -294,6 +309,9 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
throw new UnsupportedOperationException("Not support dropPartitions yet."); throw new UnsupportedOperationException("Not support dropPartitions yet.");
} }
/**
* TODO migrate to implementation of {@link #getAllPartitions(String)}
*/
public Map<List<String>, String> scanTablePartitions(String tableName) { public Map<List<String>, String> scanTablePartitions(String tableName) {
String sql = constructShowPartitionSql(tableName); String sql = constructShowPartitionSql(tableName);
Function<ResultSet, Map<List<String>, String>> transform = resultSet -> { Function<ResultSet, Map<List<String>, String>> transform = resultSet -> {
@@ -304,7 +322,7 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
String str = resultSet.getString(1); String str = resultSet.getString(1);
if (!StringUtils.isNullOrEmpty(str)) { if (!StringUtils.isNullOrEmpty(str)) {
List<String> values = partitionValueExtractor.extractPartitionValuesInPath(str); List<String> values = partitionValueExtractor.extractPartitionValuesInPath(str);
Path storagePartitionPath = FSUtils.getPartitionPath(adbSyncConfig.basePath, String.join("/", values)); Path storagePartitionPath = FSUtils.getPartitionPath(config.getString(META_SYNC_BASE_PATH), String.join("/", values));
String fullStoragePartitionPath = Path.getPathWithoutSchemeAndAuthority(storagePartitionPath).toUri().getPath(); String fullStoragePartitionPath = Path.getPathWithoutSchemeAndAuthority(storagePartitionPath).toUri().getPath();
partitions.put(values, fullStoragePartitionPath); partitions.put(values, fullStoragePartitionPath);
} }
@@ -318,6 +336,9 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
return executeQuerySQL(sql, transform); return executeQuerySQL(sql, transform);
} }
/**
* TODO align with {@link org.apache.hudi.sync.common.HoodieMetaSyncOperations#updateTableSchema}
*/
public void updateTableDefinition(String tableName, SchemaDifference schemaDiff) { public void updateTableDefinition(String tableName, SchemaDifference schemaDiff) {
LOG.info("Adding columns for table:{}", tableName); LOG.info("Adding columns for table:{}", tableName);
schemaDiff.getAddColumnTypes().forEach((columnName, columnType) -> schemaDiff.getAddColumnTypes().forEach((columnName, columnType) ->
@@ -332,12 +353,12 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
private String constructAddPartitionsSql(String tableName, List<String> partitions) { private String constructAddPartitionsSql(String tableName, List<String> partitions) {
StringBuilder sqlBuilder = new StringBuilder("alter table `"); StringBuilder sqlBuilder = new StringBuilder("alter table `");
sqlBuilder.append(adbSyncConfig.databaseName).append("`").append(".`") sqlBuilder.append(databaseName).append("`").append(".`")
.append(tableName).append("`").append(" add if not exists "); .append(tableName).append("`").append(" add if not exists ");
for (String partition : partitions) { for (String partition : partitions) {
String partitionClause = getPartitionClause(partition); String partitionClause = getPartitionClause(partition);
Path partitionPath = FSUtils.getPartitionPath(adbSyncConfig.basePath, partition); Path partitionPath = FSUtils.getPartitionPath(config.getString(META_SYNC_BASE_PATH), partition);
String fullPartitionPathStr = generateAbsolutePathStr(partitionPath); String fullPartitionPathStr = config.generateAbsolutePathStr(partitionPath);
sqlBuilder.append(" partition (").append(partitionClause).append(") location '") sqlBuilder.append(" partition (").append(partitionClause).append(") location '")
.append(fullPartitionPathStr).append("' "); .append(fullPartitionPathStr).append("' ");
} }
@@ -347,14 +368,14 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
private List<String> constructChangePartitionsSql(String tableName, List<String> partitions) { private List<String> constructChangePartitionsSql(String tableName, List<String> partitions) {
List<String> changePartitions = new ArrayList<>(); List<String> changePartitions = new ArrayList<>();
String useDatabase = "use `" + adbSyncConfig.databaseName + "`"; String useDatabase = "use `" + databaseName + "`";
changePartitions.add(useDatabase); changePartitions.add(useDatabase);
String alterTable = "alter table `" + tableName + "`"; String alterTable = "alter table `" + tableName + "`";
for (String partition : partitions) { for (String partition : partitions) {
String partitionClause = getPartitionClause(partition); String partitionClause = getPartitionClause(partition);
Path partitionPath = FSUtils.getPartitionPath(adbSyncConfig.basePath, partition); Path partitionPath = FSUtils.getPartitionPath(config.getString(META_SYNC_BASE_PATH), partition);
String fullPartitionPathStr = generateAbsolutePathStr(partitionPath); String fullPartitionPathStr = config.generateAbsolutePathStr(partitionPath);
String changePartition = alterTable + " add if not exists partition (" + partitionClause String changePartition = alterTable + " add if not exists partition (" + partitionClause
+ ") location '" + fullPartitionPathStr + "'"; + ") location '" + fullPartitionPathStr + "'";
changePartitions.add(changePartition); changePartitions.add(changePartition);
@@ -371,32 +392,32 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
*/ */
private String getPartitionClause(String partition) { private String getPartitionClause(String partition) {
List<String> partitionValues = partitionValueExtractor.extractPartitionValuesInPath(partition); List<String> partitionValues = partitionValueExtractor.extractPartitionValuesInPath(partition);
ValidationUtils.checkArgument(adbSyncConfig.partitionFields.size() == partitionValues.size(), ValidationUtils.checkArgument(config.getSplitStrings(META_SYNC_PARTITION_FIELDS).size() == partitionValues.size(),
"Partition key parts " + adbSyncConfig.partitionFields "Partition key parts " + config.getSplitStrings(META_SYNC_PARTITION_FIELDS)
+ " does not match with partition values " + partitionValues + ". Check partition strategy. "); + " does not match with partition values " + partitionValues + ". Check partition strategy. ");
List<String> partBuilder = new ArrayList<>(); List<String> partBuilder = new ArrayList<>();
for (int i = 0; i < adbSyncConfig.partitionFields.size(); i++) { for (int i = 0; i < config.getSplitStrings(META_SYNC_PARTITION_FIELDS).size(); i++) {
partBuilder.add(adbSyncConfig.partitionFields.get(i) + "='" + partitionValues.get(i) + "'"); partBuilder.add(config.getSplitStrings(META_SYNC_PARTITION_FIELDS).get(i) + "='" + partitionValues.get(i) + "'");
} }
return String.join(",", partBuilder); return String.join(",", partBuilder);
} }
private String constructShowPartitionSql(String tableName) { private String constructShowPartitionSql(String tableName) {
return String.format("show partitions `%s`.`%s`", adbSyncConfig.databaseName, tableName); return String.format("show partitions `%s`.`%s`", databaseName, tableName);
} }
private String constructShowCreateTableSql(String tableName) { private String constructShowCreateTableSql(String tableName) {
return String.format("show create table `%s`.`%s`", adbSyncConfig.databaseName, tableName); return String.format("show create table `%s`.`%s`", databaseName, tableName);
} }
private String constructShowLikeTableSql(String tableName) { private String constructShowLikeTableSql(String tableName) {
return String.format("show tables from `%s` like '%s'", adbSyncConfig.databaseName, tableName); return String.format("show tables from `%s` like '%s'", databaseName, tableName);
} }
private String constructCreateDatabaseSql(String rootPath) { private String constructCreateDatabaseSql(String rootPath) {
return String.format("create database if not exists `%s` with dbproperties(catalog = 'oss', location = '%s')", return String.format("create database if not exists `%s` with dbproperties(catalog = 'oss', location = '%s')",
adbSyncConfig.databaseName, rootPath); databaseName, rootPath);
} }
private String constructShowCreateDatabaseSql(String databaseName) { private String constructShowCreateDatabaseSql(String databaseName) {
@@ -405,26 +426,69 @@ public class HoodieAdbJdbcClient extends AbstractAdbSyncHoodieClient {
private String constructUpdateTblPropertiesSql(String tableName, String lastCommitSynced) { private String constructUpdateTblPropertiesSql(String tableName, String lastCommitSynced) {
return String.format("alter table `%s`.`%s` set tblproperties('%s' = '%s')", return String.format("alter table `%s`.`%s` set tblproperties('%s' = '%s')",
adbSyncConfig.databaseName, tableName, HOODIE_LAST_COMMIT_TIME_SYNC, lastCommitSynced); databaseName, tableName, HOODIE_LAST_COMMIT_TIME_SYNC, lastCommitSynced);
} }
private String constructAddColumnSql(String tableName, String columnName, String columnType) { private String constructAddColumnSql(String tableName, String columnName, String columnType) {
return String.format("alter table `%s`.`%s` add columns(`%s` %s)", return String.format("alter table `%s`.`%s` add columns(`%s` %s)",
adbSyncConfig.databaseName, tableName, columnName, columnType); databaseName, tableName, columnName, columnType);
} }
private String constructChangeColumnSql(String tableName, String columnName, String columnType) { private String constructChangeColumnSql(String tableName, String columnName, String columnType) {
return String.format("alter table `%s`.`%s` change `%s` `%s` %s", return String.format("alter table `%s`.`%s` change `%s` `%s` %s",
adbSyncConfig.databaseName, tableName, columnName, columnName, columnType); databaseName, tableName, columnName, columnName, columnType);
} }
private HiveSyncConfig getHiveSyncConfig() { /**
HiveSyncConfig hiveSyncConfig = new HiveSyncConfig(); * TODO align with {@link HoodieSyncClient#getPartitionEvents}
hiveSyncConfig.partitionFields = adbSyncConfig.partitionFields; */
hiveSyncConfig.databaseName = adbSyncConfig.databaseName; public List<PartitionEvent> getPartitionEvents(Map<List<String>, String> tablePartitions, List<String> partitionStoragePartitions) {
Path basePath = new Path(adbSyncConfig.basePath); Map<String, String> paths = new HashMap<>();
hiveSyncConfig.basePath = generateAbsolutePathStr(basePath);
return hiveSyncConfig; for (Map.Entry<List<String>, String> entry : tablePartitions.entrySet()) {
List<String> partitionValues = entry.getKey();
String fullTablePartitionPath = entry.getValue();
paths.put(String.join(", ", partitionValues), fullTablePartitionPath);
}
List<PartitionEvent> events = new ArrayList<>();
for (String storagePartition : partitionStoragePartitions) {
Path storagePartitionPath = FSUtils.getPartitionPath(config.getString(META_SYNC_BASE_PATH), storagePartition);
String fullStoragePartitionPath = Path.getPathWithoutSchemeAndAuthority(storagePartitionPath).toUri().getPath();
// Check whether the partition values or the HDFS path are the same
List<String> storagePartitionValues = partitionValueExtractor.extractPartitionValuesInPath(storagePartition);
if (config.getBoolean(ADB_SYNC_USE_HIVE_STYLE_PARTITIONING)) {
String partition = String.join("/", storagePartitionValues);
storagePartitionPath = FSUtils.getPartitionPath(config.getString(META_SYNC_BASE_PATH), partition);
fullStoragePartitionPath = Path.getPathWithoutSchemeAndAuthority(storagePartitionPath).toUri().getPath();
}
if (!storagePartitionValues.isEmpty()) {
String storageValue = String.join(", ", storagePartitionValues);
if (!paths.containsKey(storageValue)) {
events.add(PartitionEvent.newPartitionAddEvent(storagePartition));
} else if (!paths.get(storageValue).equals(fullStoragePartitionPath)) {
events.add(PartitionEvent.newPartitionUpdateEvent(storagePartition));
}
}
}
return events;
}
public void closeQuietly(ResultSet resultSet, Statement stmt) {
try {
if (stmt != null) {
stmt.close();
}
} catch (SQLException e) {
LOG.warn("Could not close the statement opened ", e);
}
try {
if (resultSet != null) {
resultSet.close();
}
} catch (SQLException e) {
LOG.warn("Could not close the resultset opened ", e);
}
} }
@Override @Override
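The JDBC client now takes only an `AdbSyncConfig`, resolving the database name and connection settings from it; `doesTableExist` is folded into `tableExists`, and `getTableSchema` is renamed `getMetastoreSchema`. A hedged sketch of standalone client usage (the table name is illustrative):

Properties props = new Properties(); // populate META_SYNC_* and ADB_SYNC_* keys as in the earlier example
AdbSyncConfig config = new AdbSyncConfig(props);
try (HoodieAdbJdbcClient client = new HoodieAdbJdbcClient(config)) {
  if (client.tableExists("stock_ticks")) {
    // Metastore schema comes back as column name -> column type
    client.getMetastoreSchema("stock_ticks")
        .forEach((col, type) -> System.out.println(col + " " + type));
  }
}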
@@ -19,47 +19,72 @@
package org.apache.hudi.sync.adb; package org.apache.hudi.sync.adb;
import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.sync.common.util.ConfigUtils;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import java.util.Arrays; import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_DB_LOCATION;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_JDBC_URL;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_PASS;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_SERDE_PROPERTIES;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_SKIP_RO_SUFFIX;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_TABLE_PROPERTIES;
import static org.apache.hudi.sync.adb.AdbSyncConfig.ADB_SYNC_USER;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_ASSUME_DATE_PARTITION;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME;
import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertEquals;
public class TestAdbSyncConfig { public class TestAdbSyncConfig {
@Test @Test
public void testCopy() { public void testInitConfig() {
AdbSyncConfig adbSyncConfig = new AdbSyncConfig(); Properties props = new Properties();
adbSyncConfig.partitionFields = Arrays.asList("a", "b"); props.setProperty(META_SYNC_PARTITION_FIELDS.key(), "a,b");
adbSyncConfig.basePath = "/tmp"; props.setProperty(META_SYNC_BASE_PATH.key(), "/tmp");
adbSyncConfig.assumeDatePartitioning = true; props.setProperty(META_SYNC_ASSUME_DATE_PARTITION.key(), "true");
adbSyncConfig.databaseName = "test"; props.setProperty(META_SYNC_DATABASE_NAME.key(), "test");
adbSyncConfig.tableName = "test"; props.setProperty(META_SYNC_TABLE_NAME.key(), "test");
adbSyncConfig.adbUser = "adb"; props.setProperty(ADB_SYNC_USER.key(), "adb");
adbSyncConfig.adbPass = "adb"; props.setProperty(ADB_SYNC_PASS.key(), "adb");
adbSyncConfig.jdbcUrl = "jdbc:mysql://localhost:3306"; props.setProperty(ADB_SYNC_JDBC_URL.key(), "jdbc:mysql://localhost:3306");
adbSyncConfig.skipROSuffix = false; props.setProperty(ADB_SYNC_SKIP_RO_SUFFIX.key(), "false");
adbSyncConfig.tableProperties = "spark.sql.sources.provider= 'hudi'\\n" String tableProps = "spark.sql.sources.provider=hudi\n"
+ "spark.sql.sources.schema.numParts = '1'\\n " + "spark.sql.sources.schema.numParts=1\n"
+ "spark.sql.sources.schema.part.0 ='xx'\\n " + "spark.sql.sources.schema.part.0=xx\n"
+ "spark.sql.sources.schema.numPartCols = '1'\\n" + "spark.sql.sources.schema.numPartCols=1\n"
+ "spark.sql.sources.schema.partCol.0 = 'dt'"; + "spark.sql.sources.schema.partCol.0=dt";
adbSyncConfig.serdeProperties = "'path'='/tmp/test_db/tbl'"; props.setProperty(ADB_SYNC_TABLE_PROPERTIES.key(), tableProps);
adbSyncConfig.dbLocation = "file://tmp/test_db"; props.setProperty(ADB_SYNC_SERDE_PROPERTIES.key(), "path=/tmp/test_db/tbl");
props.setProperty(ADB_SYNC_DB_LOCATION.key(), "file://tmp/test_db");
TypedProperties props = AdbSyncConfig.toProps(adbSyncConfig); AdbSyncConfig config = new AdbSyncConfig(props);
AdbSyncConfig copied = new AdbSyncConfig(props); assertEquals(Arrays.asList("a", "b"), config.getSplitStrings(META_SYNC_PARTITION_FIELDS));
assertEquals("/tmp", config.getString(META_SYNC_BASE_PATH));
assertEquals(copied.partitionFields, adbSyncConfig.partitionFields); assertEquals(true, config.getBoolean(META_SYNC_ASSUME_DATE_PARTITION));
assertEquals(copied.basePath, adbSyncConfig.basePath); assertEquals("test", config.getString(META_SYNC_DATABASE_NAME));
assertEquals(copied.assumeDatePartitioning, adbSyncConfig.assumeDatePartitioning); assertEquals("test", config.getString(META_SYNC_TABLE_NAME));
assertEquals(copied.databaseName, adbSyncConfig.databaseName); assertEquals("adb", config.getString(ADB_SYNC_USER));
assertEquals(copied.tableName, adbSyncConfig.tableName); assertEquals("adb", config.getString(ADB_SYNC_PASS));
assertEquals(copied.adbUser, adbSyncConfig.adbUser); assertEquals("jdbc:mysql://localhost:3306", config.getString(ADB_SYNC_JDBC_URL));
assertEquals(copied.adbPass, adbSyncConfig.adbPass); assertEquals(false, config.getBoolean(ADB_SYNC_SKIP_RO_SUFFIX));
assertEquals(copied.basePath, adbSyncConfig.basePath); Map<String, String> tablePropsMap = new HashMap<>();
assertEquals(copied.jdbcUrl, adbSyncConfig.jdbcUrl); tablePropsMap.put("spark.sql.sources.provider", "hudi");
assertEquals(copied.skipROSuffix, adbSyncConfig.skipROSuffix); tablePropsMap.put("spark.sql.sources.schema.numParts", "1");
assertEquals(copied.supportTimestamp, adbSyncConfig.supportTimestamp); tablePropsMap.put("spark.sql.sources.schema.part.0", "xx");
tablePropsMap.put("spark.sql.sources.schema.numPartCols", "1");
tablePropsMap.put("spark.sql.sources.schema.partCol.0", "dt");
assertEquals(tablePropsMap, ConfigUtils.toMap(config.getString(ADB_SYNC_TABLE_PROPERTIES)));
Map<String, String> serdePropsMap = new HashMap<>();
serdePropsMap.put("path", "/tmp/test_db/tbl");
assertEquals(serdePropsMap, ConfigUtils.toMap(config.getString(ADB_SYNC_SERDE_PROPERTIES)));
assertEquals("file://tmp/test_db", config.getString(ADB_SYNC_DB_LOCATION));
} }
} }
@@ -21,9 +21,8 @@ package org.apache.hudi.sync.datahub;
import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.TableSchemaResolver; import org.apache.hudi.common.table.TableSchemaResolver;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.Option;
import org.apache.hudi.sync.common.AbstractSyncHoodieClient; import org.apache.hudi.sync.common.HoodieSyncClient;
import org.apache.hudi.sync.common.HoodieSyncException; import org.apache.hudi.sync.common.HoodieSyncException;
import org.apache.hudi.sync.datahub.config.DataHubSyncConfig; import org.apache.hudi.sync.datahub.config.DataHubSyncConfig;
@@ -51,8 +50,6 @@ import datahub.client.rest.RestEmitter;
import datahub.event.MetadataChangeProposalWrapper; import datahub.event.MetadataChangeProposalWrapper;
import org.apache.avro.AvroTypeException; import org.apache.avro.AvroTypeException;
import org.apache.avro.Schema; import org.apache.avro.Schema;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.MessageType;
import java.util.Collections; import java.util.Collections;
@@ -60,40 +57,15 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.stream.Collectors; import java.util.stream.Collectors;
public class DataHubSyncClient extends AbstractSyncHoodieClient { public class DataHubSyncClient extends HoodieSyncClient {
private final HoodieTimeline activeTimeline; protected final DataHubSyncConfig config;
private final DataHubSyncConfig syncConfig;
private final Configuration hadoopConf;
private final DatasetUrn datasetUrn; private final DatasetUrn datasetUrn;
public DataHubSyncClient(DataHubSyncConfig syncConfig, Configuration hadoopConf, FileSystem fs) { public DataHubSyncClient(DataHubSyncConfig config) {
super(syncConfig.basePath, syncConfig.assumeDatePartitioning, syncConfig.useFileListingFromMetadata, false, fs); super(config);
this.syncConfig = syncConfig; this.config = config;
this.hadoopConf = hadoopConf; this.datasetUrn = config.datasetIdentifier.getDatasetUrn();
this.datasetUrn = syncConfig.datasetIdentifier.getDatasetUrn();
this.activeTimeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
}
@Override
public void createTable(String tableName,
MessageType storageSchema,
String inputFormatClass,
String outputFormatClass,
String serdeClass,
Map<String, String> serdeProperties,
Map<String, String> tableProperties) {
throw new UnsupportedOperationException("Not supported: `createTable`");
}
@Override
public boolean doesTableExist(String tableName) {
return tableExists(tableName);
}
@Override
public boolean tableExists(String tableName) {
throw new UnsupportedOperationException("Not supported: `tableExists`");
} }
@Override @Override
@@ -103,37 +75,7 @@ public class DataHubSyncClient extends AbstractSyncHoodieClient {
@Override @Override
public void updateLastCommitTimeSynced(String tableName) { public void updateLastCommitTimeSynced(String tableName) {
updateTableProperties(tableName, Collections.singletonMap(HOODIE_LAST_COMMIT_TIME_SYNC, activeTimeline.lastInstant().get().getTimestamp())); updateTableProperties(tableName, Collections.singletonMap(HOODIE_LAST_COMMIT_TIME_SYNC, getActiveTimeline().lastInstant().get().getTimestamp()));
}
@Override
public Option<String> getLastReplicatedTime(String tableName) {
throw new UnsupportedOperationException("Not supported: `getLastReplicatedTime`");
}
@Override
public void updateLastReplicatedTimeStamp(String tableName, String timeStamp) {
throw new UnsupportedOperationException("Not supported: `updateLastReplicatedTimeStamp`");
}
@Override
public void deleteLastReplicatedTimeStamp(String tableName) {
throw new UnsupportedOperationException("Not supported: `deleteLastReplicatedTimeStamp`");
}
@Override
public void addPartitionsToTable(String tableName, List<String> partitionsToAdd) {
throw new UnsupportedOperationException("Not supported: `addPartitionsToTable`");
}
@Override
public void updatePartitionsToTable(String tableName, List<String> changedPartitions) {
throw new UnsupportedOperationException("Not supported: `updatePartitionsToTable`");
}
@Override
public void dropPartitions(String tableName, List<String> partitionsToDrop) {
throw new UnsupportedOperationException("Not supported: `dropPartitions`");
} }
@Override @Override
@@ -145,14 +87,15 @@ public class DataHubSyncClient extends AbstractSyncHoodieClient {
.aspect(new DatasetProperties().setCustomProperties(new StringMap(tableProperties))) .aspect(new DatasetProperties().setCustomProperties(new StringMap(tableProperties)))
.build(); .build();
try (RestEmitter emitter = syncConfig.getRestEmitter()) { try (RestEmitter emitter = config.getRestEmitter()) {
emitter.emit(propertiesChangeProposal, null).get(); emitter.emit(propertiesChangeProposal, null).get();
} catch (Exception e) { } catch (Exception e) {
throw new HoodieDataHubSyncException("Fail to change properties for Dataset " + datasetUrn + ": " + tableProperties, e); throw new HoodieDataHubSyncException("Fail to change properties for Dataset " + datasetUrn + ": " + tableProperties, e);
} }
} }
public void updateTableDefinition(String tableName) { @Override
public void updateTableSchema(String tableName, MessageType schema) {
Schema avroSchema = getAvroSchemaWithoutMetadataFields(metaClient); Schema avroSchema = getAvroSchemaWithoutMetadataFields(metaClient);
List<SchemaField> fields = avroSchema.getFields().stream().map(f -> new SchemaField() List<SchemaField> fields = avroSchema.getFields().stream().map(f -> new SchemaField()
.setFieldPath(f.name()) .setFieldPath(f.name())
@@ -175,7 +118,7 @@ public class DataHubSyncClient extends AbstractSyncHoodieClient {
.setFields(new SchemaFieldArray(fields))) .setFields(new SchemaFieldArray(fields)))
.build(); .build();
try (RestEmitter emitter = syncConfig.getRestEmitter()) { try (RestEmitter emitter = config.getRestEmitter()) {
emitter.emit(schemaChangeProposal, null).get(); emitter.emit(schemaChangeProposal, null).get();
} catch (Exception e) { } catch (Exception e) {
throw new HoodieDataHubSyncException("Fail to change schema for Dataset " + datasetUrn, e); throw new HoodieDataHubSyncException("Fail to change schema for Dataset " + datasetUrn, e);
@@ -183,7 +126,7 @@ public class DataHubSyncClient extends AbstractSyncHoodieClient {
} }
@Override @Override
public Map<String, String> getTableSchema(String tableName) { public Map<String, String> getMetastoreSchema(String tableName) {
throw new UnsupportedOperationException("Not supported: `getTableSchema`"); throw new UnsupportedOperationException("Not supported: `getTableSchema`");
} }
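`DataHubSyncClient` now extends `HoodieSyncClient`, drops the boilerplate `UnsupportedOperationException` overrides, and replaces the bespoke `updateTableDefinition` with an `updateTableSchema` override; the `MessageType` argument is ignored (the Avro schema is derived from the meta client), so the tool passes `null`. A hedged sketch mirroring `DataHubSyncTool#syncHoodieTable`, with an illustrative table name:

DataHubSyncConfig config = new DataHubSyncConfig(props);
try (DataHubSyncClient syncClient = new DataHubSyncClient(config)) {
  syncClient.updateTableSchema("stock_ticks", null); // schema argument unused by this client
  syncClient.updateLastCommitTimeSynced("stock_ticks");
}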
@@ -19,14 +19,14 @@
package org.apache.hudi.sync.datahub; package org.apache.hudi.sync.datahub;
import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.sync.common.HoodieSyncTool;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.sync.common.AbstractSyncTool;
import org.apache.hudi.sync.datahub.config.DataHubSyncConfig; import org.apache.hudi.sync.datahub.config.DataHubSyncConfig;
import com.beust.jcommander.JCommander; import com.beust.jcommander.JCommander;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem; import java.util.Properties;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME;
/** /**
* To sync with DataHub via REST APIs. * To sync with DataHub via REST APIs.
@@ -34,17 +34,13 @@ import org.apache.hadoop.fs.FileSystem;
* @Experimental * @Experimental
* @see <a href="https://datahubproject.io/">https://datahubproject.io/</a> * @see <a href="https://datahubproject.io/">https://datahubproject.io/</a>
*/ */
public class DataHubSyncTool extends AbstractSyncTool { public class DataHubSyncTool extends HoodieSyncTool {
private final DataHubSyncConfig config; protected final DataHubSyncConfig config;
public DataHubSyncTool(TypedProperties props, Configuration conf, FileSystem fs) { public DataHubSyncTool(Properties props) {
this(new DataHubSyncConfig(props), conf, fs); super(props);
} this.config = new DataHubSyncConfig(props);
public DataHubSyncTool(DataHubSyncConfig config, Configuration conf, FileSystem fs) {
super(config.getProps(), conf, fs);
this.config = config;
} }
/** /**
@@ -55,20 +51,20 @@ public class DataHubSyncTool extends AbstractSyncTool {
*/ */
@Override @Override
public void syncHoodieTable() { public void syncHoodieTable() {
try (DataHubSyncClient syncClient = new DataHubSyncClient(config, conf, fs)) { try (DataHubSyncClient syncClient = new DataHubSyncClient(config)) {
syncClient.updateTableDefinition(config.tableName); syncClient.updateTableSchema(config.getString(META_SYNC_TABLE_NAME), null);
syncClient.updateLastCommitTimeSynced(config.tableName); syncClient.updateLastCommitTimeSynced(config.getString(META_SYNC_TABLE_NAME));
} }
} }
public static void main(String[] args) { public static void main(String[] args) {
final DataHubSyncConfig cfg = new DataHubSyncConfig(); final DataHubSyncConfig.DataHubSyncConfigParams params = new DataHubSyncConfig.DataHubSyncConfigParams();
JCommander cmd = new JCommander(cfg, null, args); JCommander cmd = JCommander.newBuilder().addObject(params).build();
if (cfg.help || args.length == 0) { cmd.parse(args);
if (params.isHelp()) {
cmd.usage(); cmd.usage();
System.exit(1); System.exit(0);
} }
FileSystem fs = FSUtils.getFs(cfg.basePath, new Configuration()); new DataHubSyncTool(params.toProps()).syncHoodieTable();
new DataHubSyncTool(cfg, fs.getConf(), fs).syncHoodieTable();
} }
} }
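Like the ADB tool, the DataHub tool is now driven entirely by `Properties`. A minimal sketch, assuming an illustrative emitter endpoint (omit it to fall back to `RestEmitter.createWithDefaults()`):

Properties props = new Properties();
props.setProperty(META_SYNC_DATABASE_NAME.key(), "default");
props.setProperty(META_SYNC_TABLE_NAME.key(), "stock_ticks");
props.setProperty(META_SYNC_DATAHUB_EMITTER_SERVER.key(), "http://localhost:8080"); // illustrative
new DataHubSyncTool(props).syncHoodieTable();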
@@ -25,8 +25,11 @@ import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.sync.common.HoodieSyncConfig; import org.apache.hudi.sync.common.HoodieSyncConfig;
import com.beust.jcommander.Parameter; import com.beust.jcommander.Parameter;
import com.beust.jcommander.ParametersDelegate;
import datahub.client.rest.RestEmitter; import datahub.client.rest.RestEmitter;
import java.util.Properties;
public class DataHubSyncConfig extends HoodieSyncConfig { public class DataHubSyncConfig extends HoodieSyncConfig {
public static final ConfigProperty<String> META_SYNC_DATAHUB_DATASET_IDENTIFIER_CLASS = ConfigProperty public static final ConfigProperty<String> META_SYNC_DATAHUB_DATASET_IDENTIFIER_CLASS = ConfigProperty
@@ -49,6 +52,29 @@ public class DataHubSyncConfig extends HoodieSyncConfig {
.noDefaultValue() .noDefaultValue()
.withDocumentation("Pluggable class to supply a DataHub REST emitter to connect to the DataHub instance. This overwrites other emitter configs."); .withDocumentation("Pluggable class to supply a DataHub REST emitter to connect to the DataHub instance. This overwrites other emitter configs.");
public final HoodieDataHubDatasetIdentifier datasetIdentifier;
public DataHubSyncConfig(Properties props) {
super(props);
String identifierClass = getStringOrDefault(META_SYNC_DATAHUB_DATASET_IDENTIFIER_CLASS);
datasetIdentifier = (HoodieDataHubDatasetIdentifier) ReflectionUtils.loadClass(identifierClass, new Class<?>[] {Properties.class}, props);
}
public RestEmitter getRestEmitter() {
if (contains(META_SYNC_DATAHUB_EMITTER_SUPPLIER_CLASS)) {
return ((DataHubEmitterSupplier) ReflectionUtils.loadClass(getString(META_SYNC_DATAHUB_EMITTER_SUPPLIER_CLASS))).get();
} else if (contains(META_SYNC_DATAHUB_EMITTER_SERVER)) {
return RestEmitter.create(b -> b.server(getString(META_SYNC_DATAHUB_EMITTER_SERVER)).token(getStringOrDefault(META_SYNC_DATAHUB_EMITTER_TOKEN, null)));
} else {
return RestEmitter.createWithDefaults();
}
}
public static class DataHubSyncConfigParams {
@ParametersDelegate()
public final HoodieSyncConfigParams hoodieSyncConfigParams = new HoodieSyncConfigParams();
@Parameter(names = {"--identifier-class"}, description = "Pluggable class to help provide info to identify a DataHub Dataset.") @Parameter(names = {"--identifier-class"}, description = "Pluggable class to help provide info to identify a DataHub Dataset.")
public String identifierClass; public String identifierClass;
@@ -61,33 +87,17 @@ public class DataHubSyncConfig extends HoodieSyncConfig {
@Parameter(names = {"--emitter-supplier-class"}, description = "Pluggable class to supply a DataHub REST emitter to connect to the DataHub instance. This overwrites other emitter configs.") @Parameter(names = {"--emitter-supplier-class"}, description = "Pluggable class to supply a DataHub REST emitter to connect to the DataHub instance. This overwrites other emitter configs.")
public String emitterSupplierClass; public String emitterSupplierClass;
@Parameter(names = {"--help", "-h"}, help = true) public boolean isHelp() {
public Boolean help = false; return hoodieSyncConfigParams.isHelp();
public final HoodieDataHubDatasetIdentifier datasetIdentifier;
public DataHubSyncConfig() {
this(new TypedProperties());
} }
public DataHubSyncConfig(TypedProperties props) { public Properties toProps() {
super(props); final TypedProperties props = hoodieSyncConfigParams.toProps();
identifierClass = getStringOrDefault(META_SYNC_DATAHUB_DATASET_IDENTIFIER_CLASS); props.setPropertyIfNonNull(META_SYNC_DATAHUB_DATASET_IDENTIFIER_CLASS.key(), identifierClass);
emitterServer = getStringOrDefault(META_SYNC_DATAHUB_EMITTER_SERVER, null); props.setPropertyIfNonNull(META_SYNC_DATAHUB_EMITTER_SERVER.key(), emitterServer);
emitterToken = getStringOrDefault(META_SYNC_DATAHUB_EMITTER_TOKEN, null); props.setPropertyIfNonNull(META_SYNC_DATAHUB_EMITTER_TOKEN.key(), emitterToken);
emitterSupplierClass = getStringOrDefault(META_SYNC_DATAHUB_EMITTER_SUPPLIER_CLASS, null); props.setPropertyIfNonNull(META_SYNC_DATAHUB_EMITTER_SUPPLIER_CLASS.key(), emitterSupplierClass);
return props;
datasetIdentifier = (HoodieDataHubDatasetIdentifier) ReflectionUtils
.loadClass(identifierClass, new Class<?>[] {TypedProperties.class}, props);
}
public RestEmitter getRestEmitter() {
if (emitterSupplierClass != null) {
return ((DataHubEmitterSupplier) ReflectionUtils.loadClass(emitterSupplierClass)).get();
} else if (emitterServer != null) {
return RestEmitter.create(b -> b.server(emitterServer).token(emitterToken));
} else {
return RestEmitter.createWithDefaults();
} }
} }
} }
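`getRestEmitter()` resolves the emitter in precedence order: supplier class first, then emitter server (with optional token), then library defaults. A hedged sketch of plugging in a custom supplier (class name and endpoint are illustrative):

public class MyEmitterSupplier implements DataHubEmitterSupplier {
  @Override
  public RestEmitter get() {
    // Any custom construction works; this mirrors the server-based branch.
    return RestEmitter.create(b -> b.server("http://datahub-gms:8080"));
  }
}

// A supplier registered this way takes precedence over the emitter server/token settings:
props.setProperty(META_SYNC_DATAHUB_EMITTER_SUPPLIER_CLASS.key(), MyEmitterSupplier.class.getName());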
@@ -19,12 +19,15 @@
package org.apache.hudi.sync.datahub.config; package org.apache.hudi.sync.datahub.config;
import org.apache.hudi.common.config.TypedProperties;
import com.linkedin.common.FabricType; import com.linkedin.common.FabricType;
import com.linkedin.common.urn.DataPlatformUrn; import com.linkedin.common.urn.DataPlatformUrn;
import com.linkedin.common.urn.DatasetUrn; import com.linkedin.common.urn.DatasetUrn;
import java.util.Properties;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME;
/** /**
* Construct and provide the default {@link DatasetUrn} to identify the Dataset on DataHub. * Construct and provide the default {@link DatasetUrn} to identify the Dataset on DataHub.
* <p> * <p>
@@ -34,15 +37,15 @@ public class HoodieDataHubDatasetIdentifier {
public static final String DEFAULT_HOODIE_DATAHUB_PLATFORM_NAME = "hudi"; public static final String DEFAULT_HOODIE_DATAHUB_PLATFORM_NAME = "hudi";
protected final TypedProperties props; protected final Properties props;
public HoodieDataHubDatasetIdentifier(TypedProperties props) { public HoodieDataHubDatasetIdentifier(Properties props) {
this.props = props; this.props = props;
} }
public DatasetUrn getDatasetUrn() { public DatasetUrn getDatasetUrn() {
DataPlatformUrn dataPlatformUrn = new DataPlatformUrn(DEFAULT_HOODIE_DATAHUB_PLATFORM_NAME); DataPlatformUrn dataPlatformUrn = new DataPlatformUrn(DEFAULT_HOODIE_DATAHUB_PLATFORM_NAME);
DataHubSyncConfig config = new DataHubSyncConfig(props); DataHubSyncConfig config = new DataHubSyncConfig(props);
return new DatasetUrn(dataPlatformUrn, String.format("%s.%s", config.databaseName, config.tableName), FabricType.DEV); return new DatasetUrn(dataPlatformUrn, String.format("%s.%s", config.getString(META_SYNC_DATABASE_NAME), config.getString(META_SYNC_TABLE_NAME)), FabricType.DEV);
} }
} }
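For orientation, with META_SYNC_DATABASE_NAME=default and META_SYNC_TABLE_NAME=stock_ticks (illustrative values), the default identifier above yields a dataset URN like:

urn:li:dataset:(urn:li:dataPlatform:hudi,default.stock_ticks,DEV)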

View File

@@ -19,22 +19,32 @@
package org.apache.hudi.sync.datahub.config; package org.apache.hudi.sync.datahub.config;
import org.apache.hudi.common.config.TypedProperties;
import com.linkedin.common.FabricType; import com.linkedin.common.FabricType;
import com.linkedin.common.urn.DatasetUrn; import com.linkedin.common.urn.DatasetUrn;
import datahub.client.rest.RestEmitter;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import java.net.URISyntaxException; import java.net.URISyntaxException;
import java.util.Properties;
import static org.apache.hudi.sync.datahub.config.DataHubSyncConfig.META_SYNC_DATAHUB_DATASET_IDENTIFIER_CLASS; import static org.apache.hudi.sync.datahub.config.DataHubSyncConfig.META_SYNC_DATAHUB_DATASET_IDENTIFIER_CLASS;
import static org.apache.hudi.sync.datahub.config.DataHubSyncConfig.META_SYNC_DATAHUB_EMITTER_SUPPLIER_CLASS;
import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
class TestDataHubSyncConfig { class TestDataHubSyncConfig {
@Test
void testGetEmitterFromSupplier() {
Properties props = new Properties();
props.setProperty(META_SYNC_DATAHUB_EMITTER_SUPPLIER_CLASS.key(), DummySupplier.class.getName());
DataHubSyncConfig syncConfig = new DataHubSyncConfig(props);
assertNotNull(syncConfig.getRestEmitter());
}
@Test @Test
void testInstantiationWithProps() { void testInstantiationWithProps() {
TypedProperties props = new TypedProperties(); Properties props = new Properties();
props.setProperty(META_SYNC_DATAHUB_DATASET_IDENTIFIER_CLASS.key(), DummyIdentifier.class.getName()); props.setProperty(META_SYNC_DATAHUB_DATASET_IDENTIFIER_CLASS.key(), DummyIdentifier.class.getName());
DataHubSyncConfig syncConfig = new DataHubSyncConfig(props); DataHubSyncConfig syncConfig = new DataHubSyncConfig(props);
DatasetUrn datasetUrn = syncConfig.datasetIdentifier.getDatasetUrn(); DatasetUrn datasetUrn = syncConfig.datasetIdentifier.getDatasetUrn();
@@ -43,9 +53,17 @@ class TestDataHubSyncConfig {
assertEquals(FabricType.PROD, datasetUrn.getOriginEntity()); assertEquals(FabricType.PROD, datasetUrn.getOriginEntity());
} }
public static class DummySupplier implements DataHubEmitterSupplier {
@Override
public RestEmitter get() {
return RestEmitter.createWithDefaults();
}
}
public static class DummyIdentifier extends HoodieDataHubDatasetIdentifier { public static class DummyIdentifier extends HoodieDataHubDatasetIdentifier {
public DummyIdentifier(TypedProperties props) { public DummyIdentifier(Properties props) {
super(props); super(props);
} }

View File

@@ -1,142 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.hive;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.table.TableSchemaResolver;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.sync.common.AbstractSyncHoodieClient;
import org.apache.hudi.sync.common.HoodieSyncException;
import org.apache.hudi.sync.common.model.Partition;
import org.apache.avro.Schema;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.parquet.schema.MessageType;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Base class to sync Hudi tables with Hive based metastores, such as Hive server, HMS or managed Hive services.
*/
public abstract class AbstractHiveSyncHoodieClient extends AbstractSyncHoodieClient {
protected final HoodieTimeline activeTimeline;
protected final HiveSyncConfig syncConfig;
protected final Configuration hadoopConf;
protected final PartitionValueExtractor partitionValueExtractor;
public AbstractHiveSyncHoodieClient(HiveSyncConfig syncConfig, Configuration hadoopConf, FileSystem fs) {
super(syncConfig.basePath, syncConfig.assumeDatePartitioning, syncConfig.useFileListingFromMetadata, syncConfig.withOperationField, fs);
this.syncConfig = syncConfig;
this.hadoopConf = hadoopConf;
this.partitionValueExtractor = ReflectionUtils.loadClass(syncConfig.partitionValueExtractorClass);
this.activeTimeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
}
public HoodieTimeline getActiveTimeline() {
return activeTimeline;
}
/**
* Iterate over the storage partitions and find if there are any new partitions that need to be added or updated.
* Generate a list of PartitionEvent based on the changes required.
*/
protected List<PartitionEvent> getPartitionEvents(List<Partition> tablePartitions, List<String> partitionStoragePartitions, boolean isDropPartition) {
Map<String, String> paths = new HashMap<>();
for (Partition tablePartition : tablePartitions) {
List<String> hivePartitionValues = tablePartition.getValues();
String fullTablePartitionPath =
Path.getPathWithoutSchemeAndAuthority(new Path(tablePartition.getStorageLocation())).toUri().getPath();
paths.put(String.join(", ", hivePartitionValues), fullTablePartitionPath);
}
List<PartitionEvent> events = new ArrayList<>();
for (String storagePartition : partitionStoragePartitions) {
Path storagePartitionPath = FSUtils.getPartitionPath(syncConfig.basePath, storagePartition);
String fullStoragePartitionPath = Path.getPathWithoutSchemeAndAuthority(storagePartitionPath).toUri().getPath();
// Check if the partition values or the hdfs path are the same
List<String> storagePartitionValues = partitionValueExtractor.extractPartitionValuesInPath(storagePartition);
if (isDropPartition) {
events.add(PartitionEvent.newPartitionDropEvent(storagePartition));
} else {
if (!storagePartitionValues.isEmpty()) {
String storageValue = String.join(", ", storagePartitionValues);
if (!paths.containsKey(storageValue)) {
events.add(PartitionEvent.newPartitionAddEvent(storagePartition));
} else if (!paths.get(storageValue).equals(fullStoragePartitionPath)) {
events.add(PartitionEvent.newPartitionUpdateEvent(storagePartition));
}
}
}
}
return events;
}
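// Worked example (illustrative, not from this commit): suppose the metastore holds
// partition values ["2021", "01"] at /base/2021/01, and storage lists 2021/01 and 2021/02.
// Then 2021/02 has no matching values in the metastore and yields an ADD event;
// 2021/01 matches both values and location, so it yields nothing; a moved storage
// location for 2021/01 would yield an UPDATE event. When isDropPartition is true,
// every listed storage partition yields a DROP event instead.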
/**
* Get all partitions for the table in the metastore.
*/
public abstract List<Partition> getAllPartitions(String tableName);
/**
* Check if a database already exists in the metastore.
*/
public abstract boolean databaseExists(String databaseName);
/**
* Create a database in the metastore.
*/
public abstract void createDatabase(String databaseName);
/**
* Update schema for the table in the metastore.
*/
public abstract void updateTableDefinition(String tableName, MessageType newSchema);
/*
* APIs below need to be re-worked by modeling field comment in hudi-sync-common,
* instead of relying on Avro or Hive schema class.
*/
public Schema getAvroSchemaWithoutMetadataFields() {
try {
return new TableSchemaResolver(metaClient).getTableAvroSchemaWithoutMetadataFields();
} catch (Exception e) {
throw new HoodieSyncException("Failed to read avro schema", e);
}
}
public abstract List<FieldSchema> getTableCommentUsingMetastoreClient(String tableName);
public abstract void updateTableComments(String tableName, List<FieldSchema> oldSchema, List<Schema.Field> newSchema);
public abstract void updateTableComments(String tableName, List<FieldSchema> oldSchema, Map<String, String> newComments);
/*
* APIs above need to be re-worked by modeling field comment in hudi-sync-common,
* instead of relying on Avro or Hive schema class.
*/
}

View File

@@ -18,6 +18,8 @@
package org.apache.hudi.hive; package org.apache.hudi.hive;
import org.apache.hudi.sync.common.model.PartitionValueExtractor;
import java.util.Collections; import java.util.Collections;
import java.util.List; import java.util.List;

View File

@@ -18,269 +18,147 @@
package org.apache.hudi.hive; package org.apache.hudi.hive;
import org.apache.hudi.common.config.ConfigProperty;
import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.sync.common.HoodieSyncConfig; import org.apache.hudi.sync.common.HoodieSyncConfig;
import com.beust.jcommander.Parameter; import com.beust.jcommander.Parameter;
import com.beust.jcommander.ParametersDelegate;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;
import java.util.Properties;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_AUTO_CREATE_DATABASE;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_BATCH_SYNC_PARTITION_NUM;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_CREATE_MANAGED_TABLE;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_IGNORE_EXCEPTIONS;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_PASS;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_AS_DATA_SOURCE_TABLE;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_BUCKET_SYNC;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_BUCKET_SYNC_SPEC;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_COMMENT;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_MODE;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_TABLE_PROPERTIES;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_TABLE_SERDE_PROPERTIES;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_URL;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USER;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USE_JDBC;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USE_PRE_APACHE_INPUT_FORMAT;
import static org.apache.hudi.hive.HiveSyncConfigHolder.METASTORE_URIS;
/** /**
* Configs needed to sync data into the Hive Metastore. * Configs needed to sync data into the Hive Metastore.
*/ */
public class HiveSyncConfig extends HoodieSyncConfig { public class HiveSyncConfig extends HoodieSyncConfig {
public static String getBucketSpec(String bucketCols, int bucketNum) {
return "CLUSTERED BY (" + bucketCols + " INTO " + bucketNum + " BUCKETS";
}
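// Illustrative: getBucketSpec("trace_id", 65536) builds the "CLUSTERED BY (trace_id)
// INTO 65536 BUCKETS" clause used for bucket-index tables (see HIVE_SYNC_BUCKET_SYNC_SPEC).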
public HiveSyncConfig(Properties props) {
super(props);
}
public HiveSyncConfig(Properties props, Configuration hadoopConf) {
super(props, hadoopConf);
HiveConf hiveConf = new HiveConf(hadoopConf, HiveConf.class);
// HiveConf needs to load fs conf to allow instantiation via AWSGlueClientFactory
hiveConf.addResource(getHadoopFileSystem().getConf());
setHadoopConf(hiveConf);
}
public HiveConf getHiveConf() {
return (HiveConf) getHadoopConf();
}
public boolean useBucketSync() {
return getBooleanOrDefault(HIVE_SYNC_BUCKET_SYNC);
}
public static class HiveSyncConfigParams {
@ParametersDelegate()
public final HoodieSyncConfigParams hoodieSyncConfigParams = new HoodieSyncConfigParams();
@Parameter(names = {"--user"}, description = "Hive username") @Parameter(names = {"--user"}, description = "Hive username")
public String hiveUser; public String hiveUser;
@Parameter(names = {"--pass"}, description = "Hive password") @Parameter(names = {"--pass"}, description = "Hive password")
public String hivePass; public String hivePass;
@Parameter(names = {"--jdbc-url"}, description = "Hive jdbc connect url") @Parameter(names = {"--jdbc-url"}, description = "Hive jdbc connect url")
public String jdbcUrl; public String jdbcUrl;
@Parameter(names = {"--metastore-uris"}, description = "Hive metastore uris")
public String metastoreUris;
@Parameter(names = {"--use-pre-apache-input-format"}, @Parameter(names = {"--use-pre-apache-input-format"},
description = "Use InputFormat under com.uber.hoodie package " description = "Use InputFormat under com.uber.hoodie package "
+ "instead of org.apache.hudi package. Use this when you are in the process of migrating from " + "instead of org.apache.hudi package. Use this when you are in the process of migrating from "
+ "com.uber.hoodie to org.apache.hudi. Stop using this after you migrated the table definition to " + "com.uber.hoodie to org.apache.hudi. Stop using this after you migrated the table definition to "
+ "org.apache.hudi input format.") + "org.apache.hudi input format.")
public Boolean usePreApacheInputFormat; public Boolean usePreApacheInputFormat;
@Parameter(names = {"--bucket-spec"}, description = "bucket spec stored in metastore", required = false)
public String bucketSpec;
@Deprecated @Deprecated
@Parameter(names = {"--use-jdbc"}, description = "Hive jdbc connect url") @Parameter(names = {"--use-jdbc"}, description = "Hive jdbc connect url")
public Boolean useJdbc; public Boolean useJdbc;
@Parameter(names = {"--metastore-uris"}, description = "Hive metastore uris")
public String metastoreUris;
@Parameter(names = {"--sync-mode"}, description = "Mode to choose for Hive ops. Valid values are hms,glue,jdbc and hiveql") @Parameter(names = {"--sync-mode"}, description = "Mode to choose for Hive ops. Valid values are hms,glue,jdbc and hiveql")
public String syncMode; public String syncMode;
@Parameter(names = {"--auto-create-database"}, description = "Auto create hive database") @Parameter(names = {"--auto-create-database"}, description = "Auto create hive database")
public Boolean autoCreateDatabase; public Boolean autoCreateDatabase;
@Parameter(names = {"--ignore-exceptions"}, description = "Ignore hive exceptions") @Parameter(names = {"--ignore-exceptions"}, description = "Ignore hive exceptions")
public Boolean ignoreExceptions; public Boolean ignoreExceptions;
@Parameter(names = {"--skip-ro-suffix"}, description = "Skip the `_ro` suffix for Read optimized table, when registering") @Parameter(names = {"--skip-ro-suffix"}, description = "Skip the `_ro` suffix for Read optimized table, when registering")
public Boolean skipROSuffix; public Boolean skipROSuffix;
@Parameter(names = {"--table-properties"}, description = "Table properties to hive table") @Parameter(names = {"--table-properties"}, description = "Table properties to hive table")
public String tableProperties; public String tableProperties;
@Parameter(names = {"--serde-properties"}, description = "Serde properties to hive table") @Parameter(names = {"--serde-properties"}, description = "Serde properties to hive table")
public String serdeProperties; public String serdeProperties;
@Parameter(names = {"--help", "-h"}, help = true)
public Boolean help = false;
@Parameter(names = {"--support-timestamp"}, description = "'INT64' with original type TIMESTAMP_MICROS is converted to hive 'timestamp' type." @Parameter(names = {"--support-timestamp"}, description = "'INT64' with original type TIMESTAMP_MICROS is converted to hive 'timestamp' type."
+ "Disabled by default for backward compatibility.") + "Disabled by default for backward compatibility.")
public Boolean supportTimestamp; public Boolean supportTimestamp;
@Parameter(names = {"--managed-table"}, description = "Create a managed table") @Parameter(names = {"--managed-table"}, description = "Create a managed table")
public Boolean createManagedTable; public Boolean createManagedTable;
@Parameter(names = {"--batch-sync-num"}, description = "The number of partitions one batch when synchronous partitions to hive") @Parameter(names = {"--batch-sync-num"}, description = "The number of partitions one batch when synchronous partitions to hive")
public Integer batchSyncNum; public Integer batchSyncNum;
@Parameter(names = {"--spark-datasource"}, description = "Whether sync this table as spark data source table.") @Parameter(names = {"--spark-datasource"}, description = "Whether sync this table as spark data source table.")
public Boolean syncAsSparkDataSourceTable; public Boolean syncAsSparkDataSourceTable;
@Parameter(names = {"--spark-schema-length-threshold"}, description = "The maximum length allowed in a single cell when storing additional schema information in Hive's metastore.") @Parameter(names = {"--spark-schema-length-threshold"}, description = "The maximum length allowed in a single cell when storing additional schema information in Hive's metastore.")
public int sparkSchemaLengthThreshold; public Integer sparkSchemaLengthThreshold;
@Parameter(names = {"--bucket-sync"}, description = "use bucket sync")
@Parameter(names = {"--with-operation-field"}, description = "Whether to include the '_hoodie_operation' field in the metadata fields") public Boolean bucketSync;
public Boolean withOperationField = false; @Parameter(names = {"--bucket-spec"}, description = "bucket spec stored in metastore")
public String bucketSpec;
@Parameter(names = {"--sync-comment"}, description = "synchronize table comments to hive") @Parameter(names = {"--sync-comment"}, description = "synchronize table comments to hive")
public boolean syncComment = false; public Boolean syncComment;
@Parameter(names = {"--with-operation-field"}, description = "Whether to include the '_hoodie_operation' field in the metadata fields")
public Boolean withOperationField; // TODO remove this as it's not used
// HIVE SYNC SPECIFIC CONFIGS public boolean isHelp() {
// NOTE: DO NOT USE uppercase for the keys as they are internally lower-cased. Using upper-cases causes return hoodieSyncConfigParams.isHelp();
// unexpected issues with config getting reset
public static final ConfigProperty<String> HIVE_SYNC_ENABLED = ConfigProperty
.key("hoodie.datasource.hive_sync.enable")
.defaultValue("false")
.withDocumentation("When set to true, register/sync the table to Apache Hive metastore.");
public static final ConfigProperty<String> HIVE_USER = ConfigProperty
.key("hoodie.datasource.hive_sync.username")
.defaultValue("hive")
.withDocumentation("hive user name to use");
public static final ConfigProperty<String> HIVE_PASS = ConfigProperty
.key("hoodie.datasource.hive_sync.password")
.defaultValue("hive")
.withDocumentation("hive password to use");
public static final ConfigProperty<String> HIVE_URL = ConfigProperty
.key("hoodie.datasource.hive_sync.jdbcurl")
.defaultValue("jdbc:hive2://localhost:10000")
.withDocumentation("Hive metastore url");
public static final ConfigProperty<String> HIVE_USE_PRE_APACHE_INPUT_FORMAT = ConfigProperty
.key("hoodie.datasource.hive_sync.use_pre_apache_input_format")
.defaultValue("false")
.withDocumentation("Flag to choose InputFormat under com.uber.hoodie package instead of org.apache.hudi package. "
+ "Use this when you are in the process of migrating from "
+ "com.uber.hoodie to org.apache.hudi. Stop using this after you migrated the table definition to org.apache.hudi input format");
/**
* @deprecated Use {@link #HIVE_SYNC_MODE} instead of this config from 0.9.0
*/
@Deprecated
public static final ConfigProperty<String> HIVE_USE_JDBC = ConfigProperty
.key("hoodie.datasource.hive_sync.use_jdbc")
.defaultValue("true")
.deprecatedAfter("0.9.0")
.withDocumentation("Use JDBC when hive synchronization is enabled");
public static final ConfigProperty<String> METASTORE_URIS = ConfigProperty
.key("hoodie.datasource.hive_sync.metastore.uris")
.defaultValue("thrift://localhost:9083")
.withDocumentation("Hive metastore url");
public static final ConfigProperty<String> HIVE_AUTO_CREATE_DATABASE = ConfigProperty
.key("hoodie.datasource.hive_sync.auto_create_database")
.defaultValue("true")
.withDocumentation("Auto create hive database if does not exists");
public static final ConfigProperty<String> HIVE_IGNORE_EXCEPTIONS = ConfigProperty
.key("hoodie.datasource.hive_sync.ignore_exceptions")
.defaultValue("false")
.withDocumentation("Ignore exceptions when syncing with Hive.");
public static final ConfigProperty<String> HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE = ConfigProperty
.key("hoodie.datasource.hive_sync.skip_ro_suffix")
.defaultValue("false")
.withDocumentation("Skip the _ro suffix for Read optimized table, when registering");
public static final ConfigProperty<String> HIVE_SUPPORT_TIMESTAMP_TYPE = ConfigProperty
.key("hoodie.datasource.hive_sync.support_timestamp")
.defaultValue("false")
.withDocumentation("INT64 with original type TIMESTAMP_MICROS is converted to hive timestamp type. "
+ "Disabled by default for backward compatibility.");
public static final ConfigProperty<String> HIVE_TABLE_PROPERTIES = ConfigProperty
.key("hoodie.datasource.hive_sync.table_properties")
.noDefaultValue()
.withDocumentation("Additional properties to store with table.");
public static final ConfigProperty<String> HIVE_TABLE_SERDE_PROPERTIES = ConfigProperty
.key("hoodie.datasource.hive_sync.serde_properties")
.noDefaultValue()
.withDocumentation("Serde properties to hive table.");
public static final ConfigProperty<String> HIVE_SYNC_AS_DATA_SOURCE_TABLE = ConfigProperty
.key("hoodie.datasource.hive_sync.sync_as_datasource")
.defaultValue("true")
.withDocumentation("");
public static final ConfigProperty<Integer> HIVE_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD = ConfigProperty
.key("hoodie.datasource.hive_sync.schema_string_length_thresh")
.defaultValue(4000)
.withDocumentation("");
// Create table as managed table
public static final ConfigProperty<Boolean> HIVE_CREATE_MANAGED_TABLE = ConfigProperty
.key("hoodie.datasource.hive_sync.create_managed_table")
.defaultValue(false)
.withDocumentation("Whether to sync the table as managed table.");
public static final ConfigProperty<Integer> HIVE_BATCH_SYNC_PARTITION_NUM = ConfigProperty
.key("hoodie.datasource.hive_sync.batch_num")
.defaultValue(1000)
.withDocumentation("The number of partitions one batch when synchronous partitions to hive.");
public static final ConfigProperty<String> HIVE_SYNC_MODE = ConfigProperty
.key("hoodie.datasource.hive_sync.mode")
.noDefaultValue()
.withDocumentation("Mode to choose for Hive ops. Valid values are hms, jdbc and hiveql.");
public static final ConfigProperty<Boolean> HIVE_SYNC_BUCKET_SYNC = ConfigProperty
.key("hoodie.datasource.hive_sync.bucket_sync")
.defaultValue(false)
.withDocumentation("Whether sync hive metastore bucket specification when using bucket index."
+ "The specification is 'CLUSTERED BY (trace_id) SORTED BY (trace_id ASC) INTO 65536 BUCKETS'");
public static final ConfigProperty<String> HIVE_SYNC_BUCKET_SYNC_SPEC = ConfigProperty
.key("hoodie.datasource.hive_sync.bucket_sync_spec")
.defaultValue("")
.withDocumentation("The hive metastore bucket specification when using bucket index."
+ "The specification is 'CLUSTERED BY (trace_id) SORTED BY (trace_id ASC) INTO 65536 BUCKETS'");
public static final ConfigProperty<String> HIVE_SYNC_COMMENT = ConfigProperty
.key("hoodie.datasource.hive_sync.sync_comment")
.defaultValue("false")
.withDocumentation("Whether to sync the table column comments while syncing the table.");
public HiveSyncConfig() {
this(new TypedProperties());
} }
public HiveSyncConfig(TypedProperties props) { public TypedProperties toProps() {
super(props); final TypedProperties props = hoodieSyncConfigParams.toProps();
this.hiveUser = getStringOrDefault(HIVE_USER); props.setPropertyIfNonNull(HIVE_USER.key(), hiveUser);
this.hivePass = getStringOrDefault(HIVE_PASS); props.setPropertyIfNonNull(HIVE_PASS.key(), hivePass);
this.jdbcUrl = getStringOrDefault(HIVE_URL); props.setPropertyIfNonNull(HIVE_URL.key(), jdbcUrl);
this.usePreApacheInputFormat = getBooleanOrDefault(HIVE_USE_PRE_APACHE_INPUT_FORMAT); props.setPropertyIfNonNull(HIVE_USE_PRE_APACHE_INPUT_FORMAT.key(), usePreApacheInputFormat);
this.useJdbc = getBooleanOrDefault(HIVE_USE_JDBC); props.setPropertyIfNonNull(HIVE_USE_JDBC.key(), useJdbc);
this.metastoreUris = getStringOrDefault(METASTORE_URIS); props.setPropertyIfNonNull(HIVE_SYNC_MODE.key(), syncMode);
this.syncMode = getString(HIVE_SYNC_MODE); props.setPropertyIfNonNull(METASTORE_URIS.key(), metastoreUris);
this.autoCreateDatabase = getBooleanOrDefault(HIVE_AUTO_CREATE_DATABASE); props.setPropertyIfNonNull(HIVE_AUTO_CREATE_DATABASE.key(), autoCreateDatabase);
this.ignoreExceptions = getBooleanOrDefault(HIVE_IGNORE_EXCEPTIONS); props.setPropertyIfNonNull(HIVE_IGNORE_EXCEPTIONS.key(), ignoreExceptions);
this.skipROSuffix = getBooleanOrDefault(HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE); props.setPropertyIfNonNull(HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE.key(), skipROSuffix);
this.tableProperties = getString(HIVE_TABLE_PROPERTIES); props.setPropertyIfNonNull(HIVE_SUPPORT_TIMESTAMP_TYPE.key(), supportTimestamp);
this.serdeProperties = getString(HIVE_TABLE_SERDE_PROPERTIES); props.setPropertyIfNonNull(HIVE_TABLE_PROPERTIES.key(), tableProperties);
this.supportTimestamp = getBooleanOrDefault(HIVE_SUPPORT_TIMESTAMP_TYPE); props.setPropertyIfNonNull(HIVE_TABLE_SERDE_PROPERTIES.key(), serdeProperties);
this.batchSyncNum = getIntOrDefault(HIVE_BATCH_SYNC_PARTITION_NUM); props.setPropertyIfNonNull(HIVE_SYNC_AS_DATA_SOURCE_TABLE.key(), syncAsSparkDataSourceTable);
this.syncAsSparkDataSourceTable = getBooleanOrDefault(HIVE_SYNC_AS_DATA_SOURCE_TABLE); props.setPropertyIfNonNull(HIVE_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD.key(), sparkSchemaLengthThreshold);
this.sparkSchemaLengthThreshold = getIntOrDefault(HIVE_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD); props.setPropertyIfNonNull(HIVE_CREATE_MANAGED_TABLE.key(), createManagedTable);
this.createManagedTable = getBooleanOrDefault(HIVE_CREATE_MANAGED_TABLE); props.setPropertyIfNonNull(HIVE_BATCH_SYNC_PARTITION_NUM.key(), batchSyncNum);
this.bucketSpec = getStringOrDefault(HIVE_SYNC_BUCKET_SYNC_SPEC); props.setPropertyIfNonNull(HIVE_SYNC_BUCKET_SYNC.key(), bucketSync);
this.syncComment = getBooleanOrDefault(HIVE_SYNC_COMMENT); props.setPropertyIfNonNull(HIVE_SYNC_BUCKET_SYNC_SPEC.key(), bucketSpec);
} props.setPropertyIfNonNull(HIVE_SYNC_COMMENT.key(), syncComment);
return props;
@Override }
public String toString() {
return "HiveSyncConfig{"
+ "databaseName='" + databaseName + '\''
+ ", tableName='" + tableName + '\''
+ ", bucketSpec='" + bucketSpec + '\''
+ ", baseFileFormat='" + baseFileFormat + '\''
+ ", hiveUser='" + hiveUser + '\''
+ ", hivePass='" + hivePass + '\''
+ ", jdbcUrl='" + jdbcUrl + '\''
+ ", metastoreUris='" + metastoreUris + '\''
+ ", basePath='" + basePath + '\''
+ ", partitionFields=" + partitionFields
+ ", partitionValueExtractorClass='" + partitionValueExtractorClass + '\''
+ ", assumeDatePartitioning=" + assumeDatePartitioning
+ ", usePreApacheInputFormat=" + usePreApacheInputFormat
+ ", useJdbc=" + useJdbc
+ ", autoCreateDatabase=" + autoCreateDatabase
+ ", ignoreExceptions=" + ignoreExceptions
+ ", skipROSuffix=" + skipROSuffix
+ ", useFileListingFromMetadata=" + useFileListingFromMetadata
+ ", tableProperties='" + tableProperties + '\''
+ ", serdeProperties='" + serdeProperties + '\''
+ ", help=" + help
+ ", supportTimestamp=" + supportTimestamp
+ ", decodePartition=" + decodePartition
+ ", createManagedTable=" + createManagedTable
+ ", syncAsSparkDataSourceTable=" + syncAsSparkDataSourceTable
+ ", sparkSchemaLengthThreshold=" + sparkSchemaLengthThreshold
+ ", withOperationField=" + withOperationField
+ ", isConditionalSync=" + isConditionalSync
+ ", sparkVersion=" + sparkVersion
+ ", syncComment=" + syncComment
+ '}';
}
public static String getBucketSpec(String bucketCols, int bucketNum) {
return "CLUSTERED BY (" + bucketCols + " INTO " + bucketNum + " BUCKETS";
} }
} }
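The params object is meant to be parsed by JCommander and then converted to properties via toProps(). A minimal sketch of that wiring (assumed here; the tool's main() is not part of these hunks, and JCommander and Configuration are taken from the imports above):

// Sketch of the expected entry point; not shown in this diff.
public static void main(String[] args) {
  final HiveSyncConfig.HiveSyncConfigParams params = new HiveSyncConfig.HiveSyncConfigParams();
  JCommander cmd = JCommander.newBuilder().addObject(params).build();
  cmd.parse(args);
  if (params.isHelp()) {
    cmd.usage();
    System.exit(0);
  }
  // toProps() only forwards flags that were actually set (setPropertyIfNonNull)
  new HiveSyncTool(params.toProps(), new Configuration()).syncHoodieTable();
}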

View File

@@ -0,0 +1,128 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hudi.hive;
import org.apache.hudi.common.config.ConfigProperty;
public class HiveSyncConfigHolder {
/*
* NOTE: below are HIVE SYNC SPECIFIC CONFIGS which should be under HiveSyncConfig.java
* But since DataSourceOptions.scala references some of these constants, and HiveSyncConfig.java imports HiveConf,
* it causes a HiveConf ClassNotFound issue when loading DataSourceOptions.
*
* NOTE: DO NOT USE uppercase for the keys as they are internally lower-cased. Using upper-cases causes
* unexpected issues with config getting reset
*/
public static final ConfigProperty<String> HIVE_SYNC_ENABLED = ConfigProperty
.key("hoodie.datasource.hive_sync.enable")
.defaultValue("false")
.withDocumentation("When set to true, register/sync the table to Apache Hive metastore.");
public static final ConfigProperty<String> HIVE_USER = ConfigProperty
.key("hoodie.datasource.hive_sync.username")
.defaultValue("hive")
.withDocumentation("hive user name to use");
public static final ConfigProperty<String> HIVE_PASS = ConfigProperty
.key("hoodie.datasource.hive_sync.password")
.defaultValue("hive")
.withDocumentation("hive password to use");
public static final ConfigProperty<String> HIVE_URL = ConfigProperty
.key("hoodie.datasource.hive_sync.jdbcurl")
.defaultValue("jdbc:hive2://localhost:10000")
.withDocumentation("Hive metastore url");
public static final ConfigProperty<String> HIVE_USE_PRE_APACHE_INPUT_FORMAT = ConfigProperty
.key("hoodie.datasource.hive_sync.use_pre_apache_input_format")
.defaultValue("false")
.withDocumentation("Flag to choose InputFormat under com.uber.hoodie package instead of org.apache.hudi package. "
+ "Use this when you are in the process of migrating from "
+ "com.uber.hoodie to org.apache.hudi. Stop using this after you migrated the table definition to org.apache.hudi input format");
/**
* @deprecated Use {@link #HIVE_SYNC_MODE} instead of this config from 0.9.0
*/
@Deprecated
public static final ConfigProperty<String> HIVE_USE_JDBC = ConfigProperty
.key("hoodie.datasource.hive_sync.use_jdbc")
.defaultValue("true")
.deprecatedAfter("0.9.0")
.withDocumentation("Use JDBC when hive synchronization is enabled");
public static final ConfigProperty<String> METASTORE_URIS = ConfigProperty
.key("hoodie.datasource.hive_sync.metastore.uris")
.defaultValue("thrift://localhost:9083")
.withDocumentation("Hive metastore url");
public static final ConfigProperty<String> HIVE_AUTO_CREATE_DATABASE = ConfigProperty
.key("hoodie.datasource.hive_sync.auto_create_database")
.defaultValue("true")
.withDocumentation("Auto create hive database if does not exists");
public static final ConfigProperty<String> HIVE_IGNORE_EXCEPTIONS = ConfigProperty
.key("hoodie.datasource.hive_sync.ignore_exceptions")
.defaultValue("false")
.withDocumentation("Ignore exceptions when syncing with Hive.");
public static final ConfigProperty<String> HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE = ConfigProperty
.key("hoodie.datasource.hive_sync.skip_ro_suffix")
.defaultValue("false")
.withDocumentation("Skip the _ro suffix for Read optimized table, when registering");
public static final ConfigProperty<String> HIVE_SUPPORT_TIMESTAMP_TYPE = ConfigProperty
.key("hoodie.datasource.hive_sync.support_timestamp")
.defaultValue("false")
.withDocumentation("INT64 with original type TIMESTAMP_MICROS is converted to hive timestamp type. "
+ "Disabled by default for backward compatibility.");
public static final ConfigProperty<String> HIVE_TABLE_PROPERTIES = ConfigProperty
.key("hoodie.datasource.hive_sync.table_properties")
.noDefaultValue()
.withDocumentation("Additional properties to store with table.");
public static final ConfigProperty<String> HIVE_TABLE_SERDE_PROPERTIES = ConfigProperty
.key("hoodie.datasource.hive_sync.serde_properties")
.noDefaultValue()
.withDocumentation("Serde properties to hive table.");
public static final ConfigProperty<String> HIVE_SYNC_AS_DATA_SOURCE_TABLE = ConfigProperty
.key("hoodie.datasource.hive_sync.sync_as_datasource")
.defaultValue("true")
.withDocumentation("");
public static final ConfigProperty<Integer> HIVE_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD = ConfigProperty
.key("hoodie.datasource.hive_sync.schema_string_length_thresh")
.defaultValue(4000)
.withDocumentation("");
// Create table as managed table
public static final ConfigProperty<Boolean> HIVE_CREATE_MANAGED_TABLE = ConfigProperty
.key("hoodie.datasource.hive_sync.create_managed_table")
.defaultValue(false)
.withDocumentation("Whether to sync the table as managed table.");
public static final ConfigProperty<Integer> HIVE_BATCH_SYNC_PARTITION_NUM = ConfigProperty
.key("hoodie.datasource.hive_sync.batch_num")
.defaultValue(1000)
.withDocumentation("The number of partitions one batch when synchronous partitions to hive.");
public static final ConfigProperty<String> HIVE_SYNC_MODE = ConfigProperty
.key("hoodie.datasource.hive_sync.mode")
.noDefaultValue()
.withDocumentation("Mode to choose for Hive ops. Valid values are hms, jdbc and hiveql.");
public static final ConfigProperty<Boolean> HIVE_SYNC_BUCKET_SYNC = ConfigProperty
.key("hoodie.datasource.hive_sync.bucket_sync")
.defaultValue(false)
.withDocumentation("Whether sync hive metastore bucket specification when using bucket index."
+ "The specification is 'CLUSTERED BY (trace_id) SORTED BY (trace_id ASC) INTO 65536 BUCKETS'");
public static final ConfigProperty<String> HIVE_SYNC_BUCKET_SYNC_SPEC = ConfigProperty
.key("hoodie.datasource.hive_sync.bucket_sync_spec")
.defaultValue("")
.withDocumentation("The hive metastore bucket specification when using bucket index."
+ "The specification is 'CLUSTERED BY (trace_id) SORTED BY (trace_id ASC) INTO 65536 BUCKETS'");
public static final ConfigProperty<String> HIVE_SYNC_COMMENT = ConfigProperty
.key("hoodie.datasource.hive_sync.sync_comment")
.defaultValue("false")
.withDocumentation("Whether to sync the table column comments while syncing the table.");
}
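A minimal sketch of driving these keys programmatically (placeholder values; a real run also needs META_SYNC_BASE_PATH and related keys, and this assumes the props-based HiveSyncConfig constructor shown earlier in this commit):

// Illustrative wiring only; values are placeholders.
TypedProperties props = new TypedProperties();
props.setProperty(HiveSyncConfigHolder.HIVE_SYNC_ENABLED.key(), "true");
props.setProperty(HiveSyncConfigHolder.HIVE_SYNC_MODE.key(), "hms");
props.setProperty(HiveSyncConfigHolder.METASTORE_URIS.key(), "thrift://localhost:9083");
props.setProperty(HoodieSyncConfig.META_SYNC_DATABASE_NAME.key(), "default");
props.setProperty(HoodieSyncConfig.META_SYNC_TABLE_NAME.key(), "stock_ticks");
HiveSyncConfig config = new HiveSyncConfig(props);
boolean bucketSync = config.useBucketSync(); // defaults to false via HIVE_SYNC_BUCKET_SYNC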

View File

@@ -18,37 +18,53 @@
package org.apache.hudi.hive; package org.apache.hudi.hive;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.InvalidTableException; import org.apache.hudi.exception.InvalidTableException;
import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils;
import org.apache.hudi.sync.common.util.ConfigUtils;
import org.apache.hudi.hive.util.HiveSchemaUtil; import org.apache.hudi.hive.util.HiveSchemaUtil;
import org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent; import org.apache.hudi.sync.common.HoodieSyncClient;
import org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent.PartitionEventType; import org.apache.hudi.sync.common.HoodieSyncTool;
import org.apache.hudi.sync.common.AbstractSyncTool; import org.apache.hudi.sync.common.model.FieldSchema;
import org.apache.hudi.sync.common.model.Partition; import org.apache.hudi.sync.common.model.Partition;
import org.apache.hudi.sync.common.model.PartitionEvent;
import org.apache.hudi.sync.common.model.PartitionEvent.PartitionEventType;
import org.apache.hudi.sync.common.util.ConfigUtils;
import org.apache.hudi.sync.common.util.SparkDataSourceTableUtils;
import com.beust.jcommander.JCommander; import com.beust.jcommander.JCommander;
import org.apache.avro.Schema;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.log4j.LogManager; import org.apache.log4j.LogManager;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.MessageType;
import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Properties;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_AUTO_CREATE_DATABASE;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_IGNORE_EXCEPTIONS;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_AS_DATA_SOURCE_TABLE;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_COMMENT;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_TABLE_PROPERTIES;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_TABLE_SERDE_PROPERTIES;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USE_PRE_APACHE_INPUT_FORMAT;
import static org.apache.hudi.hive.HiveSyncConfigHolder.METASTORE_URIS;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_CONDITIONAL_SYNC;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_SPARK_VERSION;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME;
import static org.apache.hudi.sync.common.util.TableUtils.tableId;
/** /**
* Tool to sync a hoodie HDFS table with a hive metastore table. Either use it as an API * Tool to sync a hoodie HDFS table with a hive metastore table. Either use it as an API
* HiveSyncTool.syncHoodieTable(HiveSyncConfig) or from the command line: java -cp hoodie-hive-sync.jar HiveSyncTool [args] * HiveSyncTool.syncHoodieTable(HiveSyncConfig) or from the command line: java -cp hoodie-hive-sync.jar HiveSyncTool [args]
@@ -57,38 +73,34 @@ import java.util.stream.Collectors;
* partitions incrementally (all the partitions modified since the last commit) * partitions incrementally (all the partitions modified since the last commit)
*/ */
@SuppressWarnings("WeakerAccess") @SuppressWarnings("WeakerAccess")
public class HiveSyncTool extends AbstractSyncTool implements AutoCloseable { public class HiveSyncTool extends HoodieSyncTool implements AutoCloseable {
private static final Logger LOG = LogManager.getLogger(HiveSyncTool.class); private static final Logger LOG = LogManager.getLogger(HiveSyncTool.class);
public static final String SUFFIX_SNAPSHOT_TABLE = "_rt"; public static final String SUFFIX_SNAPSHOT_TABLE = "_rt";
public static final String SUFFIX_READ_OPTIMIZED_TABLE = "_ro"; public static final String SUFFIX_READ_OPTIMIZED_TABLE = "_ro";
protected HiveSyncConfig hiveSyncConfig; protected final HiveSyncConfig config;
protected AbstractHiveSyncHoodieClient hoodieHiveClient; protected final String databaseName;
protected String snapshotTableName = null; protected final String tableName;
protected Option<String> roTableName = null; protected HoodieSyncClient syncClient;
protected String snapshotTableName;
protected Option<String> roTableName;
public HiveSyncTool(TypedProperties props, Configuration conf, FileSystem fs) { public HiveSyncTool(Properties props, Configuration hadoopConf) {
this(new HiveSyncConfig(props), new HiveConf(conf, HiveConf.class), fs); super(props, hadoopConf);
HiveSyncConfig config = new HiveSyncConfig(props, hadoopConf);
this.config = config;
this.databaseName = config.getStringOrDefault(META_SYNC_DATABASE_NAME);
this.tableName = config.getString(META_SYNC_TABLE_NAME);
initSyncClient(config);
initTableNameVars(config);
} }
public HiveSyncTool(HiveSyncConfig hiveSyncConfig, HiveConf hiveConf, FileSystem fs) { protected void initSyncClient(HiveSyncConfig config) {
super(hiveSyncConfig.getProps(), hiveConf, fs);
// TODO: reconcile the way to set METASTOREURIS
if (StringUtils.isNullOrEmpty(hiveConf.get(HiveConf.ConfVars.METASTOREURIS.varname))) {
hiveConf.set(HiveConf.ConfVars.METASTOREURIS.varname, hiveSyncConfig.metastoreUris);
}
// HiveConf needs to load fs conf to allow instantiation via AWSGlueClientFactory
hiveConf.addResource(fs.getConf());
initClient(hiveSyncConfig, hiveConf);
initConfig(hiveSyncConfig);
}
protected void initClient(HiveSyncConfig hiveSyncConfig, HiveConf hiveConf) {
try { try {
this.hoodieHiveClient = new HoodieHiveClient(hiveSyncConfig, hiveConf, fs); this.syncClient = new HoodieHiveSyncClient(config);
} catch (RuntimeException e) { } catch (RuntimeException e) {
if (hiveSyncConfig.ignoreExceptions) { if (config.getBoolean(HIVE_IGNORE_EXCEPTIONS)) {
LOG.error("Got runtime exception when hive syncing, but continuing as ignoreExceptions config is set ", e); LOG.error("Got runtime exception when hive syncing, but continuing as ignoreExceptions config is set ", e);
} else { } else {
throw new HoodieHiveSyncException("Got runtime exception when hive syncing", e); throw new HoodieHiveSyncException("Got runtime exception when hive syncing", e);
@@ -96,28 +108,22 @@ public class HiveSyncTool extends AbstractSyncTool implements AutoCloseable {
} }
} }
private void initConfig(HiveSyncConfig hiveSyncConfig) { private void initTableNameVars(HiveSyncConfig config) {
// Set partitionFields to empty, when the NonPartitionedExtractor is used if (syncClient != null) {
// TODO: HiveSyncConfig should be responsible for inferring config value switch (syncClient.getTableType()) {
if (NonPartitionedExtractor.class.getName().equals(hiveSyncConfig.partitionValueExtractorClass)) {
LOG.warn("Set partitionFields to empty, since the NonPartitionedExtractor is used");
hiveSyncConfig.partitionFields = new ArrayList<>();
}
this.hiveSyncConfig = hiveSyncConfig;
if (hoodieHiveClient != null) {
switch (hoodieHiveClient.getTableType()) {
case COPY_ON_WRITE: case COPY_ON_WRITE:
this.snapshotTableName = hiveSyncConfig.tableName; this.snapshotTableName = tableName;
this.roTableName = Option.empty(); this.roTableName = Option.empty();
break; break;
case MERGE_ON_READ: case MERGE_ON_READ:
this.snapshotTableName = hiveSyncConfig.tableName + SUFFIX_SNAPSHOT_TABLE; this.snapshotTableName = tableName + SUFFIX_SNAPSHOT_TABLE;
this.roTableName = hiveSyncConfig.skipROSuffix ? Option.of(hiveSyncConfig.tableName) : this.roTableName = config.getBoolean(HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE)
Option.of(hiveSyncConfig.tableName + SUFFIX_READ_OPTIMIZED_TABLE); ? Option.of(tableName)
: Option.of(tableName + SUFFIX_READ_OPTIMIZED_TABLE);
break; break;
default: default:
LOG.error("Unknown table type " + hoodieHiveClient.getTableType()); LOG.error("Unknown table type " + syncClient.getTableType());
throw new InvalidTableException(hoodieHiveClient.getBasePath()); throw new InvalidTableException(syncClient.getBasePath());
} }
} }
} }
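// Illustrative naming: a MERGE_ON_READ table "stock_ticks" registers "stock_ticks_rt"
// (snapshot) and "stock_ticks_ro" (read optimized); COPY_ON_WRITE keeps the plain name,
// and HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE=true drops the "_ro" suffix.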
@@ -125,21 +131,23 @@ public class HiveSyncTool extends AbstractSyncTool implements AutoCloseable {
@Override @Override
public void syncHoodieTable() { public void syncHoodieTable() {
try { try {
if (hoodieHiveClient != null) { if (syncClient != null) {
LOG.info("Syncing target hoodie table with hive table(" + hiveSyncConfig.tableName + "). Hive metastore URL :" LOG.info("Syncing target hoodie table with hive table("
+ hiveSyncConfig.jdbcUrl + ", basePath :" + hiveSyncConfig.basePath); + tableId(databaseName, tableName) + "). Hive metastore URL :"
+ config.getString(METASTORE_URIS) + ", basePath :"
+ config.getString(META_SYNC_BASE_PATH));
doSync(); doSync();
} }
} catch (RuntimeException re) { } catch (RuntimeException re) {
throw new HoodieException("Got runtime exception when hive syncing " + hiveSyncConfig.tableName, re); throw new HoodieException("Got runtime exception when hive syncing " + tableName, re);
} finally { } finally {
close(); close();
} }
} }
protected void doSync() { protected void doSync() {
switch (hoodieHiveClient.getTableType()) { switch (syncClient.getTableType()) {
case COPY_ON_WRITE: case COPY_ON_WRITE:
syncHoodieTable(snapshotTableName, false, false); syncHoodieTable(snapshotTableName, false, false);
break; break;
@@ -150,61 +158,60 @@ public class HiveSyncTool extends AbstractSyncTool implements AutoCloseable {
syncHoodieTable(snapshotTableName, true, false); syncHoodieTable(snapshotTableName, true, false);
break; break;
default: default:
LOG.error("Unknown table type " + hoodieHiveClient.getTableType()); LOG.error("Unknown table type " + syncClient.getTableType());
throw new InvalidTableException(hoodieHiveClient.getBasePath()); throw new InvalidTableException(syncClient.getBasePath());
} }
} }
@Override @Override
public void close() { public void close() {
if (hoodieHiveClient != null) { if (syncClient != null) {
try { try {
hoodieHiveClient.close(); syncClient.close();
} catch (Exception e) { } catch (Exception e) {
throw new HoodieHiveSyncException("Fail to close sync client.", e); throw new HoodieHiveSyncException("Fail to close sync client.", e);
} }
} }
} }
protected void syncHoodieTable(String tableName, boolean useRealtimeInputFormat, protected void syncHoodieTable(String tableName, boolean useRealtimeInputFormat, boolean readAsOptimized) {
boolean readAsOptimized) { LOG.info("Trying to sync hoodie table " + tableName + " with base path " + syncClient.getBasePath()
LOG.info("Trying to sync hoodie table " + tableName + " with base path " + hoodieHiveClient.getBasePath() + " of type " + syncClient.getTableType());
+ " of type " + hoodieHiveClient.getTableType());
// check if the database exists else create it // check if the database exists else create it
if (hiveSyncConfig.autoCreateDatabase) { if (config.getBoolean(HIVE_AUTO_CREATE_DATABASE)) {
try { try {
if (!hoodieHiveClient.databaseExists(hiveSyncConfig.databaseName)) { if (!syncClient.databaseExists(databaseName)) {
hoodieHiveClient.createDatabase(hiveSyncConfig.databaseName); syncClient.createDatabase(databaseName);
} }
} catch (Exception e) { } catch (Exception e) {
// this is harmless since table creation will fail anyways, creation of DB is needed for in-memory testing // this is harmless since table creation will fail anyways, creation of DB is needed for in-memory testing
LOG.warn("Unable to create database", e); LOG.warn("Unable to create database", e);
} }
} else { } else {
if (!hoodieHiveClient.databaseExists(hiveSyncConfig.databaseName)) { if (!syncClient.databaseExists(databaseName)) {
LOG.error("Hive database does not exist " + hiveSyncConfig.databaseName); LOG.error("Hive database does not exist " + databaseName);
throw new HoodieHiveSyncException("hive database does not exist " + hiveSyncConfig.databaseName); throw new HoodieHiveSyncException("hive database does not exist " + databaseName);
} }
} }
// Check if the necessary table exists // Check if the necessary table exists
boolean tableExists = hoodieHiveClient.tableExists(tableName); boolean tableExists = syncClient.tableExists(tableName);
// check if isDropPartition // check if isDropPartition
boolean isDropPartition = hoodieHiveClient.isDropPartition(); boolean isDropPartition = syncClient.isDropPartition();
// Get the parquet schema for this table looking at the latest commit // Get the parquet schema for this table looking at the latest commit
MessageType schema = hoodieHiveClient.getDataSchema(); MessageType schema = syncClient.getStorageSchema();
// Currently HoodieBootstrapRelation does not support reading bootstrap MOR rt table, // Currently HoodieBootstrapRelation does not support reading bootstrap MOR rt table,
// so we disable the syncAsSparkDataSourceTable here to avoid reading such a table // so we disable the syncAsSparkDataSourceTable here to avoid reading such a table
// by the data source way (which will use the HoodieBootstrapRelation). // by the data source way (which will use the HoodieBootstrapRelation).
// TODO after we support bootstrap MOR rt table in HoodieBootstrapRelation[HUDI-2071], we can remove this logic. // TODO after we support bootstrap MOR rt table in HoodieBootstrapRelation[HUDI-2071], we can remove this logic.
if (hoodieHiveClient.isBootstrap() if (syncClient.isBootstrap()
&& hoodieHiveClient.getTableType() == HoodieTableType.MERGE_ON_READ && syncClient.getTableType() == HoodieTableType.MERGE_ON_READ
&& !readAsOptimized) { && !readAsOptimized) {
hiveSyncConfig.syncAsSparkDataSourceTable = false; config.setValue(HIVE_SYNC_AS_DATA_SOURCE_TABLE, "false");
} }
// Sync schema if needed // Sync schema if needed
@@ -214,17 +221,17 @@ public class HiveSyncTool extends AbstractSyncTool implements AutoCloseable {
// Get the last time we successfully synced partitions // Get the last time we successfully synced partitions
Option<String> lastCommitTimeSynced = Option.empty(); Option<String> lastCommitTimeSynced = Option.empty();
if (tableExists) { if (tableExists) {
lastCommitTimeSynced = hoodieHiveClient.getLastCommitTimeSynced(tableName); lastCommitTimeSynced = syncClient.getLastCommitTimeSynced(tableName);
} }
LOG.info("Last commit time synced was found to be " + lastCommitTimeSynced.orElse("null")); LOG.info("Last commit time synced was found to be " + lastCommitTimeSynced.orElse("null"));
List<String> writtenPartitionsSince = hoodieHiveClient.getPartitionsWrittenToSince(lastCommitTimeSynced); List<String> writtenPartitionsSince = syncClient.getPartitionsWrittenToSince(lastCommitTimeSynced);
LOG.info("Storage partitions scan complete. Found " + writtenPartitionsSince.size()); LOG.info("Storage partitions scan complete. Found " + writtenPartitionsSince.size());
// Sync the partitions if needed // Sync the partitions if needed
boolean partitionsChanged = syncPartitions(tableName, writtenPartitionsSince, isDropPartition); boolean partitionsChanged = syncPartitions(tableName, writtenPartitionsSince, isDropPartition);
boolean meetSyncConditions = schemaChanged || partitionsChanged; boolean meetSyncConditions = schemaChanged || partitionsChanged;
if (!hiveSyncConfig.isConditionalSync || meetSyncConditions) { if (!config.getBoolean(META_SYNC_CONDITIONAL_SYNC) || meetSyncConditions) {
hoodieHiveClient.updateLastCommitTimeSynced(tableName); syncClient.updateLastCommitTimeSynced(tableName);
} }
LOG.info("Sync complete for " + tableName); LOG.info("Sync complete for " + tableName);
} }
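// Illustrative: with META_SYNC_CONDITIONAL_SYNC=true, updateLastCommitTimeSynced runs
// only when this round actually changed the schema or the partition set; otherwise the
// last synced commit time is left untouched.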
@@ -233,18 +240,18 @@ public class HiveSyncTool extends AbstractSyncTool implements AutoCloseable {
* Get the latest schema from the last commit and check if its in sync with the hive table schema. If not, evolves the * Get the latest schema from the last commit and check if its in sync with the hive table schema. If not, evolves the
* table schema. * table schema.
* *
* @param tableExists - does table exist * @param tableExists does table exist
* @param schema - extracted schema * @param schema extracted schema
*/ */
private boolean syncSchema(String tableName, boolean tableExists, boolean useRealTimeInputFormat, private boolean syncSchema(String tableName, boolean tableExists, boolean useRealTimeInputFormat,
boolean readAsOptimized, MessageType schema) { boolean readAsOptimized, MessageType schema) {
// Append spark table properties & serde properties // Append spark table properties & serde properties
Map<String, String> tableProperties = ConfigUtils.toMap(hiveSyncConfig.tableProperties); Map<String, String> tableProperties = ConfigUtils.toMap(config.getString(HIVE_TABLE_PROPERTIES));
Map<String, String> serdeProperties = ConfigUtils.toMap(hiveSyncConfig.serdeProperties); Map<String, String> serdeProperties = ConfigUtils.toMap(config.getString(HIVE_TABLE_SERDE_PROPERTIES));
if (hiveSyncConfig.syncAsSparkDataSourceTable) { if (config.getBoolean(HIVE_SYNC_AS_DATA_SOURCE_TABLE)) {
Map<String, String> sparkTableProperties = getSparkTableProperties(hiveSyncConfig.partitionFields, Map<String, String> sparkTableProperties = SparkDataSourceTableUtils.getSparkTableProperties(config.getSplitStrings(META_SYNC_PARTITION_FIELDS),
hiveSyncConfig.sparkVersion, hiveSyncConfig.sparkSchemaLengthThreshold, schema); config.getStringOrDefault(META_SYNC_SPARK_VERSION), config.getIntOrDefault(HIVE_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD), schema);
Map<String, String> sparkSerdeProperties = getSparkSerdeProperties(readAsOptimized, hiveSyncConfig.basePath); Map<String, String> sparkSerdeProperties = SparkDataSourceTableUtils.getSparkSerdeProperties(readAsOptimized, config.getString(META_SYNC_BASE_PATH));
tableProperties.putAll(sparkTableProperties); tableProperties.putAll(sparkTableProperties);
serdeProperties.putAll(sparkSerdeProperties); serdeProperties.putAll(sparkSerdeProperties);
} }
@@ -252,10 +259,10 @@ public class HiveSyncTool extends AbstractSyncTool implements AutoCloseable {
// Check and sync schema // Check and sync schema
if (!tableExists) { if (!tableExists) {
LOG.info("Hive table " + tableName + " is not found. Creating it"); LOG.info("Hive table " + tableName + " is not found. Creating it");
HoodieFileFormat baseFileFormat = HoodieFileFormat.valueOf(hiveSyncConfig.baseFileFormat.toUpperCase()); HoodieFileFormat baseFileFormat = HoodieFileFormat.valueOf(config.getStringOrDefault(META_SYNC_BASE_FILE_FORMAT).toUpperCase());
String inputFormatClassName = HoodieInputFormatUtils.getInputFormatClassName(baseFileFormat, useRealTimeInputFormat); String inputFormatClassName = HoodieInputFormatUtils.getInputFormatClassName(baseFileFormat, useRealTimeInputFormat);
if (baseFileFormat.equals(HoodieFileFormat.PARQUET) && hiveSyncConfig.usePreApacheInputFormat) { if (baseFileFormat.equals(HoodieFileFormat.PARQUET) && config.getBooleanOrDefault(HIVE_USE_PRE_APACHE_INPUT_FORMAT)) {
// Parquet input format had an InputFormat class visible under the old naming scheme. // Parquet input format had an InputFormat class visible under the old naming scheme.
inputFormatClassName = useRealTimeInputFormat inputFormatClassName = useRealTimeInputFormat
? com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat.class.getName() ? com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat.class.getName()
@@ -268,19 +275,20 @@ public class HiveSyncTool extends AbstractSyncTool implements AutoCloseable {
// Custom serde will not work with ALTER TABLE REPLACE COLUMNS // Custom serde will not work with ALTER TABLE REPLACE COLUMNS
// https://github.com/apache/hive/blob/release-1.1.0/ql/src/java/org/apache/hadoop/hive // https://github.com/apache/hive/blob/release-1.1.0/ql/src/java/org/apache/hadoop/hive
// /ql/exec/DDLTask.java#L3488 // /ql/exec/DDLTask.java#L3488
hoodieHiveClient.createTable(tableName, schema, inputFormatClassName, syncClient.createTable(tableName, schema, inputFormatClassName,
outputFormatClassName, serDeFormatClassName, serdeProperties, tableProperties); outputFormatClassName, serDeFormatClassName, serdeProperties, tableProperties);
schemaChanged = true; schemaChanged = true;
} else { } else {
// Check if the table schema has evolved // Check if the table schema has evolved
Map<String, String> tableSchema = hoodieHiveClient.getTableSchema(tableName); Map<String, String> tableSchema = syncClient.getMetastoreSchema(tableName);
SchemaDifference schemaDiff = HiveSchemaUtil.getSchemaDifference(schema, tableSchema, hiveSyncConfig.partitionFields, hiveSyncConfig.supportTimestamp); SchemaDifference schemaDiff = HiveSchemaUtil.getSchemaDifference(schema, tableSchema, config.getSplitStrings(META_SYNC_PARTITION_FIELDS),
config.getBooleanOrDefault(HIVE_SUPPORT_TIMESTAMP_TYPE));
if (!schemaDiff.isEmpty()) { if (!schemaDiff.isEmpty()) {
LOG.info("Schema difference found for " + tableName); LOG.info("Schema difference found for " + tableName);
hoodieHiveClient.updateTableDefinition(tableName, schema); syncClient.updateTableSchema(tableName, schema);
// Sync the table properties if the schema has changed // Sync the table properties if the schema has changed
if (hiveSyncConfig.tableProperties != null || hiveSyncConfig.syncAsSparkDataSourceTable) { if (config.getString(HIVE_TABLE_PROPERTIES) != null || config.getBoolean(HIVE_SYNC_AS_DATA_SOURCE_TABLE)) {
hoodieHiveClient.updateTableProperties(tableName, tableProperties); syncClient.updateTableProperties(tableName, tableProperties);
LOG.info("Sync table properties for " + tableName + ", table properties is: " + tableProperties); LOG.info("Sync table properties for " + tableName + ", table properties is: " + tableProperties);
} }
schemaChanged = true; schemaChanged = true;
@@ -289,17 +297,10 @@ public class HiveSyncTool extends AbstractSyncTool implements AutoCloseable {
} }
} }
if (hiveSyncConfig.syncComment) { if (config.getBoolean(HIVE_SYNC_COMMENT)) {
Schema avroSchemaWithoutMetadataFields = hoodieHiveClient.getAvroSchemaWithoutMetadataFields(); List<FieldSchema> fromMetastore = syncClient.getMetastoreFieldSchemas(tableName);
Map<String, String> newComments = avroSchemaWithoutMetadataFields.getFields() List<FieldSchema> fromStorage = syncClient.getStorageFieldSchemas();
.stream().collect(Collectors.toMap(Schema.Field::name, field -> StringUtils.isNullOrEmpty(field.doc()) ? "" : field.doc())); syncClient.updateTableComments(tableName, fromMetastore, fromStorage);
boolean allEmpty = newComments.values().stream().allMatch(StringUtils::isNullOrEmpty);
if (!allEmpty) {
List<FieldSchema> hiveSchema = hoodieHiveClient.getTableCommentUsingMetastoreClient(tableName);
hoodieHiveClient.updateTableComments(tableName, hiveSchema, avroSchemaWithoutMetadataFields.getFields());
} else {
LOG.info(String.format("No comment %s need to add", tableName));
}
} }
return schemaChanged; return schemaChanged;
} }
@@ -311,26 +312,26 @@ public class HiveSyncTool extends AbstractSyncTool implements AutoCloseable {
private boolean syncPartitions(String tableName, List<String> writtenPartitionsSince, boolean isDropPartition) { private boolean syncPartitions(String tableName, List<String> writtenPartitionsSince, boolean isDropPartition) {
boolean partitionsChanged; boolean partitionsChanged;
try { try {
List<Partition> hivePartitions = hoodieHiveClient.getAllPartitions(tableName); List<Partition> hivePartitions = syncClient.getAllPartitions(tableName);
List<PartitionEvent> partitionEvents = List<PartitionEvent> partitionEvents =
hoodieHiveClient.getPartitionEvents(hivePartitions, writtenPartitionsSince, isDropPartition); syncClient.getPartitionEvents(hivePartitions, writtenPartitionsSince, isDropPartition);
List<String> newPartitions = filterPartitions(partitionEvents, PartitionEventType.ADD); List<String> newPartitions = filterPartitions(partitionEvents, PartitionEventType.ADD);
if (!newPartitions.isEmpty()) { if (!newPartitions.isEmpty()) {
LOG.info("New Partitions " + newPartitions); LOG.info("New Partitions " + newPartitions);
hoodieHiveClient.addPartitionsToTable(tableName, newPartitions); syncClient.addPartitionsToTable(tableName, newPartitions);
} }
List<String> updatePartitions = filterPartitions(partitionEvents, PartitionEventType.UPDATE); List<String> updatePartitions = filterPartitions(partitionEvents, PartitionEventType.UPDATE);
if (!updatePartitions.isEmpty()) { if (!updatePartitions.isEmpty()) {
LOG.info("Changed Partitions " + updatePartitions); LOG.info("Changed Partitions " + updatePartitions);
hoodieHiveClient.updatePartitionsToTable(tableName, updatePartitions); syncClient.updatePartitionsToTable(tableName, updatePartitions);
} }
List<String> dropPartitions = filterPartitions(partitionEvents, PartitionEventType.DROP); List<String> dropPartitions = filterPartitions(partitionEvents, PartitionEventType.DROP);
if (!dropPartitions.isEmpty()) { if (!dropPartitions.isEmpty()) {
LOG.info("Drop Partitions " + dropPartitions); LOG.info("Drop Partitions " + dropPartitions);
hoodieHiveClient.dropPartitions(tableName, dropPartitions); syncClient.dropPartitions(tableName, dropPartitions);
} }
partitionsChanged = !updatePartitions.isEmpty() || !newPartitions.isEmpty() || !dropPartitions.isEmpty(); partitionsChanged = !updatePartitions.isEmpty() || !newPartitions.isEmpty() || !dropPartitions.isEmpty();
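syncPartitions above reduces all metastore changes to three event types and dispatches each non-empty batch exactly once. A simplified sketch of that event filtering, with a stand-in for hudi-sync-common's PartitionEvent model:

import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

public class PartitionEventSketch {

  enum EventType { ADD, UPDATE, DROP }

  // Simplified stand-in for the real PartitionEvent class.
  static final class PartitionEvent {
    final EventType type;
    final String storagePartition;
    PartitionEvent(EventType type, String storagePartition) {
      this.type = type;
      this.storagePartition = storagePartition;
    }
  }

  // Mirrors filterPartitions(...) in the method above: one pass per event type.
  static List<String> filterPartitions(List<PartitionEvent> events, EventType type) {
    return events.stream()
        .filter(e -> e.type == type)
        .map(e -> e.storagePartition)
        .collect(Collectors.toList());
  }

  public static void main(String[] args) {
    List<PartitionEvent> events = Arrays.asList(
        new PartitionEvent(EventType.ADD, "2022/07/01"),
        new PartitionEvent(EventType.UPDATE, "2022/06/30"),
        new PartitionEvent(EventType.DROP, "2022/06/01"));
    System.out.println("add: " + filterPartitions(events, EventType.ADD));
    System.out.println("update: " + filterPartitions(events, EventType.UPDATE));
    System.out.println("drop: " + filterPartitions(events, EventType.DROP));
  }
}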
@@ -346,16 +347,13 @@ public class HiveSyncTool extends AbstractSyncTool implements AutoCloseable {
} }
public static void main(String[] args) { public static void main(String[] args) {
// parse the params final HiveSyncConfig.HiveSyncConfigParams params = new HiveSyncConfig.HiveSyncConfigParams();
final HiveSyncConfig cfg = new HiveSyncConfig(); JCommander cmd = JCommander.newBuilder().addObject(params).build();
JCommander cmd = new JCommander(cfg, null, args); cmd.parse(args);
if (cfg.help || args.length == 0) { if (params.isHelp()) {
cmd.usage(); cmd.usage();
System.exit(1); System.exit(0);
} }
FileSystem fs = FSUtils.getFs(cfg.basePath, new Configuration()); new HiveSyncTool(params.toProps(), new Configuration()).syncHoodieTable();
HiveConf hiveConf = new HiveConf();
hiveConf.addResource(fs.getConf());
new HiveSyncTool(cfg, hiveConf, fs).syncHoodieTable();
} }
} }
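The new entry point parses CLI arguments into a params object and converts them to Properties, instead of constructing the tool from a pre-built FileSystem and HiveConf. It also exits 0 on --help rather than 1. A sketch of that pattern, assuming a hypothetical Params class and property key in place of HiveSyncConfig.HiveSyncConfigParams:

import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;

import java.util.Properties;

public class SyncToolMainSketch {

  // Hypothetical stand-in for HiveSyncConfig.HiveSyncConfigParams.
  static class Params {
    @Parameter(names = {"--base-path"}, description = "Base path of the Hudi table")
    String basePath;

    @Parameter(names = {"--help", "-h"}, help = true)
    boolean help = false;

    boolean isHelp() {
      return help;
    }

    Properties toProps() {
      Properties props = new Properties();
      if (basePath != null) {
        props.setProperty("meta.sync.base.path", basePath); // illustrative key name
      }
      return props;
    }
  }

  public static void main(String[] args) {
    Params params = new Params();
    JCommander cmd = JCommander.newBuilder().addObject(params).build();
    cmd.parse(args);
    if (params.isHelp()) {
      cmd.usage();
      System.exit(0); // asking for help is a normal exit now, not exit(1)
    }
    // The real tool would hand params.toProps() plus a Hadoop Configuration
    // to HiveSyncTool and call syncHoodieTable().
    System.out.println("would sync with properties: " + params.toProps());
  }
}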
@@ -18,22 +18,21 @@
package org.apache.hudi.hive; package org.apache.hudi.hive;
import org.apache.hudi.common.table.TableSchemaResolver;
import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.common.util.collection.ImmutablePair; import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.hive.ddl.DDLExecutor; import org.apache.hudi.hive.ddl.DDLExecutor;
import org.apache.hudi.hive.ddl.HMSDDLExecutor; import org.apache.hudi.hive.ddl.HMSDDLExecutor;
import org.apache.hudi.hive.ddl.HiveQueryDDLExecutor; import org.apache.hudi.hive.ddl.HiveQueryDDLExecutor;
import org.apache.hudi.hive.ddl.HiveSyncMode; import org.apache.hudi.hive.ddl.HiveSyncMode;
import org.apache.hudi.hive.ddl.JDBCExecutor; import org.apache.hudi.hive.ddl.JDBCExecutor;
import org.apache.hudi.sync.common.HoodieSyncClient;
import org.apache.hudi.sync.common.model.FieldSchema;
import org.apache.hudi.sync.common.model.Partition; import org.apache.hudi.sync.common.model.Partition;
import org.apache.avro.Schema;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.IMetaStoreClient; import org.apache.hadoop.hive.metastore.IMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.NoSuchObjectException; import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
import org.apache.hadoop.hive.metastore.api.Table; import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.ql.metadata.Hive; import org.apache.hadoop.hive.ql.metadata.Hive;
@@ -49,115 +48,100 @@ import java.util.Map;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import static org.apache.hudi.hadoop.utils.HoodieHiveUtils.GLOBALLY_CONSISTENT_READ_TIMESTAMP; import static org.apache.hudi.hadoop.utils.HoodieHiveUtils.GLOBALLY_CONSISTENT_READ_TIMESTAMP;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_MODE;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USE_JDBC;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
import static org.apache.hudi.sync.common.util.TableUtils.tableId; import static org.apache.hudi.sync.common.util.TableUtils.tableId;
/** /**
* This class implements logic to sync a Hudi table with either the Hive server or the Hive Metastore. * This class implements logic to sync a Hudi table with either the Hive server or the Hive Metastore.
*/ */
public class HoodieHiveClient extends AbstractHiveSyncHoodieClient { public class HoodieHiveSyncClient extends HoodieSyncClient {
private static final Logger LOG = LogManager.getLogger(HoodieHiveClient.class); private static final Logger LOG = LogManager.getLogger(HoodieHiveSyncClient.class);
protected final HiveSyncConfig config;
private final String databaseName;
DDLExecutor ddlExecutor; DDLExecutor ddlExecutor;
private IMetaStoreClient client; private IMetaStoreClient client;
public HoodieHiveClient(HiveSyncConfig cfg, HiveConf configuration, FileSystem fs) { public HoodieHiveSyncClient(HiveSyncConfig config) {
super(cfg, configuration, fs); super(config);
this.config = config;
this.databaseName = config.getStringOrDefault(META_SYNC_DATABASE_NAME);
// Support JDBC, HiveQL and metastore based implementations for backwards compatibility. Future users should // Support JDBC, HiveQL and metastore based implementations for backwards compatibility. Future users should
// disable jdbc and depend on metastore client for all hive registrations // disable jdbc and depend on metastore client for all hive registrations
try { try {
if (!StringUtils.isNullOrEmpty(cfg.syncMode)) { if (!StringUtils.isNullOrEmpty(config.getString(HIVE_SYNC_MODE))) {
HiveSyncMode syncMode = HiveSyncMode.of(cfg.syncMode); HiveSyncMode syncMode = HiveSyncMode.of(config.getString(HIVE_SYNC_MODE));
switch (syncMode) { switch (syncMode) {
case HMS: case HMS:
ddlExecutor = new HMSDDLExecutor(configuration, cfg, fs); ddlExecutor = new HMSDDLExecutor(config);
break; break;
case HIVEQL: case HIVEQL:
ddlExecutor = new HiveQueryDDLExecutor(cfg, fs, configuration); ddlExecutor = new HiveQueryDDLExecutor(config);
break; break;
case JDBC: case JDBC:
ddlExecutor = new JDBCExecutor(cfg, fs); ddlExecutor = new JDBCExecutor(config);
break; break;
default: default:
throw new HoodieHiveSyncException("Invalid sync mode given " + cfg.syncMode); throw new HoodieHiveSyncException("Invalid sync mode given " + config.getString(HIVE_SYNC_MODE));
} }
} else { } else {
ddlExecutor = cfg.useJdbc ? new JDBCExecutor(cfg, fs) : new HiveQueryDDLExecutor(cfg, fs, configuration); ddlExecutor = config.getBoolean(HIVE_USE_JDBC) ? new JDBCExecutor(config) : new HiveQueryDDLExecutor(config);
} }
this.client = Hive.get(configuration).getMSC(); this.client = Hive.get(config.getHiveConf()).getMSC();
} catch (Exception e) { } catch (Exception e) {
throw new HoodieHiveSyncException("Failed to create HiveMetaStoreClient", e); throw new HoodieHiveSyncException("Failed to create HiveMetaStoreClient", e);
} }
} }
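The constructor above picks a DDL executor from the HIVE_SYNC_MODE setting when one is given, and only falls back to the legacy useJdbc flag otherwise. A compact sketch of that selection logic, with placeholder executors instead of the real DDLExecutor hierarchy:

public class SyncModeSketch {

  enum HiveSyncMode {
    HMS, HIVEQL, JDBC;
    static HiveSyncMode of(String name) {
      return valueOf(name.trim().toUpperCase());
    }
  }

  // Placeholder for the real DDLExecutor hierarchy.
  interface DdlExecutor {
    String name();
  }

  static DdlExecutor choose(String syncMode, boolean useJdbc) {
    if (syncMode != null && !syncMode.isEmpty()) {
      switch (HiveSyncMode.of(syncMode)) {
        case HMS:
          return () -> "HMSDDLExecutor";
        case HIVEQL:
          return () -> "HiveQueryDDLExecutor";
        case JDBC:
          return () -> "JDBCExecutor";
        default:
          throw new IllegalArgumentException("Invalid sync mode given " + syncMode);
      }
    }
    // Backwards compatibility: honor the old boolean flag.
    DdlExecutor fallback = useJdbc ? () -> "JDBCExecutor" : () -> "HiveQueryDDLExecutor";
    return fallback;
  }

  public static void main(String[] args) {
    System.out.println(choose("hms", false).name()); // HMSDDLExecutor
    System.out.println(choose(null, true).name());   // JDBCExecutor
  }
}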
/**
* Add the (NEW) partitions to the table.
*/
@Override @Override
public void addPartitionsToTable(String tableName, List<String> partitionsToAdd) { public void addPartitionsToTable(String tableName, List<String> partitionsToAdd) {
ddlExecutor.addPartitionsToTable(tableName, partitionsToAdd); ddlExecutor.addPartitionsToTable(tableName, partitionsToAdd);
} }
/**
* Partition path has changed - update the path for the following partitions.
*/
@Override @Override
public void updatePartitionsToTable(String tableName, List<String> changedPartitions) { public void updatePartitionsToTable(String tableName, List<String> changedPartitions) {
ddlExecutor.updatePartitionsToTable(tableName, changedPartitions); ddlExecutor.updatePartitionsToTable(tableName, changedPartitions);
} }
/**
* Partition path has changed - drop the following partitions.
*/
@Override @Override
public void dropPartitions(String tableName, List<String> partitionsToDrop) { public void dropPartitions(String tableName, List<String> partitionsToDrop) {
ddlExecutor.dropPartitionsToTable(tableName, partitionsToDrop); ddlExecutor.dropPartitionsToTable(tableName, partitionsToDrop);
} }
/**
* Update the table properties to the table.
*/
@Override @Override
public void updateTableProperties(String tableName, Map<String, String> tableProperties) { public void updateTableProperties(String tableName, Map<String, String> tableProperties) {
if (tableProperties == null || tableProperties.isEmpty()) { if (tableProperties == null || tableProperties.isEmpty()) {
return; return;
} }
try { try {
Table table = client.getTable(syncConfig.databaseName, tableName); Table table = client.getTable(databaseName, tableName);
for (Map.Entry<String, String> entry : tableProperties.entrySet()) { for (Map.Entry<String, String> entry : tableProperties.entrySet()) {
table.putToParameters(entry.getKey(), entry.getValue()); table.putToParameters(entry.getKey(), entry.getValue());
} }
client.alter_table(syncConfig.databaseName, tableName, table); client.alter_table(databaseName, tableName, table);
} catch (Exception e) { } catch (Exception e) {
throw new HoodieHiveSyncException("Failed to update table properties for table: " throw new HoodieHiveSyncException("Failed to update table properties for table: "
+ tableName, e); + tableName, e);
} }
} }
/**
* Scan table partitions.
*
* @deprecated Use {@link #getAllPartitions} instead.
*/
@Deprecated
public List<org.apache.hadoop.hive.metastore.api.Partition> scanTablePartitions(String tableName) throws TException {
return client.listPartitions(syncConfig.databaseName, tableName, (short) -1);
}
@Override @Override
public void updateTableDefinition(String tableName, MessageType newSchema) { public void updateTableSchema(String tableName, MessageType newSchema) {
ddlExecutor.updateTableDefinition(tableName, newSchema); ddlExecutor.updateTableDefinition(tableName, newSchema);
} }
@Override @Override
public List<Partition> getAllPartitions(String tableName) { public List<Partition> getAllPartitions(String tableName) {
try { try {
return client.listPartitions(syncConfig.databaseName, tableName, (short) -1) return client.listPartitions(databaseName, tableName, (short) -1)
.stream() .stream()
.map(p -> new Partition(p.getValues(), p.getSd().getLocation())) .map(p -> new Partition(p.getValues(), p.getSd().getLocation()))
.collect(Collectors.toList()); .collect(Collectors.toList());
} catch (TException e) { } catch (TException e) {
throw new HoodieHiveSyncException("Failed to get all partitions for table " + tableId(syncConfig.databaseName, tableName), e); throw new HoodieHiveSyncException("Failed to get all partitions for table " + tableId(databaseName, tableName), e);
} }
} }
@@ -168,11 +152,8 @@ public class HoodieHiveClient extends AbstractHiveSyncHoodieClient {
ddlExecutor.createTable(tableName, storageSchema, inputFormatClass, outputFormatClass, serdeClass, serdeProperties, tableProperties); ddlExecutor.createTable(tableName, storageSchema, inputFormatClass, outputFormatClass, serdeClass, serdeProperties, tableProperties);
} }
/**
* Get the table schema.
*/
@Override @Override
public Map<String, String> getTableSchema(String tableName) { public Map<String, String> getMetastoreSchema(String tableName) {
if (!tableExists(tableName)) { if (!tableExists(tableName)) {
throw new IllegalArgumentException( throw new IllegalArgumentException(
"Failed to get schema for table " + tableName + " does not exist"); "Failed to get schema for table " + tableName + " does not exist");
@@ -180,26 +161,15 @@ public class HoodieHiveClient extends AbstractHiveSyncHoodieClient {
return ddlExecutor.getTableSchema(tableName); return ddlExecutor.getTableSchema(tableName);
} }
@Deprecated
@Override
public boolean doesTableExist(String tableName) {
return tableExists(tableName);
}
@Override @Override
public boolean tableExists(String tableName) { public boolean tableExists(String tableName) {
try { try {
return client.tableExists(syncConfig.databaseName, tableName); return client.tableExists(databaseName, tableName);
} catch (TException e) { } catch (TException e) {
throw new HoodieHiveSyncException("Failed to check if table exists " + tableName, e); throw new HoodieHiveSyncException("Failed to check if table exists " + tableName, e);
} }
} }
@Deprecated
public boolean doesDataBaseExist(String databaseName) {
return databaseExists(databaseName);
}
@Override @Override
public boolean databaseExists(String databaseName) { public boolean databaseExists(String databaseName) {
try { try {
@@ -222,7 +192,7 @@ public class HoodieHiveClient extends AbstractHiveSyncHoodieClient {
public Option<String> getLastCommitTimeSynced(String tableName) { public Option<String> getLastCommitTimeSynced(String tableName) {
// Get the last commit time from the TBLproperties // Get the last commit time from the TBLproperties
try { try {
Table table = client.getTable(syncConfig.databaseName, tableName); Table table = client.getTable(databaseName, tableName);
return Option.ofNullable(table.getParameters().getOrDefault(HOODIE_LAST_COMMIT_TIME_SYNC, null)); return Option.ofNullable(table.getParameters().getOrDefault(HOODIE_LAST_COMMIT_TIME_SYNC, null));
} catch (Exception e) { } catch (Exception e) {
throw new HoodieHiveSyncException("Failed to get the last commit time synced from the table " + tableName, e); throw new HoodieHiveSyncException("Failed to get the last commit time synced from the table " + tableName, e);
@@ -232,10 +202,10 @@ public class HoodieHiveClient extends AbstractHiveSyncHoodieClient {
public Option<String> getLastReplicatedTime(String tableName) { public Option<String> getLastReplicatedTime(String tableName) {
// Get the last replicated time from the TBLproperties // Get the last replicated time from the TBLproperties
try { try {
Table table = client.getTable(syncConfig.databaseName, tableName); Table table = client.getTable(databaseName, tableName);
return Option.ofNullable(table.getParameters().getOrDefault(GLOBALLY_CONSISTENT_READ_TIMESTAMP, null)); return Option.ofNullable(table.getParameters().getOrDefault(GLOBALLY_CONSISTENT_READ_TIMESTAMP, null));
} catch (NoSuchObjectException e) { } catch (NoSuchObjectException e) {
LOG.warn("the said table not found in hms " + syncConfig.databaseName + "." + tableName); LOG.warn("the said table not found in hms " + tableId(databaseName, tableName));
return Option.empty(); return Option.empty();
} catch (Exception e) { } catch (Exception e) {
throw new HoodieHiveSyncException("Failed to get the last replicated time from the table " + tableName, e); throw new HoodieHiveSyncException("Failed to get the last replicated time from the table " + tableName, e);
@@ -243,15 +213,14 @@ public class HoodieHiveClient extends AbstractHiveSyncHoodieClient {
} }
public void updateLastReplicatedTimeStamp(String tableName, String timeStamp) { public void updateLastReplicatedTimeStamp(String tableName, String timeStamp) {
if (!activeTimeline.filterCompletedInstants().getInstants() if (getActiveTimeline().getInstants().noneMatch(i -> i.getTimestamp().equals(timeStamp))) {
.anyMatch(i -> i.getTimestamp().equals(timeStamp))) {
throw new HoodieHiveSyncException( throw new HoodieHiveSyncException(
"Not a valid completed timestamp " + timeStamp + " for table " + tableName); "Not a valid completed timestamp " + timeStamp + " for table " + tableName);
} }
try { try {
Table table = client.getTable(syncConfig.databaseName, tableName); Table table = client.getTable(databaseName, tableName);
table.putToParameters(GLOBALLY_CONSISTENT_READ_TIMESTAMP, timeStamp); table.putToParameters(GLOBALLY_CONSISTENT_READ_TIMESTAMP, timeStamp);
client.alter_table(syncConfig.databaseName, tableName, table); client.alter_table(databaseName, tableName, table);
} catch (Exception e) { } catch (Exception e) {
throw new HoodieHiveSyncException( throw new HoodieHiveSyncException(
"Failed to update last replicated time to " + timeStamp + " for " + tableName, e); "Failed to update last replicated time to " + timeStamp + " for " + tableName, e);
@@ -260,9 +229,9 @@ public class HoodieHiveClient extends AbstractHiveSyncHoodieClient {
public void deleteLastReplicatedTimeStamp(String tableName) { public void deleteLastReplicatedTimeStamp(String tableName) {
try { try {
Table table = client.getTable(syncConfig.databaseName, tableName); Table table = client.getTable(databaseName, tableName);
String timestamp = table.getParameters().remove(GLOBALLY_CONSISTENT_READ_TIMESTAMP); String timestamp = table.getParameters().remove(GLOBALLY_CONSISTENT_READ_TIMESTAMP);
client.alter_table(syncConfig.databaseName, tableName, table); client.alter_table(databaseName, tableName, table);
if (timestamp != null) { if (timestamp != null) {
LOG.info("deleted last replicated timestamp " + timestamp + " for table " + tableName); LOG.info("deleted last replicated timestamp " + timestamp + " for table " + tableName);
} }
@@ -290,12 +259,12 @@ public class HoodieHiveClient extends AbstractHiveSyncHoodieClient {
@Override @Override
public void updateLastCommitTimeSynced(String tableName) { public void updateLastCommitTimeSynced(String tableName) {
// Set the last commit time from the TBLproperties // Set the last commit time from the TBLproperties
Option<String> lastCommitSynced = activeTimeline.lastInstant().map(HoodieInstant::getTimestamp); Option<String> lastCommitSynced = getActiveTimeline().lastInstant().map(HoodieInstant::getTimestamp);
if (lastCommitSynced.isPresent()) { if (lastCommitSynced.isPresent()) {
try { try {
Table table = client.getTable(syncConfig.databaseName, tableName); Table table = client.getTable(databaseName, tableName);
table.putToParameters(HOODIE_LAST_COMMIT_TIME_SYNC, lastCommitSynced.get()); table.putToParameters(HOODIE_LAST_COMMIT_TIME_SYNC, lastCommitSynced.get());
client.alter_table(syncConfig.databaseName, tableName, table); client.alter_table(databaseName, tableName, table);
} catch (Exception e) { } catch (Exception e) {
throw new HoodieHiveSyncException("Failed to get update last commit time synced to " + lastCommitSynced, e); throw new HoodieHiveSyncException("Failed to get update last commit time synced to " + lastCommitSynced, e);
} }
@@ -303,36 +272,48 @@ public class HoodieHiveClient extends AbstractHiveSyncHoodieClient {
} }
@Override @Override
public List<FieldSchema> getTableCommentUsingMetastoreClient(String tableName) { public List<FieldSchema> getMetastoreFieldSchemas(String tableName) {
try { try {
return client.getSchema(syncConfig.databaseName, tableName); return client.getSchema(databaseName, tableName)
.stream()
.map(f -> new FieldSchema(f.getName(), f.getType(), f.getComment()))
.collect(Collectors.toList());
} catch (Exception e) { } catch (Exception e) {
throw new HoodieHiveSyncException("Failed to get table comments for : " + tableName, e); throw new HoodieHiveSyncException("Failed to get field schemas from metastore for table : " + tableName, e);
} }
} }
@Override @Override
public void updateTableComments(String tableName, List<FieldSchema> oldSchema, List<Schema.Field> newSchema) { public List<FieldSchema> getStorageFieldSchemas() {
Map<String,String> newComments = newSchema.stream().collect(Collectors.toMap(field -> field.name().toLowerCase(Locale.ROOT), field -> StringUtils.isNullOrEmpty(field.doc()) ? "" : field.doc())); try {
updateTableComments(tableName,oldSchema,newComments); return new TableSchemaResolver(metaClient).getTableAvroSchema(false)
.getFields()
.stream()
.map(f -> new FieldSchema(f.name(), f.schema().getType().getName(), f.doc()))
.collect(Collectors.toList());
} catch (Exception e) {
throw new HoodieHiveSyncException("Failed to get field schemas from storage : ", e);
}
} }
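getStorageFieldSchemas() above derives name/type/comment triples from the table's Avro schema rather than from Hive. A small sketch of that Avro traversal, using an inline schema in place of TableSchemaResolver#getTableAvroSchema(false):

import org.apache.avro.Schema;

import java.util.List;
import java.util.stream.Collectors;

public class AvroFieldSketch {
  public static void main(String[] args) {
    // Inline schema standing in for the resolved table schema.
    Schema schema = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"rec\",\"fields\":["
            + "{\"name\":\"ts\",\"type\":\"long\",\"doc\":\"event time\"},"
            + "{\"name\":\"symbol\",\"type\":\"string\"}]}");
    // Same traversal as above: field name, type name, doc (doc may be null).
    List<String> fields = schema.getFields().stream()
        .map(f -> f.name() + ":" + f.schema().getType().getName() + ":" + f.doc())
        .collect(Collectors.toList());
    System.out.println(fields); // [ts:long:event time, symbol:string:null]
  }
}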
@Override @Override
public void updateTableComments(String tableName, List<FieldSchema> oldSchema, Map<String,String> newComments) { public void updateTableComments(String tableName, List<FieldSchema> fromMetastore, List<FieldSchema> fromStorage) {
Map<String,String> oldComments = oldSchema.stream().collect(Collectors.toMap(fieldSchema -> fieldSchema.getName().toLowerCase(Locale.ROOT), Map<String, FieldSchema> metastoreMap = fromMetastore.stream().collect(Collectors.toMap(f -> f.getName().toLowerCase(Locale.ROOT), f -> f));
fieldSchema -> StringUtils.isNullOrEmpty(fieldSchema.getComment()) ? "" : fieldSchema.getComment())); Map<String, FieldSchema> storageMap = fromStorage.stream().collect(Collectors.toMap(f -> f.getName().toLowerCase(Locale.ROOT), f -> f));
Map<String,String> types = oldSchema.stream().collect(Collectors.toMap(FieldSchema::getName, FieldSchema::getType)); Map<String, Pair<String, String>> alterComments = new HashMap<>();
Map<String, ImmutablePair<String,String>> alterComments = new HashMap<>(); metastoreMap.forEach((name, metastoreFieldSchema) -> {
oldComments.forEach((name,comment) -> { if (storageMap.containsKey(name)) {
String newComment = newComments.getOrDefault(name,""); boolean updated = metastoreFieldSchema.updateComment(storageMap.get(name));
if (!newComment.equals(comment)) { if (updated) {
alterComments.put(name,new ImmutablePair<>(types.get(name),newComment)); alterComments.put(name, Pair.of(metastoreFieldSchema.getType(), metastoreFieldSchema.getCommentOrEmpty()));
}
} }
}); });
if (alterComments.size() > 0) { if (alterComments.isEmpty()) {
ddlExecutor.updateTableComments(tableName, alterComments);
} else {
LOG.info(String.format("No comment difference of %s ", tableName)); LOG.info(String.format("No comment difference of %s ", tableName));
} else {
ddlExecutor.updateTableComments(tableName, alterComments);
} }
} }
} }
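updateTableComments now takes both schema views, keys them by lower-cased field name, and copies storage comments onto the metastore fields, collecting only the fields whose comment actually changed. A self-contained sketch of that reconciliation, with a simplified FieldSchema whose updateComment semantics (copy-if-different, report whether anything changed) are an assumption inferred from the diff:

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.stream.Collectors;

public class CommentSyncSketch {

  // Simplified stand-in for org.apache.hudi.sync.common.model.FieldSchema.
  static class FieldSchema {
    final String name;
    final String type;
    String comment;
    FieldSchema(String name, String type, String comment) {
      this.name = name;
      this.type = type;
      this.comment = comment;
    }
    String getCommentOrEmpty() {
      return comment == null ? "" : comment;
    }
    // Assumed semantics: adopt the other field's comment, return true if changed.
    boolean updateComment(FieldSchema other) {
      if (getCommentOrEmpty().equals(other.getCommentOrEmpty())) {
        return false;
      }
      this.comment = other.getCommentOrEmpty();
      return true;
    }
  }

  public static void main(String[] args) {
    List<FieldSchema> fromMetastore = Arrays.asList(new FieldSchema("ts", "bigint", null));
    List<FieldSchema> fromStorage = Arrays.asList(new FieldSchema("ts", "long", "event time"));
    Map<String, FieldSchema> storageMap = fromStorage.stream()
        .collect(Collectors.toMap(f -> f.name.toLowerCase(Locale.ROOT), f -> f));
    Map<String, String> alterComments = new HashMap<>();
    for (FieldSchema f : fromMetastore) {
      FieldSchema s = storageMap.get(f.name.toLowerCase(Locale.ROOT));
      if (s != null && f.updateComment(s)) {
        alterComments.put(f.name, f.getCommentOrEmpty());
      }
    }
    System.out.println(alterComments); // {ts=event time}
  }
}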
@@ -9,17 +9,19 @@
* *
* http://www.apache.org/licenses/LICENSE-2.0 * http://www.apache.org/licenses/LICENSE-2.0
* *
* Unless required by applicable law or agreed to in writing, software * Unless required by applicable law or agreed to in writing,
* distributed under the License is distributed on an "AS IS" BASIS, * software distributed under the License is distributed on an
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* See the License for the specific language governing permissions and * KIND, either express or implied. See the License for the
* limitations under the License. * specific language governing permissions and limitations
* under the License.
*/ */
package org.apache.hudi.hive; package org.apache.hudi.hive;
import java.util.Collections; import java.util.Collections;
import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.sync.common.model.PartitionValueExtractor;
import java.util.Arrays; import java.util.Arrays;
import java.util.List; import java.util.List;
@@ -9,15 +9,18 @@
* *
* http://www.apache.org/licenses/LICENSE-2.0 * http://www.apache.org/licenses/LICENSE-2.0
* *
* Unless required by applicable law or agreed to in writing, software * Unless required by applicable law or agreed to in writing,
* distributed under the License is distributed on an "AS IS" BASIS, * software distributed under the License is distributed on an
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* See the License for the specific language governing permissions and * KIND, either express or implied. See the License for the
* limitations under the License. * specific language governing permissions and limitations
* under the License.
*/ */
package org.apache.hudi.hive; package org.apache.hudi.hive;
import org.apache.hudi.sync.common.model.PartitionValueExtractor;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
@@ -9,15 +9,18 @@
* *
* http://www.apache.org/licenses/LICENSE-2.0 * http://www.apache.org/licenses/LICENSE-2.0
* *
* Unless required by applicable law or agreed to in writing, software * Unless required by applicable law or agreed to in writing,
* distributed under the License is distributed on an "AS IS" BASIS, * software distributed under the License is distributed on an
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* See the License for the specific language governing permissions and * KIND, either express or implied. See the License for the
* limitations under the License. * specific language governing permissions and limitations
* under the License.
*/ */
package org.apache.hudi.hive; package org.apache.hudi.hive;
import org.apache.hudi.sync.common.model.PartitionValueExtractor;
import java.time.LocalDateTime; import java.time.LocalDateTime;
import java.time.ZoneId; import java.time.ZoneId;

@@ -9,15 +9,18 @@
* *
* http://www.apache.org/licenses/LICENSE-2.0 * http://www.apache.org/licenses/LICENSE-2.0
* *
* Unless required by applicable law or agreed to in writing, software * Unless required by applicable law or agreed to in writing,
* distributed under the License is distributed on an "AS IS" BASIS, * software distributed under the License is distributed on an
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* See the License for the specific language governing permissions and * KIND, either express or implied. See the License for the
* limitations under the License. * specific language governing permissions and limitations
* under the License.
*/ */
package org.apache.hudi.hive; package org.apache.hudi.hive;
import org.apache.hudi.sync.common.model.PartitionValueExtractor;
import java.time.LocalDateTime; import java.time.LocalDateTime;
import java.time.ZoneId; import java.time.ZoneId;
import java.time.ZonedDateTime; import java.time.ZonedDateTime;
@@ -18,7 +18,7 @@
package org.apache.hudi.hive.ddl; package org.apache.hudi.hive.ddl;
import org.apache.hudi.common.util.collection.ImmutablePair; import org.apache.hudi.common.util.collection.Pair;
import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.MessageType;
@@ -98,5 +98,5 @@ public interface DDLExecutor extends AutoCloseable {
* @param tableName * @param tableName
* @param newSchema Map key: field name, Map value: [field type, field comment] * @param newSchema Map key: field name, Map value: [field type, field comment]
*/ */
void updateTableComments(String tableName, Map<String, ImmutablePair<String, String>> newSchema); void updateTableComments(String tableName, Map<String, Pair<String, String>> newSchema);
} }
@@ -20,17 +20,15 @@ package org.apache.hudi.hive.ddl;
import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.fs.StorageSchemes; import org.apache.hudi.common.fs.StorageSchemes;
import org.apache.hudi.common.util.collection.ImmutablePair; import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.hive.HiveSyncConfig; import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.hudi.hive.HoodieHiveSyncException; import org.apache.hudi.hive.HoodieHiveSyncException;
import org.apache.hudi.hive.PartitionValueExtractor;
import org.apache.hudi.hive.util.HivePartitionUtil; import org.apache.hudi.hive.util.HivePartitionUtil;
import org.apache.hudi.hive.util.HiveSchemaUtil; import org.apache.hudi.hive.util.HiveSchemaUtil;
import org.apache.hudi.sync.common.model.PartitionValueExtractor;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.StatsSetupConst; import org.apache.hadoop.hive.common.StatsSetupConst;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.IMetaStoreClient; import org.apache.hadoop.hive.metastore.IMetaStoreClient;
import org.apache.hadoop.hive.metastore.TableType; import org.apache.hadoop.hive.metastore.TableType;
import org.apache.hadoop.hive.metastore.api.Database; import org.apache.hadoop.hive.metastore.api.Database;
@@ -55,26 +53,35 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_CREATE_MANAGED_TABLE;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS;
/** /**
* DDLExecutor impl based on HMS, which uses HMS APIs directly for all DDL tasks. * DDLExecutor impl based on HMS, which uses HMS APIs directly for all DDL tasks.
*/ */
public class HMSDDLExecutor implements DDLExecutor { public class HMSDDLExecutor implements DDLExecutor {
private static final Logger LOG = LogManager.getLogger(HMSDDLExecutor.class);
private final HiveSyncConfig syncConfig;
private final PartitionValueExtractor partitionValueExtractor;
private final FileSystem fs;
private final IMetaStoreClient client;
public HMSDDLExecutor(HiveConf conf, HiveSyncConfig syncConfig, FileSystem fs) throws HiveException, MetaException { private static final Logger LOG = LogManager.getLogger(HMSDDLExecutor.class);
this.client = Hive.get(conf).getMSC();
private final HiveSyncConfig syncConfig;
private final String databaseName;
private final IMetaStoreClient client;
private final PartitionValueExtractor partitionValueExtractor;
public HMSDDLExecutor(HiveSyncConfig syncConfig) throws HiveException, MetaException {
this.syncConfig = syncConfig; this.syncConfig = syncConfig;
this.fs = fs; this.databaseName = syncConfig.getStringOrDefault(META_SYNC_DATABASE_NAME);
this.client = Hive.get(syncConfig.getHiveConf()).getMSC();
try { try {
this.partitionValueExtractor = this.partitionValueExtractor =
(PartitionValueExtractor) Class.forName(syncConfig.partitionValueExtractorClass).newInstance(); (PartitionValueExtractor) Class.forName(syncConfig.getStringOrDefault(META_SYNC_PARTITION_EXTRACTOR_CLASS)).newInstance();
} catch (Exception e) { } catch (Exception e) {
throw new HoodieHiveSyncException( throw new HoodieHiveSyncException(
"Failed to initialize PartitionValueExtractor class " + syncConfig.partitionValueExtractorClass, e); "Failed to initialize PartitionValueExtractor class " + syncConfig.getStringOrDefault(META_SYNC_PARTITION_EXTRACTOR_CLASS), e);
} }
} }
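The constructor above instantiates the configured PartitionValueExtractor reflectively from its class name. A sketch of that pattern with a stand-in interface and implementation (the diff uses Class#newInstance; the non-deprecated constructor form is shown here):

import java.util.Arrays;
import java.util.List;

public class ExtractorSketch {

  // Stand-in for org.apache.hudi.sync.common.model.PartitionValueExtractor.
  public interface PartitionValueExtractor {
    List<String> extractPartitionValuesInPath(String partitionPath);
  }

  public static class SlashSplitExtractor implements PartitionValueExtractor {
    @Override
    public List<String> extractPartitionValuesInPath(String partitionPath) {
      return Arrays.asList(partitionPath.split("/"));
    }
  }

  public static void main(String[] args) throws Exception {
    // Normally read from META_SYNC_PARTITION_EXTRACTOR_CLASS in the config.
    String clazz = SlashSplitExtractor.class.getName();
    PartitionValueExtractor extractor = (PartitionValueExtractor)
        Class.forName(clazz).getDeclaredConstructor().newInstance();
    System.out.println(extractor.extractPartitionValuesInPath("2022/07/03"));
  }
}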
@@ -93,16 +100,16 @@ public class HMSDDLExecutor implements DDLExecutor {
public void createTable(String tableName, MessageType storageSchema, String inputFormatClass, String outputFormatClass, String serdeClass, Map<String, String> serdeProperties, public void createTable(String tableName, MessageType storageSchema, String inputFormatClass, String outputFormatClass, String serdeClass, Map<String, String> serdeProperties,
Map<String, String> tableProperties) { Map<String, String> tableProperties) {
try { try {
LinkedHashMap<String, String> mapSchema = HiveSchemaUtil.parquetSchemaToMapSchema(storageSchema, syncConfig.supportTimestamp, false); LinkedHashMap<String, String> mapSchema = HiveSchemaUtil.parquetSchemaToMapSchema(storageSchema, syncConfig.getBoolean(HIVE_SUPPORT_TIMESTAMP_TYPE), false);
List<FieldSchema> fieldSchema = HiveSchemaUtil.convertMapSchemaToHiveFieldSchema(mapSchema, syncConfig); List<FieldSchema> fieldSchema = HiveSchemaUtil.convertMapSchemaToHiveFieldSchema(mapSchema, syncConfig);
List<FieldSchema> partitionSchema = syncConfig.partitionFields.stream().map(partitionKey -> { List<FieldSchema> partitionSchema = syncConfig.getSplitStrings(META_SYNC_PARTITION_FIELDS).stream().map(partitionKey -> {
String partitionKeyType = HiveSchemaUtil.getPartitionKeyType(mapSchema, partitionKey); String partitionKeyType = HiveSchemaUtil.getPartitionKeyType(mapSchema, partitionKey);
return new FieldSchema(partitionKey, partitionKeyType.toLowerCase(), ""); return new FieldSchema(partitionKey, partitionKeyType.toLowerCase(), "");
}).collect(Collectors.toList()); }).collect(Collectors.toList());
Table newTb = new Table(); Table newTb = new Table();
newTb.setDbName(syncConfig.databaseName); newTb.setDbName(databaseName);
newTb.setTableName(tableName); newTb.setTableName(tableName);
newTb.setOwner(UserGroupInformation.getCurrentUser().getShortUserName()); newTb.setOwner(UserGroupInformation.getCurrentUser().getShortUserName());
newTb.setCreateTime((int) System.currentTimeMillis()); newTb.setCreateTime((int) System.currentTimeMillis());
@@ -110,13 +117,13 @@ public class HMSDDLExecutor implements DDLExecutor {
storageDescriptor.setCols(fieldSchema); storageDescriptor.setCols(fieldSchema);
storageDescriptor.setInputFormat(inputFormatClass); storageDescriptor.setInputFormat(inputFormatClass);
storageDescriptor.setOutputFormat(outputFormatClass); storageDescriptor.setOutputFormat(outputFormatClass);
storageDescriptor.setLocation(syncConfig.basePath); storageDescriptor.setLocation(syncConfig.getString(META_SYNC_BASE_PATH));
serdeProperties.put("serialization.format", "1"); serdeProperties.put("serialization.format", "1");
storageDescriptor.setSerdeInfo(new SerDeInfo(null, serdeClass, serdeProperties)); storageDescriptor.setSerdeInfo(new SerDeInfo(null, serdeClass, serdeProperties));
newTb.setSd(storageDescriptor); newTb.setSd(storageDescriptor);
newTb.setPartitionKeys(partitionSchema); newTb.setPartitionKeys(partitionSchema);
if (!syncConfig.createManagedTable) { if (!syncConfig.getBoolean(HIVE_CREATE_MANAGED_TABLE)) {
newTb.putToParameters("EXTERNAL", "TRUE"); newTb.putToParameters("EXTERNAL", "TRUE");
} }
@@ -134,9 +141,9 @@ public class HMSDDLExecutor implements DDLExecutor {
@Override @Override
public void updateTableDefinition(String tableName, MessageType newSchema) { public void updateTableDefinition(String tableName, MessageType newSchema) {
try { try {
boolean cascade = syncConfig.partitionFields.size() > 0; boolean cascade = syncConfig.getSplitStrings(META_SYNC_PARTITION_FIELDS).size() > 0;
List<FieldSchema> fieldSchema = HiveSchemaUtil.convertParquetSchemaToHiveFieldSchema(newSchema, syncConfig); List<FieldSchema> fieldSchema = HiveSchemaUtil.convertParquetSchemaToHiveFieldSchema(newSchema, syncConfig);
Table table = client.getTable(syncConfig.databaseName, tableName); Table table = client.getTable(databaseName, tableName);
StorageDescriptor sd = table.getSd(); StorageDescriptor sd = table.getSd();
sd.setCols(fieldSchema); sd.setCols(fieldSchema);
table.setSd(sd); table.setSd(sd);
@@ -145,7 +152,7 @@ public class HMSDDLExecutor implements DDLExecutor {
LOG.info("partition table,need cascade"); LOG.info("partition table,need cascade");
environmentContext.putToProperties(StatsSetupConst.CASCADE, StatsSetupConst.TRUE); environmentContext.putToProperties(StatsSetupConst.CASCADE, StatsSetupConst.TRUE);
} }
client.alter_table_with_environmentContext(syncConfig.databaseName, tableName, table, environmentContext); client.alter_table_with_environmentContext(databaseName, tableName, table, environmentContext);
} catch (Exception e) { } catch (Exception e) {
LOG.error("Failed to update table for " + tableName, e); LOG.error("Failed to update table for " + tableName, e);
throw new HoodieHiveSyncException("Failed to update table for " + tableName, e); throw new HoodieHiveSyncException("Failed to update table for " + tableName, e);
@@ -158,7 +165,7 @@ public class HMSDDLExecutor implements DDLExecutor {
// HiveMetastoreClient returns partition keys separate from Columns, hence get both and merge to // HiveMetastoreClient returns partition keys separate from Columns, hence get both and merge to
// get the Schema of the table. // get the Schema of the table.
final long start = System.currentTimeMillis(); final long start = System.currentTimeMillis();
Table table = this.client.getTable(syncConfig.databaseName, tableName); Table table = this.client.getTable(databaseName, tableName);
Map<String, String> partitionKeysMap = Map<String, String> partitionKeysMap =
table.getPartitionKeys().stream().collect(Collectors.toMap(FieldSchema::getName, f -> f.getType().toUpperCase())); table.getPartitionKeys().stream().collect(Collectors.toMap(FieldSchema::getName, f -> f.getType().toUpperCase()));
@@ -184,22 +191,22 @@ public class HMSDDLExecutor implements DDLExecutor {
} }
LOG.info("Adding partitions " + partitionsToAdd.size() + " to table " + tableName); LOG.info("Adding partitions " + partitionsToAdd.size() + " to table " + tableName);
try { try {
StorageDescriptor sd = client.getTable(syncConfig.databaseName, tableName).getSd(); StorageDescriptor sd = client.getTable(databaseName, tableName).getSd();
List<Partition> partitionList = partitionsToAdd.stream().map(partition -> { List<Partition> partitionList = partitionsToAdd.stream().map(partition -> {
StorageDescriptor partitionSd = new StorageDescriptor(); StorageDescriptor partitionSd = new StorageDescriptor();
partitionSd.setCols(sd.getCols()); partitionSd.setCols(sd.getCols());
partitionSd.setInputFormat(sd.getInputFormat()); partitionSd.setInputFormat(sd.getInputFormat());
partitionSd.setOutputFormat(sd.getOutputFormat()); partitionSd.setOutputFormat(sd.getOutputFormat());
partitionSd.setSerdeInfo(sd.getSerdeInfo()); partitionSd.setSerdeInfo(sd.getSerdeInfo());
String fullPartitionPath = FSUtils.getPartitionPath(syncConfig.basePath, partition).toString(); String fullPartitionPath = FSUtils.getPartitionPath(syncConfig.getString(META_SYNC_BASE_PATH), partition).toString();
List<String> partitionValues = partitionValueExtractor.extractPartitionValuesInPath(partition); List<String> partitionValues = partitionValueExtractor.extractPartitionValuesInPath(partition);
partitionSd.setLocation(fullPartitionPath); partitionSd.setLocation(fullPartitionPath);
return new Partition(partitionValues, syncConfig.databaseName, tableName, 0, 0, partitionSd, null); return new Partition(partitionValues, databaseName, tableName, 0, 0, partitionSd, null);
}).collect(Collectors.toList()); }).collect(Collectors.toList());
client.add_partitions(partitionList, true, false); client.add_partitions(partitionList, true, false);
} catch (TException e) { } catch (TException e) {
LOG.error(syncConfig.databaseName + "." + tableName + " add partition failed", e); LOG.error(databaseName + "." + tableName + " add partition failed", e);
throw new HoodieHiveSyncException(syncConfig.databaseName + "." + tableName + " add partition failed", e); throw new HoodieHiveSyncException(databaseName + "." + tableName + " add partition failed", e);
} }
} }
@@ -211,20 +218,20 @@ public class HMSDDLExecutor implements DDLExecutor {
} }
LOG.info("Changing partitions " + changedPartitions.size() + " on " + tableName); LOG.info("Changing partitions " + changedPartitions.size() + " on " + tableName);
try { try {
StorageDescriptor sd = client.getTable(syncConfig.databaseName, tableName).getSd(); StorageDescriptor sd = client.getTable(databaseName, tableName).getSd();
List<Partition> partitionList = changedPartitions.stream().map(partition -> { List<Partition> partitionList = changedPartitions.stream().map(partition -> {
Path partitionPath = FSUtils.getPartitionPath(syncConfig.basePath, partition); Path partitionPath = FSUtils.getPartitionPath(syncConfig.getString(META_SYNC_BASE_PATH), partition);
String partitionScheme = partitionPath.toUri().getScheme(); String partitionScheme = partitionPath.toUri().getScheme();
String fullPartitionPath = StorageSchemes.HDFS.getScheme().equals(partitionScheme) String fullPartitionPath = StorageSchemes.HDFS.getScheme().equals(partitionScheme)
? FSUtils.getDFSFullPartitionPath(fs, partitionPath) : partitionPath.toString(); ? FSUtils.getDFSFullPartitionPath(syncConfig.getHadoopFileSystem(), partitionPath) : partitionPath.toString();
List<String> partitionValues = partitionValueExtractor.extractPartitionValuesInPath(partition); List<String> partitionValues = partitionValueExtractor.extractPartitionValuesInPath(partition);
sd.setLocation(fullPartitionPath); sd.setLocation(fullPartitionPath);
return new Partition(partitionValues, syncConfig.databaseName, tableName, 0, 0, sd, null); return new Partition(partitionValues, databaseName, tableName, 0, 0, sd, null);
}).collect(Collectors.toList()); }).collect(Collectors.toList());
client.alter_partitions(syncConfig.databaseName, tableName, partitionList, null); client.alter_partitions(databaseName, tableName, partitionList, null);
} catch (TException e) { } catch (TException e) {
LOG.error(syncConfig.databaseName + "." + tableName + " update partition failed", e); LOG.error(databaseName + "." + tableName + " update partition failed", e);
throw new HoodieHiveSyncException(syncConfig.databaseName + "." + tableName + " update partition failed", e); throw new HoodieHiveSyncException(databaseName + "." + tableName + " update partition failed", e);
} }
} }
@@ -241,20 +248,20 @@ public class HMSDDLExecutor implements DDLExecutor {
if (HivePartitionUtil.partitionExists(client, tableName, dropPartition, partitionValueExtractor, syncConfig)) { if (HivePartitionUtil.partitionExists(client, tableName, dropPartition, partitionValueExtractor, syncConfig)) {
String partitionClause = String partitionClause =
HivePartitionUtil.getPartitionClauseForDrop(dropPartition, partitionValueExtractor, syncConfig); HivePartitionUtil.getPartitionClauseForDrop(dropPartition, partitionValueExtractor, syncConfig);
client.dropPartition(syncConfig.databaseName, tableName, partitionClause, false); client.dropPartition(databaseName, tableName, partitionClause, false);
} }
LOG.info("Drop partition " + dropPartition + " on " + tableName); LOG.info("Drop partition " + dropPartition + " on " + tableName);
} }
} catch (TException e) { } catch (TException e) {
LOG.error(syncConfig.databaseName + "." + tableName + " drop partition failed", e); LOG.error(databaseName + "." + tableName + " drop partition failed", e);
throw new HoodieHiveSyncException(syncConfig.databaseName + "." + tableName + " drop partition failed", e); throw new HoodieHiveSyncException(databaseName + "." + tableName + " drop partition failed", e);
} }
} }
@Override @Override
public void updateTableComments(String tableName, Map<String, ImmutablePair<String,String>> alterSchema) { public void updateTableComments(String tableName, Map<String, Pair<String, String>> alterSchema) {
try { try {
Table table = client.getTable(syncConfig.databaseName, tableName); Table table = client.getTable(databaseName, tableName);
StorageDescriptor sd = new StorageDescriptor(table.getSd()); StorageDescriptor sd = new StorageDescriptor(table.getSd());
for (FieldSchema fieldSchema : sd.getCols()) { for (FieldSchema fieldSchema : sd.getCols()) {
if (alterSchema.containsKey(fieldSchema.getName())) { if (alterSchema.containsKey(fieldSchema.getName())) {
@@ -264,7 +271,7 @@ public class HMSDDLExecutor implements DDLExecutor {
} }
table.setSd(sd); table.setSd(sd);
EnvironmentContext environmentContext = new EnvironmentContext(); EnvironmentContext environmentContext = new EnvironmentContext();
client.alter_table_with_environmentContext(syncConfig.databaseName, tableName, table, environmentContext); client.alter_table_with_environmentContext(databaseName, tableName, table, environmentContext);
sd.clear(); sd.clear();
} catch (Exception e) { } catch (Exception e) {
LOG.error("Failed to update table comments for " + tableName, e); LOG.error("Failed to update table comments for " + tableName, e);
@@ -21,9 +21,8 @@ package org.apache.hudi.hive.ddl;
import org.apache.hudi.common.util.HoodieTimer; import org.apache.hudi.common.util.HoodieTimer;
import org.apache.hudi.hive.HiveSyncConfig; import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.hudi.hive.HoodieHiveSyncException; import org.apache.hudi.hive.HoodieHiveSyncException;
import org.apache.hudi.hive.util.HivePartitionUtil;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.IMetaStoreClient; import org.apache.hadoop.hive.metastore.IMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.FieldSchema; import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.MetaException; import org.apache.hadoop.hive.metastore.api.MetaException;
@@ -34,7 +33,6 @@ import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse; import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse;
import org.apache.hadoop.hive.ql.session.SessionState; import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hudi.hive.util.HivePartitionUtil;
import org.apache.log4j.LogManager; import org.apache.log4j.LogManager;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
@@ -46,26 +44,28 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import static org.apache.hudi.sync.common.util.TableUtils.tableId;
/** /**
* This class offers a DDL executor backed by the hive.ql Driver. This class preserves the old useJDBC = false way of doing things. * This class offers a DDL executor backed by the hive.ql Driver. This class preserves the old useJDBC = false way of doing things.
*/ */
public class HiveQueryDDLExecutor extends QueryBasedDDLExecutor { public class HiveQueryDDLExecutor extends QueryBasedDDLExecutor {
private static final Logger LOG = LogManager.getLogger(HiveQueryDDLExecutor.class);
private final HiveSyncConfig config;
private final IMetaStoreClient metaStoreClient;
private SessionState sessionState = null;
private Driver hiveDriver = null;
public HiveQueryDDLExecutor(HiveSyncConfig config, FileSystem fs, HiveConf configuration) throws HiveException, MetaException { private static final Logger LOG = LogManager.getLogger(HiveQueryDDLExecutor.class);
super(config, fs);
this.config = config; private final IMetaStoreClient metaStoreClient;
this.metaStoreClient = Hive.get(configuration).getMSC(); private SessionState sessionState;
private Driver hiveDriver;
public HiveQueryDDLExecutor(HiveSyncConfig config) throws HiveException, MetaException {
super(config);
this.metaStoreClient = Hive.get(config.getHiveConf()).getMSC();
try { try {
this.sessionState = new SessionState(configuration, this.sessionState = new SessionState(config.getHiveConf(),
UserGroupInformation.getCurrentUser().getShortUserName()); UserGroupInformation.getCurrentUser().getShortUserName());
SessionState.start(this.sessionState); SessionState.start(this.sessionState);
this.sessionState.setCurrentDatabase(config.databaseName); this.sessionState.setCurrentDatabase(databaseName);
hiveDriver = new org.apache.hadoop.hive.ql.Driver(configuration); this.hiveDriver = new org.apache.hadoop.hive.ql.Driver(config.getHiveConf());
} catch (Exception e) { } catch (Exception e) {
if (sessionState != null) { if (sessionState != null) {
try { try {
@@ -109,7 +109,7 @@ public class HiveQueryDDLExecutor extends QueryBasedDDLExecutor {
// HiveMetastoreClient returns partition keys separate from Columns, hence get both and merge to // HiveMetastoreClient returns partition keys separate from Columns, hence get both and merge to
// get the Schema of the table. // get the Schema of the table.
final long start = System.currentTimeMillis(); final long start = System.currentTimeMillis();
Table table = metaStoreClient.getTable(config.databaseName, tableName); Table table = metaStoreClient.getTable(databaseName, tableName);
Map<String, String> partitionKeysMap = Map<String, String> partitionKeysMap =
table.getPartitionKeys().stream().collect(Collectors.toMap(FieldSchema::getName, f -> f.getType().toUpperCase())); table.getPartitionKeys().stream().collect(Collectors.toMap(FieldSchema::getName, f -> f.getType().toUpperCase()));
@@ -141,13 +141,13 @@ public class HiveQueryDDLExecutor extends QueryBasedDDLExecutor {
config)) { config)) {
String partitionClause = String partitionClause =
HivePartitionUtil.getPartitionClauseForDrop(dropPartition, partitionValueExtractor, config); HivePartitionUtil.getPartitionClauseForDrop(dropPartition, partitionValueExtractor, config);
metaStoreClient.dropPartition(config.databaseName, tableName, partitionClause, false); metaStoreClient.dropPartition(databaseName, tableName, partitionClause, false);
} }
LOG.info("Drop partition " + dropPartition + " on " + tableName); LOG.info("Drop partition " + dropPartition + " on " + tableName);
} }
} catch (Exception e) { } catch (Exception e) {
LOG.error(config.databaseName + "." + tableName + " drop partition failed", e); LOG.error(tableId(databaseName, tableName) + " drop partition failed", e);
throw new HoodieHiveSyncException(config.databaseName + "." + tableName + " drop partition failed", e); throw new HoodieHiveSyncException(tableId(databaseName, tableName) + " drop partition failed", e);
} }
} }
@@ -18,12 +18,9 @@
package org.apache.hudi.hive.ddl; package org.apache.hudi.hive.ddl;
import static org.apache.hudi.hive.util.HiveSchemaUtil.HIVE_ESCAPE_CHARACTER;
import org.apache.hudi.hive.HiveSyncConfig; import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.hudi.hive.HoodieHiveSyncException; import org.apache.hudi.hive.HoodieHiveSyncException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.LogManager; import org.apache.log4j.LogManager;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
@@ -39,21 +36,27 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Objects; import java.util.Objects;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_BATCH_SYNC_PARTITION_NUM;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_PASS;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_URL;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USER;
import static org.apache.hudi.hive.util.HiveSchemaUtil.HIVE_ESCAPE_CHARACTER;
/** /**
* This class offers a DDL executor backed by JDBC. This class preserves the old useJDBC = true way of doing things. * This class offers a DDL executor backed by JDBC. This class preserves the old useJDBC = true way of doing things.
*/ */
public class JDBCExecutor extends QueryBasedDDLExecutor { public class JDBCExecutor extends QueryBasedDDLExecutor {
private static final Logger LOG = LogManager.getLogger(QueryBasedDDLExecutor.class); private static final Logger LOG = LogManager.getLogger(QueryBasedDDLExecutor.class);
private final HiveSyncConfig config;
private Connection connection; private Connection connection;
public JDBCExecutor(HiveSyncConfig config, FileSystem fs) { public JDBCExecutor(HiveSyncConfig config) {
super(config, fs); super(config);
Objects.requireNonNull(config.jdbcUrl, "--jdbc-url option is required for jdbc sync mode"); Objects.requireNonNull(config.getStringOrDefault(HIVE_URL), "--jdbc-url option is required for jdbc sync mode");
Objects.requireNonNull(config.hiveUser, "--user option is required for jdbc sync mode"); Objects.requireNonNull(config.getStringOrDefault(HIVE_USER), "--user option is required for jdbc sync mode");
Objects.requireNonNull(config.hivePass, "--pass option is required for jdbc sync mode"); Objects.requireNonNull(config.getStringOrDefault(HIVE_PASS), "--pass option is required for jdbc sync mode");
this.config = config; createHiveConnection(config.getStringOrDefault(HIVE_URL), config.getStringOrDefault(HIVE_USER), config.getStringOrDefault(HIVE_PASS));
createHiveConnection(config.jdbcUrl, config.hiveUser, config.hivePass);
} }
@Override @Override
@@ -126,7 +129,7 @@ public class JDBCExecutor extends QueryBasedDDLExecutor {
ResultSet result = null; ResultSet result = null;
try { try {
DatabaseMetaData databaseMetaData = connection.getMetaData(); DatabaseMetaData databaseMetaData = connection.getMetaData();
result = databaseMetaData.getColumns(null, config.databaseName, tableName, null); result = databaseMetaData.getColumns(null, databaseName, tableName, null);
while (result.next()) { while (result.next()) {
String columnName = result.getString(4); String columnName = result.getString(4);
String columnType = result.getString(6); String columnType = result.getString(6);
@@ -157,11 +160,11 @@ public class JDBCExecutor extends QueryBasedDDLExecutor {
} }
private List<String> constructDropPartitions(String tableName, List<String> partitions) { private List<String> constructDropPartitions(String tableName, List<String> partitions) {
if (config.batchSyncNum <= 0) { if (config.getIntOrDefault(HIVE_BATCH_SYNC_PARTITION_NUM) <= 0) {
throw new HoodieHiveSyncException("batch-sync-num for sync hive table must be greater than 0, please check your parameter"); throw new HoodieHiveSyncException("batch-sync-num for sync hive table must be greater than 0, please check your parameter");
} }
List<String> result = new ArrayList<>(); List<String> result = new ArrayList<>();
int batchSyncPartitionNum = config.batchSyncNum; int batchSyncPartitionNum = config.getIntOrDefault(HIVE_BATCH_SYNC_PARTITION_NUM);
StringBuilder alterSQL = getAlterTableDropPrefix(tableName); StringBuilder alterSQL = getAlterTableDropPrefix(tableName);
for (int i = 0; i < partitions.size(); i++) { for (int i = 0; i < partitions.size(); i++) {
@@ -186,7 +189,7 @@ public class JDBCExecutor extends QueryBasedDDLExecutor {
public StringBuilder getAlterTableDropPrefix(String tableName) { public StringBuilder getAlterTableDropPrefix(String tableName) {
StringBuilder alterSQL = new StringBuilder("ALTER TABLE "); StringBuilder alterSQL = new StringBuilder("ALTER TABLE ");
alterSQL.append(HIVE_ESCAPE_CHARACTER).append(config.databaseName) alterSQL.append(HIVE_ESCAPE_CHARACTER).append(databaseName)
.append(HIVE_ESCAPE_CHARACTER).append(".").append(HIVE_ESCAPE_CHARACTER) .append(HIVE_ESCAPE_CHARACTER).append(".").append(HIVE_ESCAPE_CHARACTER)
.append(tableName).append(HIVE_ESCAPE_CHARACTER).append(" DROP IF EXISTS "); .append(tableName).append(HIVE_ESCAPE_CHARACTER).append(" DROP IF EXISTS ");
return alterSQL; return alterSQL;
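Taken together, constructDropPartitions and getAlterTableDropPrefix batch the drops: a fresh ALTER statement begins every HIVE_BATCH_SYNC_PARTITION_NUM partitions, keeping each statement bounded. A batch produces SQL roughly of the form (table name, partition values, and the exact separator are illustrative):

    ALTER TABLE `default`.`stock_ticks` DROP IF EXISTS PARTITION (`datestr`='2022-06-21'), PARTITION (`datestr`='2022-06-22')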


@@ -22,13 +22,12 @@ import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.fs.StorageSchemes; import org.apache.hudi.common.fs.StorageSchemes;
import org.apache.hudi.common.util.PartitionPathEncodeUtils; import org.apache.hudi.common.util.PartitionPathEncodeUtils;
import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.common.util.collection.ImmutablePair; import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.hive.HiveSyncConfig; import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.hudi.hive.HoodieHiveSyncException; import org.apache.hudi.hive.HoodieHiveSyncException;
import org.apache.hudi.hive.PartitionValueExtractor;
import org.apache.hudi.hive.util.HiveSchemaUtil; import org.apache.hudi.hive.util.HiveSchemaUtil;
import org.apache.hudi.sync.common.model.PartitionValueExtractor;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager; import org.apache.log4j.LogManager;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
@@ -39,26 +38,35 @@ import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_BATCH_SYNC_PARTITION_NUM;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE;
import static org.apache.hudi.hive.util.HiveSchemaUtil.HIVE_ESCAPE_CHARACTER; import static org.apache.hudi.hive.util.HiveSchemaUtil.HIVE_ESCAPE_CHARACTER;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DECODE_PARTITION;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS;
/** /**
* This class adds functionality for all query based DDLExecutors. The classes extending it only have to provide runSQL(sql) functions. * This class adds functionality for all query based DDLExecutors. The classes extending it only have to provide runSQL(sql) functions.
*/ */
public abstract class QueryBasedDDLExecutor implements DDLExecutor { public abstract class QueryBasedDDLExecutor implements DDLExecutor {
private static final Logger LOG = LogManager.getLogger(QueryBasedDDLExecutor.class);
private final HiveSyncConfig config;
public final PartitionValueExtractor partitionValueExtractor;
private final FileSystem fs;
public QueryBasedDDLExecutor(HiveSyncConfig config, FileSystem fs) { private static final Logger LOG = LogManager.getLogger(QueryBasedDDLExecutor.class);
this.fs = fs;
protected final HiveSyncConfig config;
protected final String databaseName;
protected final PartitionValueExtractor partitionValueExtractor;
public QueryBasedDDLExecutor(HiveSyncConfig config) {
this.config = config; this.config = config;
this.databaseName = config.getStringOrDefault(META_SYNC_DATABASE_NAME);
try { try {
this.partitionValueExtractor = this.partitionValueExtractor =
(PartitionValueExtractor) Class.forName(config.partitionValueExtractorClass).newInstance(); (PartitionValueExtractor) Class.forName(config.getStringOrDefault(META_SYNC_PARTITION_EXTRACTOR_CLASS)).newInstance();
} catch (Exception e) { } catch (Exception e) {
throw new HoodieHiveSyncException( throw new HoodieHiveSyncException(
"Failed to initialize PartitionValueExtractor class " + config.partitionValueExtractorClass, e); "Failed to initialize PartitionValueExtractor class " + config.getStringOrDefault(META_SYNC_PARTITION_EXTRACTOR_CLASS), e);
} }
} }
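Because the extractor is instantiated reflectively from META_SYNC_PARTITION_EXTRACTOR_CLASS, an implementation needs a public no-arg constructor. A minimal custom extractor might look like this (a sketch against the single-method interface used throughout this diff; the class itself is made up):

    import org.apache.hudi.sync.common.model.PartitionValueExtractor;
    import java.util.Collections;
    import java.util.List;

    public class SlashToDashPartitionValueExtractor implements PartitionValueExtractor {
      @Override
      public List<String> extractPartitionValuesInPath(String partitionPath) {
        // Turn a storage path like "2022/06/22" into the single value "2022-06-22".
        return Collections.singletonList(partitionPath.replace('/', '-'));
      }
    }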
@@ -90,11 +98,11 @@ public abstract class QueryBasedDDLExecutor implements DDLExecutor {
@Override @Override
public void updateTableDefinition(String tableName, MessageType newSchema) { public void updateTableDefinition(String tableName, MessageType newSchema) {
try { try {
String newSchemaStr = HiveSchemaUtil.generateSchemaString(newSchema, config.partitionFields, config.supportTimestamp); String newSchemaStr = HiveSchemaUtil.generateSchemaString(newSchema, config.getSplitStrings(META_SYNC_PARTITION_FIELDS), config.getBoolean(HIVE_SUPPORT_TIMESTAMP_TYPE));
// Cascade clause should not be present for non-partitioned tables // Cascade clause should not be present for non-partitioned tables
String cascadeClause = config.partitionFields.size() > 0 ? " cascade" : ""; String cascadeClause = config.getSplitStrings(META_SYNC_PARTITION_FIELDS).size() > 0 ? " cascade" : "";
StringBuilder sqlBuilder = new StringBuilder("ALTER TABLE ").append(HIVE_ESCAPE_CHARACTER) StringBuilder sqlBuilder = new StringBuilder("ALTER TABLE ").append(HIVE_ESCAPE_CHARACTER)
.append(config.databaseName).append(HIVE_ESCAPE_CHARACTER).append(".") .append(databaseName).append(HIVE_ESCAPE_CHARACTER).append(".")
.append(HIVE_ESCAPE_CHARACTER).append(tableName) .append(HIVE_ESCAPE_CHARACTER).append(tableName)
.append(HIVE_ESCAPE_CHARACTER).append(" REPLACE COLUMNS(") .append(HIVE_ESCAPE_CHARACTER).append(" REPLACE COLUMNS(")
.append(newSchemaStr).append(" )").append(cascadeClause); .append(newSchemaStr).append(" )").append(cascadeClause);
@@ -130,15 +138,15 @@ public abstract class QueryBasedDDLExecutor implements DDLExecutor {
} }
@Override @Override
public void updateTableComments(String tableName, Map<String, ImmutablePair<String,String>> newSchema) { public void updateTableComments(String tableName, Map<String, Pair<String, String>> newSchema) {
for (Map.Entry<String, ImmutablePair<String,String>> field : newSchema.entrySet()) { for (Map.Entry<String, Pair<String,String>> field : newSchema.entrySet()) {
String name = field.getKey(); String name = field.getKey();
StringBuilder sql = new StringBuilder(); StringBuilder sql = new StringBuilder();
String type = field.getValue().getLeft(); String type = field.getValue().getLeft();
String comment = field.getValue().getRight(); String comment = field.getValue().getRight();
comment = comment.replace("'",""); comment = comment.replace("'","");
sql.append("ALTER TABLE ").append(HIVE_ESCAPE_CHARACTER) sql.append("ALTER TABLE ").append(HIVE_ESCAPE_CHARACTER)
.append(config.databaseName).append(HIVE_ESCAPE_CHARACTER).append(".") .append(databaseName).append(HIVE_ESCAPE_CHARACTER).append(".")
.append(HIVE_ESCAPE_CHARACTER).append(tableName) .append(HIVE_ESCAPE_CHARACTER).append(tableName)
.append(HIVE_ESCAPE_CHARACTER) .append(HIVE_ESCAPE_CHARACTER)
.append(" CHANGE COLUMN `").append(name).append("` `").append(name) .append(" CHANGE COLUMN `").append(name).append("` `").append(name)
@@ -148,15 +156,15 @@ public abstract class QueryBasedDDLExecutor implements DDLExecutor {
} }
private List<String> constructAddPartitions(String tableName, List<String> partitions) { private List<String> constructAddPartitions(String tableName, List<String> partitions) {
if (config.batchSyncNum <= 0) { if (config.getIntOrDefault(HIVE_BATCH_SYNC_PARTITION_NUM) <= 0) {
throw new HoodieHiveSyncException("batch-sync-num for sync hive table must be greater than 0, please check your parameter"); throw new HoodieHiveSyncException("batch-sync-num for sync hive table must be greater than 0, please check your parameter");
} }
List<String> result = new ArrayList<>(); List<String> result = new ArrayList<>();
int batchSyncPartitionNum = config.batchSyncNum; int batchSyncPartitionNum = config.getIntOrDefault(HIVE_BATCH_SYNC_PARTITION_NUM);
StringBuilder alterSQL = getAlterTablePrefix(tableName); StringBuilder alterSQL = getAlterTablePrefix(tableName);
for (int i = 0; i < partitions.size(); i++) { for (int i = 0; i < partitions.size(); i++) {
String partitionClause = getPartitionClause(partitions.get(i)); String partitionClause = getPartitionClause(partitions.get(i));
String fullPartitionPath = FSUtils.getPartitionPath(config.basePath, partitions.get(i)).toString(); String fullPartitionPath = FSUtils.getPartitionPath(config.getString(META_SYNC_BASE_PATH), partitions.get(i)).toString();
alterSQL.append(" PARTITION (").append(partitionClause).append(") LOCATION '").append(fullPartitionPath) alterSQL.append(" PARTITION (").append(partitionClause).append(") LOCATION '").append(fullPartitionPath)
.append("' "); .append("' ");
if ((i + 1) % batchSyncPartitionNum == 0) { if ((i + 1) % batchSyncPartitionNum == 0) {
@@ -173,7 +181,7 @@ public abstract class QueryBasedDDLExecutor implements DDLExecutor {
private StringBuilder getAlterTablePrefix(String tableName) { private StringBuilder getAlterTablePrefix(String tableName) {
StringBuilder alterSQL = new StringBuilder("ALTER TABLE "); StringBuilder alterSQL = new StringBuilder("ALTER TABLE ");
alterSQL.append(HIVE_ESCAPE_CHARACTER).append(config.databaseName) alterSQL.append(HIVE_ESCAPE_CHARACTER).append(databaseName)
.append(HIVE_ESCAPE_CHARACTER).append(".").append(HIVE_ESCAPE_CHARACTER) .append(HIVE_ESCAPE_CHARACTER).append(".").append(HIVE_ESCAPE_CHARACTER)
.append(tableName).append(HIVE_ESCAPE_CHARACTER).append(" ADD IF NOT EXISTS "); .append(tableName).append(HIVE_ESCAPE_CHARACTER).append(" ADD IF NOT EXISTS ");
return alterSQL; return alterSQL;
@@ -181,18 +189,18 @@ public abstract class QueryBasedDDLExecutor implements DDLExecutor {
public String getPartitionClause(String partition) { public String getPartitionClause(String partition) {
List<String> partitionValues = partitionValueExtractor.extractPartitionValuesInPath(partition); List<String> partitionValues = partitionValueExtractor.extractPartitionValuesInPath(partition);
ValidationUtils.checkArgument(config.partitionFields.size() == partitionValues.size(), ValidationUtils.checkArgument(config.getSplitStrings(META_SYNC_PARTITION_FIELDS).size() == partitionValues.size(),
"Partition key parts " + config.partitionFields + " does not match with partition values " + partitionValues "Partition key parts " + config.getSplitStrings(META_SYNC_PARTITION_FIELDS) + " does not match with partition values " + partitionValues
+ ". Check partition strategy. "); + ". Check partition strategy. ");
List<String> partBuilder = new ArrayList<>(); List<String> partBuilder = new ArrayList<>();
for (int i = 0; i < config.partitionFields.size(); i++) { for (int i = 0; i < config.getSplitStrings(META_SYNC_PARTITION_FIELDS).size(); i++) {
String partitionValue = partitionValues.get(i); String partitionValue = partitionValues.get(i);
// decode the partition before sync to hive to prevent multiple escapes of HIVE // decode the partition before sync to hive to prevent multiple escapes of HIVE
if (config.decodePartition) { if (config.getBoolean(META_SYNC_DECODE_PARTITION)) {
// This is a decode operator for encode in KeyGenUtils#getRecordPartitionPath // This is a decode operator for encode in KeyGenUtils#getRecordPartitionPath
partitionValue = PartitionPathEncodeUtils.unescapePathName(partitionValue); partitionValue = PartitionPathEncodeUtils.unescapePathName(partitionValue);
} }
partBuilder.add("`" + config.partitionFields.get(i) + "`='" + partitionValue + "'"); partBuilder.add("`" + config.getSplitStrings(META_SYNC_PARTITION_FIELDS).get(i) + "`='" + partitionValue + "'");
} }
return String.join(",", partBuilder); return String.join(",", partBuilder);
} }
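For instance, with META_SYNC_PARTITION_FIELDS set to year,month,day and extracted values 2022, 06, 22 (illustrative), getPartitionClause returns:

    `year`='2022',`month`='06',`day`='22'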
@@ -200,15 +208,15 @@ public abstract class QueryBasedDDLExecutor implements DDLExecutor {
private List<String> constructChangePartitions(String tableName, List<String> partitions) { private List<String> constructChangePartitions(String tableName, List<String> partitions) {
List<String> changePartitions = new ArrayList<>(); List<String> changePartitions = new ArrayList<>();
// Hive 2.x doesn't like db.table name for operations, hence we need to change to using the database first // Hive 2.x doesn't like db.table name for operations, hence we need to change to using the database first
String useDatabase = "USE " + HIVE_ESCAPE_CHARACTER + config.databaseName + HIVE_ESCAPE_CHARACTER; String useDatabase = "USE " + HIVE_ESCAPE_CHARACTER + databaseName + HIVE_ESCAPE_CHARACTER;
changePartitions.add(useDatabase); changePartitions.add(useDatabase);
String alterTable = "ALTER TABLE " + HIVE_ESCAPE_CHARACTER + tableName + HIVE_ESCAPE_CHARACTER; String alterTable = "ALTER TABLE " + HIVE_ESCAPE_CHARACTER + tableName + HIVE_ESCAPE_CHARACTER;
for (String partition : partitions) { for (String partition : partitions) {
String partitionClause = getPartitionClause(partition); String partitionClause = getPartitionClause(partition);
Path partitionPath = FSUtils.getPartitionPath(config.basePath, partition); Path partitionPath = FSUtils.getPartitionPath(config.getString(META_SYNC_BASE_PATH), partition);
String partitionScheme = partitionPath.toUri().getScheme(); String partitionScheme = partitionPath.toUri().getScheme();
String fullPartitionPath = StorageSchemes.HDFS.getScheme().equals(partitionScheme) String fullPartitionPath = StorageSchemes.HDFS.getScheme().equals(partitionScheme)
? FSUtils.getDFSFullPartitionPath(fs, partitionPath) : partitionPath.toString(); ? FSUtils.getDFSFullPartitionPath(config.getHadoopFileSystem(), partitionPath) : partitionPath.toString();
String changePartition = String changePartition =
alterTable + " PARTITION (" + partitionClause + ") SET LOCATION '" + fullPartitionPath + "'"; alterTable + " PARTITION (" + partitionClause + ") SET LOCATION '" + fullPartitionPath + "'";
changePartitions.add(changePartition); changePartitions.add(changePartition);
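Given the Hive 2.x caveat noted above, the method emits a single USE statement followed by one ALTER per partition, roughly (names and location are illustrative):

    USE `default`
    ALTER TABLE `stock_ticks` PARTITION (`datestr`='2022-06-22') SET LOCATION 'hdfs://namenode:8020/warehouse/stock_ticks/2022-06-22'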


@@ -18,46 +18,44 @@
package org.apache.hudi.hive.replication; package org.apache.hudi.hive.replication;
import org.apache.hudi.common.config.ConfigProperty;
import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.hive.HiveSyncConfig; import org.apache.hudi.hive.HiveSyncConfig;
import com.beust.jcommander.Parameter; import com.beust.jcommander.Parameter;
import com.beust.jcommander.ParametersDelegate;
import org.apache.hadoop.conf.Configuration;
import java.util.Properties;
public class GlobalHiveSyncConfig extends HiveSyncConfig { public class GlobalHiveSyncConfig extends HiveSyncConfig {
public static final ConfigProperty<String> META_SYNC_GLOBAL_REPLICATE_TIMESTAMP = ConfigProperty
.key("hoodie.meta_sync.global.replicate.timestamp")
.noDefaultValue()
.withDocumentation("");
public GlobalHiveSyncConfig(Properties props, Configuration hadoopConf) {
super(props, hadoopConf);
}
public static class GlobalHiveSyncConfigParams {
@ParametersDelegate()
public final HiveSyncConfigParams hiveSyncConfigParams = new HiveSyncConfigParams();
@Parameter(names = {"--replicated-timestamp"}, description = "Add globally replicated timestamp to enable consistent reads across clusters") @Parameter(names = {"--replicated-timestamp"}, description = "Add globally replicated timestamp to enable consistent reads across clusters")
public String globallyReplicatedTimeStamp; public String globallyReplicatedTimeStamp;
public GlobalHiveSyncConfig() { public boolean isHelp() {
return hiveSyncConfigParams.isHelp();
} }
public GlobalHiveSyncConfig(TypedProperties props) { public TypedProperties toProps() {
super(props); final TypedProperties props = hiveSyncConfigParams.toProps();
props.setPropertyIfNonNull(META_SYNC_GLOBAL_REPLICATE_TIMESTAMP.key(), globallyReplicatedTimeStamp);
return props;
} }
public static GlobalHiveSyncConfig copy(GlobalHiveSyncConfig cfg) {
GlobalHiveSyncConfig newConfig = new GlobalHiveSyncConfig(cfg.getProps());
newConfig.basePath = cfg.basePath;
newConfig.assumeDatePartitioning = cfg.assumeDatePartitioning;
newConfig.databaseName = cfg.databaseName;
newConfig.hivePass = cfg.hivePass;
newConfig.hiveUser = cfg.hiveUser;
newConfig.partitionFields = cfg.partitionFields;
newConfig.partitionValueExtractorClass = cfg.partitionValueExtractorClass;
newConfig.jdbcUrl = cfg.jdbcUrl;
newConfig.tableName = cfg.tableName;
newConfig.usePreApacheInputFormat = cfg.usePreApacheInputFormat;
newConfig.useFileListingFromMetadata = cfg.useFileListingFromMetadata;
newConfig.supportTimestamp = cfg.supportTimestamp;
newConfig.decodePartition = cfg.decodePartition;
newConfig.batchSyncNum = cfg.batchSyncNum;
newConfig.globallyReplicatedTimeStamp = cfg.globallyReplicatedTimeStamp;
return newConfig;
}
@Override
public String toString() {
return "GlobalHiveSyncConfig{" + super.toString()
+ " globallyReplicatedTimeStamp=" + globallyReplicatedTimeStamp + "}";
} }
} }
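GlobalHiveSyncConfigParams is now a plain JCommander delegate that round-trips into typed properties. A usage sketch (the timestamp value is a placeholder):

    GlobalHiveSyncConfig.GlobalHiveSyncConfigParams params = new GlobalHiveSyncConfig.GlobalHiveSyncConfigParams();
    new JCommander(params).parse("--replicated-timestamp", "20220622000000");
    TypedProperties props = params.toProps(); // now carries hoodie.meta_sync.global.replicate.timestamp
    GlobalHiveSyncConfig config = new GlobalHiveSyncConfig(props, new Configuration());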


@@ -18,26 +18,28 @@
package org.apache.hudi.hive.replication; package org.apache.hudi.hive.replication;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.Option;
import org.apache.hudi.hive.HiveSyncTool; import org.apache.hudi.hive.HiveSyncTool;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.log4j.LogManager; import org.apache.log4j.LogManager;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
import java.util.Properties;
import static org.apache.hudi.hive.replication.GlobalHiveSyncConfig.META_SYNC_GLOBAL_REPLICATE_TIMESTAMP;
public class GlobalHiveSyncTool extends HiveSyncTool { public class GlobalHiveSyncTool extends HiveSyncTool {
private static final Logger LOG = LogManager.getLogger(HiveSyncTool.class); private static final Logger LOG = LogManager.getLogger(GlobalHiveSyncTool.class);
protected final GlobalHiveSyncConfig config;
public GlobalHiveSyncTool(GlobalHiveSyncConfig cfg, HiveConf configuration, FileSystem fs) { public GlobalHiveSyncTool(Properties props, Configuration hadoopConf) {
super(cfg, configuration, fs); super(props, hadoopConf);
this.config = new GlobalHiveSyncConfig(props, hadoopConf);
} }
@Override @Override
@@ -48,19 +50,21 @@ public class GlobalHiveSyncTool extends HiveSyncTool {
@Override @Override
protected void syncHoodieTable(String tableName, boolean useRealtimeInputFormat, boolean readAsOptimized) { protected void syncHoodieTable(String tableName, boolean useRealtimeInputFormat, boolean readAsOptimized) {
super.syncHoodieTable(tableName, useRealtimeInputFormat, readAsOptimized); super.syncHoodieTable(tableName, useRealtimeInputFormat, readAsOptimized);
if (((GlobalHiveSyncConfig) hiveSyncConfig).globallyReplicatedTimeStamp != null) { Option<String> timestamp = Option.ofNullable(config.getString(META_SYNC_GLOBAL_REPLICATE_TIMESTAMP));
hoodieHiveClient.updateLastReplicatedTimeStamp(tableName, if (timestamp.isPresent()) {
((GlobalHiveSyncConfig) hiveSyncConfig).globallyReplicatedTimeStamp); syncClient.updateLastReplicatedTimeStamp(tableName, timestamp.get());
}
LOG.info("Sync complete for " + tableName); LOG.info("Sync complete for " + tableName);
} else {
LOG.warn("Sync skipped: " + META_SYNC_GLOBAL_REPLICATE_TIMESTAMP.key() + " is not set.");
}
} }
public Map<String, Option<String>> getLastReplicatedTimeStampMap() { public Map<String, Option<String>> getLastReplicatedTimeStampMap() {
Map<String, Option<String>> timeStampMap = new HashMap<>(); Map<String, Option<String>> timeStampMap = new HashMap<>();
Option<String> timeStamp = hoodieHiveClient.getLastReplicatedTime(snapshotTableName); Option<String> timeStamp = syncClient.getLastReplicatedTime(snapshotTableName);
timeStampMap.put(snapshotTableName, timeStamp); timeStampMap.put(snapshotTableName, timeStamp);
if (HoodieTableType.MERGE_ON_READ.equals(hoodieHiveClient.getTableType())) { if (HoodieTableType.MERGE_ON_READ.equals(syncClient.getTableType())) {
Option<String> roTimeStamp = hoodieHiveClient.getLastReplicatedTime(roTableName.get()); Option<String> roTimeStamp = syncClient.getLastReplicatedTime(roTableName.get());
timeStampMap.put(roTableName.get(), roTimeStamp); timeStampMap.put(roTableName.get(), roTimeStamp);
} }
return timeStampMap; return timeStampMap;
@@ -70,18 +74,12 @@ public class GlobalHiveSyncTool extends HiveSyncTool {
for (String tableName : timeStampMap.keySet()) { for (String tableName : timeStampMap.keySet()) {
Option<String> timestamp = timeStampMap.get(tableName); Option<String> timestamp = timeStampMap.get(tableName);
if (timestamp.isPresent()) { if (timestamp.isPresent()) {
hoodieHiveClient.updateLastReplicatedTimeStamp(tableName, timestamp.get()); syncClient.updateLastReplicatedTimeStamp(tableName, timestamp.get());
LOG.info("updated timestamp for " + tableName + " to: " + timestamp.get()); LOG.info("updated timestamp for " + tableName + " to: " + timestamp.get());
} else { } else {
hoodieHiveClient.deleteLastReplicatedTimeStamp(tableName); syncClient.deleteLastReplicatedTimeStamp(tableName);
LOG.info("deleted timestamp for " + tableName); LOG.info("deleted timestamp for " + tableName);
} }
} }
} }
public static GlobalHiveSyncTool buildGlobalHiveSyncTool(GlobalHiveSyncConfig cfg, HiveConf hiveConf) {
FileSystem fs = FSUtils.getFs(cfg.basePath, new Configuration());
hiveConf.addResource(fs.getConf());
return new GlobalHiveSyncTool(cfg, hiveConf, fs);
}
} }
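With buildGlobalHiveSyncTool removed, callers construct the tool directly from properties plus a Hadoop configuration. A sketch, assuming the tool remains AutoCloseable (as its use in ReplicationStateSync later in this diff suggests) and sourcing props from the params helper in the next file:

    Properties props = params.mkGlobalHiveSyncProps(false); // local cluster side
    try (GlobalHiveSyncTool tool = new GlobalHiveSyncTool(props, new Configuration())) {
      tool.syncHoodieTable(); // inherited HiveSyncTool entry point
    }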


@@ -18,18 +18,22 @@
package org.apache.hudi.hive.replication; package org.apache.hudi.hive.replication;
import com.beust.jcommander.Parameter; import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.util.StringUtils;
import com.beust.jcommander.Parameter;
import com.beust.jcommander.Parameters; import com.beust.jcommander.Parameters;
import java.io.File; import com.beust.jcommander.ParametersDelegate;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import java.io.FileInputStream; import java.io.FileInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.OutputStream;
import java.util.Properties; import java.util.Properties;
import org.apache.hudi.common.util.StringUtils;
import org.apache.log4j.LogManager; import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_URL;
import org.apache.log4j.Logger; import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH;
// TODO: stop extending HiveSyncConfig and take all the variables needed from config file // TODO: stop extending HiveSyncConfig and take all the variables needed from config file
@Parameters(commandDescription = "A tool to sync the hudi table to hive from different clusters. Similar to HiveSyncTool but syncs it to more" @Parameters(commandDescription = "A tool to sync the hudi table to hive from different clusters. Similar to HiveSyncTool but syncs it to more"
@@ -40,9 +44,9 @@ import org.apache.log4j.Logger;
+ " The tool tries to be transactional but does not guarantee it. If the sync fails midway in one cluster it will try to roll back the committed " + " The tool tries to be transactional but does not guarantee it. If the sync fails midway in one cluster it will try to roll back the committed "
+ " timestamp from already successful sync on other clusters but that can also fail." + " timestamp from already successful sync on other clusters but that can also fail."
+ " The tool does not roll back any synced partitions but only the timestamp.") + " The tool does not roll back any synced partitions but only the timestamp.")
public class HiveSyncGlobalCommitConfig extends GlobalHiveSyncConfig { public class HiveSyncGlobalCommitParams {
private static final Logger LOG = LogManager.getLogger(HiveSyncGlobalCommitConfig.class); private static final Logger LOG = LogManager.getLogger(HiveSyncGlobalCommitParams.class);
public static String LOCAL_HIVE_SITE_URI = "hivesyncglobal.local_hive_site_uri"; public static String LOCAL_HIVE_SITE_URI = "hivesyncglobal.local_hive_site_uri";
public static String REMOTE_HIVE_SITE_URI = "hivesyncglobal.remote_hive_site_uri"; public static String REMOTE_HIVE_SITE_URI = "hivesyncglobal.remote_hive_site_uri";
@@ -55,7 +59,14 @@ public class HiveSyncGlobalCommitConfig extends GlobalHiveSyncConfig {
"--config-xml-file"}, description = "path to the config file in Hive", required = true) "--config-xml-file"}, description = "path to the config file in Hive", required = true)
public String configFile; public String configFile;
public Properties properties = new Properties(); @ParametersDelegate()
public final GlobalHiveSyncConfig.GlobalHiveSyncConfigParams globalHiveSyncConfigParams = new GlobalHiveSyncConfig.GlobalHiveSyncConfigParams();
public boolean isHelp() {
return globalHiveSyncConfigParams.isHelp();
}
public Properties loadedProps = new Properties();
private boolean finalize = false; private boolean finalize = false;
@@ -64,33 +75,33 @@ public class HiveSyncGlobalCommitConfig extends GlobalHiveSyncConfig {
throw new RuntimeException("trying to modify finalized config"); throw new RuntimeException("trying to modify finalized config");
} }
finalize = true; finalize = true;
try (InputStream configStream = new FileInputStream(new File(configFile))) { try (InputStream configStream = new FileInputStream(configFile)) {
properties.loadFromXML(configStream); loadedProps.loadFromXML(configStream);
} }
if (StringUtils.isNullOrEmpty(globallyReplicatedTimeStamp)) { if (StringUtils.isNullOrEmpty(globalHiveSyncConfigParams.globallyReplicatedTimeStamp)) {
throw new RuntimeException("globally replicated timestamp not set"); throw new RuntimeException("globally replicated timestamp not set");
} }
} }
GlobalHiveSyncConfig mkGlobalHiveSyncConfig(boolean forRemote) { Properties mkGlobalHiveSyncProps(boolean forRemote) {
GlobalHiveSyncConfig cfg = GlobalHiveSyncConfig.copy(this); TypedProperties props = new TypedProperties(loadedProps);
cfg.basePath = forRemote ? properties.getProperty(REMOTE_BASE_PATH) props.putAll(globalHiveSyncConfigParams.toProps());
: properties.getProperty(LOCAL_BASE_PATH, cfg.basePath); String basePath = forRemote ? loadedProps.getProperty(REMOTE_BASE_PATH)
cfg.jdbcUrl = forRemote ? properties.getProperty(REMOTE_HIVE_SERVER_JDBC_URLS) : loadedProps.getProperty(LOCAL_BASE_PATH, loadedProps.getProperty(META_SYNC_BASE_PATH.key()));
: properties.getProperty(LOCAL_HIVE_SERVER_JDBC_URLS, cfg.jdbcUrl); props.setPropertyIfNonNull(META_SYNC_BASE_PATH.key(), basePath);
LOG.info("building hivesync config forRemote: " + forRemote + " " + cfg.jdbcUrl + " " String jdbcUrl = forRemote ? loadedProps.getProperty(REMOTE_HIVE_SERVER_JDBC_URLS)
+ cfg.basePath); : loadedProps.getProperty(LOCAL_HIVE_SERVER_JDBC_URLS, loadedProps.getProperty(HIVE_URL.key()));
return cfg; props.setPropertyIfNonNull(HIVE_URL.key(), jdbcUrl);
LOG.info("building hivesync config forRemote: " + forRemote + " " + jdbcUrl + " "
+ basePath);
return props;
} }
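The precedence rule is simple: for the remote cluster the REMOTE_* entries win outright, while the local cluster prefers LOCAL_* entries and falls back to the generic sync keys already present in the loaded props. A sketch of building both sides:

    Properties localProps = params.mkGlobalHiveSyncProps(false);  // LOCAL_BASE_PATH / LOCAL_HIVE_SERVER_JDBC_URLS, else generic keys
    Properties remoteProps = params.mkGlobalHiveSyncProps(true);  // REMOTE_BASE_PATH / REMOTE_HIVE_SERVER_JDBC_URLS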
@Override @Override
public String toString() { public String toString() {
return "HiveSyncGlobalCommitConfig{ " + "configFile=" + configFile + ", properties=" return "HiveSyncGlobalCommitParams{ " + "configFile=" + configFile + ", properties="
+ properties + ", " + super.toString() + loadedProps + ", " + super.toString()
+ " }"; + " }";
} }
public void storeToXML(OutputStream configStream) throws IOException {
this.properties.storeToXML(configStream, "hivesync global config");
}
} }


@@ -18,36 +18,37 @@
package org.apache.hudi.hive.replication; package org.apache.hudi.hive.replication;
import static org.apache.hudi.hive.replication.HiveSyncGlobalCommitConfig.LOCAL_HIVE_SITE_URI; import org.apache.hudi.hive.HoodieHiveSyncException;
import static org.apache.hudi.hive.replication.HiveSyncGlobalCommitConfig.REMOTE_HIVE_SITE_URI;
import com.beust.jcommander.JCommander; import com.beust.jcommander.JCommander;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hudi.hive.HoodieHiveSyncException; import static org.apache.hudi.hive.replication.HiveSyncGlobalCommitParams.LOCAL_HIVE_SITE_URI;
import org.apache.log4j.LogManager; import static org.apache.hudi.hive.replication.HiveSyncGlobalCommitParams.REMOTE_HIVE_SITE_URI;
import org.apache.log4j.Logger;
public class HiveSyncGlobalCommitTool implements HiveSyncGlobalCommit, AutoCloseable { public class HiveSyncGlobalCommitTool implements HiveSyncGlobalCommit, AutoCloseable {
private static final Logger LOG = LogManager.getLogger(HiveSyncGlobalCommitTool.class); private static final Logger LOG = LogManager.getLogger(HiveSyncGlobalCommitTool.class);
private final HiveSyncGlobalCommitConfig config; private final HiveSyncGlobalCommitParams params;
private List<ReplicationStateSync> replicationStateSyncList; private final List<ReplicationStateSync> replicationStateSyncList;
private ReplicationStateSync getReplicatedState(boolean forRemote) { ReplicationStateSync getReplicatedState(boolean forRemote) {
HiveConf hiveConf = new HiveConf(); HiveConf hiveConf = new HiveConf();
// we probably just need to set the metastore URIs // we probably just need to set the metastore URIs
// TODO: figure out how to integrate this in production // TODO: figure out how to integrate this in production
// how to load balance between piper HMS,HS2 // how to load balance between piper HMS,HS2
// if we have list of uris, we can do something similar to createHiveConf in reairsync // if we have list of uris, we can do something similar to createHiveConf in reairsync
hiveConf.addResource(new Path(config.properties.getProperty( hiveConf.addResource(new Path(params.loadedProps.getProperty(
forRemote ? REMOTE_HIVE_SITE_URI : LOCAL_HIVE_SITE_URI))); forRemote ? REMOTE_HIVE_SITE_URI : LOCAL_HIVE_SITE_URI)));
// TODO: get clusterId as input parameters // TODO: get clusterId as input parameters
ReplicationStateSync state = new ReplicationStateSync(config.mkGlobalHiveSyncConfig(forRemote), ReplicationStateSync state = new ReplicationStateSync(params.mkGlobalHiveSyncProps(forRemote),
hiveConf, forRemote ? "REMOTESYNC" : "LOCALSYNC"); hiveConf, forRemote ? "REMOTESYNC" : "LOCALSYNC");
return state; return state;
} }
@@ -93,23 +94,24 @@ public class HiveSyncGlobalCommitTool implements HiveSyncGlobalCommit, AutoClose
return true; return true;
} }
public HiveSyncGlobalCommitTool(HiveSyncGlobalCommitConfig config) { public HiveSyncGlobalCommitTool(HiveSyncGlobalCommitParams params) {
this.config = config; this.params = params;
this.replicationStateSyncList = new ArrayList<>(2); this.replicationStateSyncList = new ArrayList<>(2);
this.replicationStateSyncList.add(getReplicatedState(false)); this.replicationStateSyncList.add(getReplicatedState(false));
this.replicationStateSyncList.add(getReplicatedState(true)); this.replicationStateSyncList.add(getReplicatedState(true));
} }
private static HiveSyncGlobalCommitConfig getHiveSyncGlobalCommitConfig(String[] args) private static HiveSyncGlobalCommitParams loadParams(String[] args)
throws IOException { throws IOException {
HiveSyncGlobalCommitConfig cfg = new HiveSyncGlobalCommitConfig(); final HiveSyncGlobalCommitParams params = new HiveSyncGlobalCommitParams();
JCommander cmd = new JCommander(cfg, null, args); JCommander cmd = JCommander.newBuilder().addObject(params).build();
if (cfg.help || args.length == 0) { cmd.parse(args);
if (params.isHelp()) {
cmd.usage(); cmd.usage();
System.exit(1); System.exit(0);
} }
cfg.load(); params.load();
return cfg; return params;
} }
@Override @Override
@@ -120,8 +122,8 @@ public class HiveSyncGlobalCommitTool implements HiveSyncGlobalCommit, AutoClose
} }
public static void main(String[] args) throws IOException, HoodieHiveSyncException { public static void main(String[] args) throws IOException, HoodieHiveSyncException {
final HiveSyncGlobalCommitConfig cfg = getHiveSyncGlobalCommitConfig(args); final HiveSyncGlobalCommitParams params = loadParams(args);
try (final HiveSyncGlobalCommitTool globalCommitTool = new HiveSyncGlobalCommitTool(cfg)) { try (final HiveSyncGlobalCommitTool globalCommitTool = new HiveSyncGlobalCommitTool(params)) {
boolean success = globalCommitTool.commit(); boolean success = globalCommitTool.commit();
if (!success) { if (!success) {
if (!globalCommitTool.rollback()) { if (!globalCommitTool.rollback()) {

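End to end, the tool is driven from the command line with the two options shown above; a hypothetical invocation (the jar name and paths are assumptions):

    java -cp hudi-hive-sync-bundle.jar org.apache.hudi.hive.replication.HiveSyncGlobalCommitTool \
      --config-xml-file /etc/hudi/global-commit.xml \
      --replicated-timestamp 20220622000000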

@@ -18,31 +18,26 @@
package org.apache.hudi.hive.replication; package org.apache.hudi.hive.replication;
import java.util.Map;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.Option;
public class ReplicationStateSync { import org.apache.hadoop.hive.conf.HiveConf;
private GlobalHiveSyncTool globalHiveSyncTool; import java.util.Map;
private final GlobalHiveSyncConfig globalHiveSyncConfig; import java.util.Properties;
private final HiveConf hiveConf;
public class ReplicationStateSync implements AutoCloseable {
protected GlobalHiveSyncTool globalHiveSyncTool;
private Map<String, Option<String>> replicatedTimeStampMap; private Map<String, Option<String>> replicatedTimeStampMap;
private Map<String, Option<String>> oldReplicatedTimeStampMap; private Map<String, Option<String>> oldReplicatedTimeStampMap;
private final String clusterId; private final String clusterId;
ReplicationStateSync(GlobalHiveSyncConfig conf, HiveConf hiveConf, String uid) { ReplicationStateSync(Properties props, HiveConf hiveConf, String uid) {
this.globalHiveSyncConfig = conf; globalHiveSyncTool = new GlobalHiveSyncTool(props, hiveConf);
this.hiveConf = hiveConf;
initGlobalHiveSyncTool();
replicatedTimeStampMap = globalHiveSyncTool.getLastReplicatedTimeStampMap(); replicatedTimeStampMap = globalHiveSyncTool.getLastReplicatedTimeStampMap();
clusterId = uid; clusterId = uid;
} }
private void initGlobalHiveSyncTool() {
globalHiveSyncTool = GlobalHiveSyncTool.buildGlobalHiveSyncTool(globalHiveSyncConfig, hiveConf);
}
public void sync() throws Exception { public void sync() throws Exception {
// the cluster may be down by the time we reach here so we refresh our replication // the cluster may be down by the time we reach here so we refresh our replication
// state right before we set the oldReplicatedTimeStamp to narrow this window. this is a // state right before we set the oldReplicatedTimeStamp to narrow this window. this is a
@@ -80,6 +75,7 @@ public class ReplicationStateSync {
return clusterId; return clusterId;
} }
@Override
public void close() { public void close() {
if (globalHiveSyncTool != null) { if (globalHiveSyncTool != null) {
globalHiveSyncTool.close(); globalHiveSyncTool.close();


@@ -9,14 +9,15 @@
* *
* http://www.apache.org/licenses/LICENSE-2.0 * http://www.apache.org/licenses/LICENSE-2.0
* *
* Unless required by applicable law or agreed to in writing, software * Unless required by applicable law or agreed to in writing,
* distributed under the License is distributed on an "AS IS" BASIS, * software distributed under the License is distributed on an
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* See the License for the specific language governing permissions and * KIND, either express or implied. See the License for the
* limitations under the License. * specific language governing permissions and limitations
* under the License.
*/ */
package org.apache.hudi.hive; package org.apache.hudi.hive.transaction.lock;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.conf.HiveConf;


@@ -18,20 +18,26 @@
package org.apache.hudi.hive.util; package org.apache.hudi.hive.util;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.hive.metastore.IMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
import org.apache.hadoop.hive.metastore.api.Partition;
import org.apache.hudi.common.util.PartitionPathEncodeUtils; import org.apache.hudi.common.util.PartitionPathEncodeUtils;
import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.hive.HiveSyncConfig; import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.hudi.hive.HoodieHiveSyncException; import org.apache.hudi.hive.HoodieHiveSyncException;
import org.apache.hudi.hive.PartitionValueExtractor; import org.apache.hudi.sync.common.model.PartitionValueExtractor;
import org.apache.hadoop.hive.metastore.IMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
import org.apache.hadoop.hive.metastore.api.Partition;
import org.apache.log4j.LogManager; import org.apache.log4j.LogManager;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.apache.thrift.TException; import org.apache.thrift.TException;
import java.util.ArrayList;
import java.util.List;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DECODE_PARTITION;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS;
public class HivePartitionUtil { public class HivePartitionUtil {
private static final Logger LOG = LogManager.getLogger(HivePartitionUtil.class); private static final Logger LOG = LogManager.getLogger(HivePartitionUtil.class);
@@ -40,18 +46,18 @@ public class HivePartitionUtil {
*/ */
public static String getPartitionClauseForDrop(String partition, PartitionValueExtractor partitionValueExtractor, HiveSyncConfig config) { public static String getPartitionClauseForDrop(String partition, PartitionValueExtractor partitionValueExtractor, HiveSyncConfig config) {
List<String> partitionValues = partitionValueExtractor.extractPartitionValuesInPath(partition); List<String> partitionValues = partitionValueExtractor.extractPartitionValuesInPath(partition);
ValidationUtils.checkArgument(config.partitionFields.size() == partitionValues.size(), ValidationUtils.checkArgument(config.getSplitStrings(META_SYNC_PARTITION_FIELDS).size() == partitionValues.size(),
"Partition key parts " + config.partitionFields + " does not match with partition values " + partitionValues "Partition key parts " + config.getSplitStrings(META_SYNC_PARTITION_FIELDS) + " does not match with partition values " + partitionValues
+ ". Check partition strategy. "); + ". Check partition strategy. ");
List<String> partBuilder = new ArrayList<>(); List<String> partBuilder = new ArrayList<>();
for (int i = 0; i < config.partitionFields.size(); i++) { for (int i = 0; i < config.getSplitStrings(META_SYNC_PARTITION_FIELDS).size(); i++) {
String partitionValue = partitionValues.get(i); String partitionValue = partitionValues.get(i);
// decode the partition before sync to hive to prevent multiple escapes of HIVE // decode the partition before sync to hive to prevent multiple escapes of HIVE
if (config.decodePartition) { if (config.getBoolean(META_SYNC_DECODE_PARTITION)) {
// This is a decode operator for encode in KeyGenUtils#getRecordPartitionPath // This is a decode operator for encode in KeyGenUtils#getRecordPartitionPath
partitionValue = PartitionPathEncodeUtils.unescapePathName(partitionValue); partitionValue = PartitionPathEncodeUtils.unescapePathName(partitionValue);
} }
partBuilder.add(config.partitionFields.get(i) + "=" + partitionValue); partBuilder.add(config.getSplitStrings(META_SYNC_PARTITION_FIELDS).get(i) + "=" + partitionValue);
} }
return String.join("/", partBuilder); return String.join("/", partBuilder);
} }
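Note the contrast with QueryBasedDDLExecutor#getPartitionClause: this drop clause is unquoted and joined with '/'. For fields year,month,day and values 2022, 06, 22 (illustrative) it yields:

    year=2022/month=06/day=22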
@@ -61,7 +67,7 @@ public class HivePartitionUtil {
Partition newPartition; Partition newPartition;
try { try {
List<String> partitionValues = partitionValueExtractor.extractPartitionValuesInPath(partitionPath); List<String> partitionValues = partitionValueExtractor.extractPartitionValuesInPath(partitionPath);
newPartition = client.getPartition(config.databaseName, tableName, partitionValues); newPartition = client.getPartition(config.getStringOrDefault(META_SYNC_DATABASE_NAME), tableName, partitionValues);
} catch (NoSuchObjectException ignored) { } catch (NoSuchObjectException ignored) {
newPartition = null; newPartition = null;
} catch (TException e) { } catch (TException e) {


@@ -42,6 +42,12 @@ import java.util.Map;
import java.util.Set; import java.util.Set;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_CREATE_MANAGED_TABLE;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_BUCKET_SYNC_SPEC;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS;
/** /**
* Schema Utilities. * Schema Utilities.
*/ */
@@ -156,7 +162,7 @@ public class HiveSchemaUtil {
* @return : Hive Table schema read from parquet file List[FieldSchema] without partitionField * @return : Hive Table schema read from parquet file List[FieldSchema] without partitionField
*/ */
public static List<FieldSchema> convertParquetSchemaToHiveFieldSchema(MessageType messageType, HiveSyncConfig syncConfig) throws IOException { public static List<FieldSchema> convertParquetSchemaToHiveFieldSchema(MessageType messageType, HiveSyncConfig syncConfig) throws IOException {
return convertMapSchemaToHiveFieldSchema(parquetSchemaToMapSchema(messageType, syncConfig.supportTimestamp, false), syncConfig); return convertMapSchemaToHiveFieldSchema(parquetSchemaToMapSchema(messageType, syncConfig.getBoolean(HIVE_SUPPORT_TIMESTAMP_TYPE), false), syncConfig);
} }
/** /**
@@ -202,7 +208,7 @@ public class HiveSchemaUtil {
public static List<FieldSchema> convertMapSchemaToHiveFieldSchema(LinkedHashMap<String, String> schema, HiveSyncConfig syncConfig) throws IOException { public static List<FieldSchema> convertMapSchemaToHiveFieldSchema(LinkedHashMap<String, String> schema, HiveSyncConfig syncConfig) throws IOException {
return schema.keySet().stream() return schema.keySet().stream()
.map(key -> new FieldSchema(key, schema.get(key).toLowerCase(), "")) .map(key -> new FieldSchema(key, schema.get(key).toLowerCase(), ""))
.filter(field -> !syncConfig.partitionFields.contains(field.getName())) .filter(field -> !syncConfig.getSplitStrings(META_SYNC_PARTITION_FIELDS).contains(field.getName()))
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
@@ -448,11 +454,11 @@ public class HiveSchemaUtil {
public static String generateCreateDDL(String tableName, MessageType storageSchema, HiveSyncConfig config, String inputFormatClass, public static String generateCreateDDL(String tableName, MessageType storageSchema, HiveSyncConfig config, String inputFormatClass,
String outputFormatClass, String serdeClass, Map<String, String> serdeProperties, String outputFormatClass, String serdeClass, Map<String, String> serdeProperties,
Map<String, String> tableProperties) throws IOException { Map<String, String> tableProperties) throws IOException {
Map<String, String> hiveSchema = convertParquetSchemaToHiveSchema(storageSchema, config.supportTimestamp); Map<String, String> hiveSchema = convertParquetSchemaToHiveSchema(storageSchema, config.getBoolean(HIVE_SUPPORT_TIMESTAMP_TYPE));
String columns = generateSchemaString(storageSchema, config.partitionFields, config.supportTimestamp); String columns = generateSchemaString(storageSchema, config.getSplitStrings(META_SYNC_PARTITION_FIELDS), config.getBoolean(HIVE_SUPPORT_TIMESTAMP_TYPE));
List<String> partitionFields = new ArrayList<>(); List<String> partitionFields = new ArrayList<>();
for (String partitionKey : config.partitionFields) { for (String partitionKey : config.getSplitStrings(META_SYNC_PARTITION_FIELDS)) {
String partitionKeyWithTicks = tickSurround(partitionKey); String partitionKeyWithTicks = tickSurround(partitionKey);
partitionFields.add(new StringBuilder().append(partitionKeyWithTicks).append(" ") partitionFields.add(new StringBuilder().append(partitionKeyWithTicks).append(" ")
.append(getPartitionKeyType(hiveSchema, partitionKeyWithTicks)).toString()); .append(getPartitionKeyType(hiveSchema, partitionKeyWithTicks)).toString());
@@ -460,26 +466,26 @@ public class HiveSchemaUtil {
String partitionsStr = String.join(",", partitionFields); String partitionsStr = String.join(",", partitionFields);
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
if (config.createManagedTable) { if (config.getBoolean(HIVE_CREATE_MANAGED_TABLE)) {
sb.append("CREATE TABLE IF NOT EXISTS "); sb.append("CREATE TABLE IF NOT EXISTS ");
} else { } else {
sb.append("CREATE EXTERNAL TABLE IF NOT EXISTS "); sb.append("CREATE EXTERNAL TABLE IF NOT EXISTS ");
} }
sb.append(HIVE_ESCAPE_CHARACTER).append(config.databaseName).append(HIVE_ESCAPE_CHARACTER) sb.append(HIVE_ESCAPE_CHARACTER).append(config.getStringOrDefault(META_SYNC_DATABASE_NAME)).append(HIVE_ESCAPE_CHARACTER)
.append(".").append(HIVE_ESCAPE_CHARACTER).append(tableName).append(HIVE_ESCAPE_CHARACTER); .append(".").append(HIVE_ESCAPE_CHARACTER).append(tableName).append(HIVE_ESCAPE_CHARACTER);
sb.append("( ").append(columns).append(")"); sb.append("( ").append(columns).append(")");
if (!config.partitionFields.isEmpty()) { if (!config.getSplitStrings(META_SYNC_PARTITION_FIELDS).isEmpty()) {
sb.append(" PARTITIONED BY (").append(partitionsStr).append(")"); sb.append(" PARTITIONED BY (").append(partitionsStr).append(")");
} }
if (config.bucketSpec != null) { if (config.getString(HIVE_SYNC_BUCKET_SYNC_SPEC) != null) {
sb.append(' ' + config.bucketSpec + ' '); sb.append(' ' + config.getString(HIVE_SYNC_BUCKET_SYNC_SPEC) + ' ');
} }
sb.append(" ROW FORMAT SERDE '").append(serdeClass).append("'"); sb.append(" ROW FORMAT SERDE '").append(serdeClass).append("'");
if (serdeProperties != null && !serdeProperties.isEmpty()) { if (serdeProperties != null && !serdeProperties.isEmpty()) {
sb.append(" WITH SERDEPROPERTIES (").append(propertyToString(serdeProperties)).append(")"); sb.append(" WITH SERDEPROPERTIES (").append(propertyToString(serdeProperties)).append(")");
} }
sb.append(" STORED AS INPUTFORMAT '").append(inputFormatClass).append("'"); sb.append(" STORED AS INPUTFORMAT '").append(inputFormatClass).append("'");
sb.append(" OUTPUTFORMAT '").append(outputFormatClass).append("' LOCATION '").append(config.basePath).append("'"); sb.append(" OUTPUTFORMAT '").append(outputFormatClass).append("' LOCATION '").append(config.getAbsoluteBasePath()).append("'");
if (tableProperties != null && !tableProperties.isEmpty()) { if (tableProperties != null && !tableProperties.isEmpty()) {
sb.append(" TBLPROPERTIES(").append(propertyToString(tableProperties)).append(")"); sb.append(" TBLPROPERTIES(").append(propertyToString(tableProperties)).append(")");

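For a table with a single partition field, the DDL assembled above comes out roughly as follows, wrapped here for readability (identifiers, serde/format classes, and location are illustrative placeholders):

    CREATE EXTERNAL TABLE IF NOT EXISTS `default`.`stock_ticks` ( `symbol` string, `ts` string, `close` double )
      PARTITIONED BY (`datestr` string)
      ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
      STORED AS INPUTFORMAT 'org.apache.hudi.hadoop.HoodieParquetInputFormat'
      OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
      LOCATION '/base/path/stock_ticks'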

@@ -1,128 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.hive;
import static org.apache.hudi.hadoop.utils.HoodieHiveUtils.GLOBALLY_CONSISTENT_READ_TIMESTAMP;
import static org.apache.hudi.hive.replication.HiveSyncGlobalCommitConfig.LOCAL_BASE_PATH;
import static org.apache.hudi.hive.replication.HiveSyncGlobalCommitConfig.LOCAL_HIVE_SERVER_JDBC_URLS;
import static org.apache.hudi.hive.replication.HiveSyncGlobalCommitConfig.LOCAL_HIVE_SITE_URI;
import static org.apache.hudi.hive.replication.HiveSyncGlobalCommitConfig.REMOTE_BASE_PATH;
import static org.apache.hudi.hive.replication.HiveSyncGlobalCommitConfig.REMOTE_HIVE_SERVER_JDBC_URLS;
import static org.apache.hudi.hive.replication.HiveSyncGlobalCommitConfig.REMOTE_HIVE_SITE_URI;
import java.util.Collections;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.hive.replication.HiveSyncGlobalCommitConfig;
import org.apache.hudi.hive.replication.HiveSyncGlobalCommitTool;
import org.apache.hudi.hive.testutils.TestCluster;
import org.junit.jupiter.api.extension.RegisterExtension;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
public class TestHiveSyncGlobalCommitTool {
@RegisterExtension
public static TestCluster localCluster = new TestCluster();
@RegisterExtension
public static TestCluster remoteCluster = new TestCluster();
private static String DB_NAME = "foo";
private static String TBL_NAME = "bar";
private HiveSyncGlobalCommitConfig getGlobalCommitConfig(
String commitTime, String dbName, String tblName) throws Exception {
HiveSyncGlobalCommitConfig config = new HiveSyncGlobalCommitConfig();
config.properties.setProperty(LOCAL_HIVE_SITE_URI, localCluster.getHiveSiteXmlLocation());
config.properties.setProperty(REMOTE_HIVE_SITE_URI, remoteCluster.getHiveSiteXmlLocation());
config.properties.setProperty(LOCAL_HIVE_SERVER_JDBC_URLS, localCluster.getHiveJdBcUrl());
config.properties.setProperty(REMOTE_HIVE_SERVER_JDBC_URLS, remoteCluster.getHiveJdBcUrl());
config.properties.setProperty(LOCAL_BASE_PATH, localCluster.tablePath(dbName, tblName));
config.properties.setProperty(REMOTE_BASE_PATH, remoteCluster.tablePath(dbName, tblName));
config.globallyReplicatedTimeStamp = commitTime;
config.hiveUser = System.getProperty("user.name");
config.hivePass = "";
config.databaseName = dbName;
config.tableName = tblName;
config.basePath = localCluster.tablePath(dbName, tblName);
config.assumeDatePartitioning = true;
config.usePreApacheInputFormat = false;
config.partitionFields = Collections.singletonList("datestr");
return config;
}
private void compareEqualLastReplicatedTimeStamp(HiveSyncGlobalCommitConfig config) throws Exception {
Assertions.assertEquals(localCluster.getHMSClient()
.getTable(config.databaseName, config.tableName).getParameters()
.get(GLOBALLY_CONSISTENT_READ_TIMESTAMP), remoteCluster.getHMSClient()
.getTable(config.databaseName, config.tableName).getParameters()
.get(GLOBALLY_CONSISTENT_READ_TIMESTAMP), "compare replicated timestamps");
}
@BeforeEach
public void setUp() throws Exception {
localCluster.forceCreateDb(DB_NAME);
remoteCluster.forceCreateDb(DB_NAME);
localCluster.dfsCluster.getFileSystem().delete(new Path(localCluster.tablePath(DB_NAME, TBL_NAME)), true);
remoteCluster.dfsCluster.getFileSystem().delete(new Path(remoteCluster.tablePath(DB_NAME, TBL_NAME)), true);
}
@AfterEach
public void clear() throws Exception {
localCluster.getHMSClient().dropTable(DB_NAME, TBL_NAME);
remoteCluster.getHMSClient().dropTable(DB_NAME, TBL_NAME);
}
@Test
public void testBasicGlobalCommit() throws Exception {
String commitTime = "100";
localCluster.createCOWTable(commitTime, 5, DB_NAME, TBL_NAME);
// simulate drs
remoteCluster.createCOWTable(commitTime, 5, DB_NAME, TBL_NAME);
HiveSyncGlobalCommitConfig config = getGlobalCommitConfig(commitTime, DB_NAME, TBL_NAME);
HiveSyncGlobalCommitTool tool = new HiveSyncGlobalCommitTool(config);
Assertions.assertTrue(tool.commit());
compareEqualLastReplicatedTimeStamp(config);
}
@Test
public void testBasicRollback() throws Exception {
String commitTime = "100";
localCluster.createCOWTable(commitTime, 5, DB_NAME, TBL_NAME);
// simulate drs
remoteCluster.createCOWTable(commitTime, 5, DB_NAME, TBL_NAME);
HiveSyncGlobalCommitConfig config = getGlobalCommitConfig(commitTime, DB_NAME, TBL_NAME);
HiveSyncGlobalCommitTool tool = new HiveSyncGlobalCommitTool(config);
Assertions.assertFalse(localCluster.getHMSClient().tableExists(DB_NAME, TBL_NAME));
Assertions.assertFalse(remoteCluster.getHMSClient().tableExists(DB_NAME, TBL_NAME));
// stop the remote cluster hive server to simulate cluster going down
remoteCluster.stopHiveServer2();
Assertions.assertFalse(tool.commit());
Assertions.assertEquals(commitTime, localCluster.getHMSClient()
.getTable(config.databaseName, config.tableName).getParameters()
.get(GLOBALLY_CONSISTENT_READ_TIMESTAMP));
Assertions.assertTrue(tool.rollback()); // do a rollback
Assertions.assertNotEquals(commitTime, localCluster.getHMSClient()
.getTable(config.databaseName, config.tableName).getParameters()
.get(GLOBALLY_CONSISTENT_READ_TIMESTAMP));
Assertions.assertFalse(remoteCluster.getHMSClient().tableExists(DB_NAME, TBL_NAME));
remoteCluster.startHiveServer2();
}
}


@@ -27,19 +27,18 @@ import org.apache.hudi.common.testutils.SchemaTestUtil;
import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.common.util.collection.ImmutablePair; import org.apache.hudi.common.util.collection.ImmutablePair;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.hive.testutils.HiveTestUtil; import org.apache.hudi.hive.testutils.HiveTestUtil;
import org.apache.hudi.sync.common.model.FieldSchema;
import org.apache.hudi.sync.common.model.Partition;
import org.apache.hudi.sync.common.model.PartitionEvent;
import org.apache.hudi.sync.common.model.PartitionEvent.PartitionEventType;
import org.apache.hudi.sync.common.util.ConfigUtils; import org.apache.hudi.sync.common.util.ConfigUtils;
import org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent;
import org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent.PartitionEventType;
import org.apache.avro.Schema; import org.apache.avro.Schema;
import org.apache.avro.Schema.Field; import org.apache.avro.Schema.Field;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.Partition;
import org.apache.hadoop.hive.ql.Driver; import org.apache.hadoop.hive.ql.Driver;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.session.SessionState; import org.apache.hadoop.hive.ql.session.SessionState;
import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.AfterEach;
@@ -53,17 +52,30 @@ import java.net.URISyntaxException;
import java.time.ZonedDateTime; import java.time.ZonedDateTime;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Locale; import java.util.Locale;
import java.util.Map; import java.util.Map;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_AUTO_CREATE_DATABASE;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_CREATE_MANAGED_TABLE;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_IGNORE_EXCEPTIONS;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_AS_DATA_SOURCE_TABLE;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_COMMENT;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_MODE;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_TABLE_PROPERTIES;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_TABLE_SERDE_PROPERTIES;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_URL;
import static org.apache.hudi.hive.testutils.HiveTestUtil.basePath; import static org.apache.hudi.hive.testutils.HiveTestUtil.basePath;
import static org.apache.hudi.hive.testutils.HiveTestUtil.ddlExecutor; import static org.apache.hudi.hive.testutils.HiveTestUtil.ddlExecutor;
import static org.apache.hudi.hive.testutils.HiveTestUtil.fileSystem;
import static org.apache.hudi.hive.testutils.HiveTestUtil.getHiveConf; import static org.apache.hudi.hive.testutils.HiveTestUtil.getHiveConf;
import static org.apache.hudi.hive.testutils.HiveTestUtil.hiveSyncProps; import static org.apache.hudi.hive.testutils.HiveTestUtil.hiveSyncProps;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_CONDITIONAL_SYNC;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS;
import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertFalse;
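The import hunk above captures the gist of the refactor: config keys move off HiveSyncConfig onto HiveSyncConfigHolder and HoodieSyncConfig, and the Hive metastore's FieldSchema/Partition types give way to engine-neutral models under org.apache.hudi.sync.common.model. A hedged sketch of the resulting property-setup style (the "hms" mode value is an assumption for illustration):

import org.apache.hudi.common.config.TypedProperties;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_MODE;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;

TypedProperties props = new TypedProperties();
props.setProperty(HIVE_SYNC_MODE.key(), "hms");              // sync through the metastore client
props.setProperty(META_SYNC_DATABASE_NAME.key(), "default"); // target Hive database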
@@ -92,7 +104,7 @@ public class TestHiveSyncTool {
} }
private HiveSyncTool hiveSyncTool; private HiveSyncTool hiveSyncTool;
private HoodieHiveClient hiveClient; private HoodieHiveSyncClient hiveClient;
@AfterAll @AfterAll
public static void cleanUpClass() { public static void cleanUpClass() {
@@ -131,7 +143,7 @@ public class TestHiveSyncTool {
@ParameterizedTest @ParameterizedTest
@MethodSource({"syncModeAndSchemaFromCommitMetadata"}) @MethodSource({"syncModeAndSchemaFromCommitMetadata"})
public void testBasicSync(boolean useSchemaFromCommitMetadata, String syncMode) throws Exception { public void testBasicSync(boolean useSchemaFromCommitMetadata, String syncMode) throws Exception {
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
String instantTime = "100"; String instantTime = "100";
HiveTestUtil.createCOWTable(instantTime, 5, useSchemaFromCommitMetadata); HiveTestUtil.createCOWTable(instantTime, 5, useSchemaFromCommitMetadata);
@@ -144,29 +156,29 @@ public class TestHiveSyncTool {
assertTrue(hiveClient.tableExists(HiveTestUtil.TABLE_NAME), assertTrue(hiveClient.tableExists(HiveTestUtil.TABLE_NAME),
"Table " + HiveTestUtil.TABLE_NAME + " should exist after sync completes"); "Table " + HiveTestUtil.TABLE_NAME + " should exist after sync completes");
assertEquals(hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).size(), assertEquals(hiveClient.getMetastoreSchema(HiveTestUtil.TABLE_NAME).size(),
hiveClient.getDataSchema().getColumns().size() + 1, hiveClient.getStorageSchema().getColumns().size() + 1,
"Hive Schema should match the table schema + partition field"); "Hive Schema should match the table schema + partition field");
assertEquals(5, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(), assertEquals(5, hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME).size(),
"Table partitions should match the number of partitions we wrote"); "Table partitions should match the number of partitions we wrote");
assertEquals(instantTime, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(), assertEquals(instantTime, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(),
"The last commit that was synced should be updated in the TBLPROPERTIES"); "The last commit that was synced should be updated in the TBLPROPERTIES");
// Adding of new partitions // Adding of new partitions
List<String> newPartition = Arrays.asList("2050/01/01"); List<String> newPartition = Collections.singletonList("2050/01/01");
hiveClient.addPartitionsToTable(HiveTestUtil.TABLE_NAME, Arrays.asList()); hiveClient.addPartitionsToTable(HiveTestUtil.TABLE_NAME, Collections.emptyList());
assertEquals(5, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(), assertEquals(5, hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME).size(),
"No new partition should be added"); "No new partition should be added");
hiveClient.addPartitionsToTable(HiveTestUtil.TABLE_NAME, newPartition); hiveClient.addPartitionsToTable(HiveTestUtil.TABLE_NAME, newPartition);
assertEquals(6, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(), assertEquals(6, hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME).size(),
"New partition should be added"); "New partition should be added");
// Update partitions // Update partitions
hiveClient.updatePartitionsToTable(HiveTestUtil.TABLE_NAME, Arrays.asList()); hiveClient.updatePartitionsToTable(HiveTestUtil.TABLE_NAME, Collections.emptyList());
assertEquals(6, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(), assertEquals(6, hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME).size(),
"Partition count should remain the same"); "Partition count should remain the same");
hiveClient.updatePartitionsToTable(HiveTestUtil.TABLE_NAME, newPartition); hiveClient.updatePartitionsToTable(HiveTestUtil.TABLE_NAME, newPartition);
assertEquals(6, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(), assertEquals(6, hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME).size(),
"Partition count should remain the same"); "Partition count should remain the same");
// Alter partitions // Alter partitions
@@ -175,7 +187,7 @@ public class TestHiveSyncTool {
ddlExecutor.runSQL("ALTER TABLE `" + HiveTestUtil.TABLE_NAME ddlExecutor.runSQL("ALTER TABLE `" + HiveTestUtil.TABLE_NAME
+ "` PARTITION (`datestr`='2050-01-01') SET LOCATION '/some/new/location'"); + "` PARTITION (`datestr`='2050-01-01') SET LOCATION '/some/new/location'");
List<org.apache.hudi.sync.common.model.Partition> hivePartitions = hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME); List<Partition> hivePartitions = hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME);
List<String> writtenPartitionsSince = hiveClient.getPartitionsWrittenToSince(Option.empty()); List<String> writtenPartitionsSince = hiveClient.getPartitionsWrittenToSince(Option.empty());
List<PartitionEvent> partitionEvents = hiveClient.getPartitionEvents(hivePartitions, writtenPartitionsSince, false); List<PartitionEvent> partitionEvents = hiveClient.getPartitionEvents(hivePartitions, writtenPartitionsSince, false);
assertEquals(1, partitionEvents.size(), "There should be only one partition event"); assertEquals(1, partitionEvents.size(), "There should be only one partition event");
@@ -186,7 +198,7 @@ public class TestHiveSyncTool {
reSyncHiveTable(); reSyncHiveTable();
// Sync should update the changed partition to correct path // Sync should update the changed partition to correct path
List<Partition> tablePartitions = hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME); List<Partition> tablePartitions = hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME);
assertEquals(6, tablePartitions.size(), "The one partition we wrote should be added to hive"); assertEquals(6, tablePartitions.size(), "The one partition we wrote should be added to hive");
assertEquals(instantTime, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(), assertEquals(instantTime, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(),
"The last commit that was synced should be 100"); "The last commit that was synced should be 100");
@@ -195,33 +207,33 @@ public class TestHiveSyncTool {
@ParameterizedTest @ParameterizedTest
@MethodSource({"syncMode"}) @MethodSource({"syncMode"})
public void testSyncDataBase(String syncMode) throws Exception { public void testSyncDataBase(String syncMode) throws Exception {
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
String instantTime = "100"; String instantTime = "100";
HiveTestUtil.createCOWTable(instantTime, 5, true); HiveTestUtil.createCOWTable(instantTime, 5, true);
hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_DATABASE_NAME.key(), HiveTestUtil.DB_NAME); hiveSyncProps.setProperty(META_SYNC_DATABASE_NAME.key(), HiveTestUtil.DB_NAME);
// while autoCreateDatabase is false and the database does not exist; // while autoCreateDatabase is false and the database does not exist;
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_AUTO_CREATE_DATABASE.key(), "false"); hiveSyncProps.setProperty(HIVE_AUTO_CREATE_DATABASE.key(), "false");
reinitHiveSyncClient(); reinitHiveSyncClient();
// Lets do the sync // Lets do the sync
assertThrows(Exception.class, (this::reSyncHiveTable)); assertThrows(Exception.class, (this::reSyncHiveTable));
// while autoCreateDatabase is true and the database does not exist; // while autoCreateDatabase is true and the database does not exist;
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_AUTO_CREATE_DATABASE.key(), "true"); hiveSyncProps.setProperty(HIVE_AUTO_CREATE_DATABASE.key(), "true");
reinitHiveSyncClient(); reinitHiveSyncClient();
assertDoesNotThrow((this::reSyncHiveTable)); assertDoesNotThrow((this::reSyncHiveTable));
assertTrue(hiveClient.databaseExists(HiveTestUtil.DB_NAME), assertTrue(hiveClient.databaseExists(HiveTestUtil.DB_NAME),
"DataBases " + HiveTestUtil.DB_NAME + " should exist after sync completes"); "DataBases " + HiveTestUtil.DB_NAME + " should exist after sync completes");
// while autoCreateDatabase is false and database exists; // while autoCreateDatabase is false and database exists;
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_AUTO_CREATE_DATABASE.key(), "false"); hiveSyncProps.setProperty(HIVE_AUTO_CREATE_DATABASE.key(), "false");
reinitHiveSyncClient(); reinitHiveSyncClient();
assertDoesNotThrow((this::reSyncHiveTable)); assertDoesNotThrow((this::reSyncHiveTable));
assertTrue(hiveClient.databaseExists(HiveTestUtil.DB_NAME), assertTrue(hiveClient.databaseExists(HiveTestUtil.DB_NAME),
"DataBases " + HiveTestUtil.DB_NAME + " should exist after sync completes"); "DataBases " + HiveTestUtil.DB_NAME + " should exist after sync completes");
// while autoCreateDatabase is true and database exists; // while autoCreateDatabase is true and database exists;
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_AUTO_CREATE_DATABASE.key(), "true"); hiveSyncProps.setProperty(HIVE_AUTO_CREATE_DATABASE.key(), "true");
assertDoesNotThrow((this::reSyncHiveTable)); assertDoesNotThrow((this::reSyncHiveTable));
assertTrue(hiveClient.databaseExists(HiveTestUtil.DB_NAME), assertTrue(hiveClient.databaseExists(HiveTestUtil.DB_NAME),
"DataBases " + HiveTestUtil.DB_NAME + " should exist after sync completes"); "DataBases " + HiveTestUtil.DB_NAME + " should exist after sync completes");
@@ -244,10 +256,10 @@ public class TestHiveSyncTool {
put("tp_1", "p1"); put("tp_1", "p1");
} }
}; };
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_AS_DATA_SOURCE_TABLE.key(), String.valueOf(syncAsDataSourceTable)); hiveSyncProps.setProperty(HIVE_SYNC_AS_DATA_SOURCE_TABLE.key(), String.valueOf(syncAsDataSourceTable));
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_TABLE_SERDE_PROPERTIES.key(), ConfigUtils.configToString(serdeProperties)); hiveSyncProps.setProperty(HIVE_TABLE_SERDE_PROPERTIES.key(), ConfigUtils.configToString(serdeProperties));
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_TABLE_PROPERTIES.key(), ConfigUtils.configToString(tableProperties)); hiveSyncProps.setProperty(HIVE_TABLE_PROPERTIES.key(), ConfigUtils.configToString(tableProperties));
String instantTime = "100"; String instantTime = "100";
HiveTestUtil.createCOWTable(instantTime, 5, useSchemaFromCommitMetadata); HiveTestUtil.createCOWTable(instantTime, 5, useSchemaFromCommitMetadata);
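ConfigUtils.configToString flattens the property maps before they reach setProperty; a hedged sketch of the round trip, assuming a key=value, newline-separated encoding that the sync tool parses back into a map:

Map<String, String> serdeProps = new HashMap<>();
serdeProps.put("path", "/tmp/hoodie/stock_ticks"); // hypothetical serde property
hiveSyncProps.setProperty(HIVE_TABLE_SERDE_PROPERTIES.key(), ConfigUtils.configToString(serdeProps));
// the sync tool later decodes the string back into a map when issuing DDL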
@@ -335,10 +347,10 @@ public class TestHiveSyncTool {
put("tp_1", "p1"); put("tp_1", "p1");
} }
}; };
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_AS_DATA_SOURCE_TABLE.key(), String.valueOf(syncAsDataSourceTable)); hiveSyncProps.setProperty(HIVE_SYNC_AS_DATA_SOURCE_TABLE.key(), String.valueOf(syncAsDataSourceTable));
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_TABLE_SERDE_PROPERTIES.key(), ConfigUtils.configToString(serdeProperties)); hiveSyncProps.setProperty(HIVE_TABLE_SERDE_PROPERTIES.key(), ConfigUtils.configToString(serdeProperties));
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_TABLE_PROPERTIES.key(), ConfigUtils.configToString(tableProperties)); hiveSyncProps.setProperty(HIVE_TABLE_PROPERTIES.key(), ConfigUtils.configToString(tableProperties));
String instantTime = "100"; String instantTime = "100";
String deltaCommitTime = "101"; String deltaCommitTime = "101";
@@ -394,8 +406,8 @@ public class TestHiveSyncTool {
public void testSyncManagedTable(boolean useSchemaFromCommitMetadata, public void testSyncManagedTable(boolean useSchemaFromCommitMetadata,
boolean isManagedTable, boolean isManagedTable,
String syncMode) throws Exception { String syncMode) throws Exception {
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_CREATE_MANAGED_TABLE.key(), String.valueOf(isManagedTable)); hiveSyncProps.setProperty(HIVE_CREATE_MANAGED_TABLE.key(), String.valueOf(isManagedTable));
String instantTime = "100"; String instantTime = "100";
HiveTestUtil.createCOWTable(instantTime, 5, useSchemaFromCommitMetadata); HiveTestUtil.createCOWTable(instantTime, 5, useSchemaFromCommitMetadata);
@@ -422,13 +434,13 @@ public class TestHiveSyncTool {
@ParameterizedTest @ParameterizedTest
@MethodSource("syncMode") @MethodSource("syncMode")
public void testSyncWithSchema(String syncMode) throws Exception { public void testSyncWithSchema(String syncMode) throws Exception {
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
String commitTime = "100"; String commitTime = "100";
HiveTestUtil.createCOWTableWithSchema(commitTime, "/complex.schema.avsc"); HiveTestUtil.createCOWTableWithSchema(commitTime, "/complex.schema.avsc");
reinitHiveSyncClient(); reinitHiveSyncClient();
reSyncHiveTable(); reSyncHiveTable();
assertEquals(1, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(), assertEquals(1, hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME).size(),
"Table partitions should match the number of partitions we wrote"); "Table partitions should match the number of partitions we wrote");
assertEquals(commitTime, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(), assertEquals(commitTime, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(),
"The last commit that was synced should be updated in the TBLPROPERTIES"); "The last commit that was synced should be updated in the TBLPROPERTIES");
@@ -437,12 +449,12 @@ public class TestHiveSyncTool {
@ParameterizedTest @ParameterizedTest
@MethodSource("syncMode") @MethodSource("syncMode")
public void testSyncIncremental(String syncMode) throws Exception { public void testSyncIncremental(String syncMode) throws Exception {
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
String commitTime1 = "100"; String commitTime1 = "100";
HiveTestUtil.createCOWTable(commitTime1, 5, true); HiveTestUtil.createCOWTable(commitTime1, 5, true);
reinitHiveSyncClient(); reinitHiveSyncClient();
reSyncHiveTable(); reSyncHiveTable();
assertEquals(5, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(), assertEquals(5, hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME).size(),
"Table partitions should match the number of partitions we wrote"); "Table partitions should match the number of partitions we wrote");
assertEquals(commitTime1, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(), assertEquals(commitTime1, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(),
"The last commit that was synced should be updated in the TBLPROPERTIES"); "The last commit that was synced should be updated in the TBLPROPERTIES");
@@ -463,7 +475,7 @@ public class TestHiveSyncTool {
// Sync should add the one partition // Sync should add the one partition
reSyncHiveTable(); reSyncHiveTable();
assertEquals(6, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(), assertEquals(6, hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME).size(),
"The one partition we wrote should be added to hive"); "The one partition we wrote should be added to hive");
assertEquals(commitTime2, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(), assertEquals(commitTime2, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(),
"The last commit that was synced should be 101"); "The last commit that was synced should be 101");
@@ -472,13 +484,13 @@ public class TestHiveSyncTool {
@ParameterizedTest @ParameterizedTest
@MethodSource("syncMode") @MethodSource("syncMode")
public void testSyncIncrementalWithSchemaEvolution(String syncMode) throws Exception { public void testSyncIncrementalWithSchemaEvolution(String syncMode) throws Exception {
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
String commitTime1 = "100"; String commitTime1 = "100";
HiveTestUtil.createCOWTable(commitTime1, 5, true); HiveTestUtil.createCOWTable(commitTime1, 5, true);
reinitHiveSyncClient(); reinitHiveSyncClient();
reSyncHiveTable(); reSyncHiveTable();
int fields = hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).size(); int fields = hiveClient.getMetastoreSchema(HiveTestUtil.TABLE_NAME).size();
// Now lets create more partitions and these are the only ones which need to be synced // Now lets create more partitions and these are the only ones which need to be synced
ZonedDateTime dateTime = ZonedDateTime.now().plusDays(6); ZonedDateTime dateTime = ZonedDateTime.now().plusDays(6);
@@ -488,15 +500,15 @@ public class TestHiveSyncTool {
// Lets do the sync // Lets do the sync
reinitHiveSyncClient(); reinitHiveSyncClient();
reSyncHiveTable(); reSyncHiveTable();
assertEquals(fields + 3, hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).size(), assertEquals(fields + 3, hiveClient.getMetastoreSchema(HiveTestUtil.TABLE_NAME).size(),
"Hive Schema has evolved and should not be 3 more field"); "Hive Schema has evolved and should not be 3 more field");
assertEquals("BIGINT", hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).get("favorite_number"), assertEquals("BIGINT", hiveClient.getMetastoreSchema(HiveTestUtil.TABLE_NAME).get("favorite_number"),
"Hive Schema has evolved - Field favorite_number has evolved from int to long"); "Hive Schema has evolved - Field favorite_number has evolved from int to long");
assertTrue(hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).containsKey("favorite_movie"), assertTrue(hiveClient.getMetastoreSchema(HiveTestUtil.TABLE_NAME).containsKey("favorite_movie"),
"Hive Schema has evolved - Field favorite_movie was added"); "Hive Schema has evolved - Field favorite_movie was added");
// Sync should add the one partition // Sync should add the one partition
assertEquals(6, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(), assertEquals(6, hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME).size(),
"The one partition we wrote should be added to hive"); "The one partition we wrote should be added to hive");
assertEquals(commitTime2, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(), assertEquals(commitTime2, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(),
"The last commit that was synced should be 101"); "The last commit that was synced should be 101");
@@ -505,13 +517,13 @@ public class TestHiveSyncTool {
@ParameterizedTest @ParameterizedTest
@MethodSource("syncMode") @MethodSource("syncMode")
public void testUpdateTableComments(String syncMode) throws Exception { public void testUpdateTableComments(String syncMode) throws Exception {
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
String commitTime = "100"; String commitTime = "100";
HiveTestUtil.createCOWTableWithSchema(commitTime, "/simple-test.avsc"); HiveTestUtil.createCOWTableWithSchema(commitTime, "/simple-test.avsc");
reinitHiveSyncClient(); reinitHiveSyncClient();
reSyncHiveTable(); reSyncHiveTable();
Map<String, ImmutablePair<String,String>> alterCommentSchema = new HashMap<>(); Map<String, Pair<String, String>> alterCommentSchema = new HashMap<>();
//generate commented schema field //generate commented schema field
Schema schema = SchemaTestUtil.getSchemaFromResource(HiveTestUtil.class, "/simple-test.avsc"); Schema schema = SchemaTestUtil.getSchemaFromResource(HiveTestUtil.class, "/simple-test.avsc");
Schema commentedSchema = SchemaTestUtil.getSchemaFromResource(HiveTestUtil.class, "/simple-test-doced.avsc"); Schema commentedSchema = SchemaTestUtil.getSchemaFromResource(HiveTestUtil.class, "/simple-test-doced.avsc");
@@ -527,10 +539,10 @@ public class TestHiveSyncTool {
ddlExecutor.updateTableComments(HiveTestUtil.TABLE_NAME, alterCommentSchema); ddlExecutor.updateTableComments(HiveTestUtil.TABLE_NAME, alterCommentSchema);
List<FieldSchema> fieldSchemas = hiveClient.getTableCommentUsingMetastoreClient(HiveTestUtil.TABLE_NAME); List<FieldSchema> fieldSchemas = hiveClient.getMetastoreFieldSchemas(HiveTestUtil.TABLE_NAME);
int commentCnt = 0; int commentCnt = 0;
for (FieldSchema fieldSchema : fieldSchemas) { for (FieldSchema fieldSchema : fieldSchemas) {
if (!StringUtils.isNullOrEmpty(fieldSchema.getComment())) { if (StringUtils.nonEmpty(fieldSchema.getCommentOrEmpty())) {
commentCnt++; commentCnt++;
} }
} }
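The ImmutablePair to Pair change above also shows in how the comment map is built; a hedged sketch, assuming each pair carries (field type, new comment) keyed by field name, mirroring how the test derives it from the doced Avro schema:

Map<String, Pair<String, String>> alterCommentSchema = new HashMap<>();
// key: field name; value: assumed to be (hive type, comment text)
alterCommentSchema.put("favorite_number", Pair.of("int", "the user's favorite number"));
ddlExecutor.updateTableComments(HiveTestUtil.TABLE_NAME, alterCommentSchema);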
@@ -540,29 +552,29 @@ public class TestHiveSyncTool {
@ParameterizedTest @ParameterizedTest
@MethodSource("syncMode") @MethodSource("syncMode")
public void testSyncWithCommentedSchema(String syncMode) throws Exception { public void testSyncWithCommentedSchema(String syncMode) throws Exception {
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_COMMENT.key(), "false"); hiveSyncProps.setProperty(HIVE_SYNC_COMMENT.key(), "false");
String commitTime = "100"; String commitTime = "100";
HiveTestUtil.createCOWTableWithSchema(commitTime, "/simple-test-doced.avsc"); HiveTestUtil.createCOWTableWithSchema(commitTime, "/simple-test-doced.avsc");
reinitHiveSyncClient(); reinitHiveSyncClient();
reSyncHiveTable(); reSyncHiveTable();
List<FieldSchema> fieldSchemas = hiveClient.getTableCommentUsingMetastoreClient(HiveTestUtil.TABLE_NAME); List<FieldSchema> fieldSchemas = hiveClient.getMetastoreFieldSchemas(HiveTestUtil.TABLE_NAME);
int commentCnt = 0; int commentCnt = 0;
for (FieldSchema fieldSchema : fieldSchemas) { for (FieldSchema fieldSchema : fieldSchemas) {
if (!StringUtils.isNullOrEmpty(fieldSchema.getComment())) { if (StringUtils.nonEmpty(fieldSchema.getCommentOrEmpty())) {
commentCnt++; commentCnt++;
} }
} }
assertEquals(0, commentCnt, "hive schema field comment count should match the avro schema field doc count"); assertEquals(0, commentCnt, "hive schema field comment count should match the avro schema field doc count");
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_COMMENT.key(), "true"); hiveSyncProps.setProperty(HIVE_SYNC_COMMENT.key(), "true");
reinitHiveSyncClient(); reinitHiveSyncClient();
reSyncHiveTable(); reSyncHiveTable();
fieldSchemas = hiveClient.getTableCommentUsingMetastoreClient(HiveTestUtil.TABLE_NAME); fieldSchemas = hiveClient.getMetastoreFieldSchemas(HiveTestUtil.TABLE_NAME);
commentCnt = 0; commentCnt = 0;
for (FieldSchema fieldSchema : fieldSchemas) { for (FieldSchema fieldSchema : fieldSchemas) {
if (!StringUtils.isNullOrEmpty(fieldSchema.getComment())) { if (StringUtils.nonEmpty(fieldSchema.getCommentOrEmpty())) {
commentCnt++; commentCnt++;
} }
} }
@@ -572,7 +584,7 @@ public class TestHiveSyncTool {
@ParameterizedTest @ParameterizedTest
@MethodSource("syncModeAndSchemaFromCommitMetadata") @MethodSource("syncModeAndSchemaFromCommitMetadata")
public void testSyncMergeOnRead(boolean useSchemaFromCommitMetadata, String syncMode) throws Exception { public void testSyncMergeOnRead(boolean useSchemaFromCommitMetadata, String syncMode) throws Exception {
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
String instantTime = "100"; String instantTime = "100";
String deltaCommitTime = "101"; String deltaCommitTime = "101";
HiveTestUtil.createMORTable(instantTime, deltaCommitTime, 5, true, HiveTestUtil.createMORTable(instantTime, deltaCommitTime, 5, true,
@@ -587,18 +599,18 @@ public class TestHiveSyncTool {
assertTrue(hiveClient.tableExists(roTableName), "Table " + roTableName + " should exist after sync completes"); assertTrue(hiveClient.tableExists(roTableName), "Table " + roTableName + " should exist after sync completes");
if (useSchemaFromCommitMetadata) { if (useSchemaFromCommitMetadata) {
assertEquals(hiveClient.getTableSchema(roTableName).size(), assertEquals(hiveClient.getMetastoreSchema(roTableName).size(),
SchemaTestUtil.getSimpleSchema().getFields().size() + getPartitionFieldSize() SchemaTestUtil.getSimpleSchema().getFields().size() + getPartitionFieldSize()
+ HoodieRecord.HOODIE_META_COLUMNS.size(), + HoodieRecord.HOODIE_META_COLUMNS.size(),
"Hive Schema should match the table schema + partition field"); "Hive Schema should match the table schema + partition field");
} else { } else {
// The data generated and schema in the data file do not have metadata columns, so we need a separate check. // The data generated and schema in the data file do not have metadata columns, so we need a separate check.
assertEquals(hiveClient.getTableSchema(roTableName).size(), assertEquals(hiveClient.getMetastoreSchema(roTableName).size(),
SchemaTestUtil.getSimpleSchema().getFields().size() + getPartitionFieldSize(), SchemaTestUtil.getSimpleSchema().getFields().size() + getPartitionFieldSize(),
"Hive Schema should match the table schema + partition field"); "Hive Schema should match the table schema + partition field");
} }
assertEquals(5, hiveClient.scanTablePartitions(roTableName).size(), assertEquals(5, hiveClient.getAllPartitions(roTableName).size(),
"Table partitions should match the number of partitions we wrote"); "Table partitions should match the number of partitions we wrote");
assertEquals(deltaCommitTime, hiveClient.getLastCommitTimeSynced(roTableName).get(), assertEquals(deltaCommitTime, hiveClient.getLastCommitTimeSynced(roTableName).get(),
"The last commit that was synced should be updated in the TBLPROPERTIES"); "The last commit that was synced should be updated in the TBLPROPERTIES");
@@ -616,18 +628,18 @@ public class TestHiveSyncTool {
reSyncHiveTable(); reSyncHiveTable();
if (useSchemaFromCommitMetadata) { if (useSchemaFromCommitMetadata) {
assertEquals(hiveClient.getTableSchema(roTableName).size(), assertEquals(hiveClient.getMetastoreSchema(roTableName).size(),
SchemaTestUtil.getEvolvedSchema().getFields().size() + getPartitionFieldSize() SchemaTestUtil.getEvolvedSchema().getFields().size() + getPartitionFieldSize()
+ HoodieRecord.HOODIE_META_COLUMNS.size(), + HoodieRecord.HOODIE_META_COLUMNS.size(),
"Hive Schema should match the evolved table schema + partition field"); "Hive Schema should match the evolved table schema + partition field");
} else { } else {
// The data generated and schema in the data file do not have metadata columns, so we need a separate check. // The data generated and schema in the data file do not have metadata columns, so we need a separate check.
assertEquals(hiveClient.getTableSchema(roTableName).size(), assertEquals(hiveClient.getMetastoreSchema(roTableName).size(),
SchemaTestUtil.getEvolvedSchema().getFields().size() + getPartitionFieldSize(), SchemaTestUtil.getEvolvedSchema().getFields().size() + getPartitionFieldSize(),
"Hive Schema should match the evolved table schema + partition field"); "Hive Schema should match the evolved table schema + partition field");
} }
// Sync should add the one partition // Sync should add the one partition
assertEquals(6, hiveClient.scanTablePartitions(roTableName).size(), assertEquals(6, hiveClient.getAllPartitions(roTableName).size(),
"The 2 partitions we wrote should be added to hive"); "The 2 partitions we wrote should be added to hive");
assertEquals(deltaCommitTime2, hiveClient.getLastCommitTimeSynced(roTableName).get(), assertEquals(deltaCommitTime2, hiveClient.getLastCommitTimeSynced(roTableName).get(),
"The last commit that was synced should be 103"); "The last commit that was synced should be 103");
@@ -636,7 +648,7 @@ public class TestHiveSyncTool {
@ParameterizedTest @ParameterizedTest
@MethodSource("syncModeAndSchemaFromCommitMetadata") @MethodSource("syncModeAndSchemaFromCommitMetadata")
public void testSyncMergeOnReadRT(boolean useSchemaFromCommitMetadata, String syncMode) throws Exception { public void testSyncMergeOnReadRT(boolean useSchemaFromCommitMetadata, String syncMode) throws Exception {
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
String instantTime = "100"; String instantTime = "100";
String deltaCommitTime = "101"; String deltaCommitTime = "101";
String snapshotTableName = HiveTestUtil.TABLE_NAME + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE; String snapshotTableName = HiveTestUtil.TABLE_NAME + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE;
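For reference, MOR sync registers two metastore tables per Hudi table, distinguished only by suffix; a hedged sketch, assuming HiveSyncTool also exposes SUFFIX_READ_OPTIMIZED_TABLE alongside the SUFFIX_SNAPSHOT_TABLE used above:

String roTableName = HiveTestUtil.TABLE_NAME + HiveSyncTool.SUFFIX_READ_OPTIMIZED_TABLE; // "_ro": compacted base files only
String rtTableName = HiveTestUtil.TABLE_NAME + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE;       // "_rt": base files merged with log files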
@@ -654,18 +666,18 @@ public class TestHiveSyncTool {
+ " should exist after sync completes"); + " should exist after sync completes");
if (useSchemaFromCommitMetadata) { if (useSchemaFromCommitMetadata) {
assertEquals(hiveClient.getTableSchema(snapshotTableName).size(), assertEquals(hiveClient.getMetastoreSchema(snapshotTableName).size(),
SchemaTestUtil.getSimpleSchema().getFields().size() + getPartitionFieldSize() SchemaTestUtil.getSimpleSchema().getFields().size() + getPartitionFieldSize()
+ HoodieRecord.HOODIE_META_COLUMNS.size(), + HoodieRecord.HOODIE_META_COLUMNS.size(),
"Hive Schema should match the table schema + partition field"); "Hive Schema should match the table schema + partition field");
} else { } else {
// The data generated and schema in the data file do not have metadata columns, so we need a separate check. // The data generated and schema in the data file do not have metadata columns, so we need a separate check.
assertEquals(hiveClient.getTableSchema(snapshotTableName).size(), assertEquals(hiveClient.getMetastoreSchema(snapshotTableName).size(),
SchemaTestUtil.getSimpleSchema().getFields().size() + getPartitionFieldSize(), SchemaTestUtil.getSimpleSchema().getFields().size() + getPartitionFieldSize(),
"Hive Schema should match the table schema + partition field"); "Hive Schema should match the table schema + partition field");
} }
assertEquals(5, hiveClient.scanTablePartitions(snapshotTableName).size(), assertEquals(5, hiveClient.getAllPartitions(snapshotTableName).size(),
"Table partitions should match the number of partitions we wrote"); "Table partitions should match the number of partitions we wrote");
assertEquals(deltaCommitTime, hiveClient.getLastCommitTimeSynced(snapshotTableName).get(), assertEquals(deltaCommitTime, hiveClient.getLastCommitTimeSynced(snapshotTableName).get(),
"The last commit that was synced should be updated in the TBLPROPERTIES"); "The last commit that was synced should be updated in the TBLPROPERTIES");
@@ -682,18 +694,18 @@ public class TestHiveSyncTool {
reSyncHiveTable(); reSyncHiveTable();
if (useSchemaFromCommitMetadata) { if (useSchemaFromCommitMetadata) {
assertEquals(hiveClient.getTableSchema(snapshotTableName).size(), assertEquals(hiveClient.getMetastoreSchema(snapshotTableName).size(),
SchemaTestUtil.getEvolvedSchema().getFields().size() + getPartitionFieldSize() SchemaTestUtil.getEvolvedSchema().getFields().size() + getPartitionFieldSize()
+ HoodieRecord.HOODIE_META_COLUMNS.size(), + HoodieRecord.HOODIE_META_COLUMNS.size(),
"Hive Schema should match the evolved table schema + partition field"); "Hive Schema should match the evolved table schema + partition field");
} else { } else {
// The data generated and schema in the data file do not have metadata columns, so we need a separate check. // The data generated and schema in the data file do not have metadata columns, so we need a separate check.
assertEquals(hiveClient.getTableSchema(snapshotTableName).size(), assertEquals(hiveClient.getMetastoreSchema(snapshotTableName).size(),
SchemaTestUtil.getEvolvedSchema().getFields().size() + getPartitionFieldSize(), SchemaTestUtil.getEvolvedSchema().getFields().size() + getPartitionFieldSize(),
"Hive Schema should match the evolved table schema + partition field"); "Hive Schema should match the evolved table schema + partition field");
} }
// Sync should add the one partition // Sync should add the one partition
assertEquals(6, hiveClient.scanTablePartitions(snapshotTableName).size(), assertEquals(6, hiveClient.getAllPartitions(snapshotTableName).size(),
"The 2 partitions we wrote should be added to hive"); "The 2 partitions we wrote should be added to hive");
assertEquals(deltaCommitTime2, hiveClient.getLastCommitTimeSynced(snapshotTableName).get(), assertEquals(deltaCommitTime2, hiveClient.getLastCommitTimeSynced(snapshotTableName).get(),
"The last commit that was synced should be 103"); "The last commit that was synced should be 103");
@@ -702,12 +714,12 @@ public class TestHiveSyncTool {
@ParameterizedTest @ParameterizedTest
@MethodSource("syncMode") @MethodSource("syncMode")
public void testMultiPartitionKeySync(String syncMode) throws Exception { public void testMultiPartitionKeySync(String syncMode) throws Exception {
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
String instantTime = "100"; String instantTime = "100";
HiveTestUtil.createCOWTable(instantTime, 5, true); HiveTestUtil.createCOWTable(instantTime, 5, true);
hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), MultiPartKeysValueExtractor.class.getCanonicalName()); hiveSyncProps.setProperty(META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), MultiPartKeysValueExtractor.class.getCanonicalName());
hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "year,month,day"); hiveSyncProps.setProperty(META_SYNC_PARTITION_FIELDS.key(), "year,month,day");
HiveTestUtil.getCreatedTablesSet().add(HiveTestUtil.DB_NAME + "." + HiveTestUtil.TABLE_NAME); HiveTestUtil.getCreatedTablesSet().add(HiveTestUtil.DB_NAME + "." + HiveTestUtil.TABLE_NAME);
@@ -718,15 +730,15 @@ public class TestHiveSyncTool {
reSyncHiveTable(); reSyncHiveTable();
assertTrue(hiveClient.tableExists(HiveTestUtil.TABLE_NAME), assertTrue(hiveClient.tableExists(HiveTestUtil.TABLE_NAME),
"Table " + HiveTestUtil.TABLE_NAME + " should exist after sync completes"); "Table " + HiveTestUtil.TABLE_NAME + " should exist after sync completes");
assertEquals(hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).size(), assertEquals(hiveClient.getMetastoreSchema(HiveTestUtil.TABLE_NAME).size(),
hiveClient.getDataSchema().getColumns().size() + 3, hiveClient.getStorageSchema().getColumns().size() + 3,
"Hive Schema should match the table schema + partition fields"); "Hive Schema should match the table schema + partition fields");
assertEquals(5, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(), assertEquals(5, hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME).size(),
"Table partitions should match the number of partitions we wrote"); "Table partitions should match the number of partitions we wrote");
assertEquals(instantTime, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(), assertEquals(instantTime, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(),
"The last commit that was synced should be updated in the TBLPROPERTIES"); "The last commit that was synced should be updated in the TBLPROPERTIES");
// HoodieHiveClient had a bug where partition vals were sorted // HoodieHiveSyncClient had a bug where partition vals were sorted
// and stored as keys in a map. The following tests this particular case. // and stored as keys in a map. The following tests this particular case.
// Now lets create partition "2010/01/02" followed by "2010/02/01". // Now lets create partition "2010/01/02" followed by "2010/02/01".
String commitTime2 = "101"; String commitTime2 = "101";
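The bug called out in the comment above is easy to reproduce in isolation: with partition fields (year, month, day) the extracted values are positional, so sorting them scrambles the field assignment:

List<String> partitionVals = Arrays.asList("2010", "01", "02"); // year, month, day
List<String> sorted = new ArrayList<>(partitionVals);
Collections.sort(sorted); // ["01", "02", "2010"] -- day now lands in the year slot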
@@ -742,7 +754,7 @@ public class TestHiveSyncTool {
reSyncHiveTable(); reSyncHiveTable();
// Sync should add the one partition // Sync should add the one partition
assertEquals(6, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(), assertEquals(6, hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME).size(),
"Table partitions should match the number of partitions we wrote"); "Table partitions should match the number of partitions we wrote");
assertEquals(commitTime2, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(), assertEquals(commitTime2, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(),
"The last commit that was synced should be 101"); "The last commit that was synced should be 101");
@@ -756,10 +768,10 @@ public class TestHiveSyncTool {
reSyncHiveTable(); reSyncHiveTable();
assertTrue(hiveClient.tableExists(HiveTestUtil.TABLE_NAME), assertTrue(hiveClient.tableExists(HiveTestUtil.TABLE_NAME),
"Table " + HiveTestUtil.TABLE_NAME + " should exist after sync completes"); "Table " + HiveTestUtil.TABLE_NAME + " should exist after sync completes");
assertEquals(hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).size(), assertEquals(hiveClient.getMetastoreSchema(HiveTestUtil.TABLE_NAME).size(),
hiveClient.getDataSchema().getColumns().size() + 3, hiveClient.getStorageSchema().getColumns().size() + 3,
"Hive Schema should match the table schema + partition fields"); "Hive Schema should match the table schema + partition fields");
assertEquals(7, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(), assertEquals(7, hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME).size(),
"Table partitions should match the number of partitions we wrote"); "Table partitions should match the number of partitions we wrote");
assertEquals(commitTime3, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(), assertEquals(commitTime3, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(),
"The last commit that was synced should be updated in the TBLPROPERTIES"); "The last commit that was synced should be updated in the TBLPROPERTIES");
@@ -769,7 +781,7 @@ public class TestHiveSyncTool {
@ParameterizedTest @ParameterizedTest
@MethodSource("syncMode") @MethodSource("syncMode")
public void testDropPartitionKeySync(String syncMode) throws Exception { public void testDropPartitionKeySync(String syncMode) throws Exception {
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
String instantTime = "100"; String instantTime = "100";
HiveTestUtil.createCOWTable(instantTime, 1, true); HiveTestUtil.createCOWTable(instantTime, 1, true);
@@ -782,21 +794,21 @@ public class TestHiveSyncTool {
assertTrue(hiveClient.tableExists(HiveTestUtil.TABLE_NAME), assertTrue(hiveClient.tableExists(HiveTestUtil.TABLE_NAME),
"Table " + HiveTestUtil.TABLE_NAME + " should exist after sync completes"); "Table " + HiveTestUtil.TABLE_NAME + " should exist after sync completes");
assertEquals(hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).size(), assertEquals(hiveClient.getMetastoreSchema(HiveTestUtil.TABLE_NAME).size(),
hiveClient.getDataSchema().getColumns().size() + 1, hiveClient.getStorageSchema().getColumns().size() + 1,
"Hive Schema should match the table schema + partition field"); "Hive Schema should match the table schema + partition field");
assertEquals(1, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(), assertEquals(1, hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME).size(),
"Table partitions should match the number of partitions we wrote"); "Table partitions should match the number of partitions we wrote");
assertEquals(instantTime, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(), assertEquals(instantTime, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(),
"The last commit that was synced should be updated in the TBLPROPERTIES"); "The last commit that was synced should be updated in the TBLPROPERTIES");
// Adding of new partitions // Adding of new partitions
List<String> newPartition = Arrays.asList("2050/01/01"); List<String> newPartition = Collections.singletonList("2050/01/01");
hiveClient.addPartitionsToTable(HiveTestUtil.TABLE_NAME, Arrays.asList()); hiveClient.addPartitionsToTable(HiveTestUtil.TABLE_NAME, Collections.emptyList());
assertEquals(1, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(), assertEquals(1, hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME).size(),
"No new partition should be added"); "No new partition should be added");
hiveClient.addPartitionsToTable(HiveTestUtil.TABLE_NAME, newPartition); hiveClient.addPartitionsToTable(HiveTestUtil.TABLE_NAME, newPartition);
assertEquals(2, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(), assertEquals(2, hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME).size(),
"New partition should be added"); "New partition should be added");
reSyncHiveTable(); reSyncHiveTable();
@@ -805,7 +817,7 @@ public class TestHiveSyncTool {
ddlExecutor.runSQL("ALTER TABLE `" + HiveTestUtil.TABLE_NAME ddlExecutor.runSQL("ALTER TABLE `" + HiveTestUtil.TABLE_NAME
+ "` DROP PARTITION (`datestr`='2050-01-01')"); + "` DROP PARTITION (`datestr`='2050-01-01')");
List<Partition> hivePartitions = hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME); List<Partition> hivePartitions = hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME);
assertEquals(1, hivePartitions.size(), assertEquals(1, hivePartitions.size(),
"Table should have 1 partition because of the drop 1 partition"); "Table should have 1 partition because of the drop 1 partition");
} }
@@ -813,7 +825,7 @@ public class TestHiveSyncTool {
@ParameterizedTest @ParameterizedTest
@MethodSource("syncMode") @MethodSource("syncMode")
public void testDropPartition(String syncMode) throws Exception { public void testDropPartition(String syncMode) throws Exception {
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
String instantTime = "100"; String instantTime = "100";
HiveTestUtil.createCOWTable(instantTime, 1, true); HiveTestUtil.createCOWTable(instantTime, 1, true);
@@ -825,10 +837,10 @@ public class TestHiveSyncTool {
reSyncHiveTable(); reSyncHiveTable();
assertTrue(hiveClient.tableExists(HiveTestUtil.TABLE_NAME), assertTrue(hiveClient.tableExists(HiveTestUtil.TABLE_NAME),
"Table " + HiveTestUtil.TABLE_NAME + " should exist after sync completes"); "Table " + HiveTestUtil.TABLE_NAME + " should exist after sync completes");
assertEquals(hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).size(), assertEquals(hiveClient.getMetastoreSchema(HiveTestUtil.TABLE_NAME).size(),
hiveClient.getDataSchema().getColumns().size() + 1, hiveClient.getStorageSchema().getColumns().size() + 1,
"Hive Schema should match the table schema + partition field"); "Hive Schema should match the table schema + partition field");
List<Partition> partitions = hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME); List<Partition> partitions = hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME);
assertEquals(1, partitions.size(), assertEquals(1, partitions.size(),
"Table partitions should match the number of partitions we wrote"); "Table partitions should match the number of partitions we wrote");
assertEquals(instantTime, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(), assertEquals(instantTime, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(),
@@ -841,7 +853,7 @@ public class TestHiveSyncTool {
reinitHiveSyncClient(); reinitHiveSyncClient();
reSyncHiveTable(); reSyncHiveTable();
List<Partition> hivePartitions = hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME); List<Partition> hivePartitions = hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME);
assertEquals(0, hivePartitions.size(), assertEquals(0, hivePartitions.size(),
"Table should have 0 partition because of the drop the only one partition"); "Table should have 0 partition because of the drop the only one partition");
} }
@@ -849,12 +861,12 @@ public class TestHiveSyncTool {
@ParameterizedTest @ParameterizedTest
@MethodSource("syncMode") @MethodSource("syncMode")
public void testNonPartitionedSync(String syncMode) throws Exception { public void testNonPartitionedSync(String syncMode) throws Exception {
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
String instantTime = "100"; String instantTime = "100";
HiveTestUtil.createCOWTable(instantTime, 5, true); HiveTestUtil.createCOWTable(instantTime, 5, true);
// Set partition value extractor to NonPartitionedExtractor // Set partition value extractor to NonPartitionedExtractor
hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), NonPartitionedExtractor.class.getCanonicalName()); hiveSyncProps.setProperty(META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), NonPartitionedExtractor.class.getCanonicalName());
hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "year, month, day"); hiveSyncProps.setProperty(META_SYNC_PARTITION_FIELDS.key(), "");
HiveTestUtil.getCreatedTablesSet().add(HiveTestUtil.DB_NAME + "." + HiveTestUtil.TABLE_NAME); HiveTestUtil.getCreatedTablesSet().add(HiveTestUtil.DB_NAME + "." + HiveTestUtil.TABLE_NAME);
@@ -865,17 +877,17 @@ public class TestHiveSyncTool {
reSyncHiveTable(); reSyncHiveTable();
assertTrue(hiveClient.tableExists(HiveTestUtil.TABLE_NAME), assertTrue(hiveClient.tableExists(HiveTestUtil.TABLE_NAME),
"Table " + HiveTestUtil.TABLE_NAME + " should exist after sync completes"); "Table " + HiveTestUtil.TABLE_NAME + " should exist after sync completes");
assertEquals(hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).size(), assertEquals(hiveClient.getMetastoreSchema(HiveTestUtil.TABLE_NAME).size(),
hiveClient.getDataSchema().getColumns().size(), hiveClient.getStorageSchema().getColumns().size(),
"Hive Schema should match the table schemaignoring the partition fields"); "Hive Schema should match the table schemaignoring the partition fields");
assertEquals(0, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(), assertEquals(0, hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME).size(),
"Table should not have partitions because of the NonPartitionedExtractor"); "Table should not have partitions because of the NonPartitionedExtractor");
} }
@ParameterizedTest @ParameterizedTest
@MethodSource("syncMode") @MethodSource("syncMode")
public void testReadSchemaForMOR(String syncMode) throws Exception { public void testReadSchemaForMOR(String syncMode) throws Exception {
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
String commitTime = "100"; String commitTime = "100";
String snapshotTableName = HiveTestUtil.TABLE_NAME + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE; String snapshotTableName = HiveTestUtil.TABLE_NAME + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE;
HiveTestUtil.createMORTable(commitTime, "", 5, false, true); HiveTestUtil.createMORTable(commitTime, "", 5, false, true);
@@ -891,11 +903,11 @@ public class TestHiveSyncTool {
+ " should exist after sync completes"); + " should exist after sync completes");
// Schema being read from compacted base files // Schema being read from compacted base files
assertEquals(hiveClient.getTableSchema(snapshotTableName).size(), assertEquals(hiveClient.getMetastoreSchema(snapshotTableName).size(),
SchemaTestUtil.getSimpleSchema().getFields().size() + getPartitionFieldSize() SchemaTestUtil.getSimpleSchema().getFields().size() + getPartitionFieldSize()
+ HoodieRecord.HOODIE_META_COLUMNS.size(), + HoodieRecord.HOODIE_META_COLUMNS.size(),
"Hive Schema should match the table schema + partition field"); "Hive Schema should match the table schema + partition field");
assertEquals(5, hiveClient.scanTablePartitions(snapshotTableName).size(), "Table partitions should match the number of partitions we wrote"); assertEquals(5, hiveClient.getAllPartitions(snapshotTableName).size(), "Table partitions should match the number of partitions we wrote");
// Now lets create more partitions and these are the only ones which need to be synced // Now lets create more partitions and these are the only ones which need to be synced
ZonedDateTime dateTime = ZonedDateTime.now().plusDays(6); ZonedDateTime dateTime = ZonedDateTime.now().plusDays(6);
@@ -908,28 +920,28 @@ public class TestHiveSyncTool {
reSyncHiveTable(); reSyncHiveTable();
// Schema being read from the log files // Schema being read from the log files
assertEquals(hiveClient.getTableSchema(snapshotTableName).size(), assertEquals(hiveClient.getMetastoreSchema(snapshotTableName).size(),
SchemaTestUtil.getEvolvedSchema().getFields().size() + getPartitionFieldSize() SchemaTestUtil.getEvolvedSchema().getFields().size() + getPartitionFieldSize()
+ HoodieRecord.HOODIE_META_COLUMNS.size(), + HoodieRecord.HOODIE_META_COLUMNS.size(),
"Hive Schema should match the evolved table schema + partition field"); "Hive Schema should match the evolved table schema + partition field");
// Sync should add the one partition // Sync should add the one partition
assertEquals(6, hiveClient.scanTablePartitions(snapshotTableName).size(), "The 1 partition we wrote should be added to hive"); assertEquals(6, hiveClient.getAllPartitions(snapshotTableName).size(), "The 1 partition we wrote should be added to hive");
assertEquals(deltaCommitTime2, hiveClient.getLastCommitTimeSynced(snapshotTableName).get(), assertEquals(deltaCommitTime2, hiveClient.getLastCommitTimeSynced(snapshotTableName).get(),
"The last commit that was synced should be 103"); "The last commit that was synced should be 103");
} }
@Test @Test
public void testConnectExceptionIgnoreConfigSet() throws IOException, URISyntaxException, HiveException, MetaException { public void testConnectExceptionIgnoreConfigSet() throws IOException, URISyntaxException {
String instantTime = "100"; String instantTime = "100";
HiveTestUtil.createCOWTable(instantTime, 5, false); HiveTestUtil.createCOWTable(instantTime, 5, false);
reinitHiveSyncClient(); reinitHiveSyncClient();
HoodieHiveClient prevHiveClient = hiveClient; HoodieHiveSyncClient prevHiveClient = hiveClient;
assertFalse(hiveClient.tableExists(HiveTestUtil.TABLE_NAME), assertFalse(hiveClient.tableExists(HiveTestUtil.TABLE_NAME),
"Table " + HiveTestUtil.TABLE_NAME + " should not exist initially"); "Table " + HiveTestUtil.TABLE_NAME + " should not exist initially");
// Lets do the sync // Lets do the sync
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_IGNORE_EXCEPTIONS.key(), "true"); hiveSyncProps.setProperty(HIVE_IGNORE_EXCEPTIONS.key(), "true");
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_URL.key(), hiveSyncProps.getString(HiveSyncConfig.HIVE_URL.key()) hiveSyncProps.setProperty(HIVE_URL.key(), hiveSyncProps.getString(HIVE_URL.key())
.replace(String.valueOf(HiveTestUtil.hiveTestService.getHiveServerPort()), String.valueOf(NetworkTestUtils.nextFreePort()))); .replace(String.valueOf(HiveTestUtil.hiveTestService.getHiveServerPort()), String.valueOf(NetworkTestUtils.nextFreePort())));
reinitHiveSyncClient(); reinitHiveSyncClient();
reSyncHiveTable(); reSyncHiveTable();
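A hedged condensation of the ignore-exceptions path: point the JDBC URL at a port nothing listens on, and sync degrades to a no-op instead of throwing:

hiveSyncProps.setProperty(HIVE_IGNORE_EXCEPTIONS.key(), "true");
hiveSyncProps.setProperty(HIVE_URL.key(), "jdbc:hive2://127.0.0.1:" + NetworkTestUtils.nextFreePort()); // unreachable on purpose
reinitHiveSyncClient();
reSyncHiveTable(); // returns normally; the table simply is not synced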
@@ -939,12 +951,12 @@ public class TestHiveSyncTool {
"Table " + HiveTestUtil.TABLE_NAME + " should not exist initially"); "Table " + HiveTestUtil.TABLE_NAME + " should not exist initially");
} }
private void verifyOldParquetFileTest(HoodieHiveClient hiveClient, String emptyCommitTime) throws Exception { private void verifyOldParquetFileTest(HoodieHiveSyncClient hiveClient, String emptyCommitTime) throws Exception {
assertTrue(hiveClient.tableExists(HiveTestUtil.TABLE_NAME), "Table " + HiveTestUtil.TABLE_NAME + " should exist after sync completes"); assertTrue(hiveClient.tableExists(HiveTestUtil.TABLE_NAME), "Table " + HiveTestUtil.TABLE_NAME + " should exist after sync completes");
assertEquals(hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).size(), assertEquals(hiveClient.getMetastoreSchema(HiveTestUtil.TABLE_NAME).size(),
hiveClient.getDataSchema().getColumns().size() + 1, hiveClient.getStorageSchema().getColumns().size() + 1,
"Hive Schema should match the table schema + partition field"); "Hive Schema should match the table schema + partition field");
assertEquals(1, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(), "Table partitions should match the number of partitions we wrote"); assertEquals(1, hiveClient.getAllPartitions(HiveTestUtil.TABLE_NAME).size(), "Table partitions should match the number of partitions we wrote");
assertEquals(emptyCommitTime, assertEquals(emptyCommitTime,
hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(), "The last commit that was synced should be updated in the TBLPROPERTIES"); hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(), "The last commit that was synced should be updated in the TBLPROPERTIES");
@@ -952,19 +964,19 @@ public class TestHiveSyncTool {
Schema schema = SchemaTestUtil.getSimpleSchema(); Schema schema = SchemaTestUtil.getSimpleSchema();
for (Field field : schema.getFields()) { for (Field field : schema.getFields()) {
assertEquals(field.schema().getType().getName(), assertEquals(field.schema().getType().getName(),
hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).get(field.name()).toLowerCase(), hiveClient.getMetastoreSchema(HiveTestUtil.TABLE_NAME).get(field.name()).toLowerCase(),
String.format("Hive Schema Field %s was added", field)); String.format("Hive Schema Field %s was added", field));
} }
assertEquals("string", assertEquals("string",
hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).get("datestr").toLowerCase(), "Hive Schema Field datestr was added"); hiveClient.getMetastoreSchema(HiveTestUtil.TABLE_NAME).get("datestr").toLowerCase(), "Hive Schema Field datestr was added");
assertEquals(schema.getFields().size() + 1 + HoodieRecord.HOODIE_META_COLUMNS.size(), assertEquals(schema.getFields().size() + 1 + HoodieRecord.HOODIE_META_COLUMNS.size(),
hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).size(), "Hive Schema fields size"); hiveClient.getMetastoreSchema(HiveTestUtil.TABLE_NAME).size(), "Hive Schema fields size");
} }
@ParameterizedTest @ParameterizedTest
@MethodSource("syncMode") @MethodSource("syncMode")
public void testPickingOlderParquetFileIfLatestIsEmptyCommit(String syncMode) throws Exception { public void testPickingOlderParquetFileIfLatestIsEmptyCommit(String syncMode) throws Exception {
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
final String commitTime = "100"; final String commitTime = "100";
HiveTestUtil.createCOWTable(commitTime, 1, true); HiveTestUtil.createCOWTable(commitTime, 1, true);
HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata(); HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
@@ -983,7 +995,7 @@ public class TestHiveSyncTool {
@ParameterizedTest @ParameterizedTest
@MethodSource("syncMode") @MethodSource("syncMode")
public void testNotPickingOlderParquetFileWhenLatestCommitReadFails(String syncMode) throws Exception { public void testNotPickingOlderParquetFileWhenLatestCommitReadFails(String syncMode) throws Exception {
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
final String commitTime = "100"; final String commitTime = "100";
HiveTestUtil.createCOWTable(commitTime, 1, true); HiveTestUtil.createCOWTable(commitTime, 1, true);
HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata(); HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
@@ -1001,7 +1013,7 @@ public class TestHiveSyncTool {
assertFalse( assertFalse(
hiveClient.tableExists(HiveTestUtil.TABLE_NAME), "Table " + HiveTestUtil.TABLE_NAME + " should not exist initially"); hiveClient.tableExists(HiveTestUtil.TABLE_NAME), "Table " + HiveTestUtil.TABLE_NAME + " should not exist initially");
HiveSyncTool tool = new HiveSyncTool(hiveSyncProps, getHiveConf(), fileSystem); HiveSyncTool tool = new HiveSyncTool(hiveSyncProps, getHiveConf());
// now delete the evolved commit instant // now delete the evolved commit instant
Path fullPath = new Path(HiveTestUtil.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" Path fullPath = new Path(HiveTestUtil.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/"
+ hiveClient.getActiveTimeline().getInstants() + hiveClient.getActiveTimeline().getInstants()
@@ -1022,7 +1034,7 @@ public class TestHiveSyncTool {
@ParameterizedTest @ParameterizedTest
@MethodSource("syncMode") @MethodSource("syncMode")
public void testNotPickingOlderParquetFileWhenLatestCommitReadFailsForExistingTable(String syncMode) throws Exception { public void testNotPickingOlderParquetFileWhenLatestCommitReadFailsForExistingTable(String syncMode) throws Exception {
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
final String commitTime = "100"; final String commitTime = "100";
HiveTestUtil.createCOWTable(commitTime, 1, true); HiveTestUtil.createCOWTable(commitTime, 1, true);
HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata(); HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
@@ -1067,7 +1079,7 @@ public class TestHiveSyncTool {
@ParameterizedTest @ParameterizedTest
@MethodSource("syncMode") @MethodSource("syncMode")
public void testTypeConverter(String syncMode) throws Exception { public void testTypeConverter(String syncMode) throws Exception {
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
HiveTestUtil.createCOWTable("100", 5, true); HiveTestUtil.createCOWTable("100", 5, true);
// create database. // create database.
ddlExecutor.runSQL("create database " + HiveTestUtil.DB_NAME); ddlExecutor.runSQL("create database " + HiveTestUtil.DB_NAME);
@@ -1082,24 +1094,24 @@ public class TestHiveSyncTool {
// test one column in DECIMAL // test one column in DECIMAL
String oneTargetColumnSql = createTableSqlPrefix + "(`decimal_col` DECIMAL(9,8), `bigint_col` BIGINT)"; String oneTargetColumnSql = createTableSqlPrefix + "(`decimal_col` DECIMAL(9,8), `bigint_col` BIGINT)";
ddlExecutor.runSQL(oneTargetColumnSql); ddlExecutor.runSQL(oneTargetColumnSql);
System.out.println(hiveClient.getTableSchema(tableName)); System.out.println(hiveClient.getMetastoreSchema(tableName));
assertTrue(hiveClient.getTableSchema(tableName).containsValue("DECIMAL(9,8)"), errorMsg); assertTrue(hiveClient.getMetastoreSchema(tableName).containsValue("DECIMAL(9,8)"), errorMsg);
ddlExecutor.runSQL(dropTableSql); ddlExecutor.runSQL(dropTableSql);
// test multiple columns in DECIMAL // test multiple columns in DECIMAL
String multipleTargetColumnSql = String multipleTargetColumnSql =
createTableSqlPrefix + "(`decimal_col1` DECIMAL(9,8), `bigint_col` BIGINT, `decimal_col2` DECIMAL(7,4))"; createTableSqlPrefix + "(`decimal_col1` DECIMAL(9,8), `bigint_col` BIGINT, `decimal_col2` DECIMAL(7,4))";
ddlExecutor.runSQL(multipleTargetColumnSql); ddlExecutor.runSQL(multipleTargetColumnSql);
System.out.println(hiveClient.getTableSchema(tableName)); System.out.println(hiveClient.getMetastoreSchema(tableName));
assertTrue(hiveClient.getTableSchema(tableName).containsValue("DECIMAL(9,8)") assertTrue(hiveClient.getMetastoreSchema(tableName).containsValue("DECIMAL(9,8)")
&& hiveClient.getTableSchema(tableName).containsValue("DECIMAL(7,4)"), errorMsg); && hiveClient.getMetastoreSchema(tableName).containsValue("DECIMAL(7,4)"), errorMsg);
ddlExecutor.runSQL(dropTableSql); ddlExecutor.runSQL(dropTableSql);
// test no columns in DECIMAL // test no columns in DECIMAL
String noTargetColumnsSql = createTableSqlPrefix + "(`bigint_col` BIGINT)"; String noTargetColumnsSql = createTableSqlPrefix + "(`bigint_col` BIGINT)";
ddlExecutor.runSQL(noTargetColumnsSql); ddlExecutor.runSQL(noTargetColumnsSql);
System.out.println(hiveClient.getTableSchema(tableName)); System.out.println(hiveClient.getMetastoreSchema(tableName));
assertTrue(hiveClient.getTableSchema(tableName).size() == 1 && hiveClient.getTableSchema(tableName) assertTrue(hiveClient.getMetastoreSchema(tableName).size() == 1 && hiveClient.getMetastoreSchema(tableName)
.containsValue("BIGINT"), errorMsg); .containsValue("BIGINT"), errorMsg);
ddlExecutor.runSQL(dropTableSql); ddlExecutor.runSQL(dropTableSql);
} }
@@ -1108,8 +1120,8 @@ public class TestHiveSyncTool {
@MethodSource("syncMode") @MethodSource("syncMode")
public void testSyncWithoutDiffs(String syncMode) throws Exception { public void testSyncWithoutDiffs(String syncMode) throws Exception {
String tableName = HiveTestUtil.TABLE_NAME + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE; String tableName = HiveTestUtil.TABLE_NAME + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE;
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), syncMode);
hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_CONDITIONAL_SYNC.key(), "true"); hiveSyncProps.setProperty(META_SYNC_CONDITIONAL_SYNC.key(), "true");
String commitTime0 = "100"; String commitTime0 = "100";
String commitTime1 = "101"; String commitTime1 = "101";
@@ -1136,11 +1148,11 @@ public class TestHiveSyncTool {
} }
private void reinitHiveSyncClient() { private void reinitHiveSyncClient() {
hiveSyncTool = new HiveSyncTool(hiveSyncProps, HiveTestUtil.getHiveConf(), fileSystem); hiveSyncTool = new HiveSyncTool(hiveSyncProps, HiveTestUtil.getHiveConf());
hiveClient = (HoodieHiveClient) hiveSyncTool.hoodieHiveClient; hiveClient = (HoodieHiveSyncClient) hiveSyncTool.syncClient;
} }
private int getPartitionFieldSize() { private int getPartitionFieldSize() {
return hiveSyncProps.getString(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key()).split(",").length; return hiveSyncProps.getString(META_SYNC_PARTITION_FIELDS.key()).split(",").length;
} }
} }
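The test changes above settle into a single calling convention: config keys come from static imports on HiveSyncConfigHolder and HoodieSyncConfig, and HiveSyncTool is built from properties plus a Hadoop Configuration, with no FileSystem argument. A minimal sketch of the migrated usage, assuming a hypothetical harness class; the base path, names, and sync mode values here are placeholders, not taken from this commit:

import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.hive.HiveSyncTool;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_MODE;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME;

public class HiveSyncUsageSketch { // hypothetical harness, for illustration only
  public static void main(String[] args) {
    TypedProperties props = new TypedProperties();
    props.setProperty(META_SYNC_BASE_PATH.key(), "/tmp/hudi/stock_ticks"); // placeholder path
    props.setProperty(META_SYNC_DATABASE_NAME.key(), "default");
    props.setProperty(META_SYNC_TABLE_NAME.key(), "stock_ticks");
    props.setProperty(HIVE_SYNC_MODE.key(), "hms"); // hms, jdbc, or hiveql
    // The FileSystem argument is gone; the tool derives it from the Hadoop conf.
    HiveSyncTool tool = new HiveSyncTool(props, new Configuration());
    tool.syncHoodieTable();
  }
}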

View File

@@ -21,7 +21,7 @@ package org.apache.hudi.hive.functional;
import org.apache.hudi.common.config.LockConfiguration; import org.apache.hudi.common.config.LockConfiguration;
import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.hive.HiveMetastoreBasedLockProvider; import org.apache.hudi.hive.transaction.lock.HiveMetastoreBasedLockProvider;
import org.apache.hudi.hive.testutils.HiveSyncFunctionalTestHarness; import org.apache.hudi.hive.testutils.HiveSyncFunctionalTestHarness;
import org.apache.hadoop.hive.metastore.api.DataOperationType; import org.apache.hadoop.hive.metastore.api.DataOperationType;

View File

@@ -0,0 +1,154 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hudi.hive.replication;
import org.apache.hudi.hive.testutils.TestCluster;
import org.apache.hadoop.fs.Path;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.RegisterExtension;
import static org.apache.hudi.hadoop.utils.HoodieHiveUtils.GLOBALLY_CONSISTENT_READ_TIMESTAMP;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_PASS;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USER;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USE_PRE_APACHE_INPUT_FORMAT;
import static org.apache.hudi.hive.replication.GlobalHiveSyncConfig.META_SYNC_GLOBAL_REPLICATE_TIMESTAMP;
import static org.apache.hudi.hive.replication.HiveSyncGlobalCommitParams.LOCAL_BASE_PATH;
import static org.apache.hudi.hive.replication.HiveSyncGlobalCommitParams.LOCAL_HIVE_SERVER_JDBC_URLS;
import static org.apache.hudi.hive.replication.HiveSyncGlobalCommitParams.LOCAL_HIVE_SITE_URI;
import static org.apache.hudi.hive.replication.HiveSyncGlobalCommitParams.REMOTE_BASE_PATH;
import static org.apache.hudi.hive.replication.HiveSyncGlobalCommitParams.REMOTE_HIVE_SERVER_JDBC_URLS;
import static org.apache.hudi.hive.replication.HiveSyncGlobalCommitParams.REMOTE_HIVE_SITE_URI;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_ASSUME_DATE_PARTITION;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
public class TestHiveSyncGlobalCommitTool {
@RegisterExtension
public static TestCluster localCluster = new TestCluster();
@RegisterExtension
public static TestCluster remoteCluster = new TestCluster();
private static final String DB_NAME = "foo";
private static final String TBL_NAME = "bar";
private HiveSyncGlobalCommitParams getGlobalCommitConfig(String commitTime) throws Exception {
HiveSyncGlobalCommitParams params = new HiveSyncGlobalCommitParams();
params.loadedProps.setProperty(LOCAL_HIVE_SITE_URI, localCluster.getHiveSiteXmlLocation());
params.loadedProps.setProperty(REMOTE_HIVE_SITE_URI, remoteCluster.getHiveSiteXmlLocation());
params.loadedProps.setProperty(LOCAL_HIVE_SERVER_JDBC_URLS, localCluster.getHiveJdBcUrl());
params.loadedProps.setProperty(REMOTE_HIVE_SERVER_JDBC_URLS, remoteCluster.getHiveJdBcUrl());
params.loadedProps.setProperty(LOCAL_BASE_PATH, localCluster.tablePath(DB_NAME, TBL_NAME));
params.loadedProps.setProperty(REMOTE_BASE_PATH, remoteCluster.tablePath(DB_NAME, TBL_NAME));
params.loadedProps.setProperty(META_SYNC_GLOBAL_REPLICATE_TIMESTAMP.key(), commitTime);
params.loadedProps.setProperty(HIVE_USER.key(), System.getProperty("user.name"));
params.loadedProps.setProperty(HIVE_PASS.key(), "");
params.loadedProps.setProperty(META_SYNC_DATABASE_NAME.key(), DB_NAME);
params.loadedProps.setProperty(META_SYNC_TABLE_NAME.key(), TBL_NAME);
params.loadedProps.setProperty(META_SYNC_BASE_PATH.key(), localCluster.tablePath(DB_NAME, TBL_NAME));
params.loadedProps.setProperty(META_SYNC_ASSUME_DATE_PARTITION.key(), "true");
params.loadedProps.setProperty(HIVE_USE_PRE_APACHE_INPUT_FORMAT.key(), "false");
params.loadedProps.setProperty(META_SYNC_PARTITION_FIELDS.key(), "datestr");
return params;
}
private void compareEqualLastReplicatedTimeStamp(HiveSyncGlobalCommitParams config) throws Exception {
assertEquals(localCluster.getHMSClient()
.getTable(DB_NAME, TBL_NAME).getParameters()
.get(GLOBALLY_CONSISTENT_READ_TIMESTAMP), remoteCluster.getHMSClient()
.getTable(DB_NAME, TBL_NAME).getParameters()
.get(GLOBALLY_CONSISTENT_READ_TIMESTAMP), "compare replicated timestamps");
}
@BeforeEach
public void setUp() throws Exception {
localCluster.forceCreateDb(DB_NAME);
remoteCluster.forceCreateDb(DB_NAME);
localCluster.dfsCluster.getFileSystem().delete(new Path(localCluster.tablePath(DB_NAME, TBL_NAME)), true);
remoteCluster.dfsCluster.getFileSystem().delete(new Path(remoteCluster.tablePath(DB_NAME, TBL_NAME)), true);
}
@AfterEach
public void clear() throws Exception {
localCluster.getHMSClient().dropTable(DB_NAME, TBL_NAME);
remoteCluster.getHMSClient().dropTable(DB_NAME, TBL_NAME);
}
@Test
public void testHiveConfigShouldMatchClusterConf() throws Exception {
String commitTime = "100";
localCluster.createCOWTable(commitTime, 5, DB_NAME, TBL_NAME);
// simulate DR (disaster recovery): create the same table on the remote cluster
remoteCluster.createCOWTable(commitTime, 5, DB_NAME, TBL_NAME);
HiveSyncGlobalCommitParams params = getGlobalCommitConfig(commitTime);
HiveSyncGlobalCommitTool tool = new HiveSyncGlobalCommitTool(params);
ReplicationStateSync localReplicationStateSync = tool.getReplicatedState(false);
ReplicationStateSync remoteReplicationStateSync = tool.getReplicatedState(true);
assertEquals(localReplicationStateSync.globalHiveSyncTool.config.getHiveConf().get("hive.metastore.uris"),
localCluster.getHiveConf().get("hive.metastore.uris"));
assertEquals(remoteReplicationStateSync.globalHiveSyncTool.config.getHiveConf().get("hive.metastore.uris"),
remoteCluster.getHiveConf().get("hive.metastore.uris"));
}
@Test
public void testBasicGlobalCommit() throws Exception {
String commitTime = "100";
localCluster.createCOWTable(commitTime, 5, DB_NAME, TBL_NAME);
// simulate DR (disaster recovery): create the same table on the remote cluster
remoteCluster.createCOWTable(commitTime, 5, DB_NAME, TBL_NAME);
HiveSyncGlobalCommitParams params = getGlobalCommitConfig(commitTime);
HiveSyncGlobalCommitTool tool = new HiveSyncGlobalCommitTool(params);
assertTrue(tool.commit());
compareEqualLastReplicatedTimeStamp(params);
}
@Test
public void testBasicRollback() throws Exception {
String commitTime = "100";
localCluster.createCOWTable(commitTime, 5, DB_NAME, TBL_NAME);
// simulate DR (disaster recovery): create the same table on the remote cluster
remoteCluster.createCOWTable(commitTime, 5, DB_NAME, TBL_NAME);
HiveSyncGlobalCommitParams params = getGlobalCommitConfig(commitTime);
HiveSyncGlobalCommitTool tool = new HiveSyncGlobalCommitTool(params);
assertFalse(localCluster.getHMSClient().tableExists(DB_NAME, TBL_NAME));
assertFalse(remoteCluster.getHMSClient().tableExists(DB_NAME, TBL_NAME));
// stop the remote cluster hive server to simulate cluster going down
remoteCluster.stopHiveServer2();
assertFalse(tool.commit());
assertEquals(commitTime, localCluster.getHMSClient()
.getTable(DB_NAME, TBL_NAME).getParameters()
.get(GLOBALLY_CONSISTENT_READ_TIMESTAMP));
assertTrue(tool.rollback()); // do a rollback
assertNotEquals(commitTime, localCluster.getHMSClient()
.getTable(DB_NAME, TBL_NAME).getParameters()
.get(GLOBALLY_CONSISTENT_READ_TIMESTAMP));
assertFalse(remoteCluster.getHMSClient().tableExists(DB_NAME, TBL_NAME));
remoteCluster.startHiveServer2();
}
}

View File

@@ -24,7 +24,7 @@ import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.testutils.minicluster.ZookeeperTestService; import org.apache.hudi.common.testutils.minicluster.ZookeeperTestService;
import org.apache.hudi.hive.HiveSyncConfig; import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.hudi.hive.HoodieHiveClient; import org.apache.hudi.hive.HoodieHiveSyncClient;
import org.apache.hudi.hive.ddl.HiveQueryDDLExecutor; import org.apache.hudi.hive.ddl.HiveQueryDDLExecutor;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
@@ -39,7 +39,17 @@ import org.junit.jupiter.api.io.TempDir;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Files; import java.nio.file.Files;
import java.time.Instant; import java.time.Instant;
import java.util.Collections; import java.util.Properties;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_PASS;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_URL;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USER;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USE_PRE_APACHE_INPUT_FORMAT;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_ASSUME_DATE_PARTITION;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME;
public class HiveSyncFunctionalTestHarness { public class HiveSyncFunctionalTestHarness {
@@ -79,42 +89,42 @@ public class HiveSyncFunctionalTestHarness {
} }
public HiveSyncConfig hiveSyncConf() throws IOException { public HiveSyncConfig hiveSyncConf() throws IOException {
HiveSyncConfig conf = new HiveSyncConfig(); Properties props = new Properties();
conf.jdbcUrl = hiveTestService.getJdbcHive2Url(); props.setProperty(HIVE_URL.key(), hiveTestService.getJdbcHive2Url());
conf.hiveUser = ""; props.setProperty(HIVE_USER.key(), "");
conf.hivePass = ""; props.setProperty(HIVE_PASS.key(), "");
conf.databaseName = "hivesynctestdb"; props.setProperty(META_SYNC_DATABASE_NAME.key(), "hivesynctestdb");
conf.tableName = "hivesynctesttable"; props.setProperty(META_SYNC_TABLE_NAME.key(), "hivesynctesttable");
conf.basePath = Files.createDirectories(tempDir.resolve("hivesynctestcase-" + Instant.now().toEpochMilli())).toUri().toString(); props.setProperty(META_SYNC_BASE_PATH.key(), Files.createDirectories(tempDir.resolve("hivesynctestcase-" + Instant.now().toEpochMilli())).toUri().toString());
conf.assumeDatePartitioning = true; props.setProperty(META_SYNC_ASSUME_DATE_PARTITION.key(), "true");
conf.usePreApacheInputFormat = false; props.setProperty(HIVE_USE_PRE_APACHE_INPUT_FORMAT.key(), "false");
conf.partitionFields = Collections.singletonList("datestr"); props.setProperty(META_SYNC_PARTITION_FIELDS.key(), "datestr");
return conf; return new HiveSyncConfig(props, hiveConf());
} }
public HoodieHiveClient hiveClient(HiveSyncConfig hiveSyncConfig) throws IOException { public HoodieHiveSyncClient hiveClient(HiveSyncConfig hiveSyncConfig) throws IOException {
HoodieTableMetaClient.withPropertyBuilder() HoodieTableMetaClient.withPropertyBuilder()
.setTableType(HoodieTableType.COPY_ON_WRITE) .setTableType(HoodieTableType.COPY_ON_WRITE)
.setTableName(hiveSyncConfig.tableName) .setTableName(hiveSyncConfig.getString(META_SYNC_TABLE_NAME))
.setPayloadClass(HoodieAvroPayload.class) .setPayloadClass(HoodieAvroPayload.class)
.initTable(hadoopConf, hiveSyncConfig.basePath); .initTable(hadoopConf, hiveSyncConfig.getString(META_SYNC_BASE_PATH));
return new HoodieHiveClient(hiveSyncConfig, hiveConf(), fs()); return new HoodieHiveSyncClient(hiveSyncConfig);
} }
public void dropTables(String database, String... tables) throws IOException, HiveException, MetaException { public void dropTables(String database, String... tables) throws IOException, HiveException, MetaException {
HiveSyncConfig hiveSyncConfig = hiveSyncConf(); HiveSyncConfig hiveSyncConfig = hiveSyncConf();
hiveSyncConfig.databaseName = database; hiveSyncConfig.setValue(META_SYNC_DATABASE_NAME, database);
for (String table : tables) { for (String table : tables) {
hiveSyncConfig.tableName = table; hiveSyncConfig.setValue(META_SYNC_TABLE_NAME, table);
new HiveQueryDDLExecutor(hiveSyncConfig, fs(), hiveConf()).runSQL("drop table if exists " + table); new HiveQueryDDLExecutor(hiveSyncConfig).runSQL("drop table if exists " + table);
} }
} }
public void dropDatabases(String... databases) throws IOException, HiveException, MetaException { public void dropDatabases(String... databases) throws IOException, HiveException, MetaException {
HiveSyncConfig hiveSyncConfig = hiveSyncConf(); HiveSyncConfig hiveSyncConfig = hiveSyncConf();
for (String database : databases) { for (String database : databases) {
hiveSyncConfig.databaseName = database; hiveSyncConfig.setValue(META_SYNC_DATABASE_NAME, database);
new HiveQueryDDLExecutor(hiveSyncConfig, fs(), hiveConf()).runSQL("drop database if exists " + database); new HiveQueryDDLExecutor(hiveSyncConfig).runSQL("drop database if exists " + database);
} }
} }

View File

@@ -84,6 +84,16 @@ import java.util.Map.Entry;
import java.util.Set; import java.util.Set;
import java.util.UUID; import java.util.UUID;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_BATCH_SYNC_PARTITION_NUM;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_PASS;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_URL;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USER;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USE_PRE_APACHE_INPUT_FORMAT;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_ASSUME_DATE_PARTITION;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME;
import static org.junit.jupiter.api.Assertions.fail; import static org.junit.jupiter.api.Assertions.fail;
@SuppressWarnings("SameParameterValue") @SuppressWarnings("SameParameterValue")
@@ -120,21 +130,21 @@ public class HiveTestUtil {
basePath = Files.createTempDirectory("hivesynctest" + Instant.now().toEpochMilli()).toUri().toString(); basePath = Files.createTempDirectory("hivesynctest" + Instant.now().toEpochMilli()).toUri().toString();
hiveSyncProps = new TypedProperties(); hiveSyncProps = new TypedProperties();
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_URL.key(), hiveTestService.getJdbcHive2Url()); hiveSyncProps.setProperty(HIVE_URL.key(), hiveTestService.getJdbcHive2Url());
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_USER.key(), ""); hiveSyncProps.setProperty(HIVE_USER.key(), "");
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_PASS.key(), ""); hiveSyncProps.setProperty(HIVE_PASS.key(), "");
hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_DATABASE_NAME.key(), DB_NAME); hiveSyncProps.setProperty(META_SYNC_DATABASE_NAME.key(), DB_NAME);
hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_TABLE_NAME.key(), TABLE_NAME); hiveSyncProps.setProperty(META_SYNC_TABLE_NAME.key(), TABLE_NAME);
hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_BASE_PATH.key(), basePath); hiveSyncProps.setProperty(META_SYNC_BASE_PATH.key(), basePath);
hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_ASSUME_DATE_PARTITION.key(), "true"); hiveSyncProps.setProperty(META_SYNC_ASSUME_DATE_PARTITION.key(), "true");
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_USE_PRE_APACHE_INPUT_FORMAT.key(), "false"); hiveSyncProps.setProperty(HIVE_USE_PRE_APACHE_INPUT_FORMAT.key(), "false");
hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "datestr"); hiveSyncProps.setProperty(META_SYNC_PARTITION_FIELDS.key(), "datestr");
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_BATCH_SYNC_PARTITION_NUM.key(), "3"); hiveSyncProps.setProperty(HIVE_BATCH_SYNC_PARTITION_NUM.key(), "3");
hiveSyncConfig = new HiveSyncConfig(hiveSyncProps); hiveSyncConfig = new HiveSyncConfig(hiveSyncProps, configuration);
dtfOut = DateTimeFormatter.ofPattern("yyyy/MM/dd"); dtfOut = DateTimeFormatter.ofPattern("yyyy/MM/dd");
ddlExecutor = new HiveQueryDDLExecutor(hiveSyncConfig, fileSystem, getHiveConf()); ddlExecutor = new HiveQueryDDLExecutor(hiveSyncConfig);
clear(); clear();
} }

View File

@@ -1,276 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.sync.common;
import java.io.Serializable;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.engine.HoodieLocalEngineContext;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.model.WriteOperationType;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.TableSchemaResolver;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.TimelineUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.metadata.HoodieTableMetadataUtil;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.parquet.schema.MessageType;
public abstract class AbstractSyncHoodieClient implements AutoCloseable {
private static final Logger LOG = LogManager.getLogger(AbstractSyncHoodieClient.class);
public static final String HOODIE_LAST_COMMIT_TIME_SYNC = "last_commit_time_sync";
public static final TypeConverter TYPE_CONVERTOR = new TypeConverter() {};
protected final HoodieTableMetaClient metaClient;
protected final HoodieTableType tableType;
protected final FileSystem fs;
private final String basePath;
private final boolean assumeDatePartitioning;
private final boolean useFileListingFromMetadata;
private final boolean withOperationField;
@Deprecated
public AbstractSyncHoodieClient(String basePath, boolean assumeDatePartitioning, boolean useFileListingFromMetadata,
boolean verifyMetadataFileListing, boolean withOperationField, FileSystem fs) {
this(basePath, assumeDatePartitioning, useFileListingFromMetadata, withOperationField, fs);
}
public AbstractSyncHoodieClient(String basePath, boolean assumeDatePartitioning, boolean useFileListingFromMetadata,
boolean withOperationField, FileSystem fs) {
this.metaClient = HoodieTableMetaClient.builder().setConf(fs.getConf()).setBasePath(basePath).setLoadActiveTimelineOnLoad(true).build();
this.tableType = metaClient.getTableType();
this.basePath = basePath;
this.assumeDatePartitioning = assumeDatePartitioning;
this.useFileListingFromMetadata = useFileListingFromMetadata;
this.withOperationField = withOperationField;
this.fs = fs;
}
/**
* Create the table.
* @param tableName The table name.
* @param storageSchema The table schema.
* @param inputFormatClass The input format class of this table.
* @param outputFormatClass The output format class of this table.
* @param serdeClass The serde class of this table.
* @param serdeProperties The serde properties of this table.
* @param tableProperties The table properties for this table.
*/
public abstract void createTable(String tableName, MessageType storageSchema,
String inputFormatClass, String outputFormatClass,
String serdeClass, Map<String, String> serdeProperties,
Map<String, String> tableProperties);
/**
* @deprecated Use {@link #tableExists} instead.
*/
@Deprecated
public abstract boolean doesTableExist(String tableName);
public abstract boolean tableExists(String tableName);
public abstract Option<String> getLastCommitTimeSynced(String tableName);
public abstract void updateLastCommitTimeSynced(String tableName);
public abstract Option<String> getLastReplicatedTime(String tableName);
public abstract void updateLastReplicatedTimeStamp(String tableName, String timeStamp);
public abstract void deleteLastReplicatedTimeStamp(String tableName);
public abstract void addPartitionsToTable(String tableName, List<String> partitionsToAdd);
public abstract void updatePartitionsToTable(String tableName, List<String> changedPartitions);
public abstract void dropPartitions(String tableName, List<String> partitionsToDrop);
public void updateTableProperties(String tableName, Map<String, String> tableProperties) {}
public abstract Map<String, String> getTableSchema(String tableName);
public HoodieTableType getTableType() {
return tableType;
}
public String getBasePath() {
return metaClient.getBasePath();
}
public FileSystem getFs() {
return fs;
}
public boolean isBootstrap() {
return metaClient.getTableConfig().getBootstrapBasePath().isPresent();
}
public void closeQuietly(ResultSet resultSet, Statement stmt) {
try {
if (stmt != null) {
stmt.close();
}
} catch (SQLException e) {
LOG.warn("Could not close the statement opened ", e);
}
try {
if (resultSet != null) {
resultSet.close();
}
} catch (SQLException e) {
LOG.warn("Could not close the resultset opened ", e);
}
}
/**
* Gets the schema for a hoodie table. Depending on the type of table, try to read schema from commit metadata if
* present, else fallback to reading from any file written in the latest commit. We will assume that the schema has
* not changed within a single atomic write.
*
* @return Parquet schema for this table
*/
public MessageType getDataSchema() {
try {
return new TableSchemaResolver(metaClient).getTableParquetSchema();
} catch (Exception e) {
throw new HoodieSyncException("Failed to read data schema", e);
}
}
public boolean isDropPartition() {
try {
Option<HoodieCommitMetadata> hoodieCommitMetadata = HoodieTableMetadataUtil.getLatestCommitMetadata(metaClient);
if (hoodieCommitMetadata.isPresent()
&& WriteOperationType.DELETE_PARTITION.equals(hoodieCommitMetadata.get().getOperationType())) {
return true;
}
} catch (Exception e) {
throw new HoodieSyncException("Failed to get commit metadata", e);
}
return false;
}
@SuppressWarnings("OptionalUsedAsFieldOrParameterType")
public List<String> getPartitionsWrittenToSince(Option<String> lastCommitTimeSynced) {
if (!lastCommitTimeSynced.isPresent()) {
LOG.info("Last commit time synced is not known, listing all partitions in " + basePath + ",FS :" + fs);
HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf());
return FSUtils.getAllPartitionPaths(engineContext, basePath, useFileListingFromMetadata, assumeDatePartitioning);
} else {
LOG.info("Last commit time synced is " + lastCommitTimeSynced.get() + ", Getting commits since then");
return TimelineUtils.getPartitionsWritten(metaClient.getActiveTimeline().getCommitsTimeline()
.findInstantsAfter(lastCommitTimeSynced.get(), Integer.MAX_VALUE));
}
}
public abstract static class TypeConverter implements Serializable {
static final String DEFAULT_TARGET_TYPE = "DECIMAL";
protected String targetType;
public TypeConverter() {
this.targetType = DEFAULT_TARGET_TYPE;
}
public TypeConverter(String targetType) {
ValidationUtils.checkArgument(Objects.nonNull(targetType));
this.targetType = targetType;
}
public void doConvert(ResultSet resultSet, Map<String, String> schema) throws SQLException {
schema.put(getColumnName(resultSet), targetType.equalsIgnoreCase(getColumnType(resultSet))
? convert(resultSet) : getColumnType(resultSet));
}
public String convert(ResultSet resultSet) throws SQLException {
String columnType = getColumnType(resultSet);
int columnSize = resultSet.getInt("COLUMN_SIZE");
int decimalDigits = resultSet.getInt("DECIMAL_DIGITS");
return columnType + String.format("(%s,%s)", columnSize, decimalDigits);
}
public String getColumnName(ResultSet resultSet) throws SQLException {
return resultSet.getString(4);
}
public String getColumnType(ResultSet resultSet) throws SQLException {
return resultSet.getString(6);
}
}
/**
* Read the schema from the log file on path.
*/
@SuppressWarnings("OptionalUsedAsFieldOrParameterType")
private MessageType readSchemaFromLogFile(Option<HoodieInstant> lastCompactionCommitOpt, Path path) throws Exception {
MessageType messageType = TableSchemaResolver.readSchemaFromLogFile(fs, path);
// Fall back to read the schema from last compaction
if (messageType == null) {
LOG.info("Falling back to read the schema from last compaction " + lastCompactionCommitOpt);
return new TableSchemaResolver(this.metaClient).readSchemaFromLastCompaction(lastCompactionCommitOpt);
}
return messageType;
}
/**
* Partition Event captures any partition that needs to be added or updated.
*/
public static class PartitionEvent {
public enum PartitionEventType {
ADD, UPDATE, DROP
}
public PartitionEventType eventType;
public String storagePartition;
PartitionEvent(PartitionEventType eventType, String storagePartition) {
this.eventType = eventType;
this.storagePartition = storagePartition;
}
public static PartitionEvent newPartitionAddEvent(String storagePartition) {
return new PartitionEvent(PartitionEventType.ADD, storagePartition);
}
public static PartitionEvent newPartitionUpdateEvent(String storagePartition) {
return new PartitionEvent(PartitionEventType.UPDATE, storagePartition);
}
public static PartitionEvent newPartitionDropEvent(String storagePartition) {
return new PartitionEvent(PartitionEventType.DROP, storagePartition);
}
}
}

View File

@@ -0,0 +1,196 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hudi.sync.common;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.sync.common.model.FieldSchema;
import org.apache.hudi.sync.common.model.Partition;
import org.apache.parquet.schema.MessageType;
import java.util.Collections;
import java.util.List;
import java.util.Map;
public interface HoodieMetaSyncOperations {
String HOODIE_LAST_COMMIT_TIME_SYNC = "last_commit_time_sync";
/**
* Create the table.
*
* @param tableName The table name.
* @param storageSchema The table schema.
* @param inputFormatClass The input format class of this table.
* @param outputFormatClass The output format class of this table.
* @param serdeClass The serde class of this table.
* @param serdeProperties The serde properties of this table.
* @param tableProperties The table properties for this table.
*/
default void createTable(String tableName,
MessageType storageSchema,
String inputFormatClass,
String outputFormatClass,
String serdeClass,
Map<String, String> serdeProperties,
Map<String, String> tableProperties) {
}
/**
* Check if table exists in metastore.
*/
default boolean tableExists(String tableName) {
return false;
}
/**
* Drop table from metastore.
*/
default void dropTable(String tableName) {
}
/**
* Add partitions to the table in metastore.
*/
default void addPartitionsToTable(String tableName, List<String> partitionsToAdd) {
}
/**
* Update partitions to the table in metastore.
*/
default void updatePartitionsToTable(String tableName, List<String> changedPartitions) {
}
/**
* Drop partitions from the table in metastore.
*/
default void dropPartitions(String tableName, List<String> partitionsToDrop) {
}
/**
* Get all partitions for the table in the metastore.
*/
default List<Partition> getAllPartitions(String tableName) {
return Collections.emptyList();
}
/**
* Check if a database already exists in the metastore.
*/
default boolean databaseExists(String databaseName) {
return false;
}
/**
* Create a database in the metastore.
*/
default void createDatabase(String databaseName) {
}
/**
* Get the schema from metastore.
*/
default Map<String, String> getMetastoreSchema(String tableName) {
return Collections.emptyMap();
}
/**
* Get the schema from the Hudi table on storage.
*/
default MessageType getStorageSchema() {
return null;
}
/**
* Update schema for the table in the metastore.
*/
default void updateTableSchema(String tableName, MessageType newSchema) {
}
/**
* Get the list of field schemas from metastore.
*/
default List<FieldSchema> getMetastoreFieldSchemas(String tableName) {
return Collections.emptyList();
}
/**
* Get the list of field schema from the Hudi table on storage.
*/
default List<FieldSchema> getStorageFieldSchemas() {
return Collections.emptyList();
}
/**
* Update the field comments for table in metastore, by using the ones from storage.
*/
default void updateTableComments(String tableName, List<FieldSchema> fromMetastore, List<FieldSchema> fromStorage) {
}
/**
* Get the timestamp of last sync.
*/
default Option<String> getLastCommitTimeSynced(String tableName) {
return Option.empty();
}
/**
* Update the timestamp of last sync.
*/
default void updateLastCommitTimeSynced(String tableName) {
}
/**
* Update the table properties in metastore.
*/
default void updateTableProperties(String tableName, Map<String, String> tableProperties) {
}
/**
* Get the timestamp of last replication.
*/
default Option<String> getLastReplicatedTime(String tableName) {
return Option.empty();
}
/**
* Update the timestamp of last replication.
*/
default void updateLastReplicatedTimeStamp(String tableName, String timeStamp) {
}
/**
* Delete the timestamp of last replication.
*/
default void deleteLastReplicatedTimeStamp(String tableName) {
}
}
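Because every operation above ships with a no-op or empty default, a new metastore integration only needs to override the operations it can actually support. A hedged sketch of a minimal hypothetical client; the class name and its in-memory map are illustrative, not part of this commit:

import java.util.HashMap;
import java.util.Map;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.sync.common.HoodieMetaSyncOperations;

public class InMemoryMetaSyncClient implements HoodieMetaSyncOperations { // hypothetical
  // Illustrative backing store standing in for a real catalog.
  private final Map<String, String> lastSyncedByTable = new HashMap<>();

  @Override
  public boolean tableExists(String tableName) {
    return lastSyncedByTable.containsKey(tableName);
  }

  @Override
  public Option<String> getLastCommitTimeSynced(String tableName) {
    return Option.ofNullable(lastSyncedByTable.get(tableName));
  }
  // Every other operation falls back to the interface defaults above.
}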

View File

@@ -0,0 +1,161 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.sync.common;
import org.apache.hudi.common.engine.HoodieLocalEngineContext;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.model.WriteOperationType;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.TableSchemaResolver;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.table.timeline.TimelineUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.metadata.HoodieTableMetadataUtil;
import org.apache.hudi.sync.common.model.Partition;
import org.apache.hudi.sync.common.model.PartitionEvent;
import org.apache.hudi.sync.common.model.PartitionValueExtractor;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.parquet.schema.MessageType;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_ASSUME_DATE_PARTITION;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_USE_FILE_LISTING_FROM_METADATA;
public abstract class HoodieSyncClient implements HoodieMetaSyncOperations, AutoCloseable {
private static final Logger LOG = LogManager.getLogger(HoodieSyncClient.class);
protected final HoodieSyncConfig config;
protected final PartitionValueExtractor partitionValueExtractor;
protected final HoodieTableMetaClient metaClient;
public HoodieSyncClient(HoodieSyncConfig config) {
this.config = config;
this.partitionValueExtractor = ReflectionUtils.loadClass(config.getStringOrDefault(META_SYNC_PARTITION_EXTRACTOR_CLASS));
this.metaClient = HoodieTableMetaClient.builder()
.setConf(config.getHadoopConf())
.setBasePath(config.getString(META_SYNC_BASE_PATH))
.setLoadActiveTimelineOnLoad(true)
.build();
}
public HoodieTimeline getActiveTimeline() {
return metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
}
public HoodieTableType getTableType() {
return metaClient.getTableType();
}
public String getBasePath() {
return metaClient.getBasePathV2().toString();
}
public boolean isBootstrap() {
return metaClient.getTableConfig().getBootstrapBasePath().isPresent();
}
public boolean isDropPartition() {
try {
Option<HoodieCommitMetadata> hoodieCommitMetadata = HoodieTableMetadataUtil.getLatestCommitMetadata(metaClient);
if (hoodieCommitMetadata.isPresent()
&& WriteOperationType.DELETE_PARTITION.equals(hoodieCommitMetadata.get().getOperationType())) {
return true;
}
} catch (Exception e) {
throw new HoodieSyncException("Failed to get commit metadata", e);
}
return false;
}
@Override
public MessageType getStorageSchema() {
try {
return new TableSchemaResolver(metaClient).getTableParquetSchema();
} catch (Exception e) {
throw new HoodieSyncException("Failed to read schema from storage.", e);
}
}
public List<String> getPartitionsWrittenToSince(Option<String> lastCommitTimeSynced) {
if (!lastCommitTimeSynced.isPresent()) {
LOG.info("Last commit time synced is not known, listing all partitions in "
+ config.getString(META_SYNC_BASE_PATH)
+ ", FS: " + config.getHadoopFileSystem());
HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf());
return FSUtils.getAllPartitionPaths(engineContext,
config.getString(META_SYNC_BASE_PATH),
config.getBoolean(META_SYNC_USE_FILE_LISTING_FROM_METADATA),
config.getBoolean(META_SYNC_ASSUME_DATE_PARTITION));
} else {
LOG.info("Last commit time synced is " + lastCommitTimeSynced.get() + ", Getting commits since then");
return TimelineUtils.getPartitionsWritten(metaClient.getActiveTimeline().getCommitsTimeline()
.findInstantsAfter(lastCommitTimeSynced.get(), Integer.MAX_VALUE));
}
}
/**
* Iterate over the storage partitions and find if there are any new partitions that need to be added or updated.
* Generate a list of PartitionEvent based on the changes required.
*/
public List<PartitionEvent> getPartitionEvents(List<Partition> tablePartitions, List<String> partitionStoragePartitions, boolean isDropPartition) {
Map<String, String> paths = new HashMap<>();
for (Partition tablePartition : tablePartitions) {
List<String> hivePartitionValues = tablePartition.getValues();
String fullTablePartitionPath =
Path.getPathWithoutSchemeAndAuthority(new Path(tablePartition.getStorageLocation())).toUri().getPath();
paths.put(String.join(", ", hivePartitionValues), fullTablePartitionPath);
}
List<PartitionEvent> events = new ArrayList<>();
for (String storagePartition : partitionStoragePartitions) {
Path storagePartitionPath = FSUtils.getPartitionPath(config.getString(META_SYNC_BASE_PATH), storagePartition);
String fullStoragePartitionPath = Path.getPathWithoutSchemeAndAuthority(storagePartitionPath).toUri().getPath();
// Check whether the partition values or the HDFS path changed
List<String> storagePartitionValues = partitionValueExtractor.extractPartitionValuesInPath(storagePartition);
if (isDropPartition) {
events.add(PartitionEvent.newPartitionDropEvent(storagePartition));
} else {
if (!storagePartitionValues.isEmpty()) {
String storageValue = String.join(", ", storagePartitionValues);
if (!paths.containsKey(storageValue)) {
events.add(PartitionEvent.newPartitionAddEvent(storagePartition));
} else if (!paths.get(storageValue).equals(fullStoragePartitionPath)) {
events.add(PartitionEvent.newPartitionUpdateEvent(storagePartition));
}
}
}
}
return events;
}
}
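HoodieSyncClient supplies the building blocks (partition listing, event diffing, schema access) but not the sync loop itself; that wiring lives in the concrete tools. A hedged sketch of how such a loop could drive the client, assuming a hypothetical helper class that is not part of this commit:

import java.util.ArrayList;
import java.util.List;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.sync.common.HoodieSyncClient;
import org.apache.hudi.sync.common.model.Partition;
import org.apache.hudi.sync.common.model.PartitionEvent;

class PartitionSyncHelper { // hypothetical
  static void syncPartitions(HoodieSyncClient client, String tableName) {
    Option<String> lastSync = client.getLastCommitTimeSynced(tableName);
    List<String> written = client.getPartitionsWrittenToSince(lastSync);
    List<Partition> inMetastore = client.getAllPartitions(tableName);
    List<PartitionEvent> events = client.getPartitionEvents(inMetastore, written, client.isDropPartition());
    List<String> toAdd = new ArrayList<>();
    List<String> toUpdate = new ArrayList<>();
    List<String> toDrop = new ArrayList<>();
    for (PartitionEvent event : events) {
      switch (event.eventType) {
        case ADD:    toAdd.add(event.storagePartition);    break;
        case UPDATE: toUpdate.add(event.storagePartition); break;
        case DROP:   toDrop.add(event.storagePartition);   break;
        default: break;
      }
    }
    // Apply each batch only when non-empty, then record the sync point.
    if (!toAdd.isEmpty())    { client.addPartitionsToTable(tableName, toAdd); }
    if (!toUpdate.isEmpty()) { client.updatePartitionsToTable(tableName, toUpdate); }
    if (!toDrop.isEmpty())   { client.dropPartitions(tableName, toDrop); }
    client.updateLastCommitTimeSynced(tableName);
  }
}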

View File

@@ -22,14 +22,19 @@ import org.apache.hudi.common.config.ConfigProperty;
import org.apache.hudi.common.config.HoodieConfig; import org.apache.hudi.common.config.HoodieConfig;
import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.config.HoodieMetadataConfig;
import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableConfig;
import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.keygen.constant.KeyGeneratorOptions; import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
import org.apache.hudi.sync.common.util.ConfigUtils;
import com.beust.jcommander.Parameter; import com.beust.jcommander.Parameter;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.Properties;
import java.util.function.Function; import java.util.function.Function;
/** /**
@@ -37,41 +42,6 @@ import java.util.function.Function;
*/ */
public class HoodieSyncConfig extends HoodieConfig { public class HoodieSyncConfig extends HoodieConfig {
@Parameter(names = {"--database"}, description = "name of the target database in meta store", required = true)
public String databaseName;
@Parameter(names = {"--table"}, description = "name of the target table in meta store", required = true)
public String tableName;
@Parameter(names = {"--base-path"}, description = "Base path of the hoodie table to sync", required = true)
public String basePath;
@Parameter(names = {"--base-file-format"}, description = "Format of the base files (PARQUET (or) HFILE)")
public String baseFileFormat;
@Parameter(names = "--partitioned-by", description = "Fields in the schema partitioned by")
public List<String> partitionFields;
@Parameter(names = "--partition-value-extractor", description = "Class which implements PartitionValueExtractor "
+ "to extract the partition values from HDFS path")
public String partitionValueExtractorClass;
@Parameter(names = {"--assume-date-partitioning"}, description = "Assume standard yyyy/mm/dd partitioning, this"
+ " exists to support backward compatibility. If you use hoodie 0.3.x, do not set this parameter")
public Boolean assumeDatePartitioning;
@Parameter(names = {"--decode-partition"}, description = "Decode the partition value if the partition has encoded during writing")
public Boolean decodePartition;
@Parameter(names = {"--use-file-listing-from-metadata"}, description = "Fetch file listing from Hudi's metadata")
public Boolean useFileListingFromMetadata;
@Parameter(names = {"--conditional-sync"}, description = "If true, only sync on conditions like schema change or partition change.")
public Boolean isConditionalSync;
@Parameter(names = {"--spark-version"}, description = "The spark version")
public String sparkVersion;
public static final ConfigProperty<String> META_SYNC_BASE_PATH = ConfigProperty public static final ConfigProperty<String> META_SYNC_BASE_PATH = ConfigProperty
.key("hoodie.datasource.meta.sync.base.path") .key("hoodie.datasource.meta.sync.base.path")
.defaultValue("") .defaultValue("")
@@ -150,6 +120,11 @@ public class HoodieSyncConfig extends HoodieConfig {
.defaultValue("false") .defaultValue("false")
.withDocumentation("Assume partitioning is yyyy/mm/dd"); .withDocumentation("Assume partitioning is yyyy/mm/dd");
public static final ConfigProperty<Boolean> META_SYNC_DECODE_PARTITION = ConfigProperty
.key("hoodie.meta.sync.decode_partition")
.defaultValue(false) // TODO infer from url encode option
.withDocumentation("");
public static final ConfigProperty<Boolean> META_SYNC_USE_FILE_LISTING_FROM_METADATA = ConfigProperty public static final ConfigProperty<Boolean> META_SYNC_USE_FILE_LISTING_FROM_METADATA = ConfigProperty
.key("hoodie.meta.sync.metadata_file_listing") .key("hoodie.meta.sync.metadata_file_listing")
.defaultValue(HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS) .defaultValue(HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS)
@@ -165,24 +140,85 @@ public class HoodieSyncConfig extends HoodieConfig {
.defaultValue("") .defaultValue("")
.withDocumentation("The spark version used when syncing with a metastore."); .withDocumentation("The spark version used when syncing with a metastore.");
public HoodieSyncConfig(TypedProperties props) { private Configuration hadoopConf;
public HoodieSyncConfig(Properties props) {
this(props, ConfigUtils.createHadoopConf(props));
}
public HoodieSyncConfig(Properties props, Configuration hadoopConf) {
super(props); super(props);
setDefaults(); this.hadoopConf = hadoopConf;
this.basePath = getStringOrDefault(META_SYNC_BASE_PATH);
this.databaseName = getStringOrDefault(META_SYNC_DATABASE_NAME);
this.tableName = getStringOrDefault(META_SYNC_TABLE_NAME);
this.baseFileFormat = getStringOrDefault(META_SYNC_BASE_FILE_FORMAT);
this.partitionFields = props.getStringList(META_SYNC_PARTITION_FIELDS.key(), ",", Collections.emptyList());
this.partitionValueExtractorClass = getStringOrDefault(META_SYNC_PARTITION_EXTRACTOR_CLASS);
this.assumeDatePartitioning = getBooleanOrDefault(META_SYNC_ASSUME_DATE_PARTITION);
this.decodePartition = getBooleanOrDefault(KeyGeneratorOptions.URL_ENCODE_PARTITIONING);
this.useFileListingFromMetadata = getBooleanOrDefault(META_SYNC_USE_FILE_LISTING_FROM_METADATA);
this.isConditionalSync = getBooleanOrDefault(META_SYNC_CONDITIONAL_SYNC);
this.sparkVersion = getStringOrDefault(META_SYNC_SPARK_VERSION);
} }
protected void setDefaults() { public void setHadoopConf(Configuration hadoopConf) {
this.setDefaultValue(META_SYNC_TABLE_NAME); this.hadoopConf = hadoopConf;
}
public Configuration getHadoopConf() {
return hadoopConf;
}
public FileSystem getHadoopFileSystem() {
return FSUtils.getFs(getString(META_SYNC_BASE_PATH), getHadoopConf());
}
public String getAbsoluteBasePath() {
return getString(META_SYNC_BASE_PATH);
}
@Override
public String toString() {
return props.toString();
}
public static class HoodieSyncConfigParams {
@Parameter(names = {"--database"}, description = "name of the target database in meta store", required = true)
public String databaseName;
@Parameter(names = {"--table"}, description = "name of the target table in meta store", required = true)
public String tableName;
@Parameter(names = {"--base-path"}, description = "Base path of the hoodie table to sync", required = true)
public String basePath;
@Parameter(names = {"--base-file-format"}, description = "Format of the base files (PARQUET (or) HFILE)")
public String baseFileFormat;
@Parameter(names = "--partitioned-by", description = "Fields in the schema that the table is partitioned by")
public List<String> partitionFields;
@Parameter(names = "--partition-value-extractor", description = "Class which implements PartitionValueExtractor "
+ "to extract the partition values from HDFS path")
public String partitionValueExtractorClass;
@Parameter(names = {"--assume-date-partitioning"}, description = "Assume standard yyyy/mm/dd partitioning, this"
+ " exists to support backward compatibility. If you use hoodie 0.3.x, do not set this parameter")
public Boolean assumeDatePartitioning;
@Parameter(names = {"--decode-partition"}, description = "Decode the partition value if the partition has been encoded during writing")
public Boolean decodePartition;
@Parameter(names = {"--use-file-listing-from-metadata"}, description = "Fetch file listing from Hudi's metadata")
public Boolean useFileListingFromMetadata;
@Parameter(names = {"--conditional-sync"}, description = "If true, only sync on conditions like schema change or partition change.")
public Boolean isConditionalSync;
@Parameter(names = {"--spark-version"}, description = "The spark version")
public String sparkVersion;
@Parameter(names = {"--help", "-h"}, help = true)
public boolean help = false;
public boolean isHelp() {
return help;
}
public TypedProperties toProps() {
final TypedProperties props = new TypedProperties();
props.setPropertyIfNonNull(META_SYNC_BASE_PATH.key(), basePath);
props.setPropertyIfNonNull(META_SYNC_DATABASE_NAME.key(), databaseName);
props.setPropertyIfNonNull(META_SYNC_TABLE_NAME.key(), tableName);
props.setPropertyIfNonNull(META_SYNC_BASE_FILE_FORMAT.key(), baseFileFormat);
props.setPropertyIfNonNull(META_SYNC_PARTITION_FIELDS.key(), StringUtils.join(",", partitionFields));
props.setPropertyIfNonNull(META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), partitionValueExtractorClass);
props.setPropertyIfNonNull(META_SYNC_ASSUME_DATE_PARTITION.key(), assumeDatePartitioning);
props.setPropertyIfNonNull(META_SYNC_DECODE_PARTITION.key(), decodePartition);
props.setPropertyIfNonNull(META_SYNC_USE_FILE_LISTING_FROM_METADATA.key(), useFileListingFromMetadata);
props.setPropertyIfNonNull(META_SYNC_CONDITIONAL_SYNC.key(), isConditionalSync);
props.setPropertyIfNonNull(META_SYNC_SPARK_VERSION.key(), sparkVersion);
return props;
}
} }
} }
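The split between HoodieSyncConfigParams (the JCommander-annotated CLI surface) and HoodieSyncConfig (typed property access) suggests a two-step flow: parse flags into params, convert to properties, then build the config. A hedged sketch of that flow in a hypothetical driver class:

import com.beust.jcommander.JCommander;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.sync.common.HoodieSyncConfig;

public class SyncConfigCliSketch { // hypothetical driver
  public static void main(String[] args) {
    HoodieSyncConfig.HoodieSyncConfigParams params = new HoodieSyncConfig.HoodieSyncConfigParams();
    JCommander cmd = JCommander.newBuilder().addObject(params).build();
    cmd.parse(args);
    if (params.isHelp()) {
      cmd.usage();
      return;
    }
    TypedProperties props = params.toProps();              // CLI flags -> config properties
    HoodieSyncConfig config = new HoodieSyncConfig(props); // Hadoop conf created from props
    System.out.println(config.getAbsoluteBasePath());
  }
}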

View File

@@ -0,0 +1,62 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.sync.common;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.sync.common.util.ConfigUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import java.util.Properties;
/**
* Base class to sync metadata with metastores to make
* Hudi table queryable through external systems.
*/
public abstract class HoodieSyncTool implements AutoCloseable {
protected Properties props;
protected Configuration hadoopConf;
public HoodieSyncTool(Properties props) {
this(props, ConfigUtils.createHadoopConf(props));
}
public HoodieSyncTool(Properties props, Configuration hadoopConf) {
this.props = props;
this.hadoopConf = hadoopConf;
}
@Deprecated
public HoodieSyncTool(TypedProperties props, Configuration conf, FileSystem fs) {
this(props, conf);
}
@Deprecated
public HoodieSyncTool(Properties props, FileSystem fileSystem) {
this(props, fileSystem.getConf());
}
public abstract void syncHoodieTable();
@Override
public void close() throws Exception {
// no op
}
}
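A concrete tool only has to provide a (Properties, Configuration) constructor and implement syncHoodieTable(); the deprecated FileSystem-based constructors delegate to it for backward compatibility. A hedged sketch of a hypothetical subclass, not part of this commit:

import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.sync.common.HoodieSyncTool;

public class MyCatalogSyncTool extends HoodieSyncTool { // hypothetical
  public MyCatalogSyncTool(Properties props, Configuration hadoopConf) {
    super(props, hadoopConf);
  }

  @Override
  public void syncHoodieTable() {
    // Create or update the target table in the external catalog,
    // driving a sync client built from the supplied props and hadoopConf.
  }
}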

View File

@@ -0,0 +1,83 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hudi.sync.common.model;
import org.apache.hudi.common.util.Option;
import java.util.Objects;
public class FieldSchema {
private final String name;
private String type;
private Option<String> comment;
public FieldSchema(String name, String type) {
this(name, type, Option.empty());
}
public FieldSchema(String name, String type, String comment) {
this(name, type, Option.ofNullable(comment));
}
public FieldSchema(String name, String type, Option<String> comment) {
this.name = name;
this.type = type;
this.comment = comment;
}
public String getName() {
return name;
}
public String getType() {
return type;
}
public Option<String> getComment() {
return comment;
}
public String getCommentOrEmpty() {
return comment.orElse("");
}
public void setType(String type) {
this.type = type;
}
public void setComment(Option<String> comment) {
this.comment = comment;
}
public void setComment(String comment) {
this.comment = Option.ofNullable(comment);
}
public boolean updateComment(FieldSchema another) {
if (Objects.equals(name, another.getName())
&& !Objects.equals(getCommentOrEmpty(), another.getCommentOrEmpty())) {
setComment(another.getComment());
return true;
} else {
return false;
}
}
}
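A short sketch of how updateComment reconciles two descriptions of the same field, e.g. one read from storage and one from the metastore; the field values are made up:

import org.apache.hudi.common.util.Option;
import org.apache.hudi.sync.common.model.FieldSchema;

public class FieldSchemaExample {
  public static void main(String[] args) {
    FieldSchema fromStorage = new FieldSchema("price", "double", "closing price");
    FieldSchema inMetastore = new FieldSchema("price", "double", Option.empty());

    // Names match and the comments differ, so the metastore-side copy is updated.
    boolean changed = inMetastore.updateComment(fromStorage);
    System.out.println(changed + " -> " + inMetastore.getCommentOrEmpty()); // true -> closing price
  }
}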

View File

@@ -0,0 +1,50 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hudi.sync.common.model;
/**
* Partition Event captures any partition that needs to be added, updated, or dropped.
*/
public class PartitionEvent {
public enum PartitionEventType {
ADD, UPDATE, DROP
}
public PartitionEventType eventType;
public String storagePartition;
PartitionEvent(PartitionEventType eventType, String storagePartition) {
this.eventType = eventType;
this.storagePartition = storagePartition;
}
public static PartitionEvent newPartitionAddEvent(String storagePartition) {
return new PartitionEvent(PartitionEventType.ADD, storagePartition);
}
public static PartitionEvent newPartitionUpdateEvent(String storagePartition) {
return new PartitionEvent(PartitionEventType.UPDATE, storagePartition);
}
public static PartitionEvent newPartitionDropEvent(String storagePartition) {
return new PartitionEvent(PartitionEventType.DROP, storagePartition);
}
}
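A sketch of how a sync client might turn a storage/metastore partition diff into events via the factory methods; the partition values and the surrounding diff logic are assumed:

import org.apache.hudi.sync.common.model.PartitionEvent;

import java.util.ArrayList;
import java.util.List;

public class PartitionEventExample {
  public static void main(String[] args) {
    List<PartitionEvent> events = new ArrayList<>();
    events.add(PartitionEvent.newPartitionAddEvent("datestr=2022-07-01"));    // new on storage
    events.add(PartitionEvent.newPartitionUpdateEvent("datestr=2022-07-02")); // location changed
    events.add(PartitionEvent.newPartitionDropEvent("datestr=2022-07-03"));   // removed from storage

    for (PartitionEvent e : events) {
      System.out.println(e.eventType + " " + e.storagePartition);
    }
  }
}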

View File

@@ -9,14 +9,15 @@
* *
* http://www.apache.org/licenses/LICENSE-2.0 * http://www.apache.org/licenses/LICENSE-2.0
* *
* Unless required by applicable law or agreed to in writing, software * Unless required by applicable law or agreed to in writing,
* distributed under the License is distributed on an "AS IS" BASIS, * software distributed under the License is distributed on an
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* See the License for the specific language governing permissions and * KIND, either express or implied. See the License for the
* limitations under the License. * specific language governing permissions and limitations
* under the License.
*/ */
package org.apache.hudi.hive; package org.apache.hudi.sync.common.model;
import java.io.Serializable; import java.io.Serializable;
import java.util.List; import java.util.List;

View File

@@ -18,9 +18,13 @@
package org.apache.hudi.sync.common.util; package org.apache.hudi.sync.common.util;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hadoop.conf.Configuration;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
import org.apache.hudi.common.util.StringUtils; import java.util.Properties;
public class ConfigUtils { public class ConfigUtils {
/** /**
@@ -32,6 +36,7 @@ public class ConfigUtils {
/** /**
* Convert the key-value config to a map. The format of the config * Convert the key-value config to a map. The format of the config
* is a key-value pair just like "k1=v1\nk2=v2\nk3=v3". * is a key-value pair just like "k1=v1\nk2=v2\nk3=v3".
*
* @param keyValueConfig * @param keyValueConfig
* @return * @return
*/ */
@@ -58,6 +63,7 @@ public class ConfigUtils {
/** /**
* Convert map config to key-value string. The format of the config * Convert map config to key-value string. The format of the config
* is a key-value pair just like "k1=v1\nk2=v2\nk3=v3". * is a key-value pair just like "k1=v1\nk2=v2\nk3=v3".
*
* @param config * @param config
* @return * @return
*/ */
@@ -75,4 +81,10 @@ public class ConfigUtils {
return sb.toString(); return sb.toString();
} }
public static Configuration createHadoopConf(Properties props) {
Configuration hadoopConf = new Configuration();
props.stringPropertyNames().forEach(k -> hadoopConf.set(k, props.getProperty(k)));
return hadoopConf;
}
} }
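The new helper mirrors every property into a fresh Hadoop Configuration. A sketch of the round trip; the property value is a placeholder:

import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.sync.common.util.ConfigUtils;

import java.util.Properties;

public class CreateHadoopConfExample {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("fs.defaultFS", "hdfs://namenode:8020");

    Configuration hadoopConf = ConfigUtils.createHadoopConf(props);
    System.out.println(hadoopConf.get("fs.defaultFS")); // hdfs://namenode:8020
  }
}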

View File

@@ -1,10 +1,11 @@
/* /*
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one
* contributor license agreements. See the NOTICE file distributed with * or more contributor license agreements. See the NOTICE file
* this work for additional information regarding copyright ownership. * distributed with this work for additional information
* The ASF licenses this file to You under the Apache License, Version 2.0 * regarding copyright ownership. The ASF licenses this file
* (the "License"); you may not use this file except in compliance with * to you under the Apache License, Version 2.0 (the
* the License. You may obtain a copy of the License at * "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* *
* http://www.apache.org/licenses/LICENSE-2.0 * http://www.apache.org/licenses/LICENSE-2.0
* *
@@ -15,15 +16,10 @@
* limitations under the License. * limitations under the License.
*/ */
package org.apache.hudi.sync.common; package org.apache.hudi.sync.common.util;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.sync.common.util.ConfigUtils;
import org.apache.hudi.sync.common.util.Parquet2SparkSchemaUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.parquet.schema.GroupType; import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.PrimitiveType;
@@ -33,39 +29,17 @@ import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Properties;
import static org.apache.parquet.schema.OriginalType.UTF8; import static org.apache.parquet.schema.OriginalType.UTF8;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
/** public class SparkDataSourceTableUtils {
* Base class to sync Hudi meta data with Metastores to make
* Hudi table queryable through external systems.
*/
public abstract class AbstractSyncTool {
protected final Configuration conf;
protected final FileSystem fs;
protected TypedProperties props;
public AbstractSyncTool(TypedProperties props, Configuration conf, FileSystem fs) {
this.props = props;
this.conf = conf;
this.fs = fs;
}
@Deprecated
public AbstractSyncTool(Properties props, FileSystem fileSystem) {
this(new TypedProperties(props), fileSystem.getConf(), fileSystem);
}
public abstract void syncHoodieTable();
/** /**
* Get Spark SQL related table properties. This is used for Spark datasource tables. * Get Spark SQL related table properties. This is used for Spark datasource tables.
* @param schema The schema to write to the table. * @param schema The schema to write to the table.
* @return A new map of parameters with Spark's table properties added. * @return A new map of parameters with Spark's table properties added.
*/ */
protected Map<String, String> getSparkTableProperties(List<String> partitionNames, String sparkVersion, public static Map<String, String> getSparkTableProperties(List<String> partitionNames, String sparkVersion,
int schemaLengthThreshold, MessageType schema) { int schemaLengthThreshold, MessageType schema) {
// Convert the schema and partition info used by spark sql to hive table properties. // Convert the schema and partition info used by spark sql to hive table properties.
// The following code refers to the spark code in // The following code refers to the spark code in
@@ -122,7 +96,7 @@ public abstract class AbstractSyncTool {
return sparkProperties; return sparkProperties;
} }
protected Map<String, String> getSparkSerdeProperties(boolean readAsOptimized, String basePath) { public static Map<String, String> getSparkSerdeProperties(boolean readAsOptimized, String basePath) {
Map<String, String> sparkSerdeProperties = new HashMap<>(); Map<String, String> sparkSerdeProperties = new HashMap<>();
sparkSerdeProperties.put("path", basePath); sparkSerdeProperties.put("path", basePath);
sparkSerdeProperties.put(ConfigUtils.IS_QUERY_AS_RO_TABLE, String.valueOf(readAsOptimized)); sparkSerdeProperties.put(ConfigUtils.IS_QUERY_AS_RO_TABLE, String.valueOf(readAsOptimized));
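Since these helpers are now static, callers no longer need a sync-tool instance. A sketch of fetching the serde properties; the base path is a placeholder:

import org.apache.hudi.sync.common.util.SparkDataSourceTableUtils;

import java.util.Map;

public class SerdePropsExample {
  public static void main(String[] args) {
    // readAsOptimized=true marks the table to be queried as a read-optimized view.
    Map<String, String> serdeProps =
        SparkDataSourceTableUtils.getSparkSerdeProperties(true, "/tmp/hudi_trips");
    serdeProps.forEach((k, v) -> System.out.println(k + " = " + v));
  }
}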

View File

@@ -22,13 +22,11 @@ package org.apache.hudi.sync.common.util;
import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.util.ReflectionUtils; import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.sync.common.AbstractSyncTool;
import org.apache.hudi.sync.common.HoodieSyncConfig; import org.apache.hudi.sync.common.HoodieSyncConfig;
import org.apache.hudi.sync.common.HoodieSyncTool;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import java.util.Properties; import java.util.Properties;
@@ -36,10 +34,9 @@ import java.util.Properties;
* Helper class for syncing Hudi commit data with external metastores. * Helper class for syncing Hudi commit data with external metastores.
*/ */
public class SyncUtilHelpers { public class SyncUtilHelpers {
private static final Logger LOG = LogManager.getLogger(SyncUtilHelpers.class);
/** /**
* Create an instance of an implementation of {@link AbstractSyncTool} that will sync all the relevant meta information * Create an instance of an implementation of {@link HoodieSyncTool} that will sync all the relevant meta information
* with an external metastore such as Hive etc. to ensure Hoodie tables can be queried or read via external systems. * with an external metastore such as Hive etc. to ensure Hoodie tables can be queried or read via external systems.
* *
* @param metaSyncFQCN The class that implements the sync of the metadata. * @param metaSyncFQCN The class that implements the sync of the metadata.
@@ -62,7 +59,7 @@ public class SyncUtilHelpers {
} }
} }
static AbstractSyncTool instantiateMetaSyncTool(String metaSyncFQCN, static HoodieSyncTool instantiateMetaSyncTool(String metaSyncFQCN,
TypedProperties props, TypedProperties props,
Configuration hadoopConfig, Configuration hadoopConfig,
FileSystem fs, FileSystem fs,
@@ -74,18 +71,28 @@ public class SyncUtilHelpers {
properties.put(HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT.key(), baseFileFormat); properties.put(HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT.key(), baseFileFormat);
if (ReflectionUtils.hasConstructor(metaSyncFQCN, if (ReflectionUtils.hasConstructor(metaSyncFQCN,
new Class<?>[] {Properties.class, Configuration.class})) {
return ((HoodieSyncTool) ReflectionUtils.loadClass(metaSyncFQCN,
new Class<?>[] {Properties.class, Configuration.class},
properties, hadoopConfig));
} else if (ReflectionUtils.hasConstructor(metaSyncFQCN,
new Class<?>[] {Properties.class})) {
return ((HoodieSyncTool) ReflectionUtils.loadClass(metaSyncFQCN,
new Class<?>[] {Properties.class},
properties));
} else if (ReflectionUtils.hasConstructor(metaSyncFQCN,
new Class<?>[] {TypedProperties.class, Configuration.class, FileSystem.class})) { new Class<?>[] {TypedProperties.class, Configuration.class, FileSystem.class})) {
return ((AbstractSyncTool) ReflectionUtils.loadClass(metaSyncFQCN, return ((HoodieSyncTool) ReflectionUtils.loadClass(metaSyncFQCN,
new Class<?>[] {TypedProperties.class, Configuration.class, FileSystem.class}, new Class<?>[] {TypedProperties.class, Configuration.class, FileSystem.class},
properties, hadoopConfig, fs)); properties, hadoopConfig, fs));
} else if (ReflectionUtils.hasConstructor(metaSyncFQCN,
new Class<?>[] {Properties.class, FileSystem.class})) {
return ((HoodieSyncTool) ReflectionUtils.loadClass(metaSyncFQCN,
new Class<?>[] {Properties.class, FileSystem.class},
properties, fs));
} else { } else {
LOG.warn("Falling back to deprecated constructor for class: " + metaSyncFQCN); throw new HoodieException("Could not load meta sync class " + metaSyncFQCN
try { + ": no valid constructor found.");
return ((AbstractSyncTool) ReflectionUtils.loadClass(metaSyncFQCN,
new Class<?>[] {Properties.class, FileSystem.class}, properties, fs));
} catch (Throwable t) {
throw new HoodieException("Could not load meta sync class " + metaSyncFQCN, t);
}
} }
} }
} }
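The fallback chain above is plain constructor probing. A self-contained sketch of the same idea using JDK reflection, standing in for ReflectionUtils.hasConstructor:

public class ConstructorProbe {

  // True if clazz declares a public constructor with exactly these parameter types.
  static boolean hasConstructor(Class<?> clazz, Class<?>... paramTypes) {
    try {
      clazz.getConstructor(paramTypes);
      return true;
    } catch (NoSuchMethodException e) {
      return false;
    }
  }

  public static void main(String[] args) {
    // instantiateMetaSyncTool probes (Properties, Configuration) first,
    // then (Properties), then the two deprecated shapes.
    System.out.println(hasConstructor(String.class, char[].class)); // true
    System.out.println(hasConstructor(String.class, int.class));    // false
  }
}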

View File

@@ -20,16 +20,19 @@ package org.apache.hudi.sync.common.util;
import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.sync.common.AbstractSyncTool; import org.apache.hudi.sync.common.HoodieSyncTool;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;
import java.io.IOException; import java.io.IOException;
import java.util.Properties; import java.util.Properties;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -46,42 +49,44 @@ public class TestSyncUtilHelpers {
hadoopConf = fileSystem.getConf(); hadoopConf = fileSystem.getConf();
} }
@Test @ParameterizedTest
public void testCreateValidSyncClass() { @ValueSource(classes = {DummySyncTool1.class, DummySyncTool2.class})
AbstractSyncTool metaSyncTool = SyncUtilHelpers.instantiateMetaSyncTool( public void testCreateValidSyncClass(Class<?> clazz) {
ValidMetaSyncClass.class.getName(), HoodieSyncTool syncTool = SyncUtilHelpers.instantiateMetaSyncTool(
clazz.getName(),
new TypedProperties(), new TypedProperties(),
hadoopConf, hadoopConf,
fileSystem, fileSystem,
BASE_PATH, BASE_PATH,
BASE_FORMAT BASE_FORMAT
); );
assertTrue(metaSyncTool instanceof ValidMetaSyncClass); assertTrue(clazz.isAssignableFrom(syncTool.getClass()));
} }
/** /**
* Ensure it still works for the deprecated constructor of {@link AbstractSyncTool} * Ensure it still works for the deprecated constructor of {@link HoodieSyncTool}
* as we implemented the fallback. * as we implemented the fallback.
*/ */
@Test @ParameterizedTest
public void testCreateDeprecatedSyncClass() { @ValueSource(classes = {DeprecatedSyncTool1.class, DeprecatedSyncTool2.class})
public void testCreateDeprecatedSyncClass(Class<?> clazz) {
Properties properties = new Properties(); Properties properties = new Properties();
AbstractSyncTool deprecatedMetaSyncClass = SyncUtilHelpers.instantiateMetaSyncTool( HoodieSyncTool syncTool = SyncUtilHelpers.instantiateMetaSyncTool(
DeprecatedMetaSyncClass.class.getName(), clazz.getName(),
new TypedProperties(properties), new TypedProperties(properties),
hadoopConf, hadoopConf,
fileSystem, fileSystem,
BASE_PATH, BASE_PATH,
BASE_FORMAT BASE_FORMAT
); );
assertTrue(deprecatedMetaSyncClass instanceof DeprecatedMetaSyncClass); assertTrue(clazz.isAssignableFrom(syncTool.getClass()));
} }
@Test @Test
public void testCreateInvalidSyncClass() { public void testCreateInvalidSyncClass() {
Exception exception = assertThrows(HoodieException.class, () -> { Throwable t = assertThrows(HoodieException.class, () -> {
SyncUtilHelpers.instantiateMetaSyncTool( SyncUtilHelpers.instantiateMetaSyncTool(
InvalidSyncClass.class.getName(), InvalidSyncTool.class.getName(),
new TypedProperties(), new TypedProperties(),
hadoopConf, hadoopConf,
fileSystem, fileSystem,
@@ -90,14 +95,14 @@ public class TestSyncUtilHelpers {
); );
}); });
String expectedMessage = "Could not load meta sync class " + InvalidSyncClass.class.getName(); String expectedMessage = "Could not load meta sync class " + InvalidSyncTool.class.getName()
assertTrue(exception.getMessage().contains(expectedMessage)); + ": no valid constructor found.";
assertEquals(expectedMessage, t.getMessage());
} }
public static class ValidMetaSyncClass extends AbstractSyncTool { public static class DummySyncTool1 extends HoodieSyncTool {
public ValidMetaSyncClass(TypedProperties props, Configuration conf, FileSystem fs) { public DummySyncTool1(Properties props, Configuration hadoopConf) {
super(props, conf, fs); super(props, hadoopConf);
} }
@Override @Override
@@ -106,9 +111,9 @@ public class TestSyncUtilHelpers {
} }
} }
public static class DeprecatedMetaSyncClass extends AbstractSyncTool { public static class DummySyncTool2 extends HoodieSyncTool {
public DeprecatedMetaSyncClass(Properties props, FileSystem fileSystem) { public DummySyncTool2(Properties props, Configuration hadoopConf) {
super(props, fileSystem); super(props, hadoopConf);
} }
@Override @Override
@@ -117,8 +122,30 @@ public class TestSyncUtilHelpers {
} }
} }
public static class InvalidSyncClass { public static class DeprecatedSyncTool1 extends HoodieSyncTool {
public InvalidSyncClass(Properties props) { public DeprecatedSyncTool1(TypedProperties props, Configuration hadoopConf, FileSystem fs) {
super(props, hadoopConf, fs);
}
@Override
public void syncHoodieTable() {
throw new HoodieException("Method unimplemented as its a test class");
}
}
public static class DeprecatedSyncTool2 extends HoodieSyncTool {
public DeprecatedSyncTool2(Properties props, FileSystem fs) {
super(props, fs);
}
@Override
public void syncHoodieTable() {
throw new HoodieException("Method unimplemented as its a test class");
}
}
public static class InvalidSyncTool {
public InvalidSyncTool(Properties props, FileSystem fs, Configuration hadoopConf) {
} }
} }
} }

View File

@@ -33,7 +33,6 @@ import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.hive.HiveSyncConfig; import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.hudi.hive.HiveSyncTool; import org.apache.hudi.hive.HiveSyncTool;
import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.sync.common.HoodieSyncConfig;
import org.apache.hudi.utilities.UtilHelpers; import org.apache.hudi.utilities.UtilHelpers;
import org.apache.hudi.utilities.schema.SchemaProvider; import org.apache.hudi.utilities.schema.SchemaProvider;
@@ -49,6 +48,10 @@ import java.io.Serializable;
import java.util.HashMap; import java.util.HashMap;
import static org.apache.hudi.common.table.HoodieTableConfig.ARCHIVELOG_FOLDER; import static org.apache.hudi.common.table.HoodieTableConfig.ARCHIVELOG_FOLDER;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_BUCKET_SYNC;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_BUCKET_SYNC_SPEC;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH;
/** /**
* Performs bootstrap from a non-hudi source. * Performs bootstrap from a non-hudi source.
@@ -162,14 +165,14 @@ public class BootstrapExecutor implements Serializable {
if (cfg.enableHiveSync || cfg.enableMetaSync) { if (cfg.enableHiveSync || cfg.enableMetaSync) {
TypedProperties metaProps = new TypedProperties(); TypedProperties metaProps = new TypedProperties();
metaProps.putAll(props); metaProps.putAll(props);
metaProps.put(HoodieSyncConfig.META_SYNC_BASE_PATH.key(), cfg.targetBasePath); metaProps.put(META_SYNC_BASE_PATH.key(), cfg.targetBasePath);
metaProps.put(HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT.key(), cfg.baseFileFormat); metaProps.put(META_SYNC_BASE_FILE_FORMAT.key(), cfg.baseFileFormat);
if (props.getBoolean(HiveSyncConfig.HIVE_SYNC_BUCKET_SYNC.key(), HiveSyncConfig.HIVE_SYNC_BUCKET_SYNC.defaultValue())) { if (props.getBoolean(HIVE_SYNC_BUCKET_SYNC.key(), HIVE_SYNC_BUCKET_SYNC.defaultValue())) {
metaProps.put(HiveSyncConfig.HIVE_SYNC_BUCKET_SYNC_SPEC.key(), HiveSyncConfig.getBucketSpec(props.getString(HoodieIndexConfig.BUCKET_INDEX_HASH_FIELD.key()), metaProps.put(HIVE_SYNC_BUCKET_SYNC_SPEC.key(), HiveSyncConfig.getBucketSpec(props.getString(HoodieIndexConfig.BUCKET_INDEX_HASH_FIELD.key()),
props.getInteger(HoodieIndexConfig.BUCKET_INDEX_NUM_BUCKETS.key()))); props.getInteger(HoodieIndexConfig.BUCKET_INDEX_NUM_BUCKETS.key())));
} }
new HiveSyncTool(metaProps, configuration, fs).syncHoodieTable(); new HiveSyncTool(metaProps, configuration).syncHoodieTable();
} }
} }
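With the holder-based keys, a standalone Hive sync is wired up roughly as below; the paths and JDBC URL are placeholders:

import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.hive.HiveSyncTool;

import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_URL;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME;

public class HiveSyncExample {
  public static void main(String[] args) {
    TypedProperties props = new TypedProperties();
    props.setProperty(META_SYNC_BASE_PATH.key(), "/tmp/hudi_trips");
    props.setProperty(META_SYNC_DATABASE_NAME.key(), "default");
    props.setProperty(META_SYNC_TABLE_NAME.key(), "hudi_trips");
    props.setProperty(HIVE_URL.key(), "jdbc:hive2://localhost:10000");

    // The (Properties, Configuration) constructor replaces the deprecated
    // (TypedProperties, Configuration, FileSystem) variant.
    new HiveSyncTool(props, new Configuration()).syncHoodieTable();
  }
}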

View File

@@ -114,6 +114,8 @@ import static org.apache.hudi.config.HoodieCompactionConfig.INLINE_COMPACT;
import static org.apache.hudi.config.HoodieWriteConfig.AUTO_COMMIT_ENABLE; import static org.apache.hudi.config.HoodieWriteConfig.AUTO_COMMIT_ENABLE;
import static org.apache.hudi.config.HoodieWriteConfig.COMBINE_BEFORE_INSERT; import static org.apache.hudi.config.HoodieWriteConfig.COMBINE_BEFORE_INSERT;
import static org.apache.hudi.config.HoodieWriteConfig.COMBINE_BEFORE_UPSERT; import static org.apache.hudi.config.HoodieWriteConfig.COMBINE_BEFORE_UPSERT;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_BUCKET_SYNC;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_BUCKET_SYNC_SPEC;
import static org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer.CHECKPOINT_KEY; import static org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer.CHECKPOINT_KEY;
import static org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer.CHECKPOINT_RESET_KEY; import static org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer.CHECKPOINT_RESET_KEY;
import static org.apache.hudi.utilities.schema.RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE; import static org.apache.hudi.utilities.schema.RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE;
@@ -629,7 +631,7 @@ public class DeltaSync implements Serializable {
} }
if (!isEmpty) { if (!isEmpty) {
syncMeta(metrics); runMetaSync();
} }
} else { } else {
LOG.info("Commit " + instantTime + " failed!"); LOG.info("Commit " + instantTime + " failed!");
@@ -690,7 +692,7 @@ public class DeltaSync implements Serializable {
return syncClassName.substring(syncClassName.lastIndexOf(".") + 1); return syncClassName.substring(syncClassName.lastIndexOf(".") + 1);
} }
private void syncMeta(HoodieDeltaStreamerMetrics metrics) { public void runMetaSync() {
Set<String> syncClientToolClasses = new HashSet<>(Arrays.asList(cfg.syncClientToolClassNames.split(","))); Set<String> syncClientToolClasses = new HashSet<>(Arrays.asList(cfg.syncClientToolClassNames.split(",")));
// for backward compatibility // for backward compatibility
if (cfg.enableHiveSync) { if (cfg.enableHiveSync) {
@@ -703,8 +705,8 @@ public class DeltaSync implements Serializable {
TypedProperties metaProps = new TypedProperties(); TypedProperties metaProps = new TypedProperties();
metaProps.putAll(props); metaProps.putAll(props);
if (props.getBoolean(HiveSyncConfig.HIVE_SYNC_BUCKET_SYNC.key(), HiveSyncConfig.HIVE_SYNC_BUCKET_SYNC.defaultValue())) { if (props.getBoolean(HIVE_SYNC_BUCKET_SYNC.key(), HIVE_SYNC_BUCKET_SYNC.defaultValue())) {
metaProps.put(HiveSyncConfig.HIVE_SYNC_BUCKET_SYNC_SPEC.key(), HiveSyncConfig.getBucketSpec(props.getString(HoodieIndexConfig.BUCKET_INDEX_HASH_FIELD.key()), metaProps.put(HIVE_SYNC_BUCKET_SYNC_SPEC.key(), HiveSyncConfig.getBucketSpec(props.getString(HoodieIndexConfig.BUCKET_INDEX_HASH_FIELD.key()),
props.getInteger(HoodieIndexConfig.BUCKET_INDEX_NUM_BUCKETS.key()))); props.getInteger(HoodieIndexConfig.BUCKET_INDEX_NUM_BUCKETS.key())));
} }

View File

@@ -18,16 +18,15 @@
package org.apache.hudi.utilities; package org.apache.hudi.utilities;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.hive.HiveSyncConfig; import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.hudi.hive.HiveSyncTool; import org.apache.hudi.hive.HiveSyncTool;
import org.apache.hudi.hive.HoodieHiveClient; import org.apache.hudi.hive.HoodieHiveSyncClient;
import org.apache.hudi.hive.testutils.HiveTestUtil; import org.apache.hudi.hive.testutils.HiveTestUtil;
import org.apache.hudi.sync.common.HoodieSyncConfig;
import org.apache.hudi.utilities.exception.HoodieIncrementalPullSQLException; import org.apache.hudi.utilities.exception.HoodieIncrementalPullSQLException;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
@@ -40,8 +39,14 @@ import java.nio.file.Files;
import java.nio.file.Paths; import java.nio.file.Paths;
import java.time.Instant; import java.time.Instant;
import static org.apache.hudi.hive.testutils.HiveTestUtil.fileSystem; import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_PASS;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_MODE;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_URL;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USER;
import static org.apache.hudi.hive.testutils.HiveTestUtil.hiveSyncProps; import static org.apache.hudi.hive.testutils.HiveTestUtil.hiveSyncProps;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME;
import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -72,12 +77,12 @@ public class TestHiveIncrementalPuller {
} }
private HiveIncrementalPuller.Config getHivePullerConfig(String incrementalSql) throws IOException { private HiveIncrementalPuller.Config getHivePullerConfig(String incrementalSql) throws IOException {
config.hiveJDBCUrl = hiveSyncProps.getString(HiveSyncConfig.HIVE_URL.key()); config.hiveJDBCUrl = hiveSyncProps.getString(HIVE_URL.key());
config.hiveUsername = hiveSyncProps.getString(HiveSyncConfig.HIVE_USER.key()); config.hiveUsername = hiveSyncProps.getString(HIVE_USER.key());
config.hivePassword = hiveSyncProps.getString(HiveSyncConfig.HIVE_PASS.key()); config.hivePassword = hiveSyncProps.getString(HIVE_PASS.key());
config.hoodieTmpDir = Files.createTempDirectory("hivePullerTest").toUri().toString(); config.hoodieTmpDir = Files.createTempDirectory("hivePullerTest").toUri().toString();
config.sourceDb = hiveSyncProps.getString(HoodieSyncConfig.META_SYNC_DATABASE_NAME.key()); config.sourceDb = hiveSyncProps.getString(META_SYNC_DATABASE_NAME.key());
config.sourceTable = hiveSyncProps.getString(HoodieSyncConfig.META_SYNC_TABLE_NAME.key()); config.sourceTable = hiveSyncProps.getString(META_SYNC_TABLE_NAME.key());
config.targetDb = "tgtdb"; config.targetDb = "tgtdb";
config.targetTable = "test2"; config.targetTable = "test2";
config.tmpDb = "tmp_db"; config.tmpDb = "tmp_db";
@@ -101,9 +106,8 @@ public class TestHiveIncrementalPuller {
private void createSourceTable() throws IOException, URISyntaxException { private void createSourceTable() throws IOException, URISyntaxException {
String instantTime = "101"; String instantTime = "101";
HiveTestUtil.createCOWTable(instantTime, 5, true); HiveTestUtil.createCOWTable(instantTime, 5, true);
hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), "jdbc"); hiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), "jdbc");
HiveSyncTool tool = new HiveSyncTool(hiveSyncProps, HiveTestUtil.getHiveConf());
HiveSyncTool tool = new HiveSyncTool(hiveSyncProps, HiveTestUtil.getHiveConf(), fileSystem);
tool.syncHoodieTable(); tool.syncHoodieTable();
} }
@@ -112,23 +116,23 @@ public class TestHiveIncrementalPuller {
targetBasePath = Files.createTempDirectory("hivesynctest1" + Instant.now().toEpochMilli()).toUri().toString(); targetBasePath = Files.createTempDirectory("hivesynctest1" + Instant.now().toEpochMilli()).toUri().toString();
HiveTestUtil.createCOWTable(instantTime, 5, true, HiveTestUtil.createCOWTable(instantTime, 5, true,
targetBasePath, "tgtdb", "test2"); targetBasePath, "tgtdb", "test2");
HiveSyncTool tool = new HiveSyncTool(getTargetHiveSyncConfig(targetBasePath), HiveTestUtil.getHiveConf(), fileSystem); HiveSyncTool tool = new HiveSyncTool(getTargetHiveSyncConfig(targetBasePath), HiveTestUtil.getHiveConf());
tool.syncHoodieTable(); tool.syncHoodieTable();
} }
private TypedProperties getTargetHiveSyncConfig(String basePath) { private TypedProperties getTargetHiveSyncConfig(String basePath) {
TypedProperties targetHiveSyncProps = new TypedProperties(hiveSyncProps); TypedProperties targetHiveSyncProps = new TypedProperties(hiveSyncProps);
targetHiveSyncProps.setProperty(HoodieSyncConfig.META_SYNC_DATABASE_NAME.key(), "tgtdb"); targetHiveSyncProps.setProperty(META_SYNC_DATABASE_NAME.key(), "tgtdb");
targetHiveSyncProps.setProperty(HoodieSyncConfig.META_SYNC_TABLE_NAME.key(), "test2"); targetHiveSyncProps.setProperty(META_SYNC_TABLE_NAME.key(), "test2");
targetHiveSyncProps.setProperty(HoodieSyncConfig.META_SYNC_BASE_PATH.key(), basePath); targetHiveSyncProps.setProperty(META_SYNC_BASE_PATH.key(), basePath);
targetHiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), "jdbc"); targetHiveSyncProps.setProperty(HIVE_SYNC_MODE.key(), "jdbc");
return targetHiveSyncProps; return targetHiveSyncProps;
} }
private TypedProperties getAssertionSyncConfig(String databaseName) { private TypedProperties getAssertionSyncConfig(String databaseName) {
TypedProperties assertHiveSyncProps = new TypedProperties(hiveSyncProps); TypedProperties assertHiveSyncProps = new TypedProperties(hiveSyncProps);
assertHiveSyncProps.setProperty(HoodieSyncConfig.META_SYNC_DATABASE_NAME.key(), databaseName); assertHiveSyncProps.setProperty(META_SYNC_DATABASE_NAME.key(), databaseName);
return assertHiveSyncProps; return assertHiveSyncProps;
} }
@@ -161,11 +165,11 @@ public class TestHiveIncrementalPuller {
public void testPuller() throws IOException, URISyntaxException { public void testPuller() throws IOException, URISyntaxException {
createTables(); createTables();
HiveIncrementalPuller.Config cfg = getHivePullerConfig("select name from testdb.test1 where `_hoodie_commit_time` > '%s'"); HiveIncrementalPuller.Config cfg = getHivePullerConfig("select name from testdb.test1 where `_hoodie_commit_time` > '%s'");
HoodieHiveClient hiveClient = new HoodieHiveClient(new HiveSyncConfig(hiveSyncProps), HiveTestUtil.getHiveConf(), fileSystem); HoodieHiveSyncClient hiveClient = new HoodieHiveSyncClient(new HiveSyncConfig(hiveSyncProps, HiveTestUtil.getHiveConf()));
hiveClient.createDatabase(cfg.tmpDb); hiveClient.createDatabase(cfg.tmpDb);
HiveIncrementalPuller puller = new HiveIncrementalPuller(cfg); HiveIncrementalPuller puller = new HiveIncrementalPuller(cfg);
puller.saveDelta(); puller.saveDelta();
HoodieHiveClient assertingClient = new HoodieHiveClient(new HiveSyncConfig(getAssertionSyncConfig(cfg.tmpDb)), HiveTestUtil.getHiveConf(), fileSystem); HoodieHiveSyncClient assertingClient = new HoodieHiveSyncClient(new HiveSyncConfig(getAssertionSyncConfig(cfg.tmpDb), HiveTestUtil.getHiveConf()));
String tmpTable = cfg.targetTable + "__" + cfg.sourceTable; String tmpTable = cfg.targetTable + "__" + cfg.sourceTable;
assertTrue(assertingClient.tableExists(tmpTable)); assertTrue(assertingClient.tableExists(tmpTable));
} }

View File

@@ -27,7 +27,6 @@ import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.Option;
import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.hudi.hive.MultiPartKeysValueExtractor; import org.apache.hudi.hive.MultiPartKeysValueExtractor;
import org.apache.hudi.utilities.schema.FilebasedSchemaProvider; import org.apache.hudi.utilities.schema.FilebasedSchemaProvider;
import org.apache.hudi.utilities.sources.TestDataSource; import org.apache.hudi.utilities.sources.TestDataSource;
@@ -48,6 +47,13 @@ import java.util.Collections;
import java.util.Map; import java.util.Map;
import java.util.Random; import java.util.Random;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_URL;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_ASSUME_DATE_PARTITION;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME;
public class HoodieDeltaStreamerTestBase extends UtilitiesTestBase { public class HoodieDeltaStreamerTestBase extends UtilitiesTestBase {
@@ -180,11 +186,11 @@ public class HoodieDeltaStreamerTestBase extends UtilitiesTestBase {
props.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/target.avsc"); props.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/target.avsc");
// Hive Configs // Hive Configs
props.setProperty(HiveSyncConfig.HIVE_URL.key(), "jdbc:hive2://127.0.0.1:9999/"); props.setProperty(HIVE_URL.key(), "jdbc:hive2://127.0.0.1:9999/");
props.setProperty(HiveSyncConfig.META_SYNC_DATABASE_NAME.key(), "testdb1"); props.setProperty(META_SYNC_DATABASE_NAME.key(), "testdb1");
props.setProperty(HiveSyncConfig.META_SYNC_TABLE_NAME.key(), "hive_trips"); props.setProperty(META_SYNC_TABLE_NAME.key(), "hive_trips");
props.setProperty(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "datestr"); props.setProperty(META_SYNC_PARTITION_FIELDS.key(), "datestr");
props.setProperty(HiveSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), props.setProperty(META_SYNC_PARTITION_EXTRACTOR_CLASS.key(),
MultiPartKeysValueExtractor.class.getName()); MultiPartKeysValueExtractor.class.getName());
UtilitiesTestBase.Helpers.savePropsToDFS(props, dfs, dfsBasePath + "/" + PROPS_FILENAME_TEST_SOURCE); UtilitiesTestBase.Helpers.savePropsToDFS(props, dfs, dfsBasePath + "/" + PROPS_FILENAME_TEST_SOURCE);
} }
@@ -240,11 +246,11 @@ public class HoodieDeltaStreamerTestBase extends UtilitiesTestBase {
protected static void populateCommonHiveProps(TypedProperties props) { protected static void populateCommonHiveProps(TypedProperties props) {
// Hive Configs // Hive Configs
props.setProperty(HiveSyncConfig.HIVE_URL.key(), "jdbc:hive2://127.0.0.1:9999/"); props.setProperty(HIVE_URL.key(), "jdbc:hive2://127.0.0.1:9999/");
props.setProperty(HiveSyncConfig.META_SYNC_DATABASE_NAME.key(), "testdb2"); props.setProperty(META_SYNC_DATABASE_NAME.key(), "testdb2");
props.setProperty(HiveSyncConfig.META_SYNC_ASSUME_DATE_PARTITION.key(), "false"); props.setProperty(META_SYNC_ASSUME_DATE_PARTITION.key(), "false");
props.setProperty(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "datestr"); props.setProperty(META_SYNC_PARTITION_FIELDS.key(), "datestr");
props.setProperty(HiveSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), props.setProperty(META_SYNC_PARTITION_EXTRACTOR_CLASS.key(),
MultiPartKeysValueExtractor.class.getName()); MultiPartKeysValueExtractor.class.getName());
} }

View File

@@ -43,7 +43,6 @@ import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
import org.apache.hudi.common.testutils.HoodieTestUtils; import org.apache.hudi.common.testutils.HoodieTestUtils;
import org.apache.hudi.common.util.CollectionUtils;
import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.config.HoodieClusteringConfig; import org.apache.hudi.config.HoodieClusteringConfig;
@@ -53,7 +52,7 @@ import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.TableNotFoundException; import org.apache.hudi.exception.TableNotFoundException;
import org.apache.hudi.hive.HiveSyncConfig; import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.hudi.hive.HoodieHiveClient; import org.apache.hudi.hive.HoodieHiveSyncClient;
import org.apache.hudi.keygen.SimpleKeyGenerator; import org.apache.hudi.keygen.SimpleKeyGenerator;
import org.apache.hudi.utilities.DummySchemaProvider; import org.apache.hudi.utilities.DummySchemaProvider;
import org.apache.hudi.utilities.HoodieClusteringJob; import org.apache.hudi.utilities.HoodieClusteringJob;
@@ -133,6 +132,8 @@ import java.util.function.Function;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream; import java.util.stream.Stream;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME;
import static org.apache.hudi.utilities.UtilHelpers.EXECUTE; import static org.apache.hudi.utilities.UtilHelpers.EXECUTE;
import static org.apache.hudi.utilities.UtilHelpers.SCHEDULE; import static org.apache.hudi.utilities.UtilHelpers.SCHEDULE;
import static org.apache.hudi.utilities.UtilHelpers.SCHEDULE_AND_EXECUTE; import static org.apache.hudi.utilities.UtilHelpers.SCHEDULE_AND_EXECUTE;
@@ -1355,13 +1356,15 @@ public class TestHoodieDeltaStreamer extends HoodieDeltaStreamerTestBase {
// Test Hive integration // Test Hive integration
HiveSyncConfig hiveSyncConfig = getHiveSyncConfig(tableBasePath, "hive_trips"); HiveSyncConfig hiveSyncConfig = getHiveSyncConfig(tableBasePath, "hive_trips");
hiveSyncConfig.partitionFields = CollectionUtils.createImmutableList("year", "month", "day"); hiveSyncConfig.setValue(META_SYNC_PARTITION_FIELDS, "year,month,day");
HoodieHiveClient hiveClient = new HoodieHiveClient(hiveSyncConfig, hiveServer.getHiveConf(), dfs); hiveSyncConfig.setHadoopConf(hiveServer.getHiveConf());
assertTrue(hiveClient.tableExists(hiveSyncConfig.tableName), "Table " + hiveSyncConfig.tableName + " should exist"); HoodieHiveSyncClient hiveClient = new HoodieHiveSyncClient(hiveSyncConfig);
assertEquals(3, hiveClient.getAllPartitions(hiveSyncConfig.tableName).size(), final String tableName = hiveSyncConfig.getString(META_SYNC_TABLE_NAME);
assertTrue(hiveClient.tableExists(tableName), "Table " + tableName + " should exist");
assertEquals(3, hiveClient.getAllPartitions(tableName).size(),
"Table partitions should match the number of partitions we wrote"); "Table partitions should match the number of partitions we wrote");
assertEquals(lastInstantForUpstreamTable, assertEquals(lastInstantForUpstreamTable,
hiveClient.getLastCommitTimeSynced(hiveSyncConfig.tableName).get(), hiveClient.getLastCommitTimeSynced(tableName).get(),
"The last commit that was synced should be updated in the TBLPROPERTIES"); "The last commit that was synced should be updated in the TBLPROPERTIES");
} }

View File

@@ -30,7 +30,6 @@ import org.apache.hudi.common.testutils.RawTripTestPayload;
import org.apache.hudi.common.testutils.minicluster.HdfsTestService; import org.apache.hudi.common.testutils.minicluster.HdfsTestService;
import org.apache.hudi.common.testutils.minicluster.ZookeeperTestService; import org.apache.hudi.common.testutils.minicluster.ZookeeperTestService;
import org.apache.hudi.common.util.AvroOrcUtils; import org.apache.hudi.common.util.AvroOrcUtils;
import org.apache.hudi.common.util.CollectionUtils;
import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.Option;
import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.hive.HiveSyncConfig; import org.apache.hudi.hive.HiveSyncConfig;
@@ -86,6 +85,17 @@ import java.io.PrintStream;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.List; import java.util.List;
import java.util.Properties;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_PASS;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_URL;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USER;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USE_PRE_APACHE_INPUT_FORMAT;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_ASSUME_DATE_PARTITION;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME;
/** /**
* Abstract test that provides a dfs & spark contexts. * Abstract test that provides a dfs & spark contexts.
@@ -183,17 +193,17 @@ public class UtilitiesTestBase {
* @return * @return
*/ */
protected static HiveSyncConfig getHiveSyncConfig(String basePath, String tableName) { protected static HiveSyncConfig getHiveSyncConfig(String basePath, String tableName) {
HiveSyncConfig hiveSyncConfig = new HiveSyncConfig(); Properties props = new Properties();
hiveSyncConfig.jdbcUrl = "jdbc:hive2://127.0.0.1:9999/"; props.setProperty(HIVE_URL.key(),"jdbc:hive2://127.0.0.1:9999/");
hiveSyncConfig.hiveUser = ""; props.setProperty(HIVE_USER.key(), "");
hiveSyncConfig.hivePass = ""; props.setProperty(HIVE_PASS.key(), "");
hiveSyncConfig.databaseName = "testdb1"; props.setProperty(META_SYNC_DATABASE_NAME.key(), "testdb1");
hiveSyncConfig.tableName = tableName; props.setProperty(META_SYNC_TABLE_NAME.key(), tableName);
hiveSyncConfig.basePath = basePath; props.setProperty(META_SYNC_BASE_PATH.key(), basePath);
hiveSyncConfig.assumeDatePartitioning = false; props.setProperty(META_SYNC_ASSUME_DATE_PARTITION.key(), "false");
hiveSyncConfig.usePreApacheInputFormat = false; props.setProperty(HIVE_USE_PRE_APACHE_INPUT_FORMAT.key(), "false");
hiveSyncConfig.partitionFields = CollectionUtils.createImmutableList("datestr"); props.setProperty(META_SYNC_PARTITION_FIELDS.key(), "datestr");
return hiveSyncConfig; return new HiveSyncConfig(props);
} }
/** /**
@@ -206,14 +216,15 @@ public class UtilitiesTestBase {
// Create Dummy hive sync config // Create Dummy hive sync config
HiveSyncConfig hiveSyncConfig = getHiveSyncConfig("/dummy", "dummy"); HiveSyncConfig hiveSyncConfig = getHiveSyncConfig("/dummy", "dummy");
hiveConf.addResource(hiveServer.getHiveConf()); hiveConf.addResource(hiveServer.getHiveConf());
hiveSyncConfig.setHadoopConf(hiveConf);
HoodieTableMetaClient.withPropertyBuilder() HoodieTableMetaClient.withPropertyBuilder()
.setTableType(HoodieTableType.COPY_ON_WRITE) .setTableType(HoodieTableType.COPY_ON_WRITE)
.setTableName(hiveSyncConfig.tableName) .setTableName(hiveSyncConfig.getString(META_SYNC_TABLE_NAME))
.initTable(dfs.getConf(), hiveSyncConfig.basePath); .initTable(dfs.getConf(), hiveSyncConfig.getString(META_SYNC_BASE_PATH));
QueryBasedDDLExecutor ddlExecutor = new JDBCExecutor(hiveSyncConfig, dfs); QueryBasedDDLExecutor ddlExecutor = new JDBCExecutor(hiveSyncConfig);
ddlExecutor.runSQL("drop database if exists " + hiveSyncConfig.databaseName); ddlExecutor.runSQL("drop database if exists " + hiveSyncConfig.getString(META_SYNC_DATABASE_NAME));
ddlExecutor.runSQL("create database " + hiveSyncConfig.databaseName); ddlExecutor.runSQL("create database " + hiveSyncConfig.getString(META_SYNC_DATABASE_NAME));
ddlExecutor.close(); ddlExecutor.close();
} }
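Both construction styles appear in this hunk: HiveSyncConfig(Properties) followed by setHadoopConf, and the two-argument HiveSyncConfig(Properties, Configuration) used elsewhere in the commit. A sketch contrasting them:

import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.hive.HiveSyncConfig;

import java.util.Properties;

public class HiveSyncConfigExample {
  public static void main(String[] args) {
    Properties props = new Properties();
    Configuration hadoopConf = new Configuration();

    // One-step construction.
    HiveSyncConfig a = new HiveSyncConfig(props, hadoopConf);

    // Two-step construction, as in getHiveSyncConfig above.
    HiveSyncConfig b = new HiveSyncConfig(props);
    b.setHadoopConf(hadoopConf);
  }
}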

View File

@@ -279,7 +279,7 @@
<artifactId>hudi-aws</artifactId> <artifactId>hudi-aws</artifactId>
<version>${project.version}</version> <version>${project.version}</version>
</dependency> </dependency>
<!-- Need parquet and avro to run AwsGlueCatalogSyncTool using run_sync_tool with this bundle. <!-- Need parquet and avro to run AWSGlueCatalogSyncTool using run_sync_tool with this bundle.
Parquet and avro from other packages have already been shaded above--> Parquet and avro from other packages have already been shaded above-->
<dependency> <dependency>
<groupId>org.apache.parquet</groupId> <groupId>org.apache.parquet</groupId>