diff --git a/hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java b/hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java
index cbc5b030c..7983aab90 100644
--- a/hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java
+++ b/hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java
@@ -254,6 +254,8 @@ public class DataSourceUtils {
         SlashEncodedDayPartitionValueExtractor.class.getName());
     hiveSyncConfig.useJdbc = Boolean.valueOf(props.getString(DataSourceWriteOptions.HIVE_USE_JDBC_OPT_KEY(),
         DataSourceWriteOptions.DEFAULT_HIVE_USE_JDBC_OPT_VAL()));
+    hiveSyncConfig.autoCreateDatabase = Boolean.valueOf(props.getString(DataSourceWriteOptions.HIVE_AUTO_CREATE_DATABASE_OPT_KEY(),
+        DataSourceWriteOptions.DEFAULT_HIVE_AUTO_CREATE_DATABASE_OPT_KEY()));
     hiveSyncConfig.skipROSuffix = Boolean.valueOf(props.getString(DataSourceWriteOptions.HIVE_SKIP_RO_SUFFIX(),
         DataSourceWriteOptions.DEFAULT_HIVE_SKIP_RO_SUFFIX_VAL()));
     return hiveSyncConfig;
diff --git a/hudi-spark/src/main/scala/org/apache/hudi/DataSourceOptions.scala b/hudi-spark/src/main/scala/org/apache/hudi/DataSourceOptions.scala
index 34d5dd2e3..e10bb677a 100644
--- a/hudi-spark/src/main/scala/org/apache/hudi/DataSourceOptions.scala
+++ b/hudi-spark/src/main/scala/org/apache/hudi/DataSourceOptions.scala
@@ -290,6 +290,7 @@ object DataSourceWriteOptions {
   val HIVE_ASSUME_DATE_PARTITION_OPT_KEY = "hoodie.datasource.hive_sync.assume_date_partitioning"
   val HIVE_USE_PRE_APACHE_INPUT_FORMAT_OPT_KEY = "hoodie.datasource.hive_sync.use_pre_apache_input_format"
   val HIVE_USE_JDBC_OPT_KEY = "hoodie.datasource.hive_sync.use_jdbc"
+  val HIVE_AUTO_CREATE_DATABASE_OPT_KEY = "hoodie.datasource.hive_sync.auto_create_database"
   val HIVE_SKIP_RO_SUFFIX = "hoodie.datasource.hive_sync.skip_ro_suffix"
 
   // DEFAULT FOR HIVE SPECIFIC CONFIGS
@@ -306,6 +307,7 @@ object DataSourceWriteOptions {
   val DEFAULT_HIVE_ASSUME_DATE_PARTITION_OPT_VAL = "false"
   val DEFAULT_USE_PRE_APACHE_INPUT_FORMAT_OPT_VAL = "false"
   val DEFAULT_HIVE_USE_JDBC_OPT_VAL = "true"
+  val DEFAULT_HIVE_AUTO_CREATE_DATABASE_OPT_KEY = "true"
   val DEFAULT_HIVE_SKIP_RO_SUFFIX_VAL = "false"
 
   // Async Compaction - Enabled by default for MOR
diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfig.java b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfig.java
index 00a35aae6..c861a53b1 100644
--- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfig.java
+++ b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfig.java
@@ -71,6 +71,9 @@ public class HiveSyncConfig implements Serializable {
   @Parameter(names = {"--use-jdbc"}, description = "Hive jdbc connect url")
   public Boolean useJdbc = true;
 
+  @Parameter(names = {"--auto-create-database"}, description = "Auto create hive database")
+  public Boolean autoCreateDatabase = true;
+
   @Parameter(names = {"--skip-ro-suffix"}, description = "Skip the `_ro` suffix for Read optimized table, when registering")
   public Boolean skipROSuffix = false;
 
diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncTool.java b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncTool.java
index a3b524fea..a3c94dcb1 100644
--- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncTool.java
+++ b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncTool.java
@@ -117,11 +117,17 @@ public class HiveSyncTool extends AbstractSyncTool {
     boolean tableExists = hoodieHiveClient.doesTableExist(tableName);
 
     // check if the database exists else create it
-    try {
-      hoodieHiveClient.updateHiveSQL("create database if not exists " + cfg.databaseName);
-    } catch (Exception e) {
-      // this is harmless since table creation will fail anyways, creation of DB is needed for in-memory testing
-      LOG.warn("Unable to create database", e);
+    if (cfg.autoCreateDatabase) {
+      try {
+        hoodieHiveClient.updateHiveSQL("create database if not exists " + cfg.databaseName);
+      } catch (Exception e) {
+        // this is harmless since table creation will fail anyways, creation of DB is needed for in-memory testing
+        LOG.warn("Unable to create database", e);
+      }
+    } else {
+      if (!hoodieHiveClient.doesDataBaseExist(cfg.databaseName)) {
+        throw new HoodieHiveSyncException("hive database does not exist " + cfg.databaseName);
+      }
     }
 
     // Get the parquet schema for this table looking at the latest commit
diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HoodieHiveClient.java b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HoodieHiveClient.java
index 5a4b72a1c..eb8b867c3 100644
--- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HoodieHiveClient.java
+++ b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HoodieHiveClient.java
@@ -18,6 +18,11 @@
 
 package org.apache.hudi.hive;
 
+import org.apache.hadoop.hive.metastore.api.FieldSchema;
+import org.apache.hadoop.hive.metastore.api.MetaException;
+import org.apache.hadoop.hive.metastore.api.Partition;
+import org.apache.hadoop.hive.metastore.api.Table;
+import org.apache.hadoop.hive.metastore.api.Database;
 import org.apache.hudi.common.fs.FSUtils;
 import org.apache.hudi.common.fs.StorageSchemes;
 import org.apache.hudi.common.table.timeline.HoodieTimeline;
@@ -29,10 +34,6 @@ import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.metastore.IMetaStoreClient;
-import org.apache.hadoop.hive.metastore.api.FieldSchema;
-import org.apache.hadoop.hive.metastore.api.MetaException;
-import org.apache.hadoop.hive.metastore.api.Partition;
-import org.apache.hadoop.hive.metastore.api.Table;
 import org.apache.hadoop.hive.ql.metadata.Hive;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
 import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse;
@@ -336,6 +337,22 @@ public class HoodieHiveClient extends AbstractSyncHoodieClient {
     }
   }
 
+  /**
+   * @param databaseName
+   * @return true if the configured database exists
+   */
+  public boolean doesDataBaseExist(String databaseName) {
+    try {
+      Database database = client.getDatabase(databaseName);
+      if (database != null && databaseName.equals(database.getName())) {
+        return true;
+      }
+    } catch (TException e) {
+      throw new HoodieHiveSyncException("Failed to check if database exists " + databaseName, e);
+    }
+    return false;
+  }
+
   /**
    * Execute a update in hive metastore with this SQL.
    *
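A minimal usage sketch for the option added above, from the Spark datasource side (illustrative only: the DataFrame, table name, database, base path, and the other Hive sync options shown are assumptions, not part of this change; only hoodie.datasource.hive_sync.auto_create_database / --auto-create-database is introduced here):

    // Sync into a pre-existing Hive database and fail fast if it is missing,
    // instead of issuing "create database if not exists" automatically.
    import org.apache.hudi.DataSourceWriteOptions
    import org.apache.spark.sql.SaveMode

    df.write.format("hudi")                                                        // df: an existing DataFrame (assumed)
      .option("hoodie.table.name", "my_table")                                     // assumed table name
      .option(DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY, "true")
      .option(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY, "existing_db")         // assumed database; must already exist
      .option(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY, "my_table")
      .option(DataSourceWriteOptions.HIVE_AUTO_CREATE_DATABASE_OPT_KEY, "false")   // option added by this patch
      .mode(SaveMode.Append)
      .save("/tmp/hudi/my_table")                                                  // assumed base path

With the default value "true" the existing behavior is unchanged; setting it to "false" makes HiveSyncTool call doesDataBaseExist and throw HoodieHiveSyncException when the database is absent, rather than creating it.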