Adding HiveSyncTool to sync hoodie dataset schema/partitions to Hive

- Designed to be run by your workflow manager after a hoodie upsert
- Assumes JDBC connectivity via HiveServer2, which should work with all major distros
2017-04-03 14:44:43 -07:00
parent 2b6322318c
commit 542d622e49
11 changed files with 313 additions and 114 deletions
--- a/hoodie-hive/src/main/java/com/uber/hoodie/hive/HiveSyncTool.java
+++ b/hoodie-hive/src/main/java/com/uber/hoodie/hive/HiveSyncTool.java
@@ -0,0 +1,82 @@
+/*
+ *  Copyright (c) 2017 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *           http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *
+ */
+
+package com.uber.hoodie.hive;
+
+import com.beust.jcommander.JCommander;
+import com.uber.hoodie.hive.impl.DayBasedPartitionStrategy;
+import com.uber.hoodie.hive.impl.ParseSchemaFromDataStrategy;
+import com.uber.hoodie.hive.model.HoodieDatasetReference;
+
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * Tool to sync new data from commits, into Hive in terms of
+ *
+ *  - New table/partitions
+ *  - Updated schema for table/partitions
+ */
+public class HiveSyncTool {
+
+
+    /**
+     * Sync to Hive, based on day based partitioning
+     *
+     * @param cfg
+     */
+    public static void sync(HiveSyncConfig cfg) {
+        // Configure to point to which metastore and database to connect to
+        HoodieHiveConfiguration apiConfig =
+                HoodieHiveConfiguration.newBuilder().hadoopConfiguration(new Configuration())
+                        .hivedb(cfg.databaseName)
+                        .hiveJdbcUrl(cfg.jdbcUrl)
+                        .jdbcUsername(cfg.hiveUser)
+                        .jdbcPassword(cfg.hivePass)
+                        .build();
+
+        HoodieDatasetReference datasetReference =
+                new HoodieDatasetReference(cfg.tableName, cfg.basePath, cfg.databaseName);
+
+        // initialize the strategies
+        PartitionStrategy partitionStrategy = new DayBasedPartitionStrategy();
+        SchemaStrategy schemaStrategy = new ParseSchemaFromDataStrategy();
+
+        // Creates a new dataset which reflects the state at the time of creation
+        HoodieHiveDatasetSyncTask datasetSyncTask =
+                HoodieHiveDatasetSyncTask.newBuilder().withReference(datasetReference)
+                        .withConfiguration(apiConfig).partitionStrategy(partitionStrategy)
+                        .schemaStrategy(schemaStrategy).build();
+
+        // Sync dataset
+        datasetSyncTask.sync();
+    }
+
+
+    public static void main(String[] args) throws Exception {
+
+        // parse the params
+        final HiveSyncConfig cfg = new HiveSyncConfig();
+        JCommander cmd = new JCommander(cfg, args);
+        if (cfg.help || args.length == 0) {
+            cmd.usage();
+            System.exit(1);
+        }
+
+        sync(cfg);
+    }
+}