From fbff0799b999c5cc49d5a36fd2538f9bf5c6c60d Mon Sep 17 00:00:00 2001
From: rmahindra123 <76502047+rmahindra123@users.noreply.github.com>
Date: Tue, 23 Nov 2021 15:48:06 -0800
Subject: [PATCH] [HUDI-2325] Add hive sync support to kafka connect (#3660)

Co-authored-by: Rajesh Mahindra <rmahindra@Rajeshs-MacBook-Pro.local>
---
 hudi-kafka-connect/README.md                  | 114 ++++++++++---
 hudi-kafka-connect/demo/config-sink.json      |  19 ++-
 .../demo/connect-distributed.properties       |   2 +-
 hudi-kafka-connect/demo/setupKafka.sh         |  11 +-
 hudi-kafka-connect/pom.xml                    |  20 ++-
 .../hudi/connect/utils/KafkaConnectUtils.java |  12 +-
 .../connect/writers/KafkaConnectConfigs.java  |   2 -
 .../KafkaConnectTransactionServices.java      |  57 ++++++-
 .../writers/KafkaConnectWriterProvider.java   |   2 +-
 .../test/resources/log4j-surefire.properties  |  32 ----
 packaging/hudi-kafka-connect-bundle/pom.xml   | 161 ++++++++++++++++--
 11 files changed, 344 insertions(+), 88 deletions(-)
 delete mode 100644 hudi-kafka-connect/src/test/resources/log4j-surefire.properties

diff --git a/hudi-kafka-connect/README.md b/hudi-kafka-connect/README.md
index 1754eb825..33adb5b1d 100644
--- a/hudi-kafka-connect/README.md
+++ b/hudi-kafka-connect/README.md
@@ -27,20 +27,14 @@ The first thing you need to do to start using this connector is building it. In
 - [Java 1.8+](https://openjdk.java.net/)
 - [Apache Maven](https://maven.apache.org/)
 - Install [kcat](https://github.com/edenhill/kcat)
+= Install jq. `brew install jq`
 
 After installing these dependencies, execute the following commands. This will install all the Hudi dependency jars,
 including the fat packaged jar that contains all the dependencies required for a functional Hudi Kafka Connect Sink.
 
-```bash
-cd $HUDI_DIR
-mvn clean -DskipTests install
-```
-
-Henceforth, incremental builds can be performed as follows. 
 
 ```bash
-mvn clean -pl hudi-kafka-connect install -DskipTests
-mvn clean -pl packaging/hudi-kafka-connect-bundle install
+mvn package -DskipTests -pl packaging/hudi-kafka-connect-bundle -am
 ```
 
 Next, we need to make sure that the hudi sink connector bundle jar is in Kafka Connect classpath. Note that the connect
@@ -56,31 +50,58 @@ After building the package, we need to install the Apache Kafka
 
 ### 1 - Starting the environment
 
-To try out the Connect Sink locally, set up a Kafka broker locally. Download the latest apache kafka from https://kafka.apache.org/downloads.
-Once downloaded and built, run the Zookeeper server and Kafka server using the command line tools.
+For runtime dependencies, we encourage using the confluent HDFS connector jars. We have tested our setup with version `10.1.0`.
+After downloading the connector, copy the jars from the lib folder to the Kafka Connect classpath.
 
 ```bash
-export KAFKA_HOME=/path/to/kafka_install_dir
-cd $KAFKA_HOME
-./bin/zookeeper-server-start.sh ./config/zookeeper.properties
-./bin/kafka-server-start.sh ./config/server.properties
+confluent-hub install confluentinc/kafka-connect-hdfs:10.1.0
+```
+Add `confluentinc-kafka-connect-hdfs-10.1.0/lib` to the plugin.path (comma separated) in $HUDI_DIR/hudi-kafka-connect/demo/connect-distributed.properties
+
+### 2 - Set up the docker containers
+
+To run the connect locally, we need kafka, zookeeper, hdfs, hive etc. To make the setup easier, we use the docker 
+containers from the hudi docker demo. Refer to [this link for the setup](https://hudi.apache.org/docs/docker_demo)
+
+Essentially, follow the steps listed here:
+
+/etc/hosts : The demo references many services running in container by the hostname. Add the following settings to /etc/hosts
+```bash
+127.0.0.1 adhoc-1
+127.0.0.1 adhoc-2
+127.0.0.1 namenode
+127.0.0.1 datanode1
+127.0.0.1 hiveserver
+127.0.0.1 hivemetastore
+127.0.0.1 kafkabroker
+127.0.0.1 sparkmaster
+127.0.0.1 zookeeper
 ```
 
-Wait until the kafka cluster is up and running.
+Bring up the docker containers
+```bash
+cd $HUDI_DIR/docker
+./setup_demo.sh
+```
 
-### 2 - Set up the schema registry
+The schema registry and kafka connector can be run from host system directly (mac/ linux).
+
+### 3 - Set up the schema registry
 
 Hudi leverages schema registry to obtain the latest schema when writing records. While it supports most popular schema
 registries, we use Confluent schema registry. Download the
 latest [confluent platform](https://docs.confluent.io/platform/current/installation/index.html) and run the schema
 registry service.
 
+NOTE: You might need to change the port from `8081` to `8082`.
+
 ```bash
 cd $CONFLUENT_DIR
+/bin/kafka-configs --zookeeper localhost --entity-type topics --entity-name _schemas --alter --add-config cleanup.policy=compact
 ./bin/schema-registry-start etc/schema-registry/schema-registry.properties
 ```
 
-### 3 - Create the Hudi Control Topic for Coordination of the transactions
+### 4 - Create the Hudi Control Topic for Coordination of the transactions
 
 The control topic should only have `1` partition, since its used to coordinate the Hudi write transactions across the multiple Connect tasks.
 
@@ -90,7 +111,7 @@ cd $KAFKA_HOME
 ./bin/kafka-topics.sh --create --topic hudi-control-topic --partitions 1 --replication-factor 1 --bootstrap-server localhost:9092
 ```
 
-### 4 - Create the Hudi Topic for the Sink and insert data into the topic
+### 5 - Create the Hudi Topic for the Sink and insert data into the topic
 
 Open a terminal to execute the following command:
 
@@ -106,7 +127,7 @@ to generate, with each batch containing a number of messages and idle time betwe
 bash setupKafka.sh -n <num_kafka_messages_per_batch> -b <num_batches>
 ```
 
-### 5 - Run the Sink connector worker (multiple workers can be run)
+### 6 - Run the Sink connector worker (multiple workers can be run)
 
 The Kafka connect is a distributed platform, with the ability to run one or more workers (each running multiple tasks) 
 that parallely process the records from the Kafka partitions for the same topic. We provide a properties file with 
@@ -120,7 +141,7 @@ cd $KAFKA_HOME
 ./bin/connect-distributed.sh $HUDI_DIR/hudi-kafka-connect/demo/connect-distributed.properties
 ```
 
-### 6 - To add the Hudi Sink to the Connector (delete it if you want to re-configure)
+### 7 - To add the Hudi Sink to the Connector (delete it if you want to re-configure)
 
 Once the Connector has started, it will not run the Sink, until the Hudi sink is added using the web api. The following 
 curl APIs can be used to delete and add a new Hudi Sink. Again, a default configuration is provided for the Hudi Sink, 
@@ -144,8 +165,8 @@ Note: HUDI-2325 tracks Hive sync, which will unlock pretty much every other quer
 
 ```bash
 ls -a /tmp/hoodie/hudi-test-topic
-.		.hoodie		partition-1	partition-3
-..		partition-0	partition-2	partition-4
+.		.hoodie		partition_1	partition_3
+..		partition_0	partition_2	partition_4
 
 ls -lt /tmp/hoodie/hudi-test-topic/.hoodie
 total 72
@@ -160,7 +181,7 @@ total 72
 -rw-r--r--  1 user  wheel      0 Sep 13 21:41 20210913214114.commit.requested
 drwxr-xr-x  2 user  wheel     64 Sep 13 21:41 archived
 
-ls -l /tmp/hoodie/hudi-test-topic/partition-0
+ls -l /tmp/hoodie/hudi-test-topic/partition_0
 total 5168
 -rw-r--r--  1 user  wheel  439332 Sep 13 21:43 2E0E6DB44ACC8479059574A2C71C7A7E-0_0-0-0_20210913214114.parquet
 -rw-r--r--  1 user  wheel  440179 Sep 13 21:42 3B56FAAAE2BDD04E480C1CBACD463D3E-0_0-0-0_20210913214114.parquet
@@ -170,7 +191,52 @@ total 5168
 -rw-r--r--  1 user  wheel  440214 Sep 13 21:43 E200FA75DCD1CED60BE86BCE6BF5D23A-0_0-0-0_20210913214114.parquet
 ```
 
-### 7 - Run async compaction and clustering if scheduled
+### 8- Querying via Hive
+
+```bash
+docker exec -it adhoc-2 /bin/bash
+beeline -u jdbc:hive2://hiveserver:10000 \
+  --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat \
+  --hiveconf hive.stats.autogather=false
+
+
+# List Tables
+0: jdbc:hive2://hiveserver:10000> show tables;
++---------------------+--+
+|      tab_name       |
++---------------------+--+
+| huditesttopic_ro  |
+| huditesttopic_rt  |
++---------------------+--+
+3 rows selected (1.199 seconds)
+0: jdbc:hive2://hiveserver:10000>
+
+
+# Look at partitions that were added
+0: jdbc:hive2://hiveserver:10000> show partitions huditesttopic_rt;
++-------------------+--+
+|     partition     |
++-------------------+--+
+| date=partition_0  |
+| date=partition_1  |
+| date=partition_2  |
+| date=partition_3  |
+| date=partition_4  |
++-------------------+--+
+1 row selected (0.24 seconds)
+
+
+0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close  from huditesttopic_rt;
++----------------------+---------+----------------------+---------+------------+-----------+--+
+| _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
++----------------------+---------+----------------------+---------+------------+-----------+--+
+| 20180924222155       | GOOG    | 2018-08-31 09:59:00  | 6330    | 1230.5     | 1230.02   |
+| 20180924222155       | GOOG    | 2018-08-31 10:29:00  | 3391    | 1230.1899  | 1230.085  |
++----------------------+---------+----------------------+---------+------------+-----------+--+
+```
+
+
+### 9 - Run async compaction and clustering if scheduled
 
 When using Merge-On-Read (MOR) as the table type, async compaction and clustering can be scheduled when the Sink is
 running. Inline compaction and clustering are disabled by default due to performance reason. By default, async
diff --git a/hudi-kafka-connect/demo/config-sink.json b/hudi-kafka-connect/demo/config-sink.json
index 12a2f5053..bf7e99833 100644
--- a/hudi-kafka-connect/demo/config-sink.json
+++ b/hudi-kafka-connect/demo/config-sink.json
@@ -1,7 +1,7 @@
 {
     "name": "hudi-sink",
     "config": {
-		"bootstrap.servers": "localhost:9092",
+		"bootstrap.servers": "kafkabroker:9092",
 		"connector.class": "org.apache.hudi.connect.HoodieSinkConnector",
 		"tasks.max": "4",
 		"key.converter": "org.apache.kafka.connect.storage.StringConverter",
@@ -11,10 +11,21 @@
 		"hoodie.table.name": "hudi-test-topic",
 		"hoodie.table.type": "MERGE_ON_READ",
 		"hoodie.metadata.enable": "false",
-		"hoodie.base.path": "file:///tmp/hoodie/hudi-test-topic",
+		"hoodie.base.path": "hdfs://namenode:8020/user/hive/warehouse/hudi-test-topic",
 		"hoodie.datasource.write.recordkey.field": "volume",
 		"hoodie.datasource.write.partitionpath.field": "date",
 		"hoodie.schemaprovider.class": "org.apache.hudi.schema.SchemaRegistryProvider",
-		"hoodie.deltastreamer.schemaprovider.registry.url": "http://localhost:8081/subjects/hudi-test-topic/versions/latest"
-	}
+		"hoodie.deltastreamer.schemaprovider.registry.url": "http://localhost:8082/subjects/hudi-test-topic/versions/latest",
+		"hoodie.kafka.commit.interval.secs": 60,
+		"hoodie.meta.sync.enable": "true",
+		"hoodie.meta.sync.classes": "org.apache.hudi.hive.HiveSyncTool",
+		"hoodie.datasource.hive_sync.table": "huditesttopic",
+		"hoodie.datasource.hive_sync.partition_fields": "date",
+		"hoodie.datasource.hive_sync.partition_extractor_class": "org.apache.hudi.hive.MultiPartKeysValueExtractor",
+		"hoodie.datasource.hive_sync.use_jdbc": "false",
+		"hoodie.datasource.hive_sync.mode": "hms",
+		"dfs.client.use.datanode.hostname": "true",
+		"hive.metastore.uris": "thrift://hivemetastore:9083",
+		"hive.metastore.client.socket.timeout": "1500s"
+      }
 }
diff --git a/hudi-kafka-connect/demo/connect-distributed.properties b/hudi-kafka-connect/demo/connect-distributed.properties
index 9e3cec149..172e84789 100644
--- a/hudi-kafka-connect/demo/connect-distributed.properties
+++ b/hudi-kafka-connect/demo/connect-distributed.properties
@@ -15,7 +15,7 @@
 # limitations under the License.
 ##
 
-bootstrap.servers=localhost:9092
+bootstrap.servers=kafkabroker:9092
 group.id=hudi-connect-cluster
 key.converter=org.apache.kafka.connect.json.JsonConverter
 value.converter=org.apache.kafka.connect.json.JsonConverter
diff --git a/hudi-kafka-connect/demo/setupKafka.sh b/hudi-kafka-connect/demo/setupKafka.sh
index f4a88ee91..bc615a567 100755
--- a/hudi-kafka-connect/demo/setupKafka.sh
+++ b/hudi-kafka-connect/demo/setupKafka.sh
@@ -50,6 +50,7 @@ fi
 
 ## defaults
 rawDataFile=${HUDI_DIR}/docker/demo/data/batch_1.json
+kafkaBrokerHostname=kafkabroker
 kafkaTopicName=hudi-test-topic
 numKafkaPartitions=4
 recordKey=volume
@@ -115,23 +116,23 @@ done
 if [ $recreateTopic = "Y" ]; then
   # First delete the existing topic
   echo "Delete Kafka topic $kafkaTopicName ..."
-  ${KAFKA_HOME}/bin/kafka-topics.sh --delete --topic ${kafkaTopicName} --bootstrap-server localhost:9092
+  ${KAFKA_HOME}/bin/kafka-topics.sh --delete --topic ${kafkaTopicName} --bootstrap-server ${kafkaBrokerHostname}:9092
 
   # Create the topic with 4 partitions
   echo "Create Kafka topic $kafkaTopicName ..."
-  ${KAFKA_HOME}/bin/kafka-topics.sh --create --topic ${kafkaTopicName} --partitions $numKafkaPartitions --replication-factor 1 --bootstrap-server localhost:9092
+  ${KAFKA_HOME}/bin/kafka-topics.sh --create --topic ${kafkaTopicName} --partitions $numKafkaPartitions --replication-factor 1 --bootstrap-server ${kafkaBrokerHostname}:9092
 fi
 
 # Setup the schema registry
 export SCHEMA=$(sed 's|/\*|\n&|g;s|*/|&\n|g' ${schemaFile} | sed '/\/\*/,/*\//d' | jq tostring)
-curl -X POST -H "Content-Type: application/vnd.schemaregistry.v1+json" --data "{\"schema\": $SCHEMA}" http://localhost:8081/subjects/${kafkaTopicName}/versions
+curl -X POST -H "Content-Type: application/vnd.schemaregistry.v1+json" --data "{\"schema\": $SCHEMA}" http://localhost:8082/subjects/${kafkaTopicName}/versions
 curl -X GET http://localhost:8081/subjects/${kafkaTopicName}/versions/latest
 
 # Generate kafka messages from raw records
 # Each records with unique keys and generate equal messages across each hudi partition
 partitions={}
 for ((i = 0; i < ${numHudiPartitions}; i++)); do
-  partitions[$i]="partition-"$i
+  partitions[$i]="partition_"$i
 done
 
 events_file=/tmp/kcat-input.events
@@ -170,5 +171,5 @@ for ((i = 1;i<=numBatch;i++)); do
   done
 
   echo "publish to Kafka ..."
-  grep -v '^$' ${events_file} | kcat -P -b localhost:9092 -t hudi-test-topic
+  grep -v '^$' ${events_file} | kcat -P -b ${kafkaBrokerHostname}:9092 -t ${kafkaTopicName}
 done
diff --git a/hudi-kafka-connect/pom.xml b/hudi-kafka-connect/pom.xml
index 7a79f265c..835621adf 100644
--- a/hudi-kafka-connect/pom.xml
+++ b/hudi-kafka-connect/pom.xml
@@ -148,7 +148,7 @@
         <dependency>
             <groupId>org.apache.flink</groupId>
             <artifactId>flink-core</artifactId>
-            <version>1.12.1</version>
+            <version>${flink.version}</version>
             <exclusions>
                 <exclusion>
                     <groupId>com.esotericsoftware.kryo</groupId>
@@ -156,6 +156,11 @@
                 </exclusion>
             </exclusions>
         </dependency>
+        <dependency>
+            <groupId>org.apache.flink</groupId>
+            <artifactId>flink-hadoop-compatibility_${scala.binary.version}</artifactId>
+            <version>${flink.version}</version>
+        </dependency>
 
         <!-- Protobuf -->
         <dependency>
@@ -195,6 +200,19 @@
             <version>${hadoop.version}</version>
         </dependency>
 
+        <!-- Hive -->
+        <dependency>
+            <groupId>org.apache.hive</groupId>
+            <artifactId>hive-common</artifactId>
+            <version>${hive.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>${hive.groupid}</groupId>
+            <artifactId>hive-metastore</artifactId>
+            <version>${hive.version}</version>
+        </dependency>
+
+
         <!-- Hudi - Test -->
         <dependency>
             <groupId>org.apache.hudi</groupId>
diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/utils/KafkaConnectUtils.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/utils/KafkaConnectUtils.java
index 3c77063dd..cf60b9e5c 100644
--- a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/utils/KafkaConnectUtils.java
+++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/utils/KafkaConnectUtils.java
@@ -30,6 +30,7 @@ import org.apache.hudi.common.util.Option;
 import org.apache.hudi.common.util.SerializationUtils;
 import org.apache.hudi.common.util.StringUtils;
 import org.apache.hudi.connect.ControlMessage;
+import org.apache.hudi.connect.writers.KafkaConnectConfigs;
 import org.apache.hudi.exception.HoodieException;
 import org.apache.hudi.keygen.BaseKeyGenerator;
 import org.apache.hudi.keygen.CustomAvroKeyGenerator;
@@ -63,6 +64,7 @@ import java.util.stream.Collectors;
 public class KafkaConnectUtils {
 
   private static final Logger LOG = LogManager.getLogger(KafkaConnectUtils.class);
+  private static final String HOODIE_CONF_PREFIX = "hoodie.";
 
   public static int getLatestNumPartitions(String bootstrapServers, String topicName) {
     Properties props = new Properties();
@@ -85,9 +87,15 @@ public class KafkaConnectUtils {
    *
    * @return
    */
-  public static Configuration getDefaultHadoopConf() {
+  public static Configuration getDefaultHadoopConf(KafkaConnectConfigs connectConfigs) {
     Configuration hadoopConf = new Configuration();
-    hadoopConf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
+    connectConfigs.getProps().keySet().stream().filter(prop -> {
+      // In order to prevent printing unnecessary warn logs, here filter out the hoodie
+      // configuration items before passing to hadoop/hive configs
+      return !prop.toString().startsWith(HOODIE_CONF_PREFIX);
+    }).forEach(prop -> {
+      hadoopConf.set(prop.toString(), connectConfigs.getProps().get(prop.toString()).toString());
+    });
     return hadoopConf;
   }
 
diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectConfigs.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectConfigs.java
index 714d4d2a8..692edf75a 100644
--- a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectConfigs.java
+++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectConfigs.java
@@ -94,8 +94,6 @@ public class KafkaConnectConfigs extends HoodieConfig {
 
   protected KafkaConnectConfigs(Properties props) {
     super(props);
-    Properties newProps = new Properties();
-    newProps.putAll(props);
   }
 
   public static KafkaConnectConfigs.Builder newBuilder() {
diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectTransactionServices.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectTransactionServices.java
index 7ec3034bf..b18e7c695 100644
--- a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectTransactionServices.java
+++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectTransactionServices.java
@@ -18,31 +18,43 @@
 
 package org.apache.hudi.connect.writers;
 
+import org.apache.hudi.DataSourceUtils;
 import org.apache.hudi.client.HoodieJavaWriteClient;
 import org.apache.hudi.client.WriteStatus;
 import org.apache.hudi.client.common.HoodieJavaEngineContext;
 import org.apache.hudi.common.config.TypedProperties;
 import org.apache.hudi.common.engine.EngineType;
 import org.apache.hudi.common.engine.HoodieEngineContext;
+import org.apache.hudi.common.fs.FSUtils;
 import org.apache.hudi.common.model.HoodieAvroPayload;
 import org.apache.hudi.common.model.HoodieCommitMetadata;
 import org.apache.hudi.common.model.HoodieTableType;
 import org.apache.hudi.common.table.HoodieTableMetaClient;
 import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.ReflectionUtils;
 import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.hudi.connect.transaction.TransactionCoordinator;
 import org.apache.hudi.connect.utils.KafkaConnectUtils;
 import org.apache.hudi.exception.HoodieException;
+import org.apache.hudi.hive.HiveSyncConfig;
+import org.apache.hudi.hive.HiveSyncTool;
 import org.apache.hudi.keygen.KeyGenerator;
 import org.apache.hudi.keygen.factory.HoodieAvroKeyGeneratorFactory;
+import org.apache.hudi.sync.common.AbstractSyncTool;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
 
+import java.util.Arrays;
 import java.util.Collections;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Properties;
+import java.util.Set;
 
 /**
  * Implementation of Transaction service APIs used by
@@ -53,10 +65,10 @@ public class KafkaConnectTransactionServices implements ConnectTransactionServic
 
   private static final Logger LOG = LogManager.getLogger(KafkaConnectTransactionServices.class);
 
+  private final KafkaConnectConfigs connectConfigs;
   private final Option<HoodieTableMetaClient> tableMetaClient;
   private final Configuration hadoopConf;
   private final HoodieWriteConfig writeConfig;
-  private final KafkaConnectConfigs connectConfigs;
   private final String tableBasePath;
   private final String tableName;
   private final HoodieEngineContext context;
@@ -72,7 +84,7 @@ public class KafkaConnectTransactionServices implements ConnectTransactionServic
 
     tableBasePath = writeConfig.getBasePath();
     tableName = writeConfig.getTableName();
-    hadoopConf = KafkaConnectUtils.getDefaultHadoopConf();
+    hadoopConf = KafkaConnectUtils.getDefaultHadoopConf(connectConfigs);
     context = new HoodieJavaEngineContext(hadoopConf);
 
     try {
@@ -123,6 +135,7 @@ public class KafkaConnectTransactionServices implements ConnectTransactionServic
       javaClient.scheduleCompaction(Option.empty()).ifPresent(
           instantTs -> LOG.info("Scheduled compaction at instant time:" + instantTs));
     }
+    syncMeta();
   }
 
   @Override
@@ -144,4 +157,44 @@ public class KafkaConnectTransactionServices implements ConnectTransactionServic
         && HoodieTableType.MERGE_ON_READ.equals(tableMetaClient.get().getTableType())
         && connectConfigs.isAsyncCompactEnabled();
   }
+
+  private void syncMeta() {
+    Set<String> syncClientToolClasses = new HashSet<>(
+        Arrays.asList(connectConfigs.getMetaSyncClasses().split(",")));
+    if (connectConfigs.isMetaSyncEnabled()) {
+      for (String impl : syncClientToolClasses) {
+        impl = impl.trim();
+        switch (impl) {
+          case "org.apache.hudi.hive.HiveSyncTool":
+            syncHive();
+            break;
+          default:
+            FileSystem fs = FSUtils.getFs(tableBasePath, new Configuration());
+            Properties properties = new Properties();
+            properties.putAll(connectConfigs.getProps());
+            properties.put("basePath", tableBasePath);
+            AbstractSyncTool syncTool = (AbstractSyncTool) ReflectionUtils.loadClass(impl, new Class[] {Properties.class, FileSystem.class}, properties, fs);
+            syncTool.syncHoodieTable();
+        }
+      }
+    }
+  }
+
+  private void syncHive() {
+    HiveSyncConfig hiveSyncConfig = DataSourceUtils.buildHiveSyncConfig(
+        new TypedProperties(connectConfigs.getProps()),
+        tableBasePath,
+        "PARQUET");
+    LOG.info("Syncing target hoodie table with hive table("
+        + hiveSyncConfig.tableName
+        + "). Hive metastore URL :"
+        + hiveSyncConfig.jdbcUrl
+        + ", basePath :" + tableBasePath);
+    LOG.info("Hive Sync Conf => " + hiveSyncConfig.toString());
+    FileSystem fs = FSUtils.getFs(tableBasePath, hadoopConf);
+    HiveConf hiveConf = new HiveConf();
+    hiveConf.addResource(fs.getConf());
+    LOG.info("Hive Conf => " + hiveConf.getAllProperties().toString());
+    new HiveSyncTool(hiveSyncConfig, hiveConf, fs).syncHoodieTable();
+  }
 }
diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectWriterProvider.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectWriterProvider.java
index 9d007dd09..18c1835f6 100644
--- a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectWriterProvider.java
+++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectWriterProvider.java
@@ -62,7 +62,7 @@ public class KafkaConnectWriterProvider implements ConnectWriterProvider<WriteSt
       KafkaConnectConfigs connectConfigs,
       TopicPartition partition) throws HoodieException {
     this.connectConfigs = connectConfigs;
-    Configuration hadoopConf = KafkaConnectUtils.getDefaultHadoopConf();
+    Configuration hadoopConf = KafkaConnectUtils.getDefaultHadoopConf(connectConfigs);
 
     try {
       this.schemaProvider = StringUtils.isNullOrEmpty(connectConfigs.getSchemaProviderClass()) ? null
diff --git a/hudi-kafka-connect/src/test/resources/log4j-surefire.properties b/hudi-kafka-connect/src/test/resources/log4j-surefire.properties
deleted file mode 100644
index 9ee04e1a3..000000000
--- a/hudi-kafka-connect/src/test/resources/log4j-surefire.properties
+++ /dev/null
@@ -1,32 +0,0 @@
-###
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-###
-log4j.rootLogger=WARN, CONSOLE
-log4j.logger.org.apache=INFO
-log4j.logger.org.apache.hudi=DEBUG
-log4j.logger.org.apache.hadoop.hbase=ERROR
-
-# A1 is set to be a ConsoleAppender.
-log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender
-# A1 uses PatternLayout.
-log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout
-log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
-log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter
-log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true
-log4j.appender.CONSOLE.filter.a.LevelMin=WARN
-log4j.appender.CONSOLE.filter.a.LevelMax=FATAL
-
diff --git a/packaging/hudi-kafka-connect-bundle/pom.xml b/packaging/hudi-kafka-connect-bundle/pom.xml
index debbfa785..7f2afb37e 100644
--- a/packaging/hudi-kafka-connect-bundle/pom.xml
+++ b/packaging/hudi-kafka-connect-bundle/pom.xml
@@ -30,6 +30,7 @@
     <properties>
         <checkstyle.skip>true</checkstyle.skip>
         <main.basedir>${project.parent.basedir}</main.basedir>
+        <kafka.connect.bundle.shade.prefix>org.apache.hudi.</kafka.connect.bundle.shade.prefix>
     </properties>
 
     <build>
@@ -67,19 +68,76 @@
                                 </transformer>
                             </transformers>
                             <artifactSet>
-                                <excludes>
-                                    <exclude>com.amazonaws.*</exclude>
-                                    <exclude>org.apache.zookeeper:zookeeper</exclude>
-                                    <exclude>com.fasterxml.jackson.core:jackson-annotations</exclude>
-                                    <exclude>commons-httpclient:commons-httpclient</exclude>
-                                    <exclude>org.apache.htrace:htrace-core</exclude>
-                                    <exclude>org.jamon:jamon-runtime</exclude>
-                                    <exclude>org.slf4j:*</exclude>
-                                    <exclude>log4j:log4j</exclude>
-                                    <exclude>jdk.tools:jdk.tools</exclude>
-                                    <exclude>junit:junit</exclude>
-                                </excludes>
+                                <includes>
+                                    <include>org.apache.hudi:hudi-common</include>
+                                    <include>org.apache.hudi:hudi-client-common</include>
+                                    <include>org.apache.hudi:hudi-java-client</include>
+                                    <include>org.apache.hudi:hudi-spark-client</include>
+                                    <include>org.apache.hudi:hudi-spark-common_${scala.binary.version}</include>
+                                    <include>org.apache.hudi:hudi-kafka-connect</include>
+                                    <include>org.apache.hudi:hudi-utilities_${scala.binary.version}</include>
+                                    <include>org.apache.hudi:hudi-hive-sync</include>
+                                    <include>org.apache.hudi:hudi-sync-common</include>
+                                    <include>org.apache.hudi:hudi-hadoop-mr</include>
+                                    <include>org.apache.hudi:hudi-timeline-service</include>
+
+                                    <!-- NOTE: This is temp (SchemaProvide dep) until PR3162 lands -->
+                                    <include>org.apache.hudi:hudi-flink_${scala.binary.version}</include>
+                                    <include>org.apache.hudi:flink-core</include>
+                                    <include>org.apache.hudi:hudi-flink-client</include>
+                                    <include>org.apache.flink:flink-core</include>
+                                    <include>org.apache.flink:flink-hadoop-compatibility_${scala.binary.version}</include>
+
+                                    <include>com.yammer.metrics:metrics-core</include>
+                                    <include>com.beust:jcommander</include>
+                                    <include>io.javalin:javalin</include>
+                                    <include>org.jetbrains.kotlin:*</include>
+                                    <include>org.eclipse.jetty:*</include>
+                                    <include>org.eclipse.jetty.websocket:*</include>
+                                    <include>org.rocksdb:rocksdbjni</include>
+                                    <include>org.apache.httpcomponents:httpclient</include>
+                                    <include>org.apache.httpcomponents:httpcore</include>
+                                    <include>org.apache.httpcomponents:fluent-hc</include>
+
+                                    <include>io.dropwizard.metrics:metrics-core</include>
+                                    <include>io.dropwizard.metrics:metrics-graphite</include>
+                                    <include>io.prometheus:simpleclient</include>
+                                    <include>io.prometheus:simpleclient_httpserver</include>
+                                    <include>io.prometheus:simpleclient_dropwizard</include>
+                                    <include>io.prometheus:simpleclient_pushgateway</include>
+                                    <include>io.prometheus:simpleclient_common</include>
+                                    <include>com.yammer.metrics:metrics-core</include>
+                                    <include>com.google.protobuf:protobuf-java</include>
+                                    <include>org.objenesis:objenesis</include>
+                                    <include>com.esotericsoftware:kryo-shaded</include>
+                                    <include>com.esotericsoftware:minlog</include>
+
+                                    <include>org.apache.hbase:hbase-client</include>
+                                    <include>org.apache.hbase:hbase-common</include>
+                                    <include>org.apache.hbase:hbase-protocol</include>
+                                    <include>org.apache.hbase:hbase-server</include>
+                                    <include>org.apache.htrace:htrace-core</include>
+                                    <include>org.scala-lang:*</include>
+                                </includes>
                             </artifactSet>
+                            <relocations>
+                                <relocation>
+                                    <pattern>com.google.protobuf.</pattern>
+                                    <shadedPattern>${kafka.connect.bundle.shade.prefix}com.google.protobuf.</shadedPattern>
+                                </relocation>
+                                <relocation>
+                                    <pattern>com.yammer.metrics.</pattern>
+                                    <shadedPattern>${kafka.connect.bundle.shade.prefix}com.yammer.metrics.</shadedPattern>
+                                </relocation>
+                                <relocation>
+                                    <pattern>com.beust.jcommander.</pattern>
+                                    <shadedPattern>${kafka.connect.bundle.shade.prefix}com.beust.jcommander.</shadedPattern>
+                                </relocation>
+                                <relocation>
+                                    <pattern>org.eclipse.jetty.</pattern>
+                                    <shadedPattern>${kafka.connect.bundle.shade.prefix}org.eclipse.jetty.</shadedPattern>
+                                </relocation>
+                            </relocations>
                             <filters>
                                 <filter>
                                     <artifact>*:*</artifact>
@@ -108,11 +166,18 @@
     </build>
 
     <dependencies>
-        <!-- Hudi -->
+        <!-- Hoodie -->
         <dependency>
             <groupId>org.apache.hudi</groupId>
             <artifactId>hudi-kafka-connect</artifactId>
             <version>${project.version}</version>
+            <scope>compile</scope>
+            <exclusions>
+                <exclusion>
+                    <groupId>log4j</groupId>
+                    <artifactId>log4j</artifactId>
+                </exclusion>
+            </exclusions>
         </dependency>
         <dependency>
             <groupId>org.apache.hudi</groupId>
@@ -143,6 +208,22 @@
                 </exclusion>
             </exclusions>
         </dependency>
+        <dependency>
+            <groupId>org.apache.hudi</groupId>
+            <artifactId>hudi-hive-sync</artifactId>
+            <version>${project.version}</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>javax.servlet</groupId>
+                    <artifactId>servlet-api</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.hudi</groupId>
+            <artifactId>hudi-spark-common_${scala.binary.version}</artifactId>
+            <version>${project.version}</version>
+        </dependency>
 
         <!-- Avro/ Parquet -->
         <dependency>
@@ -170,7 +251,6 @@
             <groupId>org.apache.hadoop</groupId>
             <artifactId>hadoop-common</artifactId>
             <version>${hadoop.version}</version>
-            <scope>compile</scope>
             <exclusions>
                 <exclusion>
                     <groupId>org.mortbay.jetty</groupId>
@@ -190,8 +270,61 @@
             <groupId>org.apache.hadoop</groupId>
             <artifactId>hadoop-auth</artifactId>
             <version>${hadoop.version}</version>
+        </dependency>
+
+        <!-- Hive -->
+        <dependency>
+            <groupId>${hive.groupid}</groupId>
+            <artifactId>hive-service</artifactId>
+            <version>${hive.version}</version>
+            <scope>${utilities.bundle.hive.scope}</scope>
+            <exclusions>
+                <exclusion>
+                    <groupId>org.slf4j</groupId>
+                    <artifactId>slf4j-api</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.slf4j</groupId>
+                    <artifactId>slf4j-log4j12</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+
+        <dependency>
+            <groupId>${hive.groupid}</groupId>
+            <artifactId>hive-service-rpc</artifactId>
+            <version>${hive.version}</version>
+            <scope>${utilities.bundle.hive.scope}</scope>
+        </dependency>
+
+        <dependency>
+            <groupId>${hive.groupid}</groupId>
+            <artifactId>hive-jdbc</artifactId>
+            <version>${hive.version}</version>
+            <scope>${utilities.bundle.hive.scope}</scope>
+        </dependency>
+
+        <dependency>
+            <groupId>${hive.groupid}</groupId>
+            <artifactId>hive-metastore</artifactId>
+            <version>${hive.version}</version>
+            <scope>${utilities.bundle.hive.scope}</scope>
+        </dependency>
+
+        <dependency>
+            <groupId>${hive.groupid}</groupId>
+            <artifactId>hive-common</artifactId>
+            <version>${hive.version}</version>
+            <scope>${utilities.bundle.hive.scope}</scope>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.htrace</groupId>
+            <artifactId>htrace-core</artifactId>
+            <version>${htrace.version}</version>
             <scope>compile</scope>
         </dependency>
+
     </dependencies>
 </project>