Refactor hoodie-hive

2017-05-19 23:47:27 -07:00
parent c192dd60b4
commit db6150c5ef
40 changed files with 1614 additions and 2296 deletions
--- a/hoodie-hive/src/test/java/com/uber/hoodie/hive/DatasetSchemaTest.java
+++ b/hoodie-hive/src/test/java/com/uber/hoodie/hive/DatasetSchemaTest.java
@@ -1,186 +0,0 @@
-/*
- * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *           http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.uber.hoodie.hive;
-
-import com.uber.hoodie.hive.client.SchemaUtil;
-import com.uber.hoodie.hive.model.HoodieDatasetReference;
-import com.uber.hoodie.hive.model.SchemaDifference;
-import com.uber.hoodie.hive.util.TestUtil;
-import org.joda.time.DateTime;
-import org.junit.After;
-import org.junit.Before;
-import org.junit.Test;
-import org.junit.runners.model.InitializationError;
-import parquet.schema.MessageType;
-import parquet.schema.OriginalType;
-import parquet.schema.PrimitiveType;
-
-import java.io.IOException;
-
-import static org.junit.Assert.assertEquals;
-
-public class DatasetSchemaTest {
-    @Before
-    public void setUp() throws IOException, InterruptedException {
-        TestUtil.setUp();
-    }
-
-    @Test
-    public void testSchemaDiff() throws IOException, InitializationError {
-        HoodieDatasetReference metadata = TestUtil
-            .createDataset("test1", "/tmp/hdfs/DatasetSchemaTest/testSchema/", 5, "/nation.schema");
-        HoodieHiveSchemaSyncTask schema =
-            HoodieHiveSchemaSyncTask.newBuilder().withReference(metadata)
-                .withConfiguration(TestUtil.hDroneConfiguration).build();
-        SchemaDifference diff = schema.getSchemaDifference();
-        assertEquals("There should be 4 columns to be added", 4, diff.getAddColumnTypes().size());
-        assertEquals("No update columns expected", 0, diff.getUpdateColumnTypes().size());
-        assertEquals("No delete columns expected", 0, diff.getDeleteColumns().size());
-        schema.sync();
-
-        schema = HoodieHiveSchemaSyncTask.newBuilder().withReference(metadata)
-            .withConfiguration(TestUtil.hDroneConfiguration).build();
-        diff = schema.getSchemaDifference();
-        assertEquals("After sync, there should not be any new columns to add", 0,
-            diff.getAddColumnTypes().size());
-        assertEquals("After sync, there should not be any new columns to update", 0,
-            diff.getUpdateColumnTypes().size());
-        assertEquals("After sync, there should not be any new columns to delete", 0,
-            diff.getDeleteColumns().size());
-    }
-
-    @Test
-    public void testSchemaEvolution() throws IOException, InitializationError {
-        int initialPartitionsCount = 5;
-        HoodieDatasetReference metadata = TestUtil
-            .createDataset("test1", "/tmp/hdfs/DatasetSchemaTest/testSchema/",
-                initialPartitionsCount, "/nation.schema");
-        HoodieHiveSchemaSyncTask schema =
-            HoodieHiveSchemaSyncTask.newBuilder().withReference(metadata)
-                .withConfiguration(TestUtil.hDroneConfiguration).build();
-        schema.sync();
-
-        schema = HoodieHiveSchemaSyncTask.newBuilder().withReference(metadata)
-            .withConfiguration(TestUtil.hDroneConfiguration).build();
-        SchemaDifference diff = schema.getSchemaDifference();
-        assertEquals("After sync, diff should be empty", true, diff.isEmpty());
-        int newSchemaversion = 2;
-        int newPartitionsCount = 2;
-        TestUtil.evolveDataset(metadata, newPartitionsCount, "/nation_evolved.schema",
-            DateTime.now().getMillis(), newSchemaversion);
-        schema = HoodieHiveSchemaSyncTask.newBuilder().withReference(metadata)
-            .withConfiguration(TestUtil.hDroneConfiguration).build();
-        diff = schema.getSchemaDifference();
-        assertEquals("Schema has evolved, there should be a diff", false, diff.isEmpty());
-        assertEquals("Schema has evolved, there should be 1 column to add", 1,
-            diff.getAddColumnTypes().size());
-        assertEquals("Schema has evolved, there should be 1 column to update", 1,
-            diff.getUpdateColumnTypes().size());
-        assertEquals(0, diff.getDeleteColumns().size());
-    }
-
-    /**
-     * Testing converting array types to Hive field declaration strings,
-     * according to the Parquet-113 spec:
-     * https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists
-     */
-    @Test
-    public void testSchemaConvertArray() throws IOException {
-        // Testing the 3-level annotation structure
-        MessageType schema =
-            parquet.schema.Types.buildMessage().optionalGroup().as(parquet.schema.OriginalType.LIST)
-                .repeatedGroup().optional(PrimitiveType.PrimitiveTypeName.INT32).named("element")
-                .named("list").named("int_list").named("ArrayOfInts");
-
-        String schemaString = SchemaUtil.generateSchemaString(schema);
-        assertEquals("`int_list` ARRAY< int>", schemaString);
-
-        // A array of arrays
-        schema =
-            parquet.schema.Types.buildMessage().optionalGroup().as(parquet.schema.OriginalType.LIST)
-                .repeatedGroup().requiredGroup().as(OriginalType.LIST).repeatedGroup()
-                .required(PrimitiveType.PrimitiveTypeName.INT32).named("element").named("list")
-                .named("element").named("list").named("int_list_list").named("ArrayOfArrayOfInts");
-
-        schemaString = SchemaUtil.generateSchemaString(schema);
-        assertEquals("`int_list_list` ARRAY< ARRAY< int>>", schemaString);
-
-        // A list of integers
-        schema =
-            parquet.schema.Types.buildMessage().optionalGroup().as(parquet.schema.OriginalType.LIST)
-                .repeated(PrimitiveType.PrimitiveTypeName.INT32).named("element").named("int_list")
-                .named("ArrayOfInts");
-
-        schemaString = SchemaUtil.generateSchemaString(schema);
-        assertEquals("`int_list` ARRAY< int>", schemaString);
-
-        // A list of structs with two fields
-        schema =
-            parquet.schema.Types.buildMessage().optionalGroup().as(parquet.schema.OriginalType.LIST)
-                .repeatedGroup().required(PrimitiveType.PrimitiveTypeName.BINARY).named("str")
-                .required(PrimitiveType.PrimitiveTypeName.INT32).named("num").named("element")
-                .named("tuple_list").named("ArrayOfTuples");
-
-        schemaString = SchemaUtil.generateSchemaString(schema);
-        assertEquals("`tuple_list` ARRAY< STRUCT< `str` : binary, `num` : int>>", schemaString);
-
-        // A list of structs with a single field
-        // For this case, since the inner group name is "array", we treat the
-        // element type as a one-element struct.
-        schema =
-            parquet.schema.Types.buildMessage().optionalGroup().as(parquet.schema.OriginalType.LIST)
-                .repeatedGroup().required(PrimitiveType.PrimitiveTypeName.BINARY).named("str")
-                .named("array").named("one_tuple_list").named("ArrayOfOneTuples");
-
-        schemaString = SchemaUtil.generateSchemaString(schema);
-        assertEquals("`one_tuple_list` ARRAY< STRUCT< `str` : binary>>", schemaString);
-
-        // A list of structs with a single field
-        // For this case, since the inner group name ends with "_tuple", we also treat the
-        // element type as a one-element struct.
-        schema =
-            parquet.schema.Types.buildMessage().optionalGroup().as(parquet.schema.OriginalType.LIST)
-                .repeatedGroup().required(PrimitiveType.PrimitiveTypeName.BINARY).named("str")
-                .named("one_tuple_list_tuple").named("one_tuple_list").named("ArrayOfOneTuples2");
-
-        schemaString = SchemaUtil.generateSchemaString(schema);
-        assertEquals("`one_tuple_list` ARRAY< STRUCT< `str` : binary>>", schemaString);
-
-        // A list of structs with a single field
-        // Unlike the above two cases, for this the element type is the type of the
-        // only field in the struct.
-        schema =
-            parquet.schema.Types.buildMessage().optionalGroup().as(parquet.schema.OriginalType.LIST)
-                .repeatedGroup().required(PrimitiveType.PrimitiveTypeName.BINARY).named("str")
-                .named("one_tuple_list").named("one_tuple_list").named("ArrayOfOneTuples3");
-
-        schemaString = SchemaUtil.generateSchemaString(schema);
-        assertEquals("`one_tuple_list` ARRAY< binary>", schemaString);
-
-        // A list of maps
-        schema =
-            parquet.schema.Types.buildMessage().optionalGroup().as(parquet.schema.OriginalType.LIST)
-                .repeatedGroup().as(OriginalType.MAP).repeatedGroup().as(OriginalType.MAP_KEY_VALUE)
-                .required(PrimitiveType.PrimitiveTypeName.BINARY).as(OriginalType.UTF8)
-                .named("string_key").required(PrimitiveType.PrimitiveTypeName.INT32)
-                .named("int_value").named("key_value").named("array").named("map_list")
-                .named("ArrayOfMaps");
-
-        schemaString = SchemaUtil.generateSchemaString(schema);
-        assertEquals("`map_list` ARRAY< MAP< string, int>>", schemaString);
-    }
-}
--- a/hoodie-hive/src/test/java/com/uber/hoodie/hive/HDroneDatasetTest.java
+++ b/hoodie-hive/src/test/java/com/uber/hoodie/hive/HDroneDatasetTest.java
@@ -1,99 +0,0 @@
-/*
- * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *           http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.uber.hoodie.hive;
-
-import com.uber.hoodie.hive.client.HoodieHiveClient;
-import com.uber.hoodie.hive.model.HoodieDatasetReference;
-import com.uber.hoodie.hive.util.TestUtil;
-import org.joda.time.DateTime;
-import org.junit.Before;
-import org.junit.Test;
-import org.junit.runners.model.InitializationError;
-import parquet.schema.MessageType;
-
-import java.io.IOException;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-
-public class HDroneDatasetTest {
-    private HoodieHiveClient hiveClient;
-
-    @Before
-    public void setUp() throws IOException, InterruptedException {
-        TestUtil.setUp();
-        hiveClient = new HoodieHiveClient(TestUtil.hDroneConfiguration);
-    }
-
-    @Test
-    public void testDatasetCreation() throws IOException, InitializationError {
-        HoodieDatasetReference metadata = TestUtil
-            .createDataset("test1", "/tmp/hdfs/DatasetSchemaTest/testSchema/", 5, "/nation.schema");
-        HoodieHiveDatasetSyncTask dataset =
-            HoodieHiveDatasetSyncTask.newBuilder().withReference(metadata)
-                .withConfiguration(TestUtil.hDroneConfiguration).build();
-        assertEquals("There should be 5 new partitions", 5, dataset.getNewPartitions().size());
-        assertEquals("There should not be any changed partitions", 0,
-            dataset.getChangedPartitions().size());
-        assertFalse("Table should not exist", hiveClient.checkTableExists(metadata));
-        dataset.sync();
-
-        dataset = HoodieHiveDatasetSyncTask.newBuilder().withReference(metadata)
-            .withConfiguration(TestUtil.hDroneConfiguration).build();
-        assertTrue("Table should exist after flush", hiveClient.checkTableExists(metadata));
-        assertEquals("After flush, There should not be any new partitions to flush", 0,
-            dataset.getNewPartitions().size());
-        assertEquals("After flush, There should not be any modified partitions to flush", 0,
-            dataset.getChangedPartitions().size());
-
-        assertEquals("Table Schema should have 5 fields", 5,
-            hiveClient.getTableSchema(metadata).size());
-    }
-
-    @Test
-    public void testDatasetEvolution() throws IOException, InitializationError {
-        int initialPartitionsCount = 5;
-        HoodieDatasetReference metadata = TestUtil
-            .createDataset("test1", "/tmp/hdfs/DatasetSchemaTest/testSchema/",
-                initialPartitionsCount, "/nation.schema");
-        HoodieHiveDatasetSyncTask dataset =
-            HoodieHiveDatasetSyncTask.newBuilder().withReference(metadata)
-                .withConfiguration(TestUtil.hDroneConfiguration).build();
-        dataset.sync();
-
-        dataset = HoodieHiveDatasetSyncTask.newBuilder(dataset).build();
-        int newSchemaversion = 2;
-        int newPartitionsCount = 2;
-        TestUtil.evolveDataset(metadata, newPartitionsCount, "/nation_evolved.schema",
-            DateTime.now().getMillis(), newSchemaversion);
-        dataset = HoodieHiveDatasetSyncTask.newBuilder(dataset).build();
-        assertEquals("There should be " + newPartitionsCount + " partitions to be added",
-            newPartitionsCount, dataset.getNewPartitions().size());
-        dataset.sync();
-
-        dataset = HoodieHiveDatasetSyncTask.newBuilder(dataset).build();
-        MessageType newDatasetSchema = dataset.getSchemaSyncTask().getStorageSchema();
-        MessageType expectedSchema = TestUtil.readSchema("/nation_evolved.schema");
-        assertEquals("Table schema should be evolved schema", expectedSchema, newDatasetSchema);
-        assertEquals("Table schema should have 6 fields", 6,
-            hiveClient.getTableSchema(metadata).size());
-        assertEquals("Valid Evolution should be reflected", "BIGINT",
-            hiveClient.getTableSchema(metadata).get("region_key"));
-    }
-
-}
--- a/hoodie-hive/src/test/java/com/uber/hoodie/hive/HiveSyncToolTest.java
+++ b/hoodie-hive/src/test/java/com/uber/hoodie/hive/HiveSyncToolTest.java
@@ -0,0 +1,308 @@
+/*
+ *  Copyright (c) 2017 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *           http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *
+ */
+
+package com.uber.hoodie.hive;
+
+import static org.junit.Assert.*;
+
+import com.uber.hoodie.common.util.SchemaTestUtil;
+import com.uber.hoodie.hive.HoodieHiveClient.PartitionEvent;
+import com.uber.hoodie.hive.HoodieHiveClient.PartitionEvent.PartitionEventType;
+import com.uber.hoodie.hive.util.SchemaUtil;
+import java.io.IOException;
+import java.net.URISyntaxException;
+import java.util.List;
+import java.util.Optional;
+import org.apache.hadoop.hive.metastore.api.Partition;
+import org.apache.thrift.TException;
+import org.joda.time.DateTime;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.runners.model.InitializationError;
+import parquet.schema.MessageType;
+import parquet.schema.OriginalType;
+import parquet.schema.PrimitiveType;
+
+@SuppressWarnings("ConstantConditions")
+public class HiveSyncToolTest {
+
+  @Before
+  public void setUp() throws IOException, InterruptedException, URISyntaxException {
+    TestUtil.setUp();
+  }
+
+  @Before
+  public void teardown() throws IOException, InterruptedException {
+    TestUtil.clear();
+  }
+
+  /**
+   * Testing converting array types to Hive field declaration strings,
+   * according to the Parquet-113 spec:
+   * https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists
+   */
+  @Test
+  public void testSchemaConvertArray() throws IOException {
+    // Testing the 3-level annotation structure
+    MessageType schema =
+        parquet.schema.Types.buildMessage().optionalGroup().as(parquet.schema.OriginalType.LIST)
+            .repeatedGroup().optional(PrimitiveType.PrimitiveTypeName.INT32).named("element")
+            .named("list").named("int_list").named("ArrayOfInts");
+
+    String schemaString = SchemaUtil.generateSchemaString(schema);
+    assertEquals("`int_list` ARRAY< int>", schemaString);
+
+    // A array of arrays
+    schema =
+        parquet.schema.Types.buildMessage().optionalGroup().as(parquet.schema.OriginalType.LIST)
+            .repeatedGroup().requiredGroup().as(OriginalType.LIST).repeatedGroup()
+            .required(PrimitiveType.PrimitiveTypeName.INT32).named("element").named("list")
+            .named("element").named("list").named("int_list_list").named("ArrayOfArrayOfInts");
+
+    schemaString = SchemaUtil.generateSchemaString(schema);
+    assertEquals("`int_list_list` ARRAY< ARRAY< int>>", schemaString);
+
+    // A list of integers
+    schema =
+        parquet.schema.Types.buildMessage().optionalGroup().as(parquet.schema.OriginalType.LIST)
+            .repeated(PrimitiveType.PrimitiveTypeName.INT32).named("element").named("int_list")
+            .named("ArrayOfInts");
+
+    schemaString = SchemaUtil.generateSchemaString(schema);
+    assertEquals("`int_list` ARRAY< int>", schemaString);
+
+    // A list of structs with two fields
+    schema =
+        parquet.schema.Types.buildMessage().optionalGroup().as(parquet.schema.OriginalType.LIST)
+            .repeatedGroup().required(PrimitiveType.PrimitiveTypeName.BINARY).named("str")
+            .required(PrimitiveType.PrimitiveTypeName.INT32).named("num").named("element")
+            .named("tuple_list").named("ArrayOfTuples");
+
+    schemaString = SchemaUtil.generateSchemaString(schema);
+    assertEquals("`tuple_list` ARRAY< STRUCT< `str` : binary, `num` : int>>", schemaString);
+
+    // A list of structs with a single field
+    // For this case, since the inner group name is "array", we treat the
+    // element type as a one-element struct.
+    schema =
+        parquet.schema.Types.buildMessage().optionalGroup().as(parquet.schema.OriginalType.LIST)
+            .repeatedGroup().required(PrimitiveType.PrimitiveTypeName.BINARY).named("str")
+            .named("array").named("one_tuple_list").named("ArrayOfOneTuples");
+
+    schemaString = SchemaUtil.generateSchemaString(schema);
+    assertEquals("`one_tuple_list` ARRAY< STRUCT< `str` : binary>>", schemaString);
+
+    // A list of structs with a single field
+    // For this case, since the inner group name ends with "_tuple", we also treat the
+    // element type as a one-element struct.
+    schema =
+        parquet.schema.Types.buildMessage().optionalGroup().as(parquet.schema.OriginalType.LIST)
+            .repeatedGroup().required(PrimitiveType.PrimitiveTypeName.BINARY).named("str")
+            .named("one_tuple_list_tuple").named("one_tuple_list").named("ArrayOfOneTuples2");
+
+    schemaString = SchemaUtil.generateSchemaString(schema);
+    assertEquals("`one_tuple_list` ARRAY< STRUCT< `str` : binary>>", schemaString);
+
+    // A list of structs with a single field
+    // Unlike the above two cases, for this the element type is the type of the
+    // only field in the struct.
+    schema =
+        parquet.schema.Types.buildMessage().optionalGroup().as(parquet.schema.OriginalType.LIST)
+            .repeatedGroup().required(PrimitiveType.PrimitiveTypeName.BINARY).named("str")
+            .named("one_tuple_list").named("one_tuple_list").named("ArrayOfOneTuples3");
+
+    schemaString = SchemaUtil.generateSchemaString(schema);
+    assertEquals("`one_tuple_list` ARRAY< binary>", schemaString);
+
+    // A list of maps
+    schema =
+        parquet.schema.Types.buildMessage().optionalGroup().as(parquet.schema.OriginalType.LIST)
+            .repeatedGroup().as(OriginalType.MAP).repeatedGroup().as(OriginalType.MAP_KEY_VALUE)
+            .required(PrimitiveType.PrimitiveTypeName.BINARY).as(OriginalType.UTF8)
+            .named("string_key").required(PrimitiveType.PrimitiveTypeName.INT32)
+            .named("int_value").named("key_value").named("array").named("map_list")
+            .named("ArrayOfMaps");
+
+    schemaString = SchemaUtil.generateSchemaString(schema);
+    assertEquals("`map_list` ARRAY< MAP< string, int>>", schemaString);
+  }
+
+
+  @Test
+  public void testBasicSync()
+      throws IOException, InitializationError, URISyntaxException, TException, InterruptedException {
+    String commitTime = "100";
+    TestUtil.createCOWDataset(commitTime, 5);
+    HoodieHiveClient hiveClient = new HoodieHiveClient(TestUtil.hiveSyncConfig,
+        TestUtil.getHiveConf(), TestUtil.fileSystem);
+    assertFalse("Table " + TestUtil.hiveSyncConfig.tableName + " should not exist initially",
+        hiveClient.doesTableExist());
+    // Lets do the sync
+    HiveSyncTool tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(),
+        TestUtil.fileSystem);
+    tool.syncHoodieTable();
+    assertTrue("Table " + TestUtil.hiveSyncConfig.tableName + " should exist after sync completes",
+        hiveClient.doesTableExist());
+    assertEquals("Hive Schema should match the dataset schema + partition field",
+        hiveClient.getTableSchema().size(),
+        hiveClient.getDataSchema().getColumns().size() + 1);
+    assertEquals("Table partitions should match the number of partitions we wrote", 5,
+        hiveClient.scanTablePartitions().size());
+    assertEquals("The last commit that was sycned should be updated in the TBLPROPERTIES",
+        commitTime,
+        hiveClient.getLastCommitTimeSynced().get());
+  }
+
+  @Test
+  public void testSyncIncremental()
+      throws IOException, InitializationError, URISyntaxException, TException, InterruptedException {
+    String commitTime1 = "100";
+    TestUtil.createCOWDataset(commitTime1, 5);
+    HoodieHiveClient hiveClient = new HoodieHiveClient(TestUtil.hiveSyncConfig,
+        TestUtil.getHiveConf(), TestUtil.fileSystem);
+    // Lets do the sync
+    HiveSyncTool tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(),
+        TestUtil.fileSystem);
+    tool.syncHoodieTable();
+    assertEquals("Table partitions should match the number of partitions we wrote", 5,
+        hiveClient.scanTablePartitions().size());
+    assertEquals("The last commit that was sycned should be updated in the TBLPROPERTIES",
+        commitTime1,
+        hiveClient.getLastCommitTimeSynced().get());
+
+    // Now lets create more parititions and these are the only ones which needs to be synced
+    DateTime dateTime = DateTime.now().plusDays(6);
+    String commitTime2 = "101";
+    TestUtil.addCOWPartitions(1, true, dateTime, commitTime2);
+
+    // Lets do the sync
+    hiveClient = new HoodieHiveClient(TestUtil.hiveSyncConfig,
+        TestUtil.getHiveConf(), TestUtil.fileSystem);
+    List<String> writtenPartitionsSince = hiveClient
+        .getPartitionsWrittenToSince(Optional.of(commitTime1));
+    assertEquals("We should have one partition written after 100 commit", 1,
+        writtenPartitionsSince.size());
+    List<Partition> hivePartitions = hiveClient.scanTablePartitions();
+    List<PartitionEvent> partitionEvents = hiveClient
+        .getPartitionEvents(hivePartitions, writtenPartitionsSince);
+    assertEquals("There should be only one paritition event", 1, partitionEvents.size());
+    assertEquals("The one partition event must of type ADD", PartitionEventType.ADD,
+        partitionEvents.iterator().next().eventType);
+
+    tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(),
+        TestUtil.fileSystem);
+    tool.syncHoodieTable();
+    // Sync should add the one partition
+    assertEquals("The one partition we wrote should be added to hive", 6,
+        hiveClient.scanTablePartitions().size());
+    assertEquals("The last commit that was sycned should be 101",
+        commitTime2,
+        hiveClient.getLastCommitTimeSynced().get());
+  }
+
+  @Test
+  public void testSyncIncrementalWithSchemaEvolution()
+      throws IOException, InitializationError, URISyntaxException, TException, InterruptedException {
+    String commitTime1 = "100";
+    TestUtil.createCOWDataset(commitTime1, 5);
+    HoodieHiveClient hiveClient = new HoodieHiveClient(TestUtil.hiveSyncConfig,
+        TestUtil.getHiveConf(), TestUtil.fileSystem);
+    // Lets do the sync
+    HiveSyncTool tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(),
+        TestUtil.fileSystem);
+    tool.syncHoodieTable();
+
+    int fields = hiveClient.getTableSchema().size();
+
+    // Now lets create more parititions and these are the only ones which needs to be synced
+    DateTime dateTime = DateTime.now().plusDays(6);
+    String commitTime2 = "101";
+    TestUtil.addCOWPartitions(1, false, dateTime, commitTime2);
+
+    // Lets do the sync
+    tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(),
+        TestUtil.fileSystem);
+    tool.syncHoodieTable();
+
+    assertEquals("Hive Schema has evolved and should not be 3 more field",
+        fields + 3,
+        hiveClient.getTableSchema().size());
+    assertEquals("Hive Schema has evolved - Field favorite_number has evolved from int to long",
+        "BIGINT",
+        hiveClient.getTableSchema().get("favorite_number"));
+    assertTrue("Hive Schema has evolved - Field favorite_movie was added",
+        hiveClient.getTableSchema().containsKey("favorite_movie"));
+
+    // Sync should add the one partition
+    assertEquals("The one partition we wrote should be added to hive", 6,
+        hiveClient.scanTablePartitions().size());
+    assertEquals("The last commit that was sycned should be 101",
+        commitTime2,
+        hiveClient.getLastCommitTimeSynced().get());
+  }
+
+  @Test
+  public void testSyncMergeOnRead()
+      throws IOException, InitializationError, URISyntaxException, TException, InterruptedException {
+    String commitTime = "100";
+    String deltaCommitTime = "101";
+    TestUtil.createMORDataset(commitTime, deltaCommitTime, 5);
+    HoodieHiveClient hiveClient = new HoodieHiveClient(TestUtil.hiveSyncConfig,
+        TestUtil.getHiveConf(), TestUtil.fileSystem);
+    assertFalse("Table " + TestUtil.hiveSyncConfig.tableName + " should not exist initially",
+        hiveClient.doesTableExist());
+    // Lets do the sync
+    HiveSyncTool tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(),
+        TestUtil.fileSystem);
+    tool.syncHoodieTable();
+
+    assertTrue("Table " + TestUtil.hiveSyncConfig.tableName + " should exist after sync completes",
+        hiveClient.doesTableExist());
+    assertEquals("Hive Schema should match the dataset schema + partition field",
+        hiveClient.getTableSchema().size(), SchemaTestUtil.getSimpleSchema().getFields().size() + 1);
+    assertEquals("Table partitions should match the number of partitions we wrote", 5,
+        hiveClient.scanTablePartitions().size());
+    assertEquals("The last commit that was sycned should be updated in the TBLPROPERTIES",
+        deltaCommitTime,
+        hiveClient.getLastCommitTimeSynced().get());
+
+    // Now lets create more parititions and these are the only ones which needs to be synced
+    DateTime dateTime = DateTime.now().plusDays(6);
+    String commitTime2 = "102";
+    String deltaCommitTime2 = "103";
+
+    TestUtil.addCOWPartitions(1, true, dateTime, commitTime2);
+    TestUtil.addMORPartitions(1, true, false, dateTime, commitTime2, deltaCommitTime2);
+    // Lets do the sync
+    tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(),
+        TestUtil.fileSystem);
+    tool.syncHoodieTable();
+    hiveClient = new HoodieHiveClient(TestUtil.hiveSyncConfig,
+        TestUtil.getHiveConf(), TestUtil.fileSystem);
+
+    assertEquals("Hive Schema should match the evolved dataset schema + partition field",
+        hiveClient.getTableSchema().size(), SchemaTestUtil.getEvolvedSchema().getFields().size() + 1);
+    // Sync should add the one partition
+    assertEquals("The 2 partitions we wrote should be added to hive", 6,
+        hiveClient.scanTablePartitions().size());
+    assertEquals("The last commit that was sycned should be 103",
+        deltaCommitTime2,
+        hiveClient.getLastCommitTimeSynced().get());
+  }
+
+}
--- a/hoodie-hive/src/test/java/com/uber/hoodie/hive/TestUtil.java
+++ b/hoodie-hive/src/test/java/com/uber/hoodie/hive/TestUtil.java
@@ -0,0 +1,353 @@
+/*
+ * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *           http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.uber.hoodie.hive;
+
+import static com.uber.hoodie.common.model.HoodieTestUtils.DEFAULT_TASK_PARTITIONID;
+import static org.junit.Assert.fail;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+import com.uber.hoodie.avro.HoodieAvroWriteSupport;
+import com.uber.hoodie.common.BloomFilter;
+import com.uber.hoodie.common.minicluster.HdfsTestService;
+import com.uber.hoodie.common.minicluster.ZookeeperTestService;
+import com.uber.hoodie.common.model.CompactionWriteStat;
+import com.uber.hoodie.common.model.HoodieCommitMetadata;
+import com.uber.hoodie.common.model.HoodieCompactionMetadata;
+import com.uber.hoodie.common.model.HoodieDataFile;
+import com.uber.hoodie.common.model.HoodieDeltaWriteStat;
+import com.uber.hoodie.common.model.HoodieTableType;
+import com.uber.hoodie.common.model.HoodieWriteStat;
+import com.uber.hoodie.common.table.HoodieTableMetaClient;
+import com.uber.hoodie.common.table.HoodieTimeline;
+import com.uber.hoodie.common.table.log.HoodieLogFile;
+import com.uber.hoodie.common.table.log.HoodieLogFormat;
+import com.uber.hoodie.common.table.log.HoodieLogFormat.Writer;
+import com.uber.hoodie.common.table.log.block.HoodieAvroDataBlock;
+import com.uber.hoodie.common.util.FSUtils;
+import com.uber.hoodie.common.util.SchemaTestUtil;
+import com.uber.hoodie.hive.util.HiveTestService;
+import java.io.File;
+import java.io.IOException;
+import java.net.URISyntaxException;
+import java.nio.charset.StandardCharsets;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map.Entry;
+import java.util.Set;
+import java.util.UUID;
+import org.apache.avro.Schema;
+import org.apache.avro.generic.IndexedRecord;
+import org.apache.commons.io.FileUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hive.service.server.HiveServer2;
+import org.apache.parquet.avro.AvroSchemaConverter;
+import org.apache.parquet.hadoop.ParquetWriter;
+import org.apache.parquet.hadoop.metadata.CompressionCodecName;
+import org.apache.zookeeper.server.ZooKeeperServer;
+import org.joda.time.DateTime;
+import org.joda.time.format.DateTimeFormat;
+import org.joda.time.format.DateTimeFormatter;
+import org.junit.runners.model.InitializationError;
+
+@SuppressWarnings("SameParameterValue")
+public class TestUtil {
+
+  private static MiniDFSCluster dfsCluster;
+  private static ZooKeeperServer zkServer;
+  private static HiveServer2 hiveServer;
+  private static Configuration configuration;
+  static HiveSyncConfig hiveSyncConfig;
+  private static DateTimeFormatter dtfOut;
+  static FileSystem fileSystem;
+  private static Set<String> createdTablesSet = Sets.newHashSet();
+
+  public static void setUp() throws IOException, InterruptedException, URISyntaxException {
+    if (dfsCluster == null) {
+      HdfsTestService service = new HdfsTestService();
+      dfsCluster = service.start(true);
+      configuration = service.getHadoopConf();
+    }
+    if (zkServer == null) {
+      ZookeeperTestService zkService = new ZookeeperTestService(configuration);
+      zkServer = zkService.start();
+    }
+    if (hiveServer == null) {
+      HiveTestService hiveService = new HiveTestService(configuration);
+      hiveServer = hiveService.start();
+    }
+    fileSystem = FileSystem.get(configuration);
+
+    hiveSyncConfig = new HiveSyncConfig();
+    hiveSyncConfig.jdbcUrl = "jdbc:hive2://127.0.0.1:9999/";
+    hiveSyncConfig.databaseName = "hdrone_test";
+    hiveSyncConfig.hiveUser = "";
+    hiveSyncConfig.hivePass = "";
+    hiveSyncConfig.databaseName = "testdb";
+    hiveSyncConfig.tableName = "test1";
+    hiveSyncConfig.basePath = "/tmp/hdfs/HiveSyncToolTest/";
+    hiveSyncConfig.assumeDatePartitioning = true;
+    hiveSyncConfig.partitionFields = Lists.newArrayList("datestr");
+
+    dtfOut = DateTimeFormat.forPattern("yyyy/MM/dd");
+
+    clear();
+  }
+
+  static void clear() throws IOException {
+    fileSystem.delete(new Path(hiveSyncConfig.basePath), true);
+    HoodieTableMetaClient
+        .initTableType(fileSystem, hiveSyncConfig.basePath, HoodieTableType.COPY_ON_WRITE,
+            hiveSyncConfig.tableName);
+
+    HoodieHiveClient client = new HoodieHiveClient(hiveSyncConfig, hiveServer.getHiveConf(),
+        fileSystem);
+    for (String tableName : createdTablesSet) {
+      client.updateHiveSQL("drop table if exists " + tableName);
+    }
+    createdTablesSet.clear();
+    client.updateHiveSQL(
+        "drop database if exists " + hiveSyncConfig.databaseName);
+    client.updateHiveSQL("create database " + hiveSyncConfig.databaseName);
+  }
+
+  static HiveConf getHiveConf() {
+    return hiveServer.getHiveConf();
+  }
+
+  @SuppressWarnings("unused")
+  public static void shutdown() {
+    if (hiveServer != null) {
+      hiveServer.stop();
+    }
+    if (dfsCluster != null) {
+      dfsCluster.shutdown();
+    }
+    if (zkServer != null) {
+      zkServer.shutdown();
+    }
+  }
+
+  static void createCOWDataset(String commitTime, int numberOfPartitions)
+      throws IOException, InitializationError, URISyntaxException, InterruptedException {
+    Path path = new Path(hiveSyncConfig.basePath);
+    FileUtils.deleteDirectory(new File(hiveSyncConfig.basePath));
+    HoodieTableMetaClient
+        .initTableType(fileSystem, hiveSyncConfig.basePath, HoodieTableType.COPY_ON_WRITE,
+            hiveSyncConfig.tableName);
+    boolean result = fileSystem.mkdirs(path);
+    checkResult(result);
+    DateTime dateTime = DateTime.now();
+    HoodieCommitMetadata commitMetadata = createPartitions(numberOfPartitions, true, dateTime, commitTime);
+    createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName);
+    createCommitFile(commitMetadata, commitTime);
+  }
+
+  static void createMORDataset(String commitTime, String deltaCommitTime, int numberOfPartitions)
+      throws IOException, InitializationError, URISyntaxException, InterruptedException {
+    Path path = new Path(hiveSyncConfig.basePath);
+    FileUtils.deleteDirectory(new File(hiveSyncConfig.basePath));
+    HoodieTableMetaClient
+        .initTableType(fileSystem, hiveSyncConfig.basePath, HoodieTableType.MERGE_ON_READ,
+            hiveSyncConfig.tableName);
+
+    boolean result = fileSystem.mkdirs(path);
+    checkResult(result);
+    DateTime dateTime = DateTime.now();
+    HoodieCommitMetadata commitMetadata = createPartitions(numberOfPartitions, true, dateTime, commitTime);
+    createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName);
+    HoodieCompactionMetadata compactionMetadata = new HoodieCompactionMetadata();
+    commitMetadata.getPartitionToWriteStats()
+        .forEach((key, value) -> value.stream().map(k -> new CompactionWriteStat(k, key, 0, 0, 0))
+            .forEach(l -> compactionMetadata.addWriteStat(key, l)));
+    createCompactionCommitFile(compactionMetadata, commitTime);
+    // Write a delta commit
+    HoodieCommitMetadata deltaMetadata = createLogFiles(commitMetadata.getPartitionToWriteStats(), true);
+    createDeltaCommitFile(deltaMetadata, deltaCommitTime);
+  }
+
+  static void addCOWPartitions(int numberOfPartitions, boolean isParquetSchemaSimple,
+      DateTime startFrom, String commitTime)
+      throws IOException, URISyntaxException, InterruptedException {
+    HoodieCommitMetadata commitMetadata = createPartitions(numberOfPartitions,
+        isParquetSchemaSimple, startFrom, commitTime);
+    createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName);
+    createCommitFile(commitMetadata, commitTime);
+  }
+
+  static void addMORPartitions(int numberOfPartitions, boolean isParquetSchemaSimple,
+      boolean isLogSchemaSimple, DateTime startFrom,
+      String commitTime, String deltaCommitTime)
+      throws IOException, URISyntaxException, InterruptedException {
+    HoodieCommitMetadata commitMetadata = createPartitions(numberOfPartitions,
+        isParquetSchemaSimple, startFrom, commitTime);
+    createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName);
+    HoodieCompactionMetadata compactionMetadata = new HoodieCompactionMetadata();
+    commitMetadata.getPartitionToWriteStats()
+        .forEach((key, value) -> value.stream().map(k -> new CompactionWriteStat(k, key, 0, 0, 0))
+            .forEach(l -> compactionMetadata.addWriteStat(key, l)));
+    createCompactionCommitFile(compactionMetadata, commitTime);
+    HoodieCommitMetadata deltaMetadata = createLogFiles(commitMetadata.getPartitionToWriteStats(), isLogSchemaSimple);
+    createDeltaCommitFile(deltaMetadata, deltaCommitTime);
+  }
+
+  private static HoodieCommitMetadata createLogFiles(
+      HashMap<String, List<HoodieWriteStat>> partitionWriteStats, boolean isLogSchemaSimple)
+      throws InterruptedException, IOException, URISyntaxException {
+    HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
+    for (Entry<String, List<HoodieWriteStat>> wEntry : partitionWriteStats.entrySet()) {
+      String partitionPath = wEntry.getKey();
+      for (HoodieWriteStat wStat : wEntry.getValue()) {
+        Path path = new Path(wStat.getFullPath());
+        HoodieDataFile dataFile = new HoodieDataFile(fileSystem.getFileStatus(path));
+        HoodieLogFile logFile = generateLogData(path, isLogSchemaSimple);
+        HoodieDeltaWriteStat writeStat = new HoodieDeltaWriteStat();
+        writeStat.setFileId(dataFile.getFileId());
+        writeStat.setFullPath(logFile.getPath().toString());
+        commitMetadata.addWriteStat(partitionPath, writeStat);
+      }
+    }
+    return commitMetadata;
+  }
+
+  private static HoodieCommitMetadata createPartitions(int numberOfPartitions,
+      boolean isParquetSchemaSimple, DateTime startFrom, String commitTime)
+      throws IOException, URISyntaxException, InterruptedException {
+    startFrom = startFrom.withTimeAtStartOfDay();
+
+    HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
+    for (int i = 0; i < numberOfPartitions; i++) {
+      String partitionPath = dtfOut.print(startFrom);
+      Path partPath = new Path(hiveSyncConfig.basePath + "/" + partitionPath);
+      fileSystem.makeQualified(partPath);
+      fileSystem.mkdirs(partPath);
+      List<HoodieWriteStat> writeStats = createTestData(partPath, isParquetSchemaSimple, commitTime);
+      startFrom = startFrom.minusDays(1);
+      writeStats.forEach(s -> commitMetadata.addWriteStat(partitionPath, s));
+    }
+    return commitMetadata;
+  }
+
+  private static List<HoodieWriteStat> createTestData(Path partPath, boolean isParquetSchemaSimple,
+      String commitTime) throws IOException, URISyntaxException, InterruptedException {
+    List<HoodieWriteStat> writeStats = Lists.newArrayList();
+    for (int i = 0; i < 5; i++) {
+      // Create 5 files
+      String fileId = UUID.randomUUID().toString();
+      Path filePath = new Path(partPath.toString() + "/" + FSUtils
+          .makeDataFileName(commitTime, DEFAULT_TASK_PARTITIONID, fileId));
+      generateParquetData(filePath, isParquetSchemaSimple);
+      HoodieWriteStat writeStat = new HoodieWriteStat();
+      writeStat.setFileId(fileId);
+      writeStat.setFullPath(filePath.toString());
+      writeStats.add(writeStat);
+    }
+    return writeStats;
+  }
+
+  @SuppressWarnings({"unchecked", "deprecation"})
+  private static void generateParquetData(Path filePath, boolean isParquetSchemaSimple)
+      throws IOException, URISyntaxException, InterruptedException {
+    Schema schema = (isParquetSchemaSimple ? SchemaTestUtil.getSimpleSchema()
+        : SchemaTestUtil.getEvolvedSchema());
+    org.apache.parquet.schema.MessageType parquetSchema = new AvroSchemaConverter().convert(schema);
+    BloomFilter filter = new BloomFilter(1000, 0.0001);
+    HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(parquetSchema, schema, filter);
+    ParquetWriter writer = new ParquetWriter(filePath,
+        writeSupport, CompressionCodecName.GZIP, 120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE,
+        ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED,
+        ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED, ParquetWriter.DEFAULT_WRITER_VERSION,
+        fileSystem.getConf());
+
+    List<IndexedRecord> testRecords = (isParquetSchemaSimple ? SchemaTestUtil
+        .generateTestRecords(0, 100)
+        : SchemaTestUtil.generateEvolvedTestRecords(100, 100));
+    testRecords.forEach(s -> {
+      try {
+        writer.write(s);
+      } catch (IOException e) {
+        fail("IOException while writing test records as parquet" + e.toString());
+      }
+    });
+    writer.close();
+  }
+
+  private static HoodieLogFile generateLogData(Path parquetFilePath, boolean isLogSchemaSimple)
+      throws IOException, InterruptedException, URISyntaxException {
+    Schema schema = (isLogSchemaSimple ? SchemaTestUtil.getSimpleSchema()
+        : SchemaTestUtil.getEvolvedSchema());
+    HoodieDataFile dataFile = new HoodieDataFile(fileSystem.getFileStatus(parquetFilePath));
+    // Write a log file for this parquet file
+    Writer logWriter = HoodieLogFormat.newWriterBuilder().onParentPath(parquetFilePath.getParent())
+        .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId(dataFile.getFileId())
+        .overBaseCommit(dataFile.getCommitTime()).withFs(fileSystem).build();
+    List<IndexedRecord> records = (isLogSchemaSimple ? SchemaTestUtil
+        .generateTestRecords(0, 100)
+        : SchemaTestUtil.generateEvolvedTestRecords(100, 100));
+    HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, schema);
+    logWriter.appendBlock(dataBlock);
+    logWriter.close();
+    return logWriter.getLogFile();
+  }
+
+  private static void checkResult(boolean result) throws InitializationError {
+    if (!result) {
+      throw new InitializationError("Could not initialize");
+    }
+  }
+
+  private static void createCommitFile(
+      HoodieCommitMetadata commitMetadata, String commitTime)
+      throws IOException {
+    byte[] bytes = commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8);
+    Path fullPath = new Path(
+        hiveSyncConfig.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline
+            .makeCommitFileName(commitTime));
+    FSDataOutputStream fsout = fileSystem.create(fullPath, true);
+    fsout.write(bytes);
+    fsout.close();
+  }
+
+  private static void createCompactionCommitFile(
+      HoodieCompactionMetadata commitMetadata, String commitTime)
+      throws IOException {
+    byte[] bytes = commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8);
+    Path fullPath = new Path(
+        hiveSyncConfig.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline
+            .makeCompactionFileName(commitTime));
+    FSDataOutputStream fsout = fileSystem.create(fullPath, true);
+    fsout.write(bytes);
+    fsout.close();
+  }
+
+  private static void createDeltaCommitFile(
+      HoodieCommitMetadata deltaCommitMetadata, String deltaCommitTime)
+      throws IOException {
+    byte[] bytes = deltaCommitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8);
+    Path fullPath = new Path(
+        hiveSyncConfig.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline
+            .makeDeltaFileName(deltaCommitTime));
+    FSDataOutputStream fsout = fileSystem.create(fullPath, true);
+    fsout.write(bytes);
+    fsout.close();
+  }
+}
--- a/hoodie-hive/src/test/java/com/uber/hoodie/hive/util/CsvParquetWriter.java
+++ b/hoodie-hive/src/test/java/com/uber/hoodie/hive/util/CsvParquetWriter.java
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *           http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.uber.hoodie.hive.util;
-
-
-import org.apache.hadoop.fs.Path;
-import parquet.hadoop.ParquetWriter;
-import parquet.hadoop.metadata.CompressionCodecName;
-import parquet.schema.MessageType;
-
-import java.io.IOException;
-import java.util.List;
-
-public class CsvParquetWriter extends ParquetWriter<List<String>> {
-
-    public CsvParquetWriter(Path file, MessageType schema) throws IOException {
-        this(file, schema, false);
-    }
-
-    public CsvParquetWriter(Path file, MessageType schema, boolean enableDictionary)
-        throws IOException {
-        this(file, schema, CompressionCodecName.UNCOMPRESSED, enableDictionary);
-    }
-
-    public CsvParquetWriter(Path file, MessageType schema, CompressionCodecName codecName,
-        boolean enableDictionary) throws IOException {
-        super(file, new CsvWriteSupport(schema), codecName,
-            DEFAULT_BLOCK_SIZE, DEFAULT_PAGE_SIZE, enableDictionary, false);
-    }
-}
--- a/hoodie-hive/src/test/java/com/uber/hoodie/hive/util/CsvWriteSupport.java
+++ b/hoodie-hive/src/test/java/com/uber/hoodie/hive/util/CsvWriteSupport.java
@@ -1,94 +0,0 @@
-/*
- * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *           http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.uber.hoodie.hive.util;
-
-import org.apache.hadoop.conf.Configuration;
-import parquet.column.ColumnDescriptor;
-import parquet.hadoop.api.WriteSupport;
-import parquet.io.ParquetEncodingException;
-import parquet.io.api.Binary;
-import parquet.io.api.RecordConsumer;
-import parquet.schema.MessageType;
-
-import java.util.HashMap;
-import java.util.List;
-
-public class CsvWriteSupport extends WriteSupport<List<String>> {
-    MessageType schema;
-    RecordConsumer recordConsumer;
-    List<ColumnDescriptor> cols;
-
-    // TODO: support specifying encodings and compression
-    public CsvWriteSupport(MessageType schema) {
-        this.schema = schema;
-        this.cols = schema.getColumns();
-    }
-
-    @Override public WriteContext init(Configuration config) {
-        return new WriteContext(schema, new HashMap<String, String>());
-    }
-
-    @Override public void prepareForWrite(RecordConsumer r) {
-        recordConsumer = r;
-    }
-
-    @Override public void write(List<String> values) {
-        if (values.size() != cols.size()) {
-            throw new ParquetEncodingException("Invalid input data. Expecting " +
-                cols.size() + " columns. Input had " + values.size() + " columns (" + cols + ") : "
-                + values);
-        }
-
-        recordConsumer.startMessage();
-        for (int i = 0; i < cols.size(); ++i) {
-            String val = values.get(i);
-            // val.length() == 0 indicates a NULL value.
-            if (val.length() > 0) {
-                recordConsumer.startField(cols.get(i).getPath()[0], i);
-                switch (cols.get(i).getType()) {
-                    case BOOLEAN:
-                        recordConsumer.addBoolean(Boolean.parseBoolean(val));
-                        break;
-                    case FLOAT:
-                        recordConsumer.addFloat(Float.parseFloat(val));
-                        break;
-                    case DOUBLE:
-                        recordConsumer.addDouble(Double.parseDouble(val));
-                        break;
-                    case INT32:
-                        recordConsumer.addInteger(Integer.parseInt(val));
-                        break;
-                    case INT64:
-                        recordConsumer.addLong(Long.parseLong(val));
-                        break;
-                    case BINARY:
-                        recordConsumer.addBinary(stringToBinary(val));
-                        break;
-                    default:
-                        throw new ParquetEncodingException(
-                            "Unsupported column type: " + cols.get(i).getType());
-                }
-                recordConsumer.endField(cols.get(i).getPath()[0], i);
-            }
-        }
-        recordConsumer.endMessage();
-    }
-
-    private Binary stringToBinary(Object value) {
-        return Binary.fromString(value.toString());
-    }
-}
--- a/hoodie-hive/src/test/java/com/uber/hoodie/hive/util/TestUtil.java
+++ b/hoodie-hive/src/test/java/com/uber/hoodie/hive/util/TestUtil.java
@@ -1,201 +0,0 @@
-/*
- * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *           http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.uber.hoodie.hive.util;
-
-import com.google.common.collect.Sets;
-import com.uber.hoodie.common.minicluster.HdfsTestService;
-import com.uber.hoodie.common.minicluster.ZookeeperTestService;
-import com.uber.hoodie.hive.HoodieHiveConfiguration;
-import com.uber.hoodie.hive.client.HoodieHiveClient;
-import com.uber.hoodie.hive.model.HoodieDatasetReference;
-import org.apache.commons.io.FileUtils;
-import org.apache.commons.io.IOUtils;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataOutputStream;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hdfs.MiniDFSCluster;
-import org.apache.hive.service.server.HiveServer2;
-import org.apache.zookeeper.server.ZooKeeperServer;
-import org.joda.time.DateTime;
-import org.joda.time.format.DateTimeFormat;
-import org.joda.time.format.DateTimeFormatter;
-import org.junit.runners.model.InitializationError;
-import parquet.schema.MessageType;
-import parquet.schema.MessageTypeParser;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.util.Arrays;
-import java.util.Set;
-import java.util.regex.Pattern;
-
-public class TestUtil {
-    private static MiniDFSCluster dfsCluster;
-    private static ZooKeeperServer zkServer;
-    private static HiveServer2 hiveServer;
-    public static Configuration configuration;
-    public static HoodieHiveConfiguration hDroneConfiguration;
-    private static DateTimeFormatter dtfOut;
-    public static final String CSV_DELIMITER = "|";
-    private static FileSystem fileSystem;
-    private static Set<String> createdTablesSet = Sets.newHashSet();
-
-    public static void setUp() throws IOException, InterruptedException {
-        if (dfsCluster == null) {
-            HdfsTestService service = new HdfsTestService();
-            dfsCluster = service.start(true);
-            configuration = service.getHadoopConf();
-        }
-        if (zkServer == null) {
-            ZookeeperTestService zkService = new ZookeeperTestService(configuration);
-            zkServer = zkService.start();
-        }
-        if (hiveServer == null) {
-            HiveTestService hiveService = new HiveTestService(configuration);
-            hiveServer = hiveService.start();
-        }
-        hDroneConfiguration =
-            HoodieHiveConfiguration.newBuilder().hiveJdbcUrl("jdbc:hive2://127.0.0.1:9999/")
-                .hivedb("hdrone_test").jdbcUsername("").jdbcPassword("")
-                .hadoopConfiguration(hiveServer.getHiveConf()).build();
-        dtfOut = DateTimeFormat.forPattern("yyyy/MM/dd");
-
-        HoodieHiveClient client = new HoodieHiveClient(hDroneConfiguration);
-        for (String tableName : createdTablesSet) {
-            client.updateHiveSQL("drop table if exists " + tableName);
-        }
-        createdTablesSet.clear();
-        client.updateHiveSQL(
-            "drop database if exists " + hDroneConfiguration.getDbName());
-        client.updateHiveSQL("create database " + hDroneConfiguration.getDbName());
-
-        fileSystem = FileSystem.get(configuration);
-    }
-
-    public static void shutdown() {
-        if (hiveServer != null) {
-            hiveServer.stop();
-        }
-        if (dfsCluster != null) {
-            dfsCluster.shutdown();
-        }
-        if (zkServer != null) {
-            zkServer.shutdown();
-        }
-    }
-
-    public static HoodieDatasetReference createDataset(String tableName, String hdfsPath, int numberOfPartitions,
-        String schemaFile) throws IOException, InitializationError {
-        Path path = new Path(hdfsPath);
-        FileUtils.deleteDirectory(new File(hdfsPath));
-
-        boolean result = fileSystem.mkdirs(path);
-        checkResult(result);
-        HoodieDatasetReference metadata =
-            new HoodieDatasetReference(tableName, path.toString(),
-                hDroneConfiguration.getDbName());
-        DateTime dateTime = DateTime.now();
-        createPartitions(metadata, numberOfPartitions, schemaFile, dateTime, 1);
-        createdTablesSet.add(metadata.getDatabaseTableName());
-        return metadata;
-    }
-
-    private static void createPartitions(HoodieDatasetReference metadata, int numberOfPartitions,
-        String schemaFile, DateTime startFrom, int schemaVersion) throws IOException {
-        startFrom = startFrom.withTimeAtStartOfDay();
-
-        for (int i = 0; i < numberOfPartitions; i++) {
-            Path partPath = new Path(metadata.getBaseDatasetPath() + "/" + dtfOut.print(startFrom));
-            fileSystem.makeQualified(partPath);
-            fileSystem.mkdirs(partPath);
-            createTestData(partPath, schemaFile, schemaVersion);
-            startFrom = startFrom.minusDays(1);
-        }
-    }
-
-    private static void createTestData(Path partPath, String schemaFile, int schemaVersion)
-        throws IOException {
-        for (int i = 0; i < 5; i++) {
-            // Create 5 files
-            Path filePath =
-                new Path(partPath.toString() + "/" + getParquetFilePath(schemaVersion, i));
-            generateParquetData(filePath, schemaFile);
-        }
-    }
-
-    private static String getParquetFilePath(int version, int iteration) {
-        return "test.topic.name@sjc1@SV_" + version + "@" + iteration + ".parquet";
-    }
-
-    public static MessageType readSchema(String schemaFile) throws IOException {
-        return MessageTypeParser
-            .parseMessageType(IOUtils.toString(TestUtil.class.getResourceAsStream(schemaFile)));
-    }
-
-    public static void generateParquetData(Path filePath, String schemaFile) throws IOException {
-        MessageType schema = readSchema(schemaFile);
-        CsvParquetWriter writer = new CsvParquetWriter(filePath, schema);
-
-        BufferedReader br = new BufferedReader(
-            new InputStreamReader(TestUtil.class.getResourceAsStream(getDataFile(schemaFile))));
-        String line;
-        try {
-            while ((line = br.readLine()) != null) {
-                String[] fields = line.split(Pattern.quote(CSV_DELIMITER));
-                writer.write(Arrays.asList(fields));
-            }
-            writer.close();
-        } finally {
-            br.close();
-        }
-
-        InputStreamReader io = null;
-        FSDataOutputStream hdfsPath = null;
-        try {
-            io = new FileReader(filePath.toString());
-            hdfsPath = fileSystem.create(filePath);
-            IOUtils.copy(io, hdfsPath);
-        } finally {
-            if (io != null) {
-                io.close();
-            }
-            if (hdfsPath != null) {
-                hdfsPath.close();
-            }
-        }
-    }
-
-    private static String getDataFile(String schemaFile) {
-        return schemaFile.replaceAll(".schema", ".csv");
-    }
-
-    private static void checkResult(boolean result) throws InitializationError {
-        if (!result) {
-            throw new InitializationError("Could not initialize");
-        }
-    }
-
-    public static void evolveDataset(HoodieDatasetReference metadata, int newPartitionCount,
-        String newSchema, Long startFrom, int schemaVersion) throws IOException {
-        createPartitions(metadata, newPartitionCount, newSchema,
-            new DateTime(startFrom).plusDays(newPartitionCount + 1), schemaVersion);
-    }
-}
--- a/hoodie-hive/src/test/resources/nation.csv
+++ b/hoodie-hive/src/test/resources/nation.csv
@@ -1,25 +0,0 @@
-0|ALGERIA|0| haggle. carefully final deposits detect slyly agai
-1|ARGENTINA|1|al foxes promise slyly according to the regular accounts. bold requests alon
-2|BRAZIL|1|y alongside of the pending deposits. carefully special packages are about the ironic forges. slyly special 
-3|CANADA|1|eas hang ironic, silent packages. slyly regular packages are furiously over the tithes. fluffily bold
-4|EGYPT|4|y above the carefully unusual theodolites. final dugouts are quickly across the furiously regular d
-5|ETHIOPIA|0|ven packages wake quickly. regu
-6|FRANCE|3|refully final requests. regular, ironi
-7|GERMANY|3|l platelets. regular accounts x-ray: unusual, regular acco
-8|INDIA|2|ss excuses cajole slyly across the packages. deposits print aroun
-9|INDONESIA|2| slyly express asymptotes. regular deposits haggle slyly. carefully ironic hockey players sleep blithely. carefull
-10|IRAN|4|efully alongside of the slyly final dependencies. 
-11|IRAQ|4|nic deposits boost atop the quickly final requests? quickly regula
-12|JAPAN|2|ously. final, express gifts cajole a
-13|JORDAN|4|ic deposits are blithely about the carefully regular pa
-14|KENYA|0| pending excuses haggle furiously deposits. pending, express pinto beans wake fluffily past t
-15|MOROCCO|0|rns. blithely bold courts among the closely regular packages use furiously bold platelets?
-16|MOZAMBIQUE|0|s. ironic, unusual asymptotes wake blithely r
-17|PERU|1|platelets. blithely pending dependencies use fluffily across the even pinto beans. carefully silent accoun
-18|CHINA|2|c dependencies. furiously express notornis sleep slyly regular accounts. ideas sleep. depos
-19|ROMANIA|3|ular asymptotes are about the furious multipliers. express dependencies nag above the ironically ironic account
-20|SAUDI ARABIA|4|ts. silent requests haggle. closely express packages sleep across the blithely
-21|VIETNAM|2|hely enticingly express accounts. even, final 
-22|RUSSIA|3| requests against the platelets use never according to the quickly regular pint
-23|UNITED KINGDOM|3|eans boost carefully special requests. accounts are. carefull
-24|UNITED STATES|1|y final packages. slow foxes cajole quickly. quickly silent platelets breach ironic accounts. unusual pinto be
--- a/hoodie-hive/src/test/resources/nation.schema
+++ b/hoodie-hive/src/test/resources/nation.schema
@@ -1,6 +0,0 @@
-message m {
-  required int32 nation_key;
-  required binary name;
-  required int32 region_key;
-  required binary comment_col;
-}
--- a/hoodie-hive/src/test/resources/nation_evolved.csv
+++ b/hoodie-hive/src/test/resources/nation_evolved.csv
@@ -1,25 +0,0 @@
-0|ALGERIA|0| haggle. carefully final deposits detect slyly agai|desc0
-1|ARGENTINA|1|al foxes promise slyly according to the regular accounts. bold requests alon|desc1
-2|BRAZIL|1|y alongside of the pending deposits. carefully special packages are about the ironic forges. slyly special |desc2
-3|CANADA|1|eas hang ironic, silent packages. slyly regular packages are furiously over the tithes. fluffily bold|desc3
-4|EGYPT|4|y above the carefully unusual theodolites. final dugouts are quickly across the furiously regular d|desc4
-5|ETHIOPIA|0|ven packages wake quickly. regu|desc5
-6|FRANCE|3|refully final requests. regular, ironi|desc6
-7|GERMANY|3|l platelets. regular accounts x-ray: unusual, regular acco|desc7
-8|INDIA|2|ss excuses cajole slyly across the packages. deposits print aroun|desc8
-9|INDONESIA|2| slyly express asymptotes. regular deposits haggle slyly. carefully ironic hockey players sleep blithely. carefull|desc9
-10|IRAN|4|efully alongside of the slyly final dependencies. |desc10
-11|IRAQ|4|nic deposits boost atop the quickly final requests? quickly regula|desc11
-12|JAPAN|2|ously. final, express gifts cajole a|desc12
-13|JORDAN|4|ic deposits are blithely about the carefully regular pa|desc13
-14|KENYA|0| pending excuses haggle furiously deposits. pending, express pinto beans wake fluffily past t|desc14
-15|MOROCCO|0|rns. blithely bold courts among the closely regular packages use furiously bold platelets?|desc15
-16|MOZAMBIQUE|0|s. ironic, unusual asymptotes wake blithely r|desc16
-17|PERU|1|platelets. blithely pending dependencies use fluffily across the even pinto beans. carefully silent accoun|desc17
-18|CHINA|2|c dependencies. furiously express notornis sleep slyly regular accounts. ideas sleep. depos|desc18
-19|ROMANIA|3|ular asymptotes are about the furious multipliers. express dependencies nag above the ironically ironic account|desc19
-20|SAUDI ARABIA|4|ts. silent requests haggle. closely express packages sleep across the blithely|desc20
-21|VIETNAM|2|hely enticingly express accounts. even, final |desc21
-22|RUSSIA|3| requests against the platelets use never according to the quickly regular pint|desc22
-23|UNITED KINGDOM|3|eans boost carefully special requests. accounts are. carefull|desc23
-24|UNITED STATES|1|y final packages. slow foxes cajole quickly. quickly silent platelets breach ironic accounts. unusual pinto be|desc24
--- a/hoodie-hive/src/test/resources/nation_evolved.schema
+++ b/hoodie-hive/src/test/resources/nation_evolved.schema
@@ -1,7 +0,0 @@
-message m {
-  required int32 nation_key;
-  required binary name;
-  required int64 region_key;
-  required binary comment_col;
-  optional binary desc;
-}