From 3a0044216cb2f707639d48e2869f4ee6f25cfc19 Mon Sep 17 00:00:00 2001 From: Balaji Varadarajan Date: Wed, 10 Oct 2018 10:31:34 -0700 Subject: [PATCH] New Features in DeltaStreamer : (1) Apply transformation when using delta-streamer to ingest data. (2) Add Hudi Incremental Source for Delta Streamer (3) Allow delta-streamer config-property to be passed as command-line (4) Add Hive Integration to Delta-Streamer and address Review comments (5) Ensure MultiPartKeysValueExtractor handle hive style partition description (6) Reuse same spark session on both source and transformer (7) Support extracting partition fields from _hoodie_partition_path for HoodieIncrSource (8) Reuse Binary Avro coders (9) Add push down filter for Incremental source (10) Add Hoodie DeltaStreamer metrics to track total time taken --- docs/incremental_processing.md | 8 +- docs/quickstart.md | 4 +- .../cli/commands/ArchivedCommitsCommand.java | 88 +++++++ hoodie-client/pom.xml | 42 +--- .../uber/hoodie/metrics/HoodieMetrics.java | 16 +- .../java/com/uber/hoodie/metrics/Metrics.java | 16 ++ .../common/HoodieTestDataGenerator.java | 5 +- .../uber/hoodie/io/TestHoodieCompactor.java | 3 +- .../hoodie/metrics/TestHoodieMetrics.java | 3 +- .../hoodie/table/TestMergeOnReadTable.java | 4 +- .../src/main/avro/HoodieCommitMetadata.avsc | 42 ++-- .../hoodie/common/model/HoodieRecord.java | 10 + .../util/DFSPropertiesConfiguration.java | 18 +- .../hoodie/common/util/HoodieAvroUtils.java | 12 +- .../hoodie/common/util/TypedProperties.java | 10 + .../util/collection/TestDiskBasedMap.java | 209 +++++++++++++++ hoodie-hadoop-mr/pom.xml | 70 ++---- hoodie-hive/pom.xml | 84 ++----- .../uber/hoodie/hive/HoodieHiveClient.java | 26 +- .../hive/MultiPartKeysValueExtractor.java | 12 +- .../hoodie/hive/PartitionValueExtractor.java | 3 +- ...lashEncodedDayPartitionValueExtractor.java | 11 +- hoodie-spark/pom.xml | 87 ++----- .../java/com/uber/hoodie/DataSourceUtils.java | 40 +++ 
.../com/uber/hoodie/AvroConversionUtils.scala | 206 ++++++++++++++- .../com/uber/hoodie/DataSourceOptions.scala | 9 +- .../com/uber/hoodie/IncrementalRelation.scala | 37 ++- .../src/test/scala/DataSourceTest.scala | 1 - hoodie-utilities/pom.xml | 138 +++++----- .../uber/hoodie/utilities/HoodieCleaner.java | 115 +++++++++ .../uber/hoodie/utilities/UtilHelpers.java | 33 ++- .../deltastreamer/HoodieDeltaStreamer.java | 230 ++++++++++++++--- .../HoodieDeltaStreamerMetrics.java | 61 +++++ .../deltastreamer/SourceFormatAdapter.java | 112 +++++++++ .../schema/RowBasedSchemaProvider.java | 25 ++ .../schema/SchemaRegistryProvider.java | 28 ++- .../utilities/sources/AvroDFSSource.java | 26 +- .../utilities/sources/AvroKafkaSource.java | 42 +++- .../hoodie/utilities/sources/AvroSource.java | 36 +++ .../utilities/sources/HiveIncrPullSource.java | 25 +- .../utilities/sources/HoodieIncrSource.java | 144 +++++++++++ .../hoodie/utilities/sources/InputBatch.java | 54 ++++ .../utilities/sources/JsonDFSSource.java | 28 ++- .../utilities/sources/JsonKafkaSource.java | 37 ++- .../hoodie/utilities/sources/JsonSource.java | 35 +++ .../hoodie/utilities/sources/RowSource.java | 51 ++++ .../uber/hoodie/utilities/sources/Source.java | 69 +++-- .../sources/{ => helpers}/AvroConvertor.java | 6 +- .../DFSPathSelector.java} | 33 +-- .../sources/helpers/IncrSourceHelper.java | 88 +++++++ .../KafkaOffsetGen.java} | 62 ++--- .../transform/IdentityTransformer.java | 37 +++ .../transform/SqlQueryBasedTransformer.java | 66 +++++ .../utilities/transform/Transformer.java | 43 ++++ .../utilities/TestHoodieDeltaStreamer.java | 238 +++++++++++++++++- .../hoodie/utilities/UtilitiesTestBase.java | 60 +++++ .../utilities/sources/TestDFSSource.java | 53 ++-- .../utilities/sources/TestDataSource.java | 16 +- .../utilities/sources/TestKafkaSource.java | 51 ++-- .../sql-transformer.properties | 19 ++ .../delta-streamer-config/target.avsc | 37 +++ packaging/hoodie-hadoop-mr-bundle/pom.xml | 154 ++++-------- 
packaging/hoodie-hive-bundle/pom.xml | 89 ++----- packaging/hoodie-spark-bundle/pom.xml | 89 ++----- pom.xml | 157 ++++-------- 65 files changed, 2752 insertions(+), 911 deletions(-) create mode 100644 hoodie-common/src/test/java/com/uber/hoodie/common/util/collection/TestDiskBasedMap.java create mode 100644 hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HoodieCleaner.java create mode 100644 hoodie-utilities/src/main/java/com/uber/hoodie/utilities/deltastreamer/HoodieDeltaStreamerMetrics.java create mode 100644 hoodie-utilities/src/main/java/com/uber/hoodie/utilities/deltastreamer/SourceFormatAdapter.java create mode 100644 hoodie-utilities/src/main/java/com/uber/hoodie/utilities/schema/RowBasedSchemaProvider.java create mode 100644 hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroSource.java create mode 100644 hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/HoodieIncrSource.java create mode 100644 hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/InputBatch.java create mode 100644 hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/JsonSource.java create mode 100644 hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/RowSource.java rename hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/{ => helpers}/AvroConvertor.java (93%) rename hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/{DFSSource.java => helpers/DFSPathSelector.java} (76%) create mode 100644 hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/helpers/IncrSourceHelper.java rename hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/{KafkaSource.java => helpers/KafkaOffsetGen.java} (84%) create mode 100644 hoodie-utilities/src/main/java/com/uber/hoodie/utilities/transform/IdentityTransformer.java create mode 100644 hoodie-utilities/src/main/java/com/uber/hoodie/utilities/transform/SqlQueryBasedTransformer.java create mode 100644 
hoodie-utilities/src/main/java/com/uber/hoodie/utilities/transform/Transformer.java create mode 100644 hoodie-utilities/src/test/resources/delta-streamer-config/sql-transformer.properties create mode 100644 hoodie-utilities/src/test/resources/delta-streamer-config/target.avsc diff --git a/docs/incremental_processing.md b/docs/incremental_processing.md index 973875a0b..9a65ccfe1 100644 --- a/docs/incremental_processing.md +++ b/docs/incremental_processing.md @@ -85,8 +85,12 @@ Usage:
[options] exist first time around. If exists, expected to be a hoodie dataset) * --target-table name of the target table in Hive - - + --transformer-class + subclass of com.uber.hoodie.utilities.transform.Transformer. UDF to + transform raw source dataset to a target dataset (conforming to target + schema) before writing. Default : Not set. E:g - + com.uber.hoodie.utilities.transform.SqlQueryBasedTransformer (which + allows a SQL query template to be passed as a transformation function) ``` diff --git a/docs/quickstart.md b/docs/quickstart.md index 41ec9a96d..bb7be6edd 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -33,7 +33,7 @@ Hoodie requires Java 8 to be installed. Hoodie works with Spark-2.x versions. We | Hadoop | Hive | Spark | Instructions to Build Hoodie | | ---- | ----- | ---- | ---- | -| 2.6.0-cdh5.7.2 | 1.1.0-cdh5.7.2 | spark-2.[1-3].x | Use "mvn clean install -DskipTests -Dhive11". Jars will have ".hive11" as suffix | +| 2.6.0-cdh5.7.2 | 1.1.0-cdh5.7.2 | spark-2.[1-3].x | Use "mvn clean install -DskipTests -Dhadoop.version=2.6.0-cdh5.7.2 -Dhive.version=1.1.0-cdh5.7.2" | | Apache hadoop-2.8.4 | Apache hive-2.3.3 | spark-2.[1-3].x | Use "mvn clean install -DskipTests" | | Apache hadoop-2.7.3 | Apache hive-1.2.1 | spark-2.[1-3].x | Use "mvn clean install -DskipTests" | @@ -1244,4 +1244,4 @@ cd docker [INFO] Finished at: 2018-09-10T17:47:37-07:00 [INFO] Final Memory: 236M/1848M [INFO] ------------------------------------------------------------------------ -``` \ No newline at end of file +``` diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/ArchivedCommitsCommand.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/ArchivedCommitsCommand.java index 5f4d5de0d..49fcc5919 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/ArchivedCommitsCommand.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/ArchivedCommitsCommand.java @@ -17,6 +17,7 @@ package com.uber.hoodie.cli.commands; import 
com.uber.hoodie.avro.model.HoodieArchivedMetaEntry; +import com.uber.hoodie.avro.model.HoodieCommitMetadata; import com.uber.hoodie.cli.HoodieCLI; import com.uber.hoodie.cli.HoodiePrintHelper; import com.uber.hoodie.cli.TableHeader; @@ -32,6 +33,7 @@ import java.util.List; import java.util.stream.Collectors; import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; +import org.apache.avro.specific.SpecificData; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; import org.springframework.shell.core.CommandMarker; @@ -48,6 +50,92 @@ public class ArchivedCommitsCommand implements CommandMarker { return HoodieCLI.tableMetadata != null; } + @CliCommand(value = "show archived commit stats", help = "Read commits from archived files and show details") + public String showArchivedCommits( + @CliOption(key = {"archiveFolderPattern"}, help = "Archive Folder", unspecifiedDefaultValue = "") String folder, + @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit, + @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, + @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, + @CliOption(key = { + "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly) + throws IOException { + System.out.println("===============> Showing only " + limit + " archived commits <==============="); + String basePath = HoodieCLI.tableMetadata.getBasePath(); + Path archivePath = new Path(basePath + "/.hoodie/.commits_.archive*"); + if (folder != null && !folder.isEmpty()) { + archivePath = new Path(basePath + "/.hoodie/" + folder); + } + FileStatus[] fsStatuses = FSUtils.getFs(basePath, HoodieCLI.conf).globStatus(archivePath); + List allStats = new ArrayList<>(); + for (FileStatus fs : fsStatuses) { + //read the archived file + 
HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(basePath, HoodieCLI.conf), + new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema()); + + List readRecords = new ArrayList<>(); + //read the avro blocks + while (reader.hasNext()) { + HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next(); + List records = blk.getRecords(); + readRecords.addAll(records); + } + List readCommits = readRecords.stream().map(r -> (GenericRecord) r) + .filter(r -> r.get("actionType").toString().equals(HoodieTimeline.COMMIT_ACTION) + || r.get("actionType").toString().equals(HoodieTimeline.DELTA_COMMIT_ACTION)) + .flatMap(r -> { + HoodieCommitMetadata metadata = + (HoodieCommitMetadata) SpecificData.get().deepCopy(HoodieCommitMetadata.SCHEMA$, + r.get("hoodieCommitMetadata")); + final String instantTime = r.get("commitTime").toString(); + final String action = r.get("actionType").toString(); + return metadata.getPartitionToWriteStats().values().stream().flatMap(hoodieWriteStats -> { + return hoodieWriteStats.stream().map(hoodieWriteStat -> { + List row = new ArrayList<>(); + row.add(action); + row.add(instantTime); + row.add(hoodieWriteStat.getPartitionPath()); + row.add(hoodieWriteStat.getFileId()); + row.add(hoodieWriteStat.getPrevCommit()); + row.add(hoodieWriteStat.getNumWrites()); + row.add(hoodieWriteStat.getNumInserts()); + row.add(hoodieWriteStat.getNumDeletes()); + row.add(hoodieWriteStat.getNumUpdateWrites()); + row.add(hoodieWriteStat.getTotalLogFiles()); + row.add(hoodieWriteStat.getTotalLogBlocks()); + row.add(hoodieWriteStat.getTotalCorruptLogBlock()); + row.add(hoodieWriteStat.getTotalRollbackBlocks()); + row.add(hoodieWriteStat.getTotalLogRecords()); + row.add(hoodieWriteStat.getTotalUpdatedRecordsCompacted()); + row.add(hoodieWriteStat.getTotalWriteBytes()); + row.add(hoodieWriteStat.getTotalWriteErrors()); + return row; + }); + }).map(rowList -> rowList.toArray(new Comparable[0])); + }).collect(Collectors.toList()); + 
allStats.addAll(readCommits); + reader.close(); + } + TableHeader header = new TableHeader().addTableHeaderField("action") + .addTableHeaderField("instant") + .addTableHeaderField("partition") + .addTableHeaderField("file_id") + .addTableHeaderField("prev_instant") + .addTableHeaderField("num_writes") + .addTableHeaderField("num_inserts") + .addTableHeaderField("num_deletes") + .addTableHeaderField("num_update_writes") + .addTableHeaderField("total_log_files") + .addTableHeaderField("total_log_blocks") + .addTableHeaderField("total_corrupt_log_blocks") + .addTableHeaderField("total_rollback_blocks") + .addTableHeaderField("total_log_records") + .addTableHeaderField("total_updated_records_compacted") + .addTableHeaderField("total_write_bytes") + .addTableHeaderField("total_write_errors"); + + return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, allStats); + } + @CliCommand(value = "show archived commits", help = "Read commits from archived files and show details") public String showCommits( @CliOption(key = {"skipMetadata"}, help = "Skip displaying commit metadata", unspecifiedDefaultValue = "true") diff --git a/hoodie-client/pom.xml b/hoodie-client/pom.xml index eccfa802c..c813ef24a 100644 --- a/hoodie-client/pom.xml +++ b/hoodie-client/pom.xml @@ -171,6 +171,13 @@ spark-sql_2.11 + + ${hive.groupid} + hive-exec + ${hive.version} + test + + org.apache.hbase hbase-client @@ -218,39 +225,4 @@ - - - hive12 - - - !hive11 - - - - - ${hive12.groupid} - hive-exec - ${hive12.version} - test - - - - - hive11 - - - hive11 - - - - - ${hive11.groupid} - hive-exec - ${hive11.version} - test - - - - - diff --git a/hoodie-client/src/main/java/com/uber/hoodie/metrics/HoodieMetrics.java b/hoodie-client/src/main/java/com/uber/hoodie/metrics/HoodieMetrics.java index 5adf45fe0..625e73313 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/metrics/HoodieMetrics.java +++ 
b/hoodie-client/src/main/java/com/uber/hoodie/metrics/HoodieMetrics.java @@ -16,8 +16,8 @@ package com.uber.hoodie.metrics; -import com.codahale.metrics.Gauge; -import com.codahale.metrics.MetricRegistry; +import static com.uber.hoodie.metrics.Metrics.registerGauge; + import com.codahale.metrics.Timer; import com.google.common.annotations.VisibleForTesting; import com.uber.hoodie.common.model.HoodieCommitMetadata; @@ -177,18 +177,6 @@ public class HoodieMetrics { return config == null ? null : String.format("%s.%s.%s", tableName, action, metric); } - void registerGauge(String metricName, final long value) { - try { - MetricRegistry registry = Metrics.getInstance().getRegistry(); - registry.register(metricName, (Gauge) () -> value); - } catch (Exception e) { - // Here we catch all exception, so the major upsert pipeline will not be affected if the - // metrics system - // has some issues. - logger.error("Failed to send metrics: ", e); - } - } - /** * By default, the timer context returns duration with nano seconds. Convert it to millisecond. */ diff --git a/hoodie-client/src/main/java/com/uber/hoodie/metrics/Metrics.java b/hoodie-client/src/main/java/com/uber/hoodie/metrics/Metrics.java index 924257493..e3fcc98c0 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/metrics/Metrics.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/metrics/Metrics.java @@ -16,17 +16,21 @@ package com.uber.hoodie.metrics; +import com.codahale.metrics.Gauge; import com.codahale.metrics.MetricRegistry; import com.google.common.io.Closeables; import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.exception.HoodieException; import java.io.Closeable; import org.apache.commons.configuration.ConfigurationException; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; /** * This is the main class of the metrics system. 
*/ public class Metrics { + private static Logger logger = LogManager.getLogger(Metrics.class); private static volatile boolean initialized = false; private static Metrics metrics = null; @@ -72,6 +76,18 @@ public class Metrics { initialized = true; } + public static void registerGauge(String metricName, final long value) { + try { + MetricRegistry registry = Metrics.getInstance().getRegistry(); + registry.register(metricName, (Gauge) () -> value); + } catch (Exception e) { + // Here we catch all exception, so the major upsert pipeline will not be affected if the + // metrics system + // has some issues. + logger.error("Failed to send metrics: ", e); + } + } + public MetricRegistry getRegistry() { return registry; } diff --git a/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieTestDataGenerator.java b/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieTestDataGenerator.java index 06d66aeb4..d1713c5e4 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieTestDataGenerator.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieTestDataGenerator.java @@ -75,7 +75,9 @@ public class HoodieTestDataGenerator { + "{\"name\": \"end_lat\", \"type\": \"double\"}," + "{\"name\": \"end_lon\", \"type\": \"double\"}," + "{\"name\":\"fare\",\"type\": \"double\"}]}"; - public static Schema avroSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA)); + public static Schema avroSchema = new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA); + public static Schema avroSchemaWithMetadataFields = HoodieAvroUtils.addMetadataFields(avroSchema); + private static Random rand = new Random(46474747); private List existingKeysList = new ArrayList<>(); @@ -100,7 +102,6 @@ public class HoodieTestDataGenerator { */ public static TestRawTripPayload generateRandomValue(HoodieKey key, String commitTime) throws IOException { GenericRecord rec = generateGenericRecord(key.getRecordKey(), "rider-" + commitTime, "driver-" + 
commitTime, 0.0); - HoodieAvroUtils.addCommitMetadataToRecord(rec, commitTime, "-1"); return new TestRawTripPayload(rec.toString(), key.getRecordKey(), key.getPartitionPath(), TRIP_EXAMPLE_SCHEMA); } diff --git a/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCompactor.java b/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCompactor.java index e1a6d66dc..ac171d4dc 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCompactor.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCompactor.java @@ -160,7 +160,8 @@ public class TestHoodieCompactor { // Write them to corresponding avro logfiles HoodieTestUtils - .writeRecordsToLogFiles(fs, metaClient.getBasePath(), HoodieTestDataGenerator.avroSchema, updatedRecords); + .writeRecordsToLogFiles(fs, metaClient.getBasePath(), HoodieTestDataGenerator.avroSchemaWithMetadataFields, + updatedRecords); // Verify that all data file has one log file metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); diff --git a/hoodie-client/src/test/java/com/uber/hoodie/metrics/TestHoodieMetrics.java b/hoodie-client/src/test/java/com/uber/hoodie/metrics/TestHoodieMetrics.java index cb1a43969..c562b308a 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/metrics/TestHoodieMetrics.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/metrics/TestHoodieMetrics.java @@ -16,6 +16,7 @@ package com.uber.hoodie.metrics; +import static com.uber.hoodie.metrics.Metrics.registerGauge; import static org.junit.Assert.assertTrue; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -39,7 +40,7 @@ public class TestHoodieMetrics { @Test public void testRegisterGauge() { - metrics.registerGauge("metric1", 123L); + registerGauge("metric1", 123L); assertTrue(Metrics.getInstance().getRegistry().getGauges().get("metric1").getValue().toString().equals("123")); } } diff --git a/hoodie-client/src/test/java/com/uber/hoodie/table/TestMergeOnReadTable.java 
b/hoodie-client/src/test/java/com/uber/hoodie/table/TestMergeOnReadTable.java index f651b8ef1..93227c49c 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/table/TestMergeOnReadTable.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/table/TestMergeOnReadTable.java @@ -671,8 +671,8 @@ public class TestMergeOnReadTable { // Write them to corresponding avro logfiles HoodieTestUtils - .writeRecordsToLogFiles(metaClient.getFs(), metaClient.getBasePath(), HoodieTestDataGenerator.avroSchema, - updatedRecords); + .writeRecordsToLogFiles(metaClient.getFs(), metaClient.getBasePath(), + HoodieTestDataGenerator.avroSchemaWithMetadataFields, updatedRecords); // Verify that all data file has one log file metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); diff --git a/hoodie-common/src/main/avro/HoodieCommitMetadata.avsc b/hoodie-common/src/main/avro/HoodieCommitMetadata.avsc index 3f4473200..c46825cdc 100644 --- a/hoodie-common/src/main/avro/HoodieCommitMetadata.avsc +++ b/hoodie-common/src/main/avro/HoodieCommitMetadata.avsc @@ -15,47 +15,58 @@ "fields":[ { "name":"fileId", - "type":["null","string"] + "type":["null","string"], + "default" : null }, { "name":"path", - "type":["null","string"] + "type":["null","string"], + "default" : null }, { "name":"prevCommit", - "type":["null","string"] + "type":["null","string"], + "default" : null }, { "name":"numWrites", - "type":["null","long"] + "type":["null","long"], + "default" : null }, { "name":"numDeletes", - "type":["null","long"] + "type":["null","long"], + "default" : null }, { "name":"numUpdateWrites", - "type":["null","long"] + "type":["null","long"], + "default" : null }, { "name":"totalWriteBytes", - "type":["null","long"] + "type":["null","long"], + "default" : null }, { "name":"totalWriteErrors", - "type":["null","long"] + "type":["null","long"], + "default" : null }, { "name":"partitionPath", - "type":["null","string"] + "type":["null","string"], + "default" : null }, { 
"name":"totalLogRecords", - "type":["null","long"] + "type":["null","long"], + "default" : null }, { "name":"totalLogFiles", - "type":["null","long"] + "type":["null","long"], + "default" : null }, { "name":"totalUpdatedRecordsCompacted", @@ -69,15 +80,18 @@ }, { "name":"totalLogBlocks", - "type":["null","long"] + "type":["null","long"], + "default" : null }, { "name":"totalCorruptLogBlock", - "type":["null","long"] + "type":["null","long"], + "default" : null }, { "name":"totalRollbackBlocks", - "type":["null","long"] + "type":["null","long"], + "default" : null } ] } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieRecord.java b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieRecord.java index cef6e60a2..857dcaaa9 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieRecord.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieRecord.java @@ -17,7 +17,9 @@ package com.uber.hoodie.common.model; import com.google.common.base.Objects; +import com.google.common.collect.ImmutableList; import java.io.Serializable; +import java.util.List; import java.util.Optional; /** @@ -31,6 +33,14 @@ public class HoodieRecord implements Serializable public static String PARTITION_PATH_METADATA_FIELD = "_hoodie_partition_path"; public static String FILENAME_METADATA_FIELD = "_hoodie_file_name"; + public static final List HOODIE_META_COLUMNS = + new ImmutableList.Builder().add(COMMIT_TIME_METADATA_FIELD) + .add(COMMIT_SEQNO_METADATA_FIELD) + .add(RECORD_KEY_METADATA_FIELD) + .add(PARTITION_PATH_METADATA_FIELD) + .add(FILENAME_METADATA_FIELD) + .build(); + /** * Identifies the record across the table */ diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/util/DFSPropertiesConfiguration.java b/hoodie-common/src/main/java/com/uber/hoodie/common/util/DFSPropertiesConfiguration.java index 758ba0fca..f8dad81b2 100644 --- 
a/hoodie-common/src/main/java/com/uber/hoodie/common/util/DFSPropertiesConfiguration.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/util/DFSPropertiesConfiguration.java @@ -73,6 +73,20 @@ public class DFSPropertiesConfiguration { } visitedFiles.add(file.getName()); BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(file))); + addProperties(reader); + } catch (IOException ioe) { + log.error("Error reading in properies from dfs", ioe); + throw new IllegalArgumentException("Cannot read properties from dfs", ioe); + } + } + + /** + * Add properties from input stream + * @param reader Buffered Reader + * @throws IOException + */ + public void addProperties(BufferedReader reader) throws IOException { + try { String line; while ((line = reader.readLine()) != null) { if (line.startsWith("#") || line.equals("") || !line.contains("=")) { @@ -85,10 +99,8 @@ public class DFSPropertiesConfiguration { props.setProperty(split[0], split[1]); } } + } finally { reader.close(); - } catch (IOException ioe) { - log.error("Error reading in properies from dfs", ioe); - throw new IllegalArgumentException("Cannot read properties from dfs", ioe); } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/util/HoodieAvroUtils.java b/hoodie-common/src/main/java/com/uber/hoodie/common/util/HoodieAvroUtils.java index ae92b00f9..b76b4aeb1 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/util/HoodieAvroUtils.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/util/HoodieAvroUtils.java @@ -37,8 +37,8 @@ import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericDatumReader; import org.apache.avro.generic.GenericDatumWriter; import org.apache.avro.generic.GenericRecord; +import org.apache.avro.io.BinaryDecoder; import org.apache.avro.io.BinaryEncoder; -import org.apache.avro.io.Decoder; import org.apache.avro.io.DecoderFactory; import org.apache.avro.io.EncoderFactory; import org.codehaus.jackson.JsonNode; 
@@ -48,6 +48,10 @@ import org.codehaus.jackson.JsonNode; */ public class HoodieAvroUtils { + private static ThreadLocal reuseEncoder = ThreadLocal.withInitial(() -> null); + + private static ThreadLocal reuseDecoder = ThreadLocal.withInitial(() -> null); + // All metadata fields are optional strings. private static final Schema METADATA_FIELD_SCHEMA = Schema.createUnion(Arrays.asList( Schema.create(Schema.Type.NULL), @@ -62,7 +66,8 @@ public class HoodieAvroUtils { GenericDatumWriter writer = new GenericDatumWriter<>(record.getSchema()); ByteArrayOutputStream out = new ByteArrayOutputStream(); - BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(out, null); + BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(out, reuseEncoder.get()); + reuseEncoder.set(encoder); writer.write(record, encoder); encoder.flush(); out.close(); @@ -73,7 +78,8 @@ public class HoodieAvroUtils { * Convert serialized bytes back into avro record */ public static GenericRecord bytesToAvro(byte[] bytes, Schema schema) throws IOException { - Decoder decoder = DecoderFactory.get().binaryDecoder(bytes, null); + BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(bytes, reuseDecoder.get()); + reuseDecoder.set(decoder); GenericDatumReader reader = new GenericDatumReader(schema); return reader.read(null, decoder); } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/util/TypedProperties.java b/hoodie-common/src/main/java/com/uber/hoodie/common/util/TypedProperties.java index 5674d2382..5acca156a 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/util/TypedProperties.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/util/TypedProperties.java @@ -19,7 +19,10 @@ package com.uber.hoodie.common.util; import java.io.Serializable; +import java.util.Arrays; +import java.util.List; import java.util.Properties; +import java.util.stream.Collectors; /** * Type-aware extension of {@link java.util.Properties} @@ -49,6 +52,13 @@ public class 
TypedProperties extends Properties implements Serializable { return containsKey(property) ? getProperty(property) : defaultValue; } + public List getStringList(String property, String delimiter, List defaultVal) { + if (!containsKey(property)) { + return defaultVal; + } + return Arrays.stream(getProperty(property).split(delimiter)).map(String::trim).collect(Collectors.toList()); + } + public int getInteger(String property) { checkKey(property); return Integer.valueOf(getProperty(property)); diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/util/collection/TestDiskBasedMap.java b/hoodie-common/src/test/java/com/uber/hoodie/common/util/collection/TestDiskBasedMap.java new file mode 100644 index 000000000..9a288af82 --- /dev/null +++ b/hoodie-common/src/test/java/com/uber/hoodie/common/util/collection/TestDiskBasedMap.java @@ -0,0 +1,209 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.uber.hoodie.common.util.collection; + +import static com.uber.hoodie.common.util.SchemaTestUtil.getSimpleSchema; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import com.uber.hoodie.common.model.AvroBinaryTestPayload; +import com.uber.hoodie.common.model.HoodieAvroPayload; +import com.uber.hoodie.common.model.HoodieKey; +import com.uber.hoodie.common.model.HoodieRecord; +import com.uber.hoodie.common.model.HoodieRecordPayload; +import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline; +import com.uber.hoodie.common.util.HoodieAvroUtils; +import com.uber.hoodie.common.util.HoodieRecordSizeEstimator; +import com.uber.hoodie.common.util.SchemaTestUtil; +import com.uber.hoodie.common.util.SpillableMapTestUtils; +import com.uber.hoodie.common.util.SpillableMapUtils; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.net.URISyntaxException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Optional; +import java.util.Set; +import java.util.UUID; +import java.util.stream.Collectors; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.junit.Ignore; +import org.junit.Test; + +public class TestDiskBasedMap { + + private static final String BASE_OUTPUT_PATH = "/tmp/"; + + @Test + public void testSimpleInsert() throws IOException, URISyntaxException { + Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); + String payloadClazz = HoodieAvroPayload.class.getName(); + + DiskBasedMap records = new DiskBasedMap<>(BASE_OUTPUT_PATH); + List iRecords = SchemaTestUtil.generateHoodieTestRecords(0, 100); + ((GenericRecord) iRecords.get(0)).get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(); + List recordKeys = SpillableMapTestUtils.upsertRecords(iRecords, records); + + // make sure records have spilled 
to disk + assertTrue(records.sizeOfFileOnDiskInBytes() > 0); + Iterator> itr = records.iterator(); + List oRecords = new ArrayList<>(); + while (itr.hasNext()) { + HoodieRecord rec = itr.next(); + oRecords.add(rec); + assert recordKeys.contains(rec.getRecordKey()); + } + } + + @Test + public void testSimpleInsertWithoutHoodieMetadata() throws IOException, URISyntaxException { + Schema schema = getSimpleSchema(); + String payloadClazz = HoodieAvroPayload.class.getName(); + + DiskBasedMap records = new DiskBasedMap<>(BASE_OUTPUT_PATH); + List hoodieRecords = SchemaTestUtil + .generateHoodieTestRecordsWithoutHoodieMetadata(0, 1000); + Set recordKeys = new HashSet<>(); + // insert generated records into the map + hoodieRecords.stream().forEach(r -> { + records.put(r.getRecordKey(), r); + recordKeys.add(r.getRecordKey()); + }); + // make sure records have spilled to disk + assertTrue(records.sizeOfFileOnDiskInBytes() > 0); + Iterator> itr = records.iterator(); + List oRecords = new ArrayList<>(); + while (itr.hasNext()) { + HoodieRecord rec = itr.next(); + oRecords.add(rec); + assert recordKeys.contains(rec.getRecordKey()); + } + } + + @Test + public void testSimpleUpsert() throws IOException, URISyntaxException { + + Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); + String payloadClazz = HoodieAvroPayload.class.getName(); + + DiskBasedMap records = new DiskBasedMap<>(BASE_OUTPUT_PATH); + List iRecords = SchemaTestUtil.generateHoodieTestRecords(0, 100); + + // perform some inserts + List recordKeys = SpillableMapTestUtils.upsertRecords(iRecords, records); + + long fileSize = records.sizeOfFileOnDiskInBytes(); + // make sure records have spilled to disk + assertTrue(fileSize > 0); + + // generate updates from inserts + List updatedRecords = + SchemaTestUtil + .updateHoodieTestRecords(recordKeys, SchemaTestUtil.generateHoodieTestRecords(0, 100), + HoodieActiveTimeline.createNewCommitTime()); + String newCommitTime = ((GenericRecord) 
updatedRecords.get(0)) + .get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(); + + // perform upserts + recordKeys = SpillableMapTestUtils.upsertRecords(updatedRecords, records); + + // upserts should be appended to the existing file, hence increasing the sizeOfFile on disk + assertTrue(records.sizeOfFileOnDiskInBytes() > fileSize); + + // Upserted records (on disk) should have the latest commit time + Iterator> itr = records.iterator(); + while (itr.hasNext()) { + HoodieRecord rec = itr.next(); + assert recordKeys.contains(rec.getRecordKey()); + try { + IndexedRecord indexedRecord = (IndexedRecord) rec.getData().getInsertValue(schema).get(); + String latestCommitTime = ((GenericRecord) indexedRecord) + .get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(); + assertEquals(latestCommitTime, newCommitTime); + } catch (IOException io) { + throw new UncheckedIOException(io); + } + } + } + + @Test + public void testSizeEstimator() throws IOException, URISyntaxException { + Schema schema = SchemaTestUtil.getSimpleSchema(); + + // Test sizeEstimator without hoodie metadata fields + List hoodieRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1, schema); + + long payloadSize = SpillableMapUtils.computePayloadSize(hoodieRecords.remove(0), + new HoodieRecordSizeEstimator(schema)); + assertTrue(payloadSize > 0); + + // Test sizeEstimator with hoodie metadata fields + schema = HoodieAvroUtils.addMetadataFields(schema); + hoodieRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1, schema); + payloadSize = SpillableMapUtils.computePayloadSize(hoodieRecords.remove(0), + new HoodieRecordSizeEstimator(schema)); + assertTrue(payloadSize > 0); + + // Following tests payloads without an Avro Schema in the Record + + // Test sizeEstimator without hoodie metadata fields and without schema object in the payload + schema = SchemaTestUtil.getSimpleSchema(); + List indexedRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1); + hoodieRecords = indexedRecords.stream() + 
.map(r -> new HoodieRecord(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"), + new AvroBinaryTestPayload(Optional.of((GenericRecord) r)))).collect(Collectors.toList()); + payloadSize = SpillableMapUtils.computePayloadSize(hoodieRecords.remove(0), + new HoodieRecordSizeEstimator(schema)); + assertTrue(payloadSize > 0); + + // Test sizeEstimator with hoodie metadata fields and without schema object in the payload + final Schema simpleSchemaWithMetadata = HoodieAvroUtils + .addMetadataFields(SchemaTestUtil.getSimpleSchema()); + indexedRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1); + hoodieRecords = indexedRecords.stream() + .map(r -> new HoodieRecord(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"), + new AvroBinaryTestPayload(Optional + .of(HoodieAvroUtils.rewriteRecord((GenericRecord) r, simpleSchemaWithMetadata))))) + .collect(Collectors.toList()); + payloadSize = SpillableMapUtils.computePayloadSize(hoodieRecords.remove(0), + new HoodieRecordSizeEstimator(schema)); + assertTrue(payloadSize > 0); + } + + /** + * @na: Leaving this test here for a quick performance test + */ + @Ignore + @Test + public void testSizeEstimatorPerformance() throws IOException, URISyntaxException { + // Test sizeEstimatorPerformance with simpleSchema + Schema schema = SchemaTestUtil.getSimpleSchema(); + List hoodieRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1, schema); + HoodieRecordSizeEstimator sizeEstimator = + new HoodieRecordSizeEstimator(schema); + HoodieRecord record = hoodieRecords.remove(0); + long startTime = System.currentTimeMillis(); + SpillableMapUtils.computePayloadSize(record, sizeEstimator); + long timeTaken = System.currentTimeMillis() - startTime; + System.out.println("Time taken :" + timeTaken); + assertTrue(timeTaken < 100); + } +} diff --git a/hoodie-hadoop-mr/pom.xml b/hoodie-hadoop-mr/pom.xml index fe1df4396..6ef41a32d 100644 --- a/hoodie-hadoop-mr/pom.xml +++ b/hoodie-hadoop-mr/pom.xml @@ -60,6 +60,22 @@ org.apache.hadoop 
hadoop-hdfs + + ${hive.groupid} + hive-jdbc + ${hive.version} + + + commons-logging + commons-logging + + + + + ${hive.groupid} + hive-exec + ${hive.version} + commons-logging commons-logging @@ -105,58 +121,4 @@ - - - hive12 - - - !hive11 - - - - - ${hive12.groupid} - hive-jdbc - ${hive12.version} - - - commons-logging - commons-logging - - - - - ${hive12.groupid} - hive-exec - ${hive12.version} - - - - - hive11 - - - hive11 - - - - - ${hive11.groupid} - hive-jdbc - ${hive11.version} - - - commons-logging - commons-logging - - - - - ${hive11.groupid} - hive-exec - ${hive11.version} - - - - diff --git a/hoodie-hive/pom.xml b/hoodie-hive/pom.xml index f74e5ea83..da4e9a0d7 100644 --- a/hoodie-hive/pom.xml +++ b/hoodie-hive/pom.xml @@ -99,6 +99,27 @@ junit junit + + ${hive.groupid} + hive-service + ${hive.version} + + + ${hive.groupid} + hive-jdbc + ${hive.version} + + + ${hive.groupid} + hive-metastore + ${hive.version} + + + ${hive.groupid} + hive-common + ${hive.version} + + org.apache.hadoop hadoop-hdfs @@ -175,67 +196,4 @@ - - - - hive12 - - - !hive11 - - - - - ${hive12.groupid} - hive-service - ${hive12.version} - - - ${hive12.groupid} - hive-jdbc - ${hive12.version} - - - ${hive12.groupid} - hive-metastore - ${hive12.version} - - - ${hive12.groupid} - hive-common - ${hive12.version} - - - - - hive11 - - - hive11 - - - - - org.apache.hive - hive-service - ${hive11.version} - - - org.apache.hive - hive-jdbc - ${hive11.version} - - - org.apache.hive - hive-metastore - ${hive11.version} - - - org.apache.hive - hive-common - ${hive11.version} - - - - diff --git a/hoodie-hive/src/main/java/com/uber/hoodie/hive/HoodieHiveClient.java b/hoodie-hive/src/main/java/com/uber/hoodie/hive/HoodieHiveClient.java index e3f9cb0fe..d93939198 100644 --- a/hoodie-hive/src/main/java/com/uber/hoodie/hive/HoodieHiveClient.java +++ b/hoodie-hive/src/main/java/com/uber/hoodie/hive/HoodieHiveClient.java @@ -90,7 +90,7 @@ public class HoodieHiveClient { private Connection connection; 
private HoodieTimeline activeTimeline; - HoodieHiveClient(HiveSyncConfig cfg, HiveConf configuration, FileSystem fs) { + public HoodieHiveClient(HiveSyncConfig cfg, HiveConf configuration, FileSystem fs) { this.syncConfig = cfg; this.fs = fs; this.metaClient = new HoodieTableMetaClient(fs.getConf(), cfg.basePath, true); @@ -231,7 +231,7 @@ public class HoodieHiveClient { /** * Scan table partitions */ - List scanTablePartitions() throws TException { + public List scanTablePartitions() throws TException { return client.listPartitions(syncConfig.databaseName, syncConfig.tableName, (short) -1); } @@ -268,7 +268,7 @@ public class HoodieHiveClient { /** * Get the table schema */ - Map getTableSchema() { + public Map getTableSchema() { if (!doesTableExist()) { throw new IllegalArgumentException( "Failed to get schema for table " + syncConfig.tableName + " does not exist"); @@ -435,7 +435,7 @@ public class HoodieHiveClient { /** * @return true if the configured table exists */ - boolean doesTableExist() { + public boolean doesTableExist() { try { return client.tableExists(syncConfig.databaseName, syncConfig.tableName); } catch (TException e) { @@ -449,7 +449,7 @@ public class HoodieHiveClient { * * @param s SQL to execute */ - void updateHiveSQL(String s) { + public void updateHiveSQL(String s) { Statement stmt = null; try { stmt = connection.createStatement(); @@ -468,8 +468,10 @@ public class HoodieHiveClient { BasicDataSource ds = new HiveDataSource(); ds.setDriverClassName(HiveDriver.class.getCanonicalName()); ds.setUrl(getHiveJdbcUrlWithDefaultDBName()); - ds.setUsername(syncConfig.hiveUser); - ds.setPassword(syncConfig.hivePass); + if (syncConfig.hiveUser != null) { + ds.setUsername(syncConfig.hiveUser); + ds.setPassword(syncConfig.hivePass); + } LOG.info("Getting Hive Connection from Datasource " + ds); try { this.connection = ds.getConnection(); @@ -520,7 +522,7 @@ public class HoodieHiveClient { return fs; } - Optional getLastCommitTimeSynced() { + public 
Optional getLastCommitTimeSynced() { // Get the last commit time from the TBLproperties try { Table database = client.getTable(syncConfig.databaseName, syncConfig.tableName); @@ -532,7 +534,7 @@ public class HoodieHiveClient { } } - void close() { + public void close() { try { if (connection != null) { connection.close(); @@ -548,7 +550,7 @@ public class HoodieHiveClient { @SuppressWarnings("OptionalUsedAsFieldOrParameterType") List getPartitionsWrittenToSince(Optional lastCommitTimeSynced) { if (!lastCommitTimeSynced.isPresent()) { - LOG.info("Last commit time synced is not known, listing all partitions"); + LOG.info("Last commit time synced is not known, listing all partitions in " + syncConfig.basePath + ",FS :" + fs); try { return FSUtils.getAllPartitionPaths(fs, syncConfig.basePath, syncConfig.assumeDatePartitioning); @@ -573,6 +575,10 @@ public class HoodieHiveClient { } } + List getAllTables(String db) throws Exception { + return client.getAllTables(db); + } + void updateLastCommitTimeSynced() { // Set the last commit time from the TBLproperties String lastCommitSynced = activeTimeline.lastInstant().get().getTimestamp(); diff --git a/hoodie-hive/src/main/java/com/uber/hoodie/hive/MultiPartKeysValueExtractor.java b/hoodie-hive/src/main/java/com/uber/hoodie/hive/MultiPartKeysValueExtractor.java index 00475e1e9..b0bcc59f4 100644 --- a/hoodie-hive/src/main/java/com/uber/hoodie/hive/MultiPartKeysValueExtractor.java +++ b/hoodie-hive/src/main/java/com/uber/hoodie/hive/MultiPartKeysValueExtractor.java @@ -16,8 +16,10 @@ package com.uber.hoodie.hive; +import com.google.common.base.Preconditions; import java.util.Arrays; import java.util.List; +import java.util.stream.Collectors; /** * Partition Key extractor treating each value delimited by slash as separate key. 
@@ -27,6 +29,14 @@ public class MultiPartKeysValueExtractor implements PartitionValueExtractor { @Override public List extractPartitionValuesInPath(String partitionPath) { String[] splits = partitionPath.split("/"); - return Arrays.asList(splits); + return Arrays.stream(splits).map(s -> { + if (s.contains("=")) { + String[] moreSplit = s.split("="); + Preconditions.checkArgument(moreSplit.length == 2, + "Partition Field (" + s + ") not in expected format"); + return moreSplit[1]; + } + return s; + }).collect(Collectors.toList()); } } \ No newline at end of file diff --git a/hoodie-hive/src/main/java/com/uber/hoodie/hive/PartitionValueExtractor.java b/hoodie-hive/src/main/java/com/uber/hoodie/hive/PartitionValueExtractor.java index 73776c461..b32f7cf0c 100644 --- a/hoodie-hive/src/main/java/com/uber/hoodie/hive/PartitionValueExtractor.java +++ b/hoodie-hive/src/main/java/com/uber/hoodie/hive/PartitionValueExtractor.java @@ -18,6 +18,7 @@ package com.uber.hoodie.hive; +import java.io.Serializable; import java.util.List; /** @@ -28,7 +29,7 @@ import java.util.List; * e.g. 
Hive table partitioned by datestr=yyyy-mm-dd and hdfs path * /app/hoodie/dataset1/YYYY=[yyyy]/MM=[mm]/DD=[dd] */ -public interface PartitionValueExtractor { +public interface PartitionValueExtractor extends Serializable { List extractPartitionValuesInPath(String partitionPath); } diff --git a/hoodie-hive/src/main/java/com/uber/hoodie/hive/SlashEncodedDayPartitionValueExtractor.java b/hoodie-hive/src/main/java/com/uber/hoodie/hive/SlashEncodedDayPartitionValueExtractor.java index 893b61e4c..771f2771f 100644 --- a/hoodie-hive/src/main/java/com/uber/hoodie/hive/SlashEncodedDayPartitionValueExtractor.java +++ b/hoodie-hive/src/main/java/com/uber/hoodie/hive/SlashEncodedDayPartitionValueExtractor.java @@ -33,12 +33,19 @@ import org.joda.time.format.DateTimeFormatter; */ public class SlashEncodedDayPartitionValueExtractor implements PartitionValueExtractor { - private final DateTimeFormatter dtfOut; + private transient DateTimeFormatter dtfOut; public SlashEncodedDayPartitionValueExtractor() { this.dtfOut = DateTimeFormat.forPattern("yyyy-MM-dd"); } + private DateTimeFormatter getDtfOut() { + if (dtfOut == null) { + dtfOut = DateTimeFormat.forPattern("yyyy-MM-dd"); + } + return dtfOut; + } + @Override public List extractPartitionValuesInPath(String partitionPath) { // partition path is expected to be in this format yyyy/mm/dd @@ -52,6 +59,6 @@ public class SlashEncodedDayPartitionValueExtractor implements PartitionValueExt int mm = Integer.parseInt(splits[1]); int dd = Integer.parseInt(splits[2]); DateTime dateTime = new DateTime(year, mm, dd, 0, 0); - return Lists.newArrayList(dtfOut.print(dateTime)); + return Lists.newArrayList(getDtfOut().print(dateTime)); } } diff --git a/hoodie-spark/pom.xml b/hoodie-spark/pom.xml index c4869f3b2..d800c1a72 100644 --- a/hoodie-spark/pom.xml +++ b/hoodie-spark/pom.xml @@ -221,6 +221,30 @@ commons-configuration2 + + ${hive.groupid} + hive-service + ${hive.version} + + + + ${hive.groupid} + hive-jdbc + ${hive.version} + + + + 
${hive.groupid} + hive-metastore + ${hive.version} + + + + ${hive.groupid} + hive-common + ${hive.version} + + com.uber.hoodie hoodie-client @@ -264,67 +288,4 @@ test - - - hive12 - - - !hive11 - - - - - ${hive12.groupid} - hive-service - ${hive12.version} - - - ${hive12.groupid} - hive-jdbc - ${hive12.version} - - - ${hive12.groupid} - hive-metastore - ${hive12.version} - - - ${hive12.groupid} - hive-common - ${hive12.version} - - - - - hive11 - - - hive11 - - - - - ${hive11.groupid} - hive-service - ${hive11.version} - - - ${hive11.groupid} - hive-jdbc - ${hive11.version} - - - ${hive11.groupid} - hive-metastore - ${hive11.version} - - - ${hive11.groupid} - hive-common - ${hive11.version} - - - - - diff --git a/hoodie-spark/src/main/java/com/uber/hoodie/DataSourceUtils.java b/hoodie-spark/src/main/java/com/uber/hoodie/DataSourceUtils.java index b02c36675..429c43953 100644 --- a/hoodie-spark/src/main/java/com/uber/hoodie/DataSourceUtils.java +++ b/hoodie-spark/src/main/java/com/uber/hoodie/DataSourceUtils.java @@ -29,8 +29,13 @@ import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.exception.DatasetNotFoundException; import com.uber.hoodie.exception.HoodieException; import com.uber.hoodie.exception.HoodieNotSupportedException; +import com.uber.hoodie.hive.HiveSyncConfig; +import com.uber.hoodie.hive.PartitionValueExtractor; +import com.uber.hoodie.hive.SlashEncodedDayPartitionValueExtractor; import com.uber.hoodie.index.HoodieIndex; import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.Map; import java.util.stream.Collectors; @@ -86,6 +91,17 @@ public class DataSourceUtils { } } + /** + * Create a partition value extractor class via reflection, passing in any configs needed + */ + public static PartitionValueExtractor createPartitionExtractor(String partitionExtractorClass) { + try { + return (PartitionValueExtractor) ReflectionUtils.loadClass(partitionExtractorClass); + } catch 
(Throwable e) { + throw new HoodieException("Could not load partition extractor class " + partitionExtractorClass, e); + } + } + /** * Create a payload class via reflection, passing in an ordering/precombine value. */ @@ -169,4 +185,28 @@ public class DataSourceUtils { .withProps(parameters).build(); return dropDuplicates(jssc, incomingHoodieRecords, writeConfig); } + + public static HiveSyncConfig buildHiveSyncConfig(TypedProperties props, String basePath) { + checkRequiredProperties(props, Arrays.asList(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY())); + HiveSyncConfig hiveSyncConfig = new HiveSyncConfig(); + hiveSyncConfig.basePath = basePath; + hiveSyncConfig.assumeDatePartitioning = + props.getBoolean(DataSourceWriteOptions.HIVE_ASSUME_DATE_PARTITION_OPT_KEY(), + Boolean.valueOf(DataSourceWriteOptions.DEFAULT_HIVE_ASSUME_DATE_PARTITION_OPT_VAL())); + hiveSyncConfig.databaseName = props.getString(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY(), + DataSourceWriteOptions.DEFAULT_HIVE_DATABASE_OPT_VAL()); + hiveSyncConfig.tableName = props.getString(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY()); + hiveSyncConfig.hiveUser = props.getString(DataSourceWriteOptions.HIVE_USER_OPT_KEY(), + DataSourceWriteOptions.DEFAULT_HIVE_USER_OPT_VAL()); + hiveSyncConfig.hivePass = props.getString(DataSourceWriteOptions.HIVE_PASS_OPT_KEY(), + DataSourceWriteOptions.DEFAULT_HIVE_PASS_OPT_VAL()); + hiveSyncConfig.jdbcUrl = props.getString(DataSourceWriteOptions.HIVE_URL_OPT_KEY(), + DataSourceWriteOptions.DEFAULT_HIVE_URL_OPT_VAL()); + hiveSyncConfig.partitionFields = + props.getStringList(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), ",", new ArrayList<>()); + hiveSyncConfig.partitionValueExtractorClass = + props.getString(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY(), + SlashEncodedDayPartitionValueExtractor.class.getName()); + return hiveSyncConfig; + } } diff --git a/hoodie-spark/src/main/scala/com/uber/hoodie/AvroConversionUtils.scala 
b/hoodie-spark/src/main/scala/com/uber/hoodie/AvroConversionUtils.scala index 75d13e7cf..df3f96438 100644 --- a/hoodie-spark/src/main/scala/com/uber/hoodie/AvroConversionUtils.scala +++ b/hoodie-spark/src/main/scala/com/uber/hoodie/AvroConversionUtils.scala @@ -22,13 +22,18 @@ import java.sql.{Date, Timestamp} import java.util import com.databricks.spark.avro.SchemaConverters -import org.apache.avro.generic.GenericData.Record -import org.apache.avro.generic.GenericRecord +import com.databricks.spark.avro.SchemaConverters.IncompatibleSchemaException +import org.apache.avro.Schema.Type._ +import org.apache.avro.generic.GenericData.{Fixed, Record} +import org.apache.avro.generic.{GenericData, GenericRecord} import org.apache.avro.{Schema, SchemaBuilder} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.encoders.RowEncoder +import org.apache.spark.sql.catalyst.expressions.GenericRow import org.apache.spark.sql.types._ -import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} + +import scala.collection.JavaConverters._ object AvroConversionUtils { @@ -46,6 +51,22 @@ object AvroConversionUtils { } } + def createDataFrame(rdd: RDD[GenericRecord], schemaStr: String, ss : SparkSession): Dataset[Row] = { + if (rdd.isEmpty()) { + ss.emptyDataFrame + } else { + ss.createDataFrame(rdd.mapPartitions { records => + if (records.isEmpty) Iterator.empty + else { + val schema = Schema.parse(schemaStr) + val dataType = convertAvroSchemaToStructType(schema) + val convertor = createConverterToRow(schema, dataType) + records.map { x => convertor(x).asInstanceOf[Row] } + } + }, convertAvroSchemaToStructType(Schema.parse(schemaStr))).asInstanceOf[Dataset[Row]] + } + } + def getNewRecordNamespace(elementDataType: DataType, currentRecordNamespace: String, elementName: String): String = { @@ -56,6 +77,185 @@ object AvroConversionUtils { } } + /** + * NOTE : This part of code is copied from 
com.databricks.spark.avro.SchemaConverters.scala (133:310) (spark-avro) + * + * Returns a converter function to convert row in avro format to GenericRow of catalyst. + * + * @param sourceAvroSchema Source schema before conversion inferred from avro file by passed in + * by user. + * @param targetSqlType Target catalyst sql type after the conversion. + * @return returns a converter function to convert row in avro format to GenericRow of catalyst. + */ + def createConverterToRow(sourceAvroSchema: Schema, + targetSqlType: DataType): AnyRef => AnyRef = { + + def createConverter(avroSchema: Schema, + sqlType: DataType, path: List[String]): AnyRef => AnyRef = { + val avroType = avroSchema.getType + (sqlType, avroType) match { + // Avro strings are in Utf8, so we have to call toString on them + case (StringType, STRING) | (StringType, ENUM) => + (item: AnyRef) => if (item == null) null else item.toString + // Byte arrays are reused by avro, so we have to make a copy of them. + case (IntegerType, INT) | (BooleanType, BOOLEAN) | (DoubleType, DOUBLE) | + (FloatType, FLOAT) | (LongType, LONG) => + identity + case (BinaryType, FIXED) => + (item: AnyRef) => + if (item == null) { + null + } else { + item.asInstanceOf[Fixed].bytes().clone() + } + case (BinaryType, BYTES) => + (item: AnyRef) => + if (item == null) { + null + } else { + val byteBuffer = item.asInstanceOf[ByteBuffer] + val bytes = new Array[Byte](byteBuffer.remaining) + byteBuffer.get(bytes) + bytes + } + + case (struct: StructType, RECORD) => + val length = struct.fields.length + val converters = new Array[AnyRef => AnyRef](length) + val avroFieldIndexes = new Array[Int](length) + var i = 0 + while (i < length) { + val sqlField = struct.fields(i) + val avroField = avroSchema.getField(sqlField.name) + if (avroField != null) { + val converter = createConverter(avroField.schema(), sqlField.dataType, + path :+ sqlField.name) + converters(i) = converter + avroFieldIndexes(i) = avroField.pos() + } else if 
(!sqlField.nullable) { + throw new IncompatibleSchemaException( + s"Cannot find non-nullable field ${sqlField.name} at path ${path.mkString(".")} " + + "in Avro schema\n" + + s"Source Avro schema: $sourceAvroSchema.\n" + + s"Target Catalyst type: $targetSqlType") + } + i += 1 + } + + (item: AnyRef) => { + if (item == null) { + null + } else { + val record = item.asInstanceOf[GenericRecord] + + val result = new Array[Any](length) + var i = 0 + while (i < converters.length) { + if (converters(i) != null) { + val converter = converters(i) + result(i) = converter(record.get(avroFieldIndexes(i))) + } + i += 1 + } + new GenericRow(result) + } + } + case (arrayType: ArrayType, ARRAY) => + val elementConverter = createConverter(avroSchema.getElementType, arrayType.elementType, + path) + val allowsNull = arrayType.containsNull + (item: AnyRef) => { + if (item == null) { + null + } else { + item.asInstanceOf[java.lang.Iterable[AnyRef]].asScala.map { element => + if (element == null && !allowsNull) { + throw new RuntimeException(s"Array value at path ${path.mkString(".")} is not " + + "allowed to be null") + } else { + elementConverter(element) + } + } + } + } + case (mapType: MapType, MAP) if mapType.keyType == StringType => + val valueConverter = createConverter(avroSchema.getValueType, mapType.valueType, path) + val allowsNull = mapType.valueContainsNull + (item: AnyRef) => { + if (item == null) { + null + } else { + item.asInstanceOf[java.util.Map[AnyRef, AnyRef]].asScala.map { x => + if (x._2 == null && !allowsNull) { + throw new RuntimeException(s"Map value at path ${path.mkString(".")} is not " + + "allowed to be null") + } else { + (x._1.toString, valueConverter(x._2)) + } + }.toMap + } + } + case (sqlType, UNION) => + if (avroSchema.getTypes.asScala.exists(_.getType == NULL)) { + val remainingUnionTypes = avroSchema.getTypes.asScala.filterNot(_.getType == NULL) + if (remainingUnionTypes.size == 1) { + createConverter(remainingUnionTypes.head, sqlType, path) + } else 
{ + createConverter(Schema.createUnion(remainingUnionTypes.asJava), sqlType, path) + } + } else avroSchema.getTypes.asScala.map(_.getType) match { + case Seq(t1) => createConverter(avroSchema.getTypes.get(0), sqlType, path) + case Seq(a, b) if Set(a, b) == Set(INT, LONG) && sqlType == LongType => + (item: AnyRef) => { + item match { + case null => null + case l: java.lang.Long => l + case i: java.lang.Integer => new java.lang.Long(i.longValue()) + } + } + case Seq(a, b) if Set(a, b) == Set(FLOAT, DOUBLE) && sqlType == DoubleType => + (item: AnyRef) => { + item match { + case null => null + case d: java.lang.Double => d + case f: java.lang.Float => new java.lang.Double(f.doubleValue()) + } + } + case other => + sqlType match { + case t: StructType if t.fields.length == avroSchema.getTypes.size => + val fieldConverters = t.fields.zip(avroSchema.getTypes.asScala).map { + case (field, schema) => + createConverter(schema, field.dataType, path :+ field.name) + } + + (item: AnyRef) => if (item == null) { + null + } else { + val i = GenericData.get().resolveUnion(avroSchema, item) + val converted = new Array[Any](fieldConverters.length) + converted(i) = fieldConverters(i)(item) + new GenericRow(converted) + } + case _ => throw new IncompatibleSchemaException( + s"Cannot convert Avro schema to catalyst type because schema at path " + + s"${path.mkString(".")} is not compatible " + + s"(avroType = $other, sqlType = $sqlType). \n" + + s"Source Avro schema: $sourceAvroSchema.\n" + + s"Target Catalyst type: $targetSqlType") + } + } + case (left, right) => + throw new IncompatibleSchemaException( + s"Cannot convert Avro schema to catalyst type because schema at path " + + s"${path.mkString(".")} is not compatible (avroType = $left, sqlType = $right). 
\n" + + s"Source Avro schema: $sourceAvroSchema.\n" + + s"Target Catalyst type: $targetSqlType") + } + } + createConverter(sourceAvroSchema, targetSqlType, List.empty[String]) + } + def createConverterToAvro(dataType: DataType, structName: String, recordNamespace: String): Any => Any = { diff --git a/hoodie-spark/src/main/scala/com/uber/hoodie/DataSourceOptions.scala b/hoodie-spark/src/main/scala/com/uber/hoodie/DataSourceOptions.scala index 5df7118bd..9973e4bce 100644 --- a/hoodie-spark/src/main/scala/com/uber/hoodie/DataSourceOptions.scala +++ b/hoodie-spark/src/main/scala/com/uber/hoodie/DataSourceOptions.scala @@ -43,7 +43,7 @@ object DataSourceReadOptions { val VIEW_TYPE_INCREMENTAL_OPT_VAL = "incremental" val VIEW_TYPE_REALTIME_OPT_VAL = "realtime" val DEFAULT_VIEW_TYPE_OPT_VAL = VIEW_TYPE_READ_OPTIMIZED_OPT_VAL - + val DEFAULTPUSH_DOWN_FILTERS_OPT_VAL = "" /** * Instant time to start incrementally pulling data from. The instanttime here need not @@ -64,6 +64,13 @@ object DataSourceReadOptions { * */ val END_INSTANTTIME_OPT_KEY = "hoodie.datasource.read.end.instanttime" + + /** + * For use-cases like DeltaStreamer which reads from Hoodie Incremental table and applies opaque map functions, + * filters appearing late in the sequence of transformations cannot be automatically pushed down. 
+ * This option allows setting filters directly on Hoodie Source + */ + val PUSH_DOWN_INCR_FILTERS_OPT_KEY = "hoodie.datasource.read.incr.filters" } /** diff --git a/hoodie-spark/src/main/scala/com/uber/hoodie/IncrementalRelation.scala b/hoodie-spark/src/main/scala/com/uber/hoodie/IncrementalRelation.scala index 1f7b47682..0f13c9348 100644 --- a/hoodie-spark/src/main/scala/com/uber/hoodie/IncrementalRelation.scala +++ b/hoodie-spark/src/main/scala/com/uber/hoodie/IncrementalRelation.scala @@ -64,21 +64,33 @@ class IncrementalRelation(val sqlContext: SQLContext, throw new HoodieException(s"Specify the begin instant time to pull from using " + s"option ${DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY}") } + + val lastInstant = commitTimeline.lastInstant().get() + val commitsToReturn = commitTimeline.findInstantsInRange( optParams(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY), - optParams.getOrElse(DataSourceReadOptions.END_INSTANTTIME_OPT_KEY, - commitTimeline.lastInstant().get().getTimestamp)) + optParams.getOrElse(DataSourceReadOptions.END_INSTANTTIME_OPT_KEY, lastInstant.getTimestamp)) .getInstants.iterator().toList // use schema from a file produced in the latest instant val latestSchema = { + // use last instant if instant range is empty + val instant = commitsToReturn.lastOption.getOrElse(lastInstant) val latestMeta = HoodieCommitMetadata - .fromBytes(commitTimeline.getInstantDetails(commitsToReturn.last).get, classOf[HoodieCommitMetadata]) + .fromBytes(commitTimeline.getInstantDetails(instant).get, classOf[HoodieCommitMetadata]) val metaFilePath = latestMeta.getFileIdAndFullPaths(basePath).values().iterator().next() AvroConversionUtils.convertAvroSchemaToStructType(ParquetUtils.readAvroSchema( sqlContext.sparkContext.hadoopConfiguration, new Path(metaFilePath))) } + val filters = { + if (optParams.contains(DataSourceReadOptions.PUSH_DOWN_INCR_FILTERS_OPT_KEY)) { + val filterStr = 
optParams.get(DataSourceReadOptions.PUSH_DOWN_INCR_FILTERS_OPT_KEY).getOrElse("") + filterStr.split(",").filter(!_.isEmpty) + } + Array[String]() + } + override def schema: StructType = latestSchema override def buildScan(): RDD[Row] = { @@ -92,12 +104,17 @@ class IncrementalRelation(val sqlContext: SQLContext, // will filter out all the files incorrectly. sqlContext.sparkContext.hadoopConfiguration.unset("mapreduce.input.pathFilter.class") val sOpts = optParams.filter(p => !p._1.equalsIgnoreCase("path")) - sqlContext.read.options(sOpts) - .schema(latestSchema) // avoid AnalysisException for empty input - .parquet(fileIdToFullPath.values.toList: _*) - .filter(String.format("%s >= '%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitsToReturn.head.getTimestamp)) - .filter(String.format("%s <= '%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitsToReturn.last.getTimestamp)) - .toDF().rdd - + if (fileIdToFullPath.isEmpty) { + sqlContext.sparkContext.emptyRDD[Row] + } else { + log.info("Additional Filters to be applied to incremental source are :" + filters) + filters.foldLeft(sqlContext.read.options(sOpts) + .schema(latestSchema) + .parquet(fileIdToFullPath.values.toList: _*) + .filter(String.format("%s >= '%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitsToReturn.head.getTimestamp)) + .filter(String.format("%s <= '%s'", + HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitsToReturn.last.getTimestamp)))((e, f) => e.filter(f)) + .toDF().rdd + } } } diff --git a/hoodie-spark/src/test/scala/DataSourceTest.scala b/hoodie-spark/src/test/scala/DataSourceTest.scala index 2f34beb4e..42def5bf1 100644 --- a/hoodie-spark/src/test/scala/DataSourceTest.scala +++ b/hoodie-spark/src/test/scala/DataSourceTest.scala @@ -100,7 +100,6 @@ class DataSourceTest extends AssertionsForJUnit { .load(basePath + "/*/*/*/*"); assertEquals(100, hoodieROViewDF2.count()) // still 100, since we only updated - // Read Incremental View // we have 2 commits, try pulling the first commit (which is 
not the latest) val firstCommit = HoodieDataSourceHelpers.listCommitsSince(fs, basePath, "000").get(0); diff --git a/hoodie-utilities/pom.xml b/hoodie-utilities/pom.xml index 8a9feac7f..3cbf468d7 100644 --- a/hoodie-utilities/pom.xml +++ b/hoodie-utilities/pom.xml @@ -53,9 +53,10 @@ ${project.build.directory}/dependency-reduced-pom.xml - true + commons-dbcp:commons-dbcp + commons-pool:commons-pool com.uber.hoodie:hoodie-common com.uber.hoodie:hoodie-client com.uber.hoodie:hoodie-spark @@ -76,8 +77,50 @@ com.yammer.metrics:metrics-core com.101tec:zkclient org.apache.kafka:kafka-clients + org.apache.hive:hive-common + org.apache.hive:hive-service + org.apache.hive:hive-metastore + org.apache.hive:hive-jdbc + + + org.apache.commons.dbcp. + com.uber.hoodie.org.apache.commons.dbcp. + + + org.apache.commons.pool. + com.uber.hoodie.org.apache.commons.pool. + + + org.apache.hive.jdbc. + com.uber.hoodie.org.apache.hive.jdbc. + + + org.apache.hadoop.hive.metastore. + com.uber.hoodie.org.apache.hadoop_hive.metastore. + + + org.apache.hive.common. + com.uber.hoodie.org.apache.hive.common. + + + org.apache.hadoop.hive.common. + com.uber.hoodie.org.apache.hadoop_hive.common. + + + org.apache.hadoop.hive.conf. + com.uber.hoodie.org.apache.hadoop_hive.conf. + + + org.apache.hive.service. + com.uber.hoodie.org.apache.hive.service. + + + org.apache.hadoop.hive.service. + com.uber.hoodie.org.apache.hadoop_hive.service. 
+ + @@ -123,6 +166,15 @@ test + + com.uber.hoodie + hoodie-hive + ${project.version} + tests + test-jar + test + + com.uber.hoodie hoodie-spark @@ -154,6 +206,30 @@ + + ${hive.groupid} + hive-exec + ${hive.version} + test + + + + ${hive.groupid} + hive-jdbc + ${hive.version} + standalone + + + org.slf4j + slf4j-api + + + javax.servlet + servlet-api + + + + com.uber.hoodie hoodie-hive @@ -185,6 +261,11 @@ commons-dbcp commons-dbcp + + commons-pool + commons-pool + + org.apache.httpcomponents httpcore @@ -303,59 +384,4 @@ - - - - hive12 - - - !hive11 - - - - - ${hive12.groupid} - hive-jdbc - ${hive12.version} - standalone - - - org.slf4j - slf4j-api - - - javax.servlet - servlet-api - - - - - - - hive11 - - - hive11 - - - - - org.apache.hive - hive-jdbc - ${hive11.version} - standalone - - - org.slf4j - slf4j-api - - - javax.servlet - servlet-api - - - - - - diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HoodieCleaner.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HoodieCleaner.java new file mode 100644 index 000000000..7ebca042c --- /dev/null +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HoodieCleaner.java @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2017 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.uber.hoodie.utilities; + +import com.beust.jcommander.JCommander; +import com.beust.jcommander.Parameter; +import com.uber.hoodie.HoodieWriteClient; +import com.uber.hoodie.common.util.FSUtils; +import com.uber.hoodie.common.util.TypedProperties; +import com.uber.hoodie.config.HoodieWriteConfig; +import com.uber.hoodie.utilities.deltastreamer.HoodieDeltaStreamer; +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.api.java.JavaSparkContext; + +public class HoodieCleaner { + + private static volatile Logger log = LogManager.getLogger(HoodieDeltaStreamer.class); + + /** + * Config for Cleaner + */ + private final Config cfg; + + /** + * Filesystem used + */ + private transient FileSystem fs; + + /** + * Spark context + */ + private transient JavaSparkContext jssc; + + /** + * Bag of properties with source, hoodie client, key generator etc. 
+ */ + TypedProperties props; + + public HoodieCleaner(Config cfg, JavaSparkContext jssc) throws IOException { + this.cfg = cfg; + this.jssc = jssc; + this.fs = FSUtils.getFs(cfg.basePath, jssc.hadoopConfiguration()); + + this.props = UtilHelpers.readConfig(fs, new Path(cfg.propsFilePath), cfg.configs).getConfig(); + log.info("Creating Cleaner with configs : " + props.toString()); + } + + public void run() throws Exception { + HoodieWriteConfig hoodieCfg = getHoodieClientConfig(); + HoodieWriteClient client = new HoodieWriteClient<>(jssc, hoodieCfg, false); + client.clean(); + } + + private HoodieWriteConfig getHoodieClientConfig() throws Exception { + return HoodieWriteConfig.newBuilder().combineInput(true, true).withPath(cfg.basePath) + .withAutoCommit(false) + .withProps(props).build(); + } + + public static class Config implements Serializable { + + @Parameter(names = {"--target-base-path"}, description = "base path for the hoodie dataset to be cleaner.", + required = true) + public String basePath; + + @Parameter(names = {"--props"}, description = "path to properties file on localfs or dfs, with configurations for " + + "hoodie client for cleaning") + public String propsFilePath = + "file://" + System.getProperty("user.dir") + "/src/test/resources/delta-streamer-config/dfs-source.properties"; + + @Parameter(names = {"--hoodie-conf"}, description = "Any configuration that can be set in the properties file " + + "(using the CLI parameter \"--propsFilePath\") can also be passed command line using this parameter") + public List configs = new ArrayList<>(); + + @Parameter(names = {"--spark-master"}, description = "spark master to use.") + public String sparkMaster = "local[2]"; + + @Parameter(names = {"--help", "-h"}, help = true) + public Boolean help = false; + } + + public static void main(String[] args) throws Exception { + final Config cfg = new Config(); + JCommander cmd = new JCommander(cfg, args); + if (cfg.help || args.length == 0) { + cmd.usage(); + 
System.exit(1); + } + + String dirName = new Path(cfg.basePath).getName(); + JavaSparkContext jssc = UtilHelpers.buildSparkContext("hoodie-cleaner-" + dirName, cfg.sparkMaster); + new HoodieCleaner(cfg, jssc).run(); + } +} diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/UtilHelpers.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/UtilHelpers.java index b5008cf31..b24e3a277 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/UtilHelpers.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/UtilHelpers.java @@ -30,9 +30,13 @@ import com.uber.hoodie.exception.HoodieException; import com.uber.hoodie.index.HoodieIndex; import com.uber.hoodie.utilities.schema.SchemaProvider; import com.uber.hoodie.utilities.sources.Source; +import com.uber.hoodie.utilities.transform.Transformer; +import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; +import java.io.StringReader; import java.nio.ByteBuffer; +import java.util.List; import java.util.Optional; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; @@ -43,6 +47,7 @@ import org.apache.spark.Accumulator; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; /** * Bunch of helper methods @@ -51,12 +56,12 @@ public class UtilHelpers { private static Logger logger = LogManager.getLogger(UtilHelpers.class); public static Source createSource(String sourceClass, TypedProperties cfg, - JavaSparkContext jssc, SchemaProvider schemaProvider) + JavaSparkContext jssc, SparkSession sparkSession, SchemaProvider schemaProvider) throws IOException { try { return (Source) ReflectionUtils.loadClass(sourceClass, - new Class[]{TypedProperties.class, JavaSparkContext.class, SchemaProvider.class}, - cfg, jssc, schemaProvider); + new Class[]{TypedProperties.class, JavaSparkContext.class, 
SparkSession.class, SchemaProvider.class}, + cfg, jssc, sparkSession, schemaProvider); } catch (Throwable e) { throw new IOException("Could not load source class " + sourceClass, e); } @@ -65,17 +70,31 @@ public class UtilHelpers { public static SchemaProvider createSchemaProvider(String schemaProviderClass, TypedProperties cfg, JavaSparkContext jssc) throws IOException { try { - return (SchemaProvider) ReflectionUtils.loadClass(schemaProviderClass, cfg, jssc); + return schemaProviderClass == null ? null : + (SchemaProvider) ReflectionUtils.loadClass(schemaProviderClass, cfg, jssc); } catch (Throwable e) { throw new IOException("Could not load schema provider class " + schemaProviderClass, e); } } + public static Transformer createTransformer(String transformerClass) throws IOException { + try { + return transformerClass == null ? null : (Transformer) ReflectionUtils.loadClass(transformerClass); + } catch (Throwable e) { + throw new IOException("Could not load transformer class " + transformerClass, e); + } + } + /** */ - public static DFSPropertiesConfiguration readConfig(FileSystem fs, Path cfgPath) { + public static DFSPropertiesConfiguration readConfig(FileSystem fs, Path cfgPath, List overriddenProps) { try { - return new DFSPropertiesConfiguration(fs, cfgPath); + DFSPropertiesConfiguration conf = new DFSPropertiesConfiguration(fs, cfgPath); + if (!overriddenProps.isEmpty()) { + logger.info("Adding overridden properties to file properties."); + conf.addProperties(new BufferedReader(new StringReader(String.join("\n", overriddenProps)))); + } + return conf; } catch (Exception e) { throw new HoodieException("Unable to read props file at :" + cfgPath, e); } @@ -109,7 +128,7 @@ public class UtilHelpers { sparkConf.set("spark.eventLog.overwrite", "true"); sparkConf.set("spark.eventLog.enabled", "true"); } - sparkConf.set("spark.driver.maxResultSize", "2g"); + sparkConf.setIfMissing("spark.driver.maxResultSize", "2g"); sparkConf.set("spark.serializer", 
"org.apache.spark.serializer.KryoSerializer"); sparkConf.set("spark.hadoop.mapred.output.compress", "true"); sparkConf.set("spark.hadoop.mapred.output.compression.codec", "true"); diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/deltastreamer/HoodieDeltaStreamer.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/deltastreamer/HoodieDeltaStreamer.java index c3573fdeb..665fcff98 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/deltastreamer/HoodieDeltaStreamer.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/deltastreamer/HoodieDeltaStreamer.java @@ -18,10 +18,15 @@ package com.uber.hoodie.utilities.deltastreamer; +import static com.uber.hoodie.utilities.schema.RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE; +import static com.uber.hoodie.utilities.schema.RowBasedSchemaProvider.HOODIE_RECORD_STRUCT_NAME; + import com.beust.jcommander.IStringConverter; import com.beust.jcommander.JCommander; import com.beust.jcommander.Parameter; import com.beust.jcommander.ParameterException; +import com.codahale.metrics.Timer; +import com.uber.hoodie.AvroConversionUtils; import com.uber.hoodie.DataSourceUtils; import com.uber.hoodie.HoodieWriteClient; import com.uber.hoodie.KeyGenerator; @@ -36,32 +41,40 @@ import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.common.util.TypedProperties; -import com.uber.hoodie.common.util.collection.Pair; import com.uber.hoodie.config.HoodieCompactionConfig; import com.uber.hoodie.config.HoodieIndexConfig; import com.uber.hoodie.config.HoodieWriteConfig; +import com.uber.hoodie.hive.HiveSyncConfig; +import com.uber.hoodie.hive.HiveSyncTool; import com.uber.hoodie.index.HoodieIndex; import com.uber.hoodie.utilities.HiveIncrementalPuller; import com.uber.hoodie.utilities.UtilHelpers; import 
com.uber.hoodie.utilities.exception.HoodieDeltaStreamerException; -import com.uber.hoodie.utilities.schema.FilebasedSchemaProvider; +import com.uber.hoodie.utilities.schema.RowBasedSchemaProvider; import com.uber.hoodie.utilities.schema.SchemaProvider; +import com.uber.hoodie.utilities.sources.InputBatch; import com.uber.hoodie.utilities.sources.JsonDFSSource; -import com.uber.hoodie.utilities.sources.Source; +import com.uber.hoodie.utilities.transform.Transformer; import java.io.IOException; import java.io.Serializable; +import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Optional; import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.conf.HiveConf; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; import scala.collection.JavaConversions; /** @@ -81,7 +94,7 @@ public class HoodieDeltaStreamer implements Serializable { /** * Source to pull deltas from */ - private transient Source source; + private transient SourceFormatAdapter formatAdapter; /** * Schema provider that supplies the command for reading the input and writing out the target @@ -89,6 +102,11 @@ public class HoodieDeltaStreamer implements Serializable { */ private transient SchemaProvider schemaProvider; + /** + * Allows transforming source to target dataset before writing + */ + private transient Transformer transformer; + /** * Extract the key for the target dataset */ @@ -109,16 +127,30 @@ public class HoodieDeltaStreamer implements Serializable { */ private transient JavaSparkContext jssc; + /** + * Spark Session + */ + private transient 
SparkSession sparkSession; + + /** + * Hive Config + */ + private transient HiveConf hiveConf; /** * Bag of properties with source, hoodie client, key generator etc. */ TypedProperties props; - public HoodieDeltaStreamer(Config cfg, JavaSparkContext jssc) throws IOException { + this(cfg, jssc, FSUtils.getFs(cfg.targetBasePath, jssc.hadoopConfiguration()), + getDefaultHiveConf(jssc.hadoopConfiguration())); + } + + public HoodieDeltaStreamer(Config cfg, JavaSparkContext jssc, FileSystem fs, HiveConf hiveConf) throws IOException { this.cfg = cfg; this.jssc = jssc; + this.sparkSession = SparkSession.builder().config(jssc.getConf()).getOrCreate(); this.fs = FSUtils.getFs(cfg.targetBasePath, jssc.hadoopConfiguration()); if (fs.exists(new Path(cfg.targetBasePath))) { @@ -129,19 +161,28 @@ public class HoodieDeltaStreamer implements Serializable { this.commitTimelineOpt = Optional.empty(); } - this.props = UtilHelpers.readConfig(fs, new Path(cfg.propsFilePath)).getConfig(); + this.props = UtilHelpers.readConfig(fs, new Path(cfg.propsFilePath), cfg.configs).getConfig(); log.info("Creating delta streamer with configs : " + props.toString()); this.schemaProvider = UtilHelpers.createSchemaProvider(cfg.schemaProviderClassName, props, jssc); + this.transformer = UtilHelpers.createTransformer(cfg.transformerClassName); this.keyGenerator = DataSourceUtils.createKeyGenerator(cfg.keyGeneratorClass, props); - this.source = UtilHelpers.createSource(cfg.sourceClassName, props, jssc, schemaProvider); - // register the schemas, so that shuffle does not serialize the full schemas - List schemas = Arrays.asList(schemaProvider.getSourceSchema(), - schemaProvider.getTargetSchema()); - jssc.sc().getConf().registerAvroSchemas(JavaConversions.asScalaBuffer(schemas).toList()); + this.formatAdapter = + new SourceFormatAdapter(UtilHelpers.createSource(cfg.sourceClassName, props, jssc, sparkSession, + schemaProvider)); + + this.hiveConf = hiveConf; + } + + private static HiveConf 
getDefaultHiveConf(Configuration cfg) { + HiveConf hiveConf = new HiveConf(); + hiveConf.addResource(cfg); + return hiveConf; } public void sync() throws Exception { + HoodieDeltaStreamerMetrics metrics = new HoodieDeltaStreamerMetrics(getHoodieClientConfig(null)); + Timer.Context overallTimerContext = metrics.getOverallTimerContext(); // Retrieve the previous round checkpoints, if any Optional resumeCheckpointStr = Optional.empty(); if (commitTimelineOpt.isPresent()) { @@ -163,16 +204,42 @@ public class HoodieDeltaStreamer implements Serializable { } log.info("Checkpoint to resume from : " + resumeCheckpointStr); - // Pull the data from the source & prepare the write - Pair>, String> dataAndCheckpoint = source.fetchNewData( - resumeCheckpointStr, cfg.sourceLimit); + final Optional> avroRDDOptional; + final String checkpointStr; + final SchemaProvider schemaProvider; + if (transformer != null) { + // Transformation is needed. Fetch New rows in Row Format, apply transformation and then convert them + // to generic records for writing + InputBatch> dataAndCheckpoint = formatAdapter.fetchNewDataInRowFormat( + resumeCheckpointStr, cfg.sourceLimit); - if (!dataAndCheckpoint.getKey().isPresent()) { + Optional> transformed = + dataAndCheckpoint.getBatch().map(data -> transformer.apply(jssc, sparkSession, data, props)); + checkpointStr = dataAndCheckpoint.getCheckpointForNextBatch(); + avroRDDOptional = transformed.map(t -> + AvroConversionUtils.createRdd(t, HOODIE_RECORD_STRUCT_NAME, HOODIE_RECORD_NAMESPACE).toJavaRDD() + ); + // Use Transformed Row's schema if not overridden + schemaProvider = + this.schemaProvider == null ? 
transformed.map(r -> (SchemaProvider)new RowBasedSchemaProvider(r.schema())) + .orElse(dataAndCheckpoint.getSchemaProvider()) : this.schemaProvider; + } else { + // Pull the data from the source & prepare the write + InputBatch> dataAndCheckpoint = + formatAdapter.fetchNewDataInAvroFormat(resumeCheckpointStr, cfg.sourceLimit); + avroRDDOptional = dataAndCheckpoint.getBatch(); + checkpointStr = dataAndCheckpoint.getCheckpointForNextBatch(); + schemaProvider = dataAndCheckpoint.getSchemaProvider(); + } + + if ((!avroRDDOptional.isPresent()) || (avroRDDOptional.get().isEmpty())) { log.info("No new data, nothing to commit.. "); return; } - JavaRDD avroRDD = dataAndCheckpoint.getKey().get(); + registerAvroSchemas(schemaProvider); + + JavaRDD avroRDD = avroRDDOptional.get(); JavaRDD records = avroRDD.map(gr -> { HoodieRecordPayload payload = DataSourceUtils.createPayload(cfg.payloadClassName, gr, (Comparable) gr.get(cfg.sourceOrderingField)); @@ -180,20 +247,20 @@ public class HoodieDeltaStreamer implements Serializable { }); // filter dupes if needed - HoodieWriteConfig hoodieCfg = getHoodieClientConfig(); + HoodieWriteConfig hoodieCfg = getHoodieClientConfig(schemaProvider); if (cfg.filterDupes) { // turn upserts to insert cfg.operation = cfg.operation == Operation.UPSERT ? Operation.INSERT : cfg.operation; records = DataSourceUtils.dropDuplicates(jssc, records, hoodieCfg); - } - if (records.isEmpty()) { - log.info("No new data, nothing to commit.. "); - return; + if (records.isEmpty()) { + log.info("No new data, nothing to commit.. "); + return; + } } // Perform the write - HoodieWriteClient client = new HoodieWriteClient<>(jssc, hoodieCfg); + HoodieWriteClient client = new HoodieWriteClient<>(jssc, hoodieCfg, true); String commitTime = client.startCommit(); log.info("Starting commit : " + commitTime); @@ -210,7 +277,7 @@ public class HoodieDeltaStreamer implements Serializable { // Simply commit for now. 
TODO(vc): Support better error handlers later on HashMap checkpointCommitMetadata = new HashMap<>(); - checkpointCommitMetadata.put(CHECKPOINT_KEY, dataAndCheckpoint.getValue()); + checkpointCommitMetadata.put(CHECKPOINT_KEY, checkpointStr); boolean success = client.commit(commitTime, writeStatusRDD, Optional.of(checkpointCommitMetadata)); @@ -220,17 +287,54 @@ public class HoodieDeltaStreamer implements Serializable { } else { log.info("Commit " + commitTime + " failed!"); } + + // Sync to hive if enabled + Timer.Context hiveSyncContext = metrics.getHiveSyncTimerContext(); + syncHive(); + long hiveSyncTimeMs = hiveSyncContext != null ? hiveSyncContext.stop() : 0; + client.close(); + long overallTimeMs = overallTimerContext != null ? overallTimerContext.stop() : 0; + + // Send DeltaStreamer Metrics + metrics.updateDeltaStreamerMetrics(overallTimeMs, hiveSyncTimeMs); } - private HoodieWriteConfig getHoodieClientConfig() throws Exception { - return HoodieWriteConfig.newBuilder().combineInput(true, true).withPath(cfg.targetBasePath) - .withAutoCommit(false) - .withSchema(schemaProvider.getTargetSchema().toString()) - .withCompactionConfig(HoodieCompactionConfig.newBuilder().withPayloadClass(cfg.payloadClassName).build()) - .forTable(cfg.targetTableName) - .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) - .withProps(props).build(); + public void syncHive() { + if (cfg.enableHiveSync) { + HiveSyncConfig hiveSyncConfig = DataSourceUtils.buildHiveSyncConfig(props, cfg.targetBasePath); + log.info("Syncing target hoodie table with hive table(" + hiveSyncConfig.tableName + + "). 
Hive metastore URL :" + hiveSyncConfig.jdbcUrl + ", basePath :" + cfg.targetBasePath); + + new HiveSyncTool(hiveSyncConfig, hiveConf, fs).syncHoodieTable(); + } + } + + /** + * Register Avro Schemas + * @param schemaProvider Schema Provider + */ + private void registerAvroSchemas(SchemaProvider schemaProvider) { + // register the schemas, so that shuffle does not serialize the full schemas + if (null != schemaProvider) { + List schemas = Arrays.asList(schemaProvider.getSourceSchema(), schemaProvider.getTargetSchema()); + log.info("Registering Schema :" + schemas); + jssc.sc().getConf().registerAvroSchemas(JavaConversions.asScalaBuffer(schemas).toList()); + } + } + + private HoodieWriteConfig getHoodieClientConfig(SchemaProvider schemaProvider) throws Exception { + HoodieWriteConfig.Builder builder = + HoodieWriteConfig.newBuilder().combineInput(true, true).withPath(cfg.targetBasePath) + .withAutoCommit(false) + .withCompactionConfig(HoodieCompactionConfig.newBuilder().withPayloadClass(cfg.payloadClassName).build()) + .forTable(cfg.targetTableName) + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) + .withProps(props); + if (null != schemaProvider) { + builder = builder.withSchema(schemaProvider.getTargetSchema().toString()); + } + return builder.build(); } public enum Operation { @@ -266,6 +370,10 @@ public class HoodieDeltaStreamer implements Serializable { public String propsFilePath = "file://" + System.getProperty("user.dir") + "/src/test/resources/delta-streamer-config/dfs-source.properties"; + @Parameter(names = {"--hoodie-conf"}, description = "Any configuration that can be set in the properties file " + + "(using the CLI parameter \"--propsFilePath\") can also be passed command line using this parameter") + public List configs = new ArrayList<>(); + @Parameter(names = {"--source-class"}, description = "Subclass of com.uber.hoodie.utilities.sources to read data. 
" + "Built-in options: com.uber.hoodie.utilities.sources.{JsonDFSSource (default), AvroDFSSource, " + "JsonKafkaSource, AvroKafkaSource, HiveIncrPullSource}") @@ -285,11 +393,22 @@ public class HoodieDeltaStreamer implements Serializable { public String payloadClassName = OverwriteWithLatestAvroPayload.class.getName(); @Parameter(names = {"--schemaprovider-class"}, description = "subclass of com.uber.hoodie.utilities.schema" - + ".SchemaProvider to attach schemas to input & target table data, built in options: FilebasedSchemaProvider") - public String schemaProviderClassName = FilebasedSchemaProvider.class.getName(); + + ".SchemaProvider to attach schemas to input & target table data, built in options: " + + "com.uber.hoodie.utilities.schema.FilebasedSchemaProvider." + + "Source (See com.uber.hoodie.utilities.sources.Source) implementation can implement their own SchemaProvider." + + " For Sources that return Dataset, the schema is obtained implicitly. " + + "However, this CLI option allows overriding the schemaprovider returned by Source.") + public String schemaProviderClassName = null; + + @Parameter(names = {"--transformer-class"}, + description = "subclass of com.uber.hoodie.utilities.transform.Transformer" + + ". Allows transforming raw source dataset to a target dataset (conforming to target schema) before writing." + + " Default : Not set. E:g - com.uber.hoodie.utilities.transform.SqlQueryBasedTransformer (which allows" + + "a SQL query templated to be passed as a transformation function)") + public String transformerClassName = null; @Parameter(names = {"--source-limit"}, description = "Maximum amount of data to read from source. 
" - + "Default: No limit For e.g: DFSSource => max bytes to read, KafkaSource => max events to read") + + "Default: No limit For e.g: DFS-Source => max bytes to read, Kafka-Source => max events to read") public long sourceLimit = Long.MAX_VALUE; @Parameter(names = {"--op"}, description = "Takes one of these values : UPSERT (default), INSERT (use when input " @@ -301,6 +420,9 @@ public class HoodieDeltaStreamer implements Serializable { + "before insert/bulk-insert") public Boolean filterDupes = false; + @Parameter(names = {"--enable-hive-sync"}, description = "Enable syncing to hive") + public Boolean enableHiveSync = false; + @Parameter(names = {"--spark-master"}, description = "spark master to use.") public String sparkMaster = "local[2]"; @@ -319,4 +441,44 @@ public class HoodieDeltaStreamer implements Serializable { JavaSparkContext jssc = UtilHelpers.buildSparkContext("delta-streamer-" + cfg.targetTableName, cfg.sparkMaster); new HoodieDeltaStreamer(cfg, jssc).sync(); } + + public SourceFormatAdapter getFormatAdapter() { + return formatAdapter; + } + + public SchemaProvider getSchemaProvider() { + return schemaProvider; + } + + public Transformer getTransformer() { + return transformer; + } + + public KeyGenerator getKeyGenerator() { + return keyGenerator; + } + + public FileSystem getFs() { + return fs; + } + + public Optional getCommitTimelineOpt() { + return commitTimelineOpt; + } + + public JavaSparkContext getJssc() { + return jssc; + } + + public SparkSession getSparkSession() { + return sparkSession; + } + + public HiveConf getHiveConf() { + return hiveConf; + } + + public TypedProperties getProps() { + return props; + } } diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/deltastreamer/HoodieDeltaStreamerMetrics.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/deltastreamer/HoodieDeltaStreamerMetrics.java new file mode 100644 index 000000000..2fc2f81a3 --- /dev/null +++ 
b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/deltastreamer/HoodieDeltaStreamerMetrics.java @@ -0,0 +1,61 @@ +package com.uber.hoodie.utilities.deltastreamer; + +import static com.uber.hoodie.metrics.Metrics.registerGauge; + +import com.codahale.metrics.Timer; +import com.uber.hoodie.config.HoodieWriteConfig; +import com.uber.hoodie.metrics.Metrics; + +public class HoodieDeltaStreamerMetrics { + + private HoodieWriteConfig config = null; + private String tableName = null; + + public String overallTimerName = null; + public String hiveSyncTimerName = null; + private Timer overallTimer = null; + public Timer hiveSyncTimer = null; + + public HoodieDeltaStreamerMetrics(HoodieWriteConfig config) { + this.config = config; + this.tableName = config.getTableName(); + if (config.isMetricsOn()) { + Metrics.init(config); + this.overallTimerName = getMetricsName("timer", "deltastreamer"); + this.hiveSyncTimerName = getMetricsName("timer", "deltastreamerHiveSync"); + } + } + + public Timer.Context getOverallTimerContext() { + if (config.isMetricsOn() && overallTimer == null) { + overallTimer = createTimer(overallTimerName); + } + return overallTimer == null ? null : overallTimer.time(); + } + + public Timer.Context getHiveSyncTimerContext() { + if (config.isMetricsOn() && hiveSyncTimer == null) { + hiveSyncTimer = createTimer(hiveSyncTimerName); + } + return hiveSyncTimer == null ? null : hiveSyncTimer.time(); + } + + private Timer createTimer(String name) { + return config.isMetricsOn() ? Metrics.getInstance().getRegistry().timer(name) : null; + } + + String getMetricsName(String action, String metric) { + return config == null ? 
null : String.format("%s.%s.%s", tableName, action, metric); + } + + public void updateDeltaStreamerMetrics(long durationInNs, long hiveSyncNs) { + if (config.isMetricsOn()) { + registerGauge(getMetricsName("deltastreamer", "duration"), getDurationInMs(durationInNs)); + registerGauge(getMetricsName("deltastreamer", "hiveSyncDuration"), getDurationInMs(hiveSyncNs)); + } + } + + public long getDurationInMs(long ctxDuration) { + return ctxDuration / 1000000; + } +} diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/deltastreamer/SourceFormatAdapter.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/deltastreamer/SourceFormatAdapter.java new file mode 100644 index 000000000..8214f260b --- /dev/null +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/deltastreamer/SourceFormatAdapter.java @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * + */ + +package com.uber.hoodie.utilities.deltastreamer; + +import static com.uber.hoodie.utilities.schema.RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE; +import static com.uber.hoodie.utilities.schema.RowBasedSchemaProvider.HOODIE_RECORD_STRUCT_NAME; + +import com.uber.hoodie.AvroConversionUtils; +import com.uber.hoodie.utilities.sources.AvroSource; +import com.uber.hoodie.utilities.sources.InputBatch; +import com.uber.hoodie.utilities.sources.JsonSource; +import com.uber.hoodie.utilities.sources.RowSource; +import com.uber.hoodie.utilities.sources.Source; +import com.uber.hoodie.utilities.sources.helpers.AvroConvertor; +import java.util.Optional; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.types.StructType; + +/** + * Adapts data-format provided by the source to the data-format required by the client (DeltaStreamer) + */ +public final class SourceFormatAdapter { + + private final Source source; + + + public SourceFormatAdapter(Source source) { + this.source = source; + } + + /** + * Fetch new data in avro format. 
If the source provides data in different format, they are translated + * to Avro format + * @param lastCkptStr + * @param sourceLimit + * @return + */ + public InputBatch> fetchNewDataInAvroFormat(Optional lastCkptStr, + long sourceLimit) { + switch (source.getSourceType()) { + case AVRO: + return ((AvroSource)source).fetchNext(lastCkptStr, sourceLimit); + case JSON: { + InputBatch> r = ((JsonSource)source).fetchNext(lastCkptStr, sourceLimit); + AvroConvertor convertor = new AvroConvertor(r.getSchemaProvider().getSourceSchema()); + return new InputBatch<>(Optional.ofNullable( + r.getBatch().map(rdd -> rdd.map(convertor::fromJson)) + .orElse(null)), r.getCheckpointForNextBatch(), r.getSchemaProvider()); + } + case ROW: { + InputBatch> r = ((RowSource)source).fetchNext(lastCkptStr, sourceLimit); + return new InputBatch<>(Optional.ofNullable(r.getBatch().map( + rdd -> (AvroConversionUtils.createRdd(rdd, HOODIE_RECORD_STRUCT_NAME, HOODIE_RECORD_NAMESPACE).toJavaRDD())) + .orElse(null)), r.getCheckpointForNextBatch(), r.getSchemaProvider()); + } + default: + throw new IllegalArgumentException("Unknown source type (" + source.getSourceType() + ")"); + } + } + + /** + * Fetch new data in row format. 
If the source provides data in different format, they are translated + * to Row format + * @param lastCkptStr + * @param sourceLimit + * @return + */ + public InputBatch> fetchNewDataInRowFormat(Optional lastCkptStr, long sourceLimit) { + switch (source.getSourceType()) { + case ROW: + return ((RowSource)source).fetchNext(lastCkptStr, sourceLimit); + case AVRO: { + InputBatch> r = ((AvroSource)source).fetchNext(lastCkptStr, sourceLimit); + Schema sourceSchema = r.getSchemaProvider().getSourceSchema(); + return new InputBatch<>(Optional.ofNullable( + r.getBatch().map(rdd -> AvroConversionUtils.createDataFrame(JavaRDD.toRDD(rdd), + sourceSchema.toString(), source.getSparkSession())) + .orElse(null)), r.getCheckpointForNextBatch(), r.getSchemaProvider()); + } + case JSON: { + InputBatch> r = ((JsonSource)source).fetchNext(lastCkptStr, sourceLimit); + Schema sourceSchema = r.getSchemaProvider().getSourceSchema(); + StructType dataType = AvroConversionUtils.convertAvroSchemaToStructType(sourceSchema); + return new InputBatch<>(Optional.ofNullable( + r.getBatch().map(rdd -> source.getSparkSession().read().schema(dataType).json(rdd)) + .orElse(null)), r.getCheckpointForNextBatch(), r.getSchemaProvider()); + } + default: + throw new IllegalArgumentException("Unknown source type (" + source.getSourceType() + ")"); + } + } +} diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/schema/RowBasedSchemaProvider.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/schema/RowBasedSchemaProvider.java new file mode 100644 index 000000000..0a9d7c616 --- /dev/null +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/schema/RowBasedSchemaProvider.java @@ -0,0 +1,25 @@ +package com.uber.hoodie.utilities.schema; + +import com.uber.hoodie.AvroConversionUtils; +import org.apache.avro.Schema; +import org.apache.spark.sql.types.StructType; + +public class RowBasedSchemaProvider extends SchemaProvider { + + // Used in GenericRecord conversions + public 
static final String HOODIE_RECORD_NAMESPACE = "hoodie.source"; + public static final String HOODIE_RECORD_STRUCT_NAME = "hoodie_source"; + + private StructType rowStruct; + + public RowBasedSchemaProvider(StructType rowStruct) { + super(null, null); + this.rowStruct = rowStruct; + } + + @Override + public Schema getSourceSchema() { + return AvroConversionUtils.convertStructTypeToAvroSchema(rowStruct, HOODIE_RECORD_STRUCT_NAME, + HOODIE_RECORD_NAMESPACE); + } +} diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/schema/SchemaRegistryProvider.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/schema/SchemaRegistryProvider.java index 3312db5aa..84d8f94fe 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/schema/SchemaRegistryProvider.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/schema/SchemaRegistryProvider.java @@ -42,12 +42,15 @@ public class SchemaRegistryProvider extends SchemaProvider { */ public static class Config { - private static final String SCHEMA_REGISTRY_URL_PROP = "hoodie.deltastreamer.schemaprovider.registry.url"; + private static final String SRC_SCHEMA_REGISTRY_URL_PROP = "hoodie.deltastreamer.schemaprovider.registry.url"; + private static final String TARGET_SCHEMA_REGISTRY_URL_PROP = + "hoodie.deltastreamer.schemaprovider.registry.targetUrl"; } private final Schema schema; + private final Schema targetSchema; - private String fetchSchemaFromRegistry(String registryUrl) throws IOException { + private static String fetchSchemaFromRegistry(String registryUrl) throws IOException { URL registry = new URL(registryUrl); ObjectMapper mapper = new ObjectMapper(); JsonNode node = mapper.readTree(registry.openStream()); @@ -56,17 +59,32 @@ public class SchemaRegistryProvider extends SchemaProvider { public SchemaRegistryProvider(TypedProperties props, JavaSparkContext jssc) { super(props, jssc); - DataSourceUtils.checkRequiredProperties(props, 
Collections.singletonList(Config.SCHEMA_REGISTRY_URL_PROP)); - String registryUrl = props.getString(Config.SCHEMA_REGISTRY_URL_PROP); + DataSourceUtils.checkRequiredProperties(props, Collections.singletonList(Config.SRC_SCHEMA_REGISTRY_URL_PROP)); + String registryUrl = props.getString(Config.SRC_SCHEMA_REGISTRY_URL_PROP); + String targetRegistryUrl = props.getString(Config.TARGET_SCHEMA_REGISTRY_URL_PROP, registryUrl); try { - this.schema = new Schema.Parser().parse(fetchSchemaFromRegistry(registryUrl)); + this.schema = getSchema(registryUrl); + if (!targetRegistryUrl.equals(registryUrl)) { + this.targetSchema = getSchema(targetRegistryUrl); + } else { + this.targetSchema = schema; + } } catch (IOException ioe) { throw new HoodieIOException("Error reading schema from registry :" + registryUrl, ioe); } } + private static Schema getSchema(String registryUrl) throws IOException { + return new Schema.Parser().parse(fetchSchemaFromRegistry(registryUrl)); + } + @Override public Schema getSourceSchema() { return schema; } + + @Override + public Schema getTargetSchema() { + return targetSchema; + } } diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroDFSSource.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroDFSSource.java index 335d06a9b..2f21f3253 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroDFSSource.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroDFSSource.java @@ -19,7 +19,10 @@ package com.uber.hoodie.utilities.sources; import com.uber.hoodie.common.util.TypedProperties; +import com.uber.hoodie.common.util.collection.Pair; import com.uber.hoodie.utilities.schema.SchemaProvider; +import com.uber.hoodie.utilities.sources.helpers.DFSPathSelector; +import java.util.Optional; import org.apache.avro.generic.GenericRecord; import org.apache.avro.mapred.AvroKey; import org.apache.avro.mapreduce.AvroKeyInputFormat; @@ -27,18 +30,33 @@ import 
org.apache.hadoop.io.NullWritable; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; /** * DFS Source that reads avro data */ -public class AvroDFSSource extends DFSSource { +public class AvroDFSSource extends AvroSource { - public AvroDFSSource(TypedProperties props, JavaSparkContext sparkContext, SchemaProvider schemaProvider) { - super(props, sparkContext, schemaProvider); + private final DFSPathSelector pathSelector; + + public AvroDFSSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession, + SchemaProvider schemaProvider) { + super(props, sparkContext, sparkSession, schemaProvider); + this.pathSelector = new DFSPathSelector(props, sparkContext.hadoopConfiguration()); } @Override - protected JavaRDD fromFiles(AvroConvertor convertor, String pathStr) { + protected InputBatch> fetchNewData(Optional lastCkptStr, + long sourceLimit) { + Pair, String> selectPathsWithMaxModificationTime = + pathSelector.getNextFilePathsAndMaxModificationTime(lastCkptStr, sourceLimit); + return selectPathsWithMaxModificationTime.getLeft().map(pathStr -> new InputBatch<>( + Optional.of(fromFiles(pathStr)), + selectPathsWithMaxModificationTime.getRight())) + .orElseGet(() -> new InputBatch<>(Optional.empty(), selectPathsWithMaxModificationTime.getRight())); + } + + private JavaRDD fromFiles(String pathStr) { JavaPairRDD avroRDD = sparkContext.newAPIHadoopFile(pathStr, AvroKeyInputFormat.class, AvroKey.class, NullWritable.class, sparkContext.hadoopConfiguration()); diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroKafkaSource.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroKafkaSource.java index 4e1471413..f1e51e0ea 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroKafkaSource.java +++ 
b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroKafkaSource.java @@ -20,27 +20,55 @@ package com.uber.hoodie.utilities.sources; import com.uber.hoodie.common.util.TypedProperties; import com.uber.hoodie.utilities.schema.SchemaProvider; +import com.uber.hoodie.utilities.sources.helpers.KafkaOffsetGen; +import com.uber.hoodie.utilities.sources.helpers.KafkaOffsetGen.CheckpointUtils; import io.confluent.kafka.serializers.KafkaAvroDecoder; +import java.util.Optional; import kafka.serializer.StringDecoder; import org.apache.avro.generic.GenericRecord; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; import org.apache.spark.streaming.kafka.KafkaUtils; import org.apache.spark.streaming.kafka.OffsetRange; /** * Reads avro serialized Kafka data, based on the confluent schema-registry */ -public class AvroKafkaSource extends KafkaSource { +public class AvroKafkaSource extends AvroSource { - public AvroKafkaSource(TypedProperties props, JavaSparkContext sparkContext, SchemaProvider schemaProvider) { - super(props, sparkContext, schemaProvider); + private static Logger log = LogManager.getLogger(AvroKafkaSource.class); + + private final KafkaOffsetGen offsetGen; + + public AvroKafkaSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession, + SchemaProvider schemaProvider) { + super(props, sparkContext, sparkSession, schemaProvider); + offsetGen = new KafkaOffsetGen(props); } @Override - protected JavaRDD toAvroRDD(OffsetRange[] offsetRanges, AvroConvertor avroConvertor) { - return KafkaUtils - .createRDD(sparkContext, String.class, Object.class, StringDecoder.class, KafkaAvroDecoder.class, kafkaParams, - offsetRanges).values().map(obj -> (GenericRecord) obj); + protected InputBatch> fetchNewData(Optional lastCheckpointStr, + long sourceLimit) { + OffsetRange[] 
offsetRanges = offsetGen.getNextOffsetRanges(lastCheckpointStr, sourceLimit); + long totalNewMsgs = CheckpointUtils.totalNewMessages(offsetRanges); + if (totalNewMsgs <= 0) { + return new InputBatch<>(Optional.empty(), + lastCheckpointStr.isPresent() ? lastCheckpointStr.get() : ""); + } else { + log.info("About to read " + totalNewMsgs + " from Kafka for topic :" + offsetGen.getTopicName()); + } + JavaRDD newDataRDD = toRDD(offsetRanges); + return new InputBatch<>(Optional.of(newDataRDD), + KafkaOffsetGen.CheckpointUtils.offsetsToStr(offsetRanges)); + } + + private JavaRDD toRDD(OffsetRange[] offsetRanges) { + JavaRDD recordRDD = KafkaUtils + .createRDD(sparkContext, String.class, Object.class, StringDecoder.class, KafkaAvroDecoder.class, + offsetGen.getKafkaParams(), offsetRanges).values().map(obj -> (GenericRecord) obj); + return recordRDD; } } diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroSource.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroSource.java new file mode 100644 index 000000000..ba767ad62 --- /dev/null +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroSource.java @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * + */ + +package com.uber.hoodie.utilities.sources; + +import com.uber.hoodie.common.util.TypedProperties; +import com.uber.hoodie.utilities.schema.SchemaProvider; +import org.apache.avro.generic.GenericRecord; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; + +public abstract class AvroSource extends Source> { + + public AvroSource(TypedProperties props, + JavaSparkContext sparkContext, + SparkSession sparkSession, + SchemaProvider schemaProvider) { + super(props, sparkContext, sparkSession, schemaProvider, SourceType.AVRO); + } +} diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/HiveIncrPullSource.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/HiveIncrPullSource.java index 3d6af40fa..0f5e78746 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/HiveIncrPullSource.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/HiveIncrPullSource.java @@ -21,8 +21,6 @@ package com.uber.hoodie.utilities.sources; import com.uber.hoodie.DataSourceUtils; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.common.util.TypedProperties; -import com.uber.hoodie.common.util.collection.ImmutablePair; -import com.uber.hoodie.common.util.collection.Pair; import com.uber.hoodie.exception.HoodieIOException; import com.uber.hoodie.utilities.schema.SchemaProvider; import java.io.IOException; @@ -44,19 +42,20 @@ import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; /** - * Source to read deltas produced by {@link com.uber.hoodie.utilities.HiveIncrementalPuller}, commit - * by commit and apply to the target table + * Source to read deltas produced by {@link com.uber.hoodie.utilities.HiveIncrementalPuller}, commit by commit 
and apply + * to the target table *

* The general idea here is to have commits sync across the data pipeline. *

- * [Source Tables(s)] ====> HiveIncrementalScanner ==> incrPullRootPath ==> targetTable - * {c1,c2,c3,...} {c1,c2,c3,...} {c1,c2,c3,...} + * [Source Table(s)] ====> HiveIncrementalScanner ==> incrPullRootPath ==> targetTable {c1,c2,c3,...} + * {c1,c2,c3,...} {c1,c2,c3,...} *

* This produces beautiful causality, that makes data issues in ETLs very easy to debug */ -public class HiveIncrPullSource extends Source { +public class HiveIncrPullSource extends AvroSource { private static volatile Logger log = LogManager.getLogger(HiveIncrPullSource.class); @@ -73,9 +72,9 @@ public class HiveIncrPullSource extends Source { private static final String ROOT_INPUT_PATH_PROP = "hoodie.deltastreamer.source.incrpull.root"; } - public HiveIncrPullSource(TypedProperties props, JavaSparkContext sparkContext, + public HiveIncrPullSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession, SchemaProvider schemaProvider) { - super(props, sparkContext, schemaProvider); + super(props, sparkContext, sparkSession, schemaProvider); DataSourceUtils.checkRequiredProperties(props, Collections.singletonList(Config.ROOT_INPUT_PATH_PROP)); this.incrPullRootPath = props.getString(Config.ROOT_INPUT_PATH_PROP); this.fs = FSUtils.getFs(incrPullRootPath, sparkContext.hadoopConfiguration()); @@ -113,15 +112,15 @@ public class HiveIncrPullSource extends Source { } @Override - public Pair>, String> fetchNewData( + protected InputBatch> fetchNewData( Optional lastCheckpointStr, long sourceLimit) { try { // find the source commit to pull Optional commitToPull = findCommitToPull(lastCheckpointStr); if (!commitToPull.isPresent()) { - return new ImmutablePair<>(Optional.empty(), - lastCheckpointStr.orElse("")); + return new InputBatch<>(Optional.empty(), + lastCheckpointStr.isPresent() ? lastCheckpointStr.get() : ""); } // read the files out. 
@@ -132,7 +131,7 @@ public class HiveIncrPullSource extends Source { JavaPairRDD avroRDD = sparkContext.newAPIHadoopFile(pathStr, AvroKeyInputFormat.class, AvroKey.class, NullWritable.class, sparkContext.hadoopConfiguration()); - return new ImmutablePair<>(Optional.of(avroRDD.keys().map(r -> ((GenericRecord) r.datum()))), + return new InputBatch<>(Optional.of(avroRDD.keys().map(r -> ((GenericRecord) r.datum()))), String.valueOf(commitToPull.get())); } catch (IOException ioe) { throw new HoodieIOException( diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/HoodieIncrSource.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/HoodieIncrSource.java new file mode 100644 index 000000000..430eb1e1e --- /dev/null +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/HoodieIncrSource.java @@ -0,0 +1,144 @@ +package com.uber.hoodie.utilities.sources; + +import com.uber.hoodie.DataSourceReadOptions; +import com.uber.hoodie.DataSourceUtils; +import com.uber.hoodie.common.model.HoodieRecord; +import com.uber.hoodie.common.util.TypedProperties; +import com.uber.hoodie.common.util.collection.Pair; +import com.uber.hoodie.hive.SlashEncodedDayPartitionValueExtractor; +import com.uber.hoodie.utilities.schema.SchemaProvider; +import com.uber.hoodie.utilities.sources.helpers.IncrSourceHelper; +import java.util.Arrays; +import java.util.Optional; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.DataFrameReader; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; + +public class HoodieIncrSource extends RowSource { + + /** + * Configs supported + */ + protected static class Config { + + /** + * {@value #HOODIE_SRC_BASE_PATH} is the base-path for the source Hoodie table + */ + private static final String HOODIE_SRC_BASE_PATH = "hoodie.deltastreamer.source.hoodieincr.path"; + + /** + * {@value #NUM_INSTANTS_PER_FETCH} allows the max 
number of instants whose changes can be incrementally fetched + */ + private static final String NUM_INSTANTS_PER_FETCH = "hoodie.deltastreamer.source.hoodieincr.num_instants"; + private static final Integer DEFAULT_NUM_INSTANTS_PER_FETCH = 1; + + /** + * {@value #HOODIE_SRC_PARTITION_FIELDS} specifies partition fields that needs to be added to source table after + * parsing _hoodie_partition_path + */ + private static final String HOODIE_SRC_PARTITION_FIELDS = "hoodie.deltastreamer.source.hoodieincr.partition.fields"; + + /** + * {@value #HOODIE_SRC_PARTITION_EXTRACTORCLASS} PartitionValueExtractor class to extract partition fields from + * _hoodie_partition_path + */ + private static final String HOODIE_SRC_PARTITION_EXTRACTORCLASS = + "hoodie.deltastreamer.source.hoodieincr.partition.extractor.class"; + private static final String DEFAULT_HOODIE_SRC_PARTITION_EXTRACTORCLASS = + SlashEncodedDayPartitionValueExtractor.class.getCanonicalName(); + + /** + * {@value #READ_LATEST_INSTANT_ON_MISSING_CKPT} allows delta-streamer to incrementally fetch from latest committed + * instant when checkpoint is not provided. 
+ */ + private static final String READ_LATEST_INSTANT_ON_MISSING_CKPT = + "hoodie.deltastreamer.source.hoodieincr.read_latest_on_missing_ckpt"; + private static final Boolean DEFAULT_READ_LATEST_INSTANT_ON_MISSING_CKPT = false; + } + + public HoodieIncrSource(TypedProperties props, + JavaSparkContext sparkContext, SparkSession sparkSession, + SchemaProvider schemaProvider) { + super(props, sparkContext, sparkSession, schemaProvider); + } + + @Override + public Pair>, String> fetchNextBatch(Optional lastCkptStr, long sourceLimit) { + + DataSourceUtils.checkRequiredProperties(props, Arrays.asList(Config.HOODIE_SRC_BASE_PATH)); + + /** + DataSourceUtils.checkRequiredProperties(props, Arrays.asList(Config.HOODIE_SRC_BASE_PATH, + Config.HOODIE_SRC_PARTITION_FIELDS)); + List partitionFields = props.getStringList(Config.HOODIE_SRC_PARTITION_FIELDS, ",", + new ArrayList<>()); + PartitionValueExtractor extractor = DataSourceUtils.createPartitionExtractor(props.getString( + Config.HOODIE_SRC_PARTITION_EXTRACTORCLASS, Config.DEFAULT_HOODIE_SRC_PARTITION_EXTRACTORCLASS)); + **/ + String srcPath = props.getString(Config.HOODIE_SRC_BASE_PATH); + int numInstantsPerFetch = props.getInteger(Config.NUM_INSTANTS_PER_FETCH, Config.DEFAULT_NUM_INSTANTS_PER_FETCH); + boolean readLatestOnMissingCkpt = props.getBoolean(Config.READ_LATEST_INSTANT_ON_MISSING_CKPT, + Config.DEFAULT_READ_LATEST_INSTANT_ON_MISSING_CKPT); + + // Use begin Instant if set and non-empty + Optional beginInstant = + lastCkptStr.isPresent() ? lastCkptStr.get().isEmpty() ? Optional.empty() : lastCkptStr : Optional.empty(); + + Pair instantEndpts = IncrSourceHelper.calculateBeginAndEndInstants(sparkContext, srcPath, + numInstantsPerFetch, beginInstant, readLatestOnMissingCkpt); + + if (instantEndpts.getKey().equals(instantEndpts.getValue())) { + log.warn("Already caught up. Begin Checkpoint was :" + instantEndpts.getKey()); + return Pair.of(Optional.empty(), instantEndpts.getKey()); + } + + // Do Incr pull. 
Set end instant if available + DataFrameReader reader = sparkSession.read().format("com.uber.hoodie") + .option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY(), DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL()) + .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), instantEndpts.getLeft()) + .option(DataSourceReadOptions.END_INSTANTTIME_OPT_KEY(), instantEndpts.getRight()); + + Dataset source = reader.load(srcPath); + + /** + log.info("Partition Fields are : (" + partitionFields + "). Initial Source Schema :" + source.schema()); + + StructType newSchema = new StructType(source.schema().fields()); + for (String field : partitionFields) { + newSchema = newSchema.add(field, DataTypes.StringType, true); + } + + /** + * Validates if the commit time is sane and also generates Partition fields from _hoodie_partition_path if + * configured + * + Dataset validated = source.map((MapFunction) (Row row) -> { + // _hoodie_instant_time + String instantTime = row.getString(0); + IncrSourceHelper.validateInstantTime(row, instantTime, instantEndpts.getKey(), instantEndpts.getValue()); + if (!partitionFields.isEmpty()) { + // _hoodie_partition_path + String hoodiePartitionPath = row.getString(3); + List partitionVals = extractor.extractPartitionValuesInPath(hoodiePartitionPath).stream() + .map(o -> (Object) o).collect(Collectors.toList()); + Preconditions.checkArgument(partitionVals.size() == partitionFields.size(), + "#partition-fields != #partition-values-extracted"); + List rowObjs = new ArrayList<>(scala.collection.JavaConversions.seqAsJavaList(row.toSeq())); + rowObjs.addAll(partitionVals); + return RowFactory.create(rowObjs.toArray()); + } + return row; + }, RowEncoder.apply(newSchema)); + + log.info("Validated Source Schema :" + validated.schema()); + **/ + + // Remove Hoodie meta columns except partition path from input source + final Dataset src = source.drop(HoodieRecord.HOODIE_META_COLUMNS.stream() + .filter(x -> 
!x.equals(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).toArray(String[]::new)); + //log.info("Final Schema from Source is :" + src.schema()); + return Pair.of(Optional.of(src), instantEndpts.getRight()); + } +} diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/InputBatch.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/InputBatch.java new file mode 100644 index 000000000..9139057b9 --- /dev/null +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/InputBatch.java @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * + */ + +package com.uber.hoodie.utilities.sources; + +import com.uber.hoodie.utilities.schema.SchemaProvider; +import java.util.Optional; + +public class InputBatch { + + private final Optional batch; + private final String checkpointForNextBatch; + private final SchemaProvider schemaProvider; + + public InputBatch(Optional batch, String checkpointForNextBatch, + SchemaProvider schemaProvider) { + this.batch = batch; + this.checkpointForNextBatch = checkpointForNextBatch; + this.schemaProvider = schemaProvider; + } + + public InputBatch(Optional batch, String checkpointForNextBatch) { + this.batch = batch; + this.checkpointForNextBatch = checkpointForNextBatch; + this.schemaProvider = null; + } + + public Optional getBatch() { + return batch; + } + + public String getCheckpointForNextBatch() { + return checkpointForNextBatch; + } + + public SchemaProvider getSchemaProvider() { + return schemaProvider; + } +} diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/JsonDFSSource.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/JsonDFSSource.java index 6b1018e15..bbf985ba0 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/JsonDFSSource.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/JsonDFSSource.java @@ -19,22 +19,38 @@ package com.uber.hoodie.utilities.sources; import com.uber.hoodie.common.util.TypedProperties; +import com.uber.hoodie.common.util.collection.Pair; import com.uber.hoodie.utilities.schema.SchemaProvider; -import org.apache.avro.generic.GenericRecord; +import com.uber.hoodie.utilities.sources.helpers.DFSPathSelector; +import java.util.Optional; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; /** * DFS Source that reads json data */ -public class JsonDFSSource extends DFSSource { +public class JsonDFSSource extends JsonSource { - public 
JsonDFSSource(TypedProperties props, JavaSparkContext sparkContext, SchemaProvider schemaProvider) { - super(props, sparkContext, schemaProvider); + private final DFSPathSelector pathSelector; + + public JsonDFSSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession, + SchemaProvider schemaProvider) { + super(props, sparkContext, sparkSession, schemaProvider); + this.pathSelector = new DFSPathSelector(props, sparkContext.hadoopConfiguration()); } @Override - protected JavaRDD fromFiles(AvroConvertor convertor, String pathStr) { - return sparkContext.textFile(pathStr).map(convertor::fromJson); + protected InputBatch> fetchNewData(Optional lastCkptStr, + long sourceLimit) { + Pair, String> selPathsWithMaxModificationTime = + pathSelector.getNextFilePathsAndMaxModificationTime(lastCkptStr, sourceLimit); + return selPathsWithMaxModificationTime.getLeft().map(pathStr -> new InputBatch<>( + Optional.of(fromFiles(pathStr)), selPathsWithMaxModificationTime.getRight())) + .orElse(new InputBatch<>(Optional.empty(), selPathsWithMaxModificationTime.getRight())); + } + + private JavaRDD fromFiles(String pathStr) { + return sparkContext.textFile(pathStr); } } diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/JsonKafkaSource.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/JsonKafkaSource.java index b271e3704..339c37355 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/JsonKafkaSource.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/JsonKafkaSource.java @@ -20,26 +20,49 @@ package com.uber.hoodie.utilities.sources; import com.uber.hoodie.common.util.TypedProperties; import com.uber.hoodie.utilities.schema.SchemaProvider; +import com.uber.hoodie.utilities.sources.helpers.KafkaOffsetGen; +import com.uber.hoodie.utilities.sources.helpers.KafkaOffsetGen.CheckpointUtils; +import java.util.Optional; import kafka.serializer.StringDecoder; -import 
org.apache.avro.generic.GenericRecord; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; import org.apache.spark.streaming.kafka.KafkaUtils; import org.apache.spark.streaming.kafka.OffsetRange; /** * Read json kafka data */ -public class JsonKafkaSource extends KafkaSource { +public class JsonKafkaSource extends JsonSource { - public JsonKafkaSource(TypedProperties properties, JavaSparkContext sparkContext, SchemaProvider schemaProvider) { - super(properties, sparkContext, schemaProvider); + private static Logger log = LogManager.getLogger(JsonKafkaSource.class); + + private final KafkaOffsetGen offsetGen; + + public JsonKafkaSource(TypedProperties properties, JavaSparkContext sparkContext, SparkSession sparkSession, + SchemaProvider schemaProvider) { + super(properties, sparkContext, sparkSession, schemaProvider); + offsetGen = new KafkaOffsetGen(properties); } @Override - protected JavaRDD toAvroRDD(OffsetRange[] offsetRanges, AvroConvertor avroConvertor) { + protected InputBatch> fetchNewData(Optional lastCheckpointStr, + long sourceLimit) { + OffsetRange[] offsetRanges = offsetGen.getNextOffsetRanges(lastCheckpointStr, sourceLimit); + long totalNewMsgs = CheckpointUtils.totalNewMessages(offsetRanges); + if (totalNewMsgs <= 0) { + return new InputBatch<>(Optional.empty(), + lastCheckpointStr.isPresent() ? 
lastCheckpointStr.get() : ""); + } + log.info("About to read " + totalNewMsgs + " from Kafka for topic :" + offsetGen.getTopicName()); + JavaRDD newDataRDD = toRDD(offsetRanges); + return new InputBatch<>(Optional.of(newDataRDD), CheckpointUtils.offsetsToStr(offsetRanges)); + } + + private JavaRDD toRDD(OffsetRange[] offsetRanges) { return KafkaUtils.createRDD(sparkContext, String.class, String.class, StringDecoder.class, StringDecoder.class, - kafkaParams, offsetRanges) - .values().map(avroConvertor::fromJson); + offsetGen.getKafkaParams(), offsetRanges).values(); } } diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/JsonSource.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/JsonSource.java new file mode 100644 index 000000000..27ec5f3eb --- /dev/null +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/JsonSource.java @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * + */ + +package com.uber.hoodie.utilities.sources; + +import com.uber.hoodie.common.util.TypedProperties; +import com.uber.hoodie.utilities.schema.SchemaProvider; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; + +public abstract class JsonSource extends Source> { + + public JsonSource(TypedProperties props, + JavaSparkContext sparkContext, + SparkSession sparkSession, + SchemaProvider schemaProvider) { + super(props, sparkContext, sparkSession, schemaProvider, SourceType.JSON); + } +} diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/RowSource.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/RowSource.java new file mode 100644 index 000000000..708e55d04 --- /dev/null +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/RowSource.java @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * + */ + +package com.uber.hoodie.utilities.sources; + +import com.uber.hoodie.common.util.TypedProperties; +import com.uber.hoodie.common.util.collection.Pair; +import com.uber.hoodie.utilities.schema.RowBasedSchemaProvider; +import com.uber.hoodie.utilities.schema.SchemaProvider; +import java.util.Optional; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; + +public abstract class RowSource extends Source> { + + public RowSource(TypedProperties props, + JavaSparkContext sparkContext, + SparkSession sparkSession, + SchemaProvider schemaProvider) { + super(props, sparkContext, sparkSession, schemaProvider, SourceType.ROW); + } + + protected abstract Pair>, String> fetchNextBatch(Optional lastCkptStr, + long sourceLimit); + + @Override + protected final InputBatch> fetchNewData(Optional lastCkptStr, long sourceLimit) { + Pair>, String> res = fetchNextBatch(lastCkptStr, sourceLimit); + return res.getKey().map(dsr -> { + SchemaProvider rowSchemaProvider = new RowBasedSchemaProvider(dsr.schema()); + return new InputBatch<>(res.getKey(), res.getValue(), rowSchemaProvider); + }).orElseGet(() -> new InputBatch<>(res.getKey(), res.getValue())); + } +} diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/Source.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/Source.java index 06b83e9b1..4744e9375 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/Source.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/Source.java @@ -19,36 +19,67 @@ package com.uber.hoodie.utilities.sources; import com.uber.hoodie.common.util.TypedProperties; -import com.uber.hoodie.common.util.collection.Pair; import com.uber.hoodie.utilities.schema.SchemaProvider; import java.io.Serializable; import java.util.Optional; -import org.apache.avro.generic.GenericRecord; -import 
org.apache.spark.api.java.JavaRDD; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; /** * Represents a source from which we can tail data. Assumes a constructor that takes properties. */ -public abstract class Source implements Serializable { +public abstract class Source implements Serializable { + protected static volatile Logger log = LogManager.getLogger(Source.class); - protected transient TypedProperties props; - - protected transient JavaSparkContext sparkContext; - - protected transient SchemaProvider schemaProvider; - - - protected Source(TypedProperties props, JavaSparkContext sparkContext, SchemaProvider schemaProvider) { - this.props = props; - this.sparkContext = sparkContext; - this.schemaProvider = schemaProvider; + public enum SourceType { + JSON, + AVRO, + ROW } + protected transient TypedProperties props; + protected transient JavaSparkContext sparkContext; + protected transient SparkSession sparkSession; + private transient SchemaProvider overriddenSchemaProvider; + + private final SourceType sourceType; + + protected Source(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession, + SchemaProvider schemaProvider) { + this(props, sparkContext, sparkSession, schemaProvider, SourceType.AVRO); + } + + protected Source(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession, + SchemaProvider schemaProvider, SourceType sourceType) { + this.props = props; + this.sparkContext = sparkContext; + this.sparkSession = sparkSession; + this.overriddenSchemaProvider = schemaProvider; + this.sourceType = sourceType; + } + + protected abstract InputBatch fetchNewData(Optional lastCkptStr, long sourceLimit); + /** - * Fetches new data upto sourceLimit, from the provided checkpoint and returns an RDD of the - * data, as well as the checkpoint to be written as a result of that. 
+ * Main API called by Hoodie Delta Streamer to fetch records + * @param lastCkptStr Last Checkpoint + * @param sourceLimit Source Limit + * @return */ - public abstract Pair>, String> fetchNewData( - Optional lastCheckpointStr, long sourceLimit); + public final InputBatch fetchNext(Optional lastCkptStr, long sourceLimit) { + InputBatch batch = fetchNewData(lastCkptStr, sourceLimit); + // If overriddenSchemaProvider is passed in CLI, use it + return overriddenSchemaProvider == null ? batch : new InputBatch<>(batch.getBatch(), + batch.getCheckpointForNextBatch(), overriddenSchemaProvider); + } + + public SourceType getSourceType() { + return sourceType; + } + + public SparkSession getSparkSession() { + return sparkSession; + } } diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroConvertor.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/helpers/AvroConvertor.java similarity index 93% rename from hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroConvertor.java rename to hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/helpers/AvroConvertor.java index feb06d5b3..ef022b7a8 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroConvertor.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/helpers/AvroConvertor.java @@ -16,7 +16,7 @@ * */ -package com.uber.hoodie.utilities.sources; +package com.uber.hoodie.utilities.sources.helpers; import com.twitter.bijection.Injection; import com.twitter.bijection.avro.GenericAvroCodecs; @@ -55,6 +55,10 @@ public class AvroConvertor implements Serializable { this.schemaStr = schemaStr; } + public AvroConvertor(Schema schema) { + this.schemaStr = schema.toString(); + this.schema = schema; + } private void initSchema() { if (schema == null) { diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/DFSSource.java 
b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/helpers/DFSPathSelector.java similarity index 76% rename from hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/DFSSource.java rename to hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/helpers/DFSPathSelector.java index 6d962b276..2c5f9f292 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/DFSSource.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/helpers/DFSPathSelector.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ * */ -package com.uber.hoodie.utilities.sources; +package com.uber.hoodie.utilities.sources.helpers; import com.uber.hoodie.DataSourceUtils; import com.uber.hoodie.common.util.FSUtils; @@ -24,45 +24,38 @@ import com.uber.hoodie.common.util.TypedProperties; import com.uber.hoodie.common.util.collection.ImmutablePair; import com.uber.hoodie.common.util.collection.Pair; import com.uber.hoodie.exception.HoodieIOException; -import com.uber.hoodie.utilities.schema.SchemaProvider; import java.io.IOException; import java.util.*; import java.util.stream.Collectors; -import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.LocatedFileStatus; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.RemoteIterator; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -/** - * Source to read data from a given DFS directory structure, incrementally - */ -public abstract class DFSSource extends Source { +public class DFSPathSelector { /** * Configs 
supported */ static class Config { + private static final String ROOT_INPUT_PATH_PROP = "hoodie.deltastreamer.source.dfs.root"; } private static final List IGNORE_FILEPREFIX_LIST = Arrays.asList(".", "_"); private final transient FileSystem fs; + private final TypedProperties props; - public DFSSource(TypedProperties props, JavaSparkContext sparkContext, SchemaProvider schemaProvider) { - super(props, sparkContext, schemaProvider); - DataSourceUtils.checkRequiredProperties(props, Collections.singletonList(Config.ROOT_INPUT_PATH_PROP)); - this.fs = FSUtils.getFs(props.getString(Config.ROOT_INPUT_PATH_PROP), sparkContext.hadoopConfiguration()); + public DFSPathSelector(TypedProperties props, Configuration hadoopConf) { + DataSourceUtils.checkRequiredProperties(props, Arrays.asList(Config.ROOT_INPUT_PATH_PROP)); + this.props = props; + this.fs = FSUtils.getFs(props.getString(Config.ROOT_INPUT_PATH_PROP), hadoopConf); } - protected abstract JavaRDD fromFiles(final AvroConvertor convertor, String pathStr); - - @Override - public Pair>, String> fetchNewData( + public Pair, String> getNextFilePathsAndMaxModificationTime( Optional lastCheckpointStr, long sourceLimit) { try { @@ -111,11 +104,9 @@ public abstract class DFSSource extends Source { // read the files out. 
String pathStr = filteredFiles.stream().map(f -> f.getPath().toString()) .collect(Collectors.joining(",")); - String schemaStr = schemaProvider.getSourceSchema().toString(); - final AvroConvertor avroConvertor = new AvroConvertor(schemaStr); return new ImmutablePair<>( - Optional.of(fromFiles(avroConvertor, pathStr)), + Optional.ofNullable(pathStr), String.valueOf(maxModificationTime)); } catch (IOException ioe) { throw new HoodieIOException( diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/helpers/IncrSourceHelper.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/helpers/IncrSourceHelper.java new file mode 100644 index 000000000..93056012b --- /dev/null +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/helpers/IncrSourceHelper.java @@ -0,0 +1,88 @@ +package com.uber.hoodie.utilities.sources.helpers; + +import com.google.common.base.Preconditions; +import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.table.HoodieTimeline; +import com.uber.hoodie.common.table.timeline.HoodieInstant; +import com.uber.hoodie.common.util.collection.Pair; +import java.util.Optional; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Row; + + +/** + * Helper for Hudi Incremental Source. Has APIs to + * (a) calculate begin and end instant time for incrementally pulling from Hudi source + * (b) Find max seen instant to be set as checkpoint for next fetch. 
+ */ +public class IncrSourceHelper { + + /** + * Get a timestamp which is the next value in a descending sequence + * + * @param timestamp Timestamp + */ + private static String getStrictlyLowerTimestamp(String timestamp) { + long ts = Long.parseLong(timestamp); + Preconditions.checkArgument(ts > 0, "Timestamp must be positive"); + Long lower = ts - 1; + return "" + lower; + } + + /** + * Find begin and end instants to be set for the next fetch + * + * @param jssc Java Spark Context + * @param srcBasePath Base path of Hudi source table + * @param numInstantsPerFetch Max Instants per fetch + * @param beginInstant Last Checkpoint String + * @param readLatestOnMissingBeginInstant when begin instant is missing, allow reading from latest committed instant + * @return begin and end instants + */ + public static Pair<String, String> calculateBeginAndEndInstants( + JavaSparkContext jssc, String srcBasePath, int numInstantsPerFetch, Optional<String> beginInstant, + boolean readLatestOnMissingBeginInstant) { + Preconditions.checkArgument(numInstantsPerFetch > 0, "Make sure the config" + + " hoodie.deltastreamer.source.hoodieincr.num_instants is set to a positive value"); + HoodieTableMetaClient srcMetaClient = new HoodieTableMetaClient(jssc.hadoopConfiguration(), + srcBasePath, true); + + final HoodieTimeline activeCommitTimeline = + srcMetaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(); + + String beginInstantTime = beginInstant.orElseGet(() -> { + if (readLatestOnMissingBeginInstant) { + Optional<HoodieInstant> lastInstant = activeCommitTimeline.lastInstant(); + return lastInstant.map(hoodieInstant -> getStrictlyLowerTimestamp(hoodieInstant.getTimestamp())).orElse("000"); + } else { + throw new IllegalArgumentException("Missing begin instant for incremental pull. 
For reading from latest " + + "committed instant set hoodie.deltastreamer.source.hoodieincr.read_latest_on_missing_ckpt to true"); + } + }); + + Optional<HoodieInstant> nthInstant = + activeCommitTimeline.findInstantsAfter(beginInstantTime, numInstantsPerFetch).getInstants().reduce((x, y) -> y); + return Pair.of(beginInstantTime, nthInstant.map(instant -> instant.getTimestamp()).orElse(beginInstantTime)); + } + + /** + * Validate instant time seen in the incoming row + * + * @param row Input Row + * @param instantTime Hoodie Instant time of the row + * @param sinceInstant begin instant of the batch + * @param endInstant end instant of the batch + */ + public static void validateInstantTime(Row row, String instantTime, String sinceInstant, String endInstant) { + Preconditions.checkNotNull(instantTime); + Preconditions.checkArgument(HoodieTimeline.compareTimestamps(instantTime, + sinceInstant, HoodieTimeline.GREATER), + "Instant time(_hoodie_commit_time) in row (" + row + ") was : " + instantTime + + " but expected to be between " + sinceInstant + "(excl) - " + + endInstant + "(incl)"); + Preconditions.checkArgument(HoodieTimeline.compareTimestamps(instantTime, + endInstant, HoodieTimeline.LESSER_OR_EQUAL), + "Instant time(_hoodie_commit_time) in row (" + row + ") was : " + instantTime + + " but expected to be between " + sinceInstant + "(excl) - " + endInstant + "(incl)"); + } +} diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/KafkaSource.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/helpers/KafkaOffsetGen.java similarity index 84% rename from hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/KafkaSource.java rename to hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/helpers/KafkaOffsetGen.java index 4699fcaf3..947f3c48a 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/KafkaSource.java +++ 
b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/helpers/KafkaOffsetGen.java @@ -16,24 +16,22 @@ * */ -package com.uber.hoodie.utilities.sources; +package com.uber.hoodie.utilities.sources.helpers; import com.uber.hoodie.DataSourceUtils; import com.uber.hoodie.common.util.TypedProperties; -import com.uber.hoodie.common.util.collection.ImmutablePair; -import com.uber.hoodie.common.util.collection.Pair; import com.uber.hoodie.exception.HoodieNotSupportedException; import com.uber.hoodie.utilities.exception.HoodieDeltaStreamerException; -import com.uber.hoodie.utilities.schema.SchemaProvider; - -import java.util.*; +import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Optional; import java.util.stream.Collectors; import kafka.common.TopicAndPartition; -import org.apache.avro.generic.GenericRecord; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.streaming.kafka.KafkaCluster; import org.apache.spark.streaming.kafka.KafkaCluster.LeaderOffset; import org.apache.spark.streaming.kafka.OffsetRange; @@ -49,14 +47,13 @@ import scala.util.Either; /** * Source to read data from Kafka, incrementally */ -public abstract class KafkaSource extends Source { +public class KafkaOffsetGen { - private static volatile Logger log = LogManager.getLogger(KafkaSource.class); + private static volatile Logger log = LogManager.getLogger(KafkaOffsetGen.class); private static long DEFAULT_MAX_EVENTS_TO_READ = 1000000; // 1M events max - - static class CheckpointUtils { + public static class CheckpointUtils { /** * Reconstruct checkpoint from string. 
@@ -90,7 +87,6 @@ public abstract class KafkaSource extends Source { return sb.toString(); } - /** * Compute the offset ranges to read from Kafka, while handling newly added partitions, skews, event limits. * @@ -174,19 +170,18 @@ public abstract class KafkaSource extends Source { * Configs to be passed for this source. All standard Kafka consumer configs are also respected */ static class Config { + private static final String KAFKA_TOPIC_NAME = "hoodie.deltastreamer.source.kafka.topic"; private static final KafkaResetOffsetStrategies DEFAULT_AUTO_RESET_OFFSET = KafkaResetOffsetStrategies.LARGEST; } - - protected HashMap kafkaParams; - + private final HashMap kafkaParams; + private final TypedProperties props; protected final String topicName; - public KafkaSource(TypedProperties props, JavaSparkContext sparkContext, SchemaProvider schemaProvider) { - super(props, sparkContext, schemaProvider); - - kafkaParams = new HashMap<>(); + public KafkaOffsetGen(TypedProperties props) { + this.props = props; + kafkaParams = new HashMap(); for (Object prop : props.keySet()) { kafkaParams.put(prop.toString(), props.getString(prop.toString())); } @@ -194,11 +189,7 @@ public abstract class KafkaSource extends Source { topicName = props.getString(Config.KAFKA_TOPIC_NAME); } - protected abstract JavaRDD toAvroRDD(OffsetRange[] offsetRanges, AvroConvertor avroConvertor); - - @Override - public Pair>, String> fetchNewData( - Optional lastCheckpointStr, long sourceLimit) { + public OffsetRange[] getNextOffsetRanges(Optional lastCheckpointStr, long sourceLimit) { // Obtain current metadata for the topic KafkaCluster cluster = new KafkaCluster(ScalaHelpers.toScalaMap(kafkaParams)); @@ -240,16 +231,15 @@ public abstract class KafkaSource extends Source { // Come up with final set of OffsetRanges to read (account for new partitions, limit number of events) long numEvents = Math.min(DEFAULT_MAX_EVENTS_TO_READ, sourceLimit); OffsetRange[] offsetRanges = 
CheckpointUtils.computeOffsetRanges(fromOffsets, toOffsets, numEvents); - long totalNewMsgs = CheckpointUtils.totalNewMessages(offsetRanges); - if (totalNewMsgs <= 0) { - return new ImmutablePair<>(Optional.empty(), lastCheckpointStr.orElse("")); - } else { - log.info("About to read " + totalNewMsgs + " from Kafka for topic :" + topicName); - } - // Produce a RDD[GenericRecord] - final AvroConvertor avroConvertor = new AvroConvertor(schemaProvider.getSourceSchema().toString()); - JavaRDD newDataRDD = toAvroRDD(offsetRanges, avroConvertor); - return new ImmutablePair<>(Optional.of(newDataRDD), CheckpointUtils.offsetsToStr(offsetRanges)); + return offsetRanges; + } + + public String getTopicName() { + return topicName; + } + + public HashMap getKafkaParams() { + return kafkaParams; } } diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/transform/IdentityTransformer.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/transform/IdentityTransformer.java new file mode 100644 index 000000000..b454cdf1d --- /dev/null +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/transform/IdentityTransformer.java @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * + */ + +package com.uber.hoodie.utilities.transform; + +import com.uber.hoodie.common.util.TypedProperties; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; + +/** + * Identity transformer + */ +public class IdentityTransformer implements Transformer { + + @Override + public Dataset<Row> apply(JavaSparkContext jsc, SparkSession sparkSession, + Dataset<Row> rowDataset, TypedProperties properties) { + return rowDataset; + } +} diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/transform/SqlQueryBasedTransformer.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/transform/SqlQueryBasedTransformer.java new file mode 100644 index 000000000..b967f45d1 --- /dev/null +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/transform/SqlQueryBasedTransformer.java @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ * + * + */ + +package com.uber.hoodie.utilities.transform; + +import com.uber.hoodie.common.util.TypedProperties; +import java.util.UUID; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; + +/** + * A transformer that allows a sql-query template be used to transform the source before writing to Hudi data-set. + * + * The query should reference the source as a table named "<SRC>" + */ +public class SqlQueryBasedTransformer implements Transformer { + + private static volatile Logger log = LogManager.getLogger(SqlQueryBasedTransformer.class); + + private static final String SRC_PATTERN = "<SRC>"; + private static final String TMP_TABLE = "HOODIE_SRC_TMP_TABLE_"; + + /** + * Configs supported + */ + static class Config { + + private static final String TRANSFORMER_SQL = "hoodie.deltastreamer.transformer.sql"; + } + + @Override + public Dataset<Row> apply(JavaSparkContext jsc, SparkSession sparkSession, + Dataset<Row> rowDataset, TypedProperties properties) { + String transformerSQL = properties.getString(Config.TRANSFORMER_SQL); + if (null == transformerSQL) { + throw new IllegalArgumentException("Missing configuration : (" + Config.TRANSFORMER_SQL + ")"); + } + + // tmp table name doesn't like dashes + String tmpTable = TMP_TABLE.concat(UUID.randomUUID().toString().replace("-", "_")); + log.info("Registering tmp table : " + tmpTable); + rowDataset.registerTempTable(tmpTable); + String sqlStr = transformerSQL.replaceAll(SRC_PATTERN, tmpTable); + log.info("SQL Query for transformation : (" + sqlStr + ")"); + return sparkSession.sql(sqlStr); + } +} diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/transform/Transformer.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/transform/Transformer.java new file mode 100644 index 000000000..32e80facd --- /dev/null +++ 
b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/transform/Transformer.java @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * + */ + +package com.uber.hoodie.utilities.transform; + +import com.uber.hoodie.common.util.TypedProperties; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; + +/** + * Transform source to target dataset before writing + */ +public interface Transformer { + + /** + * Transform source RDD to target RDD + * + * @param jsc JavaSparkContext + * @param rowDataset Source DataSet + * @param sparkSession Spark Session + * @param properties Config properties + * @return Transformed Dataset + */ + Dataset<Row> apply(JavaSparkContext jsc, SparkSession sparkSession, + Dataset<Row> rowDataset, TypedProperties properties); +} diff --git a/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/TestHoodieDeltaStreamer.java b/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/TestHoodieDeltaStreamer.java index 63d93b414..acce00451 100644 --- a/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/TestHoodieDeltaStreamer.java +++ b/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/TestHoodieDeltaStreamer.java @@ -19,8 +19,10 @@ package com.uber.hoodie.utilities; import static org.junit.Assert.assertEquals; +import 
static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; +import com.uber.hoodie.DataSourceWriteOptions; import com.uber.hoodie.common.model.HoodieCommitMetadata; import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.table.HoodieTimeline; @@ -28,17 +30,31 @@ import com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.common.util.DFSPropertiesConfiguration; import com.uber.hoodie.common.util.TypedProperties; import com.uber.hoodie.exception.DatasetNotFoundException; +import com.uber.hoodie.hive.HiveSyncConfig; +import com.uber.hoodie.hive.HoodieHiveClient; +import com.uber.hoodie.hive.MultiPartKeysValueExtractor; import com.uber.hoodie.utilities.deltastreamer.HoodieDeltaStreamer; import com.uber.hoodie.utilities.deltastreamer.HoodieDeltaStreamer.Operation; +import com.uber.hoodie.utilities.schema.FilebasedSchemaProvider; +import com.uber.hoodie.utilities.sources.HoodieIncrSource; import com.uber.hoodie.utilities.sources.TestDataSource; +import com.uber.hoodie.utilities.transform.SqlQueryBasedTransformer; +import com.uber.hoodie.utilities.transform.Transformer; import java.io.IOException; +import java.util.ArrayList; import java.util.List; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.api.java.UDF4; +import org.apache.spark.sql.functions; +import org.apache.spark.sql.types.DataTypes; import org.junit.After; import org.junit.AfterClass; import org.junit.Before; @@ -55,17 +71,43 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase { @BeforeClass public static void initClass() throws Exception { - UtilitiesTestBase.initClass(); + 
UtilitiesTestBase.initClass(true); // prepare the configs. UtilitiesTestBase.Helpers.copyToDFS("delta-streamer-config/base.properties", dfs, dfsBasePath + "/base.properties"); + UtilitiesTestBase.Helpers.copyToDFS("delta-streamer-config/sql-transformer.properties", dfs, + dfsBasePath + "/sql-transformer.properties"); UtilitiesTestBase.Helpers.copyToDFS("delta-streamer-config/source.avsc", dfs, dfsBasePath + "/source.avsc"); + UtilitiesTestBase.Helpers.copyToDFS("delta-streamer-config/target.avsc", dfs, dfsBasePath + "/target.avsc"); + TypedProperties props = new TypedProperties(); - props.setProperty("include", "base.properties"); + props.setProperty("include", "sql-transformer.properties"); props.setProperty("hoodie.datasource.write.recordkey.field", "_row_key"); props.setProperty("hoodie.datasource.write.partitionpath.field", "not_there"); props.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/source.avsc"); + props.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/target.avsc"); + // Hive Configs + props.setProperty(DataSourceWriteOptions.HIVE_URL_OPT_KEY(), "jdbc:hive2://127.0.0.1:9999/"); + props.setProperty(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY(), "testdb1"); + props.setProperty(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY(), "hive_trips"); + props.setProperty(DataSourceWriteOptions.HIVE_ASSUME_DATE_PARTITION_OPT_KEY(), "false"); + props.setProperty(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "datestr"); + props.setProperty(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY(), + MultiPartKeysValueExtractor.class.getName()); UtilitiesTestBase.Helpers.savePropsToDFS(props, dfs, dfsBasePath + "/test-source.properties"); + + // Properties used for the delta-streamer which incrementally pulls from upstream Hudi source table and writes to + // downstream hudi table + TypedProperties downstreamProps = new TypedProperties(); + downstreamProps.setProperty("include", 
"base.properties"); + downstreamProps.setProperty("hoodie.datasource.write.recordkey.field", "_row_key"); + downstreamProps.setProperty("hoodie.datasource.write.partitionpath.field", "not_there"); + + // Source schema is the target schema of upstream table + downstreamProps.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/target.avsc"); + downstreamProps.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/target.avsc"); + UtilitiesTestBase.Helpers.savePropsToDFS(downstreamProps, dfs, + dfsBasePath + "/test-downstream-source.properties"); } @AfterClass @@ -86,17 +128,48 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase { } static class TestHelpers { - static HoodieDeltaStreamer.Config makeConfig(String basePath, Operation op) { + return makeConfig(basePath, op, TripsWithDistanceTransformer.class.getName()); + } + + static HoodieDeltaStreamer.Config makeConfig(String basePath, Operation op, String transformerClassName) { + return makeConfig(basePath, op, transformerClassName, false); + } + + static HoodieDeltaStreamer.Config makeConfig(String basePath, Operation op, String transformerClassName, + boolean enableHiveSync) { HoodieDeltaStreamer.Config cfg = new HoodieDeltaStreamer.Config(); cfg.targetBasePath = basePath; cfg.targetTableName = "hoodie_trips"; cfg.storageType = "COPY_ON_WRITE"; cfg.sourceClassName = TestDataSource.class.getName(); + cfg.transformerClassName = transformerClassName; cfg.operation = op; + cfg.enableHiveSync = enableHiveSync; cfg.sourceOrderingField = "timestamp"; cfg.propsFilePath = dfsBasePath + "/test-source.properties"; cfg.sourceLimit = 1000; + cfg.schemaProviderClassName = FilebasedSchemaProvider.class.getName(); + return cfg; + } + + static HoodieDeltaStreamer.Config makeConfigForHudiIncrSrc(String srcBasePath, String basePath, Operation op, + boolean addReadLatestOnMissingCkpt) { + HoodieDeltaStreamer.Config cfg = new HoodieDeltaStreamer.Config(); + 
cfg.targetBasePath = basePath; + cfg.targetTableName = "hoodie_trips_copy"; + cfg.storageType = "COPY_ON_WRITE"; + cfg.sourceClassName = HoodieIncrSource.class.getName(); + cfg.operation = op; + cfg.sourceOrderingField = "timestamp"; + cfg.propsFilePath = dfsBasePath + "/test-downstream-source.properties"; + cfg.sourceLimit = 1000; + List cfgs = new ArrayList<>(); + cfgs.add("hoodie.deltastreamer.source.hoodieincr.read_latest_on_missing_ckpt=" + addReadLatestOnMissingCkpt); + cfgs.add("hoodie.deltastreamer.source.hoodieincr.path=" + srcBasePath); + // No partition + cfgs.add("hoodie.deltastreamer.source.hoodieincr.partition.fields=datestr"); + cfg.configs = cfgs; return cfg; } @@ -110,15 +183,30 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase { .sort("_hoodie_commit_time").collectAsList(); } - static void assertCommitMetadata(String expected, String datasetPath, FileSystem fs, int totalCommits) + static void assertDistanceCount(long expected, String datasetPath, SQLContext sqlContext) { + sqlContext.read().format("com.uber.hoodie").load(datasetPath).registerTempTable("tmp_trips"); + long recordCount = + sqlContext.sparkSession().sql("select * from tmp_trips where haversine_distance is not NULL").count(); + assertEquals(expected, recordCount); + } + + static void assertDistanceCountWithExactValue(long expected, String datasetPath, SQLContext sqlContext) { + sqlContext.read().format("com.uber.hoodie").load(datasetPath).registerTempTable("tmp_trips"); + long recordCount = + sqlContext.sparkSession().sql("select * from tmp_trips where haversine_distance = 1.0").count(); + assertEquals(expected, recordCount); + } + + static String assertCommitMetadata(String expected, String datasetPath, FileSystem fs, int totalCommits) throws IOException { HoodieTableMetaClient meta = new HoodieTableMetaClient(fs.getConf(), datasetPath); HoodieTimeline timeline = meta.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); - HoodieInstant lastCommit = 
timeline.lastInstant().get(); + HoodieInstant lastInstant = timeline.lastInstant().get(); HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes( - timeline.getInstantDetails(lastCommit).get(), HoodieCommitMetadata.class); + timeline.getInstantDetails(lastInstant).get(), HoodieCommitMetadata.class); assertEquals(totalCommits, timeline.countInstants()); assertEquals(expected, commitMetadata.getMetadata(HoodieDeltaStreamer.CHECKPOINT_KEY)); + return lastInstant.getTimestamp(); } } @@ -152,12 +240,14 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase { HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(datasetBasePath, Operation.BULK_INSERT); new HoodieDeltaStreamer(cfg, jsc).sync(); TestHelpers.assertRecordCount(1000, datasetBasePath + "/*/*.parquet", sqlContext); + TestHelpers.assertDistanceCount(1000, datasetBasePath + "/*/*.parquet", sqlContext); TestHelpers.assertCommitMetadata("00000", datasetBasePath, dfs, 1); // No new data => no commits. cfg.sourceLimit = 0; new HoodieDeltaStreamer(cfg, jsc).sync(); TestHelpers.assertRecordCount(1000, datasetBasePath + "/*/*.parquet", sqlContext); + TestHelpers.assertDistanceCount(1000, datasetBasePath + "/*/*.parquet", sqlContext); TestHelpers.assertCommitMetadata("00000", datasetBasePath, dfs, 1); // upsert() #1 @@ -165,11 +255,94 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase { cfg.operation = Operation.UPSERT; new HoodieDeltaStreamer(cfg, jsc).sync(); TestHelpers.assertRecordCount(2000, datasetBasePath + "/*/*.parquet", sqlContext); + TestHelpers.assertDistanceCount(2000, datasetBasePath + "/*/*.parquet", sqlContext); TestHelpers.assertCommitMetadata("00001", datasetBasePath, dfs, 2); List counts = TestHelpers.countsPerCommit(datasetBasePath + "/*/*.parquet", sqlContext); assertEquals(2000, counts.get(0).getLong(1)); } + /** + * Test Bulk Insert and upserts with hive syncing. 
Tests Hudi incremental processing using a 2 step pipeline + * The first step involves using a SQL template to transform a source + * TEST-DATA-SOURCE ============================> HUDI TABLE 1 ===============> HUDI TABLE 2 + * (incr-pull with transform) (incr-pull) + * Hudi Table 1 is synced with Hive. + * @throws Exception + */ + @Test + public void testBulkInsertsAndUpsertsWithSQLBasedTransformerFor2StepPipeline() throws Exception { + String datasetBasePath = dfsBasePath + "/test_dataset2"; + String downstreamDatasetBasePath = dfsBasePath + "/test_downstream_dataset2"; + + HiveSyncConfig hiveSyncConfig = getHiveSyncConfig(datasetBasePath, "hive_trips"); + + // Initial bulk insert to ingest to first hudi table + HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(datasetBasePath, Operation.BULK_INSERT, + SqlQueryBasedTransformer.class.getName(), true); + new HoodieDeltaStreamer(cfg, jsc, dfs, hiveServer.getHiveConf()).sync(); + TestHelpers.assertRecordCount(1000, datasetBasePath + "/*/*.parquet", sqlContext); + TestHelpers.assertDistanceCount(1000, datasetBasePath + "/*/*.parquet", sqlContext); + TestHelpers.assertDistanceCountWithExactValue(1000, datasetBasePath + "/*/*.parquet", sqlContext); + String lastInstantForUpstreamTable = TestHelpers.assertCommitMetadata("00000", datasetBasePath, dfs, 1); + + // Now incrementally pull from the above hudi table and ingest to second table + HoodieDeltaStreamer.Config downstreamCfg = + TestHelpers.makeConfigForHudiIncrSrc(datasetBasePath, downstreamDatasetBasePath, Operation.BULK_INSERT, true); + new HoodieDeltaStreamer(downstreamCfg, jsc, dfs, hiveServer.getHiveConf()).sync(); + TestHelpers.assertRecordCount(1000, downstreamDatasetBasePath + "/*/*.parquet", sqlContext); + TestHelpers.assertDistanceCount(1000, downstreamDatasetBasePath + "/*/*.parquet", sqlContext); + TestHelpers.assertDistanceCountWithExactValue(1000, downstreamDatasetBasePath + "/*/*.parquet", sqlContext); + 
TestHelpers.assertCommitMetadata(lastInstantForUpstreamTable, downstreamDatasetBasePath, dfs, 1); + + // No new data => no commits for upstream table + cfg.sourceLimit = 0; + new HoodieDeltaStreamer(cfg, jsc, dfs, hiveServer.getHiveConf()).sync(); + TestHelpers.assertRecordCount(1000, datasetBasePath + "/*/*.parquet", sqlContext); + TestHelpers.assertDistanceCount(1000, datasetBasePath + "/*/*.parquet", sqlContext); + TestHelpers.assertDistanceCountWithExactValue(1000, datasetBasePath + "/*/*.parquet", sqlContext); + TestHelpers.assertCommitMetadata("00000", datasetBasePath, dfs, 1); + + // with no change in upstream table, no change in downstream too when pulled. + new HoodieDeltaStreamer(downstreamCfg, jsc).sync(); + TestHelpers.assertRecordCount(1000, downstreamDatasetBasePath + "/*/*.parquet", sqlContext); + TestHelpers.assertDistanceCount(1000, downstreamDatasetBasePath + "/*/*.parquet", sqlContext); + TestHelpers.assertDistanceCountWithExactValue(1000, downstreamDatasetBasePath + "/*/*.parquet", sqlContext); + TestHelpers.assertCommitMetadata(lastInstantForUpstreamTable, downstreamDatasetBasePath, dfs, 1); + + // upsert() #1 on upstream hudi table + cfg.sourceLimit = 2000; + cfg.operation = Operation.UPSERT; + new HoodieDeltaStreamer(cfg, jsc, dfs, hiveServer.getHiveConf()).sync(); + TestHelpers.assertRecordCount(2000, datasetBasePath + "/*/*.parquet", sqlContext); + TestHelpers.assertDistanceCount(2000, datasetBasePath + "/*/*.parquet", sqlContext); + TestHelpers.assertDistanceCountWithExactValue(2000, datasetBasePath + "/*/*.parquet", sqlContext); + lastInstantForUpstreamTable = TestHelpers.assertCommitMetadata("00001", datasetBasePath, dfs, 2); + List counts = TestHelpers.countsPerCommit(datasetBasePath + "/*/*.parquet", sqlContext); + assertEquals(2000, counts.get(0).getLong(1)); + + // Incrementally pull changes in upstream hudi table and apply to downstream table + downstreamCfg = + TestHelpers.makeConfigForHudiIncrSrc(datasetBasePath, 
downstreamDatasetBasePath, Operation.UPSERT, false); + downstreamCfg.sourceLimit = 2000; + new HoodieDeltaStreamer(downstreamCfg, jsc).sync(); + TestHelpers.assertRecordCount(2000, downstreamDatasetBasePath + "/*/*.parquet", sqlContext); + TestHelpers.assertDistanceCount(2000, downstreamDatasetBasePath + "/*/*.parquet", sqlContext); + TestHelpers.assertDistanceCountWithExactValue(2000, downstreamDatasetBasePath + "/*/*.parquet", sqlContext); + String finalInstant = + TestHelpers.assertCommitMetadata(lastInstantForUpstreamTable, downstreamDatasetBasePath, dfs, 2); + counts = TestHelpers.countsPerCommit(downstreamDatasetBasePath + "/*/*.parquet", sqlContext); + assertEquals(2000, counts.get(0).getLong(1)); + + // Test Hive integration + HoodieHiveClient hiveClient = new HoodieHiveClient(hiveSyncConfig, hiveServer.getHiveConf(), dfs); + assertTrue("Table " + hiveSyncConfig.tableName + " should exist", + hiveClient.doesTableExist()); + assertEquals("Table partitions should match the number of partitions we wrote", 1, + hiveClient.scanTablePartitions().size()); + assertEquals("The last commit that was synced should be updated in the TBLPROPERTIES", + lastInstantForUpstreamTable, hiveClient.getLastCommitTimeSynced().get()); + } + @Test public void testFilterDupes() throws Exception { String datasetBasePath = dfsBasePath + "/test_dupes_dataset"; @@ -192,4 +365,57 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase { assertEquals(1000, counts.get(0).getLong(1)); assertEquals(1000, counts.get(1).getLong(1)); } + + /** + * UDF to calculate Haversine distance + */ + public static class DistanceUDF implements UDF4 { + + /** + * + * Taken from https://stackoverflow.com/questions/3694380/calculating-distance-between-two-points-using-latitude- + * longitude-what-am-i-doi + * Calculate distance between two points in latitude and longitude taking + * into account height difference. If you are not interested in height + * difference pass 0.0. 
Uses Haversine method as its base. + * + * lat1, lon1 Start point lat2, lon2 End point el1 Start altitude in meters + * el2 End altitude in meters + * @return Distance in Meters + */ + @Override + public Double call(Double lat1, Double lat2, Double lon1, Double lon2) { + + final int R = 6371; // Radius of the earth + + double latDistance = Math.toRadians(lat2 - lat1); + double lonDistance = Math.toRadians(lon2 - lon1); + double a = Math.sin(latDistance / 2) * Math.sin(latDistance / 2) + + Math.cos(Math.toRadians(lat1)) * Math.cos(Math.toRadians(lat2)) + * Math.sin(lonDistance / 2) * Math.sin(lonDistance / 2); + double c = 2 * Math.atan2(Math.sqrt(a), Math.sqrt(1 - a)); + double distance = R * c * 1000; // convert to meters + + double height = 0; + + distance = Math.pow(distance, 2) + Math.pow(height, 2); + + return Math.sqrt(distance); + } + } + + /** + * Adds a new field "haversine_distance" to the row + */ + public static class TripsWithDistanceTransformer implements Transformer { + + @Override + public Dataset apply(JavaSparkContext jsc, SparkSession sparkSession, + Dataset rowDataset, TypedProperties properties) { + rowDataset.sqlContext().udf().register("distance_udf", new DistanceUDF(), DataTypes.DoubleType); + return rowDataset.withColumn("haversine_distance", + functions.callUDF("distance_udf", functions.col("begin_lat"), + functions.col("end_lat"), functions.col("begin_lon"), functions.col("end_lon"))); + } + } } diff --git a/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/UtilitiesTestBase.java b/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/UtilitiesTestBase.java index eb2d00a65..a5bba7343 100644 --- a/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/UtilitiesTestBase.java +++ b/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/UtilitiesTestBase.java @@ -18,10 +18,16 @@ package com.uber.hoodie.utilities; +import com.google.common.collect.ImmutableList; import com.uber.hoodie.common.TestRawTripPayload; import 
com.uber.hoodie.common.minicluster.HdfsTestService; import com.uber.hoodie.common.model.HoodieRecord; +import com.uber.hoodie.common.model.HoodieTableType; +import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.util.TypedProperties; +import com.uber.hoodie.hive.HiveSyncConfig; +import com.uber.hoodie.hive.HoodieHiveClient; +import com.uber.hoodie.hive.util.HiveTestService; import com.uber.hoodie.utilities.sources.TestDataSource; import java.io.BufferedReader; import java.io.IOException; @@ -32,8 +38,11 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hdfs.DistributedFileSystem; import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hive.service.server.HiveServer2; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; import org.junit.After; import org.junit.AfterClass; import org.junit.Before; @@ -51,15 +60,26 @@ public class UtilitiesTestBase { protected static MiniDFSCluster dfsCluster; protected static DistributedFileSystem dfs; protected transient JavaSparkContext jsc = null; + protected transient SparkSession sparkSession = null; protected transient SQLContext sqlContext; + protected static HiveServer2 hiveServer; @BeforeClass public static void initClass() throws Exception { + initClass(false); + } + + static void initClass(boolean startHiveService) throws Exception { hdfsTestService = new HdfsTestService(); dfsCluster = hdfsTestService.start(true); dfs = dfsCluster.getFileSystem(); dfsBasePath = dfs.getWorkingDirectory().toString(); dfs.mkdirs(new Path(dfsBasePath)); + if (startHiveService) { + HiveTestService hiveService = new HiveTestService(hdfsTestService.getHadoopConf()); + hiveServer = hiveService.start(); + clearHiveDb(); + } } @AfterClass @@ -67,6 +87,9 @@ public class UtilitiesTestBase { if (hdfsTestService != null) { 
hdfsTestService.stop(); } + if (hiveServer != null) { + hiveServer.stop(); + } } @Before @@ -74,6 +97,7 @@ public class UtilitiesTestBase { TestDataSource.initDataGen(); jsc = UtilHelpers.buildSparkContext(this.getClass().getName() + "-hoodie", "local[2]"); sqlContext = new SQLContext(jsc); + sparkSession = SparkSession.builder().config(jsc.getConf()).getOrCreate(); } @After @@ -84,6 +108,42 @@ public class UtilitiesTestBase { } } + /** + * Helper to get hive sync config + * @param basePath + * @param tableName + * @return + */ + protected static HiveSyncConfig getHiveSyncConfig(String basePath, String tableName) { + HiveSyncConfig hiveSyncConfig = new HiveSyncConfig(); + hiveSyncConfig.jdbcUrl = "jdbc:hive2://127.0.0.1:9999/"; + hiveSyncConfig.hiveUser = ""; + hiveSyncConfig.hivePass = ""; + hiveSyncConfig.databaseName = "testdb1"; + hiveSyncConfig.tableName = tableName; + hiveSyncConfig.basePath = basePath; + hiveSyncConfig.assumeDatePartitioning = false; + hiveSyncConfig.partitionFields = new ImmutableList.Builder().add("datestr").build(); + return hiveSyncConfig; + } + + /** + * Initialize Hive DB + * @throws IOException + */ + private static void clearHiveDb() throws IOException { + HiveConf hiveConf = new HiveConf(); + // Create Dummy hive sync config + HiveSyncConfig hiveSyncConfig = getHiveSyncConfig("/dummy", "dummy"); + hiveConf.addResource(hiveServer.getHiveConf()); + HoodieTableMetaClient.initTableType(dfs.getConf(), hiveSyncConfig.basePath, HoodieTableType.COPY_ON_WRITE, + hiveSyncConfig.tableName, null); + HoodieHiveClient client = new HoodieHiveClient(hiveSyncConfig, hiveConf, dfs); + client.updateHiveSQL("drop database if exists " + hiveSyncConfig.databaseName); + client.updateHiveSQL("create database " + hiveSyncConfig.databaseName); + client.close(); + } + public static class Helpers { // to get hold of resources bundled with jar diff --git a/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/sources/TestDFSSource.java 
b/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/sources/TestDFSSource.java index d460e6ea8..828b78dd8 100644 --- a/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/sources/TestDFSSource.java +++ b/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/sources/TestDFSSource.java @@ -20,16 +20,20 @@ package com.uber.hoodie.utilities.sources; import static org.junit.Assert.assertEquals; +import com.uber.hoodie.AvroConversionUtils; import com.uber.hoodie.common.HoodieTestDataGenerator; import com.uber.hoodie.common.util.TypedProperties; -import com.uber.hoodie.common.util.collection.Pair; import com.uber.hoodie.utilities.UtilitiesTestBase; +import com.uber.hoodie.utilities.deltastreamer.SourceFormatAdapter; import com.uber.hoodie.utilities.schema.FilebasedSchemaProvider; import java.io.IOException; import java.util.Optional; import org.apache.avro.generic.GenericRecord; import org.apache.hadoop.fs.Path; import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SQLContext; import org.junit.After; import org.junit.AfterClass; import org.junit.Before; @@ -37,7 +41,7 @@ import org.junit.BeforeClass; import org.junit.Test; /** - * Basic tests against all subclasses of {@link DFSSource} + * Basic tests against all subclasses of {@link JsonDFSSource} */ public class TestDFSSource extends UtilitiesTestBase { @@ -71,34 +75,47 @@ public class TestDFSSource extends UtilitiesTestBase { TypedProperties props = new TypedProperties(); props.setProperty("hoodie.deltastreamer.source.dfs.root", dfsBasePath + "/jsonFiles"); - JsonDFSSource jsonSource = new JsonDFSSource(props, jsc, schemaProvider); + JsonDFSSource jsonDFSSource = new JsonDFSSource(props, jsc, sparkSession, schemaProvider); + SourceFormatAdapter jsonSource = new SourceFormatAdapter(jsonDFSSource); // 1. 
Extract without any checkpoint => get all the data, respecting sourceLimit - assertEquals(Optional.empty(), jsonSource.fetchNewData(Optional.empty(), Long.MAX_VALUE).getKey()); + assertEquals(Optional.empty(), jsonSource.fetchNewDataInAvroFormat(Optional.empty(), Long.MAX_VALUE).getBatch()); UtilitiesTestBase.Helpers.saveStringsToDFS( Helpers.jsonifyRecords(dataGenerator.generateInserts("000", 100)), dfs, dfsBasePath + "/jsonFiles/1.json"); - assertEquals(Optional.empty(), jsonSource.fetchNewData(Optional.empty(), 10).getKey()); - Pair>, String> fetch1 = jsonSource.fetchNewData(Optional.empty(), 1000000); - assertEquals(100, fetch1.getKey().get().count()); + assertEquals(Optional.empty(), jsonSource.fetchNewDataInAvroFormat(Optional.empty(), 10).getBatch()); + InputBatch> fetch1 = + jsonSource.fetchNewDataInAvroFormat(Optional.empty(), 1000000); + assertEquals(100, fetch1.getBatch().get().count()); + // Test json -> Row format + InputBatch> fetch1AsRows = + jsonSource.fetchNewDataInRowFormat(Optional.empty(), 1000000); + assertEquals(100, fetch1AsRows.getBatch().get().count()); + // Test Avro -> Row format + Dataset fetch1Rows = AvroConversionUtils.createDataFrame(JavaRDD.toRDD(fetch1.getBatch().get()), + schemaProvider.getSourceSchema().toString(), jsonDFSSource.getSparkSession()); + assertEquals(100, fetch1Rows.count()); // 2. Produce new data, extract new data UtilitiesTestBase.Helpers.saveStringsToDFS( Helpers.jsonifyRecords(dataGenerator.generateInserts("001", 10000)), dfs, dfsBasePath + "/jsonFiles/2.json"); - Pair>, String> fetch2 = jsonSource.fetchNewData( - Optional.of(fetch1.getValue()), Long.MAX_VALUE); - assertEquals(10000, fetch2.getKey().get().count()); + InputBatch> fetch2 = jsonSource.fetchNewDataInRowFormat( + Optional.of(fetch1.getCheckpointForNextBatch()), Long.MAX_VALUE); + assertEquals(10000, fetch2.getBatch().get().count()); // 3. 
Extract with previous checkpoint => gives same data back (idempotent) - Pair>, String> fetch3 = jsonSource.fetchNewData( - Optional.of(fetch1.getValue()), Long.MAX_VALUE); - assertEquals(10000, fetch3.getKey().get().count()); - assertEquals(fetch2.getValue(), fetch3.getValue()); + InputBatch> fetch3 = jsonSource.fetchNewDataInRowFormat( + Optional.of(fetch1.getCheckpointForNextBatch()), Long.MAX_VALUE); + assertEquals(10000, fetch3.getBatch().get().count()); + assertEquals(fetch2.getCheckpointForNextBatch(), fetch3.getCheckpointForNextBatch()); + fetch3.getBatch().get().registerTempTable("test_dfs_table"); + Dataset rowDataset = new SQLContext(jsc.sc()).sql("select * from test_dfs_table"); + assertEquals(10000, rowDataset.count()); // 4. Extract with latest checkpoint => no new data returned - Pair>, String> fetch4 = jsonSource.fetchNewData( - Optional.of(fetch2.getValue()), Long.MAX_VALUE); - assertEquals(Optional.empty(), fetch4.getKey()); + InputBatch> fetch4 = jsonSource.fetchNewDataInAvroFormat( + Optional.of(fetch2.getCheckpointForNextBatch()), Long.MAX_VALUE); + assertEquals(Optional.empty(), fetch4.getBatch()); } -} +} \ No newline at end of file diff --git a/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/sources/TestDataSource.java b/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/sources/TestDataSource.java index 57369de33..ad074149d 100644 --- a/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/sources/TestDataSource.java +++ b/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/sources/TestDataSource.java @@ -21,8 +21,6 @@ package com.uber.hoodie.utilities.sources; import com.uber.hoodie.common.HoodieTestDataGenerator; import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.util.TypedProperties; -import com.uber.hoodie.common.util.collection.ImmutablePair; -import com.uber.hoodie.common.util.collection.Pair; import com.uber.hoodie.utilities.schema.SchemaProvider; import java.io.IOException; import 
java.util.ArrayList; @@ -35,11 +33,12 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; /** * An implementation of {@link Source}, that emits test upserts. */ -public class TestDataSource extends Source { +public class TestDataSource extends AvroSource { private static volatile Logger log = LogManager.getLogger(TestDataSource.class); @@ -54,8 +53,9 @@ public class TestDataSource extends Source { dataGenerator = null; } - public TestDataSource(TypedProperties props, JavaSparkContext sparkContext, SchemaProvider schemaProvider) { - super(props, sparkContext, schemaProvider); + public TestDataSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession, + SchemaProvider schemaProvider) { + super(props, sparkContext, sparkSession, schemaProvider); } private GenericRecord toGenericRecord(HoodieRecord hoodieRecord) { @@ -68,14 +68,14 @@ public class TestDataSource extends Source { } @Override - public Pair>, String> fetchNewData(Optional lastCheckpointStr, + protected InputBatch> fetchNewData(Optional lastCheckpointStr, long sourceLimit) { int nextCommitNum = lastCheckpointStr.map(s -> Integer.parseInt(s) + 1).orElse(0); String commitTime = String.format("%05d", nextCommitNum); // No new data. if (sourceLimit <= 0) { - return new ImmutablePair<>(Optional.empty(), commitTime); + return new InputBatch<>(Optional.empty(), commitTime); } // generate `sourceLimit` number of upserts each time. 
@@ -94,6 +94,6 @@ public class TestDataSource extends Source { } JavaRDD avroRDD = sparkContext.parallelize(records, 4); - return new ImmutablePair<>(Optional.of(avroRDD), commitTime); + return new InputBatch<>(Optional.of(avroRDD), commitTime); } } diff --git a/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/sources/TestKafkaSource.java b/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/sources/TestKafkaSource.java index 785e80569..1adcbc371 100644 --- a/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/sources/TestKafkaSource.java +++ b/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/sources/TestKafkaSource.java @@ -18,20 +18,23 @@ package com.uber.hoodie.utilities.sources; -import static com.uber.hoodie.utilities.sources.KafkaSource.CheckpointUtils; import static org.junit.Assert.assertEquals; +import com.uber.hoodie.AvroConversionUtils; import com.uber.hoodie.common.HoodieTestDataGenerator; import com.uber.hoodie.common.util.TypedProperties; -import com.uber.hoodie.common.util.collection.Pair; import com.uber.hoodie.utilities.UtilitiesTestBase; +import com.uber.hoodie.utilities.deltastreamer.SourceFormatAdapter; import com.uber.hoodie.utilities.schema.FilebasedSchemaProvider; +import com.uber.hoodie.utilities.sources.helpers.KafkaOffsetGen.CheckpointUtils; import java.io.IOException; import java.util.HashMap; import java.util.Optional; import kafka.common.TopicAndPartition; import org.apache.avro.generic.GenericRecord; import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; import org.apache.spark.streaming.kafka.KafkaCluster.LeaderOffset; import org.apache.spark.streaming.kafka.KafkaTestUtils; import org.apache.spark.streaming.kafka.OffsetRange; @@ -42,7 +45,7 @@ import org.junit.BeforeClass; import org.junit.Test; /** - * Tests against {@link KafkaSource} + * Tests against {@link AvroKafkaSource} */ public class TestKafkaSource extends UtilitiesTestBase { @@ -89,30 
+92,44 @@ public class TestKafkaSource extends UtilitiesTestBase { props.setProperty("key.serializer", "org.apache.kafka.common.serialization.StringSerializer"); props.setProperty("value.serializer", "org.apache.kafka.common.serialization.StringSerializer"); - Source kafkaSource = new JsonKafkaSource(props, jsc, schemaProvider); + Source jsonSource = new JsonKafkaSource(props, jsc, sparkSession, schemaProvider); + SourceFormatAdapter kafkaSource = new SourceFormatAdapter(jsonSource); // 1. Extract without any checkpoint => get all the data, respecting sourceLimit - assertEquals(Optional.empty(), kafkaSource.fetchNewData(Optional.empty(), Long.MAX_VALUE).getKey()); + assertEquals(Optional.empty(), kafkaSource.fetchNewDataInAvroFormat(Optional.empty(), Long.MAX_VALUE).getBatch()); testUtils.sendMessages(TEST_TOPIC_NAME, Helpers.jsonifyRecords(dataGenerator.generateInserts("000", 1000))); - Pair>, String> fetch1 = kafkaSource.fetchNewData(Optional.empty(), 900); - assertEquals(900, fetch1.getKey().get().count()); + InputBatch> fetch1 = kafkaSource.fetchNewDataInAvroFormat(Optional.empty(), 900); + assertEquals(900, fetch1.getBatch().get().count()); + // Test Avro To DataFrame path + Dataset fetch1AsRows = AvroConversionUtils.createDataFrame(JavaRDD.toRDD(fetch1.getBatch().get()), + schemaProvider.getSourceSchema().toString(), jsonSource.getSparkSession()); + assertEquals(900, fetch1AsRows.count()); // 2. Produce new data, extract new data testUtils.sendMessages(TEST_TOPIC_NAME, Helpers.jsonifyRecords(dataGenerator.generateInserts("001", 1000))); - Pair>, String> fetch2 = kafkaSource.fetchNewData( - Optional.of(fetch1.getValue()), Long.MAX_VALUE); - assertEquals(1100, fetch2.getKey().get().count()); + InputBatch> fetch2 = kafkaSource.fetchNewDataInRowFormat( + Optional.of(fetch1.getCheckpointForNextBatch()), Long.MAX_VALUE); + assertEquals(1100, fetch2.getBatch().get().count()); // 3. 
Extract with previous checkpoint => gives same data back (idempotent) - Pair>, String> fetch3 = kafkaSource.fetchNewData( - Optional.of(fetch1.getValue()), Long.MAX_VALUE); - assertEquals(fetch2.getKey().get().count(), fetch3.getKey().get().count()); - assertEquals(fetch2.getValue(), fetch3.getValue()); + InputBatch> fetch3 = kafkaSource.fetchNewDataInAvroFormat( + Optional.of(fetch1.getCheckpointForNextBatch()), Long.MAX_VALUE); + assertEquals(fetch2.getBatch().get().count(), fetch3.getBatch().get().count()); + assertEquals(fetch2.getCheckpointForNextBatch(), fetch3.getCheckpointForNextBatch()); + // Same using Row API + InputBatch> fetch3AsRows = + kafkaSource.fetchNewDataInRowFormat(Optional.of(fetch1.getCheckpointForNextBatch()), Long.MAX_VALUE); + assertEquals(fetch2.getBatch().get().count(), fetch3AsRows.getBatch().get().count()); + assertEquals(fetch2.getCheckpointForNextBatch(), fetch3AsRows.getCheckpointForNextBatch()); // 4. Extract with latest checkpoint => no new data returned - Pair>, String> fetch4 = kafkaSource.fetchNewData( - Optional.of(fetch2.getValue()), Long.MAX_VALUE); - assertEquals(Optional.empty(), fetch4.getKey()); + InputBatch> fetch4 = kafkaSource.fetchNewDataInAvroFormat( + Optional.of(fetch2.getCheckpointForNextBatch()), Long.MAX_VALUE); + assertEquals(Optional.empty(), fetch4.getBatch()); + // Same using Row API + InputBatch> fetch4AsRows = + kafkaSource.fetchNewDataInRowFormat(Optional.of(fetch2.getCheckpointForNextBatch()), Long.MAX_VALUE); + assertEquals(Optional.empty(), fetch4AsRows.getBatch()); } diff --git a/hoodie-utilities/src/test/resources/delta-streamer-config/sql-transformer.properties b/hoodie-utilities/src/test/resources/delta-streamer-config/sql-transformer.properties new file mode 100644 index 000000000..87038c36b --- /dev/null +++ b/hoodie-utilities/src/test/resources/delta-streamer-config/sql-transformer.properties @@ -0,0 +1,19 @@ +# +# Copyright (c) 2018 Uber Technologies, Inc. 
(hoodie-dev-group@uber.com) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +# +include=base.properties +hoodie.deltastreamer.transformer.sql=SELECT a.timestamp, a._row_key, a.rider, a.driver, a.begin_lat, a.begin_lon, a.end_lat, a.end_lon, a.fare, CAST(1.0 AS DOUBLE) AS haversine_distance FROM a \ No newline at end of file diff --git a/hoodie-utilities/src/test/resources/delta-streamer-config/target.avsc b/hoodie-utilities/src/test/resources/delta-streamer-config/target.avsc new file mode 100644 index 000000000..d2d410363 --- /dev/null +++ b/hoodie-utilities/src/test/resources/delta-streamer-config/target.avsc @@ -0,0 +1,37 @@ +{ + "type" : "record", + "name" : "triprec", + "fields" : [ + { + "name" : "timestamp", + "type" : "double" + }, { + "name" : "_row_key", + "type" : "string" + }, { + "name" : "rider", + "type" : "string" + }, { + "name" : "driver", + "type" : "string" + }, { + "name" : "begin_lat", + "type" : "double" + }, { + "name" : "begin_lon", + "type" : "double" + }, { + "name" : "end_lat", + "type" : "double" + }, { + "name" : "end_lon", + "type" : "double" + }, { + "name" : "fare", + "type" : "double" + }, { + "name" : "haversine_distance", + "type" : "double" + }] +} + diff --git a/packaging/hoodie-hadoop-mr-bundle/pom.xml b/packaging/hoodie-hadoop-mr-bundle/pom.xml index 79a4a757a..4e2e72d47 100644 --- a/packaging/hoodie-hadoop-mr-bundle/pom.xml +++ b/packaging/hoodie-hadoop-mr-bundle/pom.xml @@ -66,6 +66,48 @@ hadoop-auth + + ${hive.groupid} + 
hive-jdbc + ${hive.version} + + + commons-logging + commons-logging + + + + + ${hive.groupid} + hive-exec + ${hive.version} + + + ${hive.groupid} + hive-service + ${hive.version} + + + ${hive.groupid} + hive-shims + ${hive.version} + + + ${hive.groupid} + hive-serde + ${hive.version} + + + ${hive.groupid} + hive-metastore + ${hive.version} + + + ${hive.groupid} + hive-common + ${hive.version} + + org.apache.hadoop hadoop-hdfs @@ -182,116 +224,4 @@ true - - - - hive12 - - - !hive11 - - - - - - - - ${hive12.groupid} - hive-jdbc - ${hive12.version} - - - commons-logging - commons-logging - - - - - ${hive12.groupid} - hive-exec - ${hive12.version} - - - ${hive12.groupid} - hive-service - ${hive12.version} - - - ${hive12.groupid} - hive-shims - ${hive12.version} - - - ${hive12.groupid} - hive-serde - ${hive12.version} - - - ${hive12.groupid} - hive-metastore - ${hive12.version} - - - ${hive12.groupid} - hive-common - ${hive12.version} - - - - - hive11 - - - hive11 - - - - .hive11 - - - - ${hive11.groupid} - hive-service - ${hive11.version} - - - ${hive11.groupid} - hive-shims - ${hive11.version} - - - ${hive11.groupid} - hive-jdbc - ${hive11.version} - - - commons-logging - commons-logging - - - - - ${hive11.groupid} - hive-serde - ${hive11.version} - - - ${hive11.groupid} - hive-metastore - ${hive11.version} - - - ${hive11.groupid} - hive-common - ${hive11.version} - - - ${hive11.groupid} - hive-exec - ${hive11.version} - - - - - diff --git a/packaging/hoodie-hive-bundle/pom.xml b/packaging/hoodie-hive-bundle/pom.xml index eafad2592..ee43709fa 100644 --- a/packaging/hoodie-hive-bundle/pom.xml +++ b/packaging/hoodie-hive-bundle/pom.xml @@ -44,6 +44,26 @@ org.apache.hadoop hadoop-auth + + ${hive.groupid} + hive-service + ${hive.version} + + + ${hive.groupid} + hive-jdbc + ${hive.version} + + + ${hive.groupid} + hive-metastore + ${hive.version} + + + ${hive.groupid} + hive-common + ${hive.version} + com.google.guava guava @@ -195,73 +215,4 @@ true - - - - hive12 - - - 
!hive11 - - - - - - - - ${hive12.groupid} - hive-service - ${hive12.version} - - - ${hive12.groupid} - hive-jdbc - ${hive12.version} - - - ${hive12.groupid} - hive-metastore - ${hive12.version} - - - ${hive12.groupid} - hive-common - ${hive12.version} - - - - - hive11 - - - hive11 - - - - .hive11 - - - - org.apache.hive - hive-service - ${hive11.version} - - - org.apache.hive - hive-jdbc - ${hive11.version} - - - org.apache.hive - hive-metastore - ${hive11.version} - - - org.apache.hive - hive-common - ${hive11.version} - - - - diff --git a/packaging/hoodie-spark-bundle/pom.xml b/packaging/hoodie-spark-bundle/pom.xml index 95b4aedd4..5c932e59c 100644 --- a/packaging/hoodie-spark-bundle/pom.xml +++ b/packaging/hoodie-spark-bundle/pom.xml @@ -239,6 +239,26 @@ org.apache.avro avro + + ${hive.groupid} + hive-service + ${hive.version} + + + ${hive.groupid} + hive-jdbc + ${hive.version} + + + ${hive.groupid} + hive-metastore + ${hive.version} + + + ${hive.groupid} + hive-common + ${hive.version} + org.apache.commons commons-configuration2 @@ -269,74 +289,5 @@ ${project.version} - - - - hive12 - - - !hive11 - - - - - - - - ${hive12.groupid} - hive-service - ${hive12.version} - - - ${hive12.groupid} - hive-jdbc - ${hive12.version} - - - ${hive12.groupid} - hive-metastore - ${hive12.version} - - - ${hive12.groupid} - hive-common - ${hive12.version} - - - - - hive11 - - - hive11 - - - - .hive11 - - - - ${hive11.groupid} - hive-service - ${hive11.version} - - - ${hive11.groupid} - hive-jdbc - ${hive11.version} - - - ${hive11.groupid} - hive-metastore - ${hive11.version} - - - ${hive11.groupid} - hive-common - ${hive11.version} - - - - diff --git a/pom.xml b/pom.xml index 88196084c..a0ec312c5 100644 --- a/pom.xml +++ b/pom.xml @@ -129,10 +129,8 @@ 1.2.17 2.9.9 2.7.3 - org.apache.hive - 1.2.1 - org.apache.hive - 1.1.1 + org.apache.hive + 1.2.1 3.1.1 2.1.0 1.7.7 @@ -589,6 +587,11 @@ commons-dbcp 1.4 + + commons-pool + commons-pool + 1.4 + org.apache.httpcomponents httpcore @@ 
-656,7 +659,48 @@ jackson-mapper-asl 1.9.13 - + + ${hive.groupid} + hive-service + ${hive.version} + provided + + + ${hive.groupid} + hive-shims + ${hive.version} + provided + + + ${hive.groupid} + hive-jdbc + ${hive.version} + provided + + + ${hive.groupid} + hive-serde + ${hive.version} + provided + + + ${hive.groupid} + hive-metastore + ${hive.version} + provided + + + ${hive.groupid} + hive-common + ${hive.version} + provided + + + ${hive.groupid} + hive-exec + ${hive.version} + provided + org.apache.hadoop hadoop-hdfs @@ -708,109 +752,6 @@ - - hive12 - - - !hive11 - - - - - ${hive12.groupid} - hive-service - ${hive12.version} - provided - - - ${hive12.groupid} - hive-shims - ${hive12.version} - provided - - - ${hive12.groupid} - hive-jdbc - ${hive12.version} - provided - - - ${hive12.groupid} - hive-serde - ${hive12.version} - provided - - - ${hive12.groupid} - hive-metastore - ${hive12.version} - provided - - - ${hive12.groupid} - hive-common - ${hive12.version} - provided - - - ${hive12.groupid} - hive-exec - ${hive12.version} - provided - - - - - hive11 - - - hive11 - - - - - org.apache.hive - hive-service - ${hive11.version} - - - org.apache.hive - hive-shims - ${hive11.version} - provided - - - org.apache.hive - hive-jdbc - ${hive11.version} - provided - - - org.apache.hive - hive-serde - ${hive11.version} - provided - - - org.apache.hive - hive-metastore - ${hive11.version} - provided - - - org.apache.hive - hive-common - ${hive11.version} - provided - - - org.apache.hive - hive-exec - ${hive11.version} - provided - - - release