New Features in DeltaStreamer :

(1) Apply transformation when using delta-streamer to ingest data. (2) Add Hudi Incremental Source for Delta Streamer. (3) Allow delta-streamer config-properties to be passed on the command line. (4) Add Hive Integration to Delta-Streamer and address review comments. (5) Ensure MultiPartKeysValueExtractor handles hive-style partition descriptions. (6) Reuse the same spark session for both source and transformer. (7) Support extracting partition fields from _hoodie_partition_path for HoodieIncrSource. (8) Reuse Binary Avro coders. (9) Add push-down filter for Incremental source. (10) Add Hoodie DeltaStreamer metrics to track total time taken.
2018-10-10 10:31:34 -07:00
parent c70dbc13e9
commit 3a0044216c
65 changed files with 2752 additions and 911 deletions
--- a/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/TestHoodieDeltaStreamer.java
+++ b/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/TestHoodieDeltaStreamer.java
@@ -19,8 +19,10 @@
 package com.uber.hoodie.utilities;

 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;

+import com.uber.hoodie.DataSourceWriteOptions;
 import com.uber.hoodie.common.model.HoodieCommitMetadata;
 import com.uber.hoodie.common.table.HoodieTableMetaClient;
 import com.uber.hoodie.common.table.HoodieTimeline;
@@ -28,17 +30,31 @@ import com.uber.hoodie.common.table.timeline.HoodieInstant;
 import com.uber.hoodie.common.util.DFSPropertiesConfiguration;
 import com.uber.hoodie.common.util.TypedProperties;
 import com.uber.hoodie.exception.DatasetNotFoundException;
+import com.uber.hoodie.hive.HiveSyncConfig;
+import com.uber.hoodie.hive.HoodieHiveClient;
+import com.uber.hoodie.hive.MultiPartKeysValueExtractor;
 import com.uber.hoodie.utilities.deltastreamer.HoodieDeltaStreamer;
 import com.uber.hoodie.utilities.deltastreamer.HoodieDeltaStreamer.Operation;
+import com.uber.hoodie.utilities.schema.FilebasedSchemaProvider;
+import com.uber.hoodie.utilities.sources.HoodieIncrSource;
 import com.uber.hoodie.utilities.sources.TestDataSource;
+import com.uber.hoodie.utilities.transform.SqlQueryBasedTransformer;
+import com.uber.hoodie.utilities.transform.Transformer;
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.List;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.api.java.UDF4;
+import org.apache.spark.sql.functions;
+import org.apache.spark.sql.types.DataTypes;
 import org.junit.After;
 import org.junit.AfterClass;
 import org.junit.Before;
@@ -55,17 +71,43 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase {

  @BeforeClass
  public static void initClass() throws Exception {
-    UtilitiesTestBase.initClass();
+    UtilitiesTestBase.initClass(true);

    // prepare the configs.
    UtilitiesTestBase.Helpers.copyToDFS("delta-streamer-config/base.properties", dfs, dfsBasePath + "/base.properties");
+    UtilitiesTestBase.Helpers.copyToDFS("delta-streamer-config/sql-transformer.properties", dfs,
+        dfsBasePath + "/sql-transformer.properties");
    UtilitiesTestBase.Helpers.copyToDFS("delta-streamer-config/source.avsc", dfs, dfsBasePath + "/source.avsc");
+    UtilitiesTestBase.Helpers.copyToDFS("delta-streamer-config/target.avsc", dfs, dfsBasePath + "/target.avsc");
+
    TypedProperties props = new TypedProperties();
-    props.setProperty("include", "base.properties");
+    props.setProperty("include", "sql-transformer.properties");
    props.setProperty("hoodie.datasource.write.recordkey.field", "_row_key");
    props.setProperty("hoodie.datasource.write.partitionpath.field", "not_there");
    props.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/source.avsc");
+    props.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/target.avsc");
+    // Hive Configs
+    props.setProperty(DataSourceWriteOptions.HIVE_URL_OPT_KEY(), "jdbc:hive2://127.0.0.1:9999/");
+    props.setProperty(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY(), "testdb1");
+    props.setProperty(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY(), "hive_trips");
+    props.setProperty(DataSourceWriteOptions.HIVE_ASSUME_DATE_PARTITION_OPT_KEY(), "false");
+    props.setProperty(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "datestr");
+    props.setProperty(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY(),
+        MultiPartKeysValueExtractor.class.getName());
    UtilitiesTestBase.Helpers.savePropsToDFS(props, dfs, dfsBasePath + "/test-source.properties");
+
+    // Properties used for the delta-streamer which incrementally pulls from upstream Hudi source table and writes to
+    // downstream hudi table
+    TypedProperties downstreamProps = new TypedProperties();
+    downstreamProps.setProperty("include", "base.properties");
+    downstreamProps.setProperty("hoodie.datasource.write.recordkey.field", "_row_key");
+    downstreamProps.setProperty("hoodie.datasource.write.partitionpath.field", "not_there");
+
+    // Source schema is the target schema of upstream table
+    downstreamProps.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/target.avsc");
+    downstreamProps.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/target.avsc");
+    UtilitiesTestBase.Helpers.savePropsToDFS(downstreamProps, dfs,
+        dfsBasePath + "/test-downstream-source.properties");
  }

  @AfterClass
@@ -86,17 +128,48 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase {
  }

  static class TestHelpers {
-
    static HoodieDeltaStreamer.Config makeConfig(String basePath, Operation op) {
+      return makeConfig(basePath, op, TripsWithDistanceTransformer.class.getName());
+    }
+
+    static HoodieDeltaStreamer.Config makeConfig(String basePath, Operation op, String transformerClassName) {
+      return makeConfig(basePath, op, transformerClassName, false);
+    }
+
+    static HoodieDeltaStreamer.Config makeConfig(String basePath, Operation op, String transformerClassName,
+        boolean enableHiveSync) {
      HoodieDeltaStreamer.Config cfg = new HoodieDeltaStreamer.Config();
      cfg.targetBasePath = basePath;
      cfg.targetTableName = "hoodie_trips";
      cfg.storageType = "COPY_ON_WRITE";
      cfg.sourceClassName = TestDataSource.class.getName();
+      cfg.transformerClassName = transformerClassName;
      cfg.operation = op;
+      cfg.enableHiveSync = enableHiveSync;
      cfg.sourceOrderingField = "timestamp";
      cfg.propsFilePath = dfsBasePath + "/test-source.properties";
      cfg.sourceLimit = 1000;
+      cfg.schemaProviderClassName = FilebasedSchemaProvider.class.getName();
+      return cfg;
+    }
+
+    static HoodieDeltaStreamer.Config makeConfigForHudiIncrSrc(String srcBasePath, String basePath, Operation op,
+        boolean addReadLatestOnMissingCkpt) {
+      HoodieDeltaStreamer.Config cfg = new HoodieDeltaStreamer.Config();
+      cfg.targetBasePath = basePath;
+      cfg.targetTableName = "hoodie_trips_copy";
+      cfg.storageType = "COPY_ON_WRITE";
+      cfg.sourceClassName = HoodieIncrSource.class.getName();
+      cfg.operation = op;
+      cfg.sourceOrderingField = "timestamp";
+      cfg.propsFilePath = dfsBasePath + "/test-downstream-source.properties";
+      cfg.sourceLimit = 1000;
+      List<String> cfgs = new ArrayList<>();
+      cfgs.add("hoodie.deltastreamer.source.hoodieincr.read_latest_on_missing_ckpt=" + addReadLatestOnMissingCkpt);
+      cfgs.add("hoodie.deltastreamer.source.hoodieincr.path=" + srcBasePath);
+      // Partition field for the incremental source; presumably extracted from _hoodie_partition_path - TODO confirm
+      cfgs.add("hoodie.deltastreamer.source.hoodieincr.partition.fields=datestr");
+      cfg.configs = cfgs;
      return cfg;
    }

@@ -110,15 +183,30 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase {
          .sort("_hoodie_commit_time").collectAsList();
    }

-    static void assertCommitMetadata(String expected, String datasetPath, FileSystem fs, int totalCommits)
+    static void assertDistanceCount(long expected, String datasetPath, SQLContext sqlContext) {
+      sqlContext.read().format("com.uber.hoodie").load(datasetPath).registerTempTable("tmp_trips");
+      long recordCount =
+          sqlContext.sparkSession().sql("select * from tmp_trips where haversine_distance is not NULL").count();
+      assertEquals(expected, recordCount);
+    }
+
+    static void assertDistanceCountWithExactValue(long expected, String datasetPath, SQLContext sqlContext) {
+      sqlContext.read().format("com.uber.hoodie").load(datasetPath).registerTempTable("tmp_trips");
+      long recordCount =
+          sqlContext.sparkSession().sql("select * from tmp_trips where haversine_distance = 1.0").count();
+      assertEquals(expected, recordCount);
+    }
+
+    static String assertCommitMetadata(String expected, String datasetPath, FileSystem fs, int totalCommits)
        throws IOException {
      HoodieTableMetaClient meta = new HoodieTableMetaClient(fs.getConf(), datasetPath);
      HoodieTimeline timeline = meta.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
-      HoodieInstant lastCommit = timeline.lastInstant().get();
+      HoodieInstant lastInstant = timeline.lastInstant().get();
      HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(
-          timeline.getInstantDetails(lastCommit).get(), HoodieCommitMetadata.class);
+          timeline.getInstantDetails(lastInstant).get(), HoodieCommitMetadata.class);
      assertEquals(totalCommits, timeline.countInstants());
      assertEquals(expected, commitMetadata.getMetadata(HoodieDeltaStreamer.CHECKPOINT_KEY));
+      return lastInstant.getTimestamp();
    }
  }

@@ -152,12 +240,14 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase {
    HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(datasetBasePath, Operation.BULK_INSERT);
    new HoodieDeltaStreamer(cfg, jsc).sync();
    TestHelpers.assertRecordCount(1000, datasetBasePath + "/*/*.parquet", sqlContext);
+    TestHelpers.assertDistanceCount(1000, datasetBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertCommitMetadata("00000", datasetBasePath, dfs, 1);

    // No new data => no commits.
    cfg.sourceLimit = 0;
    new HoodieDeltaStreamer(cfg, jsc).sync();
    TestHelpers.assertRecordCount(1000, datasetBasePath + "/*/*.parquet", sqlContext);
+    TestHelpers.assertDistanceCount(1000, datasetBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertCommitMetadata("00000", datasetBasePath, dfs, 1);

    // upsert() #1
@@ -165,11 +255,94 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase {
    cfg.operation = Operation.UPSERT;
    new HoodieDeltaStreamer(cfg, jsc).sync();
    TestHelpers.assertRecordCount(2000, datasetBasePath + "/*/*.parquet", sqlContext);
+    TestHelpers.assertDistanceCount(2000, datasetBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertCommitMetadata("00001", datasetBasePath, dfs, 2);
    List<Row> counts = TestHelpers.countsPerCommit(datasetBasePath + "/*/*.parquet", sqlContext);
    assertEquals(2000, counts.get(0).getLong(1));
  }

+  /**
+   * Test Bulk Insert and upserts with hive syncing. Tests Hudi incremental processing using a 2 step pipeline
+   * The first step involves using a SQL template to transform a source
+   * TEST-DATA-SOURCE  ============================> HUDI TABLE 1   ===============>  HUDI TABLE 2
+   *                   (incr-pull with transform)                     (incr-pull)
+   * Hudi Table 1 is synced with Hive.
+   * @throws Exception
+   */
+  @Test
+  public void testBulkInsertsAndUpsertsWithSQLBasedTransformerFor2StepPipeline() throws Exception {
+    String datasetBasePath = dfsBasePath + "/test_dataset2";
+    String downstreamDatasetBasePath = dfsBasePath + "/test_downstream_dataset2";
+
+    HiveSyncConfig hiveSyncConfig = getHiveSyncConfig(datasetBasePath, "hive_trips");
+
+    // Initial bulk insert to ingest to first hudi table
+    HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(datasetBasePath, Operation.BULK_INSERT,
+        SqlQueryBasedTransformer.class.getName(), true);
+    new HoodieDeltaStreamer(cfg, jsc, dfs, hiveServer.getHiveConf()).sync();
+    TestHelpers.assertRecordCount(1000, datasetBasePath + "/*/*.parquet", sqlContext);
+    TestHelpers.assertDistanceCount(1000, datasetBasePath + "/*/*.parquet", sqlContext);
+    TestHelpers.assertDistanceCountWithExactValue(1000, datasetBasePath + "/*/*.parquet", sqlContext);
+    String lastInstantForUpstreamTable = TestHelpers.assertCommitMetadata("00000", datasetBasePath, dfs, 1);
+
+    // Now incrementally pull from the above hudi table and ingest to second table
+    HoodieDeltaStreamer.Config downstreamCfg =
+        TestHelpers.makeConfigForHudiIncrSrc(datasetBasePath, downstreamDatasetBasePath, Operation.BULK_INSERT, true);
+    new HoodieDeltaStreamer(downstreamCfg, jsc, dfs, hiveServer.getHiveConf()).sync();
+    TestHelpers.assertRecordCount(1000, downstreamDatasetBasePath + "/*/*.parquet", sqlContext);
+    TestHelpers.assertDistanceCount(1000, downstreamDatasetBasePath + "/*/*.parquet", sqlContext);
+    TestHelpers.assertDistanceCountWithExactValue(1000, downstreamDatasetBasePath + "/*/*.parquet", sqlContext);
+    TestHelpers.assertCommitMetadata(lastInstantForUpstreamTable, downstreamDatasetBasePath, dfs, 1);
+
+    // No new data => no commits for upstream table
+    cfg.sourceLimit = 0;
+    new HoodieDeltaStreamer(cfg, jsc, dfs, hiveServer.getHiveConf()).sync();
+    TestHelpers.assertRecordCount(1000, datasetBasePath + "/*/*.parquet", sqlContext);
+    TestHelpers.assertDistanceCount(1000, datasetBasePath + "/*/*.parquet", sqlContext);
+    TestHelpers.assertDistanceCountWithExactValue(1000, datasetBasePath + "/*/*.parquet", sqlContext);
+    TestHelpers.assertCommitMetadata("00000", datasetBasePath, dfs, 1);
+
+    // with no change in upstream table, no change in downstream too when pulled.
+    new HoodieDeltaStreamer(downstreamCfg, jsc).sync();
+    TestHelpers.assertRecordCount(1000, downstreamDatasetBasePath + "/*/*.parquet", sqlContext);
+    TestHelpers.assertDistanceCount(1000, downstreamDatasetBasePath + "/*/*.parquet", sqlContext);
+    TestHelpers.assertDistanceCountWithExactValue(1000, downstreamDatasetBasePath + "/*/*.parquet", sqlContext);
+    TestHelpers.assertCommitMetadata(lastInstantForUpstreamTable, downstreamDatasetBasePath, dfs, 1);
+
+    // upsert() #1 on upstream hudi table
+    cfg.sourceLimit = 2000;
+    cfg.operation = Operation.UPSERT;
+    new HoodieDeltaStreamer(cfg, jsc, dfs, hiveServer.getHiveConf()).sync();
+    TestHelpers.assertRecordCount(2000, datasetBasePath + "/*/*.parquet", sqlContext);
+    TestHelpers.assertDistanceCount(2000, datasetBasePath + "/*/*.parquet", sqlContext);
+    TestHelpers.assertDistanceCountWithExactValue(2000, datasetBasePath + "/*/*.parquet", sqlContext);
+    lastInstantForUpstreamTable = TestHelpers.assertCommitMetadata("00001", datasetBasePath, dfs, 2);
+    List<Row> counts = TestHelpers.countsPerCommit(datasetBasePath + "/*/*.parquet", sqlContext);
+    assertEquals(2000, counts.get(0).getLong(1));
+
+    // Incrementally pull changes in upstream hudi table and apply to downstream table
+    downstreamCfg =
+        TestHelpers.makeConfigForHudiIncrSrc(datasetBasePath, downstreamDatasetBasePath, Operation.UPSERT, false);
+    downstreamCfg.sourceLimit = 2000;
+    new HoodieDeltaStreamer(downstreamCfg, jsc).sync();
+    TestHelpers.assertRecordCount(2000, downstreamDatasetBasePath + "/*/*.parquet", sqlContext);
+    TestHelpers.assertDistanceCount(2000, downstreamDatasetBasePath + "/*/*.parquet", sqlContext);
+    TestHelpers.assertDistanceCountWithExactValue(2000, downstreamDatasetBasePath + "/*/*.parquet", sqlContext);
+    String finalInstant =
+        TestHelpers.assertCommitMetadata(lastInstantForUpstreamTable, downstreamDatasetBasePath, dfs, 2);
+    counts = TestHelpers.countsPerCommit(downstreamDatasetBasePath + "/*/*.parquet", sqlContext);
+    assertEquals(2000, counts.get(0).getLong(1));
+
+    // Test Hive integration
+    HoodieHiveClient hiveClient = new HoodieHiveClient(hiveSyncConfig, hiveServer.getHiveConf(), dfs);
+    assertTrue("Table " + hiveSyncConfig.tableName + " should exist",
+        hiveClient.doesTableExist());
+    assertEquals("Table partitions should match the number of partitions we wrote", 1,
+        hiveClient.scanTablePartitions().size());
+    assertEquals("The last commit that was sycned should be updated in the TBLPROPERTIES",
+        lastInstantForUpstreamTable, hiveClient.getLastCommitTimeSynced().get());
+  }
+
  @Test
  public void testFilterDupes() throws Exception {
    String datasetBasePath = dfsBasePath + "/test_dupes_dataset";
@@ -192,4 +365,57 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase {
    assertEquals(1000, counts.get(0).getLong(1));
    assertEquals(1000, counts.get(1).getLong(1));
  }
+
+  /**
+   * Spark SQL UDF that calculates the Haversine distance, in meters, between two lat/lon points
+   */
+  public static class DistanceUDF implements UDF4<Double, Double, Double, Double, Double> {
+
+    /**
+     * Computes the distance between two points using the Haversine formula.
+     * Taken from https://stackoverflow.com/questions/3694380/calculating-distance-between-two-points-using-latitude-
+     * longitude-what-am-i-doi
+     * The referenced snippet also accounts for a height difference between the
+     * two points; this UDF takes no altitude arguments and always uses a height
+     * difference of 0, so the result is the plain Haversine distance.
+     *
+     * Note the argument order: both latitudes first, then both longitudes.
+     * lat1, lon1 Start point in degrees; lat2, lon2 End point in degrees
+     * @return Distance in Meters
+     */
+    @Override
+    public Double call(Double lat1, Double lat2, Double lon1, Double lon2) {
+
+      final int R = 6371; // Radius of the earth in kilometers
+
+      double latDistance = Math.toRadians(lat2 - lat1);
+      double lonDistance = Math.toRadians(lon2 - lon1);
+      double a = Math.sin(latDistance / 2) * Math.sin(latDistance / 2)
+          + Math.cos(Math.toRadians(lat1)) * Math.cos(Math.toRadians(lat2))
+          * Math.sin(lonDistance / 2) * Math.sin(lonDistance / 2);
+      double c = 2 * Math.atan2(Math.sqrt(a), Math.sqrt(1 - a));
+      double distance = R * c * 1000; // convert to meters
+
+      double height = 0; // no altitude inputs, so the height difference is always 0
+
+      distance = Math.pow(distance, 2) + Math.pow(height, 2); // squared distance including the height term
+
+      return Math.sqrt(distance);
+    }
+  }
+
+  /**
+   * Adds a "haversine_distance" column, computed by {@link DistanceUDF} from the begin/end lat-lon columns, to each row
+   */
+  public static class TripsWithDistanceTransformer implements Transformer {
+
+    @Override
+    public Dataset<Row> apply(JavaSparkContext jsc, SparkSession sparkSession,
+        Dataset<Row> rowDataset, TypedProperties properties) {
+      rowDataset.sqlContext().udf().register("distance_udf", new DistanceUDF(), DataTypes.DoubleType);
+      return rowDataset.withColumn("haversine_distance",
+          functions.callUDF("distance_udf", functions.col("begin_lat"), functions.col("end_lat"), // lat1, lat2
+              functions.col("begin_lon"), functions.col("end_lon"))); // lon1, lon2 (was end_lat by mistake)
+    }
+  }
 }
--- a/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/UtilitiesTestBase.java
+++ b/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/UtilitiesTestBase.java
@@ -18,10 +18,16 @@

 package com.uber.hoodie.utilities;

+import com.google.common.collect.ImmutableList;
 import com.uber.hoodie.common.TestRawTripPayload;
 import com.uber.hoodie.common.minicluster.HdfsTestService;
 import com.uber.hoodie.common.model.HoodieRecord;
+import com.uber.hoodie.common.model.HoodieTableType;
+import com.uber.hoodie.common.table.HoodieTableMetaClient;
 import com.uber.hoodie.common.util.TypedProperties;
+import com.uber.hoodie.hive.HiveSyncConfig;
+import com.uber.hoodie.hive.HoodieHiveClient;
+import com.uber.hoodie.hive.util.HiveTestService;
 import com.uber.hoodie.utilities.sources.TestDataSource;
 import java.io.BufferedReader;
 import java.io.IOException;
@@ -32,8 +38,11 @@ import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hdfs.DistributedFileSystem;
 import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hive.service.server.HiveServer2;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
 import org.junit.After;
 import org.junit.AfterClass;
 import org.junit.Before;
@@ -51,15 +60,26 @@ public class UtilitiesTestBase {
  protected static MiniDFSCluster dfsCluster;
  protected static DistributedFileSystem dfs;
  protected transient JavaSparkContext jsc = null;
+  protected transient SparkSession sparkSession = null;
  protected transient SQLContext sqlContext;
+  protected static HiveServer2 hiveServer;

  @BeforeClass
  public static void initClass() throws Exception {
+    initClass(false);
+  }
+
+  static void initClass(boolean startHiveService) throws Exception {
    hdfsTestService = new HdfsTestService();
    dfsCluster = hdfsTestService.start(true);
    dfs = dfsCluster.getFileSystem();
    dfsBasePath = dfs.getWorkingDirectory().toString();
    dfs.mkdirs(new Path(dfsBasePath));
+    if (startHiveService) {
+      HiveTestService hiveService = new HiveTestService(hdfsTestService.getHadoopConf());
+      hiveServer = hiveService.start();
+      clearHiveDb();
+    }
  }

  @AfterClass
@@ -67,6 +87,9 @@ public class UtilitiesTestBase {
    if (hdfsTestService != null) {
      hdfsTestService.stop();
    }
+    if (hiveServer != null) {
+      hiveServer.stop();
+    }
  }

  @Before
@@ -74,6 +97,7 @@ public class UtilitiesTestBase {
    TestDataSource.initDataGen();
    jsc = UtilHelpers.buildSparkContext(this.getClass().getName() + "-hoodie", "local[2]");
    sqlContext = new SQLContext(jsc);
+    sparkSession = SparkSession.builder().config(jsc.getConf()).getOrCreate();
  }

  @After
@@ -84,6 +108,42 @@ public class UtilitiesTestBase {
    }
  }

+  /**
+   * Helper to build a hive sync config pointing at jdbc:hive2://127.0.0.1:9999/ (database "testdb1")
+   * @param basePath value assigned to the config's basePath
+   * @param tableName value assigned to the config's tableName
+   * @return a new HiveSyncConfig with the fixed test settings below plus the given basePath and tableName
+   */
+  protected static HiveSyncConfig getHiveSyncConfig(String basePath, String tableName) {
+    HiveSyncConfig hiveSyncConfig = new HiveSyncConfig();
+    hiveSyncConfig.jdbcUrl = "jdbc:hive2://127.0.0.1:9999/";
+    hiveSyncConfig.hiveUser = "";
+    hiveSyncConfig.hivePass = "";
+    hiveSyncConfig.databaseName = "testdb1";
+    hiveSyncConfig.tableName = tableName;
+    hiveSyncConfig.basePath = basePath;
+    hiveSyncConfig.assumeDatePartitioning = false;
+    hiveSyncConfig.partitionFields = new ImmutableList.Builder<String>().add("datestr").build();
+    return hiveSyncConfig;
+  }
+
+  /**
+   * Drops the test Hive database if it exists and creates it again, going through a dummy hoodie table at /dummy
+   * @throws IOException presumably when initializing the dummy table metadata fails - TODO confirm
+   */
+  private static void clearHiveDb() throws IOException {
+    HiveConf hiveConf = new HiveConf();
+    // Create Dummy hive sync config
+    HiveSyncConfig hiveSyncConfig = getHiveSyncConfig("/dummy", "dummy");
+    hiveConf.addResource(hiveServer.getHiveConf());
+    HoodieTableMetaClient.initTableType(dfs.getConf(), hiveSyncConfig.basePath, HoodieTableType.COPY_ON_WRITE,
+        hiveSyncConfig.tableName, null);
+    HoodieHiveClient client = new HoodieHiveClient(hiveSyncConfig, hiveConf, dfs);
+    client.updateHiveSQL("drop database if exists " + hiveSyncConfig.databaseName);
+    client.updateHiveSQL("create database " + hiveSyncConfig.databaseName);
+    client.close();
+  }
+
  public static class Helpers {

    // to get hold of resources bundled with jar
--- a/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/sources/TestDFSSource.java
+++ b/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/sources/TestDFSSource.java
@@ -20,16 +20,20 @@ package com.uber.hoodie.utilities.sources;

 import static org.junit.Assert.assertEquals;

+import com.uber.hoodie.AvroConversionUtils;
 import com.uber.hoodie.common.HoodieTestDataGenerator;
 import com.uber.hoodie.common.util.TypedProperties;
-import com.uber.hoodie.common.util.collection.Pair;
 import com.uber.hoodie.utilities.UtilitiesTestBase;
+import com.uber.hoodie.utilities.deltastreamer.SourceFormatAdapter;
 import com.uber.hoodie.utilities.schema.FilebasedSchemaProvider;
 import java.io.IOException;
 import java.util.Optional;
 import org.apache.avro.generic.GenericRecord;
 import org.apache.hadoop.fs.Path;
 import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SQLContext;
 import org.junit.After;
 import org.junit.AfterClass;
 import org.junit.Before;
@@ -37,7 +41,7 @@ import org.junit.BeforeClass;
 import org.junit.Test;

 /**
- * Basic tests against all subclasses of {@link DFSSource}
+ * Basic tests against all subclasses of {@link JsonDFSSource}
 */
 public class TestDFSSource extends UtilitiesTestBase {

@@ -71,34 +75,47 @@ public class TestDFSSource extends UtilitiesTestBase {

    TypedProperties props = new TypedProperties();
    props.setProperty("hoodie.deltastreamer.source.dfs.root", dfsBasePath + "/jsonFiles");
-    JsonDFSSource jsonSource = new JsonDFSSource(props, jsc, schemaProvider);
+    JsonDFSSource jsonDFSSource = new JsonDFSSource(props, jsc, sparkSession, schemaProvider);
+    SourceFormatAdapter jsonSource = new SourceFormatAdapter(jsonDFSSource);

    // 1. Extract without any checkpoint => get all the data, respecting sourceLimit
-    assertEquals(Optional.empty(), jsonSource.fetchNewData(Optional.empty(), Long.MAX_VALUE).getKey());
+    assertEquals(Optional.empty(), jsonSource.fetchNewDataInAvroFormat(Optional.empty(), Long.MAX_VALUE).getBatch());
    UtilitiesTestBase.Helpers.saveStringsToDFS(
        Helpers.jsonifyRecords(dataGenerator.generateInserts("000", 100)), dfs,
        dfsBasePath + "/jsonFiles/1.json");
-    assertEquals(Optional.empty(), jsonSource.fetchNewData(Optional.empty(), 10).getKey());
-    Pair<Optional<JavaRDD<GenericRecord>>, String> fetch1 = jsonSource.fetchNewData(Optional.empty(), 1000000);
-    assertEquals(100, fetch1.getKey().get().count());
+    assertEquals(Optional.empty(), jsonSource.fetchNewDataInAvroFormat(Optional.empty(), 10).getBatch());
+    InputBatch<JavaRDD<GenericRecord>> fetch1 =
+        jsonSource.fetchNewDataInAvroFormat(Optional.empty(), 1000000);
+    assertEquals(100, fetch1.getBatch().get().count());
+    // Test json -> Row format
+    InputBatch<Dataset<Row>> fetch1AsRows =
+        jsonSource.fetchNewDataInRowFormat(Optional.empty(), 1000000);
+    assertEquals(100, fetch1AsRows.getBatch().get().count());
+    // Test Avro -> Row format
+    Dataset<Row> fetch1Rows = AvroConversionUtils.createDataFrame(JavaRDD.toRDD(fetch1.getBatch().get()),
+        schemaProvider.getSourceSchema().toString(), jsonDFSSource.getSparkSession());
+    assertEquals(100, fetch1Rows.count());

    // 2. Produce new data, extract new data
    UtilitiesTestBase.Helpers.saveStringsToDFS(
        Helpers.jsonifyRecords(dataGenerator.generateInserts("001", 10000)),
        dfs, dfsBasePath + "/jsonFiles/2.json");
-    Pair<Optional<JavaRDD<GenericRecord>>, String> fetch2 = jsonSource.fetchNewData(
-        Optional.of(fetch1.getValue()), Long.MAX_VALUE);
-    assertEquals(10000, fetch2.getKey().get().count());
+    InputBatch<Dataset<Row>> fetch2 = jsonSource.fetchNewDataInRowFormat(
+        Optional.of(fetch1.getCheckpointForNextBatch()), Long.MAX_VALUE);
+    assertEquals(10000, fetch2.getBatch().get().count());

    // 3. Extract with previous checkpoint => gives same data back (idempotent)
-    Pair<Optional<JavaRDD<GenericRecord>>, String> fetch3 = jsonSource.fetchNewData(
-        Optional.of(fetch1.getValue()), Long.MAX_VALUE);
-    assertEquals(10000, fetch3.getKey().get().count());
-    assertEquals(fetch2.getValue(), fetch3.getValue());
+    InputBatch<Dataset<Row>> fetch3 = jsonSource.fetchNewDataInRowFormat(
+        Optional.of(fetch1.getCheckpointForNextBatch()), Long.MAX_VALUE);
+    assertEquals(10000, fetch3.getBatch().get().count());
+    assertEquals(fetch2.getCheckpointForNextBatch(), fetch3.getCheckpointForNextBatch());
+    fetch3.getBatch().get().registerTempTable("test_dfs_table");
+    Dataset<Row> rowDataset = new SQLContext(jsc.sc()).sql("select * from test_dfs_table");
+    assertEquals(10000, rowDataset.count());

    // 4. Extract with latest checkpoint => no new data returned
-    Pair<Optional<JavaRDD<GenericRecord>>, String> fetch4 = jsonSource.fetchNewData(
-        Optional.of(fetch2.getValue()), Long.MAX_VALUE);
-    assertEquals(Optional.empty(), fetch4.getKey());
+    InputBatch<JavaRDD<GenericRecord>> fetch4 = jsonSource.fetchNewDataInAvroFormat(
+        Optional.of(fetch2.getCheckpointForNextBatch()), Long.MAX_VALUE);
+    assertEquals(Optional.empty(), fetch4.getBatch());
  }
-}
+}
--- a/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/sources/TestDataSource.java
+++ b/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/sources/TestDataSource.java
@@ -21,8 +21,6 @@ package com.uber.hoodie.utilities.sources;
 import com.uber.hoodie.common.HoodieTestDataGenerator;
 import com.uber.hoodie.common.model.HoodieRecord;
 import com.uber.hoodie.common.util.TypedProperties;
-import com.uber.hoodie.common.util.collection.ImmutablePair;
-import com.uber.hoodie.common.util.collection.Pair;
 import com.uber.hoodie.utilities.schema.SchemaProvider;
 import java.io.IOException;
 import java.util.ArrayList;
@@ -35,11 +33,12 @@ import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SparkSession;

 /**
 * An implementation of {@link Source}, that emits test upserts.
 */
-public class TestDataSource extends Source {
+public class TestDataSource extends AvroSource {

  private static volatile Logger log = LogManager.getLogger(TestDataSource.class);

@@ -54,8 +53,9 @@ public class TestDataSource extends Source {
    dataGenerator = null;
  }

-  public TestDataSource(TypedProperties props, JavaSparkContext sparkContext, SchemaProvider schemaProvider) {
-    super(props, sparkContext, schemaProvider);
+  public TestDataSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession,
+      SchemaProvider schemaProvider) {
+    super(props, sparkContext, sparkSession, schemaProvider);
  }

  private GenericRecord toGenericRecord(HoodieRecord hoodieRecord) {
@@ -68,14 +68,14 @@ public class TestDataSource extends Source {
  }

  @Override
-  public Pair<Optional<JavaRDD<GenericRecord>>, String> fetchNewData(Optional<String> lastCheckpointStr,
+  protected InputBatch<JavaRDD<GenericRecord>> fetchNewData(Optional<String> lastCheckpointStr,
      long sourceLimit) {

    int nextCommitNum = lastCheckpointStr.map(s -> Integer.parseInt(s) + 1).orElse(0);
    String commitTime = String.format("%05d", nextCommitNum);
    // No new data.
    if (sourceLimit <= 0) {
-      return new ImmutablePair<>(Optional.empty(), commitTime);
+      return new InputBatch<>(Optional.empty(), commitTime);
    }

    // generate `sourceLimit` number of upserts each time.
@@ -94,6 +94,6 @@ public class TestDataSource extends Source {
    }
    
    JavaRDD<GenericRecord> avroRDD = sparkContext.<GenericRecord>parallelize(records, 4);
-    return new ImmutablePair<>(Optional.of(avroRDD), commitTime);
+    return new InputBatch<>(Optional.of(avroRDD), commitTime);
  }
 }
--- a/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/sources/TestKafkaSource.java
+++ b/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/sources/TestKafkaSource.java
@@ -18,20 +18,23 @@

 package com.uber.hoodie.utilities.sources;

-import static com.uber.hoodie.utilities.sources.KafkaSource.CheckpointUtils;
 import static org.junit.Assert.assertEquals;

+import com.uber.hoodie.AvroConversionUtils;
 import com.uber.hoodie.common.HoodieTestDataGenerator;
 import com.uber.hoodie.common.util.TypedProperties;
-import com.uber.hoodie.common.util.collection.Pair;
 import com.uber.hoodie.utilities.UtilitiesTestBase;
+import com.uber.hoodie.utilities.deltastreamer.SourceFormatAdapter;
 import com.uber.hoodie.utilities.schema.FilebasedSchemaProvider;
+import com.uber.hoodie.utilities.sources.helpers.KafkaOffsetGen.CheckpointUtils;
 import java.io.IOException;
 import java.util.HashMap;
 import java.util.Optional;
 import kafka.common.TopicAndPartition;
 import org.apache.avro.generic.GenericRecord;
 import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
 import org.apache.spark.streaming.kafka.KafkaCluster.LeaderOffset;
 import org.apache.spark.streaming.kafka.KafkaTestUtils;
 import org.apache.spark.streaming.kafka.OffsetRange;
@@ -42,7 +45,7 @@ import org.junit.BeforeClass;
 import org.junit.Test;

 /**
- * Tests against {@link KafkaSource}
+ * Tests against the Kafka-based sources, e.g. {@link JsonKafkaSource}
 */
 public class TestKafkaSource extends UtilitiesTestBase {

@@ -89,30 +92,44 @@ public class TestKafkaSource extends UtilitiesTestBase {
    props.setProperty("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
    props.setProperty("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");

-    Source kafkaSource = new JsonKafkaSource(props, jsc, schemaProvider);
+    Source jsonSource = new JsonKafkaSource(props, jsc, sparkSession, schemaProvider);
+    SourceFormatAdapter kafkaSource = new SourceFormatAdapter(jsonSource);

    // 1. Extract without any checkpoint => get all the data, respecting sourceLimit
-    assertEquals(Optional.empty(), kafkaSource.fetchNewData(Optional.empty(), Long.MAX_VALUE).getKey());
+    assertEquals(Optional.empty(), kafkaSource.fetchNewDataInAvroFormat(Optional.empty(), Long.MAX_VALUE).getBatch());
    testUtils.sendMessages(TEST_TOPIC_NAME, Helpers.jsonifyRecords(dataGenerator.generateInserts("000", 1000)));
-    Pair<Optional<JavaRDD<GenericRecord>>, String> fetch1 = kafkaSource.fetchNewData(Optional.empty(), 900);
-    assertEquals(900, fetch1.getKey().get().count());
+    InputBatch<JavaRDD<GenericRecord>> fetch1 = kafkaSource.fetchNewDataInAvroFormat(Optional.empty(), 900);
+    assertEquals(900, fetch1.getBatch().get().count());
+    // Also verify that converting the Avro batch to a Dataset<Row> preserves the record count
+    Dataset<Row> fetch1AsRows = AvroConversionUtils.createDataFrame(JavaRDD.toRDD(fetch1.getBatch().get()),
+        schemaProvider.getSourceSchema().toString(), jsonSource.getSparkSession());
+    assertEquals(900, fetch1AsRows.count());

    // 2. Produce new data, extract new data
    testUtils.sendMessages(TEST_TOPIC_NAME, Helpers.jsonifyRecords(dataGenerator.generateInserts("001", 1000)));
-    Pair<Optional<JavaRDD<GenericRecord>>, String> fetch2 = kafkaSource.fetchNewData(
-        Optional.of(fetch1.getValue()), Long.MAX_VALUE);
-    assertEquals(1100, fetch2.getKey().get().count());
+    InputBatch<Dataset<Row>> fetch2 = kafkaSource.fetchNewDataInRowFormat(
+        Optional.of(fetch1.getCheckpointForNextBatch()), Long.MAX_VALUE);
+    assertEquals(1100, fetch2.getBatch().get().count());

    // 3. Extract with previous checkpoint => gives same data back (idempotent)
-    Pair<Optional<JavaRDD<GenericRecord>>, String> fetch3 = kafkaSource.fetchNewData(
-        Optional.of(fetch1.getValue()), Long.MAX_VALUE);
-    assertEquals(fetch2.getKey().get().count(), fetch3.getKey().get().count());
-    assertEquals(fetch2.getValue(), fetch3.getValue());
+    InputBatch<JavaRDD<GenericRecord>> fetch3 = kafkaSource.fetchNewDataInAvroFormat(
+        Optional.of(fetch1.getCheckpointForNextBatch()), Long.MAX_VALUE);
+    assertEquals(fetch2.getBatch().get().count(), fetch3.getBatch().get().count());
+    assertEquals(fetch2.getCheckpointForNextBatch(), fetch3.getCheckpointForNextBatch());
+    // Repeat the idempotency check through the Row-format (Dataset<Row>) fetch API
+    InputBatch<Dataset<Row>> fetch3AsRows =
+        kafkaSource.fetchNewDataInRowFormat(Optional.of(fetch1.getCheckpointForNextBatch()), Long.MAX_VALUE);
+    assertEquals(fetch2.getBatch().get().count(), fetch3AsRows.getBatch().get().count());
+    assertEquals(fetch2.getCheckpointForNextBatch(), fetch3AsRows.getCheckpointForNextBatch());

    // 4. Extract with latest checkpoint => no new data returned
-    Pair<Optional<JavaRDD<GenericRecord>>, String> fetch4 = kafkaSource.fetchNewData(
-        Optional.of(fetch2.getValue()), Long.MAX_VALUE);
-    assertEquals(Optional.empty(), fetch4.getKey());
+    InputBatch<JavaRDD<GenericRecord>> fetch4 = kafkaSource.fetchNewDataInAvroFormat(
+        Optional.of(fetch2.getCheckpointForNextBatch()), Long.MAX_VALUE);
+    assertEquals(Optional.empty(), fetch4.getBatch());
+    // Repeat the no-new-data check through the Row-format (Dataset<Row>) fetch API
+    InputBatch<Dataset<Row>> fetch4AsRows =
+        kafkaSource.fetchNewDataInRowFormat(Optional.of(fetch2.getCheckpointForNextBatch()), Long.MAX_VALUE);
+    assertEquals(Optional.empty(), fetch4AsRows.getBatch());
  }


--- a/hoodie-utilities/src/test/resources/delta-streamer-config/sql-transformer.properties
+++ b/hoodie-utilities/src/test/resources/delta-streamer-config/sql-transformer.properties
@@ -0,0 +1,19 @@
+#
+#  Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#           http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+#
+include=base.properties
+hoodie.deltastreamer.transformer.sql=SELECT a.timestamp, a._row_key, a.rider, a.driver, a.begin_lat, a.begin_lon, a.end_lat, a.end_lon, a.fare, CAST(1.0 AS DOUBLE) AS haversine_distance FROM <SRC> a
--- a/hoodie-utilities/src/test/resources/delta-streamer-config/target.avsc
+++ b/hoodie-utilities/src/test/resources/delta-streamer-config/target.avsc
@@ -0,0 +1,37 @@
+{
+  "type" : "record",
+  "name" : "triprec",
+  "fields" : [
+  {
+    "name" : "timestamp",
+    "type" : "double"
+  }, {
+    "name" : "_row_key",
+    "type" : "string"
+  }, {
+    "name" : "rider",
+    "type" : "string"
+  }, {
+    "name" : "driver",
+    "type" : "string"
+  }, {
+    "name" : "begin_lat",
+    "type" : "double"
+  }, {
+    "name" : "begin_lon",
+    "type" : "double"
+  }, {
+    "name" : "end_lat",
+    "type" : "double"
+  }, {
+    "name" : "end_lon",
+    "type" : "double"
+  }, {
+    "name" : "fare",
+    "type" : "double"
+  }, {
+    "name" : "haversine_distance",
+    "type" : "double"
+  }]
+}
+