New Features in DeltaStreamer :

(1) Apply transformation when using delta-streamer to ingest data. (2) Add Hudi Incremental Source for Delta Streamer (3) Allow delta-streamer config-property to be passed as command-line (4) Add Hive Integration to Delta-Streamer and address Review comments (5) Ensure MultiPartKeysValueExtractor handle hive style partition description (6) Reuse same spark session on both source and transformer (7) Support extracting partition fields from _hoodie_partition_path for HoodieIncrSource (8) Reuse Binary Avro coders (9) Add push down filter for Incremental source (10) Add Hoodie DeltaStreamer metrics to track total time taken
2018-10-10 10:31:34 -07:00
parent c70dbc13e9
commit 3a0044216c
65 changed files with 2752 additions and 911 deletions
--- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HoodieCleaner.java
+++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HoodieCleaner.java
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2017 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *          http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.uber.hoodie.utilities;
+
+import com.beust.jcommander.JCommander;
+import com.beust.jcommander.Parameter;
+import com.uber.hoodie.HoodieWriteClient;
+import com.uber.hoodie.common.util.FSUtils;
+import com.uber.hoodie.common.util.TypedProperties;
+import com.uber.hoodie.config.HoodieWriteConfig;
+import com.uber.hoodie.utilities.deltastreamer.HoodieDeltaStreamer;
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+import org.apache.spark.api.java.JavaSparkContext;
+
+public class HoodieCleaner {
+
+  private static volatile Logger log = LogManager.getLogger(HoodieDeltaStreamer.class);
+
+  /**
+   * Config for Cleaner
+   */
+  private final Config cfg;
+
+  /**
+   * Filesystem used
+   */
+  private transient FileSystem fs;
+
+  /**
+   * Spark context
+   */
+  private transient JavaSparkContext jssc;
+
+  /**
+   * Bag of properties with source, hoodie client, key generator etc.
+   */
+  TypedProperties props;
+
+  public HoodieCleaner(Config cfg, JavaSparkContext jssc) throws IOException {
+    this.cfg = cfg;
+    this.jssc = jssc;
+    this.fs = FSUtils.getFs(cfg.basePath, jssc.hadoopConfiguration());
+
+    this.props = UtilHelpers.readConfig(fs, new Path(cfg.propsFilePath), cfg.configs).getConfig();
+    log.info("Creating Cleaner with configs : " + props.toString());
+  }
+
+  public void run() throws Exception {
+    HoodieWriteConfig hoodieCfg = getHoodieClientConfig();
+    HoodieWriteClient client = new HoodieWriteClient<>(jssc, hoodieCfg, false);
+    client.clean();
+  }
+
+  private HoodieWriteConfig getHoodieClientConfig() throws Exception {
+    return HoodieWriteConfig.newBuilder().combineInput(true, true).withPath(cfg.basePath)
+        .withAutoCommit(false)
+        .withProps(props).build();
+  }
+
+  public static class Config implements Serializable {
+
+    @Parameter(names = {"--target-base-path"}, description = "base path for the hoodie dataset to be cleaner.",
+        required = true)
+    public String basePath;
+
+    @Parameter(names = {"--props"}, description = "path to properties file on localfs or dfs, with configurations for "
+        + "hoodie client for cleaning")
+    public String propsFilePath =
+        "file://" + System.getProperty("user.dir") + "/src/test/resources/delta-streamer-config/dfs-source.properties";
+
+    @Parameter(names = {"--hoodie-conf"}, description = "Any configuration that can be set in the properties file "
+        + "(using the CLI parameter \"--propsFilePath\") can also be passed command line using this parameter")
+    public List<String> configs = new ArrayList<>();
+
+    @Parameter(names = {"--spark-master"}, description = "spark master to use.")
+    public String sparkMaster = "local[2]";
+
+    @Parameter(names = {"--help", "-h"}, help = true)
+    public Boolean help = false;
+  }
+
+  public static void main(String[] args) throws Exception {
+    final Config cfg = new Config();
+    JCommander cmd = new JCommander(cfg, args);
+    if (cfg.help || args.length == 0) {
+      cmd.usage();
+      System.exit(1);
+    }
+
+    String dirName = new Path(cfg.basePath).getName();
+    JavaSparkContext jssc = UtilHelpers.buildSparkContext("hoodie-cleaner-" + dirName, cfg.sparkMaster);
+    new HoodieCleaner(cfg, jssc).run();
+  }
+}
--- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/UtilHelpers.java
+++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/UtilHelpers.java
@@ -30,9 +30,13 @@ import com.uber.hoodie.exception.HoodieException;
 import com.uber.hoodie.index.HoodieIndex;
 import com.uber.hoodie.utilities.schema.SchemaProvider;
 import com.uber.hoodie.utilities.sources.Source;
+import com.uber.hoodie.utilities.transform.Transformer;
+import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.StringReader;
 import java.nio.ByteBuffer;
+import java.util.List;
 import java.util.Optional;
 import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.fs.FileSystem;
@@ -43,6 +47,7 @@ import org.apache.spark.Accumulator;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SparkSession;

 /**
 * Bunch of helper methods
@@ -51,12 +56,12 @@ public class UtilHelpers {
  private static Logger logger = LogManager.getLogger(UtilHelpers.class);

  public static Source createSource(String sourceClass, TypedProperties cfg,
-      JavaSparkContext jssc, SchemaProvider schemaProvider)
+      JavaSparkContext jssc, SparkSession sparkSession, SchemaProvider schemaProvider)
      throws IOException {
    try {
      return (Source) ReflectionUtils.loadClass(sourceClass,
-          new Class<?>[]{TypedProperties.class, JavaSparkContext.class, SchemaProvider.class},
-          cfg, jssc, schemaProvider);
+          new Class<?>[]{TypedProperties.class, JavaSparkContext.class, SparkSession.class, SchemaProvider.class},
+          cfg, jssc, sparkSession, schemaProvider);
    } catch (Throwable e) {
      throw new IOException("Could not load source class " + sourceClass, e);
    }
@@ -65,17 +70,31 @@ public class UtilHelpers {
  public static SchemaProvider createSchemaProvider(String schemaProviderClass,
      TypedProperties cfg, JavaSparkContext jssc) throws IOException {
    try {
-      return (SchemaProvider) ReflectionUtils.loadClass(schemaProviderClass, cfg, jssc);
+      return schemaProviderClass == null ? null :
+          (SchemaProvider) ReflectionUtils.loadClass(schemaProviderClass, cfg, jssc);
    } catch (Throwable e) {
      throw new IOException("Could not load schema provider class " + schemaProviderClass, e);
    }
  }

+  public static Transformer createTransformer(String transformerClass) throws IOException {
+    try {
+      return transformerClass == null ? null : (Transformer) ReflectionUtils.loadClass(transformerClass);
+    } catch (Throwable e) {
+      throw new IOException("Could not load transformer class " + transformerClass, e);
+    }
+  }
+
  /**
   */
-  public static DFSPropertiesConfiguration readConfig(FileSystem fs, Path cfgPath) {
+  public static DFSPropertiesConfiguration readConfig(FileSystem fs, Path cfgPath, List<String> overriddenProps) {
    try {
-      return new DFSPropertiesConfiguration(fs, cfgPath);
+      DFSPropertiesConfiguration conf = new DFSPropertiesConfiguration(fs, cfgPath);
+      if (!overriddenProps.isEmpty()) {
+        logger.info("Adding overridden properties to file properties.");
+        conf.addProperties(new BufferedReader(new StringReader(String.join("\n", overriddenProps))));
+      }
+      return conf;
    } catch (Exception e) {
      throw new HoodieException("Unable to read props file at :" + cfgPath, e);
    }
@@ -109,7 +128,7 @@ public class UtilHelpers {
      sparkConf.set("spark.eventLog.overwrite", "true");
      sparkConf.set("spark.eventLog.enabled", "true");
    }
-    sparkConf.set("spark.driver.maxResultSize", "2g");
+    sparkConf.setIfMissing("spark.driver.maxResultSize", "2g");
    sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    sparkConf.set("spark.hadoop.mapred.output.compress", "true");
    sparkConf.set("spark.hadoop.mapred.output.compression.codec", "true");
--- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/deltastreamer/HoodieDeltaStreamer.java
+++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/deltastreamer/HoodieDeltaStreamer.java
@@ -18,10 +18,15 @@

 package com.uber.hoodie.utilities.deltastreamer;

+import static com.uber.hoodie.utilities.schema.RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE;
+import static com.uber.hoodie.utilities.schema.RowBasedSchemaProvider.HOODIE_RECORD_STRUCT_NAME;
+
 import com.beust.jcommander.IStringConverter;
 import com.beust.jcommander.JCommander;
 import com.beust.jcommander.Parameter;
 import com.beust.jcommander.ParameterException;
+import com.codahale.metrics.Timer;
+import com.uber.hoodie.AvroConversionUtils;
 import com.uber.hoodie.DataSourceUtils;
 import com.uber.hoodie.HoodieWriteClient;
 import com.uber.hoodie.KeyGenerator;
@@ -36,32 +41,40 @@ import com.uber.hoodie.common.table.HoodieTimeline;
 import com.uber.hoodie.common.table.timeline.HoodieInstant;
 import com.uber.hoodie.common.util.FSUtils;
 import com.uber.hoodie.common.util.TypedProperties;
-import com.uber.hoodie.common.util.collection.Pair;
 import com.uber.hoodie.config.HoodieCompactionConfig;
 import com.uber.hoodie.config.HoodieIndexConfig;
 import com.uber.hoodie.config.HoodieWriteConfig;
+import com.uber.hoodie.hive.HiveSyncConfig;
+import com.uber.hoodie.hive.HiveSyncTool;
 import com.uber.hoodie.index.HoodieIndex;
 import com.uber.hoodie.utilities.HiveIncrementalPuller;
 import com.uber.hoodie.utilities.UtilHelpers;
 import com.uber.hoodie.utilities.exception.HoodieDeltaStreamerException;
-import com.uber.hoodie.utilities.schema.FilebasedSchemaProvider;
+import com.uber.hoodie.utilities.schema.RowBasedSchemaProvider;
 import com.uber.hoodie.utilities.schema.SchemaProvider;
+import com.uber.hoodie.utilities.sources.InputBatch;
 import com.uber.hoodie.utilities.sources.JsonDFSSource;
-import com.uber.hoodie.utilities.sources.Source;
+import com.uber.hoodie.utilities.transform.Transformer;
 import java.io.IOException;
 import java.io.Serializable;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Optional;
 import org.apache.avro.Schema;
 import org.apache.avro.generic.GenericRecord;
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SparkSession;
 import scala.collection.JavaConversions;

 /**
@@ -81,7 +94,7 @@ public class HoodieDeltaStreamer implements Serializable {
  /**
   * Source to pull deltas from
   */
-  private transient Source source;
+  private transient SourceFormatAdapter formatAdapter;

  /**
   * Schema provider that supplies the command for reading the input and writing out the target
@@ -89,6 +102,11 @@ public class HoodieDeltaStreamer implements Serializable {
   */
  private transient SchemaProvider schemaProvider;

+  /**
+   * Allows transforming source to target dataset before writing
+   */
+  private transient Transformer transformer;
+
  /**
   * Extract the key for the target dataset
   */
@@ -109,16 +127,30 @@ public class HoodieDeltaStreamer implements Serializable {
   */
  private transient JavaSparkContext jssc;

+  /**
+   * Spark Session
+   */
+  private transient SparkSession sparkSession;
+
+  /**
+   * Hive Config
+   */
+  private transient HiveConf hiveConf;

  /**
   * Bag of properties with source, hoodie client, key generator etc.
   */
  TypedProperties props;

-
  public HoodieDeltaStreamer(Config cfg, JavaSparkContext jssc) throws IOException {
+    this(cfg, jssc, FSUtils.getFs(cfg.targetBasePath, jssc.hadoopConfiguration()),
+        getDefaultHiveConf(jssc.hadoopConfiguration()));
+  }
+
+  public HoodieDeltaStreamer(Config cfg, JavaSparkContext jssc, FileSystem fs, HiveConf hiveConf) throws IOException {
    this.cfg = cfg;
    this.jssc = jssc;
+    this.sparkSession = SparkSession.builder().config(jssc.getConf()).getOrCreate();
    this.fs = FSUtils.getFs(cfg.targetBasePath, jssc.hadoopConfiguration());

    if (fs.exists(new Path(cfg.targetBasePath))) {
@@ -129,19 +161,28 @@ public class HoodieDeltaStreamer implements Serializable {
      this.commitTimelineOpt = Optional.empty();
    }

-    this.props = UtilHelpers.readConfig(fs, new Path(cfg.propsFilePath)).getConfig();
+    this.props = UtilHelpers.readConfig(fs, new Path(cfg.propsFilePath), cfg.configs).getConfig();
    log.info("Creating delta streamer with configs : " + props.toString());
    this.schemaProvider = UtilHelpers.createSchemaProvider(cfg.schemaProviderClassName, props, jssc);
+    this.transformer = UtilHelpers.createTransformer(cfg.transformerClassName);
    this.keyGenerator = DataSourceUtils.createKeyGenerator(cfg.keyGeneratorClass, props);
-    this.source = UtilHelpers.createSource(cfg.sourceClassName, props, jssc, schemaProvider);

-    // register the schemas, so that shuffle does not serialize the full schemas
-    List<Schema> schemas = Arrays.asList(schemaProvider.getSourceSchema(),
-        schemaProvider.getTargetSchema());
-    jssc.sc().getConf().registerAvroSchemas(JavaConversions.asScalaBuffer(schemas).toList());
+    this.formatAdapter =
+        new SourceFormatAdapter(UtilHelpers.createSource(cfg.sourceClassName, props, jssc, sparkSession,
+            schemaProvider));
+
+    this.hiveConf = hiveConf;
+  }
+
+  private static HiveConf getDefaultHiveConf(Configuration cfg) {
+    HiveConf hiveConf = new HiveConf();
+    hiveConf.addResource(cfg);
+    return hiveConf;
  }

  public void sync() throws Exception {
+    HoodieDeltaStreamerMetrics metrics = new HoodieDeltaStreamerMetrics(getHoodieClientConfig(null));
+    Timer.Context overallTimerContext = metrics.getOverallTimerContext();
    // Retrieve the previous round checkpoints, if any
    Optional<String> resumeCheckpointStr = Optional.empty();
    if (commitTimelineOpt.isPresent()) {
@@ -163,16 +204,42 @@ public class HoodieDeltaStreamer implements Serializable {
    }
    log.info("Checkpoint to resume from : " + resumeCheckpointStr);

-    // Pull the data from the source & prepare the write
-    Pair<Optional<JavaRDD<GenericRecord>>, String> dataAndCheckpoint = source.fetchNewData(
-        resumeCheckpointStr, cfg.sourceLimit);
+    final Optional<JavaRDD<GenericRecord>> avroRDDOptional;
+    final String checkpointStr;
+    final SchemaProvider schemaProvider;
+    if (transformer != null) {
+      // Transformation is needed. Fetch New rows in Row Format, apply transformation and then convert them
+      // to generic records for writing
+      InputBatch<Dataset<Row>> dataAndCheckpoint = formatAdapter.fetchNewDataInRowFormat(
+          resumeCheckpointStr, cfg.sourceLimit);

-    if (!dataAndCheckpoint.getKey().isPresent()) {
+      Optional<Dataset<Row>> transformed =
+          dataAndCheckpoint.getBatch().map(data -> transformer.apply(jssc, sparkSession, data, props));
+      checkpointStr = dataAndCheckpoint.getCheckpointForNextBatch();
+      avroRDDOptional = transformed.map(t ->
+         AvroConversionUtils.createRdd(t, HOODIE_RECORD_STRUCT_NAME, HOODIE_RECORD_NAMESPACE).toJavaRDD()
+      );
+      // Use Transformed Row's schema if not overridden
+      schemaProvider =
+          this.schemaProvider == null ? transformed.map(r -> (SchemaProvider)new RowBasedSchemaProvider(r.schema()))
+              .orElse(dataAndCheckpoint.getSchemaProvider()) : this.schemaProvider;
+    } else {
+      // Pull the data from the source & prepare the write
+      InputBatch<JavaRDD<GenericRecord>> dataAndCheckpoint =
+          formatAdapter.fetchNewDataInAvroFormat(resumeCheckpointStr, cfg.sourceLimit);
+      avroRDDOptional = dataAndCheckpoint.getBatch();
+      checkpointStr = dataAndCheckpoint.getCheckpointForNextBatch();
+      schemaProvider = dataAndCheckpoint.getSchemaProvider();
+    }
+
+    if ((!avroRDDOptional.isPresent()) || (avroRDDOptional.get().isEmpty())) {
      log.info("No new data, nothing to commit.. ");
      return;
    }

-    JavaRDD<GenericRecord> avroRDD = dataAndCheckpoint.getKey().get();
+    registerAvroSchemas(schemaProvider);
+
+    JavaRDD<GenericRecord> avroRDD = avroRDDOptional.get();
    JavaRDD<HoodieRecord> records = avroRDD.map(gr -> {
      HoodieRecordPayload payload = DataSourceUtils.createPayload(cfg.payloadClassName, gr,
          (Comparable) gr.get(cfg.sourceOrderingField));
@@ -180,20 +247,20 @@ public class HoodieDeltaStreamer implements Serializable {
    });

    // filter dupes if needed
-    HoodieWriteConfig hoodieCfg = getHoodieClientConfig();
+    HoodieWriteConfig hoodieCfg = getHoodieClientConfig(schemaProvider);
    if (cfg.filterDupes) {
      // turn upserts to insert
      cfg.operation = cfg.operation == Operation.UPSERT ? Operation.INSERT : cfg.operation;
      records = DataSourceUtils.dropDuplicates(jssc, records, hoodieCfg);
-    }

-    if (records.isEmpty()) {
-      log.info("No new data, nothing to commit.. ");
-      return;
+      if (records.isEmpty()) {
+        log.info("No new data, nothing to commit.. ");
+        return;
+      }
    }

    // Perform the write
-    HoodieWriteClient client = new HoodieWriteClient<>(jssc, hoodieCfg);
+    HoodieWriteClient client = new HoodieWriteClient<>(jssc, hoodieCfg, true);
    String commitTime = client.startCommit();
    log.info("Starting commit  : " + commitTime);

@@ -210,7 +277,7 @@ public class HoodieDeltaStreamer implements Serializable {

    // Simply commit for now. TODO(vc): Support better error handlers later on
    HashMap<String, String> checkpointCommitMetadata = new HashMap<>();
-    checkpointCommitMetadata.put(CHECKPOINT_KEY, dataAndCheckpoint.getValue());
+    checkpointCommitMetadata.put(CHECKPOINT_KEY, checkpointStr);

    boolean success = client.commit(commitTime, writeStatusRDD,
        Optional.of(checkpointCommitMetadata));
@@ -220,17 +287,54 @@ public class HoodieDeltaStreamer implements Serializable {
    } else {
      log.info("Commit " + commitTime + " failed!");
    }
+
+    // Sync to hive if enabled
+    Timer.Context hiveSyncContext = metrics.getHiveSyncTimerContext();
+    syncHive();
+    long hiveSyncTimeMs = hiveSyncContext != null ? hiveSyncContext.stop() : 0;
+
    client.close();
+    long overallTimeMs = overallTimerContext != null ? overallTimerContext.stop() : 0;
+
+    // Send DeltaStreamer Metrics
+    metrics.updateDeltaStreamerMetrics(overallTimeMs, hiveSyncTimeMs);
  }

-  private HoodieWriteConfig getHoodieClientConfig() throws Exception {
-    return HoodieWriteConfig.newBuilder().combineInput(true, true).withPath(cfg.targetBasePath)
-        .withAutoCommit(false)
-        .withSchema(schemaProvider.getTargetSchema().toString())
-        .withCompactionConfig(HoodieCompactionConfig.newBuilder().withPayloadClass(cfg.payloadClassName).build())
-        .forTable(cfg.targetTableName)
-        .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
-        .withProps(props).build();
+  public void syncHive() {
+    if (cfg.enableHiveSync) {
+      HiveSyncConfig hiveSyncConfig = DataSourceUtils.buildHiveSyncConfig(props, cfg.targetBasePath);
+      log.info("Syncing target hoodie table with hive table(" + hiveSyncConfig.tableName
+          + "). Hive metastore URL :" + hiveSyncConfig.jdbcUrl + ", basePath :" + cfg.targetBasePath);
+
+      new HiveSyncTool(hiveSyncConfig, hiveConf, fs).syncHoodieTable();
+    }
+  }
+
+  /**
+   * Register Avro Schemas
+   * @param schemaProvider Schema Provider
+   */
+  private void registerAvroSchemas(SchemaProvider schemaProvider) {
+    // register the schemas, so that shuffle does not serialize the full schemas
+    if (null != schemaProvider) {
+      List<Schema> schemas = Arrays.asList(schemaProvider.getSourceSchema(), schemaProvider.getTargetSchema());
+      log.info("Registering Schema :" + schemas);
+      jssc.sc().getConf().registerAvroSchemas(JavaConversions.asScalaBuffer(schemas).toList());
+    }
+  }
+
+  private HoodieWriteConfig getHoodieClientConfig(SchemaProvider schemaProvider) throws Exception {
+    HoodieWriteConfig.Builder builder =
+        HoodieWriteConfig.newBuilder().combineInput(true, true).withPath(cfg.targetBasePath)
+            .withAutoCommit(false)
+            .withCompactionConfig(HoodieCompactionConfig.newBuilder().withPayloadClass(cfg.payloadClassName).build())
+            .forTable(cfg.targetTableName)
+            .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
+            .withProps(props);
+    if (null != schemaProvider) {
+      builder = builder.withSchema(schemaProvider.getTargetSchema().toString());
+    }
+    return builder.build();
  }

  public enum Operation {
@@ -266,6 +370,10 @@ public class HoodieDeltaStreamer implements Serializable {
    public String propsFilePath =
        "file://" + System.getProperty("user.dir") + "/src/test/resources/delta-streamer-config/dfs-source.properties";

+    @Parameter(names = {"--hoodie-conf"}, description = "Any configuration that can be set in the properties file "
+        + "(using the CLI parameter \"--propsFilePath\") can also be passed command line using this parameter")
+    public List<String> configs = new ArrayList<>();
+
    @Parameter(names = {"--source-class"}, description = "Subclass of com.uber.hoodie.utilities.sources to read data. "
        + "Built-in options: com.uber.hoodie.utilities.sources.{JsonDFSSource (default), AvroDFSSource, "
        + "JsonKafkaSource, AvroKafkaSource, HiveIncrPullSource}")
@@ -285,11 +393,22 @@ public class HoodieDeltaStreamer implements Serializable {
    public String payloadClassName = OverwriteWithLatestAvroPayload.class.getName();

    @Parameter(names = {"--schemaprovider-class"}, description = "subclass of com.uber.hoodie.utilities.schema"
-        + ".SchemaProvider to attach schemas to input & target table data, built in options: FilebasedSchemaProvider")
-    public String schemaProviderClassName = FilebasedSchemaProvider.class.getName();
+        + ".SchemaProvider to attach schemas to input & target table data, built in options: "
+        + "com.uber.hoodie.utilities.schema.FilebasedSchemaProvider."
+        + "Source (See com.uber.hoodie.utilities.sources.Source) implementation can implement their own SchemaProvider."
+        + " For Sources that return Dataset<Row>, the schema is obtained implicitly. "
+        + "However, this CLI option allows overriding the schemaprovider returned by Source.")
+    public String schemaProviderClassName = null;
+
+    @Parameter(names = {"--transformer-class"},
+        description = "subclass of com.uber.hoodie.utilities.transform.Transformer"
+        + ". Allows transforming raw source dataset to a target dataset (conforming to target schema) before writing."
+            + " Default : Not set. E:g - com.uber.hoodie.utilities.transform.SqlQueryBasedTransformer (which allows"
+            + "a SQL query templated to be passed as a transformation function)")
+    public String transformerClassName = null;

    @Parameter(names = {"--source-limit"}, description = "Maximum amount of data to read from source. "
-        + "Default: No limit For e.g: DFSSource => max bytes to read, KafkaSource => max events to read")
+        + "Default: No limit For e.g: DFS-Source => max bytes to read, Kafka-Source => max events to read")
    public long sourceLimit = Long.MAX_VALUE;

    @Parameter(names = {"--op"}, description = "Takes one of these values : UPSERT (default), INSERT (use when input "
@@ -301,6 +420,9 @@ public class HoodieDeltaStreamer implements Serializable {
        + "before insert/bulk-insert")
    public Boolean filterDupes = false;

+    @Parameter(names = {"--enable-hive-sync"}, description = "Enable syncing to hive")
+    public Boolean enableHiveSync = false;
+
    @Parameter(names = {"--spark-master"}, description = "spark master to use.")
    public String sparkMaster = "local[2]";

@@ -319,4 +441,44 @@ public class HoodieDeltaStreamer implements Serializable {
    JavaSparkContext jssc = UtilHelpers.buildSparkContext("delta-streamer-" + cfg.targetTableName, cfg.sparkMaster);
    new HoodieDeltaStreamer(cfg, jssc).sync();
  }
+
+  public SourceFormatAdapter getFormatAdapter() {
+    return formatAdapter;
+  }
+
+  public SchemaProvider getSchemaProvider() {
+    return schemaProvider;
+  }
+
+  public Transformer getTransformer() {
+    return transformer;
+  }
+
+  public KeyGenerator getKeyGenerator() {
+    return keyGenerator;
+  }
+
+  public FileSystem getFs() {
+    return fs;
+  }
+
+  public Optional<HoodieTimeline> getCommitTimelineOpt() {
+    return commitTimelineOpt;
+  }
+
+  public JavaSparkContext getJssc() {
+    return jssc;
+  }
+
+  public SparkSession getSparkSession() {
+    return sparkSession;
+  }
+
+  public HiveConf getHiveConf() {
+    return hiveConf;
+  }
+
+  public TypedProperties getProps() {
+    return props;
+  }
 }
--- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/deltastreamer/HoodieDeltaStreamerMetrics.java
+++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/deltastreamer/HoodieDeltaStreamerMetrics.java
@@ -0,0 +1,61 @@
+package com.uber.hoodie.utilities.deltastreamer;
+
+import static com.uber.hoodie.metrics.Metrics.registerGauge;
+
+import com.codahale.metrics.Timer;
+import com.uber.hoodie.config.HoodieWriteConfig;
+import com.uber.hoodie.metrics.Metrics;
+
+public class HoodieDeltaStreamerMetrics {
+
+  private HoodieWriteConfig config = null;
+  private String tableName = null;
+
+  public String overallTimerName = null;
+  public String hiveSyncTimerName = null;
+  private Timer overallTimer = null;
+  public Timer hiveSyncTimer = null;
+
+  public HoodieDeltaStreamerMetrics(HoodieWriteConfig config) {
+    this.config = config;
+    this.tableName = config.getTableName();
+    if (config.isMetricsOn()) {
+      Metrics.init(config);
+      this.overallTimerName = getMetricsName("timer", "deltastreamer");
+      this.hiveSyncTimerName = getMetricsName("timer", "deltastreamerHiveSync");
+    }
+  }
+
+  public Timer.Context getOverallTimerContext() {
+    if (config.isMetricsOn() && overallTimer == null) {
+      overallTimer = createTimer(overallTimerName);
+    }
+    return overallTimer == null ? null : overallTimer.time();
+  }
+
+  public Timer.Context getHiveSyncTimerContext() {
+    if (config.isMetricsOn() && hiveSyncTimer == null) {
+      hiveSyncTimer = createTimer(hiveSyncTimerName);
+    }
+    return hiveSyncTimer == null ? null : hiveSyncTimer.time();
+  }
+
+  private Timer createTimer(String name) {
+    return config.isMetricsOn() ? Metrics.getInstance().getRegistry().timer(name) : null;
+  }
+
+  String getMetricsName(String action, String metric) {
+    return config == null ? null : String.format("%s.%s.%s", tableName, action, metric);
+  }
+
+  public void updateDeltaStreamerMetrics(long durationInNs, long hiveSyncNs) {
+    if (config.isMetricsOn()) {
+      registerGauge(getMetricsName("deltastreamer", "duration"), getDurationInMs(durationInNs));
+      registerGauge(getMetricsName("deltastreamer", "hiveSyncDuration"), getDurationInMs(hiveSyncNs));
+    }
+  }
+
+  public long getDurationInMs(long ctxDuration) {
+    return ctxDuration / 1000000;
+  }
+}
--- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/deltastreamer/SourceFormatAdapter.java
+++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/deltastreamer/SourceFormatAdapter.java
@@ -0,0 +1,112 @@
+/*
+ *  Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *           http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *
+ */
+
+package com.uber.hoodie.utilities.deltastreamer;
+
+import static com.uber.hoodie.utilities.schema.RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE;
+import static com.uber.hoodie.utilities.schema.RowBasedSchemaProvider.HOODIE_RECORD_STRUCT_NAME;
+
+import com.uber.hoodie.AvroConversionUtils;
+import com.uber.hoodie.utilities.sources.AvroSource;
+import com.uber.hoodie.utilities.sources.InputBatch;
+import com.uber.hoodie.utilities.sources.JsonSource;
+import com.uber.hoodie.utilities.sources.RowSource;
+import com.uber.hoodie.utilities.sources.Source;
+import com.uber.hoodie.utilities.sources.helpers.AvroConvertor;
+import java.util.Optional;
+import org.apache.avro.Schema;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.types.StructType;
+
+/**
+ * Adapts data-format provided by the source to the data-format required by the client (DeltaStreamer)
+ */
+public final class SourceFormatAdapter {
+
+  private final Source source;
+
+
+  public SourceFormatAdapter(Source source) {
+    this.source = source;
+  }
+
+  /**
+   * Fetch new data in avro format. If the source provides data in different format, they are translated
+   * to Avro format
+   * @param lastCkptStr
+   * @param sourceLimit
+   * @return
+   */
+  public InputBatch<JavaRDD<GenericRecord>> fetchNewDataInAvroFormat(Optional<String> lastCkptStr,
+      long sourceLimit) {
+    switch (source.getSourceType()) {
+      case AVRO:
+        return ((AvroSource)source).fetchNext(lastCkptStr, sourceLimit);
+      case JSON: {
+        InputBatch<JavaRDD<String>> r = ((JsonSource)source).fetchNext(lastCkptStr, sourceLimit);
+        AvroConvertor convertor = new AvroConvertor(r.getSchemaProvider().getSourceSchema());
+        return new InputBatch<>(Optional.ofNullable(
+            r.getBatch().map(rdd -> rdd.map(convertor::fromJson))
+                .orElse(null)), r.getCheckpointForNextBatch(), r.getSchemaProvider());
+      }
+      case ROW: {
+        InputBatch<Dataset<Row>> r = ((RowSource)source).fetchNext(lastCkptStr, sourceLimit);
+        return new InputBatch<>(Optional.ofNullable(r.getBatch().map(
+            rdd -> (AvroConversionUtils.createRdd(rdd, HOODIE_RECORD_STRUCT_NAME, HOODIE_RECORD_NAMESPACE).toJavaRDD()))
+            .orElse(null)), r.getCheckpointForNextBatch(), r.getSchemaProvider());
+      }
+      default:
+        throw new IllegalArgumentException("Unknown source type (" + source.getSourceType() + ")");
+    }
+  }
+
+  /**
+   * Fetch new data in row format. If the source provides data in different format, they are translated
+   * to Row format
+   * @param lastCkptStr
+   * @param sourceLimit
+   * @return
+   */
+  public InputBatch<Dataset<Row>> fetchNewDataInRowFormat(Optional<String> lastCkptStr, long sourceLimit) {
+    switch (source.getSourceType()) {
+      case ROW:
+        return ((RowSource)source).fetchNext(lastCkptStr, sourceLimit);
+      case AVRO: {
+        InputBatch<JavaRDD<GenericRecord>> r = ((AvroSource)source).fetchNext(lastCkptStr, sourceLimit);
+        Schema sourceSchema = r.getSchemaProvider().getSourceSchema();
+        return new InputBatch<>(Optional.ofNullable(
+            r.getBatch().map(rdd -> AvroConversionUtils.createDataFrame(JavaRDD.toRDD(rdd),
+                sourceSchema.toString(), source.getSparkSession()))
+                .orElse(null)), r.getCheckpointForNextBatch(), r.getSchemaProvider());
+      }
+      case JSON: {
+        InputBatch<JavaRDD<String>> r = ((JsonSource)source).fetchNext(lastCkptStr, sourceLimit);
+        Schema sourceSchema = r.getSchemaProvider().getSourceSchema();
+        StructType dataType = AvroConversionUtils.convertAvroSchemaToStructType(sourceSchema);
+        return new InputBatch<>(Optional.ofNullable(
+            r.getBatch().map(rdd -> source.getSparkSession().read().schema(dataType).json(rdd))
+                .orElse(null)), r.getCheckpointForNextBatch(), r.getSchemaProvider());
+      }
+      default:
+        throw new IllegalArgumentException("Unknown source type (" + source.getSourceType() + ")");
+    }
+  }
+}
--- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/schema/RowBasedSchemaProvider.java
+++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/schema/RowBasedSchemaProvider.java
@@ -0,0 +1,25 @@
+package com.uber.hoodie.utilities.schema;
+
+import com.uber.hoodie.AvroConversionUtils;
+import org.apache.avro.Schema;
+import org.apache.spark.sql.types.StructType;
+
+public class RowBasedSchemaProvider extends SchemaProvider {
+
+  // Used in GenericRecord conversions
+  public static final String HOODIE_RECORD_NAMESPACE = "hoodie.source";
+  public static final String HOODIE_RECORD_STRUCT_NAME = "hoodie_source";
+
+  private StructType rowStruct;
+
+  public RowBasedSchemaProvider(StructType rowStruct) {
+    super(null, null);
+    this.rowStruct = rowStruct;
+  }
+
+  @Override
+  public Schema getSourceSchema() {
+    return AvroConversionUtils.convertStructTypeToAvroSchema(rowStruct, HOODIE_RECORD_STRUCT_NAME,
+        HOODIE_RECORD_NAMESPACE);
+  }
+}
--- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/schema/SchemaRegistryProvider.java
+++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/schema/SchemaRegistryProvider.java
@@ -42,12 +42,15 @@ public class SchemaRegistryProvider extends SchemaProvider {
   */
  public static class Config {

-    private static final String SCHEMA_REGISTRY_URL_PROP = "hoodie.deltastreamer.schemaprovider.registry.url";
+    private static final String SRC_SCHEMA_REGISTRY_URL_PROP = "hoodie.deltastreamer.schemaprovider.registry.url";
+    private static final String TARGET_SCHEMA_REGISTRY_URL_PROP =
+        "hoodie.deltastreamer.schemaprovider.registry.targetUrl";
  }

  private final Schema schema;
+  private final Schema targetSchema;

-  private String fetchSchemaFromRegistry(String registryUrl) throws IOException {
+  private static String fetchSchemaFromRegistry(String registryUrl) throws IOException {
    URL registry = new URL(registryUrl);
    ObjectMapper mapper = new ObjectMapper();
    JsonNode node = mapper.readTree(registry.openStream());
@@ -56,17 +59,32 @@ public class SchemaRegistryProvider extends SchemaProvider {

  public SchemaRegistryProvider(TypedProperties props, JavaSparkContext jssc) {
    super(props, jssc);
-    DataSourceUtils.checkRequiredProperties(props, Collections.singletonList(Config.SCHEMA_REGISTRY_URL_PROP));
-    String registryUrl = props.getString(Config.SCHEMA_REGISTRY_URL_PROP);
+    DataSourceUtils.checkRequiredProperties(props, Collections.singletonList(Config.SRC_SCHEMA_REGISTRY_URL_PROP));
+    String registryUrl = props.getString(Config.SRC_SCHEMA_REGISTRY_URL_PROP);
+    String targetRegistryUrl = props.getString(Config.TARGET_SCHEMA_REGISTRY_URL_PROP, registryUrl);
    try {
-      this.schema = new Schema.Parser().parse(fetchSchemaFromRegistry(registryUrl));
+      this.schema = getSchema(registryUrl);
+      if (!targetRegistryUrl.equals(registryUrl)) {
+        this.targetSchema = getSchema(targetRegistryUrl);
+      } else {
+        this.targetSchema = schema;
+      }
    } catch (IOException ioe) {
      throw new HoodieIOException("Error reading schema from registry :" + registryUrl, ioe);
    }
  }

+  private static Schema getSchema(String registryUrl) throws IOException {
+    return new Schema.Parser().parse(fetchSchemaFromRegistry(registryUrl));
+  }
+
  @Override
  public Schema getSourceSchema() {
    return schema;
  }
+
+  @Override
+  public Schema getTargetSchema() {
+    return targetSchema;
+  }
 }
--- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroDFSSource.java
+++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroDFSSource.java
@@ -19,7 +19,10 @@
 package com.uber.hoodie.utilities.sources;

 import com.uber.hoodie.common.util.TypedProperties;
+import com.uber.hoodie.common.util.collection.Pair;
 import com.uber.hoodie.utilities.schema.SchemaProvider;
+import com.uber.hoodie.utilities.sources.helpers.DFSPathSelector;
+import java.util.Optional;
 import org.apache.avro.generic.GenericRecord;
 import org.apache.avro.mapred.AvroKey;
 import org.apache.avro.mapreduce.AvroKeyInputFormat;
@@ -27,18 +30,33 @@ import org.apache.hadoop.io.NullWritable;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SparkSession;

 /**
 * DFS Source that reads avro data
 */
-public class AvroDFSSource extends DFSSource {
+public class AvroDFSSource extends AvroSource {

-  public AvroDFSSource(TypedProperties props, JavaSparkContext sparkContext, SchemaProvider schemaProvider) {
-    super(props, sparkContext, schemaProvider);
+  private final DFSPathSelector pathSelector;
+
+  public AvroDFSSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession,
+      SchemaProvider schemaProvider) {
+    super(props, sparkContext, sparkSession, schemaProvider);
+    this.pathSelector = new DFSPathSelector(props, sparkContext.hadoopConfiguration());
  }

  @Override
-  protected JavaRDD<GenericRecord> fromFiles(AvroConvertor convertor, String pathStr) {
+  protected InputBatch<JavaRDD<GenericRecord>> fetchNewData(Optional<String> lastCkptStr,
+      long sourceLimit) {
+    Pair<Optional<String>, String> selectPathsWithMaxModificationTime =
+        pathSelector.getNextFilePathsAndMaxModificationTime(lastCkptStr, sourceLimit);
+    return selectPathsWithMaxModificationTime.getLeft().map(pathStr -> new InputBatch<>(
+        Optional.of(fromFiles(pathStr)),
+        selectPathsWithMaxModificationTime.getRight()))
+        .orElseGet(() -> new InputBatch<>(Optional.empty(), selectPathsWithMaxModificationTime.getRight()));
+  }
+
+  private JavaRDD<GenericRecord> fromFiles(String pathStr) {
    JavaPairRDD<AvroKey, NullWritable> avroRDD = sparkContext.newAPIHadoopFile(pathStr,
        AvroKeyInputFormat.class, AvroKey.class, NullWritable.class,
        sparkContext.hadoopConfiguration());
--- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroKafkaSource.java
+++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroKafkaSource.java
@@ -20,27 +20,55 @@ package com.uber.hoodie.utilities.sources;

 import com.uber.hoodie.common.util.TypedProperties;
 import com.uber.hoodie.utilities.schema.SchemaProvider;
+import com.uber.hoodie.utilities.sources.helpers.KafkaOffsetGen;
+import com.uber.hoodie.utilities.sources.helpers.KafkaOffsetGen.CheckpointUtils;
 import io.confluent.kafka.serializers.KafkaAvroDecoder;
+import java.util.Optional;
 import kafka.serializer.StringDecoder;
 import org.apache.avro.generic.GenericRecord;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SparkSession;
 import org.apache.spark.streaming.kafka.KafkaUtils;
 import org.apache.spark.streaming.kafka.OffsetRange;

 /**
 * Reads avro serialized Kafka data, based on the confluent schema-registry
 */
-public class AvroKafkaSource extends KafkaSource {
+public class AvroKafkaSource extends AvroSource {

-  public AvroKafkaSource(TypedProperties props, JavaSparkContext sparkContext, SchemaProvider schemaProvider) {
-    super(props, sparkContext, schemaProvider);
+  private static Logger log = LogManager.getLogger(AvroKafkaSource.class);
+
+  private final KafkaOffsetGen offsetGen;
+
+  public AvroKafkaSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession,
+      SchemaProvider schemaProvider) {
+    super(props, sparkContext, sparkSession, schemaProvider);
+    offsetGen = new KafkaOffsetGen(props);
  }

  @Override
-  protected JavaRDD<GenericRecord> toAvroRDD(OffsetRange[] offsetRanges, AvroConvertor avroConvertor) {
-    return KafkaUtils
-        .createRDD(sparkContext, String.class, Object.class, StringDecoder.class, KafkaAvroDecoder.class, kafkaParams,
-            offsetRanges).values().map(obj -> (GenericRecord) obj);
+  protected InputBatch<JavaRDD<GenericRecord>> fetchNewData(Optional<String> lastCheckpointStr,
+      long sourceLimit) {
+    OffsetRange[] offsetRanges = offsetGen.getNextOffsetRanges(lastCheckpointStr, sourceLimit);
+    long totalNewMsgs = CheckpointUtils.totalNewMessages(offsetRanges);
+    if (totalNewMsgs <= 0) {
+      return new InputBatch<>(Optional.empty(),
+          lastCheckpointStr.isPresent() ? lastCheckpointStr.get() : "");
+    } else {
+      log.info("About to read " + totalNewMsgs + " from Kafka for topic :" + offsetGen.getTopicName());
+    }
+    JavaRDD<GenericRecord> newDataRDD = toRDD(offsetRanges);
+    return new InputBatch<>(Optional.of(newDataRDD),
+        KafkaOffsetGen.CheckpointUtils.offsetsToStr(offsetRanges));
+  }
+
+  private JavaRDD<GenericRecord> toRDD(OffsetRange[] offsetRanges) {
+    JavaRDD<GenericRecord> recordRDD = KafkaUtils
+        .createRDD(sparkContext, String.class, Object.class, StringDecoder.class, KafkaAvroDecoder.class,
+            offsetGen.getKafkaParams(), offsetRanges).values().map(obj -> (GenericRecord) obj);
+    return recordRDD;
  }
 }
--- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroSource.java
+++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroSource.java
@@ -0,0 +1,36 @@
+/*
+ *  Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *           http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *
+ */
+
+package com.uber.hoodie.utilities.sources;
+
+import com.uber.hoodie.common.util.TypedProperties;
+import com.uber.hoodie.utilities.schema.SchemaProvider;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SparkSession;
+
+public abstract class AvroSource extends Source<JavaRDD<GenericRecord>> {
+
+  public AvroSource(TypedProperties props,
+      JavaSparkContext sparkContext,
+      SparkSession sparkSession,
+      SchemaProvider schemaProvider) {
+    super(props, sparkContext, sparkSession, schemaProvider, SourceType.AVRO);
+  }
+}
--- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/HiveIncrPullSource.java
+++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/HiveIncrPullSource.java
@@ -21,8 +21,6 @@ package com.uber.hoodie.utilities.sources;
 import com.uber.hoodie.DataSourceUtils;
 import com.uber.hoodie.common.util.FSUtils;
 import com.uber.hoodie.common.util.TypedProperties;
-import com.uber.hoodie.common.util.collection.ImmutablePair;
-import com.uber.hoodie.common.util.collection.Pair;
 import com.uber.hoodie.exception.HoodieIOException;
 import com.uber.hoodie.utilities.schema.SchemaProvider;
 import java.io.IOException;
@@ -44,19 +42,20 @@ import org.apache.log4j.Logger;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SparkSession;

 /**
- * Source to read deltas produced by {@link com.uber.hoodie.utilities.HiveIncrementalPuller}, commit
- * by commit and apply to the target table
+ * Source to read deltas produced by {@link com.uber.hoodie.utilities.HiveIncrementalPuller}, commit by commit and apply
+ * to the target table
 * <p>
 * The general idea here is to have commits sync across the data pipeline.
 * <p>
- * [Source Tables(s)]  ====> HiveIncrementalScanner  ==> incrPullRootPath ==> targetTable
- * {c1,c2,c3,...}                                       {c1,c2,c3,...}       {c1,c2,c3,...}
+ * [Source Tables(s)]  ====> HiveIncrementalScanner  ==> incrPullRootPath ==> targetTable {c1,c2,c3,...}
+ * {c1,c2,c3,...}       {c1,c2,c3,...}
 * <p>
 * This produces beautiful causality, that makes data issues in ETLs very easy to debug
 */
-public class HiveIncrPullSource extends Source {
+public class HiveIncrPullSource extends AvroSource {

  private static volatile Logger log = LogManager.getLogger(HiveIncrPullSource.class);

@@ -73,9 +72,9 @@ public class HiveIncrPullSource extends Source {
    private static final String ROOT_INPUT_PATH_PROP = "hoodie.deltastreamer.source.incrpull.root";
  }

-  public HiveIncrPullSource(TypedProperties props, JavaSparkContext sparkContext,
+  public HiveIncrPullSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession,
      SchemaProvider schemaProvider) {
-    super(props, sparkContext, schemaProvider);
+    super(props, sparkContext, sparkSession, schemaProvider);
    DataSourceUtils.checkRequiredProperties(props, Collections.singletonList(Config.ROOT_INPUT_PATH_PROP));
    this.incrPullRootPath = props.getString(Config.ROOT_INPUT_PATH_PROP);
    this.fs = FSUtils.getFs(incrPullRootPath, sparkContext.hadoopConfiguration());
@@ -113,15 +112,15 @@ public class HiveIncrPullSource extends Source {
  }

  @Override
-  public Pair<Optional<JavaRDD<GenericRecord>>, String> fetchNewData(
+  protected InputBatch<JavaRDD<GenericRecord>> fetchNewData(
      Optional<String> lastCheckpointStr, long sourceLimit) {
    try {
      // find the source commit to pull
      Optional<String> commitToPull = findCommitToPull(lastCheckpointStr);

      if (!commitToPull.isPresent()) {
-        return new ImmutablePair<>(Optional.empty(),
-                lastCheckpointStr.orElse(""));
+        return new InputBatch<>(Optional.empty(),
+            lastCheckpointStr.isPresent() ? lastCheckpointStr.get() : "");
      }

      // read the files out.
@@ -132,7 +131,7 @@ public class HiveIncrPullSource extends Source {
      JavaPairRDD<AvroKey, NullWritable> avroRDD = sparkContext.newAPIHadoopFile(pathStr,
          AvroKeyInputFormat.class, AvroKey.class, NullWritable.class,
          sparkContext.hadoopConfiguration());
-      return new ImmutablePair<>(Optional.of(avroRDD.keys().map(r -> ((GenericRecord) r.datum()))),
+      return new InputBatch<>(Optional.of(avroRDD.keys().map(r -> ((GenericRecord) r.datum()))),
          String.valueOf(commitToPull.get()));
    } catch (IOException ioe) {
      throw new HoodieIOException(
--- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/HoodieIncrSource.java
+++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/HoodieIncrSource.java
@@ -0,0 +1,144 @@
+package com.uber.hoodie.utilities.sources;
+
+import com.uber.hoodie.DataSourceReadOptions;
+import com.uber.hoodie.DataSourceUtils;
+import com.uber.hoodie.common.model.HoodieRecord;
+import com.uber.hoodie.common.util.TypedProperties;
+import com.uber.hoodie.common.util.collection.Pair;
+import com.uber.hoodie.hive.SlashEncodedDayPartitionValueExtractor;
+import com.uber.hoodie.utilities.schema.SchemaProvider;
+import com.uber.hoodie.utilities.sources.helpers.IncrSourceHelper;
+import java.util.Arrays;
+import java.util.Optional;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.DataFrameReader;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SparkSession;
+
+public class HoodieIncrSource extends RowSource {
+
+  /**
+   * Configs supported
+   */
+  protected static class Config {
+
+    /**
+     * {@value #HOODIE_SRC_BASE_PATH} is the base-path for the source Hoodie table
+     */
+    private static final String HOODIE_SRC_BASE_PATH = "hoodie.deltastreamer.source.hoodieincr.path";
+
+    /**
+     * {@value #NUM_INSTANTS_PER_FETCH} allows the max number of instants whose changes can be incrementally fetched
+     */
+    private static final String NUM_INSTANTS_PER_FETCH = "hoodie.deltastreamer.source.hoodieincr.num_instants";
+    private static final Integer DEFAULT_NUM_INSTANTS_PER_FETCH = 1;
+
+    /**
+     * {@value #HOODIE_SRC_PARTITION_FIELDS} specifies partition fields that needs to be added to source table after
+     * parsing _hoodie_partition_path
+     */
+    private static final String HOODIE_SRC_PARTITION_FIELDS = "hoodie.deltastreamer.source.hoodieincr.partition.fields";
+
+    /**
+     * {@value #HOODIE_SRC_PARTITION_EXTRACTORCLASS} PartitionValueExtractor class to extract partition fields from
+     * _hoodie_partition_path
+     */
+    private static final String HOODIE_SRC_PARTITION_EXTRACTORCLASS =
+        "hoodie.deltastreamer.source.hoodieincr.partition.extractor.class";
+    private static final String DEFAULT_HOODIE_SRC_PARTITION_EXTRACTORCLASS =
+        SlashEncodedDayPartitionValueExtractor.class.getCanonicalName();
+
+    /**
+     * {@value #READ_LATEST_INSTANT_ON_MISSING_CKPT} allows delta-streamer to incrementally fetch from latest committed
+     * instant when checkpoint is not provided.
+     */
+    private static final String READ_LATEST_INSTANT_ON_MISSING_CKPT =
+        "hoodie.deltastreamer.source.hoodieincr.read_latest_on_missing_ckpt";
+    private static final Boolean DEFAULT_READ_LATEST_INSTANT_ON_MISSING_CKPT = false;
+  }
+
+  public HoodieIncrSource(TypedProperties props,
+      JavaSparkContext sparkContext, SparkSession sparkSession,
+      SchemaProvider schemaProvider) {
+    super(props, sparkContext, sparkSession, schemaProvider);
+  }
+
+  @Override
+  public Pair<Optional<Dataset<Row>>, String> fetchNextBatch(Optional<String> lastCkptStr, long sourceLimit) {
+
+    DataSourceUtils.checkRequiredProperties(props, Arrays.asList(Config.HOODIE_SRC_BASE_PATH));
+
+    /**
+     DataSourceUtils.checkRequiredProperties(props, Arrays.asList(Config.HOODIE_SRC_BASE_PATH,
+     Config.HOODIE_SRC_PARTITION_FIELDS));
+    List<String> partitionFields = props.getStringList(Config.HOODIE_SRC_PARTITION_FIELDS, ",",
+        new ArrayList<>());
+    PartitionValueExtractor extractor = DataSourceUtils.createPartitionExtractor(props.getString(
+        Config.HOODIE_SRC_PARTITION_EXTRACTORCLASS, Config.DEFAULT_HOODIE_SRC_PARTITION_EXTRACTORCLASS));
+    **/
+    String srcPath = props.getString(Config.HOODIE_SRC_BASE_PATH);
+    int numInstantsPerFetch = props.getInteger(Config.NUM_INSTANTS_PER_FETCH, Config.DEFAULT_NUM_INSTANTS_PER_FETCH);
+    boolean readLatestOnMissingCkpt = props.getBoolean(Config.READ_LATEST_INSTANT_ON_MISSING_CKPT,
+        Config.DEFAULT_READ_LATEST_INSTANT_ON_MISSING_CKPT);
+
+    // Use begin Instant if set and non-empty
+    Optional<String> beginInstant =
+        lastCkptStr.isPresent() ? lastCkptStr.get().isEmpty() ? Optional.empty() : lastCkptStr : Optional.empty();
+
+    Pair<String, String> instantEndpts = IncrSourceHelper.calculateBeginAndEndInstants(sparkContext, srcPath,
+        numInstantsPerFetch, beginInstant, readLatestOnMissingCkpt);
+
+    if (instantEndpts.getKey().equals(instantEndpts.getValue())) {
+      log.warn("Already caught up. Begin Checkpoint was :" + instantEndpts.getKey());
+      return Pair.of(Optional.empty(), instantEndpts.getKey());
+    }
+
+    // Do Incr pull. Set end instant if available
+    DataFrameReader reader = sparkSession.read().format("com.uber.hoodie")
+        .option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY(), DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL())
+        .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), instantEndpts.getLeft())
+        .option(DataSourceReadOptions.END_INSTANTTIME_OPT_KEY(), instantEndpts.getRight());
+
+    Dataset<Row> source = reader.load(srcPath);
+
+    /**
+    log.info("Partition Fields are : (" + partitionFields + "). Initial Source Schema :" + source.schema());
+
+    StructType newSchema = new StructType(source.schema().fields());
+    for (String field : partitionFields) {
+      newSchema = newSchema.add(field, DataTypes.StringType, true);
+    }
+
+    /**
+     * Validates if the commit time is sane and also generates Partition fields from _hoodie_partition_path if
+     * configured
+     *
+    Dataset<Row> validated = source.map((MapFunction<Row, Row>) (Row row) -> {
+      // _hoodie_instant_time
+      String instantTime = row.getString(0);
+      IncrSourceHelper.validateInstantTime(row, instantTime, instantEndpts.getKey(), instantEndpts.getValue());
+      if (!partitionFields.isEmpty()) {
+        // _hoodie_partition_path
+        String hoodiePartitionPath = row.getString(3);
+        List<Object> partitionVals = extractor.extractPartitionValuesInPath(hoodiePartitionPath).stream()
+            .map(o -> (Object) o).collect(Collectors.toList());
+        Preconditions.checkArgument(partitionVals.size() == partitionFields.size(),
+            "#partition-fields != #partition-values-extracted");
+        List<Object> rowObjs = new ArrayList<>(scala.collection.JavaConversions.seqAsJavaList(row.toSeq()));
+        rowObjs.addAll(partitionVals);
+        return RowFactory.create(rowObjs.toArray());
+      }
+      return row;
+    }, RowEncoder.apply(newSchema));
+
+    log.info("Validated Source Schema :" + validated.schema());
+    **/
+
+    // Remove Hoodie meta columns except partition path from input source
+    final Dataset<Row> src = source.drop(HoodieRecord.HOODIE_META_COLUMNS.stream()
+        .filter(x -> !x.equals(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).toArray(String[]::new));
+    //log.info("Final Schema from Source is :" + src.schema());
+    return Pair.of(Optional.of(src), instantEndpts.getRight());
+  }
+}
--- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/InputBatch.java
+++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/InputBatch.java
@@ -0,0 +1,54 @@
+/*
+ *  Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *           http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *
+ */
+
+package com.uber.hoodie.utilities.sources;
+
+import com.uber.hoodie.utilities.schema.SchemaProvider;
+import java.util.Optional;
+
+public class InputBatch<T> {
+
+  private final Optional<T> batch;
+  private final String checkpointForNextBatch;
+  private final SchemaProvider schemaProvider;
+
+  public InputBatch(Optional<T> batch, String checkpointForNextBatch,
+      SchemaProvider schemaProvider) {
+    this.batch = batch;
+    this.checkpointForNextBatch = checkpointForNextBatch;
+    this.schemaProvider = schemaProvider;
+  }
+
+  public InputBatch(Optional<T> batch, String checkpointForNextBatch) {
+    this.batch = batch;
+    this.checkpointForNextBatch = checkpointForNextBatch;
+    this.schemaProvider = null;
+  }
+
+  public Optional<T> getBatch() {
+    return batch;
+  }
+
+  public String getCheckpointForNextBatch() {
+    return checkpointForNextBatch;
+  }
+
+  public SchemaProvider getSchemaProvider() {
+    return schemaProvider;
+  }
+}
--- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/JsonDFSSource.java
+++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/JsonDFSSource.java
@@ -19,22 +19,38 @@
 package com.uber.hoodie.utilities.sources;

 import com.uber.hoodie.common.util.TypedProperties;
+import com.uber.hoodie.common.util.collection.Pair;
 import com.uber.hoodie.utilities.schema.SchemaProvider;
-import org.apache.avro.generic.GenericRecord;
+import com.uber.hoodie.utilities.sources.helpers.DFSPathSelector;
+import java.util.Optional;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SparkSession;

 /**
 * DFS Source that reads json data
 */
-public class JsonDFSSource extends DFSSource {
+public class JsonDFSSource extends JsonSource {

-  public JsonDFSSource(TypedProperties props, JavaSparkContext sparkContext, SchemaProvider schemaProvider) {
-    super(props, sparkContext, schemaProvider);
+  private final DFSPathSelector pathSelector;
+
+  public JsonDFSSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession,
+      SchemaProvider schemaProvider) {
+    super(props, sparkContext, sparkSession, schemaProvider);
+    this.pathSelector = new DFSPathSelector(props, sparkContext.hadoopConfiguration());
  }

  @Override
-  protected JavaRDD<GenericRecord> fromFiles(AvroConvertor convertor, String pathStr) {
-    return sparkContext.textFile(pathStr).map(convertor::fromJson);
+  protected InputBatch<JavaRDD<String>> fetchNewData(Optional<String> lastCkptStr,
+      long sourceLimit) {
+    Pair<Optional<String>, String> selPathsWithMaxModificationTime =
+        pathSelector.getNextFilePathsAndMaxModificationTime(lastCkptStr, sourceLimit);
+    return selPathsWithMaxModificationTime.getLeft().map(pathStr -> new InputBatch<>(
+        Optional.of(fromFiles(pathStr)), selPathsWithMaxModificationTime.getRight()))
+        .orElse(new InputBatch<>(Optional.empty(), selPathsWithMaxModificationTime.getRight()));
+  }
+
+  private JavaRDD<String> fromFiles(String pathStr) {
+    return sparkContext.textFile(pathStr);
  }
 }
--- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/JsonKafkaSource.java
+++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/JsonKafkaSource.java
@@ -20,26 +20,49 @@ package com.uber.hoodie.utilities.sources;

 import com.uber.hoodie.common.util.TypedProperties;
 import com.uber.hoodie.utilities.schema.SchemaProvider;
+import com.uber.hoodie.utilities.sources.helpers.KafkaOffsetGen;
+import com.uber.hoodie.utilities.sources.helpers.KafkaOffsetGen.CheckpointUtils;
+import java.util.Optional;
 import kafka.serializer.StringDecoder;
-import org.apache.avro.generic.GenericRecord;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SparkSession;
 import org.apache.spark.streaming.kafka.KafkaUtils;
 import org.apache.spark.streaming.kafka.OffsetRange;

 /**
 * Read json kafka data
 */
-public class JsonKafkaSource extends KafkaSource {
+public class JsonKafkaSource extends JsonSource {

-  public JsonKafkaSource(TypedProperties properties, JavaSparkContext sparkContext, SchemaProvider schemaProvider) {
-    super(properties, sparkContext, schemaProvider);
+  private static Logger log = LogManager.getLogger(JsonKafkaSource.class);
+
+  private final KafkaOffsetGen offsetGen;
+
+  public JsonKafkaSource(TypedProperties properties, JavaSparkContext sparkContext, SparkSession sparkSession,
+      SchemaProvider schemaProvider) {
+    super(properties, sparkContext, sparkSession, schemaProvider);
+    offsetGen = new KafkaOffsetGen(properties);
  }

  @Override
-  protected JavaRDD<GenericRecord> toAvroRDD(OffsetRange[] offsetRanges, AvroConvertor avroConvertor) {
+  protected InputBatch<JavaRDD<String>> fetchNewData(Optional<String> lastCheckpointStr,
+      long sourceLimit) {
+    OffsetRange[] offsetRanges = offsetGen.getNextOffsetRanges(lastCheckpointStr, sourceLimit);
+    long totalNewMsgs = CheckpointUtils.totalNewMessages(offsetRanges);
+    if (totalNewMsgs <= 0) {
+      return new InputBatch<>(Optional.empty(),
+          lastCheckpointStr.isPresent() ? lastCheckpointStr.get() : "");
+    }
+    log.info("About to read " + totalNewMsgs + " from Kafka for topic :" + offsetGen.getTopicName());
+    JavaRDD<String> newDataRDD = toRDD(offsetRanges);
+    return new InputBatch<>(Optional.of(newDataRDD), CheckpointUtils.offsetsToStr(offsetRanges));
+  }
+
+  private JavaRDD<String> toRDD(OffsetRange[] offsetRanges) {
    return KafkaUtils.createRDD(sparkContext, String.class, String.class, StringDecoder.class, StringDecoder.class,
-        kafkaParams, offsetRanges)
-        .values().map(avroConvertor::fromJson);
+        offsetGen.getKafkaParams(), offsetRanges).values();
  }
 }
--- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/JsonSource.java
+++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/JsonSource.java
@@ -0,0 +1,35 @@
+/*
+ *  Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *           http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *
+ */
+
+package com.uber.hoodie.utilities.sources;
+
+import com.uber.hoodie.common.util.TypedProperties;
+import com.uber.hoodie.utilities.schema.SchemaProvider;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SparkSession;
+
+public abstract class JsonSource extends Source<JavaRDD<String>> {
+
+  public JsonSource(TypedProperties props,
+      JavaSparkContext sparkContext,
+      SparkSession sparkSession,
+      SchemaProvider schemaProvider) {
+    super(props, sparkContext, sparkSession, schemaProvider, SourceType.JSON);
+  }
+}
--- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/RowSource.java
+++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/RowSource.java
@@ -0,0 +1,51 @@
+/*
+ *  Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *           http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *
+ */
+
+package com.uber.hoodie.utilities.sources;
+
+import com.uber.hoodie.common.util.TypedProperties;
+import com.uber.hoodie.common.util.collection.Pair;
+import com.uber.hoodie.utilities.schema.RowBasedSchemaProvider;
+import com.uber.hoodie.utilities.schema.SchemaProvider;
+import java.util.Optional;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SparkSession;
+
+public abstract class RowSource extends Source<Dataset<Row>> {
+
+  public RowSource(TypedProperties props,
+      JavaSparkContext sparkContext,
+      SparkSession sparkSession,
+      SchemaProvider schemaProvider) {
+    super(props, sparkContext, sparkSession, schemaProvider, SourceType.ROW);
+  }
+
+  protected abstract Pair<Optional<Dataset<Row>>, String> fetchNextBatch(Optional<String> lastCkptStr,
+      long sourceLimit);
+
+  @Override
+  protected final InputBatch<Dataset<Row>> fetchNewData(Optional<String> lastCkptStr, long sourceLimit) {
+    Pair<Optional<Dataset<Row>>, String> res = fetchNextBatch(lastCkptStr, sourceLimit);
+    return res.getKey().map(dsr -> {
+      SchemaProvider rowSchemaProvider = new RowBasedSchemaProvider(dsr.schema());
+      return new InputBatch<>(res.getKey(), res.getValue(), rowSchemaProvider);
+    }).orElseGet(() -> new InputBatch<>(res.getKey(), res.getValue()));
+  }
+}
--- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/Source.java
+++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/Source.java
@@ -19,36 +19,67 @@
 package com.uber.hoodie.utilities.sources;

 import com.uber.hoodie.common.util.TypedProperties;
-import com.uber.hoodie.common.util.collection.Pair;
 import com.uber.hoodie.utilities.schema.SchemaProvider;
 import java.io.Serializable;
 import java.util.Optional;
-import org.apache.avro.generic.GenericRecord;
-import org.apache.spark.api.java.JavaRDD;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
 import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SparkSession;

 /**
 * Represents a source from which we can tail data. Assumes a constructor that takes properties.
 */
-public abstract class Source implements Serializable {
+public abstract class Source<T> implements Serializable {
+  protected static volatile Logger log = LogManager.getLogger(Source.class);

-  protected transient TypedProperties props;
-
-  protected transient JavaSparkContext sparkContext;
-
-  protected transient SchemaProvider schemaProvider;
-
-
-  protected Source(TypedProperties props, JavaSparkContext sparkContext, SchemaProvider schemaProvider) {
-    this.props = props;
-    this.sparkContext = sparkContext;
-    this.schemaProvider = schemaProvider;
+  public enum SourceType {
+    JSON,
+    AVRO,
+    ROW
  }

+  protected transient TypedProperties props;
+  protected transient JavaSparkContext sparkContext;
+  protected transient SparkSession sparkSession;
+  private transient SchemaProvider overriddenSchemaProvider;
+
+  private final SourceType sourceType;
+
+  protected Source(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession,
+      SchemaProvider schemaProvider) {
+    this(props, sparkContext, sparkSession, schemaProvider, SourceType.AVRO);
+  }
+
+  protected Source(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession,
+      SchemaProvider schemaProvider, SourceType sourceType) {
+    this.props = props;
+    this.sparkContext = sparkContext;
+    this.sparkSession = sparkSession;
+    this.overriddenSchemaProvider = schemaProvider;
+    this.sourceType = sourceType;
+  }
+
+  protected abstract InputBatch<T> fetchNewData(Optional<String> lastCkptStr, long sourceLimit);
+
  /**
-   * Fetches new data upto sourceLimit, from the provided checkpoint and returns an RDD of the
-   * data, as well as the checkpoint to be written as a result of that.
+   * Main API called by Hoodie Delta Streamer to fetch records
+   * @param lastCkptStr Last Checkpoint
+   * @param sourceLimit Source Limit
+   * @return
   */
-  public abstract Pair<Optional<JavaRDD<GenericRecord>>, String> fetchNewData(
-      Optional<String> lastCheckpointStr, long sourceLimit);
+  public final InputBatch<T> fetchNext(Optional<String> lastCkptStr, long sourceLimit) {
+    InputBatch<T> batch = fetchNewData(lastCkptStr, sourceLimit);
+    // If overriddenSchemaProvider is passed in CLI, use it
+    return overriddenSchemaProvider == null ? batch : new InputBatch<>(batch.getBatch(),
+        batch.getCheckpointForNextBatch(), overriddenSchemaProvider);
+  }
+
+  public SourceType getSourceType() {
+    return sourceType;
+  }
+
+  public SparkSession getSparkSession() {
+    return sparkSession;
+  }
 }
--- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/helpers/AvroConvertor.java
+++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/helpers/AvroConvertor.java
@@ -16,7 +16,7 @@
 *
 */

-package com.uber.hoodie.utilities.sources;
+package com.uber.hoodie.utilities.sources.helpers;

 import com.twitter.bijection.Injection;
 import com.twitter.bijection.avro.GenericAvroCodecs;
@@ -55,6 +55,10 @@ public class AvroConvertor implements Serializable {
    this.schemaStr = schemaStr;
  }

+  public AvroConvertor(Schema schema) {
+    this.schemaStr = schema.toString();
+    this.schema = schema;
+  }

  private void initSchema() {
    if (schema == null) {
--- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/helpers/DFSPathSelector.java
+++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/helpers/DFSPathSelector.java
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2017 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
+ *  Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
@@ -16,7 +16,7 @@
 *
 */

-package com.uber.hoodie.utilities.sources;
+package com.uber.hoodie.utilities.sources.helpers;

 import com.uber.hoodie.DataSourceUtils;
 import com.uber.hoodie.common.util.FSUtils;
@@ -24,45 +24,38 @@ import com.uber.hoodie.common.util.TypedProperties;
 import com.uber.hoodie.common.util.collection.ImmutablePair;
 import com.uber.hoodie.common.util.collection.Pair;
 import com.uber.hoodie.exception.HoodieIOException;
-import com.uber.hoodie.utilities.schema.SchemaProvider;
 import java.io.IOException;
 import java.util.*;
 import java.util.stream.Collectors;
-import org.apache.avro.generic.GenericRecord;
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.LocatedFileStatus;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.RemoteIterator;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;

-/**
- * Source to read data from a given DFS directory structure, incrementally
- */
-public abstract class DFSSource extends Source {
+public class DFSPathSelector {

  /**
   * Configs supported
   */
  static class Config {
+
    private static final String ROOT_INPUT_PATH_PROP = "hoodie.deltastreamer.source.dfs.root";
  }

  private static final List<String> IGNORE_FILEPREFIX_LIST = Arrays.asList(".", "_");

  private final transient FileSystem fs;
+  private final TypedProperties props;

-  public DFSSource(TypedProperties props, JavaSparkContext sparkContext, SchemaProvider schemaProvider) {
-    super(props, sparkContext, schemaProvider);
-    DataSourceUtils.checkRequiredProperties(props, Collections.singletonList(Config.ROOT_INPUT_PATH_PROP));
-    this.fs = FSUtils.getFs(props.getString(Config.ROOT_INPUT_PATH_PROP), sparkContext.hadoopConfiguration());
+  public DFSPathSelector(TypedProperties props, Configuration hadoopConf) {
+    DataSourceUtils.checkRequiredProperties(props, Arrays.asList(Config.ROOT_INPUT_PATH_PROP));
+    this.props = props;
+    this.fs = FSUtils.getFs(props.getString(Config.ROOT_INPUT_PATH_PROP), hadoopConf);
  }

-  protected abstract JavaRDD<GenericRecord> fromFiles(final AvroConvertor convertor, String pathStr);
-
-  @Override
-  public Pair<Optional<JavaRDD<GenericRecord>>, String> fetchNewData(
+  public Pair<Optional<String>, String> getNextFilePathsAndMaxModificationTime(
      Optional<String> lastCheckpointStr, long sourceLimit) {

    try {
@@ -111,11 +104,9 @@ public abstract class DFSSource extends Source {
      // read the files out.
      String pathStr = filteredFiles.stream().map(f -> f.getPath().toString())
          .collect(Collectors.joining(","));
-      String schemaStr = schemaProvider.getSourceSchema().toString();
-      final AvroConvertor avroConvertor = new AvroConvertor(schemaStr);

      return new ImmutablePair<>(
-          Optional.of(fromFiles(avroConvertor, pathStr)),
+          Optional.ofNullable(pathStr),
          String.valueOf(maxModificationTime));
    } catch (IOException ioe) {
      throw new HoodieIOException(
--- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/helpers/IncrSourceHelper.java
+++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/helpers/IncrSourceHelper.java
@@ -0,0 +1,88 @@
+package com.uber.hoodie.utilities.sources.helpers;
+
+import com.google.common.base.Preconditions;
+import com.uber.hoodie.common.table.HoodieTableMetaClient;
+import com.uber.hoodie.common.table.HoodieTimeline;
+import com.uber.hoodie.common.table.timeline.HoodieInstant;
+import com.uber.hoodie.common.util.collection.Pair;
+import java.util.Optional;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.Row;
+
+
+/**
+ * Helper for Hudi Incremental Source. Has APIs to
+ *   (a) calculate begin and end instant time for incrementally pulling from Hudi source
+ *   (b) Find max seen instant to be set as checkpoint for next fetch.
+ */
+public class IncrSourceHelper {
+
+  /**
+   * Get a timestamp which is the next value in a descending sequence
+   *
+   * @param timestamp Timestamp
+   */
+  private static String getStrictlyLowerTimestamp(String timestamp) {
+    long ts = Long.parseLong(timestamp);
+    Preconditions.checkArgument(ts > 0, "Timestamp must be positive");
+    Long lower = ts - 1;
+    return "" + lower;
+  }
+
+  /**
+   * Find begin and end instants to be set for the next fetch
+   *
+   * @param jssc Java Spark Context
+   * @param srcBasePath Base path of Hudi source table
+   * @param numInstantsPerFetch Max Instants per fetch
+   * @param beginInstant Last Checkpoint String
+   * @param readLatestOnMissingBeginInstant when begin instant is missing, allow reading from latest committed instant
+   * @return begin and end instants
+   */
+  public static Pair<String, String> calculateBeginAndEndInstants(
+      JavaSparkContext jssc, String srcBasePath, int numInstantsPerFetch, Optional<String> beginInstant,
+      boolean readLatestOnMissingBeginInstant) {
+    Preconditions.checkArgument(numInstantsPerFetch > 0, "Make sure the config"
+        + " hoodie.deltastreamer.source.hoodieincr.num_instants is set to a positive value");
+    HoodieTableMetaClient srcMetaClient = new HoodieTableMetaClient(jssc.hadoopConfiguration(),
+        srcBasePath, true);
+
+    final HoodieTimeline activeCommitTimeline =
+        srcMetaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants();
+
+    String beginInstantTime = beginInstant.orElseGet(() -> {
+      if (readLatestOnMissingBeginInstant) {
+        Optional<HoodieInstant> lastInstant = activeCommitTimeline.lastInstant();
+        return lastInstant.map(hoodieInstant -> getStrictlyLowerTimestamp(hoodieInstant.getTimestamp())).orElse("000");
+      } else {
+        throw new IllegalArgumentException("Missing begin instant for incremental pull. For reading from latest "
+            + "committed instant set hoodie.deltastreamer.source.hoodie.read_latest_on_midding_ckpt to true");
+      }
+    });
+
+    Optional<HoodieInstant> nthInstant =
+        activeCommitTimeline.findInstantsAfter(beginInstantTime, numInstantsPerFetch).getInstants().reduce((x, y) -> y);
+    return Pair.of(beginInstantTime, nthInstant.map(instant -> instant.getTimestamp()).orElse(beginInstantTime));
+  }
+
+  /**
+   * Validate instant time seen in the incoming row
+   *
+   * @param row Input Row
+   * @param instantTime Hoodie Instant time of the row
+   * @param sinceInstant begin instant of the batch
+   * @param endInstant end instant of the batch
+   */
+  public static void validateInstantTime(Row row, String instantTime, String sinceInstant, String endInstant) {
+    Preconditions.checkNotNull(instantTime);
+    Preconditions.checkArgument(HoodieTimeline.compareTimestamps(instantTime,
+        sinceInstant, HoodieTimeline.GREATER),
+        "Instant time(_hoodie_commit_time) in row (" + row + ") was : " + instantTime
+            + "but expected to be between " + sinceInstant + "(excl) - "
+            + endInstant + "(incl)");
+    Preconditions.checkArgument(HoodieTimeline.compareTimestamps(instantTime,
+        endInstant, HoodieTimeline.LESSER_OR_EQUAL),
+        "Instant time(_hoodie_commit_time) in row (" + row + ") was : " + instantTime
+            + "but expected to be between " + sinceInstant + "(excl) - " + endInstant + "(incl)");
+  }
+}
--- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/helpers/KafkaOffsetGen.java
+++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/helpers/KafkaOffsetGen.java
@@ -16,24 +16,22 @@
 *
 */

-package com.uber.hoodie.utilities.sources;
+package com.uber.hoodie.utilities.sources.helpers;

 import com.uber.hoodie.DataSourceUtils;
 import com.uber.hoodie.common.util.TypedProperties;
-import com.uber.hoodie.common.util.collection.ImmutablePair;
-import com.uber.hoodie.common.util.collection.Pair;
 import com.uber.hoodie.exception.HoodieNotSupportedException;
 import com.uber.hoodie.utilities.exception.HoodieDeltaStreamerException;
-import com.uber.hoodie.utilities.schema.SchemaProvider;
-
-import java.util.*;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Optional;
 import java.util.stream.Collectors;
 import kafka.common.TopicAndPartition;
-import org.apache.avro.generic.GenericRecord;
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.streaming.kafka.KafkaCluster;
 import org.apache.spark.streaming.kafka.KafkaCluster.LeaderOffset;
 import org.apache.spark.streaming.kafka.OffsetRange;
@@ -49,14 +47,13 @@ import scala.util.Either;
 /**
 * Source to read data from Kafka, incrementally
 */
-public abstract class KafkaSource extends Source {
+public class KafkaOffsetGen {

-  private static volatile Logger log = LogManager.getLogger(KafkaSource.class);
+  private static volatile Logger log = LogManager.getLogger(KafkaOffsetGen.class);

  private static long DEFAULT_MAX_EVENTS_TO_READ = 1000000; // 1M events max

-
-  static class CheckpointUtils {
+  public static class CheckpointUtils {

    /**
     * Reconstruct checkpoint from string.
@@ -90,7 +87,6 @@ public abstract class KafkaSource extends Source {
      return sb.toString();
    }

-
    /**
     * Compute the offset ranges to read from Kafka, while handling newly added partitions, skews, event limits.
     *
@@ -174,19 +170,18 @@ public abstract class KafkaSource extends Source {
   * Configs to be passed for this source. All standard Kafka consumer configs are also respected
   */
  static class Config {
+
    private static final String KAFKA_TOPIC_NAME = "hoodie.deltastreamer.source.kafka.topic";
    private static final KafkaResetOffsetStrategies DEFAULT_AUTO_RESET_OFFSET = KafkaResetOffsetStrategies.LARGEST;
  }

-
-  protected HashMap<String, String> kafkaParams;
-
+  private final HashMap<String, String> kafkaParams;
+  private final TypedProperties props;
  protected final String topicName;

-  public KafkaSource(TypedProperties props, JavaSparkContext sparkContext, SchemaProvider schemaProvider) {
-    super(props, sparkContext, schemaProvider);
-
-    kafkaParams = new HashMap<>();
+  public KafkaOffsetGen(TypedProperties props) {
+    this.props = props;
+    kafkaParams = new HashMap<String, String>();
    for (Object prop : props.keySet()) {
      kafkaParams.put(prop.toString(), props.getString(prop.toString()));
    }
@@ -194,11 +189,7 @@ public abstract class KafkaSource extends Source {
    topicName = props.getString(Config.KAFKA_TOPIC_NAME);
  }

-  protected abstract JavaRDD<GenericRecord> toAvroRDD(OffsetRange[] offsetRanges, AvroConvertor avroConvertor);
-
-  @Override
-  public Pair<Optional<JavaRDD<GenericRecord>>, String> fetchNewData(
-      Optional<String> lastCheckpointStr, long sourceLimit) {
+  public OffsetRange[] getNextOffsetRanges(Optional<String> lastCheckpointStr, long sourceLimit) {

    // Obtain current metadata for the topic
    KafkaCluster cluster = new KafkaCluster(ScalaHelpers.toScalaMap(kafkaParams));
@@ -240,16 +231,15 @@ public abstract class KafkaSource extends Source {
    // Come up with final set of OffsetRanges to read (account for new partitions, limit number of events)
    long numEvents = Math.min(DEFAULT_MAX_EVENTS_TO_READ, sourceLimit);
    OffsetRange[] offsetRanges = CheckpointUtils.computeOffsetRanges(fromOffsets, toOffsets, numEvents);
-    long totalNewMsgs = CheckpointUtils.totalNewMessages(offsetRanges);
-    if (totalNewMsgs <= 0) {
-      return new ImmutablePair<>(Optional.empty(), lastCheckpointStr.orElse(""));
-    } else {
-      log.info("About to read " + totalNewMsgs + " from Kafka for topic :" + topicName);
-    }

-    // Produce a RDD[GenericRecord]
-    final AvroConvertor avroConvertor = new AvroConvertor(schemaProvider.getSourceSchema().toString());
-    JavaRDD<GenericRecord> newDataRDD = toAvroRDD(offsetRanges, avroConvertor);
-    return new ImmutablePair<>(Optional.of(newDataRDD), CheckpointUtils.offsetsToStr(offsetRanges));
+    return offsetRanges;
+  }
+
+  public String getTopicName() {
+    return topicName;
+  }
+
+  public HashMap<String, String> getKafkaParams() {
+    return kafkaParams;
  }
 }
--- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/transform/IdentityTransformer.java
+++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/transform/IdentityTransformer.java
@@ -0,0 +1,37 @@
+/*
+ *  Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *           http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *
+ */
+
+package com.uber.hoodie.utilities.transform;
+
+import com.uber.hoodie.common.util.TypedProperties;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SparkSession;
+
+/**
+ * Identity transformer
+ */
+public class IdentityTransformer implements Transformer {
+
+  @Override
+  public Dataset<Row> apply(JavaSparkContext jsc, SparkSession sparkSession,
+      Dataset<Row> rowDataset, TypedProperties properties) {
+    return rowDataset;
+  }
+}
--- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/transform/SqlQueryBasedTransformer.java
+++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/transform/SqlQueryBasedTransformer.java
@@ -0,0 +1,66 @@
+/*
+ *  Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *           http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *
+ */
+
+package com.uber.hoodie.utilities.transform;
+
+import com.uber.hoodie.common.util.TypedProperties;
+import java.util.UUID;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SparkSession;
+
+/**
+ * A transformer that allows a sql-query template be used to transform the source before writing to Hudi data-set.
+ *
+ * The query should reference the source as a table named "\<SRC\>"
+ */
+public class SqlQueryBasedTransformer implements Transformer {
+
+  private static volatile Logger log = LogManager.getLogger(SqlQueryBasedTransformer.class);
+
+  private static final String SRC_PATTERN = "<SRC>";
+  private static final String TMP_TABLE = "HOODIE_SRC_TMP_TABLE_";
+
+  /**
+   * Configs supported
+   */
+  static class Config {
+
+    private static final String TRANSFORMER_SQL = "hoodie.deltastreamer.transformer.sql";
+  }
+
+  @Override
+  public Dataset<Row> apply(JavaSparkContext jsc, SparkSession sparkSession,
+      Dataset<Row> rowDataset, TypedProperties properties) {
+    String transformerSQL = properties.getString(Config.TRANSFORMER_SQL);
+    if (null == transformerSQL) {
+      throw new IllegalArgumentException("Missing configuration : (" + Config.TRANSFORMER_SQL + ")");
+    }
+
+    // tmp table name doesn't like dashes
+    String tmpTable = TMP_TABLE.concat(UUID.randomUUID().toString().replace("-", "_"));
+    log.info("Registering tmp table : " + tmpTable);
+    rowDataset.registerTempTable(tmpTable);
+    String sqlStr = transformerSQL.replaceAll(SRC_PATTERN, tmpTable);
+    log.info("SQL Query for transformation : (" + sqlStr + ")");
+    return sparkSession.sql(sqlStr);
+  }
+}
--- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/transform/Transformer.java
+++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/transform/Transformer.java
@@ -0,0 +1,43 @@
+/*
+ *  Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *           http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *
+ */
+
+package com.uber.hoodie.utilities.transform;
+
+import com.uber.hoodie.common.util.TypedProperties;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SparkSession;
+
+/**
+ * Transform source to target dataset before writing
+ */
+public interface Transformer {
+
+  /**
+   * Transform source RDD to target RDD
+   *
+   * @param jsc JavaSparkContext
+   * @param rowDataset Source DataSet
+   * @param sparkSession Spark Session
+   * @param properties Config properties
+   * @return Transformed Dataset
+   */
+  Dataset<Row> apply(JavaSparkContext jsc, SparkSession sparkSession,
+      Dataset<Row> rowDataset, TypedProperties properties);
+}