
[HUDI-394] Provide a basic implementation of test suite

Nishith Agarwal
2019-11-01 12:40:09 -07:00
committed by n3nash
parent d5b593b7d9
commit 2fc2b01d86
102 changed files with 8633 additions and 64 deletions


@@ -0,0 +1,77 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer;
import org.apache.hudi.utilities.schema.SchemaProvider;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
/**
* Extends the {@link HoodieDeltaStreamer} to expose certain operations helpful in running the Test Suite.
* This is done to achieve two things: 1) leverage some components of {@link HoodieDeltaStreamer}, and 2)
* piggyback on the test suite to exercise {@link HoodieDeltaStreamer} itself.
*/
public class HoodieDeltaStreamerWrapper extends HoodieDeltaStreamer {
public HoodieDeltaStreamerWrapper(Config cfg, JavaSparkContext jssc) throws Exception {
super(cfg, jssc);
}
public HoodieDeltaStreamerWrapper(Config cfg, JavaSparkContext jssc, FileSystem fs, HiveConf conf) throws Exception {
super(cfg, jssc, fs, conf);
}
public JavaRDD<WriteStatus> upsert(Operation operation) throws
Exception {
cfg.operation = operation;
return deltaSyncService.getDeltaSync().syncOnce().getRight();
}
public JavaRDD<WriteStatus> insert() throws Exception {
return upsert(Operation.INSERT);
}
public JavaRDD<WriteStatus> bulkInsert() throws
Exception {
return upsert(Operation.BULK_INSERT);
}
public void scheduleCompact() throws Exception {
// Delta-streamer does not expose a dedicated scheduleCompact() operation, so issue an upsert without any new
// data, which will trigger scheduling of a compaction
upsert(Operation.UPSERT);
}
public JavaRDD<WriteStatus> compact() throws Exception {
// Delta-streamer does not expose a dedicated compact() operation, so issue an upsert without any new data,
// which will trigger inline compaction
return upsert(Operation.UPSERT);
}
public Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> fetchSource() throws Exception {
return deltaSyncService.getDeltaSync().readFromSource(deltaSyncService.getDeltaSync().getCommitTimelineOpt());
}
}
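
A minimal sketch of how this wrapper might be driven once a delta streamer Config has been populated; the Spark master, paths and table settings below are hypothetical and not part of this commit:

import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.integ.testsuite.HoodieDeltaStreamerWrapper;
import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer;
import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer.Operation;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class DeltaStreamerWrapperExample {
  public static void main(String[] args) throws Exception {
    JavaSparkContext jsc = new JavaSparkContext("local[2]", "wrapper-example");
    HoodieDeltaStreamer.Config cfg = new HoodieDeltaStreamer.Config();
    cfg.targetBasePath = "/tmp/hudi/target";                  // hypothetical target dataset path
    cfg.targetTableName = "table1";                           // hypothetical table name
    cfg.tableType = "COPY_ON_WRITE";
    cfg.propsFilePath = "/tmp/hudi/test-source.properties";   // hypothetical source/schema props

    HoodieDeltaStreamerWrapper wrapper = new HoodieDeltaStreamerWrapper(cfg, jsc);
    // Each call below runs one syncOnce() round of the underlying DeltaSync.
    JavaRDD<WriteStatus> inserted = wrapper.insert();
    JavaRDD<WriteStatus> upserted = wrapper.upsert(Operation.UPSERT);
    wrapper.scheduleCompact(); // empty-handed upsert that piggybacks compaction scheduling
    wrapper.compact();         // empty-handed upsert that piggybacks inline compaction
    jsc.stop();
  }
}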


@@ -0,0 +1,186 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite;
import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hudi.DataSourceUtils;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.integ.testsuite.configuration.DFSDeltaConfig;
import org.apache.hudi.integ.testsuite.dag.DagUtils;
import org.apache.hudi.integ.testsuite.dag.WorkflowDag;
import org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator;
import org.apache.hudi.integ.testsuite.dag.scheduler.DagScheduler;
import org.apache.hudi.integ.testsuite.generator.DeltaGenerator;
import org.apache.hudi.integ.testsuite.reader.DeltaInputType;
import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode;
import org.apache.hudi.keygen.KeyGenerator;
import org.apache.hudi.utilities.UtilHelpers;
import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer;
import org.apache.hudi.utilities.schema.SchemaProvider;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* This is the entry point for running a Hudi Test Suite. Although this class has similarities with
* {@link HoodieDeltaStreamer}, it does not extend it, since we do not want to create a dependency on changes in
* DeltaStreamer.
*/
public class HoodieTestSuiteJob {
private static volatile Logger log = LoggerFactory.getLogger(HoodieTestSuiteJob.class);
private final HoodieTestSuiteConfig cfg;
/**
* Bag of properties with source, hoodie client, key generator etc.
*/
TypedProperties props;
/**
* Schema provider that supplies the schema used for generating and writing out the payloads.
*/
private transient SchemaProvider schemaProvider;
/**
* Filesystem used.
*/
private transient FileSystem fs;
/**
* Spark context.
*/
private transient JavaSparkContext jsc;
/**
* Spark Session.
*/
private transient SparkSession sparkSession;
/**
* Hive Config.
*/
private transient HiveConf hiveConf;
private KeyGenerator keyGenerator;
public HoodieTestSuiteJob(HoodieTestSuiteConfig cfg, JavaSparkContext jsc) throws IOException {
this.cfg = cfg;
this.jsc = jsc;
this.sparkSession = SparkSession.builder().config(jsc.getConf()).getOrCreate();
this.fs = FSUtils.getFs(cfg.inputBasePath, jsc.hadoopConfiguration());
this.props = UtilHelpers.readConfig(fs, new Path(cfg.propsFilePath), cfg.configs).getConfig();
log.info("Creating workload generator with configs : {}", props.toString());
this.schemaProvider = UtilHelpers.createSchemaProvider(cfg.schemaProviderClassName, props, jsc);
this.hiveConf = getDefaultHiveConf(jsc.hadoopConfiguration());
this.keyGenerator = DataSourceUtils.createKeyGenerator(props);
if (!fs.exists(new Path(cfg.targetBasePath))) {
HoodieTableMetaClient.initTableType(jsc.hadoopConfiguration(), cfg.targetBasePath,
HoodieTableType.valueOf(cfg.tableType), cfg.targetTableName, "archived");
}
}
private static HiveConf getDefaultHiveConf(Configuration cfg) {
HiveConf hiveConf = new HiveConf();
hiveConf.addResource(cfg);
return hiveConf;
}
public static void main(String[] args) throws Exception {
final HoodieTestSuiteConfig cfg = new HoodieTestSuiteConfig();
JCommander cmd = new JCommander(cfg, args);
if (cfg.help || args.length == 0) {
cmd.usage();
System.exit(1);
}
JavaSparkContext jssc = UtilHelpers.buildSparkContext("workload-generator-" + cfg.outputTypeName
+ "-" + cfg.inputFormatName, cfg.sparkMaster);
new HoodieTestSuiteJob(cfg, jssc).runTestSuite();
}
public void runTestSuite() {
try {
WorkflowDag workflowDag = this.cfg.workloadYamlPath == null ? ((WorkflowDagGenerator) ReflectionUtils
.loadClass((this.cfg).workloadDagGenerator)).build()
: DagUtils.convertYamlPathToDag(this.fs, this.cfg.workloadYamlPath);
log.info("Workflow Dag => " + DagUtils.convertDagToYaml(workflowDag));
long startTime = System.currentTimeMillis();
String schemaStr = schemaProvider.getSourceSchema().toString();
final HoodieTestSuiteWriter writer = new HoodieTestSuiteWriter(jsc, props, cfg, schemaStr);
final DeltaGenerator deltaGenerator = new DeltaGenerator(
new DFSDeltaConfig(DeltaOutputMode.valueOf(cfg.outputTypeName), DeltaInputType.valueOf(cfg.inputFormatName),
new SerializableConfiguration(jsc.hadoopConfiguration()), cfg.inputBasePath, cfg.targetBasePath,
schemaStr, cfg.limitFileSize), jsc, sparkSession, schemaStr, keyGenerator);
DagScheduler dagScheduler = new DagScheduler(workflowDag, writer, deltaGenerator);
dagScheduler.schedule();
log.info("Finished scheduling all tasks, Time taken {}", System.currentTimeMillis() - startTime);
} catch (Exception e) {
log.error("Failed to run Test Suite ", e);
throw new HoodieException("Failed to run Test Suite ", e);
} finally {
jsc.stop();
}
}
/**
* The Hudi test suite uses {@link HoodieDeltaStreamer} to run some operations, hence this config extends the delta streamer config.
*/
public static class HoodieTestSuiteConfig extends HoodieDeltaStreamer.Config {
@Parameter(names = {"--input-base-path"}, description = "base path for input data"
+ "(Will be created if did not exist first time around. If exists, more data will be added to that path)",
required = true)
public String inputBasePath;
@Parameter(names = {
"--workload-generator-classname"}, description = "WorkflowDag of operations to generate the workload",
required = true)
public String workloadDagGenerator = WorkflowDagGenerator.class.getName();
@Parameter(names = {
"--workload-yaml-path"}, description = "Workflow Dag yaml path to generate the workload")
public String workloadYamlPath;
@Parameter(names = {"--delta-output-type"}, description = "Subclass of "
+ "org.apache.hudi.testsuite.workload.DeltaOutputMode to readAvro data.")
public String outputTypeName = DeltaOutputMode.DFS.name();
@Parameter(names = {"--delta-input-format"}, description = "Subclass of "
+ "org.apache.hudi.testsuite.workload.DeltaOutputMode to read avro data.")
public String inputFormatName = DeltaInputType.AVRO.name();
@Parameter(names = {"--input-file-size"}, description = "The min/max size of the input files to generate",
required = true)
public Long limitFileSize = 1024 * 1024 * 120L;
@Parameter(names = {"--use-deltastreamer"}, description = "Choose whether to use HoodieDeltaStreamer to "
+ "perform"
+ " ingestion. If set to false, HoodieWriteClient will be used")
public Boolean useDeltaStreamer = false;
}
}
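
As a rough sketch, the job could be launched by handing the flags above to main(). The test-suite flags come from HoodieTestSuiteConfig; the remaining flags are assumed to be inherited from HoodieDeltaStreamer.Config, and all paths are hypothetical:

import org.apache.hudi.integ.testsuite.HoodieTestSuiteJob;
import org.apache.hudi.integ.testsuite.dag.SimpleWorkflowDagGenerator;

public class TestSuiteJobLauncherExample {
  public static void main(String[] args) throws Exception {
    HoodieTestSuiteJob.main(new String[] {
        // test suite specific flags defined in HoodieTestSuiteConfig above
        "--input-base-path", "/tmp/hudi/input",
        "--workload-generator-classname", SimpleWorkflowDagGenerator.class.getName(),
        "--workload-yaml-path", "/tmp/hudi/complex-dag.yaml", // optional; overrides the generator
        "--input-file-size", String.valueOf(120 * 1024 * 1024L),
        // flags assumed to be inherited from HoodieDeltaStreamer.Config
        "--target-base-path", "/tmp/hudi/output",
        "--target-table", "table1",
        "--table-type", "COPY_ON_WRITE",
        "--props", "/tmp/hudi/test-source.properties",
        "--schemaprovider-class", "org.apache.hudi.utilities.schema.FilebasedSchemaProvider"
    });
  }
}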


@@ -0,0 +1,219 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.avro.model.HoodieCompactionPlan;
import org.apache.hudi.client.HoodieReadClient;
import org.apache.hudi.client.HoodieWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieCompactionConfig;
import org.apache.hudi.config.HoodieIndexConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.integ.testsuite.dag.nodes.CleanNode;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.dag.nodes.RollbackNode;
import org.apache.hudi.integ.testsuite.dag.nodes.ScheduleCompactNode;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteJob.HoodieTestSuiteConfig;
import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer.Operation;
import org.apache.hudi.utilities.schema.SchemaProvider;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
/**
* A writer abstraction for the Hudi test suite. This class wraps different implementations of writers used to perform
* write operations into the target hudi dataset. Currently supported writers are {@link HoodieDeltaStreamerWrapper}
* and {@link HoodieWriteClient}.
*/
public class HoodieTestSuiteWriter {
private HoodieDeltaStreamerWrapper deltaStreamerWrapper;
private HoodieWriteClient writeClient;
protected HoodieTestSuiteConfig cfg;
private Option<String> lastCheckpoint;
private HoodieReadClient hoodieReadClient;
private Properties props;
private String schema;
private transient Configuration configuration;
private transient JavaSparkContext sparkContext;
private static Set<String> VALID_DAG_NODES_TO_ALLOW_WRITE_CLIENT_IN_DELTASTREAMER_MODE = new HashSet<>(
Arrays.asList(RollbackNode.class.getName(), CleanNode.class.getName(), ScheduleCompactNode.class.getName()));
public HoodieTestSuiteWriter(JavaSparkContext jsc, Properties props, HoodieTestSuiteConfig cfg, String schema) throws
Exception {
this(jsc, props, cfg, schema, true);
}
public HoodieTestSuiteWriter(JavaSparkContext jsc, Properties props, HoodieTestSuiteConfig cfg, String schema,
boolean rollbackInflight) throws Exception {
// Ensure that only one instance of HoodieWriteClient is instantiated for a HoodieTestSuiteWriter.
// A HoodieWriteClient is not instantiated until
// {@link HoodieDeltaStreamer#commit(HoodieWriteClient, JavaRDD, Option)} is invoked.
this.deltaStreamerWrapper = new HoodieDeltaStreamerWrapper(cfg, jsc);
this.hoodieReadClient = new HoodieReadClient(jsc, cfg.targetBasePath);
if (!cfg.useDeltaStreamer) {
this.writeClient = new HoodieWriteClient(jsc, getHoodieClientConfig(cfg, props, schema), rollbackInflight);
}
this.cfg = cfg;
this.configuration = jsc.hadoopConfiguration();
this.sparkContext = jsc;
this.props = props;
this.schema = schema;
}
private HoodieWriteConfig getHoodieClientConfig(HoodieTestSuiteConfig cfg, Properties props, String schema) {
HoodieWriteConfig.Builder builder =
HoodieWriteConfig.newBuilder().combineInput(true, true).withPath(cfg.targetBasePath)
.withAutoCommit(false)
.withCompactionConfig(HoodieCompactionConfig.newBuilder().withPayloadClass(cfg.payloadClassName).build())
.forTable(cfg.targetTableName)
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
.withProps(props);
builder = builder.withSchema(schema);
return builder.build();
}
private boolean allowWriteClientAccess(DagNode dagNode) {
return VALID_DAG_NODES_TO_ALLOW_WRITE_CLIENT_IN_DELTASTREAMER_MODE.contains(dagNode.getClass().getName());
}
public Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> fetchSource() throws Exception {
return this.deltaStreamerWrapper.fetchSource();
}
public Option<String> startCommit() {
if (cfg.useDeltaStreamer) {
return Option.of(HoodieActiveTimeline.createNewInstantTime());
} else {
return Option.of(writeClient.startCommit());
}
}
public JavaRDD<WriteStatus> upsert(Option<String> instantTime) throws Exception {
if (cfg.useDeltaStreamer) {
return deltaStreamerWrapper.upsert(Operation.UPSERT);
} else {
Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> nextBatch = fetchSource();
lastCheckpoint = Option.of(nextBatch.getValue().getLeft());
return writeClient.upsert(nextBatch.getRight().getRight(), instantTime.get());
}
}
public JavaRDD<WriteStatus> insert(Option<String> instantTime) throws Exception {
if (cfg.useDeltaStreamer) {
return deltaStreamerWrapper.insert();
} else {
Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> nextBatch = fetchSource();
lastCheckpoint = Option.of(nextBatch.getValue().getLeft());
return writeClient.insert(nextBatch.getRight().getRight(), instantTime.get());
}
}
public JavaRDD<WriteStatus> bulkInsert(Option<String> instantTime) throws Exception {
if (cfg.useDeltaStreamer) {
return deltaStreamerWrapper.bulkInsert();
} else {
Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> nextBatch = fetchSource();
lastCheckpoint = Option.of(nextBatch.getValue().getLeft());
return writeClient.bulkInsert(nextBatch.getRight().getRight(), instantTime.get());
}
}
public JavaRDD<WriteStatus> compact(Option<String> instantTime) throws Exception {
if (cfg.useDeltaStreamer) {
return deltaStreamerWrapper.compact();
} else {
if (!instantTime.isPresent()) {
Option<Pair<String, HoodieCompactionPlan>> compactionPlanPair = Option
.fromJavaOptional(hoodieReadClient.getPendingCompactions()
.stream().findFirst());
if (compactionPlanPair.isPresent()) {
instantTime = Option.of(compactionPlanPair.get().getLeft());
}
}
if (instantTime.isPresent()) {
return writeClient.compact(instantTime.get());
} else {
return null;
}
}
}
public Option<String> scheduleCompaction(Option<Map<String, String>> previousCommitExtraMetadata) throws
Exception {
if (cfg.useDeltaStreamer) {
deltaStreamerWrapper.scheduleCompact();
return Option.empty();
} else {
return writeClient.scheduleCompaction(previousCommitExtraMetadata);
}
}
public void commit(JavaRDD<WriteStatus> records, Option<String> instantTime) {
if (!cfg.useDeltaStreamer) {
Map<String, String> extraMetadata = new HashMap<>();
/** Store the checkpoint in the commit metadata just like
* {@link HoodieDeltaStreamer#commit(HoodieWriteClient, JavaRDD, Option)} **/
extraMetadata.put(HoodieDeltaStreamerWrapper.CHECKPOINT_KEY, lastCheckpoint.get());
writeClient.commit(instantTime.get(), records, Option.of(extraMetadata));
}
}
public HoodieWriteClient getWriteClient(DagNode dagNode) throws IllegalAccessException {
if (cfg.useDeltaStreamer & !allowWriteClientAccess(dagNode)) {
throw new IllegalAccessException("cannot access write client when testing in deltastreamer mode");
}
synchronized (this) {
if (writeClient == null) {
this.writeClient = new HoodieWriteClient(this.sparkContext, getHoodieClientConfig(cfg, props, schema), false);
}
}
return writeClient;
}
public HoodieDeltaStreamerWrapper getDeltaStreamerWrapper() {
return deltaStreamerWrapper;
}
public HoodieTestSuiteConfig getCfg() {
return cfg;
}
public Configuration getConfiguration() {
return configuration;
}
public JavaSparkContext getSparkContext() {
return sparkContext;
}
}
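
A small sketch of the write path this class exposes, as it might be invoked from a DAG node, assuming a writer has already been constructed by the test suite job:

import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter;
import org.apache.spark.api.java.JavaRDD;

public class TestSuiteWriterUsage {
  // Runs a single insert round: open an instant, write the next generated batch, commit.
  public static JavaRDD<WriteStatus> insertOnce(HoodieTestSuiteWriter writer) throws Exception {
    Option<String> instantTime = writer.startCommit();          // new instant (or a write client commit)
    JavaRDD<WriteStatus> statuses = writer.insert(instantTime); // in write client mode this pulls the next batch via fetchSource()
    writer.commit(statuses, instantTime);                       // no-op when --use-deltastreamer is set
    return statuses;
  }
}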


@@ -0,0 +1,73 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.configuration;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.integ.testsuite.reader.DeltaInputType;
import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode;
/**
* Configuration to hold details about a DFS-based output type; extends {@link DeltaConfig}.
*/
public class DFSDeltaConfig extends DeltaConfig {
// The base path where the generated data should be written to. This data will in turn be used to write into a hudi
// dataset
private final String deltaBasePath;
private final String datasetOutputPath;
private final String schemaStr;
// Maximum file size for the files generated
private final Long maxFileSize;
// The current batch id
private Integer batchId;
public DFSDeltaConfig(DeltaOutputMode deltaOutputMode, DeltaInputType deltaInputType,
SerializableConfiguration configuration,
String deltaBasePath, String targetBasePath, String schemaStr, Long maxFileSize) {
super(deltaOutputMode, deltaInputType, configuration);
this.deltaBasePath = deltaBasePath;
this.schemaStr = schemaStr;
this.maxFileSize = maxFileSize;
this.datasetOutputPath = targetBasePath;
}
public String getDeltaBasePath() {
return deltaBasePath;
}
public String getDatasetOutputPath() {
return datasetOutputPath;
}
public String getSchemaStr() {
return schemaStr;
}
public Long getMaxFileSize() {
return maxFileSize;
}
public Integer getBatchId() {
return batchId;
}
public void setBatchId(Integer batchId) {
this.batchId = batchId;
}
}
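
For illustration, a DFSDeltaConfig could be assembled the same way HoodieTestSuiteJob does it; the paths below are hypothetical and the schema string is assumed to come from the schema provider:

import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.integ.testsuite.configuration.DFSDeltaConfig;
import org.apache.hudi.integ.testsuite.reader.DeltaInputType;
import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode;

public class DeltaConfigExample {
  public static DFSDeltaConfig forLocalTest(String schemaStr) {
    return new DFSDeltaConfig(
        DeltaOutputMode.DFS, DeltaInputType.AVRO,
        new SerializableConfiguration(new Configuration()),
        "/tmp/hudi/input",          // hypothetical path for generated avro deltas
        "/tmp/hudi/output",         // hypothetical target dataset path
        schemaStr,                  // avro schema supplied by the SchemaProvider
        1024 * 1024 * 120L);        // max generated file size, mirroring the job default
  }
}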


@@ -0,0 +1,256 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.configuration;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.integ.testsuite.reader.DeltaInputType;
import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode;
/**
* Configuration to hold the delta output type and delta input format.
*/
public class DeltaConfig implements Serializable {
private final DeltaOutputMode deltaOutputMode;
private final DeltaInputType deltaInputType;
private final SerializableConfiguration configuration;
public DeltaConfig(DeltaOutputMode deltaOutputMode, DeltaInputType deltaInputType,
SerializableConfiguration configuration) {
this.deltaOutputMode = deltaOutputMode;
this.deltaInputType = deltaInputType;
this.configuration = configuration;
}
public DeltaOutputMode getDeltaOutputMode() {
return deltaOutputMode;
}
public DeltaInputType getDeltaInputType() {
return deltaInputType;
}
public Configuration getConfiguration() {
return configuration.get();
}
/**
* Represents the configuration for any kind of workload operation on new data. Each workload can also carry an
* optional set of actions that can be executed in parallel.
*/
public static class Config {
public static final String CONFIG_NAME = "config";
public static final String TYPE = "type";
public static final String NODE_NAME = "name";
public static final String DEPENDENCIES = "deps";
public static final String CHILDREN = "children";
public static final String HIVE_QUERIES = "hive_queries";
public static final String HIVE_PROPERTIES = "hive_props";
private static final String NUM_RECORDS_INSERT = "num_records_insert";
private static final String NUM_RECORDS_UPSERT = "num_records_upsert";
private static final String REPEAT_COUNT = "repeat_count";
private static final String RECORD_SIZE = "record_size";
private static final String NUM_PARTITIONS_INSERT = "num_partitions_insert";
private static final String NUM_PARTITIONS_UPSERT = "num_partitions_upsert";
private static final String NUM_FILES_UPSERT = "num_files_upsert";
private static final String FRACTION_UPSERT_PER_FILE = "fraction_upsert_per_file";
private static final String DISABLE_GENERATE = "disable_generate";
private static final String DISABLE_INGEST = "disable_ingest";
private static final String HIVE_LOCAL = "hive_local";
private Map<String, Object> configsMap;
public Config(Map<String, Object> configsMap) {
this.configsMap = configsMap;
}
public static Builder newBuilder() {
return new Builder();
}
public long getNumRecordsInsert() {
return Long.valueOf(configsMap.getOrDefault(NUM_RECORDS_INSERT, 0).toString());
}
public long getNumRecordsUpsert() {
return Long.valueOf(configsMap.getOrDefault(NUM_RECORDS_UPSERT, 0).toString());
}
public int getRecordSize() {
return Integer.valueOf(configsMap.getOrDefault(RECORD_SIZE, 1024).toString());
}
public int getNumInsertPartitions() {
return Integer.valueOf(configsMap.getOrDefault(NUM_PARTITIONS_INSERT, 1).toString());
}
public int getRepeatCount() {
return Integer.valueOf(configsMap.getOrDefault(REPEAT_COUNT, 1).toString());
}
public int getNumUpsertPartitions() {
return Integer.valueOf(configsMap.getOrDefault(NUM_PARTITIONS_UPSERT, 0).toString());
}
public int getNumUpsertFiles() {
return Integer.valueOf(configsMap.getOrDefault(NUM_FILES_UPSERT, 1).toString());
}
public double getFractionUpsertPerFile() {
return Double.valueOf(configsMap.getOrDefault(FRACTION_UPSERT_PER_FILE, 0.0).toString());
}
public boolean isDisableGenerate() {
return Boolean.valueOf(configsMap.getOrDefault(DISABLE_GENERATE, false).toString());
}
public boolean isDisableIngest() {
return Boolean.valueOf(configsMap.getOrDefault(DISABLE_INGEST, false).toString());
}
public Map<String, Object> getOtherConfigs() {
if (configsMap == null) {
return new HashMap<>();
}
return configsMap;
}
public List<Pair<String, Integer>> getHiveQueries() {
try {
return (List<Pair<String, Integer>>) this.configsMap.getOrDefault("hive_queries", new ArrayList<>());
} catch (Exception e) {
throw new RuntimeException("unable to get hive queries from configs");
}
}
public boolean isHiveLocal() {
return Boolean.valueOf(configsMap.getOrDefault(HIVE_LOCAL, false).toString());
}
public List<String> getHiveProperties() {
return (List<String>) this.configsMap.getOrDefault(HIVE_PROPERTIES, new ArrayList<>());
}
@Override
public String toString() {
try {
return new ObjectMapper().writeValueAsString(this.configsMap);
} catch (Exception e) {
throw new RuntimeException("unable to generate string representation of config");
}
}
public static class Builder {
private Map<String, Object> configsMap = new HashMap<>();
public Builder() {
}
public Builder withNumRecordsToInsert(long numRecordsInsert) {
this.configsMap.put(NUM_RECORDS_INSERT, numRecordsInsert);
return this;
}
public Builder withNumRecordsToUpdate(long numRecordsUpsert) {
this.configsMap.put(NUM_RECORDS_UPSERT, numRecordsUpsert);
return this;
}
public Builder withNumInsertPartitions(int numInsertPartitions) {
this.configsMap.put(NUM_PARTITIONS_INSERT, numInsertPartitions);
return this;
}
public Builder withNumUpsertPartitions(int numUpsertPartitions) {
this.configsMap.put(NUM_PARTITIONS_UPSERT, numUpsertPartitions);
return this;
}
public Builder withNumUpsertFiles(int numUpsertFiles) {
this.configsMap.put(NUM_FILES_UPSERT, numUpsertFiles);
return this;
}
public Builder withFractionUpsertPerFile(double fractionUpsertPerFile) {
this.configsMap.put(FRACTION_UPSERT_PER_FILE, fractionUpsertPerFile);
return this;
}
public Builder withNumTimesToRepeat(int repeatCount) {
this.configsMap.put(REPEAT_COUNT, repeatCount);
return this;
}
public Builder withRecordSize(int recordSize) {
this.configsMap.put(RECORD_SIZE, recordSize);
return this;
}
public Builder disableGenerate(boolean generate) {
this.configsMap.put(DISABLE_GENERATE, generate);
return this;
}
public Builder disableIngest(boolean ingest) {
this.configsMap.put(DISABLE_INGEST, ingest);
return this;
}
public Builder withConfig(String name, Object value) {
this.configsMap.put(name, value);
return this;
}
public Builder withHiveQueryAndResults(List<Pair<String, Integer>> hiveQueries) {
this.configsMap.put(HIVE_QUERIES, hiveQueries);
return this;
}
public Builder withHiveLocal(boolean startHiveLocal) {
this.configsMap.put(HIVE_LOCAL, startHiveLocal);
return this;
}
public Builder withHiveProperties(List<String> hiveProperties) {
this.configsMap.put(HIVE_PROPERTIES, hiveProperties);
return this;
}
public Builder withConfigsMap(Map<String, Object> configsMap) {
this.configsMap = configsMap;
return this;
}
public Config build() {
return new Config(configsMap);
}
}
}
}
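
A short sketch of building a workload Config with the builder above, for a node that upserts records across two partitions; all values and the node name are hypothetical:

import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;

public class WorkloadConfigExample {
  public static Config smallUpsertWorkload() {
    return Config.newBuilder()
        .withNumRecordsToUpdate(500)     // num_records_upsert
        .withNumUpsertPartitions(2)      // num_partitions_upsert
        .withNumUpsertFiles(2)           // num_files_upsert
        .withFractionUpsertPerFile(0.5)  // fraction_upsert_per_file
        .withRecordSize(1024)            // record_size in bytes
        .withNumTimesToRepeat(1)         // repeat_count
        .withConfig(Config.NODE_NAME, "small_upsert") // name used when the dag is serialized
        .build();
  }
}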


@@ -0,0 +1,33 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.converter;
import java.io.Serializable;
import org.apache.spark.api.java.JavaRDD;
/**
* Implementations of {@link Converter} will convert data from one format to another.
*
* @param <I> Input Data Type
* @param <O> Output Data Type
*/
public interface Converter<I, O> extends Serializable {
JavaRDD<O> convert(JavaRDD<I> inputRDD);
}


@@ -0,0 +1,53 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.converter;
import java.util.List;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.integ.testsuite.generator.LazyRecordGeneratorIterator;
import org.apache.hudi.integ.testsuite.generator.UpdateGeneratorIterator;
import org.apache.spark.api.java.JavaRDD;
/**
* This converter creates an update {@link GenericRecord} from an existing {@link GenericRecord}.
*/
public class UpdateConverter implements Converter<GenericRecord, GenericRecord> {
private final String schemaStr;
// Fields that should not be mutated when converting an insert record to an update record, typically the
// partition path and record key fields
private final List<String> partitionPathFields;
private final List<String> recordKeyFields;
private final int minPayloadSize;
public UpdateConverter(String schemaStr, int minPayloadSize, List<String> partitionPathFields,
List<String> recordKeyFields) {
this.schemaStr = schemaStr;
this.partitionPathFields = partitionPathFields;
this.recordKeyFields = recordKeyFields;
this.minPayloadSize = minPayloadSize;
}
@Override
public JavaRDD<GenericRecord> convert(JavaRDD<GenericRecord> inputRDD) {
return inputRDD.mapPartitions(recordItr -> new LazyRecordGeneratorIterator(new UpdateGeneratorIterator(recordItr,
schemaStr, partitionPathFields, recordKeyFields, minPayloadSize)));
}
}
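
A usage sketch, assuming an RDD of previously inserted records and a schema whose partition path and record key fields are named as below (hypothetical field names):

import java.util.Arrays;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.integ.testsuite.converter.UpdateConverter;
import org.apache.spark.api.java.JavaRDD;

public class UpdateConverterExample {
  public static JavaRDD<GenericRecord> toUpdates(JavaRDD<GenericRecord> inserts, String schemaStr) {
    UpdateConverter converter = new UpdateConverter(
        schemaStr,
        1024,                              // minimum payload size in bytes
        Arrays.asList("timestamp"),        // hypothetical partition path field
        Arrays.asList("_row_key"));        // hypothetical record key field
    return converter.convert(inserts);     // keeps the key and partition fields stable
  }
}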


@@ -0,0 +1,217 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.fasterxml.jackson.dataformat.yaml.YAMLFactory;
import com.fasterxml.jackson.dataformat.yaml.YAMLGenerator.Feature;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
/**
* Utility class to serialize and deserialize the workflow dag.
*/
public class DagUtils {
static final ObjectMapper MAPPER = new ObjectMapper();
/**
* Converts a YAML path to {@link WorkflowDag}.
*/
public static WorkflowDag convertYamlPathToDag(FileSystem fs, String path) throws IOException {
InputStream is = fs.open(new Path(path));
return convertYamlToDag(toString(is));
}
/**
* Converts a YAML representation to {@link WorkflowDag}.
*/
public static WorkflowDag convertYamlToDag(String yaml) throws IOException {
Map<String, DagNode> allNodes = new HashMap<>();
final ObjectMapper yamlReader = new ObjectMapper(new YAMLFactory());
final JsonNode jsonNode = yamlReader.readTree(yaml);
Iterator<Entry<String, JsonNode>> itr = jsonNode.fields();
while (itr.hasNext()) {
Entry<String, JsonNode> dagNode = itr.next();
allNodes.put(dagNode.getKey(), convertJsonToDagNode(allNodes, dagNode.getValue()));
}
return new WorkflowDag(findRootNodes(allNodes));
}
/**
* Converts {@link WorkflowDag} to a YAML representation.
*/
public static String convertDagToYaml(WorkflowDag dag) throws IOException {
final ObjectMapper yamlWriter = new ObjectMapper(new YAMLFactory().disable(Feature.WRITE_DOC_START_MARKER)
.enable(Feature.MINIMIZE_QUOTES).enable(JsonParser.Feature.ALLOW_UNQUOTED_FIELD_NAMES));
JsonNode yamlNode = MAPPER.createObjectNode();
convertDagToYaml(yamlNode, dag.getNodeList());
return yamlWriter.writerWithDefaultPrettyPrinter().writeValueAsString(yamlNode);
}
private static void convertDagToYaml(JsonNode yamlNode, List<DagNode> dagNodes) throws IOException {
for (DagNode dagNode : dagNodes) {
String name = dagNode.getConfig().getOtherConfigs().getOrDefault(DeltaConfig.Config.NODE_NAME, dagNode.getName()).toString();
((ObjectNode) yamlNode).put(name, convertDagNodeToJsonNode(dagNode));
if (dagNode.getChildNodes().size() > 0) {
convertDagToYaml(yamlNode, dagNode.getChildNodes());
}
}
}
private static DagNode convertJsonToDagNode(Map<String, DagNode> allNodes, JsonNode node) throws IOException {
String type = node.get(DeltaConfig.Config.TYPE).asText();
final DagNode retNode = convertJsonToDagNode(node, type);
Arrays.asList(node.get(DeltaConfig.Config.DEPENDENCIES).textValue().split(",")).stream().forEach(dep -> {
DagNode parentNode = allNodes.get(dep);
if (parentNode != null) {
parentNode.addChildNode(retNode);
}
});
return retNode;
}
private static List<DagNode> findRootNodes(Map<String, DagNode> allNodes) {
final List<DagNode> rootNodes = new ArrayList<>();
allNodes.entrySet().stream().forEach(entry -> {
if (entry.getValue().getParentNodes().size() < 1) {
rootNodes.add(entry.getValue());
}
});
return rootNodes;
}
private static DagNode convertJsonToDagNode(JsonNode node, String type) {
try {
DeltaConfig.Config config = DeltaConfig.Config.newBuilder().withConfigsMap(convertJsonNodeToMap(node)).build();
return (DagNode) ReflectionUtils.loadClass(generateFQN(type), config);
} catch (ClassNotFoundException e) {
throw new RuntimeException(e);
}
}
private static String generateFQN(String name) throws ClassNotFoundException {
return Class.forName(StringUtils.joinUsingDelim(".",
DagNode.class.getName().substring(0, DagNode.class.getName().lastIndexOf(".")), name)).getName();
}
private static JsonNode convertDagNodeToJsonNode(DagNode node) throws IOException {
return createJsonNode(node, node.getClass().getSimpleName());
}
private static Map<String, Object> convertJsonNodeToMap(JsonNode node) {
Map<String, Object> configsMap = new HashMap<>();
Iterator<Entry<String, JsonNode>> itr = node.get(DeltaConfig.Config.CONFIG_NAME).fields();
while (itr.hasNext()) {
Entry<String, JsonNode> entry = itr.next();
switch (entry.getKey()) {
case DeltaConfig.Config.HIVE_QUERIES:
configsMap.put(DeltaConfig.Config.HIVE_QUERIES, getHiveQueries(entry));
break;
case DeltaConfig.Config.HIVE_PROPERTIES:
configsMap.put(DeltaConfig.Config.HIVE_PROPERTIES, getProperties(entry));
break;
default:
configsMap.put(entry.getKey(), getValue(entry.getValue()));
break;
}
}
return configsMap;
}
private static List<Pair<String, Integer>> getHiveQueries(Entry<String, JsonNode> entry) {
List<Pair<String, Integer>> queries = new ArrayList<>();
Iterator<Entry<String, JsonNode>> queriesItr = entry.getValue().fields();
while (queriesItr.hasNext()) {
queries.add(Pair.of(queriesItr.next().getValue().textValue(), queriesItr.next().getValue().asInt()));
}
return queries;
}
private static List<String> getProperties(Entry<String, JsonNode> entry) {
List<String> properties = new ArrayList<>();
Iterator<Entry<String, JsonNode>> queriesItr = entry.getValue().fields();
while (queriesItr.hasNext()) {
properties.add(queriesItr.next().getValue().textValue());
}
return properties;
}
private static Object getValue(JsonNode node) {
if (node.isInt()) {
return node.asInt();
} else if (node.isLong()) {
return node.asLong();
} else if (node.isShort()) {
return node.asInt();
} else if (node.isBoolean()) {
return node.asBoolean();
} else if (node.isDouble()) {
return node.asDouble();
} else if (node.isFloat()) {
return node.asDouble();
}
return node.textValue();
}
private static JsonNode createJsonNode(DagNode node, String type) throws IOException {
JsonNode configNode = MAPPER.readTree(node.getConfig().toString());
JsonNode jsonNode = MAPPER.createObjectNode();
((ObjectNode) jsonNode).put(DeltaConfig.Config.CONFIG_NAME, configNode);
((ObjectNode) jsonNode).put(DeltaConfig.Config.TYPE, type);
((ObjectNode) jsonNode).put(DeltaConfig.Config.DEPENDENCIES, getDependencyNames(node));
return jsonNode;
}
private static String getDependencyNames(DagNode node) {
return node.getParentNodes().stream()
.map(e -> ((DagNode) e).getName())
.collect(Collectors.joining(","));
}
public static String toString(InputStream inputStream) throws IOException {
ByteArrayOutputStream result = new ByteArrayOutputStream();
byte[] buffer = new byte[1024];
int length;
while ((length = inputStream.read(buffer)) != -1) {
result.write(buffer, 0, length);
}
return result.toString("utf-8");
}
}
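
To illustrate the SerDe format, a tiny dag can be built programmatically and round-tripped through YAML; each node is emitted under its name with "type", "deps" and "config" keys. Node names and sizes below are hypothetical:

import java.util.Collections;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.DagUtils;
import org.apache.hudi.integ.testsuite.dag.WorkflowDag;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode;
import org.apache.hudi.integ.testsuite.dag.nodes.UpsertNode;

public class DagYamlRoundTripExample {
  public static void main(String[] args) throws Exception {
    DagNode insert = new InsertNode(Config.newBuilder()
        .withNumRecordsToInsert(1000).withNumInsertPartitions(1).withRecordSize(1000)
        .withConfig(Config.NODE_NAME, "first_insert").build());
    insert.addChildNode(new UpsertNode(Config.newBuilder()
        .withNumRecordsToUpdate(100).withNumUpsertPartitions(1)
        .withConfig(Config.NODE_NAME, "first_upsert").build()));
    WorkflowDag dag = new WorkflowDag(Collections.singletonList(insert));

    String yaml = DagUtils.convertDagToYaml(dag);          // one YAML entry per node, keyed by node name
    System.out.println(yaml);
    WorkflowDag parsed = DagUtils.convertYamlToDag(yaml);  // rebuilds nodes and parent/child links
  }
}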


@@ -0,0 +1,54 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag;
import java.io.Serializable;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.generator.DeltaGenerator;
import org.apache.spark.api.java.JavaSparkContext;
/**
* This wraps the context needed for an execution of
* a {@link DagNode#execute(ExecutionContext)}.
*/
public class ExecutionContext implements Serializable {
private HoodieTestSuiteWriter hoodieTestSuiteWriter;
private DeltaGenerator deltaGenerator;
private transient JavaSparkContext jsc;
public ExecutionContext(JavaSparkContext jsc, HoodieTestSuiteWriter hoodieTestSuiteWriter, DeltaGenerator deltaGenerator) {
this.hoodieTestSuiteWriter = hoodieTestSuiteWriter;
this.deltaGenerator = deltaGenerator;
this.jsc = jsc;
}
public HoodieTestSuiteWriter getHoodieTestSuiteWriter() {
return hoodieTestSuiteWriter;
}
public DeltaGenerator getDeltaGenerator() {
return deltaGenerator;
}
public JavaSparkContext getJsc() {
return jsc;
}
}


@@ -0,0 +1,75 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag;
import java.util.ArrayList;
import java.util.List;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.dag.nodes.HiveQueryNode;
import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode;
import org.apache.hudi.integ.testsuite.dag.nodes.UpsertNode;
/**
* An example of how to generate a workflow dag programmatically. This is also used as the default workflow dag if
* none is provided.
*/
public class SimpleWorkflowDagGenerator implements WorkflowDagGenerator {
@Override
public WorkflowDag build() {
DagNode root = new InsertNode(DeltaConfig.Config.newBuilder()
.withNumRecordsToInsert(100)
.withNumInsertPartitions(1)
.withNumTimesToRepeat(2)
.withRecordSize(1000).build());
DagNode child1 = new InsertNode(DeltaConfig.Config.newBuilder()
.withNumRecordsToInsert(100)
.withNumInsertPartitions(1)
.withNumTimesToRepeat(2)
.withRecordSize(1000).build());
root.addChildNode(child1);
DagNode child1OfChild1 = new UpsertNode(DeltaConfig.Config.newBuilder()
.withNumRecordsToUpdate(100)
.withNumUpsertPartitions(2)
.withNumTimesToRepeat(1)
.withRecordSize(1000).build());
// Tests running 2 nodes in parallel
child1.addChildNode(child1OfChild1);
List<Pair<String, Integer>> queryAndResult = new ArrayList<>();
queryAndResult.add(Pair.of("select " + "count(*) from testdb1.table1 group "
+ "by rider having count(*) < 1", 0));
DagNode child2OfChild1 = new HiveQueryNode(DeltaConfig.Config.newBuilder()
.withHiveQueryAndResults(queryAndResult).withHiveLocal(true).build());
child1.addChildNode(child2OfChild1);
List<DagNode> rootNodes = new ArrayList<>();
rootNodes.add(root);
return new WorkflowDag(rootNodes);
}
}


@@ -0,0 +1,39 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag;
import java.util.List;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
/**
* Workflow dag that encapsulates all the nodes to be executed.
*/
public class WorkflowDag<O> {
private List<DagNode<O>> nodeList;
public WorkflowDag(List<DagNode<O>> nodeList) {
this.nodeList = nodeList;
}
public List<DagNode<O>> getNodeList() {
return nodeList;
}
}


@@ -0,0 +1,31 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag;
/**
* An interface that represents a workflow dag generator.
*/
public interface WorkflowDagGenerator {
/**
* Builds a {@link WorkflowDag}.
*/
WorkflowDag build();
}


@@ -0,0 +1,40 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag.nodes;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.spark.api.java.JavaRDD;
public class BulkInsertNode extends InsertNode {
public BulkInsertNode(Config config) {
super(config);
}
@Override
protected JavaRDD<WriteStatus> ingest(HoodieTestSuiteWriter hoodieTestSuiteWriter, Option<String> commitTime)
throws Exception {
log.info("Execute bulk ingest node {}", this.getName());
return hoodieTestSuiteWriter.bulkInsert(commitTime);
}
}


@@ -0,0 +1,34 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag.nodes;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
public class CleanNode extends DagNode<Boolean> {
public CleanNode() {
}
@Override
public void execute(ExecutionContext executionContext) throws Exception {
log.info("Executing clean node {}", this.getName());
executionContext.getHoodieTestSuiteWriter().getWriteClient(this).clean();
}
}


@@ -0,0 +1,47 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag.nodes;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
import org.apache.spark.api.java.JavaRDD;
public class CompactNode extends DagNode<JavaRDD<WriteStatus>> {
public CompactNode(Config config) {
this.config = config;
}
@Override
public void execute(ExecutionContext executionContext) throws Exception {
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(executionContext.getHoodieTestSuiteWriter().getConfiguration(),
executionContext.getHoodieTestSuiteWriter().getCfg().targetBasePath);
Option<HoodieInstant> lastInstant = metaClient.getActiveTimeline()
.getCommitsAndCompactionTimeline().filterPendingCompactionTimeline().lastInstant();
if (lastInstant.isPresent()) {
log.info("Compacting instant {}", lastInstant.get());
this.result = executionContext.getHoodieTestSuiteWriter().compact(Option.of(lastInstant.get().getTimestamp()));
}
}
}


@@ -0,0 +1,125 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag.nodes;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Objects;
import java.util.UUID;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Represents a Node in the DAG of operations for a workflow.
*/
public abstract class DagNode<O> implements Comparable<DagNode<O>> {
protected static Logger log = LoggerFactory.getLogger(DagNode.class);
protected List<DagNode<O>> childNodes;
protected List<DagNode<O>> parentNodes;
protected O result;
protected Config config;
private boolean isCompleted;
public DagNode<O> addChildNode(DagNode childNode) {
childNode.getParentNodes().add(this);
getChildNodes().add(childNode);
return this;
}
public DagNode<O> addParentNode(DagNode parentNode) {
if (!this.getParentNodes().contains(parentNode)) {
this.getParentNodes().add(parentNode);
}
return this;
}
public O getResult() {
return result;
}
public List<DagNode<O>> getChildNodes() {
if (childNodes == null) {
childNodes = new LinkedList<>();
}
return childNodes;
}
public List<DagNode<O>> getParentNodes() {
if (parentNodes == null) {
this.parentNodes = new ArrayList<>();
}
return this.parentNodes;
}
public void setParentNodes(List<DagNode<O>> parentNodes) {
this.parentNodes = parentNodes;
}
public abstract void execute(ExecutionContext context) throws Exception;
public boolean isCompleted() {
return isCompleted;
}
public void setCompleted(boolean completed) {
isCompleted = completed;
}
public Config getConfig() {
return config;
}
public String getName() {
Object name = this.config.getOtherConfigs().get(Config.NODE_NAME);
if (name == null) {
String randomName = UUID.randomUUID().toString();
this.config.getOtherConfigs().put(Config.NODE_NAME, randomName);
return randomName;
}
return name.toString();
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
DagNode<?> dagNode = (DagNode<?>) o;
return getName().equals(dagNode.getName());
}
@Override
public int hashCode() {
return Objects.hash(getName());
}
@Override
public int compareTo(DagNode<O> thatNode) {
return Integer.compare(this.hashCode(), thatNode.hashCode());
}
}
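
The abstraction above can be extended with custom nodes. A hypothetical example (not part of this commit) that only logs its workload configuration; to be loadable from YAML it would need to live in the dag.nodes package, since DagUtils resolves node types by simple class name:

import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;

// Hypothetical node: records how many inserts the workload asked for, then marks itself done.
public class LogConfigNode extends DagNode<Boolean> {

  public LogConfigNode(Config config) {
    this.config = config;
  }

  @Override
  public void execute(ExecutionContext context) throws Exception {
    log.info("Node {} is configured to insert {} records", getName(), config.getNumRecordsInsert());
    this.result = true;
  }
}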


@@ -0,0 +1,87 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag.nodes;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import org.apache.hudi.DataSourceUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
import org.apache.hudi.integ.testsuite.helpers.HiveServiceProvider;
public class HiveQueryNode extends DagNode<Boolean> {
private HiveServiceProvider hiveServiceProvider;
public HiveQueryNode(DeltaConfig.Config config) {
this.config = config;
this.hiveServiceProvider = new HiveServiceProvider(config);
}
@Override
public void execute(ExecutionContext executionContext) throws Exception {
log.info("Executing hive query node {}", this.getName());
this.hiveServiceProvider.startLocalHiveServiceIfNeeded(executionContext.getHoodieTestSuiteWriter().getConfiguration());
HiveSyncConfig hiveSyncConfig = DataSourceUtils
.buildHiveSyncConfig(executionContext.getHoodieTestSuiteWriter().getDeltaStreamerWrapper()
.getDeltaSyncService().getDeltaSync().getProps(),
executionContext.getHoodieTestSuiteWriter().getDeltaStreamerWrapper()
.getDeltaSyncService().getDeltaSync().getCfg().targetBasePath,
executionContext.getHoodieTestSuiteWriter().getDeltaStreamerWrapper()
.getDeltaSyncService().getDeltaSync().getCfg().baseFileFormat);
this.hiveServiceProvider.syncToLocalHiveIfNeeded(executionContext.getHoodieTestSuiteWriter());
Connection con = DriverManager.getConnection(hiveSyncConfig.jdbcUrl, hiveSyncConfig.hiveUser,
hiveSyncConfig.hivePass);
Statement stmt = con.createStatement();
stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat");
for (String hiveProperty : this.config.getHiveProperties()) {
executeStatement(hiveProperty, stmt);
}
for (Pair<String, Integer> queryAndResult : this.config.getHiveQueries()) {
log.info("Running {}", queryAndResult.getLeft());
ResultSet res = stmt.executeQuery(queryAndResult.getLeft());
if (!res.next()) {
log.info("res.next() was false - the query returned no rows.");
assert 0 == queryAndResult.getRight();
} else {
Integer result = res.getInt(1);
if (!queryAndResult.getRight().equals(result)) {
throw new AssertionError(
"QUERY: " + queryAndResult.getLeft()
+ " | EXPECTED RESULT = " + queryAndResult.getRight()
+ " | ACTUAL RESULT = " + result
);
}
}
log.info("Successfully validated query!");
}
this.hiveServiceProvider.stopLocalHiveServiceIfNeeded();
}
private void executeStatement(String query, Statement stmt) throws SQLException {
log.info("Executing statement {}", query);
stmt.execute(query);
}
}
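
HiveQueryNode validates each configured Hive query against an expected scalar result (typically a row count). A hedged sketch of the query/expected-result pairs it consumes; the builder that puts these pairs into the Config is not shown in this diff, so the values below are purely illustrative:

List<Pair<String, Integer>> hiveQueries = Arrays.asList(
    Pair.of("select count(*) from testdb.test_table", 1000),
    Pair.of("select count(*) from testdb.test_table where _hoodie_record_key is not null", 1000));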

View File

@@ -0,0 +1,46 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag.nodes;
import org.apache.hudi.integ.testsuite.helpers.HiveServiceProvider;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
public class HiveSyncNode extends DagNode<Boolean> {
private HiveServiceProvider hiveServiceProvider;
public HiveSyncNode(Config config) {
this.config = config;
this.hiveServiceProvider = new HiveServiceProvider(config);
}
@Override
public void execute(ExecutionContext executionContext) throws Exception {
log.info("Executing hive sync node");
this.hiveServiceProvider.startLocalHiveServiceIfNeeded(executionContext.getHoodieTestSuiteWriter().getConfiguration());
this.hiveServiceProvider.syncToLocalHiveIfNeeded(executionContext.getHoodieTestSuiteWriter());
executionContext.getHoodieTestSuiteWriter().getDeltaStreamerWrapper().getDeltaSyncService().getDeltaSync().syncHive();
this.hiveServiceProvider.stopLocalHiveServiceIfNeeded();
}
public HiveServiceProvider getHiveServiceProvider() {
return hiveServiceProvider;
}
}

View File

@@ -0,0 +1,60 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag.nodes;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
import org.apache.hudi.integ.testsuite.generator.DeltaGenerator;
import org.apache.spark.api.java.JavaRDD;
public class InsertNode extends DagNode<JavaRDD<WriteStatus>> {
public InsertNode(Config config) {
this.config = config;
}
@Override
public void execute(ExecutionContext executionContext) throws Exception {
generate(executionContext.getDeltaGenerator());
log.info("Configs : {}", this.config);
if (!config.isDisableIngest()) {
log.info("Inserting input data {}", this.getName());
Option<String> commitTime = executionContext.getHoodieTestSuiteWriter().startCommit();
JavaRDD<WriteStatus> writeStatus = ingest(executionContext.getHoodieTestSuiteWriter(), commitTime);
executionContext.getHoodieTestSuiteWriter().commit(writeStatus, commitTime);
this.result = writeStatus;
}
}
protected void generate(DeltaGenerator deltaGenerator) throws Exception {
if (!config.isDisableGenerate()) {
log.info("Generating input data for node {}", this.getName());
deltaGenerator.writeRecords(deltaGenerator.generateInserts(config)).count();
}
}
protected JavaRDD<WriteStatus> ingest(HoodieTestSuiteWriter hoodieTestSuiteWriter,
Option<String> commitTime) throws Exception {
return hoodieTestSuiteWriter.insert(commitTime);
}
}
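
Subclasses customize the node by overriding generate(...) and/or ingest(...). A hypothetical bulk-insert variant (not part of this commit, and assuming HoodieTestSuiteWriter exposes a bulkInsert(Option<String>) counterpart to insert/upsert) might look like:

public class BulkInsertNode extends InsertNode {
  public BulkInsertNode(Config config) {
    super(config);
  }
  @Override
  protected JavaRDD<WriteStatus> ingest(HoodieTestSuiteWriter hoodieTestSuiteWriter, Option<String> commitTime) throws Exception {
    // assumption: bulkInsert(...) exists on the writer alongside insert(...) and upsert(...)
    return hoodieTestSuiteWriter.bulkInsert(commitTime);
  }
}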

View File

@@ -0,0 +1,48 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag.nodes;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
public class RollbackNode extends DagNode<Option<HoodieInstant>> {
public RollbackNode(Config config) {
this.config = config;
}
@Override
public void execute(ExecutionContext executionContext) throws Exception {
log.info("Executing rollback node {}", this.getName());
// Rollback can only be performed by instantiating a new WriteClient, hence it cannot be exercised via the
// DeltaStreamer path for now
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(executionContext.getHoodieTestSuiteWriter().getConfiguration(),
executionContext.getHoodieTestSuiteWriter().getCfg().targetBasePath);
Option<HoodieInstant> lastInstant = metaClient.getActiveTimeline().getCommitsTimeline().lastInstant();
if (lastInstant.isPresent()) {
log.info("Rolling back last instant {}", lastInstant.get());
executionContext.getHoodieTestSuiteWriter().getWriteClient(this).rollback(lastInstant.get().getTimestamp());
this.result = lastInstant;
}
}
}

View File

@@ -0,0 +1,56 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag.nodes;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
public class ScheduleCompactNode extends DagNode<Option<String>> {
public ScheduleCompactNode(Config config) {
this.config = config;
}
@Override
public void execute(ExecutionContext executionContext) throws Exception {
log.info("Executing schedule compact node {}", this.getName());
// Scheduling compaction can only be performed by instantiating a new WriteClient, hence it cannot be exercised
// via the DeltaStreamer path for now
// Find the last commit and extract its extra metadata to pass on to the scheduled compaction. This is
// done to ensure the CHECKPOINT is correctly carried forward from commit to commit
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(executionContext.getHoodieTestSuiteWriter().getConfiguration(),
executionContext.getHoodieTestSuiteWriter().getCfg().targetBasePath);
Option<HoodieInstant> lastInstant = metaClient.getActiveTimeline().getCommitsTimeline().lastInstant();
if (lastInstant.isPresent()) {
HoodieCommitMetadata metadata = HoodieCommitMetadata.fromBytes(metaClient
.getActiveTimeline().getInstantDetails(lastInstant.get()).get(), HoodieCommitMetadata.class);
Option<String> scheduledInstant = executionContext.getHoodieTestSuiteWriter().scheduleCompaction(Option.of(metadata
.getExtraMetadata()));
if (scheduledInstant.isPresent()) {
log.info("Scheduling compaction instant {}", scheduledInstant.get());
}
this.result = scheduledInstant;
}
}
}

View File

@@ -0,0 +1,61 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag.nodes;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.integ.testsuite.helpers.HiveServiceProvider;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
public class SparkSQLQueryNode extends DagNode<Boolean> {
HiveServiceProvider hiveServiceProvider;
public SparkSQLQueryNode(Config config) {
this.config = config;
this.hiveServiceProvider = new HiveServiceProvider(config);
}
@Override
public void execute(ExecutionContext executionContext) throws Exception {
log.info("Executing spark sql query node");
this.hiveServiceProvider.startLocalHiveServiceIfNeeded(executionContext.getHoodieTestSuiteWriter().getConfiguration());
this.hiveServiceProvider.syncToLocalHiveIfNeeded(executionContext.getHoodieTestSuiteWriter());
SparkSession session = SparkSession.builder().sparkContext(executionContext.getJsc().sc()).getOrCreate();
for (String hiveProperty : this.config.getHiveProperties()) {
session.sql(hiveProperty).count();
}
for (Pair<String, Integer> queryAndResult : this.config.getHiveQueries()) {
log.info("Running {}", queryAndResult.getLeft());
Dataset<Row> res = session.sql(queryAndResult.getLeft());
if (res.count() == 0) {
assert 0 == queryAndResult.getRight();
} else {
assert ((Row[]) res.collect())[0].getInt(0) == queryAndResult.getRight();
}
log.info("Successfully validated query!");
}
this.hiveServiceProvider.stopLocalHiveServiceIfNeeded();
this.result = true;
}
}

View File

@@ -0,0 +1,52 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag.nodes;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.generator.DeltaGenerator;
import org.apache.spark.api.java.JavaRDD;
public class UpsertNode extends InsertNode {
public UpsertNode(Config config) {
super(config);
}
@Override
protected void generate(DeltaGenerator deltaGenerator) throws Exception {
if (!config.isDisableGenerate()) {
log.info("Generating input data {}", this.getName());
deltaGenerator.writeRecords(deltaGenerator.generateUpdates(config)).count();
}
}
@Override
protected JavaRDD<WriteStatus> ingest(HoodieTestSuiteWriter hoodieTestSuiteWriter, Option<String> commitTime)
throws Exception {
if (!config.isDisableIngest()) {
log.info("Upserting input data {}", this.getName());
this.result = hoodieTestSuiteWriter.upsert(commitTime);
}
return this.result;
}
}

View File

@@ -0,0 +1,48 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag.nodes;
import java.util.List;
import java.util.function.Function;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
public class ValidateNode<R> extends DagNode {
protected Function<List<DagNode>, R> function;
public ValidateNode(Config config, Function<List<DagNode>, R> function) {
this.function = function;
this.config = config;
}
@Override
public void execute(ExecutionContext executionContext) {
if (this.getParentNodes().size() > 0 && (Boolean) this.config.getOtherConfigs().getOrDefault("WAIT_FOR_PARENTS",
true)) {
for (DagNode node : (List<DagNode>) this.getParentNodes()) {
if (!node.isCompleted()) {
throw new RuntimeException("cannot validate before parent nodes are complete");
}
}
}
this.result = this.function.apply((List<DagNode>) this.getParentNodes());
}
}
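
A hedged sketch of plugging in a validation function, assuming validateConfig is a DeltaConfig.Config built elsewhere; the lambda simply checks that every parent node produced a non-null result:

ValidateNode<Boolean> sanityCheck = new ValidateNode<>(validateConfig,
    parentNodes -> parentNodes.stream().allMatch(parent -> parent.getResult() != null));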

View File

@@ -0,0 +1,100 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag.scheduler;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.PriorityQueue;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
import org.apache.hudi.integ.testsuite.dag.WorkflowDag;
import org.apache.hudi.integ.testsuite.generator.DeltaGenerator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class DagScheduler {
private static Logger log = LoggerFactory.getLogger(DagScheduler.class);
private WorkflowDag workflowDag;
private ExecutionContext executionContext;
public DagScheduler(WorkflowDag workflowDag, HoodieTestSuiteWriter hoodieTestSuiteWriter, DeltaGenerator deltaGenerator) {
this.workflowDag = workflowDag;
this.executionContext = new ExecutionContext(null, hoodieTestSuiteWriter, deltaGenerator);
}
public void schedule() throws Exception {
ExecutorService service = Executors.newFixedThreadPool(2);
try {
execute(service, workflowDag.getNodeList());
service.shutdown();
} finally {
if (!service.isShutdown()) {
log.info("Forcing shutdown of executor service, this might kill running tasks");
service.shutdownNow();
}
}
}
private void execute(ExecutorService service, List<DagNode> nodes) throws Exception {
// Nodes at the same level are executed in parallel
Queue<DagNode> queue = new PriorityQueue<>(nodes);
log.info("Running workloads");
do {
List<Future> futures = new ArrayList<>();
Set<DagNode> childNodes = new HashSet<>();
while (queue.size() > 0) {
DagNode nodeToExecute = queue.poll();
futures.add(service.submit(() -> executeNode(nodeToExecute)));
if (nodeToExecute.getChildNodes().size() > 0) {
childNodes.addAll(nodeToExecute.getChildNodes());
}
}
queue.addAll(childNodes);
childNodes.clear();
for (Future future : futures) {
future.get(1, TimeUnit.HOURS);
}
} while (queue.size() > 0);
log.info("Finished workloads");
}
private void executeNode(DagNode node) {
if (node.isCompleted()) {
throw new RuntimeException("DagNode already completed! Cannot re-execute");
}
try {
node.execute(executionContext);
node.setCompleted(true);
log.info("Finished executing {}", node.getName());
} catch (Exception e) {
log.error("Exception executing node {}", node.getName(), e);
throw new HoodieException(e);
}
}
}
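
A hedged end-to-end sketch of driving a dag through the scheduler; the WorkflowDag constructor shown here is an assumption (it is defined elsewhere in this commit), and rootInsertNode, testSuiteWriter and deltaGenerator are assumed to be built by the caller:

WorkflowDag dag = new WorkflowDag(Arrays.asList(rootInsertNode)); // assumed to accept the root-level nodes
new DagScheduler(dag, testSuiteWriter, deltaGenerator).schedule(); // executes the dag level by level, two nodes at a time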

View File

@@ -0,0 +1,237 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.generator;
import java.io.IOException;
import java.io.Serializable;
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.stream.StreamSupport;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.converter.Converter;
import org.apache.hudi.integ.testsuite.converter.UpdateConverter;
import org.apache.hudi.integ.testsuite.reader.DFSAvroDeltaInputReader;
import org.apache.hudi.integ.testsuite.reader.DFSHoodieDatasetInputReader;
import org.apache.hudi.integ.testsuite.reader.DeltaInputReader;
import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode;
import org.apache.hudi.integ.testsuite.writer.DeltaWriteStats;
import org.apache.hudi.integ.testsuite.writer.DeltaWriterAdapter;
import org.apache.hudi.integ.testsuite.writer.DeltaWriterFactory;
import org.apache.hudi.keygen.ComplexKeyGenerator;
import org.apache.hudi.keygen.KeyGenerator;
import org.apache.hudi.keygen.SimpleKeyGenerator;
import org.apache.hudi.integ.testsuite.configuration.DFSDeltaConfig;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.storage.StorageLevel;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;
/**
* The delta generator generates all types of workloads (insert, update) for the given configs.
*/
public class DeltaGenerator implements Serializable {
private static Logger log = LoggerFactory.getLogger(DeltaGenerator.class);
private DeltaConfig deltaOutputConfig;
private transient JavaSparkContext jsc;
private transient SparkSession sparkSession;
private String schemaStr;
private List<String> recordRowKeyFieldNames;
private List<String> partitionPathFieldNames;
private int batchId;
public DeltaGenerator(DeltaConfig deltaOutputConfig, JavaSparkContext jsc, SparkSession sparkSession,
String schemaStr,
KeyGenerator keyGenerator) {
this.deltaOutputConfig = deltaOutputConfig;
this.jsc = jsc;
this.sparkSession = sparkSession;
this.schemaStr = schemaStr;
this.recordRowKeyFieldNames = keyGenerator instanceof ComplexKeyGenerator ? ((ComplexKeyGenerator) keyGenerator)
.getRecordKeyFields() : Arrays.asList(((SimpleKeyGenerator) keyGenerator).getRecordKeyField());
this.partitionPathFieldNames = keyGenerator instanceof ComplexKeyGenerator ? ((ComplexKeyGenerator) keyGenerator)
.getPartitionPathFields() : Arrays.asList(((SimpleKeyGenerator) keyGenerator).getPartitionPathField());
}
public JavaRDD<DeltaWriteStats> writeRecords(JavaRDD<GenericRecord> records) {
// Create the writer adapter inside mapPartitions so that non-serializable writer state is instantiated
// on the executors instead of being captured in the closure (which would cause serialization issues)
JavaRDD<DeltaWriteStats> ws = records.mapPartitions(itr -> {
try {
DeltaWriterAdapter<GenericRecord> deltaWriterAdapter = DeltaWriterFactory
.getDeltaWriterAdapter(deltaOutputConfig, batchId);
return Collections.singletonList(deltaWriterAdapter.write(itr)).iterator();
} catch (IOException io) {
throw new UncheckedIOException(io);
}
}).flatMap(List::iterator);
batchId++;
return ws;
}
public JavaRDD<GenericRecord> generateInserts(Config operation) {
long recordsPerPartition = operation.getNumRecordsInsert();
int minPayloadSize = operation.getRecordSize();
JavaRDD<GenericRecord> inputBatch = jsc.parallelize(Collections.EMPTY_LIST)
.repartition(operation.getNumInsertPartitions()).mapPartitions(p -> {
return new LazyRecordGeneratorIterator(new FlexibleSchemaRecordGenerationIterator(recordsPerPartition,
minPayloadSize, schemaStr, partitionPathFieldNames));
});
return inputBatch;
}
public JavaRDD<GenericRecord> generateUpdates(Config config) throws IOException {
if (deltaOutputConfig.getDeltaOutputMode() == DeltaOutputMode.DFS) {
JavaRDD<GenericRecord> inserts = null;
if (config.getNumRecordsInsert() > 0) {
inserts = generateInserts(config);
}
DeltaInputReader deltaInputReader = null;
JavaRDD<GenericRecord> adjustedRDD = null;
if (config.getNumUpsertPartitions() < 1) {
// randomly generate updates for a given number of records without regard to partitions and files
deltaInputReader = new DFSAvroDeltaInputReader(sparkSession, schemaStr,
((DFSDeltaConfig) deltaOutputConfig).getDeltaBasePath(), Option.empty(), Option.empty());
adjustedRDD = deltaInputReader.read(config.getNumRecordsUpsert());
adjustedRDD = adjustRDDToGenerateExactNumUpdates(adjustedRDD, jsc, config.getNumRecordsUpsert());
} else {
deltaInputReader =
new DFSHoodieDatasetInputReader(jsc, ((DFSDeltaConfig) deltaOutputConfig).getDatasetOutputPath(),
schemaStr);
if (config.getFractionUpsertPerFile() > 0) {
adjustedRDD = deltaInputReader.read(config.getNumUpsertPartitions(), config.getNumUpsertFiles(),
config.getFractionUpsertPerFile());
} else {
adjustedRDD = deltaInputReader.read(config.getNumUpsertPartitions(), config.getNumUpsertFiles(), config
.getNumRecordsUpsert());
}
}
log.info("Repartitioning records");
// persist this since we will make multiple passes over this
adjustedRDD = adjustedRDD.repartition(jsc.defaultParallelism());
log.info("Repartitioning records done");
Converter converter = new UpdateConverter(schemaStr, config.getRecordSize(),
partitionPathFieldNames, recordRowKeyFieldNames);
JavaRDD<GenericRecord> updates = converter.convert(adjustedRDD);
log.info("Records converted");
updates.persist(StorageLevel.DISK_ONLY());
return inserts != null ? inserts.union(updates) : updates;
// TODO : Generate updates for only N partitions.
} else {
throw new IllegalArgumentException("Other formats are not supported at the moment");
}
}
public Map<Integer, Long> getPartitionToCountMap(JavaRDD<GenericRecord> records) {
// Requires us to keep the partitioner the same
return records.mapPartitionsWithIndex((index, itr) -> {
Iterable<GenericRecord> newIterable = () -> itr;
// parallelize counting for speed
long count = StreamSupport.stream(newIterable.spliterator(), true).count();
return Arrays.asList(new Tuple2<>(index, count)).iterator();
}, true).mapToPair(i -> i).collectAsMap();
}
public Map<Integer, Long> getAdjustedPartitionsCount(Map<Integer, Long> partitionCountMap, long
recordsToRemove) {
long remainingRecordsToRemove = recordsToRemove;
Iterator<Map.Entry<Integer, Long>> iterator = partitionCountMap.entrySet().iterator();
Map<Integer, Long> adjustedPartitionCountMap = new HashMap<>();
while (iterator.hasNext()) {
Map.Entry<Integer, Long> entry = iterator.next();
if (entry.getValue() < remainingRecordsToRemove) {
remainingRecordsToRemove -= entry.getValue();
adjustedPartitionCountMap.put(entry.getKey(), 0L);
} else {
long newValue = entry.getValue() - remainingRecordsToRemove;
remainingRecordsToRemove = 0;
adjustedPartitionCountMap.put(entry.getKey(), newValue);
}
if (remainingRecordsToRemove == 0) {
break;
}
}
return adjustedPartitionCountMap;
}
public JavaRDD<GenericRecord> adjustRDDToGenerateExactNumUpdates(JavaRDD<GenericRecord> updates, JavaSparkContext
jsc, long totalRecordsRequired) {
Map<Integer, Long> actualPartitionCountMap = getPartitionToCountMap(updates);
long totalRecordsGenerated = actualPartitionCountMap.values().stream().mapToLong(Long::longValue).sum();
if (isSafeToTake(totalRecordsRequired, totalRecordsGenerated)) {
// Generate totalRecordsRequired - totalRecordsGenerated new records and union the RDD's
// NOTE : This performs poorly when totalRecordsRequired >> totalRecordsGenerated. Hence, always
// ensure that enough inserts are created beforehand (this needs to be noted during the WorkflowDag creation)
long sizeOfUpdateRDD = totalRecordsGenerated;
while (totalRecordsRequired != sizeOfUpdateRDD) {
long recordsToTake = (totalRecordsRequired - sizeOfUpdateRDD) > sizeOfUpdateRDD
? sizeOfUpdateRDD : (totalRecordsRequired - sizeOfUpdateRDD);
if ((totalRecordsRequired - sizeOfUpdateRDD) > recordsToTake && recordsToTake <= sizeOfUpdateRDD) {
updates = updates.union(updates);
sizeOfUpdateRDD *= 2;
} else {
List<GenericRecord> remainingUpdates = updates.take((int) (recordsToTake));
updates = updates.union(jsc.parallelize(remainingUpdates));
sizeOfUpdateRDD = sizeOfUpdateRDD + recordsToTake;
}
}
return updates;
} else if (totalRecordsRequired < totalRecordsGenerated) {
final Map<Integer, Long> adjustedPartitionCountMap = getAdjustedPartitionsCount(actualPartitionCountMap,
totalRecordsGenerated - totalRecordsRequired);
// limit counts across partitions to meet the exact number of updates required
JavaRDD<GenericRecord> trimmedRecords = updates.mapPartitionsWithIndex((index, itr) -> {
int counter = 1;
List<GenericRecord> entriesToKeep = new ArrayList<>();
if (!adjustedPartitionCountMap.containsKey(index)) {
return itr;
} else {
long recordsToKeepForThisPartition = adjustedPartitionCountMap.get(index);
while (counter <= recordsToKeepForThisPartition && itr.hasNext()) {
entriesToKeep.add(itr.next());
counter++;
}
return entriesToKeep.iterator();
}
}, true);
return trimmedRecords;
}
return updates;
}
private boolean isSafeToTake(long totalRecords, long totalRecordsGenerated) {
// TODO : Ensure that the difference between totalRecords and totalRecordsGenerated is not too big; if it is,
// there are too few records on disk, and we need to find another way to generate updates when
// requiredUpdates >> insertedRecords
return totalRecords > totalRecordsGenerated;
}
}
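
A short usage sketch mirroring what InsertNode and UpsertNode do: generate a batch and force the lazy DFS write by counting the write stats. Here config is an assumed DeltaConfig.Config describing the batch:

JavaRDD<GenericRecord> inserts = deltaGenerator.generateInserts(config);
deltaGenerator.writeRecords(inserts).count(); // materializes the batch on DFS and bumps the internal batchId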

View File

@@ -0,0 +1,70 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.generator;
import java.util.Iterator;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
/**
* A record generation iterator for the custom schema of the workload. Implements {@link Iterator} to allow for
* lazy, one-record-at-a-time generation semantics.
*/
public class FlexibleSchemaRecordGenerationIterator implements Iterator<GenericRecord> {
// Stores how many records to generate as part of this iterator. Ideally, one iterator is started per spark
// partition.
private long counter;
// Use the full payload generator as default
private GenericRecordFullPayloadGenerator generator;
// Keep the first generated record so that its partition path values can be reused for all subsequent payloads
private GenericRecord lastRecord;
// Partition path field names
private List<String> partitionPathFieldNames;
public FlexibleSchemaRecordGenerationIterator(long maxEntriesToProduce, String schema) {
this(maxEntriesToProduce, GenericRecordFullPayloadGenerator.DEFAULT_PAYLOAD_SIZE, schema, null);
}
public FlexibleSchemaRecordGenerationIterator(long maxEntriesToProduce, int minPayloadSize, String schemaStr,
List<String> partitionPathFieldNames) {
this.counter = maxEntriesToProduce;
this.partitionPathFieldNames = partitionPathFieldNames;
Schema schema = new Schema.Parser().parse(schemaStr);
this.generator = new GenericRecordFullPayloadGenerator(schema, minPayloadSize);
}
@Override
public boolean hasNext() {
return this.counter > 0;
}
@Override
public GenericRecord next() {
this.counter--;
if (lastRecord == null) {
GenericRecord record = this.generator.getNewPayload();
lastRecord = record;
return record;
} else {
return this.generator.randomize(lastRecord, this.partitionPathFieldNames);
}
}
}
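
A hedged sketch of using the iterator directly, outside of Spark; schemaStr is an assumed Avro record schema string and partitionFields its partition path field names:

Iterator<GenericRecord> itr =
    new FlexibleSchemaRecordGenerationIterator(100, 10 * 1024, schemaStr, partitionFields);
while (itr.hasNext()) {
  GenericRecord record = itr.next(); // later records reuse the partition path values of the first record
  // hand 'record' off to a writer here; note the iterator may reuse the same underlying record instance
}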

View File

@@ -0,0 +1,285 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.generator;
import java.io.Serializable;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.UUID;
import org.apache.avro.LogicalType;
import org.apache.avro.LogicalTypes;
import org.apache.avro.Schema;
import org.apache.avro.Schema.Type;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericData.Fixed;
import org.apache.avro.generic.GenericFixed;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.util.collection.Pair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* This is a GenericRecord payload generator that generates full generic records {@link GenericRecord}.
* Every field of a generic record created using this generator contains a random value.
*/
public class GenericRecordFullPayloadGenerator implements Serializable {
public static final int DEFAULT_PAYLOAD_SIZE = 1024 * 10; // 10 KB
private static Logger log = LoggerFactory.getLogger(GenericRecordFullPayloadGenerator.class);
protected final Random random = new Random();
// The source schema used to generate a payload
private final transient Schema baseSchema;
// Used to validate a generic record
private final transient GenericData genericData = new GenericData();
// Number of more bytes to add based on the estimated full record payload size and min payload size
private int numberOfBytesToAdd;
// If more elements should be packed to meet the minPayloadSize
private boolean shouldAddMore;
// How many complex fields have we visited that can help us pack more entries and increase the size of the record
private int numberOfComplexFields;
// The size of a full record where every field of a generic record created contains 1 random value
private int estimatedFullPayloadSize;
// LogicalTypes in Avro 1.8.2
private static final String DECIMAL = "decimal";
private static final String UUID_NAME = "uuid";
private static final String DATE = "date";
private static final String TIME_MILLIS = "time-millis";
private static final String TIME_MICROS = "time-micros";
private static final String TIMESTAMP_MILLIS = "timestamp-millis";
private static final String TIMESTAMP_MICROS = "timestamp-micros";
public GenericRecordFullPayloadGenerator(Schema schema) {
this(schema, DEFAULT_PAYLOAD_SIZE);
}
public GenericRecordFullPayloadGenerator(Schema schema, int minPayloadSize) {
Pair<Integer, Integer> sizeInfo = new GenericRecordFullPayloadSizeEstimator(schema)
.typeEstimateAndNumComplexFields();
this.estimatedFullPayloadSize = sizeInfo.getLeft();
this.numberOfComplexFields = sizeInfo.getRight();
this.baseSchema = schema;
this.shouldAddMore = estimatedFullPayloadSize < minPayloadSize;
if (this.shouldAddMore) {
this.numberOfBytesToAdd = minPayloadSize - estimatedFullPayloadSize;
if (numberOfComplexFields < 1) {
log.warn("The schema does not have any collections/complex fields. Cannot achieve minPayloadSize : {}",
minPayloadSize);
}
}
}
protected static boolean isPrimitive(Schema localSchema) {
return localSchema.getType() != Type.ARRAY
&& localSchema.getType() != Type.MAP
&& localSchema.getType() != Type.RECORD
&& localSchema.getType() != Type.UNION;
}
public GenericRecord getNewPayload() {
return convert(baseSchema);
}
public GenericRecord getUpdatePayload(GenericRecord record, List<String> blacklistFields) {
return randomize(record, blacklistFields);
}
protected GenericRecord convert(Schema schema) {
GenericRecord result = new GenericData.Record(schema);
for (Schema.Field f : schema.getFields()) {
result.put(f.name(), typeConvert(f.schema()));
}
return result;
}
protected GenericRecord convertPartial(Schema schema) {
GenericRecord result = new GenericData.Record(schema);
for (Schema.Field f : schema.getFields()) {
boolean setNull = random.nextBoolean();
if (!setNull) {
result.put(f.name(), typeConvert(f.schema()));
} else {
result.put(f.name(), null);
}
}
// TODO : pack remaining bytes into a complex field
return result;
}
protected GenericRecord randomize(GenericRecord record, List<String> blacklistFields) {
for (Schema.Field f : record.getSchema().getFields()) {
if (blacklistFields == null || !blacklistFields.contains(f.name())) {
record.put(f.name(), typeConvert(f.schema()));
}
}
return record;
}
private Object typeConvert(Schema schema) {
Schema localSchema = schema;
if (isOption(schema)) {
localSchema = getNonNull(schema);
}
switch (localSchema.getType()) {
case BOOLEAN:
return random.nextBoolean();
case DOUBLE:
return random.nextDouble();
case FLOAT:
return random.nextFloat();
case INT:
return random.nextInt();
case LONG:
return random.nextLong();
case STRING:
return UUID.randomUUID().toString();
case ENUM:
List<String> enumSymbols = localSchema.getEnumSymbols();
return new GenericData.EnumSymbol(localSchema, enumSymbols.get(random.nextInt(enumSymbols.size())));
case RECORD:
return convert(localSchema);
case ARRAY:
Schema elementSchema = localSchema.getElementType();
List listRes = new ArrayList();
if (isPrimitive(elementSchema) && this.shouldAddMore) {
int numEntriesToAdd = numEntriesToAdd(elementSchema);
while (numEntriesToAdd > 0) {
listRes.add(typeConvert(elementSchema));
numEntriesToAdd--;
}
} else {
listRes.add(typeConvert(elementSchema));
}
return listRes;
case MAP:
Schema valueSchema = localSchema.getValueType();
Map<String, Object> mapRes = new HashMap<String, Object>();
if (isPrimitive(valueSchema) && this.shouldAddMore) {
int numEntriesToAdd = numEntriesToAdd(valueSchema);
while (numEntriesToAdd > 0) {
mapRes.put(UUID.randomUUID().toString(), typeConvert(valueSchema));
numEntriesToAdd--;
}
} else {
mapRes.put(UUID.randomUUID().toString(), typeConvert(valueSchema));
}
return mapRes;
case BYTES:
return ByteBuffer.wrap(UUID.randomUUID().toString().getBytes(Charset.defaultCharset()));
case FIXED:
return generateFixedType(localSchema);
default:
throw new IllegalArgumentException(
"Cannot handle type: " + localSchema.getType());
}
}
private Object generateFixedType(Schema localSchema) {
// TODO: Need to implement valid data generation for fixed type
GenericFixed genericFixed = new GenericData.Fixed(localSchema);
switch (localSchema.getLogicalType().getName()) {
case UUID_NAME:
((Fixed) genericFixed).bytes(UUID.randomUUID().toString().getBytes());
return genericFixed;
case DECIMAL:
return genericFixed;
case DATE:
return genericFixed;
case TIME_MILLIS:
return genericFixed;
default:
throw new IllegalArgumentException(
"Cannot handle type: " + localSchema.getLogicalType());
}
}
public boolean validate(GenericRecord record) {
return genericData.validate(baseSchema, record);
}
protected boolean isOption(Schema schema) {
return schema.getType().equals(Schema.Type.UNION)
&& schema.getTypes().size() == 2
&& (schema.getTypes().get(0).getType().equals(Schema.Type.NULL)
|| schema.getTypes().get(1).getType().equals(Schema.Type.NULL));
}
protected Schema getNonNull(Schema schema) {
List<Schema> types = schema.getTypes();
return types.get(0).getType().equals(Schema.Type.NULL) ? types.get(1) : types.get(0);
}
public int getEstimatedFullPayloadSize() {
return estimatedFullPayloadSize;
}
private int getSize(Schema elementSchema) {
switch (elementSchema.getType()) {
case BOOLEAN:
return 1;
case DOUBLE:
return Double.BYTES;
case FLOAT:
return Float.BYTES;
case INT:
return Integer.BYTES;
case LONG:
return Long.BYTES;
case STRING:
return UUID.randomUUID().toString().length();
case ENUM:
return 1;
case BYTES:
return UUID.randomUUID().toString().length();
case FIXED:
return elementSchema.getFixedSize();
default:
throw new RuntimeException("Unknown type " + elementSchema.getType());
}
}
private int numEntriesToAdd(Schema elementSchema) {
// Find the size of the primitive data type in bytes
int primitiveDataTypeSize = getSize(elementSchema);
int numEntriesToAdd = numberOfBytesToAdd / primitiveDataTypeSize;
// If more than 10 entries are being added for this same complex field and there are still more complex fields to
// be visited in the schema, reduce the number of entries to add by a factor of 10 to allow for other complex
// fields to pack some entries
if (numEntriesToAdd > 10 && this.numberOfComplexFields > 1) {
numEntriesToAdd = numEntriesToAdd / 10;
numberOfBytesToAdd -= numEntriesToAdd * primitiveDataTypeSize;
this.shouldAddMore = true;
} else {
this.numberOfBytesToAdd = 0;
this.shouldAddMore = false;
}
this.numberOfComplexFields -= 1;
return numEntriesToAdd;
}
}
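
A short sketch of the generator API, assuming avroSchemaStr holds an Avro record schema and _row_key is its record key field:

Schema schema = new Schema.Parser().parse(avroSchemaStr);
GenericRecordFullPayloadGenerator generator = new GenericRecordFullPayloadGenerator(schema, 10 * 1024);
GenericRecord payload = generator.getNewPayload(); // every field populated with a random value
GenericRecord update = generator.getUpdatePayload(payload, Collections.singletonList("_row_key")); // re-randomize all non-key fields
assert generator.validate(update);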

View File

@@ -0,0 +1,121 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.generator;
import java.io.Serializable;
import java.util.List;
import java.util.UUID;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.util.collection.Pair;
/**
* This is a GenericRecord payload size estimator that estimates the size of a full generic record {@link GenericRecord}.
* A full record is defined as "every field of the generic record contains exactly one random value".
*/
public class GenericRecordFullPayloadSizeEstimator implements Serializable {
private final transient Schema baseSchema;
// This variable is used to track the number of complex/collection fields with primitive data types at their leaf.
// This is used to figure out how many entries can be packed in such a collection field to meet the min payload
// size requested
private final transient AtomicInteger counter = new AtomicInteger(0);
public GenericRecordFullPayloadSizeEstimator(Schema schema) {
this.baseSchema = schema;
}
public Pair<Integer, Integer> typeEstimateAndNumComplexFields() {
int size = estimate(baseSchema);
return Pair.of(size, counter.get());
}
/**
* This method estimates the size of the payload if all entries of this payload were populated with one value.
* For example, a primitive data type such as String will be populated with a {@link UUID}, so the length is 36 bytes,
* whereas a complex data type such as an array of type int will be populated with exactly one Integer value.
*/
protected int estimate(Schema schema) {
long size = 0;
for (Schema.Field f : schema.getFields()) {
size += typeEstimate(f.schema());
}
return (int) size;
}
private long typeEstimate(Schema schema) {
Schema localSchema = schema;
if (isOption(schema)) {
localSchema = getNonNull(schema);
}
switch (localSchema.getType()) {
case BOOLEAN:
return 1;
case DOUBLE:
return 8;
case FLOAT:
return 4;
case INT:
return 4;
case LONG:
return 8;
case STRING:
return UUID.randomUUID().toString().length();
case ENUM:
return 1;
case RECORD:
return estimate(localSchema);
case ARRAY:
if (GenericRecordFullPayloadGenerator.isPrimitive(localSchema.getElementType())) {
counter.addAndGet(1);
}
Schema elementSchema = localSchema.getElementType();
return typeEstimate(elementSchema);
case MAP:
if (GenericRecordFullPayloadGenerator.isPrimitive(localSchema.getValueType())) {
counter.addAndGet(1);
}
Schema valueSchema = localSchema.getValueType();
return UUID.randomUUID().toString().length() + typeEstimate(valueSchema);
case BYTES:
return UUID.randomUUID().toString().length();
case FIXED:
return localSchema.getFixedSize();
default:
throw new IllegalArgumentException(
"Cannot handle type: " + localSchema.getType());
}
}
protected boolean isOption(Schema schema) {
return schema.getType().equals(Schema.Type.UNION)
&& schema.getTypes().size() == 2
&& (schema.getTypes().get(0).getType().equals(Schema.Type.NULL)
|| schema.getTypes().get(1).getType().equals(Schema.Type.NULL));
}
protected Schema getNonNull(Schema schema) {
List<Schema> types = schema.getTypes();
return types.get(0).getType().equals(Schema.Type.NULL) ? types.get(1) : types.get(0);
}
}
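
A minimal sketch of how the estimator is consumed (this mirrors the constructor of GenericRecordFullPayloadGenerator above); schema is an assumed Avro record schema:

Pair<Integer, Integer> sizeInfo =
    new GenericRecordFullPayloadSizeEstimator(schema).typeEstimateAndNumComplexFields();
int estimatedFullPayloadBytes = sizeInfo.getLeft();  // size of a record with one value per field
int packableComplexFields = sizeInfo.getRight();     // collection fields that can absorb extra entries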

View File

@@ -0,0 +1,82 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.generator;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
/**
* This is a GenericRecord payload generator that generates partial generic records {@link GenericRecord}. A partial
* record is one in which some fields of the schema are NULL or not present. This generator lets us simulate the
* creation of partial records, which occur in many cases, especially in database change logs.
*/
public class GenericRecordPartialPayloadGenerator extends GenericRecordFullPayloadGenerator {
public GenericRecordPartialPayloadGenerator(Schema schema) {
super(schema);
}
public GenericRecordPartialPayloadGenerator(Schema schema, int minPayloadSize) {
super(schema, minPayloadSize);
}
@Override
protected GenericRecord convert(Schema schema) {
GenericRecord record = super.convertPartial(schema);
return record;
}
private void setNull(GenericRecord record) {
for (Schema.Field field : record.getSchema().getFields()) {
// A random boolean decides whether this field of the generic record should be present or absent.
// Using this we can set only a handful of fields in the record and generate partial records
boolean setNull = random.nextBoolean();
if (setNull) { // TODO : DO NOT SET THE RECORD KEY FIELDS TO NULL
record.put(field.name(), null);
} else {
if (record.get(field.name()) instanceof GenericData.Record) {
setNull((GenericData.Record) record.get(field.name()));
}
}
}
}
@Override
public boolean validate(GenericRecord record) {
return validate((Object) record);
}
// At least one entry should be null
private boolean validate(Object object) {
if (object == null) {
return true;
} else if (object instanceof GenericRecord) {
for (Schema.Field field : ((GenericRecord) object).getSchema().getFields()) {
boolean ret = validate(((GenericRecord) object).get(field.name()));
if (ret) {
return ret;
}
}
}
return false;
}
}
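
A brief sketch of the partial generator; roughly half of the fields of each generated record are left null:

GenericRecordPartialPayloadGenerator partialGenerator = new GenericRecordPartialPayloadGenerator(schema);
GenericRecord partial = partialGenerator.getNewPayload(); // some fields null, some populated
assert partialGenerator.validate(partial); // passes only when at least one field is null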

View File

@@ -0,0 +1,48 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.generator;
import java.util.Iterator;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.client.utils.LazyIterableIterator;
/**
* A lazy record generator that produces {@link GenericRecord}s on demand instead of holding a list of records in memory.
*/
public class LazyRecordGeneratorIterator extends
LazyIterableIterator<GenericRecord, GenericRecord> {
public LazyRecordGeneratorIterator(Iterator<GenericRecord> inputItr) {
super(inputItr);
}
@Override
protected void start() {
}
@Override
protected GenericRecord computeNext() {
return inputItr.next();
}
@Override
protected void end() {
}
}

View File

@@ -0,0 +1,56 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.generator;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
public class UpdateGeneratorIterator implements Iterator<GenericRecord> {
// Use the full payload generator as default
private GenericRecordFullPayloadGenerator generator;
private List<String> blackListedFields;
// iterator
private Iterator<GenericRecord> itr;
public UpdateGeneratorIterator(Iterator<GenericRecord> itr, String schemaStr, List<String> partitionPathFieldNames,
List<String> recordKeyFieldNames, int minPayloadSize) {
this.itr = itr;
this.blackListedFields = new ArrayList<>();
this.blackListedFields.addAll(partitionPathFieldNames);
this.blackListedFields.addAll(recordKeyFieldNames);
Schema schema = new Schema.Parser().parse(schemaStr);
this.generator = new GenericRecordFullPayloadGenerator(schema, minPayloadSize);
}
@Override
public boolean hasNext() {
return itr.hasNext();
}
@Override
public GenericRecord next() {
GenericRecord newRecord = itr.next();
return this.generator.randomize(newRecord, this.blackListedFields);
}
}

View File

@@ -0,0 +1,92 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.helpers;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.ImmutablePair;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.utilities.sources.helpers.DFSPathSelector;
/**
* A custom DFS path selector used only for the Hudi test suite. To be used only if the workload is not run inline.
*/
public class DFSTestSuitePathSelector extends DFSPathSelector {
public DFSTestSuitePathSelector(TypedProperties props, Configuration hadoopConf) {
super(props, hadoopConf);
}
@Override
public Pair<Option<String>, String> getNextFilePathsAndMaxModificationTime(
Option<String> lastCheckpointStr, long sourceLimit) {
Integer lastBatchId;
Integer nextBatchId;
try {
if (lastCheckpointStr.isPresent()) {
lastBatchId = Integer.parseInt(lastCheckpointStr.get());
nextBatchId = lastBatchId + 1;
} else {
lastBatchId = -1;
nextBatchId = 0;
}
// obtain all eligible files for the batch
List<FileStatus> eligibleFiles = new ArrayList<>();
FileStatus[] fileStatuses = fs.globStatus(
new Path(props.getString(Config.ROOT_INPUT_PATH_PROP), "*"));
for (FileStatus fileStatus : fileStatuses) {
if (!fileStatus.isDirectory() || IGNORE_FILEPREFIX_LIST.stream()
.anyMatch(pfx -> fileStatus.getPath().getName().startsWith(pfx))) {
continue;
} else if (fileStatus.getPath().getName().compareTo(lastBatchId.toString()) > 0 && fileStatus.getPath()
.getName().compareTo(nextBatchId.toString()) <= 0) {
RemoteIterator<LocatedFileStatus> files = fs.listFiles(fileStatus.getPath(), true);
while (files.hasNext()) {
eligibleFiles.add(files.next());
}
}
}
// no data to read
if (eligibleFiles.size() == 0) {
return new ImmutablePair<>(Option.empty(),
lastCheckpointStr.orElseGet(() -> String.valueOf(Long.MIN_VALUE)));
}
// read the files out.
String pathStr = eligibleFiles.stream().map(f -> f.getPath().toString())
.collect(Collectors.joining(","));
return new ImmutablePair<>(Option.ofNullable(pathStr), String.valueOf(nextBatchId));
} catch (IOException ioe) {
throw new HoodieIOException(
"Unable to read source data from checkpoint: " + lastCheckpointStr, ioe);
}
}
}
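
A hedged sketch of how the checkpoint advances, assuming the batch directories under the input root are named with increasing integer ids (0, 1, 2, ...):

Pair<Option<String>, String> batch =
    selector.getNextFilePathsAndMaxModificationTime(Option.of("3"), Long.MAX_VALUE);
// if batch directory 4 contains files, batch.getRight() is "4" and batch.getLeft() holds their comma-joined paths;
// otherwise the previous checkpoint "3" is returned together with Option.empty()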

View File

@@ -0,0 +1,74 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.helpers;
import java.io.IOException;
import java.net.BindException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hive.service.server.HiveServer2;
import org.apache.hudi.hive.testutils.HiveTestService;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.thrift.transport.TTransportException;
/**
* Hive Service provider.
*/
public class HiveServiceProvider {
private HiveTestService hiveService;
private HiveServer2 hiveServer;
private Config config;
private static final Logger LOG = LogManager.getLogger(HiveServiceProvider.class);
public HiveServiceProvider(Config config) {
this.config = config;
}
public void startLocalHiveServiceIfNeeded(Configuration configuration) throws IOException {
if (config.isHiveLocal()) {
hiveService = new HiveTestService(configuration);
hiveServer = hiveService.start();
}
}
public void syncToLocalHiveIfNeeded(HoodieTestSuiteWriter writer) {
if (this.config.isHiveLocal()) {
writer.getDeltaStreamerWrapper().getDeltaSyncService().getDeltaSync()
.syncHive(getLocalHiveServer().getHiveConf());
} else {
writer.getDeltaStreamerWrapper().getDeltaSyncService().getDeltaSync().syncHive();
}
}
public void stopLocalHiveServiceIfNeeded() throws IOException {
if (config.isHiveLocal()) {
if (hiveService != null) {
hiveService.stop();
}
}
}
public HiveServer2 getLocalHiveServer() {
return hiveServer;
}
}
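A sketch of the intended lifecycle, assuming the surrounding suite supplies a DeltaConfig.Config and a HoodieTestSuiteWriter (both are passed in as parameters here and are not constructed in this snippet):
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.helpers.HiveServiceProvider;

public class HiveSyncLifecycleSketch {
  // Start a local Hive service if configured, sync the written table, then shut the service down.
  static void runHiveSync(Config config, HoodieTestSuiteWriter writer) throws IOException {
    HiveServiceProvider hiveProvider = new HiveServiceProvider(config);
    hiveProvider.startLocalHiveServiceIfNeeded(new Configuration());
    try {
      hiveProvider.syncToLocalHiveIfNeeded(writer);
    } finally {
      hiveProvider.stopLocalHiveServiceIfNeeded();
    }
  }
}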

View File

@@ -0,0 +1,89 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.reader;
import java.io.IOException;
import java.util.Arrays;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.writer.AvroFileDeltaInputWriter;
import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.SparkSession;
/**
* A reader of {@link DeltaOutputMode#DFS} and {@link DeltaInputType#AVRO}.
*/
public class DFSAvroDeltaInputReader extends DFSDeltaInputReader {
private final SparkSession sparkSession;
private final String schemaStr;
private final String basePath;
private final Option<String> structName;
private final Option<String> nameSpace;
protected PathFilter filter = path -> path.toUri().toString().contains(AvroFileDeltaInputWriter.AVRO_EXTENSION);
public DFSAvroDeltaInputReader(SparkSession sparkSession, String schemaStr, String basePath,
Option<String> structName,
Option<String> nameSpace) {
this.sparkSession = sparkSession;
this.schemaStr = schemaStr;
this.basePath = basePath;
this.structName = structName;
this.nameSpace = nameSpace;
}
@Override
public JavaRDD<GenericRecord> read(long totalRecordsToRead) throws IOException {
return SparkBasedReader.readAvro(sparkSession, schemaStr, getFilePathsToRead(basePath, filter, totalRecordsToRead),
structName, nameSpace);
}
@Override
public JavaRDD<GenericRecord> read(int numPartitions, long approxNumRecords) throws IOException {
throw new UnsupportedOperationException("cannot generate updates");
}
@Override
public JavaRDD<GenericRecord> read(int numPartitions, int numFiles, long approxNumRecords) throws IOException {
throw new UnsupportedOperationException("cannot generate updates");
}
@Override
public JavaRDD<GenericRecord> read(int numPartitions, int numFiles, double percentageRecordsPerFile)
throws IOException {
throw new UnsupportedOperationException("cannot generate updates");
}
@Override
protected long analyzeSingleFile(String filePath) {
JavaRDD<GenericRecord> recordsFromOneFile = SparkBasedReader
.readAvro(sparkSession, schemaStr, Arrays.asList(filePath),
structName, nameSpace);
return recordsFromOneFile.count();
}
}
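A usage sketch, assuming a local SparkSession with the spark-avro data source on the classpath; the schema string and base path are placeholders for values the suite would normally provide:
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.reader.DFSAvroDeltaInputReader;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.SparkSession;

public class AvroDeltaReadSketch {
  public static void main(String[] args) throws Exception {
    SparkSession spark = SparkSession.builder().master("local[2]").appName("avro-delta-read").getOrCreate();
    String schemaStr = "...";                  // Avro schema of the generated input (placeholder)
    String basePath = "/tmp/test-suite/input"; // hypothetical directory written by AvroFileDeltaInputWriter
    DFSAvroDeltaInputReader reader = new DFSAvroDeltaInputReader(spark, schemaStr, basePath, Option.empty(), Option.empty());
    JavaRDD<GenericRecord> records = reader.read(1000L); // read roughly 1000 records
    System.out.println("Read " + records.count() + " records");
    spark.stop();
  }
}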

View File

@@ -0,0 +1,98 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.reader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.util.collection.Pair;
/**
* This class helps estimate the number of files that need to be read to fetch a given number of total records.
* Use this class for all DFS based implementations of {@link DeltaInputReader}
*/
public abstract class DFSDeltaInputReader implements DeltaInputReader<GenericRecord> {
protected List<String> getFilePathsToRead(String basePath, PathFilter filter, long totalRecordsToRead) throws
IOException {
FileSystem fs = FSUtils.getFs(basePath, new Configuration());
// TODO : Sort list by file size and take the median file status to ensure fair calculation and change to remote
// iterator
List<FileStatus> fileStatuses = Arrays.asList(fs.globStatus(new Path(basePath, "*/*"), filter));
if (fileStatuses.size() > 0) {
FileStatus status = fileStatuses.get(0);
long avgNumRecordsPerFile = analyzeSingleFile(status.getPath().toString());
long numFilesToMatchExpectedRecords = (long) Math.ceil((double) totalRecordsToRead / (double)
avgNumRecordsPerFile);
long avgSizeOfEachFile = status.getLen();
long totalSizeToRead = avgSizeOfEachFile * numFilesToMatchExpectedRecords;
// choose N files with that length
Pair<Integer, Integer> fileStatusIndexRange = getFileStatusIndexRange(fileStatuses, avgSizeOfEachFile,
totalSizeToRead);
int startIndex = fileStatusIndexRange.getLeft();
List<String> filePaths = new ArrayList<>();
// always pick at least one file, then keep adding files until the computed end index is reached
while (startIndex == 0 || startIndex < fileStatusIndexRange.getRight()) {
filePaths.add(fileStatuses.get(startIndex).getPath().toString());
startIndex++;
}
return filePaths;
}
return Collections.emptyList();
}
protected Pair<Integer, Integer> getFileStatusIndexRange(List<FileStatus> fileStatuses, long averageFileSize, long
totalSizeToRead) {
long totalSizeOfFilesPresent = 0;
int startOffset = 0;
int endOffset = 0;
for (FileStatus fileStatus : fileStatuses) {
// If current file length is greater than averageFileSize, increment by averageFileSize since our
// totalSizeToRead calculation is based on the averageRecordSize * numRecordsToRead.
if (fileStatus.getLen() > averageFileSize) {
totalSizeOfFilesPresent += averageFileSize;
} else {
totalSizeOfFilesPresent += fileStatus.getLen();
}
if (totalSizeOfFilesPresent <= totalSizeToRead) {
endOffset++;
continue;
} else {
return Pair.of(startOffset, endOffset);
}
}
return Pair.of(startOffset, endOffset);
}
/**
* Implementations of {@link DeltaInputReader} can override this to read a single file on DFS and return its
* record count, which is then used as the average number of records per file across N files.
*/
protected long analyzeSingleFile(String filePath) {
throw new UnsupportedOperationException("No implementation found");
}
}
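A small numeric sketch of the estimation above (sample numbers are made up): if one analyzed file holds ~500 records in ~2 MB, reading 5,000 records requires ceil(5000 / 500) = 10 files, or roughly 20 MB in total.
public class FileCountEstimationSketch {
  public static void main(String[] args) {
    long totalRecordsToRead = 5000L;            // requested records (example value)
    long avgNumRecordsPerFile = 500L;           // from analyzeSingleFile() on one sampled file
    long avgSizeOfEachFile = 2L * 1024 * 1024;  // length of the sampled file in bytes
    long numFilesToMatchExpectedRecords =
        (long) Math.ceil((double) totalRecordsToRead / (double) avgNumRecordsPerFile);
    long totalSizeToRead = avgSizeOfEachFile * numFilesToMatchExpectedRecords;
    System.out.println(numFilesToMatchExpectedRecords + " files, ~" + totalSizeToRead + " bytes to read");
  }
}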

View File

@@ -0,0 +1,338 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.reader;
import static java.util.Map.Entry.comparingByValue;
import static java.util.stream.Collectors.toMap;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner;
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
import org.apache.hudi.common.table.view.TableFileSystemView;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ParquetReaderIterator;
import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.config.HoodieMemoryConfig;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;
/**
* This class helps generate updates from an already existing hoodie dataset. It supports generating updates
* across partitions, files and records.
*/
public class DFSHoodieDatasetInputReader extends DFSDeltaInputReader {
private static Logger log = LoggerFactory.getLogger(DFSHoodieDatasetInputReader.class);
private transient JavaSparkContext jsc;
private String schemaStr;
private HoodieTableMetaClient metaClient;
public DFSHoodieDatasetInputReader(JavaSparkContext jsc, String basePath, String schemaStr) {
this.jsc = jsc;
this.schemaStr = schemaStr;
this.metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
}
protected List<String> getPartitions(Option<Integer> partitionsLimit) throws IOException {
List<String> partitionPaths = FSUtils
.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(), false);
// Sort partition so we can pick last N partitions by default
Collections.sort(partitionPaths);
if (!partitionPaths.isEmpty()) {
ValidationUtils.checkArgument(partitionPaths.size() >= partitionsLimit.get(),
"Cannot generate updates for more partitions " + "than present in the dataset, partitions "
+ "requested " + partitionsLimit.get() + ", partitions present " + partitionPaths.size());
return partitionPaths.subList(0, partitionsLimit.get());
}
return partitionPaths;
}
private JavaPairRDD<String, Iterator<FileSlice>> getPartitionToFileSlice(HoodieTableMetaClient metaClient,
List<String> partitionPaths) {
TableFileSystemView.SliceView fileSystemView = new HoodieTableFileSystemView(metaClient,
metaClient.getCommitsAndCompactionTimeline().filterCompletedInstants());
// pass num partitions to another method
JavaPairRDD<String, Iterator<FileSlice>> partitionToFileSliceList = jsc.parallelize(partitionPaths).mapToPair(p -> {
return new Tuple2<>(p, fileSystemView.getLatestFileSlices(p).iterator());
});
return partitionToFileSliceList;
}
@Override
protected long analyzeSingleFile(String filePath) {
return SparkBasedReader.readParquet(new SparkSession(jsc.sc()), Arrays.asList(filePath),
Option.empty(), Option.empty()).count();
}
private JavaRDD<GenericRecord> fetchAnyRecordsFromDataset(Option<Long> numRecordsToUpdate) throws IOException {
return fetchRecordsFromDataset(Option.empty(), Option.empty(), numRecordsToUpdate, Option.empty());
}
private JavaRDD<GenericRecord> fetchAnyRecordsFromDataset(Option<Long> numRecordsToUpdate, Option<Integer>
numPartitions) throws IOException {
return fetchRecordsFromDataset(numPartitions, Option.empty(), numRecordsToUpdate, Option.empty());
}
private JavaRDD<GenericRecord> fetchPercentageRecordsFromDataset(Option<Integer> numPartitions, Option<Integer>
numFiles, Option<Double> percentageRecordsPerFile) throws IOException {
return fetchRecordsFromDataset(numPartitions, numFiles, Option.empty(), percentageRecordsPerFile);
}
private JavaRDD<GenericRecord> fetchRecordsFromDataset(Option<Integer> numPartitions, Option<Integer>
numFiles, Option<Long> numRecordsToUpdate) throws IOException {
return fetchRecordsFromDataset(numPartitions, numFiles, numRecordsToUpdate, Option.empty());
}
private JavaRDD<GenericRecord> fetchRecordsFromDataset(Option<Integer> numPartitions, Option<Integer> numFiles,
Option<Long> numRecordsToUpdate, Option<Double> percentageRecordsPerFile) throws IOException {
log.info("NumPartitions : {}, NumFiles : {}, numRecordsToUpdate : {}, percentageRecordsPerFile : {}",
numPartitions, numFiles, numRecordsToUpdate, percentageRecordsPerFile);
List<String> partitionPaths = getPartitions(numPartitions);
// Read all file slices in the partition
JavaPairRDD<String, Iterator<FileSlice>> partitionToFileSlice = getPartitionToFileSlice(metaClient,
partitionPaths);
// TODO : read record count from metadata
// Read the records in a single file
long recordsInSingleFile = iteratorSize(readParquetOrLogFiles(getSingleSliceFromRDD(partitionToFileSlice)));
int numFilesToUpdate;
long numRecordsToUpdatePerFile;
if (!numFiles.isPresent() || numFiles.get() == 0) {
// If num files are not passed, find the number of files to update based on total records to update and records
// per file
numFilesToUpdate = (int) (numRecordsToUpdate.get() / recordsInSingleFile);
log.info("Files to update {}", numFilesToUpdate);
numRecordsToUpdatePerFile = recordsInSingleFile;
} else {
// If num files is passed, find the number of records per file based on either percentage or total records to
// update and num files passed
numFilesToUpdate = numFiles.get();
numRecordsToUpdatePerFile = percentageRecordsPerFile.isPresent() ? (long) (recordsInSingleFile
* percentageRecordsPerFile.get()) : numRecordsToUpdate.get() / numFilesToUpdate;
}
// Adjust the number of files to read per partition based on the requested partition & file counts
Map<String, Integer> adjustedPartitionToFileIdCountMap = getFilesToReadPerPartition(partitionToFileSlice,
getPartitions(numPartitions).size(), numFilesToUpdate);
JavaRDD<GenericRecord> updates = projectSchema(generateUpdates(adjustedPartitionToFileIdCountMap,
partitionToFileSlice, numFilesToUpdate, (int) numRecordsToUpdatePerFile));
if (numRecordsToUpdate.isPresent() && numFiles.isPresent() && numFiles.get() != 0 && numRecordsToUpdate.get()
!= numRecordsToUpdatePerFile * numFiles.get()) {
long remainingRecordsToAdd = (numRecordsToUpdate.get() - (numRecordsToUpdatePerFile * numFiles.get()));
updates = updates.union(projectSchema(jsc.parallelize(generateUpdates(adjustedPartitionToFileIdCountMap,
partitionToFileSlice, numFilesToUpdate, (int) remainingRecordsToAdd).take((int) remainingRecordsToAdd))));
}
log.info("Finished generating updates");
return updates;
}
private JavaRDD<GenericRecord> projectSchema(JavaRDD<GenericRecord> updates) {
// The records read from the hoodie dataset have the hoodie record fields, rewrite the record to eliminate them
return updates
.map(r -> HoodieAvroUtils.rewriteRecordWithOnlyNewSchemaFields(r, new Schema.Parser().parse(schemaStr)));
}
private JavaRDD<GenericRecord> generateUpdates(Map<String, Integer> adjustedPartitionToFileIdCountMap,
JavaPairRDD<String, Iterator<FileSlice>> partitionToFileSlice, int numFiles, int numRecordsToReadPerFile) {
return partitionToFileSlice.map(p -> {
int maxFilesToRead = adjustedPartitionToFileIdCountMap.get(p._1);
return iteratorLimit(p._2, maxFilesToRead);
}).flatMap(p -> p).repartition(numFiles).map(fileSlice -> {
if (numRecordsToReadPerFile > 0) {
return iteratorLimit(readParquetOrLogFiles(fileSlice), numRecordsToReadPerFile);
} else {
return readParquetOrLogFiles(fileSlice);
}
}).flatMap(p -> p).map(i -> (GenericRecord) i);
}
private Map<String, Integer> getFilesToReadPerPartition(JavaPairRDD<String, Iterator<FileSlice>>
partitionToFileSlice, Integer numPartitions, Integer numFiles) {
int numFilesPerPartition = (int) Math.ceil((double) numFiles / numPartitions);
Map<String, Integer> partitionToFileIdCountMap = partitionToFileSlice
.mapToPair(p -> new Tuple2<>(p._1, iteratorSize(p._2))).collectAsMap();
long totalExistingFilesCount = partitionToFileIdCountMap.values().stream().reduce((a, b) -> a + b).get();
ValidationUtils.checkArgument(totalExistingFilesCount >= numFiles, "Cannot generate updates "
+ "for more files than present in the dataset, file requested " + numFiles + ", files present "
+ totalExistingFilesCount);
Map<String, Integer> partitionToFileIdCountSortedMap = partitionToFileIdCountMap
.entrySet()
.stream()
.sorted(comparingByValue())
.collect(toMap(Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e2,
LinkedHashMap::new));
// Limit files to be read per partition
Map<String, Integer> adjustedPartitionToFileIdCountMap = new HashMap<>();
partitionToFileIdCountSortedMap.entrySet().stream().forEach(e -> {
if (e.getValue() <= numFilesPerPartition) {
adjustedPartitionToFileIdCountMap.put(e.getKey(), e.getValue());
} else {
adjustedPartitionToFileIdCountMap.put(e.getKey(), numFilesPerPartition);
}
});
return adjustedPartitionToFileIdCountMap;
}
private FileSlice getSingleSliceFromRDD(JavaPairRDD<String, Iterator<FileSlice>> partitionToFileSlice) {
return partitionToFileSlice.map(f -> {
FileSlice slice = f._2.next();
FileSlice newSlice = new FileSlice(slice.getFileGroupId(), slice.getBaseInstantTime());
if (slice.getBaseFile().isPresent()) {
newSlice.setBaseFile(slice.getBaseFile().get());
} else {
slice.getLogFiles().forEach(l -> {
newSlice.addLogFile(l);
});
}
return newSlice;
}).take(1).get(0);
}
private Iterator<IndexedRecord> readParquetOrLogFiles(FileSlice fileSlice) throws IOException {
if (fileSlice.getBaseFile().isPresent()) {
Iterator<IndexedRecord> itr =
new ParquetReaderIterator<IndexedRecord>(AvroParquetReader.<IndexedRecord>builder(new
Path(fileSlice.getBaseFile().get().getPath())).withConf(metaClient.getHadoopConf()).build());
return itr;
} else {
// If there is no data file, fall back to reading log files
HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(metaClient.getFs(),
metaClient.getBasePath(),
fileSlice.getLogFiles().map(l -> l.getPath().getName()).collect(Collectors.toList()),
new Schema.Parser().parse(schemaStr), metaClient.getActiveTimeline().getCommitsTimeline()
.filterCompletedInstants().lastInstant().get().getTimestamp(),
HoodieMemoryConfig.DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES, true, false,
HoodieMemoryConfig.DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE,
HoodieMemoryConfig.DEFAULT_SPILLABLE_MAP_BASE_PATH);
// read the log files
Iterable<HoodieRecord<? extends HoodieRecordPayload>> iterable = () -> scanner.iterator();
Schema schema = new Schema.Parser().parse(schemaStr);
return StreamSupport.stream(iterable.spliterator(), false)
.map(e -> {
try {
return (IndexedRecord) e.getData().getInsertValue(schema).get();
} catch (IOException io) {
throw new UncheckedIOException(io);
}
}).iterator();
}
}
/**
* Returns the number of elements remaining in {@code iterator}. The iterator
* will be left exhausted: its {@code hasNext()} method will return
* {@code false}.
*/
private static int iteratorSize(Iterator<?> iterator) {
int count = 0;
while (iterator.hasNext()) {
iterator.next();
count++;
}
return count;
}
/**
* Creates an iterator returning the first {@code limitSize} elements of the
* given iterator. If the original iterator does not contain that many
* elements, the returned iterator will have the same behavior as the original
* iterator. The returned iterator supports {@code remove()} if the original
* iterator does.
*
* @param iterator the iterator to limit
* @param limitSize the maximum number of elements in the returned iterator
* @throws IllegalArgumentException if {@code limitSize} is negative
*/
private static <T> Iterator<T> iteratorLimit(
final Iterator<T> iterator, final int limitSize) {
ValidationUtils.checkArgument(iterator != null, "iterator is null");
ValidationUtils.checkArgument(limitSize >= 0, "limit is negative");
return new Iterator<T>() {
private int count;
@Override
public boolean hasNext() {
return count < limitSize && iterator.hasNext();
}
@Override
public T next() {
if (!hasNext()) {
throw new NoSuchElementException();
}
count++;
return iterator.next();
}
@Override
public void remove() {
iterator.remove();
}
};
}
@Override
public JavaRDD<GenericRecord> read(long numRecords) throws IOException {
return fetchAnyRecordsFromDataset(Option.of(numRecords));
}
@Override
public JavaRDD<GenericRecord> read(int numPartitions, long approxNumRecords) throws IOException {
return fetchAnyRecordsFromDataset(Option.of(approxNumRecords), Option.of(numPartitions));
}
@Override
public JavaRDD<GenericRecord> read(int numPartitions, int numFiles, long numRecords) throws IOException {
return fetchRecordsFromDataset(Option.of(numPartitions), Option.of(numFiles), Option.of(numRecords));
}
@Override
public JavaRDD<GenericRecord> read(int numPartitions, int numFiles, double percentageRecordsPerFile)
throws IOException {
return fetchPercentageRecordsFromDataset(Option.of(numPartitions), Option.of(numFiles),
Option.of(percentageRecordsPerFile));
}
}
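A usage sketch, assuming a running Spark context and an existing Hudi dataset; the base path and schema string are placeholders:
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.integ.testsuite.reader.DFSHoodieDatasetInputReader;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class DatasetUpdateGenerationSketch {
  public static void main(String[] args) throws Exception {
    JavaSparkContext jsc = new JavaSparkContext("local[2]", "dataset-input-reader-sketch");
    String basePath = "/tmp/hudi/dataset"; // hypothetical path of an existing Hudi dataset
    String schemaStr = "...";              // Avro schema of that dataset (placeholder)
    DFSHoodieDatasetInputReader reader = new DFSHoodieDatasetInputReader(jsc, basePath, schemaStr);
    // Generate updates for ~1000 existing records spread over 2 partitions and 4 files.
    JavaRDD<GenericRecord> updates = reader.read(2, 4, 1000L);
    System.out.println("Generated " + updates.count() + " update records");
    jsc.stop();
  }
}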

View File

@@ -0,0 +1,62 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.reader;
import java.io.IOException;
import java.io.Serializable;
import org.apache.spark.api.java.JavaRDD;
/**
* Implementations of {@link DeltaInputReader} will read the configured input type and provide an RDD of records to the
* client.
*
* @param <O> Read result data type
*/
public interface DeltaInputReader<O> extends Serializable {
/**
* Attempts to read an approximate number of records close to approxNumRecords.
* The actual count depends heavily on the number of records already present in the input.
*/
JavaRDD<O> read(long approxNumRecords) throws IOException;
/**
* Attempts to read an approximate number of records (exact if equal or more records are available)
* across the requested number of partitions.
*/
JavaRDD<O> read(int numPartitions, long approxNumRecords) throws IOException;
/**
* Attempts to read an approximate number of records (exact if equal or more records are available)
* across the requested number of partitions and number of files.
* 1. Find numFiles across numPartitions
* 2. numRecordsToReadPerFile = approxNumRecords / numFiles
*/
JavaRDD<O> read(int numPartitions, int numFiles, long approxNumRecords) throws IOException;
/**
* Attempts to read a percentage of records per file across the requested number of partitions and number of files.
* 1. Find numFiles across numPartitions
* 2. numRecordsToReadPerFile = approxNumRecordsPerFile * percentageRecordsPerFile
*/
JavaRDD<O> read(int numPartitions, int numFiles, double percentageRecordsPerFile) throws IOException;
}
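A quick illustration of the per-file math described in the contract above (the numbers are arbitrary): read(2, 4, 1000) targets 1000 / 4 = 250 records from each of 4 files spread over 2 partitions, while read(2, 4, 0.5) targets half the records of each chosen file.
public class ReadOverloadSketch {
  public static void main(String[] args) {
    int numFiles = 4;                       // example values only
    long approxNumRecords = 1000L;
    double percentageRecordsPerFile = 0.5;
    long numRecordsToReadPerFile = approxNumRecords / numFiles; // 250, per the contract above
    System.out.println("Per-file target for the record-count overload: " + numRecordsToReadPerFile);
    System.out.println("Per-file fraction for the percentage overload: " + percentageRecordsPerFile);
  }
}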

View File

@@ -0,0 +1,26 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.reader;
/**
* Supported delta input data types.
*/
public enum DeltaInputType {
AVRO, PARQUET
}

View File

@@ -0,0 +1,70 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.reader;
import java.util.List;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.AvroConversionUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.utilities.schema.RowBasedSchemaProvider;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import scala.collection.JavaConverters;
/**
* Helper class to read avro and/or parquet files and generate an RDD of {@link GenericRecord}.
*/
public class SparkBasedReader {
public static final String SPARK_AVRO_FORMAT = "avro";
public static final String SPARK_PARQUET_FORMAT = "com.databricks.spark.parquet";
private static final String AVRO_SCHEMA_OPTION_KEY = "avroSchema";
// Spark anyways globs the path and gets all the paths in memory so take the List<filePaths> as an argument.
// https://github.com/apache/spark/.../org/apache/spark/sql/execution/datasources/DataSource.scala#L251
public static JavaRDD<GenericRecord> readAvro(SparkSession sparkSession, String schemaStr, List<String> listOfPaths,
Option<String> structName, Option<String> nameSpace) {
Dataset<Row> dataSet = sparkSession.read()
.format(SPARK_AVRO_FORMAT)
.option(AVRO_SCHEMA_OPTION_KEY, schemaStr)
.load(JavaConverters.asScalaIteratorConverter(listOfPaths.iterator()).asScala().toSeq());
return AvroConversionUtils
.createRdd(dataSet.toDF(), structName.orElse(RowBasedSchemaProvider.HOODIE_RECORD_STRUCT_NAME),
nameSpace.orElse(RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE))
.toJavaRDD();
}
public static JavaRDD<GenericRecord> readParquet(SparkSession sparkSession, List<String>
listOfPaths, Option<String> structName, Option<String> nameSpace) {
Dataset<Row> dataSet = sparkSession.read()
.parquet((JavaConverters.asScalaIteratorConverter(listOfPaths.iterator()).asScala().toSeq()));
return AvroConversionUtils
.createRdd(dataSet.toDF(), structName.orElse(RowBasedSchemaProvider.HOODIE_RECORD_STRUCT_NAME),
nameSpace.orElse(RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE))
.toJavaRDD();
}
}
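A usage sketch, assuming a local SparkSession with the spark-avro data source available; the schema string and input path are placeholders:
import java.util.Arrays;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.reader.SparkBasedReader;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.SparkSession;

public class SparkBasedReaderSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().master("local[2]").appName("spark-based-reader").getOrCreate();
    String schemaStr = "..."; // Avro schema of the input files (placeholder)
    JavaRDD<GenericRecord> avroRecords = SparkBasedReader.readAvro(
        spark, schemaStr, Arrays.asList("/tmp/test-suite/input/0"), Option.empty(), Option.empty());
    System.out.println("Avro records read: " + avroRecords.count());
    spark.stop();
  }
}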

View File

@@ -0,0 +1,117 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.writer;
import java.io.IOException;
import java.io.OutputStream;
import java.util.UUID;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.avro.io.DatumWriter;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.fs.HoodieWrapperFileSystem;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Implementation of {@link DeltaInputWriter} that writes avro records to the result file.
*/
public class AvroFileDeltaInputWriter implements DeltaInputWriter<GenericRecord> {
public static final String AVRO_EXTENSION = ".avro";
private static Logger log = LoggerFactory.getLogger(AvroFileDeltaInputWriter.class);
// The maximum file size for an avro file before being rolled over to a new one
private final Long maxFileSize;
private final Configuration configuration;
private HoodieWrapperFileSystem fs;
// Path of the actual avro file
private Path file;
// Base input path to write avro files under
// TODO : Make this bucketed so don't have a large number of files in a single directory
private String basePath;
private DatumWriter<IndexedRecord> writer;
private DataFileWriter<IndexedRecord> dataFileWriter;
private OutputStream output;
private Schema schema;
private DeltaWriteStats deltaWriteStats;
private long recordsWritten = 0;
// TODO : Handle failure case which may leave behind tons of small corrupt files
public AvroFileDeltaInputWriter(Configuration configuration, String basePath, String schemaStr, Long maxFileSize)
throws IOException {
this.schema = new Schema.Parser().parse(schemaStr);
this.maxFileSize = maxFileSize;
this.configuration = configuration;
this.basePath = basePath;
Path path = new Path(basePath, new Path(UUID.randomUUID().toString() + AVRO_EXTENSION));
this.file = HoodieWrapperFileSystem.convertToHoodiePath(path, configuration);
this.fs = (HoodieWrapperFileSystem) this.file
.getFileSystem(FSUtils.registerFileSystem(path, configuration));
this.output = this.fs.create(this.file);
this.writer = new GenericDatumWriter<>(schema);
this.dataFileWriter = new DataFileWriter<>(writer).create(schema, output);
this.deltaWriteStats = new DeltaWriteStats();
}
@Override
public void writeData(GenericRecord iData) throws IOException {
this.dataFileWriter.append(iData);
recordsWritten++;
}
@Override
public boolean canWrite() {
return fs.getBytesWritten(file) < maxFileSize;
}
@Override
public void close() throws IOException {
this.deltaWriteStats.setBytesWritten(this.fs.getBytesWritten(this.file));
this.deltaWriteStats.setRecordsWritten(this.recordsWritten);
this.deltaWriteStats.setFilePath(this.file.toUri().getPath());
this.dataFileWriter.close();
log.info("New Avro File : {}", getPath());
}
@Override
public DeltaInputWriter getNewWriter() throws IOException {
AvroFileDeltaInputWriter avroFileDeltaInputWriter = new AvroFileDeltaInputWriter(this.configuration, this.basePath, this
.schema.toString(), this.maxFileSize);
return avroFileDeltaInputWriter;
}
public FileSystem getFs() {
return fs;
}
public Path getPath() {
return this.file;
}
@Override
public DeltaWriteStats getDeltaWriteStats() {
return this.deltaWriteStats;
}
}
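A sketch of direct usage, mirroring the tests further below; the schema string and base path are placeholders, and records would normally come from the suite's payload generator:
import java.io.IOException;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.integ.testsuite.writer.AvroFileDeltaInputWriter;
import org.apache.hudi.integ.testsuite.writer.DeltaInputWriter;
import org.apache.hudi.integ.testsuite.writer.DeltaWriteStats;

public class AvroDeltaWriterSketch {
  // Write records into a single avro file until the size limit is hit, then return its stats.
  static DeltaWriteStats writeBatch(Iterable<GenericRecord> records) throws IOException {
    DeltaInputWriter<GenericRecord> writer = new AvroFileDeltaInputWriter(
        new Configuration(), "/tmp/test-suite/input", "..." /* avro schema placeholder */, 1024 * 1024L);
    for (GenericRecord record : records) {
      if (!writer.canWrite()) {
        break; // a real caller would roll over via getNewWriter(), as DFSDeltaWriterAdapter does
      }
      writer.writeData(record);
    }
    writer.close();
    return writer.getDeltaWriteStats();
  }
}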

View File

@@ -0,0 +1,62 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.writer;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.avro.generic.GenericRecord;
/**
* {@link org.apache.hadoop.hdfs.DistributedFileSystem} (or {@link org.apache.hadoop.fs.LocalFileSystem}) based delta
* generator.
*/
public class DFSDeltaWriterAdapter implements DeltaWriterAdapter<GenericRecord> {
private DeltaInputWriter deltaInputGenerator;
private List<DeltaWriteStats> metrics = new ArrayList<>();
public DFSDeltaWriterAdapter(DeltaInputWriter<GenericRecord> deltaInputGenerator) {
this.deltaInputGenerator = deltaInputGenerator;
}
@Override
public List<DeltaWriteStats> write(Iterator<GenericRecord> input) throws IOException {
while (input.hasNext()) {
if (this.deltaInputGenerator.canWrite()) {
this.deltaInputGenerator.writeData(input.next());
} else if (input.hasNext()) {
rollOver();
}
}
close();
return this.metrics;
}
public void rollOver() throws IOException {
close();
this.deltaInputGenerator = this.deltaInputGenerator.getNewWriter();
}
private void close() throws IOException {
this.deltaInputGenerator.close();
this.metrics.add(this.deltaInputGenerator.getDeltaWriteStats());
}
}

View File

@@ -0,0 +1,42 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.writer;
import java.io.IOException;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.integ.testsuite.writer.DeltaWriterAdapter.SparkBasedDeltaWriter;
import org.apache.spark.api.java.JavaRDD;
/**
* TODO : Implement a custom Spark partitioner to ensure we write large enough avro files.
*/
public class DFSSparkAvroDeltaWriter implements SparkBasedDeltaWriter<JavaRDD<GenericRecord>> {
private DeltaInputWriter<JavaRDD<GenericRecord>> deltaInputWriter;
public DFSSparkAvroDeltaWriter(DeltaInputWriter<JavaRDD<GenericRecord>> deltaInputWriter) {
this.deltaInputWriter = deltaInputWriter;
}
@Override
public JavaRDD<DeltaWriteStats> write(JavaRDD<GenericRecord> input) throws IOException {
this.deltaInputWriter.writeData(input);
return null;
}
}

View File

@@ -0,0 +1,53 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.writer;
import java.io.Closeable;
import java.io.IOException;
/**
* Implementations of {@link DeltaInputWriter} write generated data of the given type to a target output.
*
* @param <I> Type of the data to be written.
*/
public interface DeltaInputWriter<I> extends Closeable {
/**
* Write a single generated data record.
*/
void writeData(I iData) throws IOException;
/**
* Check whether more data can/should be written.
*/
boolean canWrite();
/**
* Return the statistics of data written.
*/
DeltaWriteStats getDeltaWriteStats();
/**
* Return a new instance of this writer, rolling over to a fresh output target.
*/
DeltaInputWriter getNewWriter() throws IOException;
}

View File

@@ -0,0 +1,26 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.writer;
/**
* Supported output destination types for the generated delta workload.
*/
public enum DeltaOutputMode {
KAFKA, DFS
}

View File

@@ -0,0 +1,72 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.writer;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import org.apache.hudi.common.util.collection.Pair;
/**
* This class holds the write statistics for {@link DeltaInputWriter}.
*/
public class DeltaWriteStats implements Serializable {
// The file path (if any) for the data written
private String filePath;
// Number of bytes written before being closed
private long bytesWritten;
// Number of records written before being closed
private long recordsWritten;
private List<Pair<String, String>> partitionPathRecordKey = new ArrayList<>();
public String getFilePath() {
return filePath;
}
public void setFilePath(String filePath) {
this.filePath = filePath;
}
public long getBytesWritten() {
return bytesWritten;
}
public void setBytesWritten(long bytesWritten) {
this.bytesWritten = bytesWritten;
}
public List<Pair<String, String>> getPartitionPathRecordKey() {
return partitionPathRecordKey;
}
public void setPartitionPathRecordKey(List<Pair<String, String>> partitionPathRecordKey) {
this.partitionPathRecordKey = partitionPathRecordKey;
}
public long getRecordsWritten() {
return recordsWritten;
}
public void setRecordsWritten(long recordsWritten) {
this.recordsWritten = recordsWritten;
}
}

View File

@@ -0,0 +1,34 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.writer;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import org.apache.spark.api.java.JavaRDD;
public interface DeltaWriterAdapter<I> {
List<DeltaWriteStats> write(Iterator<I> input) throws IOException;
interface SparkBasedDeltaWriter<J> {
JavaRDD<DeltaWriteStats> write(J input) throws IOException;
}
}

View File

@@ -0,0 +1,57 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.writer;
import java.io.IOException;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.integ.testsuite.configuration.DFSDeltaConfig;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig;
import org.apache.hudi.integ.testsuite.reader.DeltaInputType;
/**
* A factory to help instantiate different {@link DeltaWriterAdapter}s depending on the {@link DeltaOutputMode} and
* {@link DeltaInputType}.
*/
public class DeltaWriterFactory {
private DeltaWriterFactory() {
}
public static DeltaWriterAdapter getDeltaWriterAdapter(DeltaConfig config, Integer batchId) throws IOException {
switch (config.getDeltaOutputMode()) {
case DFS:
switch (config.getDeltaInputType()) {
case AVRO:
DFSDeltaConfig dfsDeltaConfig = (DFSDeltaConfig) config;
dfsDeltaConfig.setBatchId(batchId);
DeltaInputWriter<GenericRecord> fileDeltaInputGenerator = new AvroFileDeltaInputWriter(
dfsDeltaConfig.getConfiguration(),
StringUtils
.join(new String[]{dfsDeltaConfig.getDeltaBasePath(), dfsDeltaConfig.getBatchId().toString()},
"/"), dfsDeltaConfig.getSchemaStr(), dfsDeltaConfig.getMaxFileSize());
return new DFSDeltaWriterAdapter(fileDeltaInputGenerator);
default:
throw new IllegalArgumentException("Invalid delta input format " + config.getDeltaInputType());
}
default:
throw new IllegalArgumentException("Invalid delta input type " + config.getDeltaOutputMode());
}
}
}
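A sketch of obtaining a writer adapter through the factory, modeled on the functional test further below; the path, schema string and max file size are placeholders:
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.integ.testsuite.configuration.DFSDeltaConfig;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig;
import org.apache.hudi.integ.testsuite.reader.DeltaInputType;
import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode;
import org.apache.hudi.integ.testsuite.writer.DeltaWriterAdapter;
import org.apache.hudi.integ.testsuite.writer.DeltaWriterFactory;

public class DeltaWriterFactorySketch {
  public static void main(String[] args) throws Exception {
    String basePath = "/tmp/test-suite/input"; // hypothetical DFS input directory
    DeltaConfig dfsSinkConfig = new DFSDeltaConfig(DeltaOutputMode.DFS, DeltaInputType.AVRO,
        new SerializableConfiguration(new Configuration()), basePath, basePath,
        "..." /* source avro schema placeholder */, 10240L /* max file size in bytes */);
    // Batch id 1: generated records will land under <basePath>/1/*.avro
    DeltaWriterAdapter<GenericRecord> adapter = DeltaWriterFactory.getDeltaWriterAdapter(dfsSinkConfig, 1);
    System.out.println("Created adapter: " + adapter.getClass().getSimpleName());
  }
}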

View File

@@ -0,0 +1,69 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.writer;
import java.io.IOException;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.AvroConversionUtils;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.SparkSession;
/**
* Spark based avro delta input writer. We don't use this yet since we cannot control result file size.
*/
public class SparkAvroDeltaInputWriter implements DeltaInputWriter<JavaRDD<GenericRecord>> {
private static final String AVRO_FORMAT_PACKAGE = "avro";
public SparkSession sparkSession;
private String schemaStr;
// TODO : the base path has to be a new path every time for spark avro
private String basePath;
public SparkAvroDeltaInputWriter(SparkSession sparkSession, String schemaStr, String basePath) {
this.sparkSession = sparkSession;
this.schemaStr = schemaStr;
this.basePath = basePath;
}
@Override
public void writeData(JavaRDD<GenericRecord> iData) throws IOException {
AvroConversionUtils.createDataFrame(iData.rdd(), schemaStr, sparkSession).write()
.format(AVRO_FORMAT_PACKAGE).save(basePath);
}
@Override
public boolean canWrite() {
throw new UnsupportedOperationException("not applicable for spark based writer");
}
@Override
public void close() throws IOException {
}
@Override
public DeltaWriteStats getDeltaWriteStats() {
throw new UnsupportedOperationException("not applicable for spark based writer");
}
@Override
public DeltaInputWriter getNewWriter() throws IOException {
throw new UnsupportedOperationException("not applicable for spark based writer");
}
}

View File

@@ -0,0 +1,138 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.when;
import java.io.IOException;
import java.util.Iterator;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.integ.testsuite.configuration.DFSDeltaConfig;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig;
import org.apache.hudi.integ.testsuite.generator.FlexibleSchemaRecordGenerationIterator;
import org.apache.hudi.integ.testsuite.reader.DeltaInputType;
import org.apache.hudi.integ.testsuite.utils.TestUtils;
import org.apache.hudi.integ.testsuite.writer.AvroFileDeltaInputWriter;
import org.apache.hudi.integ.testsuite.writer.DFSDeltaWriterAdapter;
import org.apache.hudi.integ.testsuite.writer.DeltaInputWriter;
import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode;
import org.apache.hudi.integ.testsuite.writer.DeltaWriteStats;
import org.apache.hudi.integ.testsuite.writer.DeltaWriterAdapter;
import org.apache.hudi.integ.testsuite.writer.DeltaWriterFactory;
import org.apache.hudi.utilities.schema.FilebasedSchemaProvider;
import org.apache.hudi.utilities.testutils.UtilitiesTestBase;
import org.apache.spark.api.java.JavaRDD;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;
public class TestDFSHoodieTestSuiteWriterAdapter extends UtilitiesTestBase {
private FilebasedSchemaProvider schemaProvider;
private static final String COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/";
@BeforeAll
public static void initClass() throws Exception {
UtilitiesTestBase.initClass();
}
@AfterAll
public static void cleanupClass() {
UtilitiesTestBase.cleanupClass();
}
@BeforeEach
public void setup() throws Exception {
super.setup();
schemaProvider = new FilebasedSchemaProvider(Helpers.setupSchemaOnDFSWithAbsoluteScope(
System.getProperty("user.dir") + "/.." + COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH,
"complex-source.avsc"), jsc);
}
@AfterEach
public void teardown() throws Exception {
super.teardown();
}
@Test
public void testDFSOneFileWrite() throws IOException {
DeltaInputWriter<GenericRecord> mockFileSinkWriter = Mockito.mock(AvroFileDeltaInputWriter.class);
DeltaWriteStats mockDeltaWriteStats = Mockito.mock(DeltaWriteStats.class);
when(mockFileSinkWriter.getNewWriter()).thenReturn(mockFileSinkWriter);
when(mockFileSinkWriter.canWrite()).thenReturn(true);
when(mockFileSinkWriter.getDeltaWriteStats()).thenReturn(mockDeltaWriteStats);
DeltaWriterAdapter<GenericRecord> dfsDeltaWriterAdapter = new DFSDeltaWriterAdapter(mockFileSinkWriter);
JavaRDD<GenericRecord> records = TestUtils.makeRDD(jsc, 10);
dfsDeltaWriterAdapter.write(records.collect().iterator());
Mockito.verify(mockFileSinkWriter, times(10)).canWrite();
Mockito.verify(mockFileSinkWriter, times(1)).close();
}
@Test
public void testDFSTwoFilesWriteWithRollover() throws IOException {
DeltaInputWriter<GenericRecord> mockFileSinkWriter = Mockito.mock(AvroFileDeltaInputWriter.class);
DeltaWriteStats mockDeltaWriteStats = Mockito.mock(DeltaWriteStats.class);
when(mockFileSinkWriter.getNewWriter()).thenReturn(mockFileSinkWriter);
when(mockFileSinkWriter.canWrite()).thenReturn(false, true);
when(mockFileSinkWriter.getDeltaWriteStats()).thenReturn(mockDeltaWriteStats);
DeltaWriterAdapter<GenericRecord> dfsDeltaWriterAdapter = new DFSDeltaWriterAdapter(mockFileSinkWriter);
Iterator<GenericRecord> mockIterator = Mockito.mock(Iterator.class);
when(mockIterator.hasNext()).thenReturn(true, true, true, false);
dfsDeltaWriterAdapter.write(mockIterator);
Mockito.verify(mockFileSinkWriter, times(2)).canWrite();
Mockito.verify(mockFileSinkWriter, times(1)).getNewWriter();
Mockito.verify(mockFileSinkWriter, times(2)).close();
}
@Test
public void testDFSWorkloadSinkWithMultipleFilesFunctional() throws IOException {
DeltaConfig dfsSinkConfig = new DFSDeltaConfig(DeltaOutputMode.DFS, DeltaInputType.AVRO,
new SerializableConfiguration(jsc.hadoopConfiguration()), dfsBasePath, dfsBasePath,
schemaProvider.getSourceSchema().toString(), 10240L);
DeltaWriterAdapter<GenericRecord> dfsDeltaWriterAdapter = DeltaWriterFactory
.getDeltaWriterAdapter(dfsSinkConfig, 1);
FlexibleSchemaRecordGenerationIterator itr = new FlexibleSchemaRecordGenerationIterator(1000,
schemaProvider.getSourceSchema().toString());
dfsDeltaWriterAdapter.write(itr);
FileSystem fs = FSUtils.getFs(dfsBasePath, jsc.hadoopConfiguration());
FileStatus[] fileStatuses = fs.listStatus(new Path(dfsBasePath));
// Since maxFileSize was 10240L and we produced 1000 records each close to 1K in size, at least one file
// (and typically more than one) should be produced
assertTrue(fileStatuses.length > 0);
}
}

View File

@@ -0,0 +1,147 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.reader.SparkBasedReader;
import org.apache.hudi.integ.testsuite.writer.AvroFileDeltaInputWriter;
import org.apache.hudi.integ.testsuite.writer.DeltaInputWriter;
import org.apache.hudi.integ.testsuite.writer.DeltaWriteStats;
import org.apache.hudi.integ.testsuite.generator.GenericRecordFullPayloadGenerator;
import org.apache.hudi.utilities.schema.FilebasedSchemaProvider;
import org.apache.hudi.utilities.testutils.UtilitiesTestBase;
import org.apache.spark.api.java.JavaRDD;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
public class TestFileDeltaInputWriter extends UtilitiesTestBase {
private FilebasedSchemaProvider schemaProvider;
private static final String COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/";
@BeforeAll
public static void initClass() throws Exception {
UtilitiesTestBase.initClass();
}
@AfterAll
public static void cleanupClass() {
UtilitiesTestBase.cleanupClass();
}
@BeforeEach
public void setup() throws Exception {
super.setup();
schemaProvider = new FilebasedSchemaProvider(Helpers.setupSchemaOnDFSWithAbsoluteScope(System.getProperty("user.dir") + "/.."
+ COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH, "complex-source.avsc"), jsc);
}
@AfterEach
public void teardown() throws Exception {
super.teardown();
}
@Test
public void testAvroFileSinkWriter() throws IOException {
// 1. Create a Avro File Sink Writer
DeltaInputWriter<GenericRecord> fileSinkWriter =
new AvroFileDeltaInputWriter(jsc.hadoopConfiguration(), dfsBasePath + "/input", schemaProvider.getSourceSchema()
.toString(), 1024 * 1024L);
GenericRecordFullPayloadGenerator payloadGenerator =
new GenericRecordFullPayloadGenerator(schemaProvider.getSourceSchema());
// 2. Generate 100 avro payloads and write them to an avro file
IntStream.range(0, 100).forEach(a -> {
try {
fileSinkWriter.writeData(payloadGenerator.getNewPayload());
} catch (IOException io) {
throw new UncheckedIOException(io);
}
});
fileSinkWriter.close();
DeltaWriteStats deltaWriteStats = fileSinkWriter.getDeltaWriteStats();
FileSystem fs = FSUtils.getFs(dfsBasePath, jsc.hadoopConfiguration());
FileStatus[] fileStatuses = fs.listStatus(new Path(deltaWriteStats.getFilePath()));
// At least 1 file was written
assertEquals(1, fileStatuses.length);
// File length should be greater than 0
assertTrue(fileStatuses[0].getLen() > 0);
// The number of bytes written should be greater than 0
assertTrue(deltaWriteStats.getBytesWritten() > 0);
List<String> paths = Arrays.asList(fs.globStatus(new Path(dfsBasePath + "/*/*.avro")))
.stream().map(f -> f.getPath().toString()).collect(Collectors.toList());
JavaRDD<GenericRecord> writtenRecords =
SparkBasedReader.readAvro(sparkSession, schemaProvider.getSourceSchema().toString(), paths, Option.empty(),
Option.empty());
// Number of records written should be 100
assertEquals(writtenRecords.count(), 100);
// Number of records in file should match with the stats
assertEquals(writtenRecords.count(), deltaWriteStats.getRecordsWritten());
}
@Test
public void testAvroFileSinkCreateNewWriter() throws IOException {
// 1. Create an Avro file sink writer
DeltaInputWriter<GenericRecord> fileSinkWriter =
new AvroFileDeltaInputWriter(jsc.hadoopConfiguration(), dfsBasePath,
schemaProvider.getSourceSchema().toString(),
1024 * 1024L);
GenericRecordFullPayloadGenerator payloadGenerator =
new GenericRecordFullPayloadGenerator(schemaProvider.getSourceSchema());
// 2. Generate 100 avro payloads and write them to an avro file
IntStream.range(0, 100).forEach(a -> {
try {
fileSinkWriter.writeData(payloadGenerator.getNewPayload());
} catch (IOException io) {
throw new UncheckedIOException(io);
}
});
fileSinkWriter.close();
String oldFilePath = fileSinkWriter.getDeltaWriteStats().getFilePath();
assertFalse(oldFilePath == null);
DeltaInputWriter<GenericRecord> newFileSinkWriter = fileSinkWriter.getNewWriter();
newFileSinkWriter.close();
DeltaWriteStats newStats = newFileSinkWriter.getDeltaWriteStats();
assertEquals(newStats.getBytesWritten(), 3674);
assertEquals(newStats.getRecordsWritten(), 0);
assertTrue(newStats.getFilePath() != null);
}
}

View File

@@ -0,0 +1,80 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.configuration;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.ArrayList;
import java.util.List;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode;
import org.apache.hudi.integ.testsuite.dag.nodes.UpsertNode;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.WorkflowDag;
import org.junit.jupiter.api.Test;
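/**
 * Unit test that builds a {@link WorkflowDag} of insert and upsert nodes by hand and verifies the resulting node
 * structure and configs.
 */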
public class TestWorkflowBuilder {
@Test
public void testWorkloadOperationSequenceBuilder() {
DagNode root = new InsertNode(DeltaConfig.Config.newBuilder()
.withNumRecordsToInsert(10000)
.withNumInsertPartitions(1)
.withNumTimesToRepeat(2)
.withRecordSize(1000).build());
DagNode child1 = new UpsertNode(DeltaConfig.Config.newBuilder()
.withNumRecordsToUpdate(10000)
.withNumInsertPartitions(1)
.withNumTimesToRepeat(2)
.withRecordSize(1000).build());
root.addChildNode(child1);
child1.addParentNode(root);
List<DagNode> rootNodes = new ArrayList<>();
rootNodes.add(root);
WorkflowDag workflowDag = new WorkflowDag(rootNodes);
assertEquals(workflowDag.getNodeList().size(), 1);
assertEquals(((DagNode) workflowDag.getNodeList().get(0)).getChildNodes().size(), 1);
DagNode dagNode = (DagNode) workflowDag.getNodeList().get(0);
assertTrue(dagNode instanceof InsertNode);
DeltaConfig.Config config = dagNode.getConfig();
assertEquals(config.getNumInsertPartitions(), 1);
assertEquals(config.getRecordSize(), 1000);
assertEquals(config.getRepeatCount(), 2);
assertEquals(config.getNumRecordsInsert(), 10000);
assertEquals(config.getNumRecordsUpsert(), 0);
dagNode = (DagNode) ((DagNode) workflowDag.getNodeList().get(0)).getChildNodes().get(0);
assertTrue(dagNode instanceof UpsertNode);
config = dagNode.getConfig();
assertEquals(config.getNumInsertPartitions(), 1);
assertEquals(config.getRecordSize(), 1000);
assertEquals(config.getRepeatCount(), 2);
assertEquals(config.getNumRecordsInsert(), 0);
assertEquals(config.getNumRecordsUpsert(), 10000);
}
}

View File

@@ -0,0 +1,93 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.converter;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import org.apache.avro.Schema.Field;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.integ.testsuite.utils.TestUtils;
import org.apache.hudi.utilities.UtilHelpers;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import scala.Tuple2;
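/**
 * Unit test for the update converter, which generates random update records for the row keys of existing input
 * records.
 */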
public class TestUpdateConverter {
private JavaSparkContext jsc;
@BeforeEach
public void setup() throws Exception {
jsc = UtilHelpers.buildSparkContext(this.getClass().getName() + "-hoodie", "local[1]");
}
@AfterEach
public void teardown() {
jsc.stop();
}
@Test
public void testGenerateUpdateRecordsFromInputRecords() throws Exception {
JavaRDD<GenericRecord> inputRDD = TestUtils.makeRDD(jsc, 10);
String schemaStr = inputRDD.take(1).get(0).getSchema().toString();
int minPayloadSize = 1000;
// The update converter reads the input records and generates random updates for the same row keys
UpdateConverter updateConverter = new UpdateConverter(schemaStr, minPayloadSize,
Arrays.asList("timestamp"), Arrays.asList("_row_key"));
List<String> insertRowKeys = inputRDD.map(r -> r.get("_row_key").toString()).collect();
assertTrue(inputRDD.count() == 10);
JavaRDD<GenericRecord> outputRDD = updateConverter.convert(inputRDD);
List<String> updateRowKeys = outputRDD.map(row -> row.get("_row_key").toString()).collect();
// The insert row keys should be the same as update row keys
assertTrue(insertRowKeys.containsAll(updateRowKeys));
Map<String, GenericRecord> inputRecords = inputRDD.mapToPair(r -> new Tuple2<>(r.get("_row_key").toString(), r))
.collectAsMap();
List<GenericRecord> updateRecords = outputRDD.collect();
updateRecords.stream().forEach(updateRecord -> {
GenericRecord inputRecord = inputRecords.get(updateRecord.get("_row_key").toString());
assertTrue(areRecordsDifferent(inputRecord, updateRecord));
});
}
/**
* Checks whether any field in the two records differs (except the row key, which stays the same for an update).
*/
private boolean areRecordsDifferent(GenericRecord in, GenericRecord up) {
for (Field field : in.getSchema().getFields()) {
if (field.name() == "_row_key") {
continue;
} else {
// Just convert all types to string for now since all are primitive
if (in.get(field.name()).toString() != up.get(field.name()).toString()) {
return true;
}
}
}
return false;
}
}

View File

@@ -0,0 +1,81 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Function;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode;
import org.apache.hudi.integ.testsuite.dag.nodes.UpsertNode;
import org.apache.hudi.integ.testsuite.dag.nodes.ValidateNode;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.spark.api.java.JavaRDD;
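/**
 * A workflow DAG used in tests: an insert followed by an upsert, then a validate node that cross-checks the write
 * statuses of both parent nodes against their configured record counts.
 */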
public class ComplexDagGenerator implements WorkflowDagGenerator {
@Override
public WorkflowDag build() {
DagNode root = new InsertNode(Config.newBuilder()
.withNumRecordsToInsert(1000)
.withNumInsertPartitions(3)
.withRecordSize(1000).build());
DagNode child1 = new UpsertNode(Config.newBuilder()
.withNumRecordsToUpdate(999)
.withNumRecordsToInsert(1000)
.withNumUpsertFiles(1)
.withNumUpsertPartitions(1)
.withNumInsertPartitions(1)
.withRecordSize(10000).build());
Function<List<DagNode<JavaRDD<WriteStatus>>>, Boolean> function = (dagNodes) -> {
DagNode<JavaRDD<WriteStatus>> parent1 = dagNodes.get(0);
List<WriteStatus> statuses = parent1.getResult().collect();
long totalRecordsTouched = statuses.stream().map(st -> st.getStat().getNumUpdateWrites() + st.getStat()
.getNumInserts()).reduce((a, b) -> a + b).get();
boolean b1 = totalRecordsTouched == parent1.getConfig().getNumRecordsInsert()
+ parent1.getConfig().getNumRecordsUpsert();
boolean b2 = statuses.size() > parent1.getConfig().getNumUpsertFiles();
DagNode<JavaRDD<WriteStatus>> parent2 = parent1.getParentNodes().get(0);
statuses = parent2.getResult().collect();
totalRecordsTouched = statuses.stream().map(st -> st.getStat().getNumUpdateWrites() + st.getStat()
.getNumInserts()).reduce((a, b) -> a + b).get();
boolean b3 = totalRecordsTouched == parent2.getConfig().getNumRecordsInsert()
* parent2.getConfig().getNumInsertPartitions() + parent2.getConfig().getNumRecordsUpsert();
return b1 && b2 && b3;
};
DagNode child2 = new ValidateNode(Config.newBuilder().build(), function);
root.addChildNode(child1);
// child1.addParentNode(root);
child1.addChildNode(child2);
// child2.addParentNode(child1);
List<DagNode> rootNodes = new ArrayList<>();
rootNodes.add(root);
return new WorkflowDag(rootNodes);
}
}

View File

@@ -0,0 +1,58 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.dag.nodes.HiveSyncNode;
import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.nodes.HiveQueryNode;
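/**
 * A workflow DAG used in tests: insert, sync to a local Hive instance, then run a Hive query and validate its result.
 */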
public class HiveSyncDagGenerator implements WorkflowDagGenerator {
@Override
public WorkflowDag build() {
DagNode root = new InsertNode(Config.newBuilder()
.withNumRecordsToInsert(100)
.withNumInsertPartitions(1)
.withNumTimesToRepeat(1)
.withRecordSize(1000).build());
DagNode child1 = new HiveSyncNode(Config.newBuilder().withHiveLocal(true).build());
root.addChildNode(child1);
DagNode child2 = new HiveQueryNode(Config.newBuilder().withHiveLocal(true).withHiveQueryAndResults(Arrays
.asList(Pair.of("select " + "count(*) from testdb1.table1 group " + "by rider having count(*) < 1", 0)))
.build());
child1.addChildNode(child2);
List<DagNode> rootNodes = new ArrayList<>();
rootNodes.add(root);
return new WorkflowDag(rootNodes);
}
}

View File

@@ -0,0 +1,58 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.dag.nodes.HiveSyncNode;
import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.nodes.HiveQueryNode;
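/**
 * Same as {@link HiveSyncDagGenerator}, but the Hive query node reads the realtime (_rt) table of a MERGE_ON_READ
 * dataset.
 */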
public class HiveSyncDagGeneratorMOR implements WorkflowDagGenerator {
@Override
public WorkflowDag build() {
DagNode root = new InsertNode(Config.newBuilder()
.withNumRecordsToInsert(100)
.withNumInsertPartitions(1)
.withNumTimesToRepeat(1)
.withRecordSize(1000).build());
DagNode child1 = new HiveSyncNode(Config.newBuilder().withHiveLocal(true).build());
root.addChildNode(child1);
DagNode child2 = new HiveQueryNode(Config.newBuilder().withHiveLocal(true).withHiveQueryAndResults(Arrays
.asList(Pair.of("select " + "count(*) from testdb1.table1_rt group " + "by rider having count(*) < 1", 0)))
.build());
child1.addChildNode(child2);
List<DagNode> rootNodes = new ArrayList<>();
rootNodes.add(root);
return new WorkflowDag(rootNodes);
}
}

View File

@@ -0,0 +1,95 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.util.ArrayList;
import java.util.List;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode;
import org.apache.hudi.utilities.testutils.UtilitiesTestBase;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
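/**
 * Unit tests for converting a workflow DAG to YAML and building a DAG back from a YAML definition.
 */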
public class TestDagUtils {
private static final String COW_DAG_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/complex-dag-cow.yaml";
@Test
public void testConvertDagToYaml() throws Exception {
ComplexDagGenerator dag = new ComplexDagGenerator();
String yaml = DagUtils.convertDagToYaml(dag.build());
System.out.println(yaml);
}
@Test
public void testConvertYamlToDag() throws Exception {
WorkflowDag dag = DagUtils.convertYamlToDag(UtilitiesTestBase.Helpers
.readFileFromAbsolutePath((System.getProperty("user.dir") + "/.." + COW_DAG_DOCKER_DEMO_RELATIVE_PATH)));
assertEquals(dag.getNodeList().size(), 1);
Assertions.assertEquals(((DagNode) dag.getNodeList().get(0)).getParentNodes().size(), 0);
assertEquals(((DagNode) dag.getNodeList().get(0)).getChildNodes().size(), 1);
DagNode firstChild = (DagNode) ((DagNode) dag.getNodeList().get(0)).getChildNodes().get(0);
assertEquals(firstChild.getParentNodes().size(), 1);
assertEquals(firstChild.getChildNodes().size(), 1);
assertEquals(((DagNode) firstChild.getChildNodes().get(0)).getChildNodes().size(), 1);
}
public static class ComplexDagGenerator implements WorkflowDagGenerator {
@Override
public WorkflowDag build() {
DagNode root = new InsertNode(Config.newBuilder()
.withNumRecordsToInsert(1000000)
.withNumInsertPartitions(1)
.withNumTimesToRepeat(2)
.withRecordSize(1000).build());
DagNode child1 = new InsertNode(Config.newBuilder()
.withNumRecordsToInsert(1000000)
.withNumInsertPartitions(1)
.withNumTimesToRepeat(2)
.withRecordSize(1000).build());
DagNode child2 = new InsertNode(Config.newBuilder()
.withNumRecordsToInsert(1000000)
.withNumInsertPartitions(1)
.withNumTimesToRepeat(2)
.withRecordSize(1000).build());
root.addChildNode(child1);
root.addChildNode(child2);
DagNode child3 = new InsertNode(Config.newBuilder()
.withNumRecordsToInsert(1000000)
.withNumInsertPartitions(1)
.withNumTimesToRepeat(2)
.withRecordSize(1000).build());
child2.addChildNode(child3);
List<DagNode> rootNodes = new ArrayList<>();
rootNodes.add(root);
return new WorkflowDag(rootNodes);
}
}
}

View File

@@ -0,0 +1,56 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.generator;
import static org.junit.jupiter.api.Assertions.assertEquals;
import org.apache.avro.Schema;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.utilities.testutils.UtilitiesTestBase;
import org.junit.jupiter.api.Test;
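/**
 * Unit tests for {@link GenericRecordFullPayloadSizeEstimator}, checking the estimated payload size and the number of
 * complex fields for a simple and a complex source schema.
 */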
public class TestGenericRecordPayloadEstimator {
private static final String SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/source.avsc";
private static final String COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH =
"/docker/demo/config/test-suite/complex-source.avsc";
@Test
public void testSimpleSchemaSize() throws Exception {
Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers
.readFileFromAbsolutePath(System.getProperty("user.dir") + "/.." + SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH));
GenericRecordFullPayloadSizeEstimator estimator =
new GenericRecordFullPayloadSizeEstimator(schema);
Pair<Integer, Integer> estimateAndNumComplexFields = estimator.typeEstimateAndNumComplexFields();
assertEquals(estimateAndNumComplexFields.getRight().intValue(), 0);
assertEquals(estimateAndNumComplexFields.getLeft().intValue(), 156);
}
@Test
public void testComplexSchemaSize() throws Exception {
Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers.readFileFromAbsolutePath(
System.getProperty("user.dir") + "/.." + COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH));
GenericRecordFullPayloadSizeEstimator estimator =
new GenericRecordFullPayloadSizeEstimator(schema);
Pair<Integer, Integer> estimateAndNumComplexFields = estimator.typeEstimateAndNumComplexFields();
assertEquals(estimateAndNumComplexFields.getRight().intValue(), 1);
assertEquals(estimateAndNumComplexFields.getLeft().intValue(), 1278);
}
}

View File

@@ -0,0 +1,130 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.generator;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.IntStream;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.utilities.testutils.UtilitiesTestBase;
import org.junit.jupiter.api.Test;
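/**
 * Unit tests for the generic record payload generators: full and partial payload generation, update generation, and
 * minimum payload size handling.
 */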
public class TestGenericRecordPayloadGenerator {
private static final String SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/source.avsc";
private static final String COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH =
"/docker/demo/config/test-suite/complex-source.avsc";
@Test
public void testSimplePayload() throws Exception {
Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers
.readFileFromAbsolutePath(System.getProperty("user.dir") + "/.." + SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH));
GenericRecordFullPayloadGenerator payloadGenerator = new GenericRecordFullPayloadGenerator(schema);
GenericRecord record = payloadGenerator.getNewPayload();
// The generated payload should validate with the provided schema
payloadGenerator.validate(record);
}
@Test
public void testComplexPayload() throws IOException {
Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers
.readFileFromAbsolutePath(System.getProperty("user.dir") + "/.." +
COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH));
GenericRecordFullPayloadGenerator payloadGenerator = new GenericRecordFullPayloadGenerator(schema);
GenericRecord record = payloadGenerator.getNewPayload();
// The generated payload should validate with the provided schema
assertTrue(payloadGenerator.validate(record));
}
@Test
public void testComplexPartialPayload() throws IOException {
Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers
.readFileFromAbsolutePath(System.getProperty("user.dir") + "/.." +
COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH));
GenericRecordPartialPayloadGenerator payloadGenerator = new GenericRecordPartialPayloadGenerator(schema);
IntStream.range(0, 10).forEach(a -> {
GenericRecord record = payloadGenerator.getNewPayload();
// The generated payload should validate with the provided schema
assertTrue(payloadGenerator.validate(record));
});
}
@Test
public void testUpdatePayloadGenerator() throws IOException {
Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers
.readFileFromAbsolutePath(System.getProperty("user.dir") + "/.." + SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH));
GenericRecordFullPayloadGenerator payloadGenerator = new GenericRecordFullPayloadGenerator(schema);
List<String> insertRowKeys = new ArrayList<>();
List<String> updateRowKeys = new ArrayList<>();
List<Long> insertTimeStamps = new ArrayList<>();
List<Long> updateTimeStamps = new ArrayList<>();
List<GenericRecord> records = new ArrayList<>();
// Generate 10 new records
IntStream.range(0, 10).forEach(a -> {
GenericRecord record = payloadGenerator.getNewPayload();
records.add(record);
insertRowKeys.add(record.get("_row_key").toString());
insertTimeStamps.add((Long) record.get("timestamp"));
});
List<String> blacklistFields = Arrays.asList("_row_key");
records.stream().forEach(a -> {
// Generate 10 updated records
GenericRecord record = payloadGenerator.getUpdatePayload(a, blacklistFields);
updateRowKeys.add(record.get("_row_key").toString());
updateTimeStamps.add((Long) record.get("timestamp"));
});
// The row keys from insert payloads should match all the row keys from the update payloads
assertTrue(insertRowKeys.containsAll(updateRowKeys));
// The timestamp field for the insert payloads should not all match with the update payloads
assertFalse(insertTimeStamps.containsAll(updateTimeStamps));
}
@Test
public void testSimplePayloadWithLargeMinSize() throws Exception {
Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers
.readFileFromAbsolutePath(System.getProperty("user.dir") + "/.." + SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH));
int minPayloadSize = 1000;
GenericRecordFullPayloadGenerator payloadGenerator = new GenericRecordFullPayloadGenerator(schema,
minPayloadSize);
GenericRecord record = payloadGenerator.getNewPayload();
// The generated payload stays below minPayloadSize since the schema has no collection fields to pad
assertTrue(HoodieAvroUtils.avroToBytes(record).length < minPayloadSize);
}
@Test
public void testComplexPayloadWithLargeMinSize() throws Exception {
Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers
.readFileFromAbsolutePath(System.getProperty("user.dir") + "/.." +
COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH));
int minPayloadSize = 10000;
GenericRecordFullPayloadGenerator payloadGenerator = new GenericRecordFullPayloadGenerator(
schema, minPayloadSize);
GenericRecord record = payloadGenerator.getNewPayload();
// The generated payload should be at most 10% larger than minPayloadSize
assertTrue(HoodieAvroUtils.avroToBytes(record).length < minPayloadSize + 0.1 * minPayloadSize);
}
}

View File

@@ -0,0 +1,224 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.job;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.util.UUID;
import java.util.stream.Stream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.DataSourceWriteOptions;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteJob;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteJob.HoodieTestSuiteConfig;
import org.apache.hudi.integ.testsuite.dag.ComplexDagGenerator;
import org.apache.hudi.integ.testsuite.dag.HiveSyncDagGenerator;
import org.apache.hudi.integ.testsuite.dag.HiveSyncDagGeneratorMOR;
import org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator;
import org.apache.hudi.integ.testsuite.reader.DeltaInputType;
import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode;
import org.apache.hudi.keygen.TimestampBasedKeyGenerator;
import org.apache.hudi.utilities.schema.FilebasedSchemaProvider;
import org.apache.hudi.utilities.sources.AvroDFSSource;
import org.apache.hudi.utilities.testutils.UtilitiesTestBase;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;
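/**
 * End-to-end tests for {@link HoodieTestSuiteJob}, running insert/upsert/validate and hive-sync DAGs defined either
 * programmatically or via YAML against the configured table types.
 */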
public class TestHoodieTestSuiteJob extends UtilitiesTestBase {
private static final String TEST_NAME_WITH_PARAMS = "[{index}] Test with useDeltaStreamer={0}, tableType={1}";
private static final String BASE_PROPERTIES_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/base"
+ ".properties";
private static final String SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/source.avsc";
private static final String TARGET_SCHEMA_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/target.avsc";
private static final String COW_DAG_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/complex-dag-cow.yaml";
private static final String MOR_DAG_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/complex-dag-mor.yaml";
public static Stream<Arguments> configParams() {
Object[][] data =
new Object[][] {{false, "COPY_ON_WRITE"}};
return Stream.of(data).map(Arguments::of);
}
@BeforeAll
public static void initClass() throws Exception {
UtilitiesTestBase.initClass();
// prepare the configs.
UtilitiesTestBase.Helpers.copyToDFSFromAbsolutePath(System.getProperty("user.dir") + "/.."
+ BASE_PROPERTIES_DOCKER_DEMO_RELATIVE_PATH, dfs, dfsBasePath + "/base.properties");
UtilitiesTestBase.Helpers.copyToDFSFromAbsolutePath(System.getProperty("user.dir") + "/.."
+ SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH, dfs, dfsBasePath + "/source.avsc");
UtilitiesTestBase.Helpers.copyToDFSFromAbsolutePath(System.getProperty("user.dir") + "/.."
+ TARGET_SCHEMA_DOCKER_DEMO_RELATIVE_PATH, dfs, dfsBasePath + "/target.avsc");
UtilitiesTestBase.Helpers.copyToDFSFromAbsolutePath(System.getProperty("user.dir") + "/.."
+ COW_DAG_DOCKER_DEMO_RELATIVE_PATH, dfs, dfsBasePath + "/complex-dag-cow.yaml");
UtilitiesTestBase.Helpers.copyToDFSFromAbsolutePath(System.getProperty("user.dir") + "/.."
+ MOR_DAG_DOCKER_DEMO_RELATIVE_PATH, dfs, dfsBasePath + "/complex-dag-mor.yaml");
TypedProperties props = new TypedProperties();
props.setProperty("hoodie.datasource.write.recordkey.field", "_row_key");
props.setProperty("hoodie.datasource.write.partitionpath.field", "timestamp");
props.setProperty("hoodie.deltastreamer.keygen.timebased.timestamp.type", "UNIX_TIMESTAMP");
props.setProperty("hoodie.deltastreamer.keygen.timebased.output.dateformat", "yyyy/MM/dd");
props.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/source.avsc");
props.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/source.avsc");
props.setProperty("hoodie.deltastreamer.source.dfs.root", dfsBasePath + "/input");
props.setProperty("hoodie.datasource.hive_sync.assume_date_partitioning", "true");
props.setProperty("hoodie.datasource.write.keytranslator.class", "org.apache.hudi"
+ ".DayBasedPartitionPathKeyTranslator");
props.setProperty("hoodie.compact.inline.max.delta.commits", "3");
props.setProperty("hoodie.parquet.max.file.size", "1024000");
props.setProperty("hoodie.compact.inline.max.delta.commits", "0");
// Hive Configs
props.setProperty(DataSourceWriteOptions.HIVE_URL_OPT_KEY(), "jdbc:hive2://127.0.0.1:9999/");
props.setProperty(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY(), "testdb1");
props.setProperty(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY(), "table1");
props.setProperty(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "datestr");
props.setProperty(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY(), TimestampBasedKeyGenerator.class.getName());
UtilitiesTestBase.Helpers.savePropsToDFS(props, dfs, dfsBasePath + "/test-source"
+ ".properties");
// Properties used for the delta-streamer which incrementally pulls from upstream DFS Avro source and
// writes to downstream hudi table
TypedProperties downstreamProps = new TypedProperties();
downstreamProps.setProperty("include", "base.properties");
downstreamProps.setProperty("hoodie.datasource.write.recordkey.field", "_row_key");
downstreamProps.setProperty("hoodie.datasource.write.partitionpath.field", "timestamp");
// Source schema is the target schema of upstream table
downstreamProps.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/source.avsc");
downstreamProps.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/source.avsc");
UtilitiesTestBase.Helpers.savePropsToDFS(downstreamProps, dfs,
dfsBasePath + "/test-downstream-source.properties");
// these tests cause a lot of log verbosity from spark, turning it down
Logger.getLogger("org.apache.spark").setLevel(Level.WARN);
}
@AfterAll
public static void cleanupClass() {
UtilitiesTestBase.cleanupClass();
}
@BeforeEach
public void setup() throws Exception {
super.setup();
}
@AfterEach
public void teardown() throws Exception {
super.teardown();
}
// Tests in this class add to the test build time significantly. Since this is an integration test (end to end), we
// would like to run it as a nightly build, which is a TODO.
// TODO : Clean up input / result paths after each test
@ParameterizedTest(name = TEST_NAME_WITH_PARAMS)
@MethodSource("configParams")
public void testDagWithInsertUpsertAndValidate(boolean useDeltaStreamer, String tableType) throws Exception {
dfs.delete(new Path(dfsBasePath + "/input"), true);
dfs.delete(new Path(dfsBasePath + "/result"), true);
String inputBasePath = dfsBasePath + "/input/" + UUID.randomUUID().toString();
String outputBasePath = dfsBasePath + "/result/" + UUID.randomUUID().toString();
HoodieTestSuiteConfig cfg = makeConfig(inputBasePath, outputBasePath, useDeltaStreamer, tableType);
cfg.workloadDagGenerator = ComplexDagGenerator.class.getName();
HoodieTestSuiteJob hoodieTestSuiteJob = new HoodieTestSuiteJob(cfg, jsc);
hoodieTestSuiteJob.runTestSuite();
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(new Configuration(), cfg.targetBasePath);
assertEquals(metaClient.getActiveTimeline().getCommitsTimeline().getInstants().count(), 2);
}
@MethodSource("configParams")
public void testHiveSync(boolean useDeltaStreamer, String tableType) throws Exception {
dfs.delete(new Path(dfsBasePath + "/input"), true);
dfs.delete(new Path(dfsBasePath + "/result"), true);
String inputBasePath = dfsBasePath + "/input";
String outputBasePath = dfsBasePath + "/result";
HoodieTestSuiteConfig cfg = makeConfig(inputBasePath, outputBasePath, useDeltaStreamer, tableType);
if (HoodieTableType.COPY_ON_WRITE.name().equals(tableType)) {
cfg.workloadDagGenerator = HiveSyncDagGenerator.class.getName();
} else {
cfg.workloadDagGenerator = HiveSyncDagGeneratorMOR.class.getName();
}
HoodieTestSuiteJob hoodieTestSuiteJob = new HoodieTestSuiteJob(cfg, jsc);
hoodieTestSuiteJob.runTestSuite();
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(new Configuration(), cfg.targetBasePath);
assertEquals(metaClient.getActiveTimeline().getCommitsTimeline().getInstants().count(), 1);
}
@MethodSource("configParams")
public void testCOWFullDagFromYaml(boolean useDeltaStreamer, String tableType) throws Exception {
dfs.delete(new Path(dfsBasePath + "/input"), true);
dfs.delete(new Path(dfsBasePath + "/result"), true);
String inputBasePath = dfsBasePath + "/input";
String outputBasePath = dfsBasePath + "/result";
HoodieTestSuiteConfig cfg = makeConfig(inputBasePath, outputBasePath, useDeltaStreamer, HoodieTableType
.COPY_ON_WRITE.name());
cfg.workloadYamlPath = dfsBasePath + "/complex-dag-cow.yaml";
HoodieTestSuiteJob hoodieTestSuiteJob = new HoodieTestSuiteJob(cfg, jsc);
hoodieTestSuiteJob.runTestSuite();
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(new Configuration(), cfg.targetBasePath);
assertEquals(metaClient.getActiveTimeline().getCommitsTimeline().getInstants().count(), 5);
}
@MethodSource("configParams")
public void testMORFullDagFromYaml(boolean useDeltaStreamer, String tableType) throws Exception {
dfs.delete(new Path(dfsBasePath + "/input"), true);
dfs.delete(new Path(dfsBasePath + "/result"), true);
String inputBasePath = dfsBasePath + "/input";
String outputBasePath = dfsBasePath + "/result";
HoodieTestSuiteConfig cfg = makeConfig(inputBasePath, outputBasePath, useDeltaStreamer, HoodieTableType
.MERGE_ON_READ.name());
cfg.workloadYamlPath = dfsBasePath + "/complex-dag-mor.yaml";
HoodieTestSuiteJob hoodieTestSuiteJob = new HoodieTestSuiteJob(cfg, jsc);
hoodieTestSuiteJob.runTestSuite();
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(new Configuration(), cfg.targetBasePath);
assertEquals(metaClient.getActiveTimeline().getCommitsTimeline().getInstants().count(), 7);
}
protected HoodieTestSuiteConfig makeConfig(String inputBasePath, String outputBasePath, boolean useDeltaStream,
String tableType) {
HoodieTestSuiteConfig cfg = new HoodieTestSuiteConfig();
cfg.targetBasePath = outputBasePath;
cfg.inputBasePath = inputBasePath;
cfg.targetTableName = "table1";
cfg.tableType = tableType;
cfg.sourceClassName = AvroDFSSource.class.getName();
cfg.sourceOrderingField = "timestamp";
cfg.propsFilePath = dfsBasePath + "/test-source.properties";
cfg.outputTypeName = DeltaOutputMode.DFS.name();
cfg.inputFormatName = DeltaInputType.AVRO.name();
cfg.limitFileSize = 1024 * 1024L;
cfg.sourceLimit = 20000000;
cfg.workloadDagGenerator = WorkflowDagGenerator.class.getName();
cfg.schemaProviderClassName = FilebasedSchemaProvider.class.getName();
cfg.useDeltaStreamer = useDeltaStream;
return cfg;
}
}

View File

@@ -0,0 +1,74 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.reader;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.utils.TestUtils;
import org.apache.hudi.utilities.testutils.UtilitiesTestBase;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
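/**
 * Unit test for {@link DFSAvroDeltaInputReader}, reading back avro files written to DFS with an approximate record
 * limit.
 */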
public class TestDFSAvroDeltaInputReader extends UtilitiesTestBase {
@BeforeAll
public static void initClass() throws Exception {
UtilitiesTestBase.initClass();
}
@AfterAll
public static void cleanupClass() {
UtilitiesTestBase.cleanupClass();
}
@BeforeEach
public void setup() throws Exception {
super.setup();
}
@Test
@Disabled
public void testDFSSinkReader() throws IOException {
FileSystem fs = FSUtils.getFs(dfsBasePath, new Configuration());
// Create 10 avro files with 10 records each
TestUtils.createAvroFiles(jsc, sparkSession, dfsBasePath, 10, 10);
FileStatus[] statuses = fs.globStatus(new Path(dfsBasePath + "/*/*.avro"));
DFSAvroDeltaInputReader reader =
new DFSAvroDeltaInputReader(sparkSession, TestUtils.getSchema().toString(), dfsBasePath, Option.empty(),
Option.empty());
assertEquals(reader.analyzeSingleFile(statuses[0].getPath().toString()), 5);
assertEquals(reader.read(100).count(), 100);
assertEquals(reader.read(1000).count(), 100);
assertEquals(reader.read(10).count(), 10);
assertTrue(reader.read(11).count() > 11);
}
}

View File

@@ -0,0 +1,119 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.reader;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.HashSet;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.client.HoodieWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
import org.apache.hudi.common.testutils.HoodieTestUtils;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.utilities.schema.FilebasedSchemaProvider;
import org.apache.hudi.utilities.testutils.UtilitiesTestBase;
import org.apache.spark.api.java.JavaRDD;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
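/**
 * Unit test for {@link DFSHoodieDatasetInputReader}, reading records back from an existing hudi dataset with limits
 * on the number of partitions, files and records.
 */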
public class TestDFSHoodieDatasetInputReader extends UtilitiesTestBase {
@BeforeAll
public static void initClass() throws Exception {
UtilitiesTestBase.initClass();
}
@AfterAll
public static void cleanupClass() {
UtilitiesTestBase.cleanupClass();
}
@BeforeEach
public void setup() throws Exception {
super.setup();
HoodieTestUtils.init(jsc.hadoopConfiguration(), dfsBasePath);
}
@AfterEach
public void teardown() throws Exception {
super.teardown();
}
@Test
public void testSimpleHoodieDatasetReader() throws Exception {
HoodieWriteConfig config = makeHoodieClientConfig();
HoodieWriteClient client = new HoodieWriteClient(jsc, config);
String commitTime = client.startCommit();
HoodieTestDataGenerator generator = new HoodieTestDataGenerator();
// Insert 100 records across 3 partitions
List<HoodieRecord> inserts = generator.generateInserts(commitTime, 100);
JavaRDD<WriteStatus> writeStatuses = client.upsert(jsc.parallelize(inserts), commitTime);
writeStatuses.count();
DFSHoodieDatasetInputReader reader = new DFSHoodieDatasetInputReader(jsc, config.getBasePath(),
HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema())).toString());
// Try to read 100 records for the same partition path and same file ID
JavaRDD<GenericRecord> records = reader.read(1, 1, 100L);
assertTrue(records.count() <= 100);
assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).collect()).size(),
1);
assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.FILENAME_METADATA_FIELD)).collect()).size(),
1);
// Try to read 100 records for 3 partition paths and 3 different file ids
records = reader.read(3, 3, 100L);
assertTrue(records.count() <= 100);
assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).collect()).size(),
3);
assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.FILENAME_METADATA_FIELD)).collect()).size(),
3);
// Try to read 100 records for 3 partition paths and 50% records from each file
records = reader.read(3, 3, 0.5);
assertTrue(records.count() <= 100);
assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).collect()).size(),
3);
assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.FILENAME_METADATA_FIELD)).collect()).size(),
3);
}
private HoodieWriteConfig makeHoodieClientConfig() throws Exception {
return makeHoodieClientConfigBuilder().build();
}
private HoodieWriteConfig.Builder makeHoodieClientConfigBuilder() throws Exception {
// Prepare the AvroParquetIO
return HoodieWriteConfig.newBuilder().withPath(dfsBasePath)
.withParallelism(2, 2)
.withSchema(HoodieTestDataGenerator
.TRIP_EXAMPLE_SCHEMA);
}
}

View File

@@ -0,0 +1,63 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.utils;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.AvroConversionUtils;
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
import org.apache.hudi.utilities.schema.RowBasedSchemaProvider;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
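/**
 * Helper methods for the test suite tests: generating generic records and writing them out as avro files on DFS.
 */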
public class TestUtils {
/**
* Create a RDD of generic records for testing purposes.
*/
public static JavaRDD<GenericRecord> makeRDD(JavaSparkContext jsc, int numRecords) {
return jsc.parallelize(generateGenericRecords(numRecords));
}
/**
* Generate generic records.
*/
public static List<GenericRecord> generateGenericRecords(int numRecords) {
HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
return dataGenerator.generateGenericRecords(numRecords);
}
public static void createAvroFiles(JavaSparkContext jsc, SparkSession sparkSession, String basePath, int numFiles,
int numRecordsPerFile) {
Schema schema = HoodieTestDataGenerator.AVRO_SCHEMA;
for (int i = 0; i < numFiles; i++) {
JavaRDD<GenericRecord> rdd = makeRDD(jsc, numRecordsPerFile);
AvroConversionUtils.createDataFrame(rdd.rdd(), schema.toString(), sparkSession).write()
.format("avro").option("recordName", RowBasedSchemaProvider.HOODIE_RECORD_STRUCT_NAME)
.option("recordNamespace", RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE).save(basePath + "/" + i);
}
}
public static Schema getSchema() {
return HoodieTestDataGenerator.AVRO_SCHEMA;
}
}

View File

@@ -15,8 +15,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
###
log4j.rootLogger=WARN, CONSOLE
log4j.logger.org.apache.hudi=DEBUG
log4j.rootLogger=ERROR, CONSOLE
log4j.logger.org.apache.hudi=ERROR
log4j.category.org.apache.spark=ERROR
# CONSOLE is set to be a ConsoleAppender.
log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender
@@ -25,5 +26,5 @@ log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout
log4j.appender.CONSOLE.layout.ConversionPattern=[%-5p] %d %c %x - %m%n
log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter
log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true
log4j.appender.CONSOLE.filter.a.LevelMin=WARN
log4j.appender.CONSOLE.filter.a.LevelMin=ERROR
log4j.appender.CONSOLE.filter.a.LevelMax=FATAL