1
0

[HUDI-3312] Fixing spark yaml and adding hive validation to integ test suite (#4731)

This commit is contained in:
Sivabalan Narayanan
2022-02-08 00:40:36 -05:00
committed by GitHub
parent 8ab6f17149
commit 0ab1a8ec80
14 changed files with 295 additions and 38 deletions

View File

@@ -82,8 +82,8 @@ spark-submit
2.YAML file
Choose to write up the entire DAG of operations in YAML, take a look at `complex-dag-cow.yaml` or
`complex-dag-mor.yaml`.
Choose to write up the entire DAG of operations in YAML, take a look at `simple-deltastreamer.yaml`
for an example.
Once you're ready with the DAG you want to execute, simply pass the yaml file path as follows:
```
@@ -177,7 +177,7 @@ cd /opt
Copy the integration tests jar into the docker container
```
docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar adhoc-2:/opt
docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.11.0-SNAPSHOT.jar adhoc-2:/opt
```
```
@@ -217,7 +217,7 @@ spark-submit \
--conf spark.driver.extraClassPath=/var/demo/jars/* \
--conf spark.executor.extraClassPath=/var/demo/jars/* \
--class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \
/opt/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar \
/opt/hudi-integ-test-bundle-0.11.0-SNAPSHOT.jar \
--source-ordering-field test_suite_source_ordering_field \
--use-deltastreamer \
--target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \
@@ -227,7 +227,7 @@ spark-submit \
--schemaprovider-class org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider \
--source-class org.apache.hudi.utilities.sources.AvroDFSSource \
--input-file-size 125829120 \
--workload-yaml-path file:/var/hoodie/ws/docker/demo/config/test-suite/complex-dag-cow.yaml \
--workload-yaml-path file:/var/hoodie/ws/docker/demo/config/test-suite/simple-deltastreamer.yaml \
--workload-generator-classname org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \
--table-type COPY_ON_WRITE \
--compact-scheduling-minshare 1 \
@@ -264,7 +264,7 @@ spark-submit \
--conf spark.driver.extraClassPath=/var/demo/jars/* \
--conf spark.executor.extraClassPath=/var/demo/jars/* \
--class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \
/opt/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar \
/opt/hudi-integ-test-bundle-0.11.0-SNAPSHOT.jar \
--source-ordering-field test_suite_source_ordering_field \
--use-deltastreamer \
--target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \
@@ -274,7 +274,7 @@ spark-submit \
--schemaprovider-class org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider \
--source-class org.apache.hudi.utilities.sources.AvroDFSSource \
--input-file-size 125829120 \
--workload-yaml-path file:/var/hoodie/ws/docker/demo/config/test-suite/complex-dag-mor.yaml \
--workload-yaml-path file:/var/hoodie/ws/docker/demo/config/test-suite/simple-deltastreamer.yaml \
--workload-generator-classname org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \
--table-type MERGE_ON_READ \
--compact-scheduling-minshare 1 \
@@ -308,16 +308,16 @@ contents both via spark datasource and hive table via spark sql engine. Hive val
If you have "ValidateDatasetNode" in your dag, do not replace hive jars as instructed above. The Spark SQL engine does not
work well with hive2* jars. So, after running docker setup, follow the below steps.
```
docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar adhoc-2:/opt/
docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.11.0-SNAPSHOT.jar adhoc-2:/opt/
docker cp docker/demo/config/test-suite/test.properties adhoc-2:/opt/
```
Also copy your dag of interest to adhoc-2:/opt/
```
docker cp docker/demo/config/test-suite/complex-dag-cow.yaml adhoc-2:/opt/
docker cp docker/demo/config/test-suite/simple-deltastreamer.yaml adhoc-2:/opt/
```
For repeated runs, two additional configs need to be set: "dag_rounds" and "dag_intermittent_delay_mins".
This means that your dag will be repeated for N times w/ a delay of Y mins between each round. Note: complex-dag-cow.yaml
This means that your dag will be repeated N times with a delay of Y mins between each round. Note: simple-deltastreamer.yaml
already has all these configs set. So no changes required just to try it out.
Also, ValidateDatasetNode can be configured in two ways. Either with "delete_input_data" set to true or without
@@ -457,7 +457,7 @@ spark-submit \
--conf spark.driver.extraClassPath=/var/demo/jars/* \
--conf spark.executor.extraClassPath=/var/demo/jars/* \
--class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \
/opt/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar \
/opt/hudi-integ-test-bundle-0.11.0-SNAPSHOT.jar \
--source-ordering-field test_suite_source_ordering_field \
--use-deltastreamer \
--target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \
@@ -467,7 +467,7 @@ spark-submit \
--schemaprovider-class org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider \
--source-class org.apache.hudi.utilities.sources.AvroDFSSource \
--input-file-size 125829120 \
--workload-yaml-path file:/opt/complex-dag-cow.yaml \
--workload-yaml-path file:/opt/simple-deltastreamer.yaml \
--workload-generator-classname org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \
--table-type COPY_ON_WRITE \
--compact-scheduling-minshare 1 \
@@ -486,8 +486,8 @@ If you wish to enable metrics add below properties as well
A few ready-to-use dags are available under docker/demo/config/test-suite/ that could give you an idea for long-running
dags.
```
complex-dag-cow.yaml: simple 1 round dag for COW table.
complex-dag-mor.yaml: simple 1 round dag for MOR table.
simple-deltastreamer.yaml: simple 1 round dag for COW table.
simple-deltastreamer-mor.yaml: simple 1 round dag for MOR table. (NOTE(review): the source listed simple-deltastreamer.yaml twice; verify the actual MOR dag filename under docker/demo/config/test-suite/.)
cow-clustering-example.yaml : dag with 3 rounds, in which inline clustering will trigger during 2nd iteration.
cow-long-running-example.yaml : long running dag with 50 iterations. only 1 partition is used.
cow-long-running-multi-partitions.yaml: long running dag with 50 iterations with multiple partitions.

View File

@@ -95,6 +95,7 @@ public class DeltaConfig implements Serializable {
private static String SCHEMA_VERSION = "schema_version";
private static String NUM_ROLLBACKS = "num_rollbacks";
private static String ENABLE_ROW_WRITING = "enable_row_writing";
private static String ENABLE_METADATA_VALIDATE = "enable_metadata_validate";
// Spark SQL Create Table
private static String TABLE_TYPE = "table_type";
@@ -149,6 +150,10 @@ public class DeltaConfig implements Serializable {
return Integer.valueOf(configsMap.getOrDefault(RECORD_SIZE, 1024).toString());
}
public boolean isEnableMetadataValidate() {
return Boolean.valueOf(configsMap.getOrDefault(ENABLE_METADATA_VALIDATE, false).toString());
}
public int getNumInsertPartitions() {
return Integer.valueOf(configsMap.getOrDefault(NUM_PARTITIONS_INSERT, 1).toString());
}

View File

@@ -111,14 +111,17 @@ public abstract class BaseValidateDatasetNode extends DagNode<Boolean> {
String database = context.getWriterContext().getProps().getString(DataSourceWriteOptions.HIVE_DATABASE().key());
String tableName = context.getWriterContext().getProps().getString(DataSourceWriteOptions.HIVE_TABLE().key());
log.warn("Validating hive table with db : " + database + " and table : " + tableName);
Dataset<Row> cowDf = session.sql("SELECT * FROM " + database + "." + tableName);
Dataset<Row> trimmedCowDf = cowDf.drop(HoodieRecord.COMMIT_TIME_METADATA_FIELD).drop(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD).drop(HoodieRecord.RECORD_KEY_METADATA_FIELD)
.drop(HoodieRecord.PARTITION_PATH_METADATA_FIELD).drop(HoodieRecord.FILENAME_METADATA_FIELD);
intersectionDf = inputSnapshotDf.intersect(trimmedCowDf);
session.sql("REFRESH TABLE " + database + "." + tableName);
Dataset<Row> cowDf = session.sql("SELECT _row_key, rider, driver, begin_lat, begin_lon, end_lat, end_lon, fare, _hoodie_is_deleted, " +
"test_suite_source_ordering_field FROM " + database + "." + tableName);
Dataset<Row> reorderedInputDf = inputSnapshotDf.select("_row_key","rider","driver","begin_lat","begin_lon","end_lat","end_lon","fare",
"_hoodie_is_deleted","test_suite_source_ordering_field");
Dataset<Row> intersectedHiveDf = reorderedInputDf.intersect(cowDf);
outputCount = trimmedHudiDf.count();
log.warn("Input count: " + inputCount + "; output count: " + outputCount);
// the intersected df should be same as inputDf. if not, there is some mismatch.
if (outputCount == 0 || inputSnapshotDf.except(intersectionDf).count() != 0) {
if (outputCount == 0 || reorderedInputDf.except(intersectedHiveDf).count() != 0) {
log.error("Data set validation failed for COW hive table. Total count in hudi " + outputCount + ", input df count " + inputCount);
throw new AssertionError("Hudi hive table contents does not match contents input data. ");
}

View File

@@ -18,6 +18,7 @@
package org.apache.hudi.integ.testsuite.dag.nodes;
import org.apache.hudi.common.config.HoodieMetadataConfig;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
@@ -49,7 +50,8 @@ public class ValidateDatasetNode extends BaseValidateDatasetNode {
StructType inputSchema) {
String hudiPath = context.getHoodieTestSuiteWriter().getCfg().targetBasePath + "/*/*/*";
log.info("Validate data in target hudi path " + hudiPath);
Dataset<Row> hudiDf = session.read().format("hudi").load(hudiPath);
Dataset<Row> hudiDf = session.read().option(HoodieMetadataConfig.ENABLE.key(), String.valueOf(config.isEnableMetadataValidate()))
.format("hudi").load(hudiPath);
return hudiDf.drop(HoodieRecord.COMMIT_TIME_METADATA_FIELD).drop(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD).drop(HoodieRecord.RECORD_KEY_METADATA_FIELD)
.drop(HoodieRecord.PARTITION_PATH_METADATA_FIELD).drop(HoodieRecord.FILENAME_METADATA_FIELD);
}

View File

@@ -60,7 +60,7 @@ class SparkInsertNode(dagNodeConfig: Config) extends DagNode[RDD[WriteStatus]] {
.option(DataSourceWriteOptions.COMMIT_METADATA_KEYPREFIX.key, "deltastreamer.checkpoint.key")
.option("deltastreamer.checkpoint.key", context.getWriterContext.getHoodieTestSuiteWriter.getLastCheckpoint.orElse(""))
.option(HoodieWriteConfig.TBL_NAME.key, context.getHoodieTestSuiteWriter.getCfg.targetTableName)
.mode(SaveMode.Overwrite)
.mode(SaveMode.Append)
.save(context.getHoodieTestSuiteWriter.getWriteConfig.getBasePath)
}
}