[HUDI-1331] Adding support for validating entire dataset and long running tests in test suite framework (#2168)

* trigger rebuild * [HUDI-1156] Remove unused dependencies from HoodieDeltaStreamerWrapper Class (#1927) * Adding support for validating records and long running tests in test suite framework * Adding partial validate node * Fixing spark session initiation in Validate nodes * Fixing validation * Adding hive table validation to ValidateDatasetNode * Rebasing with latest commits from master * Addressing feedback * Addressing comments Co-authored-by: lamber-ken <lamberken@163.com> Co-authored-by: linshan-ma <mabin194046@163.com>
2020-12-26 12:29:24 -05:00
parent 3ec9270e8e
commit 8cf6a7223f
33 changed files with 1101 additions and 518 deletions
--- a/hudi-integ-test/src/test/resources/unit-test-cow-dag.yaml
+++ b/hudi-integ-test/src/test/resources/unit-test-cow-dag.yaml
@@ -13,58 +13,62 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-first_insert:
-  config:
-    record_size: 70000
-    num_partitions_insert: 1
-    repeat_count: 2
-    num_records_insert: 100
-  type: InsertNode
-  deps: none
-second_insert:
-  config:
-    record_size: 70000
-    num_partitions_insert: 1
-    repeat_count: 1
-    num_records_insert: 100
-  type: InsertNode
-  deps: first_insert
-first_rollback:
-  config:
-  deps: second_insert
-  type: RollbackNode
-third_insert:
-  config:
-    record_size: 70000
-    num_partitions_insert: 1
-    repeat_count: 1
-    num_records_insert: 100
-  type: InsertNode
-  deps: first_rollback
-first_upsert:
-  config:
-    record_size: 70000
-    num_partitions_upsert: 1
-    repeat_count: 1
-    num_records_upsert: 100
-  type: UpsertNode
-  deps: third_insert
-first_hive_sync:
-  config:
-    queue_name: "adhoc"
-    engine: "mr"
-  type: HiveSyncNode
-  deps: first_upsert
-first_hive_query:
-  config:
-    hive_props:
-      prop2: "set spark.yarn.queue="
-      prop3: "set hive.strict.checks.large.query=false"
-      prop4: "set hive.stats.autogather=false"
-    hive_queries:
-      query1: "select count(*) from testdb1.table1"
-      result1: 300
-      query2: "select count(*) from testdb1.table1 group   by `_row_key` having count(*) > 1"
-      result2: 0
-  type: HiveQueryNode
-  deps: first_hive_sync
+dag_name: unit-test-cow-dag
+dag_rounds: 1
+dag_intermittent_delay_mins: 10
+dag_content:
+  first_insert:
+    config:
+      record_size: 70000
+      num_partitions_insert: 1
+      repeat_count: 2
+      num_records_insert: 100
+    type: InsertNode
+    deps: none
+  second_insert:
+    config:
+      record_size: 70000
+      num_partitions_insert: 1
+      repeat_count: 1
+      num_records_insert: 100
+    type: InsertNode
+    deps: first_insert
+  first_rollback:
+    config:
+    deps: second_insert
+    type: RollbackNode
+  third_insert:
+    config:
+      record_size: 70000
+      num_partitions_insert: 1
+      repeat_count: 1
+      num_records_insert: 100
+    type: InsertNode
+    deps: first_rollback
+  first_upsert:
+    config:
+      record_size: 70000
+      num_partitions_upsert: 1
+      repeat_count: 1
+      num_records_upsert: 100
+    type: UpsertNode
+    deps: third_insert
+  first_hive_sync:
+    config:
+      queue_name: "adhoc"
+      engine: "mr"
+    type: HiveSyncNode
+    deps: first_upsert
+  first_hive_query:
+    config:
+      hive_props:
+        prop2: "set spark.yarn.queue="
+        prop3: "set hive.strict.checks.large.query=false"
+        prop4: "set hive.stats.autogather=false"
+      hive_queries:
+        query1: "select count(*) from testdb1.table1"
+        result1: 300
+        query2: "select count(*) from testdb1.table1 group   by `_row_key` having count(*) > 1"
+        result2: 0
+    type: HiveQueryNode
+    deps: first_hive_sync
--- a/hudi-integ-test/src/test/resources/unit-test-mor-dag.yaml
+++ b/hudi-integ-test/src/test/resources/unit-test-mor-dag.yaml
@@ -13,58 +13,62 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-first_insert:
-  config:
-    record_size: 70000
-    num_partitions_insert: 1
-    repeat_count: 2
-    num_records_insert: 100
-  type: InsertNode
-  deps: none
-second_insert:
-  config:
-    record_size: 70000
-    num_partitions_insert: 1
-    repeat_count: 1
-    num_records_insert: 100
-  type: InsertNode
-  deps: first_insert
-first_rollback:
-  config:
-  deps: second_insert
-  type: RollbackNode
-third_insert:
-  config:
-    record_size: 70000
-    num_partitions_insert: 1
-    repeat_count: 1
-    num_records_insert: 100
-  type: InsertNode
-  deps: first_rollback
-first_upsert:
-  config:
-    record_size: 70000
-    num_partitions_upsert: 1
-    repeat_count: 1
-    num_records_upsert: 100
-  type: UpsertNode
-  deps: third_insert
-first_hive_sync:
-  config:
-    queue_name: "adhoc"
-    engine: "mr"
-  type: HiveSyncNode
-  deps: first_upsert
-first_hive_query:
-  config:
-    hive_props:
-      prop2: "set spark.yarn.queue="
-      prop3: "set hive.strict.checks.large.query=false"
-      prop4: "set hive.stats.autogather=false"
-    hive_queries:
-      query1: "select count(*) from testdb1.table1"
-      result1: 300
-      query2: "select count(*) from testdb1.table1 group   by `_row_key` having count(*) > 1"
-      result2: 0
-  type: HiveQueryNode
-  deps: first_hive_sync
+dag_name: unit-test-mor-dag
+dag_rounds: 1
+dag_intermittent_delay_mins: 10
+dag_content:
+  first_insert:
+    config:
+      record_size: 70000
+      num_partitions_insert: 1
+      repeat_count: 2
+      num_records_insert: 100
+    type: InsertNode
+    deps: none
+  second_insert:
+    config:
+      record_size: 70000
+      num_partitions_insert: 1
+      repeat_count: 1
+      num_records_insert: 100
+    type: InsertNode
+    deps: first_insert
+  first_rollback:
+    config:
+    deps: second_insert
+    type: RollbackNode
+  third_insert:
+    config:
+      record_size: 70000
+      num_partitions_insert: 1
+      repeat_count: 1
+      num_records_insert: 100
+    type: InsertNode
+    deps: first_rollback
+  first_upsert:
+    config:
+      record_size: 70000
+      num_partitions_upsert: 1
+      repeat_count: 1
+      num_records_upsert: 100
+    type: UpsertNode
+    deps: third_insert
+  first_hive_sync:
+    config:
+      queue_name: "adhoc"
+      engine: "mr"
+    type: HiveSyncNode
+    deps: first_upsert
+  first_hive_query:
+    config:
+      hive_props:
+        prop2: "set spark.yarn.queue="
+        prop3: "set hive.strict.checks.large.query=false"
+        prop4: "set hive.stats.autogather=false"
+      hive_queries:
+        query1: "select count(*) from testdb1.table1"
+        result1: 300
+        query2: "select count(*) from testdb1.table1 group   by `_row_key` having count(*) > 1"
+        result2: 0
+    type: HiveQueryNode
+    deps: first_hive_sync