1
0

[HUDI-3312] Fixing spark yaml and adding hive validation to integ test suite (#4731)

This commit is contained in:
Sivabalan Narayanan
2022-02-08 00:40:36 -05:00
committed by GitHub
parent 8ab6f17149
commit 0ab1a8ec80
14 changed files with 295 additions and 38 deletions

View File

@@ -13,13 +13,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
dag_name: cow-spark-long-running-multi-partitions.yaml
dag_rounds: 50
dag_intermittent_delay_mins: 1
dag_name: cow-spark-deltastreamer-long-running-multi-partitions.yaml
dag_rounds: 30
dag_intermittent_delay_mins: 0
dag_content:
first_insert:
config:
record_size: 1000
record_size: 200
num_partitions_insert: 50
repeat_count: 1
num_records_insert: 10000
@@ -33,12 +33,12 @@ dag_content:
deps: first_insert
first_validate:
config:
validate_hive: true
validate_hive: false
type: ValidateDatasetNode
deps: first_hive_sync
first_upsert:
config:
record_size: 1000
record_size: 200
num_partitions_insert: 50
num_records_insert: 300
repeat_count: 1
@@ -60,13 +60,13 @@ dag_content:
deps: first_delete
second_validate:
config:
validate_hive: true
validate_hive: false
delete_input_data: true
type: ValidateDatasetNode
deps: second_hive_sync
last_validate:
config:
execute_itr_count: 50
execute_itr_count: 30
validate_clean: true
validate_archival: true
type: ValidateAsyncOperations

View File

@@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
dag_name: cow-spark-simple.yaml
dag_rounds: 2
dag_rounds: 1
dag_intermittent_delay_mins: 1
dag_content:
first_insert:
@@ -33,7 +33,7 @@ dag_content:
deps: first_insert
first_validate:
config:
validate_hive: true
validate_hive: false
type: ValidateDatasetNode
deps: first_hive_sync
first_upsert:
@@ -60,7 +60,7 @@ dag_content:
deps: first_delete
second_validate:
config:
validate_hive: true
validate_hive: false
delete_input_data: false
type: ValidateDatasetNode
deps: second_hive_sync

View File

@@ -0,0 +1,89 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
dag_name: deltastreamer-long-running-multi-partitions.yaml
dag_rounds: 50
dag_intermittent_delay_mins: 1
dag_content:
first_insert:
config:
record_size: 1000
num_partitions_insert: 5
repeat_count: 1
num_records_insert: 1000
type: InsertNode
deps: none
second_insert:
config:
record_size: 1000
num_partitions_insert: 50
repeat_count: 1
num_records_insert: 10000
deps: first_insert
type: InsertNode
third_insert:
config:
record_size: 1000
num_partitions_insert: 2
repeat_count: 1
num_records_insert: 300
deps: second_insert
type: InsertNode
first_hive_sync:
config:
queue_name: "adhoc"
engine: "mr"
type: HiveSyncNode
deps: third_insert
first_validate:
config:
validate_hive: false
type: ValidateDatasetNode
deps: first_hive_sync
first_upsert:
config:
record_size: 1000
num_partitions_insert: 2
num_records_insert: 300
repeat_count: 1
num_records_upsert: 100
num_partitions_upsert: 1
type: UpsertNode
deps: first_validate
first_delete:
config:
num_partitions_delete: 50
num_records_delete: 8000
type: DeleteNode
deps: first_upsert
second_hive_sync:
config:
queue_name: "adhoc"
engine: "mr"
type: HiveSyncNode
deps: first_delete
second_validate:
config:
validate_hive: true
delete_input_data: true
type: ValidateDatasetNode
deps: second_hive_sync
last_validate:
config:
execute_itr_count: 50
validate_clean: true
validate_archival: true
type: ValidateAsyncOperations
deps: second_validate

View File

@@ -13,7 +13,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
dag_name: cow-long-running-multi-partitions.yaml
dag_name: deltastreamer-long-running-multi-partitions.yaml
dag_rounds: 50
dag_intermittent_delay_mins: 1
dag_content:

View File

@@ -13,7 +13,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
dag_name: cow-long-running-example.yaml
dag_name: detlastreamer-long-running-example.yaml
dag_rounds: 50
dag_intermittent_delay_mins: 1
dag_content:

View File

@@ -0,0 +1,76 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
dag_name: simple-clustering-hive.yaml
dag_rounds: 30
dag_intermittent_delay_mins: 0
dag_content:
first_insert:
config:
record_size: 1000
num_partitions_insert: 1
repeat_count: 1
num_records_insert: 1000
type: InsertNode
deps: none
second_insert:
config:
record_size: 1000
num_partitions_insert: 1
repeat_count: 1
num_records_insert: 10000
deps: first_insert
type: InsertNode
third_insert:
config:
record_size: 1000
num_partitions_insert: 1
repeat_count: 1
num_records_insert: 300
deps: second_insert
type: InsertNode
first_delete:
config:
num_partitions_delete: 1
num_records_delete: 9000
type: DeleteNode
deps: third_insert
first_hive_sync:
config:
queue_name: "adhoc"
engine: "mr"
type: HiveSyncNode
deps: first_delete
first_validate:
config:
validate_hive: false
type: ValidateDatasetNode
deps: first_hive_sync
first_cluster:
config:
execute_itr_count: 20
type: ClusteringNode
deps: first_validate
second_hive_sync:
config:
queue_name: "adhoc"
engine: "mr"
type: HiveSyncNode
deps: first_cluster
second_validate:
config:
validate_hive: true
type: ValidateDatasetNode
deps: second_hive_sync

View File

@@ -13,8 +13,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
dag_name: cow-clustering-example.yaml
dag_rounds: 3
dag_name: simple-clustering.yaml
dag_rounds: 30
dag_intermittent_delay_mins: 0
dag_content:
first_insert:
@@ -60,7 +60,7 @@ dag_content:
deps: first_hive_sync
first_cluster:
config:
execute_itr_count: 2
execute_itr_count: 25
type: ClusteringNode
deps: first_validate
second_hive_sync:

View File

@@ -0,0 +1,82 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
dag_name: simple-deltastreamer.yaml
dag_rounds: 1
dag_intermittent_delay_mins: 1
dag_content:
first_insert:
config:
record_size: 1000
num_partitions_insert: 1
repeat_count: 1
num_records_insert: 1000
type: InsertNode
deps: none
second_insert:
config:
record_size: 1000
num_partitions_insert: 1
repeat_count: 1
num_records_insert: 10000
deps: first_insert
type: InsertNode
third_insert:
config:
record_size: 1000
num_partitions_insert: 1
repeat_count: 1
num_records_insert: 300
deps: second_insert
type: InsertNode
first_hive_sync:
config:
queue_name: "adhoc"
engine: "mr"
type: HiveSyncNode
deps: third_insert
first_validate:
config:
validate_hive: false
type: ValidateDatasetNode
deps: first_hive_sync
first_upsert:
config:
record_size: 1000
num_partitions_insert: 1
num_records_insert: 300
repeat_count: 1
num_records_upsert: 100
num_partitions_upsert: 1
type: UpsertNode
deps: first_validate
first_delete:
config:
num_partitions_delete: 1
num_records_delete: 2000
type: DeleteNode
deps: first_upsert
second_hive_sync:
config:
queue_name: "adhoc"
engine: "mr"
type: HiveSyncNode
deps: first_delete
second_validate:
config:
validate_hive: true
delete_input_data: true
type: ValidateDatasetNode
deps: second_hive_sync

View File

@@ -13,7 +13,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
dag_name: complex-dag-cow.yaml
dag_name: simple-deltastreamer.yaml
dag_rounds: 1
dag_intermittent_delay_mins: 1
dag_content:

View File

@@ -82,8 +82,8 @@ spark-submit
2.YAML file
Choose to write up the entire DAG of operations in YAML, take a look at `complex-dag-cow.yaml` or
`complex-dag-mor.yaml`.
Choose to write up the entire DAG of operations in YAML, take a look at `simple-deltastreamer.yaml` or
`simple-deltastreamer.yaml`.
Once you're ready with the DAG you want to execute, simply pass the yaml file path as follows:
```
@@ -177,7 +177,7 @@ cd /opt
Copy the integration tests jar into the docker container
```
docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar adhoc-2:/opt
docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.11.0-SNAPSHOT.jar adhoc-2:/opt
```
```
@@ -217,7 +217,7 @@ spark-submit \
--conf spark.driver.extraClassPath=/var/demo/jars/* \
--conf spark.executor.extraClassPath=/var/demo/jars/* \
--class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \
/opt/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar \
/opt/hudi-integ-test-bundle-0.11.0-SNAPSHOT.jar \
--source-ordering-field test_suite_source_ordering_field \
--use-deltastreamer \
--target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \
@@ -227,7 +227,7 @@ spark-submit \
--schemaprovider-class org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider \
--source-class org.apache.hudi.utilities.sources.AvroDFSSource \
--input-file-size 125829120 \
--workload-yaml-path file:/var/hoodie/ws/docker/demo/config/test-suite/complex-dag-cow.yaml \
--workload-yaml-path file:/var/hoodie/ws/docker/demo/config/test-suite/simple-deltastreamer.yaml \
--workload-generator-classname org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \
--table-type COPY_ON_WRITE \
--compact-scheduling-minshare 1 \
@@ -264,7 +264,7 @@ spark-submit \
--conf spark.driver.extraClassPath=/var/demo/jars/* \
--conf spark.executor.extraClassPath=/var/demo/jars/* \
--class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \
/opt/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar \
/opt/hudi-integ-test-bundle-0.11.0-SNAPSHOT.jar \
--source-ordering-field test_suite_source_ordering_field \
--use-deltastreamer \
--target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \
@@ -274,7 +274,7 @@ spark-submit \
--schemaprovider-class org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider \
--source-class org.apache.hudi.utilities.sources.AvroDFSSource \
--input-file-size 125829120 \
--workload-yaml-path file:/var/hoodie/ws/docker/demo/config/test-suite/complex-dag-mor.yaml \
--workload-yaml-path file:/var/hoodie/ws/docker/demo/config/test-suite/simple-deltastreamer.yaml \
--workload-generator-classname org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \
--table-type MERGE_ON_READ \
--compact-scheduling-minshare 1 \
@@ -308,16 +308,16 @@ contents both via spark datasource and hive table via spark sql engine. Hive val
If you have "ValidateDatasetNode" in your dag, do not replace hive jars as instructed above. Spark sql engine does not
go well w/ hive2* jars. So, after running docker setup, follow the below steps.
```
docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar adhoc-2:/opt/
docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.11.0-SNAPSHOT.jar adhoc-2:/opt/
docker cp docker/demo/config/test-suite/test.properties adhoc-2:/opt/
```
Also copy your dag of interest to adhoc-2:/opt/
```
docker cp docker/demo/config/test-suite/complex-dag-cow.yaml adhoc-2:/opt/
docker cp docker/demo/config/test-suite/simple-deltastreamer.yaml adhoc-2:/opt/
```
For repeated runs, two additional configs need to be set. "dag_rounds" and "dag_intermittent_delay_mins".
This means that your dag will be repeated for N times w/ a delay of Y mins between each round. Note: complex-dag-cow.yaml
This means that your dag will be repeated for N times w/ a delay of Y mins between each round. Note: simple-deltastreamer.yaml
already has all these configs set. So no changes required just to try it out.
Also, ValidateDatasetNode can be configured in two ways. Either with "delete_input_data" set to true or without
@@ -457,7 +457,7 @@ spark-submit \
--conf spark.driver.extraClassPath=/var/demo/jars/* \
--conf spark.executor.extraClassPath=/var/demo/jars/* \
--class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \
/opt/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar \
/opt/hudi-integ-test-bundle-0.11.0-SNAPSHOT.jar \
--source-ordering-field test_suite_source_ordering_field \
--use-deltastreamer \
--target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \
@@ -467,7 +467,7 @@ spark-submit \
--schemaprovider-class org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider \
--source-class org.apache.hudi.utilities.sources.AvroDFSSource \
--input-file-size 125829120 \
--workload-yaml-path file:/opt/complex-dag-cow.yaml \
--workload-yaml-path file:/opt/simple-deltastreamer.yaml \
--workload-generator-classname org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \
--table-type COPY_ON_WRITE \
--compact-scheduling-minshare 1 \
@@ -486,8 +486,8 @@ If you wish to enable metrics add below properties as well
Few ready to use dags are available under docker/demo/config/test-suite/ that could give you an idea for long running
dags.
```
complex-dag-cow.yaml: simple 1 round dag for COW table.
complex-dag-mor.yaml: simple 1 round dag for MOR table.
simple-deltastreamer.yaml: simple 1 round dag for COW table.
simple-deltastreamer.yaml: simple 1 round dag for MOR table.
cow-clustering-example.yaml : dag with 3 rounds, in which inline clustering will trigger during 2nd iteration.
cow-long-running-example.yaml : long running dag with 50 iterations. only 1 partition is used.
cow-long-running-multi-partitions.yaml: long running dag wit 50 iterations with multiple partitions.

View File

@@ -95,6 +95,7 @@ public class DeltaConfig implements Serializable {
private static String SCHEMA_VERSION = "schema_version";
private static String NUM_ROLLBACKS = "num_rollbacks";
private static String ENABLE_ROW_WRITING = "enable_row_writing";
private static String ENABLE_METADATA_VALIDATE = "enable_metadata_validate";
// Spark SQL Create Table
private static String TABLE_TYPE = "table_type";
@@ -149,6 +150,10 @@ public class DeltaConfig implements Serializable {
return Integer.valueOf(configsMap.getOrDefault(RECORD_SIZE, 1024).toString());
}
public boolean isEnableMetadataValidate() {
return Boolean.valueOf(configsMap.getOrDefault(ENABLE_METADATA_VALIDATE, false).toString());
}
public int getNumInsertPartitions() {
return Integer.valueOf(configsMap.getOrDefault(NUM_PARTITIONS_INSERT, 1).toString());
}

View File

@@ -111,14 +111,17 @@ public abstract class BaseValidateDatasetNode extends DagNode<Boolean> {
String database = context.getWriterContext().getProps().getString(DataSourceWriteOptions.HIVE_DATABASE().key());
String tableName = context.getWriterContext().getProps().getString(DataSourceWriteOptions.HIVE_TABLE().key());
log.warn("Validating hive table with db : " + database + " and table : " + tableName);
Dataset<Row> cowDf = session.sql("SELECT * FROM " + database + "." + tableName);
Dataset<Row> trimmedCowDf = cowDf.drop(HoodieRecord.COMMIT_TIME_METADATA_FIELD).drop(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD).drop(HoodieRecord.RECORD_KEY_METADATA_FIELD)
.drop(HoodieRecord.PARTITION_PATH_METADATA_FIELD).drop(HoodieRecord.FILENAME_METADATA_FIELD);
intersectionDf = inputSnapshotDf.intersect(trimmedCowDf);
session.sql("REFRESH TABLE " + database + "." + tableName);
Dataset<Row> cowDf = session.sql("SELECT _row_key, rider, driver, begin_lat, begin_lon, end_lat, end_lon, fare, _hoodie_is_deleted, " +
"test_suite_source_ordering_field FROM " + database + "." + tableName);
Dataset<Row> reorderedInputDf = inputSnapshotDf.select("_row_key","rider","driver","begin_lat","begin_lon","end_lat","end_lon","fare",
"_hoodie_is_deleted","test_suite_source_ordering_field");
Dataset<Row> intersectedHiveDf = reorderedInputDf.intersect(cowDf);
outputCount = trimmedHudiDf.count();
log.warn("Input count: " + inputCount + "; output count: " + outputCount);
// the intersected df should be same as inputDf. if not, there is some mismatch.
if (outputCount == 0 || inputSnapshotDf.except(intersectionDf).count() != 0) {
if (outputCount == 0 || reorderedInputDf.except(intersectedHiveDf).count() != 0) {
log.error("Data set validation failed for COW hive table. Total count in hudi " + outputCount + ", input df count " + inputCount);
throw new AssertionError("Hudi hive table contents does not match contents input data. ");
}

View File

@@ -18,6 +18,7 @@
package org.apache.hudi.integ.testsuite.dag.nodes;
import org.apache.hudi.common.config.HoodieMetadataConfig;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
@@ -49,7 +50,8 @@ public class ValidateDatasetNode extends BaseValidateDatasetNode {
StructType inputSchema) {
String hudiPath = context.getHoodieTestSuiteWriter().getCfg().targetBasePath + "/*/*/*";
log.info("Validate data in target hudi path " + hudiPath);
Dataset<Row> hudiDf = session.read().format("hudi").load(hudiPath);
Dataset<Row> hudiDf = session.read().option(HoodieMetadataConfig.ENABLE.key(), String.valueOf(config.isEnableMetadataValidate()))
.format("hudi").load(hudiPath);
return hudiDf.drop(HoodieRecord.COMMIT_TIME_METADATA_FIELD).drop(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD).drop(HoodieRecord.RECORD_KEY_METADATA_FIELD)
.drop(HoodieRecord.PARTITION_PATH_METADATA_FIELD).drop(HoodieRecord.FILENAME_METADATA_FIELD);
}

View File

@@ -60,7 +60,7 @@ class SparkInsertNode(dagNodeConfig: Config) extends DagNode[RDD[WriteStatus]] {
.option(DataSourceWriteOptions.COMMIT_METADATA_KEYPREFIX.key, "deltastreamer.checkpoint.key")
.option("deltastreamer.checkpoint.key", context.getWriterContext.getHoodieTestSuiteWriter.getLastCheckpoint.orElse(""))
.option(HoodieWriteConfig.TBL_NAME.key, context.getHoodieTestSuiteWriter.getCfg.targetTableName)
.mode(SaveMode.Overwrite)
.mode(SaveMode.Append)
.save(context.getHoodieTestSuiteWriter.getWriteConfig.getBasePath)
}
}