[HUDI-3312] Fixing spark yaml and adding hive validation to integ test suite (#4731)
This commit is contained in:
committed by
GitHub
parent
8ab6f17149
commit
0ab1a8ec80
@@ -82,8 +82,8 @@ spark-submit
|
||||
|
||||
2.YAML file
|
||||
|
||||
Choose to write up the entire DAG of operations in YAML, take a look at `complex-dag-cow.yaml` or
|
||||
`complex-dag-mor.yaml`.
|
||||
Choose to write up the entire DAG of operations in YAML, take a look at `simple-deltastreamer.yaml` or
|
||||
`simple-deltastreamer.yaml`.
|
||||
Once you're ready with the DAG you want to execute, simply pass the yaml file path as follows:
|
||||
|
||||
```
|
||||
@@ -177,7 +177,7 @@ cd /opt
|
||||
Copy the integration tests jar into the docker container
|
||||
|
||||
```
|
||||
docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar adhoc-2:/opt
|
||||
docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.11.0-SNAPSHOT.jar adhoc-2:/opt
|
||||
```
|
||||
|
||||
```
|
||||
@@ -217,7 +217,7 @@ spark-submit \
|
||||
--conf spark.driver.extraClassPath=/var/demo/jars/* \
|
||||
--conf spark.executor.extraClassPath=/var/demo/jars/* \
|
||||
--class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \
|
||||
/opt/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar \
|
||||
/opt/hudi-integ-test-bundle-0.11.0-SNAPSHOT.jar \
|
||||
--source-ordering-field test_suite_source_ordering_field \
|
||||
--use-deltastreamer \
|
||||
--target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \
|
||||
@@ -227,7 +227,7 @@ spark-submit \
|
||||
--schemaprovider-class org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider \
|
||||
--source-class org.apache.hudi.utilities.sources.AvroDFSSource \
|
||||
--input-file-size 125829120 \
|
||||
--workload-yaml-path file:/var/hoodie/ws/docker/demo/config/test-suite/complex-dag-cow.yaml \
|
||||
--workload-yaml-path file:/var/hoodie/ws/docker/demo/config/test-suite/simple-deltastreamer.yaml \
|
||||
--workload-generator-classname org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \
|
||||
--table-type COPY_ON_WRITE \
|
||||
--compact-scheduling-minshare 1 \
|
||||
@@ -264,7 +264,7 @@ spark-submit \
|
||||
--conf spark.driver.extraClassPath=/var/demo/jars/* \
|
||||
--conf spark.executor.extraClassPath=/var/demo/jars/* \
|
||||
--class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \
|
||||
/opt/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar \
|
||||
/opt/hudi-integ-test-bundle-0.11.0-SNAPSHOT.jar \
|
||||
--source-ordering-field test_suite_source_ordering_field \
|
||||
--use-deltastreamer \
|
||||
--target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \
|
||||
@@ -274,7 +274,7 @@ spark-submit \
|
||||
--schemaprovider-class org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider \
|
||||
--source-class org.apache.hudi.utilities.sources.AvroDFSSource \
|
||||
--input-file-size 125829120 \
|
||||
--workload-yaml-path file:/var/hoodie/ws/docker/demo/config/test-suite/complex-dag-mor.yaml \
|
||||
--workload-yaml-path file:/var/hoodie/ws/docker/demo/config/test-suite/simple-deltastreamer.yaml \
|
||||
--workload-generator-classname org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \
|
||||
--table-type MERGE_ON_READ \
|
||||
--compact-scheduling-minshare 1 \
|
||||
@@ -308,16 +308,16 @@ contents both via spark datasource and hive table via spark sql engine. Hive val
|
||||
If you have "ValidateDatasetNode" in your dag, do not replace hive jars as instructed above. Spark sql engine does not
|
||||
go well w/ hive2* jars. So, after running docker setup, follow the below steps.
|
||||
```
|
||||
docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar adhoc-2:/opt/
|
||||
docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.11.0-SNAPSHOT.jar adhoc-2:/opt/
|
||||
docker cp docker/demo/config/test-suite/test.properties adhoc-2:/opt/
|
||||
```
|
||||
Also copy your dag of interest to adhoc-2:/opt/
|
||||
```
|
||||
docker cp docker/demo/config/test-suite/complex-dag-cow.yaml adhoc-2:/opt/
|
||||
docker cp docker/demo/config/test-suite/simple-deltastreamer.yaml adhoc-2:/opt/
|
||||
```
|
||||
|
||||
For repeated runs, two additional configs need to be set. "dag_rounds" and "dag_intermittent_delay_mins".
|
||||
This means that your dag will be repeated for N times w/ a delay of Y mins between each round. Note: complex-dag-cow.yaml
|
||||
This means that your dag will be repeated for N times w/ a delay of Y mins between each round. Note: simple-deltastreamer.yaml
|
||||
already has all these configs set. So no changes required just to try it out.
|
||||
|
||||
Also, ValidateDatasetNode can be configured in two ways. Either with "delete_input_data" set to true or without
|
||||
@@ -457,7 +457,7 @@ spark-submit \
|
||||
--conf spark.driver.extraClassPath=/var/demo/jars/* \
|
||||
--conf spark.executor.extraClassPath=/var/demo/jars/* \
|
||||
--class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \
|
||||
/opt/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar \
|
||||
/opt/hudi-integ-test-bundle-0.11.0-SNAPSHOT.jar \
|
||||
--source-ordering-field test_suite_source_ordering_field \
|
||||
--use-deltastreamer \
|
||||
--target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \
|
||||
@@ -467,7 +467,7 @@ spark-submit \
|
||||
--schemaprovider-class org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider \
|
||||
--source-class org.apache.hudi.utilities.sources.AvroDFSSource \
|
||||
--input-file-size 125829120 \
|
||||
--workload-yaml-path file:/opt/complex-dag-cow.yaml \
|
||||
--workload-yaml-path file:/opt/simple-deltastreamer.yaml \
|
||||
--workload-generator-classname org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \
|
||||
--table-type COPY_ON_WRITE \
|
||||
--compact-scheduling-minshare 1 \
|
||||
@@ -486,8 +486,8 @@ If you wish to enable metrics add below properties as well
|
||||
Few ready to use dags are available under docker/demo/config/test-suite/ that could give you an idea for long running
|
||||
dags.
|
||||
```
|
||||
complex-dag-cow.yaml: simple 1 round dag for COW table.
|
||||
complex-dag-mor.yaml: simple 1 round dag for MOR table.
|
||||
simple-deltastreamer.yaml: simple 1 round dag for COW table.
|
||||
simple-deltastreamer.yaml: simple 1 round dag for MOR table.
|
||||
cow-clustering-example.yaml : dag with 3 rounds, in which inline clustering will trigger during 2nd iteration.
|
||||
cow-long-running-example.yaml : long running dag with 50 iterations. only 1 partition is used.
|
||||
cow-long-running-multi-partitions.yaml: long running dag with 50 iterations with multiple partitions.
|
||||
|
||||
@@ -95,6 +95,7 @@ public class DeltaConfig implements Serializable {
|
||||
private static String SCHEMA_VERSION = "schema_version";
|
||||
private static String NUM_ROLLBACKS = "num_rollbacks";
|
||||
private static String ENABLE_ROW_WRITING = "enable_row_writing";
|
||||
private static String ENABLE_METADATA_VALIDATE = "enable_metadata_validate";
|
||||
|
||||
// Spark SQL Create Table
|
||||
private static String TABLE_TYPE = "table_type";
|
||||
@@ -149,6 +150,10 @@ public class DeltaConfig implements Serializable {
|
||||
return Integer.valueOf(configsMap.getOrDefault(RECORD_SIZE, 1024).toString());
|
||||
}
|
||||
|
||||
public boolean isEnableMetadataValidate() {
|
||||
return Boolean.valueOf(configsMap.getOrDefault(ENABLE_METADATA_VALIDATE, false).toString());
|
||||
}
|
||||
|
||||
public int getNumInsertPartitions() {
|
||||
return Integer.valueOf(configsMap.getOrDefault(NUM_PARTITIONS_INSERT, 1).toString());
|
||||
}
|
||||
|
||||
@@ -111,14 +111,17 @@ public abstract class BaseValidateDatasetNode extends DagNode<Boolean> {
|
||||
String database = context.getWriterContext().getProps().getString(DataSourceWriteOptions.HIVE_DATABASE().key());
|
||||
String tableName = context.getWriterContext().getProps().getString(DataSourceWriteOptions.HIVE_TABLE().key());
|
||||
log.warn("Validating hive table with db : " + database + " and table : " + tableName);
|
||||
Dataset<Row> cowDf = session.sql("SELECT * FROM " + database + "." + tableName);
|
||||
Dataset<Row> trimmedCowDf = cowDf.drop(HoodieRecord.COMMIT_TIME_METADATA_FIELD).drop(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD).drop(HoodieRecord.RECORD_KEY_METADATA_FIELD)
|
||||
.drop(HoodieRecord.PARTITION_PATH_METADATA_FIELD).drop(HoodieRecord.FILENAME_METADATA_FIELD);
|
||||
intersectionDf = inputSnapshotDf.intersect(trimmedCowDf);
|
||||
session.sql("REFRESH TABLE " + database + "." + tableName);
|
||||
Dataset<Row> cowDf = session.sql("SELECT _row_key, rider, driver, begin_lat, begin_lon, end_lat, end_lon, fare, _hoodie_is_deleted, " +
|
||||
"test_suite_source_ordering_field FROM " + database + "." + tableName);
|
||||
Dataset<Row> reorderedInputDf = inputSnapshotDf.select("_row_key","rider","driver","begin_lat","begin_lon","end_lat","end_lon","fare",
|
||||
"_hoodie_is_deleted","test_suite_source_ordering_field");
|
||||
|
||||
Dataset<Row> intersectedHiveDf = reorderedInputDf.intersect(cowDf);
|
||||
outputCount = trimmedHudiDf.count();
|
||||
log.warn("Input count: " + inputCount + "; output count: " + outputCount);
|
||||
// the intersected df should be same as inputDf. if not, there is some mismatch.
|
||||
if (outputCount == 0 || inputSnapshotDf.except(intersectionDf).count() != 0) {
|
||||
if (outputCount == 0 || reorderedInputDf.except(intersectedHiveDf).count() != 0) {
|
||||
log.error("Data set validation failed for COW hive table. Total count in hudi " + outputCount + ", input df count " + inputCount);
|
||||
throw new AssertionError("Hudi hive table contents does not match contents input data. ");
|
||||
}
|
||||
|
||||
@@ -18,6 +18,7 @@
|
||||
|
||||
package org.apache.hudi.integ.testsuite.dag.nodes;
|
||||
|
||||
import org.apache.hudi.common.config.HoodieMetadataConfig;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
|
||||
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
|
||||
@@ -49,7 +50,8 @@ public class ValidateDatasetNode extends BaseValidateDatasetNode {
|
||||
StructType inputSchema) {
|
||||
String hudiPath = context.getHoodieTestSuiteWriter().getCfg().targetBasePath + "/*/*/*";
|
||||
log.info("Validate data in target hudi path " + hudiPath);
|
||||
Dataset<Row> hudiDf = session.read().format("hudi").load(hudiPath);
|
||||
Dataset<Row> hudiDf = session.read().option(HoodieMetadataConfig.ENABLE.key(), String.valueOf(config.isEnableMetadataValidate()))
|
||||
.format("hudi").load(hudiPath);
|
||||
return hudiDf.drop(HoodieRecord.COMMIT_TIME_METADATA_FIELD).drop(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD).drop(HoodieRecord.RECORD_KEY_METADATA_FIELD)
|
||||
.drop(HoodieRecord.PARTITION_PATH_METADATA_FIELD).drop(HoodieRecord.FILENAME_METADATA_FIELD);
|
||||
}
|
||||
|
||||
@@ -60,7 +60,7 @@ class SparkInsertNode(dagNodeConfig: Config) extends DagNode[RDD[WriteStatus]] {
|
||||
.option(DataSourceWriteOptions.COMMIT_METADATA_KEYPREFIX.key, "deltastreamer.checkpoint.key")
|
||||
.option("deltastreamer.checkpoint.key", context.getWriterContext.getHoodieTestSuiteWriter.getLastCheckpoint.orElse(""))
|
||||
.option(HoodieWriteConfig.TBL_NAME.key, context.getHoodieTestSuiteWriter.getCfg.targetTableName)
|
||||
.mode(SaveMode.Overwrite)
|
||||
.mode(SaveMode.Append)
|
||||
.save(context.getHoodieTestSuiteWriter.getWriteConfig.getBasePath)
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user