1
0

[HUDI-3312] Fixing spark yaml and adding hive validation to integ test suite (#4731)

This commit is contained in:
Sivabalan Narayanan
2022-02-08 00:40:36 -05:00
committed by GitHub
parent 8ab6f17149
commit 0ab1a8ec80
14 changed files with 295 additions and 38 deletions

View File

@@ -13,13 +13,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
dag_name: cow-spark-long-running-multi-partitions.yaml dag_name: cow-spark-deltastreamer-long-running-multi-partitions.yaml
dag_rounds: 50 dag_rounds: 30
dag_intermittent_delay_mins: 1 dag_intermittent_delay_mins: 0
dag_content: dag_content:
first_insert: first_insert:
config: config:
record_size: 1000 record_size: 200
num_partitions_insert: 50 num_partitions_insert: 50
repeat_count: 1 repeat_count: 1
num_records_insert: 10000 num_records_insert: 10000
@@ -33,12 +33,12 @@ dag_content:
deps: first_insert deps: first_insert
first_validate: first_validate:
config: config:
validate_hive: true validate_hive: false
type: ValidateDatasetNode type: ValidateDatasetNode
deps: first_hive_sync deps: first_hive_sync
first_upsert: first_upsert:
config: config:
record_size: 1000 record_size: 200
num_partitions_insert: 50 num_partitions_insert: 50
num_records_insert: 300 num_records_insert: 300
repeat_count: 1 repeat_count: 1
@@ -60,13 +60,13 @@ dag_content:
deps: first_delete deps: first_delete
second_validate: second_validate:
config: config:
validate_hive: true validate_hive: false
delete_input_data: true delete_input_data: true
type: ValidateDatasetNode type: ValidateDatasetNode
deps: second_hive_sync deps: second_hive_sync
last_validate: last_validate:
config: config:
execute_itr_count: 50 execute_itr_count: 30
validate_clean: true validate_clean: true
validate_archival: true validate_archival: true
type: ValidateAsyncOperations type: ValidateAsyncOperations

View File

@@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
dag_name: cow-spark-simple.yaml dag_name: cow-spark-simple.yaml
dag_rounds: 2 dag_rounds: 1
dag_intermittent_delay_mins: 1 dag_intermittent_delay_mins: 1
dag_content: dag_content:
first_insert: first_insert:
@@ -33,7 +33,7 @@ dag_content:
deps: first_insert deps: first_insert
first_validate: first_validate:
config: config:
validate_hive: true validate_hive: false
type: ValidateDatasetNode type: ValidateDatasetNode
deps: first_hive_sync deps: first_hive_sync
first_upsert: first_upsert:
@@ -60,7 +60,7 @@ dag_content:
deps: first_delete deps: first_delete
second_validate: second_validate:
config: config:
validate_hive: true validate_hive: false
delete_input_data: false delete_input_data: false
type: ValidateDatasetNode type: ValidateDatasetNode
deps: second_hive_sync deps: second_hive_sync

View File

@@ -0,0 +1,89 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
dag_name: deltastreamer-long-running-multi-partitions.yaml
dag_rounds: 50
dag_intermittent_delay_mins: 1
dag_content:
first_insert:
config:
record_size: 1000
num_partitions_insert: 5
repeat_count: 1
num_records_insert: 1000
type: InsertNode
deps: none
second_insert:
config:
record_size: 1000
num_partitions_insert: 50
repeat_count: 1
num_records_insert: 10000
deps: first_insert
type: InsertNode
third_insert:
config:
record_size: 1000
num_partitions_insert: 2
repeat_count: 1
num_records_insert: 300
deps: second_insert
type: InsertNode
first_hive_sync:
config:
queue_name: "adhoc"
engine: "mr"
type: HiveSyncNode
deps: third_insert
first_validate:
config:
validate_hive: false
type: ValidateDatasetNode
deps: first_hive_sync
first_upsert:
config:
record_size: 1000
num_partitions_insert: 2
num_records_insert: 300
repeat_count: 1
num_records_upsert: 100
num_partitions_upsert: 1
type: UpsertNode
deps: first_validate
first_delete:
config:
num_partitions_delete: 50
num_records_delete: 8000
type: DeleteNode
deps: first_upsert
second_hive_sync:
config:
queue_name: "adhoc"
engine: "mr"
type: HiveSyncNode
deps: first_delete
second_validate:
config:
validate_hive: true
delete_input_data: true
type: ValidateDatasetNode
deps: second_hive_sync
last_validate:
config:
execute_itr_count: 50
validate_clean: true
validate_archival: true
type: ValidateAsyncOperations
deps: second_validate

View File

@@ -13,7 +13,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
dag_name: cow-long-running-multi-partitions.yaml dag_name: deltastreamer-long-running-multi-partitions.yaml
dag_rounds: 50 dag_rounds: 50
dag_intermittent_delay_mins: 1 dag_intermittent_delay_mins: 1
dag_content: dag_content:

View File

@@ -13,7 +13,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
dag_name: cow-long-running-example.yaml dag_name: detlastreamer-long-running-example.yaml
dag_rounds: 50 dag_rounds: 50
dag_intermittent_delay_mins: 1 dag_intermittent_delay_mins: 1
dag_content: dag_content:

View File

@@ -0,0 +1,76 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
dag_name: simple-clustering-hive.yaml
dag_rounds: 30
dag_intermittent_delay_mins: 0
dag_content:
first_insert:
config:
record_size: 1000
num_partitions_insert: 1
repeat_count: 1
num_records_insert: 1000
type: InsertNode
deps: none
second_insert:
config:
record_size: 1000
num_partitions_insert: 1
repeat_count: 1
num_records_insert: 10000
deps: first_insert
type: InsertNode
third_insert:
config:
record_size: 1000
num_partitions_insert: 1
repeat_count: 1
num_records_insert: 300
deps: second_insert
type: InsertNode
first_delete:
config:
num_partitions_delete: 1
num_records_delete: 9000
type: DeleteNode
deps: third_insert
first_hive_sync:
config:
queue_name: "adhoc"
engine: "mr"
type: HiveSyncNode
deps: first_delete
first_validate:
config:
validate_hive: false
type: ValidateDatasetNode
deps: first_hive_sync
first_cluster:
config:
execute_itr_count: 20
type: ClusteringNode
deps: first_validate
second_hive_sync:
config:
queue_name: "adhoc"
engine: "mr"
type: HiveSyncNode
deps: first_cluster
second_validate:
config:
validate_hive: true
type: ValidateDatasetNode
deps: second_hive_sync

View File

@@ -13,8 +13,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
dag_name: cow-clustering-example.yaml dag_name: simple-clustering.yaml
dag_rounds: 3 dag_rounds: 30
dag_intermittent_delay_mins: 0 dag_intermittent_delay_mins: 0
dag_content: dag_content:
first_insert: first_insert:
@@ -60,7 +60,7 @@ dag_content:
deps: first_hive_sync deps: first_hive_sync
first_cluster: first_cluster:
config: config:
execute_itr_count: 2 execute_itr_count: 25
type: ClusteringNode type: ClusteringNode
deps: first_validate deps: first_validate
second_hive_sync: second_hive_sync:

View File

@@ -0,0 +1,82 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
dag_name: simple-deltastreamer.yaml
dag_rounds: 1
dag_intermittent_delay_mins: 1
dag_content:
first_insert:
config:
record_size: 1000
num_partitions_insert: 1
repeat_count: 1
num_records_insert: 1000
type: InsertNode
deps: none
second_insert:
config:
record_size: 1000
num_partitions_insert: 1
repeat_count: 1
num_records_insert: 10000
deps: first_insert
type: InsertNode
third_insert:
config:
record_size: 1000
num_partitions_insert: 1
repeat_count: 1
num_records_insert: 300
deps: second_insert
type: InsertNode
first_hive_sync:
config:
queue_name: "adhoc"
engine: "mr"
type: HiveSyncNode
deps: third_insert
first_validate:
config:
validate_hive: false
type: ValidateDatasetNode
deps: first_hive_sync
first_upsert:
config:
record_size: 1000
num_partitions_insert: 1
num_records_insert: 300
repeat_count: 1
num_records_upsert: 100
num_partitions_upsert: 1
type: UpsertNode
deps: first_validate
first_delete:
config:
num_partitions_delete: 1
num_records_delete: 2000
type: DeleteNode
deps: first_upsert
second_hive_sync:
config:
queue_name: "adhoc"
engine: "mr"
type: HiveSyncNode
deps: first_delete
second_validate:
config:
validate_hive: true
delete_input_data: true
type: ValidateDatasetNode
deps: second_hive_sync

View File

@@ -13,7 +13,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
dag_name: complex-dag-cow.yaml dag_name: simple-deltastreamer.yaml
dag_rounds: 1 dag_rounds: 1
dag_intermittent_delay_mins: 1 dag_intermittent_delay_mins: 1
dag_content: dag_content:

View File

@@ -82,8 +82,8 @@ spark-submit
2.YAML file 2.YAML file
Choose to write up the entire DAG of operations in YAML, take a look at `complex-dag-cow.yaml` or Choose to write up the entire DAG of operations in YAML, take a look at `simple-deltastreamer.yaml` or
`complex-dag-mor.yaml`. `simple-deltastreamer.yaml`.
Once you're ready with the DAG you want to execute, simply pass the yaml file path as follows: Once you're ready with the DAG you want to execute, simply pass the yaml file path as follows:
``` ```
@@ -177,7 +177,7 @@ cd /opt
Copy the integration tests jar into the docker container Copy the integration tests jar into the docker container
``` ```
docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar adhoc-2:/opt docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.11.0-SNAPSHOT.jar adhoc-2:/opt
``` ```
``` ```
@@ -217,7 +217,7 @@ spark-submit \
--conf spark.driver.extraClassPath=/var/demo/jars/* \ --conf spark.driver.extraClassPath=/var/demo/jars/* \
--conf spark.executor.extraClassPath=/var/demo/jars/* \ --conf spark.executor.extraClassPath=/var/demo/jars/* \
--class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \ --class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \
/opt/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar \ /opt/hudi-integ-test-bundle-0.11.0-SNAPSHOT.jar \
--source-ordering-field test_suite_source_ordering_field \ --source-ordering-field test_suite_source_ordering_field \
--use-deltastreamer \ --use-deltastreamer \
--target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \ --target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \
@@ -227,7 +227,7 @@ spark-submit \
--schemaprovider-class org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider \ --schemaprovider-class org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider \
--source-class org.apache.hudi.utilities.sources.AvroDFSSource \ --source-class org.apache.hudi.utilities.sources.AvroDFSSource \
--input-file-size 125829120 \ --input-file-size 125829120 \
--workload-yaml-path file:/var/hoodie/ws/docker/demo/config/test-suite/complex-dag-cow.yaml \ --workload-yaml-path file:/var/hoodie/ws/docker/demo/config/test-suite/simple-deltastreamer.yaml \
--workload-generator-classname org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \ --workload-generator-classname org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \
--table-type COPY_ON_WRITE \ --table-type COPY_ON_WRITE \
--compact-scheduling-minshare 1 \ --compact-scheduling-minshare 1 \
@@ -264,7 +264,7 @@ spark-submit \
--conf spark.driver.extraClassPath=/var/demo/jars/* \ --conf spark.driver.extraClassPath=/var/demo/jars/* \
--conf spark.executor.extraClassPath=/var/demo/jars/* \ --conf spark.executor.extraClassPath=/var/demo/jars/* \
--class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \ --class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \
/opt/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar \ /opt/hudi-integ-test-bundle-0.11.0-SNAPSHOT.jar \
--source-ordering-field test_suite_source_ordering_field \ --source-ordering-field test_suite_source_ordering_field \
--use-deltastreamer \ --use-deltastreamer \
--target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \ --target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \
@@ -274,7 +274,7 @@ spark-submit \
--schemaprovider-class org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider \ --schemaprovider-class org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider \
--source-class org.apache.hudi.utilities.sources.AvroDFSSource \ --source-class org.apache.hudi.utilities.sources.AvroDFSSource \
--input-file-size 125829120 \ --input-file-size 125829120 \
--workload-yaml-path file:/var/hoodie/ws/docker/demo/config/test-suite/complex-dag-mor.yaml \ --workload-yaml-path file:/var/hoodie/ws/docker/demo/config/test-suite/simple-deltastreamer.yaml \
--workload-generator-classname org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \ --workload-generator-classname org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \
--table-type MERGE_ON_READ \ --table-type MERGE_ON_READ \
--compact-scheduling-minshare 1 \ --compact-scheduling-minshare 1 \
@@ -308,16 +308,16 @@ contents both via spark datasource and hive table via spark sql engine. Hive val
If you have "ValidateDatasetNode" in your dag, do not replace hive jars as instructed above. Spark sql engine does not If you have "ValidateDatasetNode" in your dag, do not replace hive jars as instructed above. Spark sql engine does not
go well w/ hive2* jars. So, after running docker setup, follow the below steps. go well w/ hive2* jars. So, after running docker setup, follow the below steps.
``` ```
docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar adhoc-2:/opt/ docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.11.0-SNAPSHOT.jar adhoc-2:/opt/
docker cp docker/demo/config/test-suite/test.properties adhoc-2:/opt/ docker cp docker/demo/config/test-suite/test.properties adhoc-2:/opt/
``` ```
Also copy your dag of interest to adhoc-2:/opt/ Also copy your dag of interest to adhoc-2:/opt/
``` ```
docker cp docker/demo/config/test-suite/complex-dag-cow.yaml adhoc-2:/opt/ docker cp docker/demo/config/test-suite/simple-deltastreamer.yaml adhoc-2:/opt/
``` ```
For repeated runs, two additional configs need to be set. "dag_rounds" and "dag_intermittent_delay_mins". For repeated runs, two additional configs need to be set. "dag_rounds" and "dag_intermittent_delay_mins".
This means that your dag will be repeated for N times w/ a delay of Y mins between each round. Note: complex-dag-cow.yaml This means that your dag will be repeated for N times w/ a delay of Y mins between each round. Note: simple-deltastreamer.yaml
already has all these configs set. So no changes required just to try it out. already has all these configs set. So no changes required just to try it out.
Also, ValidateDatasetNode can be configured in two ways. Either with "delete_input_data" set to true or without Also, ValidateDatasetNode can be configured in two ways. Either with "delete_input_data" set to true or without
@@ -457,7 +457,7 @@ spark-submit \
--conf spark.driver.extraClassPath=/var/demo/jars/* \ --conf spark.driver.extraClassPath=/var/demo/jars/* \
--conf spark.executor.extraClassPath=/var/demo/jars/* \ --conf spark.executor.extraClassPath=/var/demo/jars/* \
--class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \ --class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \
/opt/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar \ /opt/hudi-integ-test-bundle-0.11.0-SNAPSHOT.jar \
--source-ordering-field test_suite_source_ordering_field \ --source-ordering-field test_suite_source_ordering_field \
--use-deltastreamer \ --use-deltastreamer \
--target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \ --target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \
@@ -467,7 +467,7 @@ spark-submit \
--schemaprovider-class org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider \ --schemaprovider-class org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider \
--source-class org.apache.hudi.utilities.sources.AvroDFSSource \ --source-class org.apache.hudi.utilities.sources.AvroDFSSource \
--input-file-size 125829120 \ --input-file-size 125829120 \
--workload-yaml-path file:/opt/complex-dag-cow.yaml \ --workload-yaml-path file:/opt/simple-deltastreamer.yaml \
--workload-generator-classname org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \ --workload-generator-classname org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \
--table-type COPY_ON_WRITE \ --table-type COPY_ON_WRITE \
--compact-scheduling-minshare 1 \ --compact-scheduling-minshare 1 \
@@ -486,8 +486,8 @@ If you wish to enable metrics add below properties as well
Few ready to use dags are available under docker/demo/config/test-suite/ that could give you an idea for long running Few ready to use dags are available under docker/demo/config/test-suite/ that could give you an idea for long running
dags. dags.
``` ```
complex-dag-cow.yaml: simple 1 round dag for COW table. simple-deltastreamer.yaml: simple 1 round dag for COW table.
complex-dag-mor.yaml: simple 1 round dag for MOR table. simple-deltastreamer.yaml: simple 1 round dag for MOR table.
cow-clustering-example.yaml : dag with 3 rounds, in which inline clustering will trigger during 2nd iteration. cow-clustering-example.yaml : dag with 3 rounds, in which inline clustering will trigger during 2nd iteration.
cow-long-running-example.yaml : long running dag with 50 iterations. only 1 partition is used. cow-long-running-example.yaml : long running dag with 50 iterations. only 1 partition is used.
cow-long-running-multi-partitions.yaml: long running dag wit 50 iterations with multiple partitions. cow-long-running-multi-partitions.yaml: long running dag wit 50 iterations with multiple partitions.

View File

@@ -95,6 +95,7 @@ public class DeltaConfig implements Serializable {
private static String SCHEMA_VERSION = "schema_version"; private static String SCHEMA_VERSION = "schema_version";
private static String NUM_ROLLBACKS = "num_rollbacks"; private static String NUM_ROLLBACKS = "num_rollbacks";
private static String ENABLE_ROW_WRITING = "enable_row_writing"; private static String ENABLE_ROW_WRITING = "enable_row_writing";
private static String ENABLE_METADATA_VALIDATE = "enable_metadata_validate";
// Spark SQL Create Table // Spark SQL Create Table
private static String TABLE_TYPE = "table_type"; private static String TABLE_TYPE = "table_type";
@@ -149,6 +150,10 @@ public class DeltaConfig implements Serializable {
return Integer.valueOf(configsMap.getOrDefault(RECORD_SIZE, 1024).toString()); return Integer.valueOf(configsMap.getOrDefault(RECORD_SIZE, 1024).toString());
} }
public boolean isEnableMetadataValidate() {
return Boolean.valueOf(configsMap.getOrDefault(ENABLE_METADATA_VALIDATE, false).toString());
}
public int getNumInsertPartitions() { public int getNumInsertPartitions() {
return Integer.valueOf(configsMap.getOrDefault(NUM_PARTITIONS_INSERT, 1).toString()); return Integer.valueOf(configsMap.getOrDefault(NUM_PARTITIONS_INSERT, 1).toString());
} }

View File

@@ -111,14 +111,17 @@ public abstract class BaseValidateDatasetNode extends DagNode<Boolean> {
String database = context.getWriterContext().getProps().getString(DataSourceWriteOptions.HIVE_DATABASE().key()); String database = context.getWriterContext().getProps().getString(DataSourceWriteOptions.HIVE_DATABASE().key());
String tableName = context.getWriterContext().getProps().getString(DataSourceWriteOptions.HIVE_TABLE().key()); String tableName = context.getWriterContext().getProps().getString(DataSourceWriteOptions.HIVE_TABLE().key());
log.warn("Validating hive table with db : " + database + " and table : " + tableName); log.warn("Validating hive table with db : " + database + " and table : " + tableName);
Dataset<Row> cowDf = session.sql("SELECT * FROM " + database + "." + tableName); session.sql("REFRESH TABLE " + database + "." + tableName);
Dataset<Row> trimmedCowDf = cowDf.drop(HoodieRecord.COMMIT_TIME_METADATA_FIELD).drop(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD).drop(HoodieRecord.RECORD_KEY_METADATA_FIELD) Dataset<Row> cowDf = session.sql("SELECT _row_key, rider, driver, begin_lat, begin_lon, end_lat, end_lon, fare, _hoodie_is_deleted, " +
.drop(HoodieRecord.PARTITION_PATH_METADATA_FIELD).drop(HoodieRecord.FILENAME_METADATA_FIELD); "test_suite_source_ordering_field FROM " + database + "." + tableName);
intersectionDf = inputSnapshotDf.intersect(trimmedCowDf); Dataset<Row> reorderedInputDf = inputSnapshotDf.select("_row_key","rider","driver","begin_lat","begin_lon","end_lat","end_lon","fare",
"_hoodie_is_deleted","test_suite_source_ordering_field");
Dataset<Row> intersectedHiveDf = reorderedInputDf.intersect(cowDf);
outputCount = trimmedHudiDf.count(); outputCount = trimmedHudiDf.count();
log.warn("Input count: " + inputCount + "; output count: " + outputCount); log.warn("Input count: " + inputCount + "; output count: " + outputCount);
// the intersected df should be same as inputDf. if not, there is some mismatch. // the intersected df should be same as inputDf. if not, there is some mismatch.
if (outputCount == 0 || inputSnapshotDf.except(intersectionDf).count() != 0) { if (outputCount == 0 || reorderedInputDf.except(intersectedHiveDf).count() != 0) {
log.error("Data set validation failed for COW hive table. Total count in hudi " + outputCount + ", input df count " + inputCount); log.error("Data set validation failed for COW hive table. Total count in hudi " + outputCount + ", input df count " + inputCount);
throw new AssertionError("Hudi hive table contents does not match contents input data. "); throw new AssertionError("Hudi hive table contents does not match contents input data. ");
} }

View File

@@ -18,6 +18,7 @@
package org.apache.hudi.integ.testsuite.dag.nodes; package org.apache.hudi.integ.testsuite.dag.nodes;
import org.apache.hudi.common.config.HoodieMetadataConfig;
import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config; import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext; import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
@@ -49,7 +50,8 @@ public class ValidateDatasetNode extends BaseValidateDatasetNode {
StructType inputSchema) { StructType inputSchema) {
String hudiPath = context.getHoodieTestSuiteWriter().getCfg().targetBasePath + "/*/*/*"; String hudiPath = context.getHoodieTestSuiteWriter().getCfg().targetBasePath + "/*/*/*";
log.info("Validate data in target hudi path " + hudiPath); log.info("Validate data in target hudi path " + hudiPath);
Dataset<Row> hudiDf = session.read().format("hudi").load(hudiPath); Dataset<Row> hudiDf = session.read().option(HoodieMetadataConfig.ENABLE.key(), String.valueOf(config.isEnableMetadataValidate()))
.format("hudi").load(hudiPath);
return hudiDf.drop(HoodieRecord.COMMIT_TIME_METADATA_FIELD).drop(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD).drop(HoodieRecord.RECORD_KEY_METADATA_FIELD) return hudiDf.drop(HoodieRecord.COMMIT_TIME_METADATA_FIELD).drop(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD).drop(HoodieRecord.RECORD_KEY_METADATA_FIELD)
.drop(HoodieRecord.PARTITION_PATH_METADATA_FIELD).drop(HoodieRecord.FILENAME_METADATA_FIELD); .drop(HoodieRecord.PARTITION_PATH_METADATA_FIELD).drop(HoodieRecord.FILENAME_METADATA_FIELD);
} }

View File

@@ -60,7 +60,7 @@ class SparkInsertNode(dagNodeConfig: Config) extends DagNode[RDD[WriteStatus]] {
.option(DataSourceWriteOptions.COMMIT_METADATA_KEYPREFIX.key, "deltastreamer.checkpoint.key") .option(DataSourceWriteOptions.COMMIT_METADATA_KEYPREFIX.key, "deltastreamer.checkpoint.key")
.option("deltastreamer.checkpoint.key", context.getWriterContext.getHoodieTestSuiteWriter.getLastCheckpoint.orElse("")) .option("deltastreamer.checkpoint.key", context.getWriterContext.getHoodieTestSuiteWriter.getLastCheckpoint.orElse(""))
.option(HoodieWriteConfig.TBL_NAME.key, context.getHoodieTestSuiteWriter.getCfg.targetTableName) .option(HoodieWriteConfig.TBL_NAME.key, context.getHoodieTestSuiteWriter.getCfg.targetTableName)
.mode(SaveMode.Overwrite) .mode(SaveMode.Append)
.save(context.getHoodieTestSuiteWriter.getWriteConfig.getBasePath) .save(context.getHoodieTestSuiteWriter.getWriteConfig.getBasePath)
} }
} }