diff --git a/docker/compose/docker-compose_hadoop284_hive233_spark244.yml b/docker/compose/docker-compose_hadoop284_hive233_spark244.yml index 3e42d532b..05790963b 100644 --- a/docker/compose/docker-compose_hadoop284_hive233_spark244.yml +++ b/docker/compose/docker-compose_hadoop284_hive233_spark244.yml @@ -33,7 +33,7 @@ services: interval: 30s timeout: 10s retries: 3 - + datanode1: image: apachehudi/hudi-hadoop_2.8.4-datanode:latest container_name: datanode1 @@ -84,7 +84,7 @@ services: - hive-metastore-postgresql:/var/lib/postgresql hostname: hive-metastore-postgresql container_name: hive-metastore-postgresql - + hivemetastore: image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3:latest hostname: hivemetastore @@ -221,6 +221,15 @@ services: - ${HUDI_WS}:/var/hoodie/ws command: worker + graphite: + container_name: graphite + hostname: graphite + image: graphiteapp/graphite-statsd + ports: + - 80:80 + - 2003-2004:2003-2004 + - 8126:8126 + adhoc-1: image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_2.4.4:latest hostname: adhoc-1 diff --git a/docker/demo/config/test-suite/complex-dag-cow.yaml b/docker/demo/config/test-suite/complex-dag-cow.yaml index acbe287ac..3a84b0a0a 100644 --- a/docker/demo/config/test-suite/complex-dag-cow.yaml +++ b/docker/demo/config/test-suite/complex-dag-cow.yaml @@ -49,7 +49,7 @@ dag_content: deps: third_insert first_validate: config: - validate_hive: true + validate_hive: false type: ValidateDatasetNode deps: first_hive_sync first_upsert: @@ -76,7 +76,7 @@ dag_content: deps: first_delete second_validate: config: - validate_hive: true + validate_hive: false delete_input_data: true type: ValidateDatasetNode deps: second_hive_sync diff --git a/docker/demo/config/test-suite/cow-clustering-example.yaml b/docker/demo/config/test-suite/cow-clustering-example.yaml index 939e16f55..95932317c 100644 --- a/docker/demo/config/test-suite/cow-clustering-example.yaml +++ b/docker/demo/config/test-suite/cow-clustering-example.yaml @@ -55,7 +55,7 @@ 
dag_content: deps: first_delete first_validate: config: - validate_hive: true + validate_hive: false type: ValidateDatasetNode deps: first_hive_sync first_cluster: @@ -71,6 +71,6 @@ dag_content: deps: first_cluster second_validate: config: - validate_hive: true + validate_hive: false type: ValidateDatasetNode deps: second_hive_sync diff --git a/docker/demo/config/test-suite/cow-long-running-example.yaml b/docker/demo/config/test-suite/cow-long-running-example.yaml index 71a34f813..29b6858bf 100644 --- a/docker/demo/config/test-suite/cow-long-running-example.yaml +++ b/docker/demo/config/test-suite/cow-long-running-example.yaml @@ -49,7 +49,7 @@ dag_content: deps: third_insert first_validate: config: - validate_hive: true + validate_hive: false type: ValidateDatasetNode deps: first_hive_sync first_upsert: @@ -76,7 +76,7 @@ dag_content: deps: first_delete second_validate: config: - validate_hive: true + validate_hive: false delete_input_data: true type: ValidateDatasetNode deps: second_hive_sync diff --git a/docker/demo/config/test-suite/cow-long-running-multi-partitions.yaml b/docker/demo/config/test-suite/cow-long-running-multi-partitions.yaml index b071c4667..0ce529805 100644 --- a/docker/demo/config/test-suite/cow-long-running-multi-partitions.yaml +++ b/docker/demo/config/test-suite/cow-long-running-multi-partitions.yaml @@ -49,7 +49,7 @@ dag_content: deps: third_insert first_validate: config: - validate_hive: true + validate_hive: false type: ValidateDatasetNode deps: first_hive_sync first_upsert: @@ -76,7 +76,7 @@ dag_content: deps: first_delete second_validate: config: - validate_hive: true + validate_hive: false delete_input_data: true type: ValidateDatasetNode deps: second_hive_sync diff --git a/docker/generate_test_suite.sh b/docker/generate_test_suite.sh index d7c140563..48c876fa6 100755 --- a/docker/generate_test_suite.sh +++ b/docker/generate_test_suite.sh @@ -16,6 +16,37 @@ # See the License for the specific language governing permissions and # 
limitations under the License. +usage=" +USAGE: +$(basename "$0") [--help] [--all boolean] -- Script to generate the test suite according to arguments provided and run these test suites. + +where: + --help show this help text + --all generate and run all test suites (true/false) + --execute_test_suite flag if test need to execute (DEFAULT- true) + --medium_num_iterations number of medium iterations (DEFAULT- 20) + --long_num_iterations number of long iterations (DEFAULT- 50) + --intermittent_delay_mins delay after every test run (DEFAULT- 1) + --table_type hoodie table type to test (DEFAULT COPY_ON_WRITE) + --include_long_test_suite_yaml include long infra test suite (DEFAULT false) + --include_medium_test_suite_yaml include medium infra test suite (DEFAULT false) + --cluster_num_itr number of cluster iterations (DEFAULT 30) + --include_cluster_yaml include cluster infra test suite (DEFAULT false) + --input_path input path for test in docker image (DEFAULT /user/hive/warehouse/hudi-integ-test-suite/input/) + --output_path output path for test in docker image (DEFAULT /user/hive/warehouse/hudi-integ-test-suite/output/) + +Example: +Note - Execute the command from within docker folder + + 1. To generate and run all test suites + ./generate_test_suite.sh --all true + 2. To only generate test suites + ./generate_test_suite.sh --all true --execute_test_suite false + 3. 
To run only specific test suite yaml + ./generate_test_suite.sh --execute_test_suite true --include_medium_test_suite_yaml true + " + + MEDIUM_NUM_ITR=20 LONG_NUM_ITR=50 DELAY_MINS=1 @@ -39,6 +70,17 @@ do key="$1" case $key in + --help) + echo "$usage" + exit + ;; + --all) + INCLUDE_LONG_TEST_SUITE="$2" + INCLUDE_MEDIUM_TEST_SUITE="$2" + INCLUDE_CLUSTER_YAML="$2" + shift # past argument + shift # past value + ;; --execute_test_suite) EXECUTE_TEST_SUITE="$2" shift # past argument @@ -115,12 +157,15 @@ case $key in ;; *) # unknown option POSITIONAL+=("$1") # save it in an array for later + echo "Unknown argument provided - '$1'" + echo "$usage" + exit 0 shift # past argument ;; esac done set -- "${POSITIONAL[@]}" # restore positional parameters - +echo "$POSITIONAL" echo "Include Medium test suite $INCLUDE_MEDIUM_TEST_SUITE" if $INCLUDE_MEDIUM_TEST_SUITE ; then echo "Medium test suite iterations = ${MEDIUM_NUM_ITR}" @@ -232,7 +277,7 @@ fi if $EXECUTE_TEST_SUITE ; then - docker cp $CUR_DIR/../packaging/hudi-integ-test-bundle/target/$JAR_NAME adhoc-2:/opt/ + docker cp $CUR_DIR/../packaging/hudi-integ-test-bundle/target/"$JAR_NAME" adhoc-2:/opt/ docker exec -it adhoc-2 /bin/bash rm -rf /opt/staging* docker cp demo/config/test-suite/staging/ adhoc-2:/opt/ docker exec -it adhoc-2 /bin/bash echo "\n============================== Executing sanity test suite ============================== " diff --git a/hudi-integ-test/README.md b/hudi-integ-test/README.md index 4a9e9bc67..ffdedf849 100644 --- a/hudi-integ-test/README.md +++ b/hudi-integ-test/README.md @@ -177,7 +177,7 @@ cd /opt Copy the integration tests jar into the docker container ``` -docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.8.0-SNAPSHOT.jar adhoc-2:/opt +docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar adhoc-2:/opt ``` ``` @@ -214,21 +214,29 @@ spark-submit \ --conf spark.network.timeout=600s \ --conf spark.yarn.max.executor.failures=10 \ 
--conf spark.sql.catalogImplementation=hive \ +--conf spark.driver.extraClassPath=/var/demo/jars/* \ +--conf spark.executor.extraClassPath=/var/demo/jars/* \ --class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \ -/opt/hudi-integ-test-bundle-0.8.0-SNAPSHOT.jar \ +/opt/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar \ --source-ordering-field test_suite_source_ordering_field \ --use-deltastreamer \ --target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \ --input-base-path /user/hive/warehouse/hudi-integ-test-suite/input \ --target-table table1 \ --props file:/var/hoodie/ws/docker/demo/config/test-suite/test.properties \ ---schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \ +--schemaprovider-class org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider \ --source-class org.apache.hudi.utilities.sources.AvroDFSSource \ --input-file-size 125829120 \ --workload-yaml-path file:/var/hoodie/ws/docker/demo/config/test-suite/complex-dag-cow.yaml \ --workload-generator-classname org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \ --table-type COPY_ON_WRITE \ ---compact-scheduling-minshare 1 +--compact-scheduling-minshare 1 \ +--hoodie-conf hoodie.metrics.on=true \ +--hoodie-conf hoodie.metrics.reporter.type=GRAPHITE \ +--hoodie-conf hoodie.metrics.graphite.host=graphite \ +--hoodie-conf hoodie.metrics.graphite.port=2003 \ +--clean-input \ +--clean-output ``` Or a Merge-on-Read job: @@ -253,23 +261,44 @@ spark-submit \ --conf spark.network.timeout=600s \ --conf spark.yarn.max.executor.failures=10 \ --conf spark.sql.catalogImplementation=hive \ +--conf spark.driver.extraClassPath=/var/demo/jars/* \ +--conf spark.executor.extraClassPath=/var/demo/jars/* \ --class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \ -/opt/hudi-integ-test-bundle-0.8.0-SNAPSHOT.jar \ +/opt/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar \ --source-ordering-field test_suite_source_ordering_field \ --use-deltastreamer \ --target-base-path 
/user/hive/warehouse/hudi-integ-test-suite/output \ --input-base-path /user/hive/warehouse/hudi-integ-test-suite/input \ --target-table table1 \ --props file:/var/hoodie/ws/docker/demo/config/test-suite/test.properties \ ---schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \ +--schemaprovider-class org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider \ --source-class org.apache.hudi.utilities.sources.AvroDFSSource \ --input-file-size 125829120 \ --workload-yaml-path file:/var/hoodie/ws/docker/demo/config/test-suite/complex-dag-mor.yaml \ --workload-generator-classname org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \ --table-type MERGE_ON_READ \ ---compact-scheduling-minshare 1 +--compact-scheduling-minshare 1 \ +--hoodie-conf hoodie.metrics.on=true \ +--hoodie-conf hoodie.metrics.reporter.type=GRAPHITE \ +--hoodie-conf hoodie.metrics.graphite.host=graphite \ +--hoodie-conf hoodie.metrics.graphite.port=2003 \ +--clean-input \ +--clean-output ``` +## Visualize and inspect the hoodie metrics and performance (local) +Graphite server is already setup (and up) in ```docker/setup_demo.sh```. + +Open browser and access metrics at +``` +http://localhost:80 +``` +Dashboard +``` +http://localhost/dashboard + +``` + ## Running long running test suite in Local Docker environment For long running test suite, validation has to be done differently. Idea is to run same dag in a repeated manner for @@ -279,12 +308,12 @@ contents both via spark datasource and hive table via spark sql engine. Hive val If you have "ValidateDatasetNode" in your dag, do not replace hive jars as instructed above. Spark sql engine does not go well w/ hive2* jars. So, after running docker setup, follow the below steps. 
``` -docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.8.0-SNAPSHOT.jar adhoc-2:/opt/ -docker cp demo/config/test-suite/test.properties adhoc-2:/opt/ +docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar adhoc-2:/opt/ +docker cp docker/demo/config/test-suite/test.properties adhoc-2:/opt/ ``` Also copy your dag of interest to adhoc-2:/opt/ ``` -docker cp demo/config/test-suite/complex-dag-cow.yaml adhoc-2:/opt/ +docker cp docker/demo/config/test-suite/complex-dag-cow.yaml adhoc-2:/opt/ ``` For repeated runs, two additional configs need to be set. "dag_rounds" and "dag_intermittent_delay_mins". @@ -428,7 +457,7 @@ spark-submit \ --conf spark.driver.extraClassPath=/var/demo/jars/* \ --conf spark.executor.extraClassPath=/var/demo/jars/* \ --class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \ -/opt/hudi-integ-test-bundle-0.8.0-SNAPSHOT.jar \ +/opt/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar \ --source-ordering-field test_suite_source_ordering_field \ --use-deltastreamer \ --target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \ @@ -446,6 +475,14 @@ spark-submit \ --clean-output ``` +If you wish to enable metrics add below properties as well +``` +--hoodie-conf hoodie.metrics.on=true \ +--hoodie-conf hoodie.metrics.reporter.type=GRAPHITE \ +--hoodie-conf hoodie.metrics.graphite.host=graphite \ +--hoodie-conf hoodie.metrics.graphite.port=2003 \ +``` + Few ready to use dags are available under docker/demo/config/test-suite/ that could give you an idea for long running dags. ```