[HUDI-2267] Update docs and infra test configs, add support for graphite (#3482)
Co-authored-by: Sivabalan Narayanan <n.siva.b@gmail.com>
This commit is contained in:
@@ -221,6 +221,15 @@ services:
|
|||||||
- ${HUDI_WS}:/var/hoodie/ws
|
- ${HUDI_WS}:/var/hoodie/ws
|
||||||
command: worker
|
command: worker
|
||||||
|
|
||||||
|
graphite:
|
||||||
|
container_name: graphite
|
||||||
|
hostname: graphite
|
||||||
|
image: graphiteapp/graphite-statsd
|
||||||
|
ports:
|
||||||
|
- 80:80
|
||||||
|
- 2003-2004:2003-2004
|
||||||
|
- 8126:8126
|
||||||
|
|
||||||
adhoc-1:
|
adhoc-1:
|
||||||
image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_2.4.4:latest
|
image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_2.4.4:latest
|
||||||
hostname: adhoc-1
|
hostname: adhoc-1
|
||||||
|
|||||||
@@ -49,7 +49,7 @@ dag_content:
|
|||||||
deps: third_insert
|
deps: third_insert
|
||||||
first_validate:
|
first_validate:
|
||||||
config:
|
config:
|
||||||
validate_hive: true
|
validate_hive: false
|
||||||
type: ValidateDatasetNode
|
type: ValidateDatasetNode
|
||||||
deps: first_hive_sync
|
deps: first_hive_sync
|
||||||
first_upsert:
|
first_upsert:
|
||||||
@@ -76,7 +76,7 @@ dag_content:
|
|||||||
deps: first_delete
|
deps: first_delete
|
||||||
second_validate:
|
second_validate:
|
||||||
config:
|
config:
|
||||||
validate_hive: true
|
validate_hive: false
|
||||||
delete_input_data: true
|
delete_input_data: true
|
||||||
type: ValidateDatasetNode
|
type: ValidateDatasetNode
|
||||||
deps: second_hive_sync
|
deps: second_hive_sync
|
||||||
|
|||||||
@@ -55,7 +55,7 @@ dag_content:
|
|||||||
deps: first_delete
|
deps: first_delete
|
||||||
first_validate:
|
first_validate:
|
||||||
config:
|
config:
|
||||||
validate_hive: true
|
validate_hive: false
|
||||||
type: ValidateDatasetNode
|
type: ValidateDatasetNode
|
||||||
deps: first_hive_sync
|
deps: first_hive_sync
|
||||||
first_cluster:
|
first_cluster:
|
||||||
@@ -71,6 +71,6 @@ dag_content:
|
|||||||
deps: first_cluster
|
deps: first_cluster
|
||||||
second_validate:
|
second_validate:
|
||||||
config:
|
config:
|
||||||
validate_hive: true
|
validate_hive: false
|
||||||
type: ValidateDatasetNode
|
type: ValidateDatasetNode
|
||||||
deps: second_hive_sync
|
deps: second_hive_sync
|
||||||
|
|||||||
@@ -49,7 +49,7 @@ dag_content:
|
|||||||
deps: third_insert
|
deps: third_insert
|
||||||
first_validate:
|
first_validate:
|
||||||
config:
|
config:
|
||||||
validate_hive: true
|
validate_hive: false
|
||||||
type: ValidateDatasetNode
|
type: ValidateDatasetNode
|
||||||
deps: first_hive_sync
|
deps: first_hive_sync
|
||||||
first_upsert:
|
first_upsert:
|
||||||
@@ -76,7 +76,7 @@ dag_content:
|
|||||||
deps: first_delete
|
deps: first_delete
|
||||||
second_validate:
|
second_validate:
|
||||||
config:
|
config:
|
||||||
validate_hive: true
|
validate_hive: false
|
||||||
delete_input_data: true
|
delete_input_data: true
|
||||||
type: ValidateDatasetNode
|
type: ValidateDatasetNode
|
||||||
deps: second_hive_sync
|
deps: second_hive_sync
|
||||||
|
|||||||
@@ -49,7 +49,7 @@ dag_content:
|
|||||||
deps: third_insert
|
deps: third_insert
|
||||||
first_validate:
|
first_validate:
|
||||||
config:
|
config:
|
||||||
validate_hive: true
|
validate_hive: false
|
||||||
type: ValidateDatasetNode
|
type: ValidateDatasetNode
|
||||||
deps: first_hive_sync
|
deps: first_hive_sync
|
||||||
first_upsert:
|
first_upsert:
|
||||||
@@ -76,7 +76,7 @@ dag_content:
|
|||||||
deps: first_delete
|
deps: first_delete
|
||||||
second_validate:
|
second_validate:
|
||||||
config:
|
config:
|
||||||
validate_hive: true
|
validate_hive: false
|
||||||
delete_input_data: true
|
delete_input_data: true
|
||||||
type: ValidateDatasetNode
|
type: ValidateDatasetNode
|
||||||
deps: second_hive_sync
|
deps: second_hive_sync
|
||||||
|
|||||||
@@ -16,6 +16,37 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
|
usage="
|
||||||
|
USAGE:
|
||||||
|
$(basename "$0") [--help] [--all boolean] -- Script to generate the test suite according to arguments provided and run these test suites.
|
||||||
|
|
||||||
|
where:
|
||||||
|
--help show this help text
|
||||||
|
--all include the medium, long and cluster test suites (true/false)
|
||||||
|
--execute_test_suite flag if test need to execute (DEFAULT- true)
|
||||||
|
--medium_num_iterations number of medium iterations (DEFAULT- 20)
|
||||||
|
--long_num_iterations number of long iterations (DEFAULT- 50)
|
||||||
|
--intermittent_delay_mins delay after every test run (DEFAULT- 1)
|
||||||
|
--table_type hoodie table type to test (DEFAULT COPY_ON_WRITE)
|
||||||
|
--include_long_test_suite_yaml include long infra test suite (DEFAULT false)
|
||||||
|
--include_medium_test_suite_yaml include medium infra test suite (DEFAULT false)
|
||||||
|
--cluster_num_itr number of cluster iterations (DEFAULT 30)
|
||||||
|
--include_cluster_yaml include cluster infra test suite (DEFAULT false)
|
||||||
|
--input_path input path for test in docker image (DEFAULT /user/hive/warehouse/hudi-integ-test-suite/input/)
|
||||||
|
--output_path output path for test in docker image (DEFAULT /user/hive/warehouse/hudi-integ-test-suite/output/)
|
||||||
|
|
||||||
|
Example:
|
||||||
|
Note - Execute the command from within docker folder
|
||||||
|
|
||||||
|
1. To generate and run all test suites
|
||||||
|
./generate_test_suite.sh --all true
|
||||||
|
2. To only generate test suites
|
||||||
|
./generate_test_suite.sh --all true --execute_test_suite false
|
||||||
|
3. To run only specific test suite yaml
|
||||||
|
./generate_test_suite.sh --execute_test_suite true --include_medium_test_suite_yaml true
|
||||||
|
"
|
||||||
|
|
||||||
|
|
||||||
MEDIUM_NUM_ITR=20
|
MEDIUM_NUM_ITR=20
|
||||||
LONG_NUM_ITR=50
|
LONG_NUM_ITR=50
|
||||||
DELAY_MINS=1
|
DELAY_MINS=1
|
||||||
@@ -39,6 +70,17 @@ do
|
|||||||
key="$1"
|
key="$1"
|
||||||
|
|
||||||
case $key in
|
case $key in
|
||||||
|
--help)
|
||||||
|
echo "$usage"
|
||||||
|
exit
|
||||||
|
;;
|
||||||
|
--all)
|
||||||
|
INCLUDE_LONG_TEST_SUITE="$2"
|
||||||
|
INCLUDE_MEDIUM_TEST_SUITE="$2"
|
||||||
|
INCLUDE_CLUSTER_YAML="$2"
|
||||||
|
shift # past argument
|
||||||
|
shift # past value
|
||||||
|
;;
|
||||||
--execute_test_suite)
|
--execute_test_suite)
|
||||||
EXECUTE_TEST_SUITE="$2"
|
EXECUTE_TEST_SUITE="$2"
|
||||||
shift # past argument
|
shift # past argument
|
||||||
@@ -115,12 +157,15 @@ case $key in
|
|||||||
;;
|
;;
|
||||||
*) # unknown option
|
*) # unknown option
|
||||||
POSITIONAL+=("$1") # save it in an array for later
|
POSITIONAL+=("$1") # save it in an array for later
|
||||||
|
echo "Unknown argument provided - '$1'"
|
||||||
|
echo "$usage"
|
||||||
|
exit 0
|
||||||
shift # past argument
|
shift # past argument
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
set -- "${POSITIONAL[@]}" # restore positional parameters
|
set -- "${POSITIONAL[@]}" # restore positional parameters
|
||||||
|
echo "$POSITIONAL"
|
||||||
echo "Include Medium test suite $INCLUDE_MEDIUM_TEST_SUITE"
|
echo "Include Medium test suite $INCLUDE_MEDIUM_TEST_SUITE"
|
||||||
if $INCLUDE_MEDIUM_TEST_SUITE ; then
|
if $INCLUDE_MEDIUM_TEST_SUITE ; then
|
||||||
echo "Medium test suite iterations = ${MEDIUM_NUM_ITR}"
|
echo "Medium test suite iterations = ${MEDIUM_NUM_ITR}"
|
||||||
@@ -232,7 +277,7 @@ fi
|
|||||||
|
|
||||||
if $EXECUTE_TEST_SUITE ; then
|
if $EXECUTE_TEST_SUITE ; then
|
||||||
|
|
||||||
docker cp $CUR_DIR/../packaging/hudi-integ-test-bundle/target/$JAR_NAME adhoc-2:/opt/
|
docker cp $CUR_DIR/../packaging/hudi-integ-test-bundle/target/"$JAR_NAME" adhoc-2:/opt/
|
||||||
docker exec -it adhoc-2 /bin/bash rm -rf /opt/staging*
|
docker exec -it adhoc-2 /bin/bash rm -rf /opt/staging*
|
||||||
docker cp demo/config/test-suite/staging/ adhoc-2:/opt/
|
docker cp demo/config/test-suite/staging/ adhoc-2:/opt/
|
||||||
docker exec -it adhoc-2 /bin/bash echo "\n============================== Executing sanity test suite ============================== "
|
docker exec -it adhoc-2 /bin/bash echo "\n============================== Executing sanity test suite ============================== "
|
||||||
|
|||||||
@@ -177,7 +177,7 @@ cd /opt
|
|||||||
Copy the integration tests jar into the docker container
|
Copy the integration tests jar into the docker container
|
||||||
|
|
||||||
```
|
```
|
||||||
docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.8.0-SNAPSHOT.jar adhoc-2:/opt
|
docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar adhoc-2:/opt
|
||||||
```
|
```
|
||||||
|
|
||||||
```
|
```
|
||||||
@@ -214,21 +214,29 @@ spark-submit \
|
|||||||
--conf spark.network.timeout=600s \
|
--conf spark.network.timeout=600s \
|
||||||
--conf spark.yarn.max.executor.failures=10 \
|
--conf spark.yarn.max.executor.failures=10 \
|
||||||
--conf spark.sql.catalogImplementation=hive \
|
--conf spark.sql.catalogImplementation=hive \
|
||||||
|
--conf spark.driver.extraClassPath=/var/demo/jars/* \
|
||||||
|
--conf spark.executor.extraClassPath=/var/demo/jars/* \
|
||||||
--class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \
|
--class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \
|
||||||
/opt/hudi-integ-test-bundle-0.8.0-SNAPSHOT.jar \
|
/opt/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar \
|
||||||
--source-ordering-field test_suite_source_ordering_field \
|
--source-ordering-field test_suite_source_ordering_field \
|
||||||
--use-deltastreamer \
|
--use-deltastreamer \
|
||||||
--target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \
|
--target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \
|
||||||
--input-base-path /user/hive/warehouse/hudi-integ-test-suite/input \
|
--input-base-path /user/hive/warehouse/hudi-integ-test-suite/input \
|
||||||
--target-table table1 \
|
--target-table table1 \
|
||||||
--props file:/var/hoodie/ws/docker/demo/config/test-suite/test.properties \
|
--props file:/var/hoodie/ws/docker/demo/config/test-suite/test.properties \
|
||||||
--schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
|
--schemaprovider-class org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider \
|
||||||
--source-class org.apache.hudi.utilities.sources.AvroDFSSource \
|
--source-class org.apache.hudi.utilities.sources.AvroDFSSource \
|
||||||
--input-file-size 125829120 \
|
--input-file-size 125829120 \
|
||||||
--workload-yaml-path file:/var/hoodie/ws/docker/demo/config/test-suite/complex-dag-cow.yaml \
|
--workload-yaml-path file:/var/hoodie/ws/docker/demo/config/test-suite/complex-dag-cow.yaml \
|
||||||
--workload-generator-classname org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \
|
--workload-generator-classname org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \
|
||||||
--table-type COPY_ON_WRITE \
|
--table-type COPY_ON_WRITE \
|
||||||
--compact-scheduling-minshare 1
|
--compact-scheduling-minshare 1 \
|
||||||
|
--hoodie-conf hoodie.metrics.on=true \
|
||||||
|
--hoodie-conf hoodie.metrics.reporter.type=GRAPHITE \
|
||||||
|
--hoodie-conf hoodie.metrics.graphite.host=graphite \
|
||||||
|
--hoodie-conf hoodie.metrics.graphite.port=2003 \
|
||||||
|
--clean-input \
|
||||||
|
--clean-output
|
||||||
```
|
```
|
||||||
|
|
||||||
Or a Merge-on-Read job:
|
Or a Merge-on-Read job:
|
||||||
@@ -253,21 +261,42 @@ spark-submit \
|
|||||||
--conf spark.network.timeout=600s \
|
--conf spark.network.timeout=600s \
|
||||||
--conf spark.yarn.max.executor.failures=10 \
|
--conf spark.yarn.max.executor.failures=10 \
|
||||||
--conf spark.sql.catalogImplementation=hive \
|
--conf spark.sql.catalogImplementation=hive \
|
||||||
|
--conf spark.driver.extraClassPath=/var/demo/jars/* \
|
||||||
|
--conf spark.executor.extraClassPath=/var/demo/jars/* \
|
||||||
--class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \
|
--class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \
|
||||||
/opt/hudi-integ-test-bundle-0.8.0-SNAPSHOT.jar \
|
/opt/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar \
|
||||||
--source-ordering-field test_suite_source_ordering_field \
|
--source-ordering-field test_suite_source_ordering_field \
|
||||||
--use-deltastreamer \
|
--use-deltastreamer \
|
||||||
--target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \
|
--target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \
|
||||||
--input-base-path /user/hive/warehouse/hudi-integ-test-suite/input \
|
--input-base-path /user/hive/warehouse/hudi-integ-test-suite/input \
|
||||||
--target-table table1 \
|
--target-table table1 \
|
||||||
--props file:/var/hoodie/ws/docker/demo/config/test-suite/test.properties \
|
--props file:/var/hoodie/ws/docker/demo/config/test-suite/test.properties \
|
||||||
--schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
|
--schemaprovider-class org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider \
|
||||||
--source-class org.apache.hudi.utilities.sources.AvroDFSSource \
|
--source-class org.apache.hudi.utilities.sources.AvroDFSSource \
|
||||||
--input-file-size 125829120 \
|
--input-file-size 125829120 \
|
||||||
--workload-yaml-path file:/var/hoodie/ws/docker/demo/config/test-suite/complex-dag-mor.yaml \
|
--workload-yaml-path file:/var/hoodie/ws/docker/demo/config/test-suite/complex-dag-mor.yaml \
|
||||||
--workload-generator-classname org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \
|
--workload-generator-classname org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \
|
||||||
--table-type MERGE_ON_READ \
|
--table-type MERGE_ON_READ \
|
||||||
--compact-scheduling-minshare 1
|
--compact-scheduling-minshare 1 \
|
||||||
|
--hoodie-conf hoodie.metrics.on=true \
|
||||||
|
--hoodie-conf hoodie.metrics.reporter.type=GRAPHITE \
|
||||||
|
--hoodie-conf hoodie.metrics.graphite.host=graphite \
|
||||||
|
--hoodie-conf hoodie.metrics.graphite.port=2003 \
|
||||||
|
--clean-input \
|
||||||
|
--clean-output
|
||||||
|
```
|
||||||
|
|
||||||
|
## Visualize and inspect the hoodie metrics and performance (local)
|
||||||
|
The Graphite server is already set up (and running) by ```docker/setup_demo.sh```.
|
||||||
|
|
||||||
|
Open a browser and access the metrics at
|
||||||
|
```
|
||||||
|
http://localhost:80
|
||||||
|
```
|
||||||
|
Dashboard
|
||||||
|
```
|
||||||
|
http://localhost/dashboard
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## Running long running test suite in Local Docker environment
|
## Running long running test suite in Local Docker environment
|
||||||
@@ -279,12 +308,12 @@ contents both via spark datasource and hive table via spark sql engine. Hive val
|
|||||||
If you have "ValidateDatasetNode" in your dag, do not replace hive jars as instructed above. Spark sql engine does not
|
If you have "ValidateDatasetNode" in your dag, do not replace hive jars as instructed above. Spark sql engine does not
|
||||||
go well w/ hive2* jars. So, after running docker setup, follow the below steps.
|
go well w/ hive2* jars. So, after running docker setup, follow the below steps.
|
||||||
```
|
```
|
||||||
docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.8.0-SNAPSHOT.jar adhoc-2:/opt/
|
docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar adhoc-2:/opt/
|
||||||
docker cp demo/config/test-suite/test.properties adhoc-2:/opt/
|
docker cp docker/demo/config/test-suite/test.properties adhoc-2:/opt/
|
||||||
```
|
```
|
||||||
Also copy your dag of interest to adhoc-2:/opt/
|
Also copy your dag of interest to adhoc-2:/opt/
|
||||||
```
|
```
|
||||||
docker cp demo/config/test-suite/complex-dag-cow.yaml adhoc-2:/opt/
|
docker cp docker/demo/config/test-suite/complex-dag-cow.yaml adhoc-2:/opt/
|
||||||
```
|
```
|
||||||
|
|
||||||
For repeated runs, two additional configs need to be set. "dag_rounds" and "dag_intermittent_delay_mins".
|
For repeated runs, two additional configs need to be set. "dag_rounds" and "dag_intermittent_delay_mins".
|
||||||
@@ -428,7 +457,7 @@ spark-submit \
|
|||||||
--conf spark.driver.extraClassPath=/var/demo/jars/* \
|
--conf spark.driver.extraClassPath=/var/demo/jars/* \
|
||||||
--conf spark.executor.extraClassPath=/var/demo/jars/* \
|
--conf spark.executor.extraClassPath=/var/demo/jars/* \
|
||||||
--class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \
|
--class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \
|
||||||
/opt/hudi-integ-test-bundle-0.8.0-SNAPSHOT.jar \
|
/opt/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar \
|
||||||
--source-ordering-field test_suite_source_ordering_field \
|
--source-ordering-field test_suite_source_ordering_field \
|
||||||
--use-deltastreamer \
|
--use-deltastreamer \
|
||||||
--target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \
|
--target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \
|
||||||
@@ -446,6 +475,14 @@ spark-submit \
|
|||||||
--clean-output
|
--clean-output
|
||||||
```
|
```
|
||||||
|
|
||||||
|
If you wish to enable metrics, add the properties below as well:
|
||||||
|
```
|
||||||
|
--hoodie-conf hoodie.metrics.on=true \
|
||||||
|
--hoodie-conf hoodie.metrics.reporter.type=GRAPHITE \
|
||||||
|
--hoodie-conf hoodie.metrics.graphite.host=graphite \
|
||||||
|
--hoodie-conf hoodie.metrics.graphite.port=2003 \
|
||||||
|
```
|
||||||
|
|
||||||
A few ready-to-use dags are available under docker/demo/config/test-suite/ that could give you an idea for long running
|
A few ready-to-use dags are available under docker/demo/config/test-suite/ that could give you an idea for long running
|
||||||
dags.
|
dags.
|
||||||
```
|
```
|
||||||
|
|||||||
Reference in New Issue
Block a user