[HUDI-2267] Update docs and infra test configs, add support for graphite (#3482)
Co-authored-by: Sivabalan Narayanan <n.siva.b@gmail.com>
This commit is contained in:
@@ -33,7 +33,7 @@ services:
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
|
||||
datanode1:
|
||||
image: apachehudi/hudi-hadoop_2.8.4-datanode:latest
|
||||
container_name: datanode1
|
||||
@@ -84,7 +84,7 @@ services:
|
||||
- hive-metastore-postgresql:/var/lib/postgresql
|
||||
hostname: hive-metastore-postgresql
|
||||
container_name: hive-metastore-postgresql
|
||||
|
||||
|
||||
hivemetastore:
|
||||
image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3:latest
|
||||
hostname: hivemetastore
|
||||
@@ -221,6 +221,15 @@ services:
|
||||
- ${HUDI_WS}:/var/hoodie/ws
|
||||
command: worker
|
||||
|
||||
graphite:
|
||||
container_name: graphite
|
||||
hostname: graphite
|
||||
image: graphiteapp/graphite-statsd
|
||||
ports:
|
||||
- 80:80
|
||||
- 2003-2004:2003-2004
|
||||
- 8126:8126
|
||||
|
||||
adhoc-1:
|
||||
image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_2.4.4:latest
|
||||
hostname: adhoc-1
|
||||
|
||||
@@ -49,7 +49,7 @@ dag_content:
|
||||
deps: third_insert
|
||||
first_validate:
|
||||
config:
|
||||
validate_hive: true
|
||||
validate_hive: false
|
||||
type: ValidateDatasetNode
|
||||
deps: first_hive_sync
|
||||
first_upsert:
|
||||
@@ -76,7 +76,7 @@ dag_content:
|
||||
deps: first_delete
|
||||
second_validate:
|
||||
config:
|
||||
validate_hive: true
|
||||
validate_hive: false
|
||||
delete_input_data: true
|
||||
type: ValidateDatasetNode
|
||||
deps: second_hive_sync
|
||||
|
||||
@@ -55,7 +55,7 @@ dag_content:
|
||||
deps: first_delete
|
||||
first_validate:
|
||||
config:
|
||||
validate_hive: true
|
||||
validate_hive: false
|
||||
type: ValidateDatasetNode
|
||||
deps: first_hive_sync
|
||||
first_cluster:
|
||||
@@ -71,6 +71,6 @@ dag_content:
|
||||
deps: first_cluster
|
||||
second_validate:
|
||||
config:
|
||||
validate_hive: true
|
||||
validate_hive: false
|
||||
type: ValidateDatasetNode
|
||||
deps: second_hive_sync
|
||||
|
||||
@@ -49,7 +49,7 @@ dag_content:
|
||||
deps: third_insert
|
||||
first_validate:
|
||||
config:
|
||||
validate_hive: true
|
||||
validate_hive: false
|
||||
type: ValidateDatasetNode
|
||||
deps: first_hive_sync
|
||||
first_upsert:
|
||||
@@ -76,7 +76,7 @@ dag_content:
|
||||
deps: first_delete
|
||||
second_validate:
|
||||
config:
|
||||
validate_hive: true
|
||||
validate_hive: false
|
||||
delete_input_data: true
|
||||
type: ValidateDatasetNode
|
||||
deps: second_hive_sync
|
||||
|
||||
@@ -49,7 +49,7 @@ dag_content:
|
||||
deps: third_insert
|
||||
first_validate:
|
||||
config:
|
||||
validate_hive: true
|
||||
validate_hive: false
|
||||
type: ValidateDatasetNode
|
||||
deps: first_hive_sync
|
||||
first_upsert:
|
||||
@@ -76,7 +76,7 @@ dag_content:
|
||||
deps: first_delete
|
||||
second_validate:
|
||||
config:
|
||||
validate_hive: true
|
||||
validate_hive: false
|
||||
delete_input_data: true
|
||||
type: ValidateDatasetNode
|
||||
deps: second_hive_sync
|
||||
|
||||
@@ -16,6 +16,37 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
usage="
|
||||
USAGE:
|
||||
$(basename "$0") [--help] [--all boolean] -- Script to generate the test suite according to arguments provided and run these test suites.
|
||||
|
||||
where:
|
||||
--help show this help text
|
||||
--all include all test suites (long, medium and cluster); takes a boolean value
|
||||
--execute_test_suite flag if test need to execute (DEFAULT- true)
|
||||
--medium_num_iterations number of medium iterations (DEFAULT- 20)
|
||||
--long_num_iterations number of long iterations (DEFAULT- 50)
|
||||
--intermittent_delay_mins delay after every test run (DEFAULT- 1)
|
||||
--table_type hoodie table type to test (DEFAULT COPY_ON_WRITE)
|
||||
--include_long_test_suite_yaml include long infra test suite (DEFAULT false)
|
||||
--include_medium_test_suite_yaml include medium infra test suite (DEFAULT false)
|
||||
--cluster_num_itr number of cluster iterations (DEFAULT 30)
|
||||
--include_cluster_yaml include cluster infra test suite (DEFAULT false)
|
||||
--input_path input path for test in docker image (DEFAULT /user/hive/warehouse/hudi-integ-test-suite/input/)
|
||||
--output_path output path for test in docker image (DEFAULT /user/hive/warehouse/hudi-integ-test-suite/output/)
|
||||
|
||||
Example:
|
||||
Note - Execute the command from within docker folder
|
||||
|
||||
1. To generate and run all test suites
|
||||
./generate_test_suite.sh --all true
|
||||
2. To only generate test suites
|
||||
./generate_test_suite.sh --all true --execute_test_suite false
|
||||
3. To run only specific test suite yaml
|
||||
./generate_test_suite.sh --execute_test_suite true --include_medium_test_suite_yaml true
|
||||
"
|
||||
|
||||
|
||||
MEDIUM_NUM_ITR=20
|
||||
LONG_NUM_ITR=50
|
||||
DELAY_MINS=1
|
||||
@@ -39,6 +70,17 @@ do
|
||||
key="$1"
|
||||
|
||||
case $key in
|
||||
--help)
|
||||
echo "$usage"
|
||||
exit
|
||||
;;
|
||||
--all)
|
||||
INCLUDE_LONG_TEST_SUITE="$2"
|
||||
INCLUDE_MEDIUM_TEST_SUITE="$2"
|
||||
INCLUDE_CLUSTER_YAML="$2"
|
||||
shift # past argument
|
||||
shift # past value
|
||||
;;
|
||||
--execute_test_suite)
|
||||
EXECUTE_TEST_SUITE="$2"
|
||||
shift # past argument
|
||||
@@ -115,12 +157,15 @@ case $key in
|
||||
;;
|
||||
*) # unknown option
|
||||
POSITIONAL+=("$1") # save it in an array for later
|
||||
echo "Unknown argument provided - '$1'"
|
||||
echo "$usage"
|
||||
exit 0
|
||||
shift # past argument
|
||||
;;
|
||||
esac
|
||||
done
|
||||
set -- "${POSITIONAL[@]}" # restore positional parameters
|
||||
|
||||
echo "$POSITIONAL"
|
||||
echo "Include Medium test suite $INCLUDE_MEDIUM_TEST_SUITE"
|
||||
if $INCLUDE_MEDIUM_TEST_SUITE ; then
|
||||
echo "Medium test suite iterations = ${MEDIUM_NUM_ITR}"
|
||||
@@ -232,7 +277,7 @@ fi
|
||||
|
||||
if $EXECUTE_TEST_SUITE ; then
|
||||
|
||||
docker cp $CUR_DIR/../packaging/hudi-integ-test-bundle/target/$JAR_NAME adhoc-2:/opt/
|
||||
docker cp $CUR_DIR/../packaging/hudi-integ-test-bundle/target/"$JAR_NAME" adhoc-2:/opt/
|
||||
docker exec -it adhoc-2 /bin/bash rm -rf /opt/staging*
|
||||
docker cp demo/config/test-suite/staging/ adhoc-2:/opt/
|
||||
docker exec -it adhoc-2 /bin/bash echo "\n============================== Executing sanity test suite ============================== "
|
||||
|
||||
@@ -177,7 +177,7 @@ cd /opt
|
||||
Copy the integration tests jar into the docker container
|
||||
|
||||
```
|
||||
docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.8.0-SNAPSHOT.jar adhoc-2:/opt
|
||||
docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar adhoc-2:/opt
|
||||
```
|
||||
|
||||
```
|
||||
@@ -214,21 +214,29 @@ spark-submit \
|
||||
--conf spark.network.timeout=600s \
|
||||
--conf spark.yarn.max.executor.failures=10 \
|
||||
--conf spark.sql.catalogImplementation=hive \
|
||||
--conf spark.driver.extraClassPath=/var/demo/jars/* \
|
||||
--conf spark.executor.extraClassPath=/var/demo/jars/* \
|
||||
--class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \
|
||||
/opt/hudi-integ-test-bundle-0.8.0-SNAPSHOT.jar \
|
||||
/opt/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar \
|
||||
--source-ordering-field test_suite_source_ordering_field \
|
||||
--use-deltastreamer \
|
||||
--target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \
|
||||
--input-base-path /user/hive/warehouse/hudi-integ-test-suite/input \
|
||||
--target-table table1 \
|
||||
--props file:/var/hoodie/ws/docker/demo/config/test-suite/test.properties \
|
||||
--schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
|
||||
--schemaprovider-class org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider \
|
||||
--source-class org.apache.hudi.utilities.sources.AvroDFSSource \
|
||||
--input-file-size 125829120 \
|
||||
--workload-yaml-path file:/var/hoodie/ws/docker/demo/config/test-suite/complex-dag-cow.yaml \
|
||||
--workload-generator-classname org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \
|
||||
--table-type COPY_ON_WRITE \
|
||||
--compact-scheduling-minshare 1
|
||||
--compact-scheduling-minshare 1 \
|
||||
--hoodie-conf hoodie.metrics.on=true \
|
||||
--hoodie-conf hoodie.metrics.reporter.type=GRAPHITE \
|
||||
--hoodie-conf hoodie.metrics.graphite.host=graphite \
|
||||
--hoodie-conf hoodie.metrics.graphite.port=2003 \
|
||||
--clean-input \
|
||||
--clean-output
|
||||
```
|
||||
|
||||
Or a Merge-on-Read job:
|
||||
@@ -253,23 +261,44 @@ spark-submit \
|
||||
--conf spark.network.timeout=600s \
|
||||
--conf spark.yarn.max.executor.failures=10 \
|
||||
--conf spark.sql.catalogImplementation=hive \
|
||||
--conf spark.driver.extraClassPath=/var/demo/jars/* \
|
||||
--conf spark.executor.extraClassPath=/var/demo/jars/* \
|
||||
--class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \
|
||||
/opt/hudi-integ-test-bundle-0.8.0-SNAPSHOT.jar \
|
||||
/opt/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar \
|
||||
--source-ordering-field test_suite_source_ordering_field \
|
||||
--use-deltastreamer \
|
||||
--target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \
|
||||
--input-base-path /user/hive/warehouse/hudi-integ-test-suite/input \
|
||||
--target-table table1 \
|
||||
--props file:/var/hoodie/ws/docker/demo/config/test-suite/test.properties \
|
||||
--schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
|
||||
--schemaprovider-class org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider \
|
||||
--source-class org.apache.hudi.utilities.sources.AvroDFSSource \
|
||||
--input-file-size 125829120 \
|
||||
--workload-yaml-path file:/var/hoodie/ws/docker/demo/config/test-suite/complex-dag-mor.yaml \
|
||||
--workload-generator-classname org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \
|
||||
--table-type MERGE_ON_READ \
|
||||
--compact-scheduling-minshare 1
|
||||
--compact-scheduling-minshare 1 \
|
||||
--hoodie-conf hoodie.metrics.on=true \
|
||||
--hoodie-conf hoodie.metrics.reporter.type=GRAPHITE \
|
||||
--hoodie-conf hoodie.metrics.graphite.host=graphite \
|
||||
--hoodie-conf hoodie.metrics.graphite.port=2003 \
|
||||
--clean-input \
|
||||
--clean-output
|
||||
```
|
||||
|
||||
## Visualize and inspect the hoodie metrics and performance (local)
|
||||
The Graphite server is already set up (and running) by ```docker/setup_demo.sh```.
|
||||
|
||||
Open a browser and access the metrics at
|
||||
```
|
||||
http://localhost:80
|
||||
```
|
||||
Dashboard
|
||||
```
|
||||
http://localhost/dashboard
|
||||
|
||||
```
|
||||
|
||||
## Running long running test suite in Local Docker environment
|
||||
|
||||
For a long-running test suite, validation has to be done differently. The idea is to run the same dag in a repeated manner for
|
||||
@@ -279,12 +308,12 @@ contents both via spark datasource and hive table via spark sql engine. Hive val
|
||||
If you have "ValidateDatasetNode" in your dag, do not replace hive jars as instructed above. Spark sql engine does not
|
||||
work well with hive2* jars. So, after running the docker setup, follow the steps below.
|
||||
```
|
||||
docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.8.0-SNAPSHOT.jar adhoc-2:/opt/
|
||||
docker cp demo/config/test-suite/test.properties adhoc-2:/opt/
|
||||
docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar adhoc-2:/opt/
|
||||
docker cp docker/demo/config/test-suite/test.properties adhoc-2:/opt/
|
||||
```
|
||||
Also copy your dag of interest to adhoc-2:/opt/
|
||||
```
|
||||
docker cp demo/config/test-suite/complex-dag-cow.yaml adhoc-2:/opt/
|
||||
docker cp docker/demo/config/test-suite/complex-dag-cow.yaml adhoc-2:/opt/
|
||||
```
|
||||
|
||||
For repeated runs, two additional configs need to be set: "dag_rounds" and "dag_intermittent_delay_mins".
|
||||
@@ -428,7 +457,7 @@ spark-submit \
|
||||
--conf spark.driver.extraClassPath=/var/demo/jars/* \
|
||||
--conf spark.executor.extraClassPath=/var/demo/jars/* \
|
||||
--class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \
|
||||
/opt/hudi-integ-test-bundle-0.8.0-SNAPSHOT.jar \
|
||||
/opt/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar \
|
||||
--source-ordering-field test_suite_source_ordering_field \
|
||||
--use-deltastreamer \
|
||||
--target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \
|
||||
@@ -446,6 +475,14 @@ spark-submit \
|
||||
--clean-output
|
||||
```
|
||||
|
||||
If you wish to enable metrics, add the properties below as well:
|
||||
```
|
||||
--hoodie-conf hoodie.metrics.on=true \
|
||||
--hoodie-conf hoodie.metrics.reporter.type=GRAPHITE \
|
||||
--hoodie-conf hoodie.metrics.graphite.host=graphite \
|
||||
--hoodie-conf hoodie.metrics.graphite.port=2003 \
|
||||
```
|
||||
|
||||
A few ready-to-use dags are available under docker/demo/config/test-suite/ that could give you an idea for long-running
|
||||
dags.
|
||||
```
|
||||
|
||||
Reference in New Issue
Block a user