1
0

[HUDI-2267] Update docs and infra test configs, add support for graphite (#3482)

Co-authored-by: Sivabalan Narayanan <n.siva.b@gmail.com>
This commit is contained in:
Satish M
2021-09-17 19:40:15 +05:30
committed by GitHub
parent 3a150ee181
commit c7a5c8273b
7 changed files with 114 additions and 23 deletions

View File

@@ -33,7 +33,7 @@ services:
interval: 30s
timeout: 10s
retries: 3
datanode1:
image: apachehudi/hudi-hadoop_2.8.4-datanode:latest
container_name: datanode1
@@ -84,7 +84,7 @@ services:
- hive-metastore-postgresql:/var/lib/postgresql
hostname: hive-metastore-postgresql
container_name: hive-metastore-postgresql
hivemetastore:
image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3:latest
hostname: hivemetastore
@@ -221,6 +221,15 @@ services:
- ${HUDI_WS}:/var/hoodie/ws
command: worker
graphite:
container_name: graphite
hostname: graphite
image: graphiteapp/graphite-statsd
ports:
- 80:80
- 2003-2004:2003-2004
- 8126:8126
adhoc-1:
image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_2.4.4:latest
hostname: adhoc-1

View File

@@ -49,7 +49,7 @@ dag_content:
deps: third_insert
first_validate:
config:
validate_hive: true
validate_hive: false
type: ValidateDatasetNode
deps: first_hive_sync
first_upsert:
@@ -76,7 +76,7 @@ dag_content:
deps: first_delete
second_validate:
config:
validate_hive: true
validate_hive: false
delete_input_data: true
type: ValidateDatasetNode
deps: second_hive_sync

View File

@@ -55,7 +55,7 @@ dag_content:
deps: first_delete
first_validate:
config:
validate_hive: true
validate_hive: false
type: ValidateDatasetNode
deps: first_hive_sync
first_cluster:
@@ -71,6 +71,6 @@ dag_content:
deps: first_cluster
second_validate:
config:
validate_hive: true
validate_hive: false
type: ValidateDatasetNode
deps: second_hive_sync

View File

@@ -49,7 +49,7 @@ dag_content:
deps: third_insert
first_validate:
config:
validate_hive: true
validate_hive: false
type: ValidateDatasetNode
deps: first_hive_sync
first_upsert:
@@ -76,7 +76,7 @@ dag_content:
deps: first_delete
second_validate:
config:
validate_hive: true
validate_hive: false
delete_input_data: true
type: ValidateDatasetNode
deps: second_hive_sync

View File

@@ -49,7 +49,7 @@ dag_content:
deps: third_insert
first_validate:
config:
validate_hive: true
validate_hive: false
type: ValidateDatasetNode
deps: first_hive_sync
first_upsert:
@@ -76,7 +76,7 @@ dag_content:
deps: first_delete
second_validate:
config:
validate_hive: true
validate_hive: false
delete_input_data: true
type: ValidateDatasetNode
deps: second_hive_sync

View File

@@ -16,6 +16,37 @@
# See the License for the specific language governing permissions and
# limitations under the License.
usage="
USAGE:
$(basename "$0") [--help] [--all boolean] -- Script to generate the test suite according to arguments provided and run these test suites.
where:
--help show this help text
--all include and run all test suite yamls (long, medium and cluster)
--execute_test_suite flag if test need to execute (DEFAULT- true)
--medium_num_iterations number of medium iterations (DEFAULT- 20)
--long_num_iterations number of long iterations (DEFAULT- 30)
--intermittent_delay_mins delay after every test run (DEFAULT- 1)
--table_type hoodie table type to test (DEFAULT COPY_ON_WRITE)
--include_long_test_suite_yaml include long infra test suite (DEFAULT false)
--include_medium_test_suite_yaml include medium infra test suite (DEFAULT false)
--cluster_num_itr number of cluster iterations (DEFAULT 30)
--include_cluster_yaml include cluster infra test suite (DEFAULT false)
--input_path input path for test in docker image (DEFAULT /user/hive/warehouse/hudi-integ-test-suite/input/)
--output_path output path for test in docker image (DEFAULT /user/hive/warehouse/hudi-integ-test-suite/output/)
Example:
Note - Execute the command from within docker folder
1. To generate and run all test suites
./generate_test_suite.sh --all true
2. To only generate test suites
./generate_test_suite.sh --all true --execute_test_suite false
3. To run only specific test suite yaml
./generate_test_suite.sh --execute_test_suite true --include_medium_test_suite_yaml true
"
MEDIUM_NUM_ITR=20
LONG_NUM_ITR=50
DELAY_MINS=1
@@ -39,6 +70,17 @@ do
key="$1"
case $key in
--help)
echo "$usage"
exit
;;
--all)
INCLUDE_LONG_TEST_SUITE="$2"
INCLUDE_MEDIUM_TEST_SUITE="$2"
INCLUDE_CLUSTER_YAML="$2"
shift # past argument
shift # past value
;;
--execute_test_suite)
EXECUTE_TEST_SUITE="$2"
shift # past argument
@@ -115,12 +157,15 @@ case $key in
;;
*) # unknown option
POSITIONAL+=("$1") # save it in an array for later
echo "Unknown argument provided - '$1'"
echo "$usage"
exit 0
shift # past argument
;;
esac
done
set -- "${POSITIONAL[@]}" # restore positional parameters
echo "$POSITIONAL"
echo "Include Medium test suite $INCLUDE_MEDIUM_TEST_SUITE"
if $INCLUDE_MEDIUM_TEST_SUITE ; then
echo "Medium test suite iterations = ${MEDIUM_NUM_ITR}"
@@ -232,7 +277,7 @@ fi
if $EXECUTE_TEST_SUITE ; then
docker cp $CUR_DIR/../packaging/hudi-integ-test-bundle/target/$JAR_NAME adhoc-2:/opt/
docker cp $CUR_DIR/../packaging/hudi-integ-test-bundle/target/"$JAR_NAME" adhoc-2:/opt/
docker exec -it adhoc-2 /bin/bash rm -rf /opt/staging*
docker cp demo/config/test-suite/staging/ adhoc-2:/opt/
docker exec -it adhoc-2 /bin/bash echo "\n============================== Executing sanity test suite ============================== "

View File

@@ -177,7 +177,7 @@ cd /opt
Copy the integration tests jar into the docker container
```
docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.8.0-SNAPSHOT.jar adhoc-2:/opt
docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar adhoc-2:/opt
```
```
@@ -214,21 +214,29 @@ spark-submit \
--conf spark.network.timeout=600s \
--conf spark.yarn.max.executor.failures=10 \
--conf spark.sql.catalogImplementation=hive \
--conf spark.driver.extraClassPath=/var/demo/jars/* \
--conf spark.executor.extraClassPath=/var/demo/jars/* \
--class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \
/opt/hudi-integ-test-bundle-0.8.0-SNAPSHOT.jar \
/opt/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar \
--source-ordering-field test_suite_source_ordering_field \
--use-deltastreamer \
--target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \
--input-base-path /user/hive/warehouse/hudi-integ-test-suite/input \
--target-table table1 \
--props file:/var/hoodie/ws/docker/demo/config/test-suite/test.properties \
--schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
--schemaprovider-class org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider \
--source-class org.apache.hudi.utilities.sources.AvroDFSSource \
--input-file-size 125829120 \
--workload-yaml-path file:/var/hoodie/ws/docker/demo/config/test-suite/complex-dag-cow.yaml \
--workload-generator-classname org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \
--table-type COPY_ON_WRITE \
--compact-scheduling-minshare 1
--compact-scheduling-minshare 1 \
--hoodie-conf hoodie.metrics.on=true \
--hoodie-conf hoodie.metrics.reporter.type=GRAPHITE \
--hoodie-conf hoodie.metrics.graphite.host=graphite \
--hoodie-conf hoodie.metrics.graphite.port=2003 \
--clean-input \
--clean-output
```
Or a Merge-on-Read job:
@@ -253,23 +261,44 @@ spark-submit \
--conf spark.network.timeout=600s \
--conf spark.yarn.max.executor.failures=10 \
--conf spark.sql.catalogImplementation=hive \
--conf spark.driver.extraClassPath=/var/demo/jars/* \
--conf spark.executor.extraClassPath=/var/demo/jars/* \
--class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \
/opt/hudi-integ-test-bundle-0.8.0-SNAPSHOT.jar \
/opt/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar \
--source-ordering-field test_suite_source_ordering_field \
--use-deltastreamer \
--target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \
--input-base-path /user/hive/warehouse/hudi-integ-test-suite/input \
--target-table table1 \
--props file:/var/hoodie/ws/docker/demo/config/test-suite/test.properties \
--schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
--schemaprovider-class org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider \
--source-class org.apache.hudi.utilities.sources.AvroDFSSource \
--input-file-size 125829120 \
--workload-yaml-path file:/var/hoodie/ws/docker/demo/config/test-suite/complex-dag-mor.yaml \
--workload-generator-classname org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \
--table-type MERGE_ON_READ \
--compact-scheduling-minshare 1
--compact-scheduling-minshare 1 \
--hoodie-conf hoodie.metrics.on=true \
--hoodie-conf hoodie.metrics.reporter.type=GRAPHITE \
--hoodie-conf hoodie.metrics.graphite.host=graphite \
--hoodie-conf hoodie.metrics.graphite.port=2003 \
--clean-input \
--clean-output
```
## Visualize and inspect the hoodie metrics and performance (local)
Graphite server is already setup (and up) in ```docker/setup_demo.sh```.
Open browser and access metrics at
```
http://localhost:80
```
Dashboard
```
http://localhost/dashboard
```
## Running long running test suite in Local Docker environment
For long running test suite, validation has to be done differently. Idea is to run same dag in a repeated manner for
@@ -279,12 +308,12 @@ contents both via spark datasource and hive table via spark sql engine. Hive val
If you have "ValidateDatasetNode" in your dag, do not replace hive jars as instructed above. Spark sql engine does not
go well w/ hive2* jars. So, after running docker setup, follow the below steps.
```
docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.8.0-SNAPSHOT.jar adhoc-2:/opt/
docker cp demo/config/test-suite/test.properties adhoc-2:/opt/
docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar adhoc-2:/opt/
docker cp docker/demo/config/test-suite/test.properties adhoc-2:/opt/
```
Also copy your dag of interest to adhoc-2:/opt/
```
docker cp demo/config/test-suite/complex-dag-cow.yaml adhoc-2:/opt/
docker cp docker/demo/config/test-suite/complex-dag-cow.yaml adhoc-2:/opt/
```
For repeated runs, two additional configs need to be set. "dag_rounds" and "dag_intermittent_delay_mins".
@@ -428,7 +457,7 @@ spark-submit \
--conf spark.driver.extraClassPath=/var/demo/jars/* \
--conf spark.executor.extraClassPath=/var/demo/jars/* \
--class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \
/opt/hudi-integ-test-bundle-0.8.0-SNAPSHOT.jar \
/opt/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar \
--source-ordering-field test_suite_source_ordering_field \
--use-deltastreamer \
--target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \
@@ -446,6 +475,14 @@ spark-submit \
--clean-output
```
If you wish to enable metrics add below properties as well
```
--hoodie-conf hoodie.metrics.on=true \
--hoodie-conf hoodie.metrics.reporter.type=GRAPHITE \
--hoodie-conf hoodie.metrics.graphite.host=graphite \
--hoodie-conf hoodie.metrics.graphite.port=2003 \
```
Few ready to use dags are available under docker/demo/config/test-suite/ that could give you an idea for long running
dags.
```