From 12731f5b891b41ecbbf6f79e5b33b6eb581771e8 Mon Sep 17 00:00:00 2001 From: Sivabalan Narayanan Date: Sun, 10 Apr 2022 17:09:48 -0700 Subject: [PATCH] [HUDI-3842] Integ tests for non partitioned datasets (#5276) - Adding non-partitioned support to integ tests - Fixing some of the test yamls and properties --- ...ong-running-multi-partitions-metadata.yaml | 8 +-- ...treamer-long-running-multi-partitions.yaml | 8 +-- .../deltastreamer-medium-clustering.yaml | 8 +-- ...reamer-medium-full-dataset-validation.yaml | 8 +-- ...aml => deltastreamer-non-partitioned.yaml} | 41 +++++++----- .../test-suite/insert-overwrite-table.yaml | 2 +- ...> spark-long-running-non-partitioned.yaml} | 16 ++--- ...e-clean-archival-inline-compact.properties | 56 ++++++++++++++++ .../test-aggressive-clean-archival.properties | 8 +-- ...e-clean-archival-inline-compact.properties | 63 ++++++++++++++++++ ...ering-aggressive-clean-archival.properties | 8 +-- .../test-clustering-inline-compact.properties | 58 +++++++++++++++++ ...e-clean-archival-inline-compact.properties | 64 +++++++++++++++++++ ...adata-aggressive-clean-archival.properties | 8 +-- .../test-suite/test-clustering.properties | 8 +-- .../test-suite/test-inline-compact.properties | 53 +++++++++++++++ ...e-clean-archival-inline-compact.properties | 57 +++++++++++++++++ ...adata-aggressive-clean-archival.properties | 8 +-- .../test-metadata-inline-compact.properties | 57 +++++++++++++++++ .../test-suite/test-metadata.properties | 8 +-- ...t-nonpartitioned-inline-compact.properties | 60 +++++++++++++++++ ...itioned-metadata-inline-compact.properties | 60 +++++++++++++++++ .../test-nonpartitioned-metadata.properties | 59 +++++++++++++++++ .../test-suite/test-nonpartitioned.properties | 59 +++++++++++++++++ docker/demo/config/test-suite/test.properties | 8 +-- .../dag/nodes/BaseValidateDatasetNode.java | 4 +- .../dag/nodes/ValidateDatasetNode.java | 5 +- .../SparkDataSourceContinuousIngest.scala | 22 ++++--- 28 files changed, 742 insertions(+), 82 deletions(-) rename docker/demo/config/test-suite/{cow-spark-simple.yaml => deltastreamer-non-partitioned.yaml} (65%) rename docker/demo/config/test-suite/{cow-spark-long-running.yaml => spark-long-running-non-partitioned.yaml} (86%) create mode 100644 docker/demo/config/test-suite/test-aggressive-clean-archival-inline-compact.properties create mode 100644 docker/demo/config/test-suite/test-clustering-aggressive-clean-archival-inline-compact.properties create mode 100644 docker/demo/config/test-suite/test-clustering-inline-compact.properties create mode 100644 docker/demo/config/test-suite/test-clustering-metadata-aggressive-clean-archival-inline-compact.properties create mode 100644 docker/demo/config/test-suite/test-inline-compact.properties create mode 100644 docker/demo/config/test-suite/test-metadata-aggressive-clean-archival-inline-compact.properties create mode 100644 docker/demo/config/test-suite/test-metadata-inline-compact.properties create mode 100644 docker/demo/config/test-suite/test-nonpartitioned-inline-compact.properties create mode 100644 docker/demo/config/test-suite/test-nonpartitioned-metadata-inline-compact.properties create mode 100644 docker/demo/config/test-suite/test-nonpartitioned-metadata.properties create mode 100644 docker/demo/config/test-suite/test-nonpartitioned.properties diff --git a/docker/demo/config/test-suite/deltastreamer-long-running-multi-partitions-metadata.yaml b/docker/demo/config/test-suite/deltastreamer-long-running-multi-partitions-metadata.yaml index 57c8d0100..05f1bf855 100644 --- a/docker/demo/config/test-suite/deltastreamer-long-running-multi-partitions-metadata.yaml +++ b/docker/demo/config/test-suite/deltastreamer-long-running-multi-partitions-metadata.yaml @@ -20,7 +20,7 @@ dag_content: first_insert: config: record_size: 1000 - num_partitions_insert: 5 + num_partitions_insert: 50 repeat_count: 1 num_records_insert: 1000 type: InsertNode @@ -36,7 +36,7 @@ dag_content: third_insert: config: record_size: 1000 - num_partitions_insert: 2 + num_partitions_insert: 50 repeat_count: 1 num_records_insert: 300 deps: second_insert @@ -44,11 +44,11 @@ dag_content: first_upsert: config: record_size: 1000 - num_partitions_insert: 2 + num_partitions_insert: 50 num_records_insert: 300 repeat_count: 1 num_records_upsert: 100 - num_partitions_upsert: 1 + num_partitions_upsert: 50 type: UpsertNode deps: third_insert first_delete: diff --git a/docker/demo/config/test-suite/deltastreamer-long-running-multi-partitions.yaml b/docker/demo/config/test-suite/deltastreamer-long-running-multi-partitions.yaml index a29152bb4..410df754c 100644 --- a/docker/demo/config/test-suite/deltastreamer-long-running-multi-partitions.yaml +++ b/docker/demo/config/test-suite/deltastreamer-long-running-multi-partitions.yaml @@ -20,7 +20,7 @@ dag_content: first_insert: config: record_size: 1000 - num_partitions_insert: 5 + num_partitions_insert: 50 repeat_count: 1 num_records_insert: 1000 type: InsertNode @@ -36,7 +36,7 @@ dag_content: third_insert: config: record_size: 1000 - num_partitions_insert: 2 + num_partitions_insert: 50 repeat_count: 1 num_records_insert: 300 deps: second_insert @@ -44,11 +44,11 @@ dag_content: first_upsert: config: record_size: 1000 - num_partitions_insert: 2 + num_partitions_insert: 50 num_records_insert: 300 repeat_count: 1 num_records_upsert: 100 - num_partitions_upsert: 1 + num_partitions_upsert: 50 type: UpsertNode deps: third_insert first_delete: diff --git a/docker/demo/config/test-suite/deltastreamer-medium-clustering.yaml b/docker/demo/config/test-suite/deltastreamer-medium-clustering.yaml index 0cd4108cb..81c21a7be 100644 --- a/docker/demo/config/test-suite/deltastreamer-medium-clustering.yaml +++ b/docker/demo/config/test-suite/deltastreamer-medium-clustering.yaml @@ -23,7 +23,7 @@ dag_content: first_insert: config: record_size: 1000 - num_partitions_insert: 5 + num_partitions_insert: 50 repeat_count: 1 num_records_insert: 1000 type: InsertNode @@ -39,7 +39,7 @@ dag_content: third_insert: config: record_size: 1000 - num_partitions_insert: 2 + num_partitions_insert: 50 repeat_count: 1 num_records_insert: 300 deps: second_insert @@ -47,11 +47,11 @@ dag_content: first_upsert: config: record_size: 1000 - num_partitions_insert: 2 + num_partitions_insert: 50 num_records_insert: 300 repeat_count: 1 num_records_upsert: 100 - num_partitions_upsert: 1 + num_partitions_upsert: 50 type: UpsertNode deps: third_insert first_delete: diff --git a/docker/demo/config/test-suite/deltastreamer-medium-full-dataset-validation.yaml b/docker/demo/config/test-suite/deltastreamer-medium-full-dataset-validation.yaml index a20870f26..a2d85a7a4 100644 --- a/docker/demo/config/test-suite/deltastreamer-medium-full-dataset-validation.yaml +++ b/docker/demo/config/test-suite/deltastreamer-medium-full-dataset-validation.yaml @@ -23,7 +23,7 @@ dag_content: first_insert: config: record_size: 1000 - num_partitions_insert: 5 + num_partitions_insert: 50 repeat_count: 1 num_records_insert: 1000 type: InsertNode @@ -39,7 +39,7 @@ dag_content: third_insert: config: record_size: 1000 - num_partitions_insert: 2 + num_partitions_insert: 50 repeat_count: 1 num_records_insert: 300 deps: second_insert @@ -47,11 +47,11 @@ dag_content: first_upsert: config: record_size: 1000 - num_partitions_insert: 2 + num_partitions_insert: 50 num_records_insert: 300 repeat_count: 1 num_records_upsert: 100 - num_partitions_upsert: 1 + num_partitions_upsert: 50 type: UpsertNode deps: third_insert first_delete: diff --git a/docker/demo/config/test-suite/cow-spark-simple.yaml b/docker/demo/config/test-suite/deltastreamer-non-partitioned.yaml similarity index 65% rename from docker/demo/config/test-suite/cow-spark-simple.yaml rename to docker/demo/config/test-suite/deltastreamer-non-partitioned.yaml index 192adcf37..a8be72e10 100644 --- a/docker/demo/config/test-suite/cow-spark-simple.yaml +++ b/docker/demo/config/test-suite/deltastreamer-non-partitioned.yaml @@ -13,42 +13,51 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -dag_name: cow-spark-simple.yaml -dag_rounds: 1 -dag_intermittent_delay_mins: 1 +dag_name: deltastreamer-long-running-multi-partitions.yaml +dag_rounds: 6 +dag_intermittent_delay_mins: 0 dag_content: first_insert: config: record_size: 1000 num_partitions_insert: 1 repeat_count: 1 - num_records_insert: 100 - type: SparkInsertNode + num_records_insert: 1000 + type: InsertNode deps: none - first_validate: + second_insert: config: - validate_hive: false - type: ValidateDatasetNode + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 10000 deps: first_insert + type: InsertNode first_upsert: config: record_size: 1000 num_partitions_insert: 1 - num_records_insert: 50 + num_records_insert: 1000 repeat_count: 1 - num_records_upsert: 100 + num_records_upsert: 8000 num_partitions_upsert: 1 - type: SparkUpsertNode - deps: first_validate + type: UpsertNode + deps: second_insert first_delete: config: num_partitions_delete: 1 - num_records_delete: 30 - type: SparkDeleteNode + num_records_delete: 1000 + type: DeleteNode deps: first_upsert second_validate: config: + validate_once_every_itr : 3 validate_hive: false - delete_input_data: false + delete_input_data: true type: ValidateDatasetNode - deps: first_delete \ No newline at end of file + deps: first_delete + last_validate: + config: + execute_itr_count: 6 + type: ValidateAsyncOperations + deps: second_validate diff --git a/docker/demo/config/test-suite/insert-overwrite-table.yaml b/docker/demo/config/test-suite/insert-overwrite-table.yaml index 1a58abdcc..2251660b7 100644 --- a/docker/demo/config/test-suite/insert-overwrite-table.yaml +++ b/docker/demo/config/test-suite/insert-overwrite-table.yaml @@ -56,7 +56,7 @@ dag_content: first_insert_overwrite_table: config: record_size: 1000 - repeat_count: 10 + repeat_count: 1 num_records_insert: 10 type: SparkInsertOverwriteTableNode deps: second_upsert diff --git a/docker/demo/config/test-suite/cow-spark-long-running.yaml b/docker/demo/config/test-suite/spark-long-running-non-partitioned.yaml similarity index 86% rename from docker/demo/config/test-suite/cow-spark-long-running.yaml rename to docker/demo/config/test-suite/spark-long-running-non-partitioned.yaml index 00fea43f4..3c47729e6 100644 --- a/docker/demo/config/test-suite/cow-spark-long-running.yaml +++ b/docker/demo/config/test-suite/spark-long-running-non-partitioned.yaml @@ -14,13 +14,13 @@ # See the License for the specific language governing permissions and # limitations under the License. dag_name: cow-spark-deltastreamer-long-running-multi-partitions.yaml -dag_rounds: 30 +dag_rounds: 6 dag_intermittent_delay_mins: 0 dag_content: first_insert: config: record_size: 200 - num_partitions_insert: 50 + num_partitions_insert: 1 repeat_count: 1 num_records_insert: 10000 type: SparkInsertNode @@ -28,28 +28,28 @@ dag_content: first_upsert: config: record_size: 200 - num_partitions_insert: 50 + num_partitions_insert: 1 num_records_insert: 300 repeat_count: 1 num_records_upsert: 3000 - num_partitions_upsert: 50 + num_partitions_upsert: 1 type: SparkUpsertNode deps: first_insert first_delete: config: - num_partitions_delete: 50 - num_records_delete: 4000 + num_partitions_delete: 1 + num_records_delete: 1000 type: SparkDeleteNode deps: first_upsert second_validate: config: - validate_once_every_itr : 5 + validate_once_every_itr : 3 validate_hive: false delete_input_data: true type: ValidateDatasetNode deps: first_delete last_validate: config: - execute_itr_count: 30 + execute_itr_count: 6 type: ValidateAsyncOperations deps: second_validate diff --git a/docker/demo/config/test-suite/test-aggressive-clean-archival-inline-compact.properties b/docker/demo/config/test-suite/test-aggressive-clean-archival-inline-compact.properties new file mode 100644 index 000000000..ed6ed750a --- /dev/null +++ b/docker/demo/config/test-suite/test-aggressive-clean-archival-inline-compact.properties @@ -0,0 +1,56 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.cleaner.commits.retained=8 +hoodie.keep.min.commits=12 +hoodie.keep.max.commits=14 + +hoodie.compact.inline=true + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-aggressive-clean-archival.properties b/docker/demo/config/test-suite/test-aggressive-clean-archival.properties index 159c1f233..70d94c6fe 100644 --- a/docker/demo/config/test-suite/test-aggressive-clean-archival.properties +++ b/docker/demo/config/test-suite/test-aggressive-clean-archival.properties @@ -18,9 +18,10 @@ # under the License. # -hoodie.insert.shuffle.parallelism=100 -hoodie.upsert.shuffle.parallelism=100 -hoodie.bulkinsert.shuffle.parallelism=100 +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 hoodie.cleaner.commits.retained=8 hoodie.keep.min.commits=12 @@ -29,7 +30,6 @@ hoodie.keep.max.commits=14 hoodie.deltastreamer.source.test.num_partitions=100 hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false hoodie.deltastreamer.source.test.max_unique_records=100000000 -hoodie.embed.timeline.server=false hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector diff --git a/docker/demo/config/test-suite/test-clustering-aggressive-clean-archival-inline-compact.properties b/docker/demo/config/test-suite/test-clustering-aggressive-clean-archival-inline-compact.properties new file mode 100644 index 000000000..eac9b30b7 --- /dev/null +++ b/docker/demo/config/test-suite/test-clustering-aggressive-clean-archival-inline-compact.properties @@ -0,0 +1,63 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.compact.inline=true + +hoodie.cleaner.commits.retained=8 +hoodie.keep.min.commits=12 +hoodie.keep.max.commits=14 + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.clustering.inline=true +hoodie.clustering.inline.max.commits=4 +hoodie.clustering.plan.strategy.sort.columns=_hoodie_partition_path,_row_key +hoodie.clustering.plan.strategy.target.file.max.bytes=1073741824 +hoodie.clustering.plan.strategy.small.file.limit=629145600 +hoodie.clustering.execution.strategy.class=org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-clustering-aggressive-clean-archival.properties b/docker/demo/config/test-suite/test-clustering-aggressive-clean-archival.properties index d079536f9..4508e3982 100644 --- a/docker/demo/config/test-suite/test-clustering-aggressive-clean-archival.properties +++ b/docker/demo/config/test-suite/test-clustering-aggressive-clean-archival.properties @@ -18,9 +18,10 @@ # under the License. # -hoodie.insert.shuffle.parallelism=100 -hoodie.upsert.shuffle.parallelism=100 -hoodie.bulkinsert.shuffle.parallelism=100 +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 hoodie.cleaner.commits.retained=8 hoodie.keep.min.commits=12 @@ -29,7 +30,6 @@ hoodie.keep.max.commits=14 hoodie.deltastreamer.source.test.num_partitions=100 hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false hoodie.deltastreamer.source.test.max_unique_records=100000000 -hoodie.embed.timeline.server=false hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector diff --git a/docker/demo/config/test-suite/test-clustering-inline-compact.properties b/docker/demo/config/test-suite/test-clustering-inline-compact.properties new file mode 100644 index 000000000..b61c5bd79 --- /dev/null +++ b/docker/demo/config/test-suite/test-clustering-inline-compact.properties @@ -0,0 +1,58 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.compact.inline=true +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.clustering.inline=true +hoodie.clustering.inline.max.commits=4 +hoodie.clustering.plan.strategy.sort.columns=_hoodie_partition_path,_row_key +hoodie.clustering.plan.strategy.target.file.max.bytes=1073741824 +hoodie.clustering.plan.strategy.small.file.limit=629145600 +hoodie.clustering.execution.strategy.class=org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-clustering-metadata-aggressive-clean-archival-inline-compact.properties b/docker/demo/config/test-suite/test-clustering-metadata-aggressive-clean-archival-inline-compact.properties new file mode 100644 index 000000000..b1c872768 --- /dev/null +++ b/docker/demo/config/test-suite/test-clustering-metadata-aggressive-clean-archival-inline-compact.properties @@ -0,0 +1,64 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.cleaner.commits.retained=8 +hoodie.keep.min.commits=12 +hoodie.keep.max.commits=14 + +hoodie.compact.inline=true +hoodie.metadata.enable=true + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.clustering.inline=true +hoodie.clustering.inline.max.commits=4 +hoodie.clustering.plan.strategy.sort.columns=_hoodie_partition_path,_row_key +hoodie.clustering.plan.strategy.target.file.max.bytes=1073741824 +hoodie.clustering.plan.strategy.small.file.limit=629145600 +hoodie.clustering.execution.strategy.class=org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-clustering-metadata-aggressive-clean-archival.properties b/docker/demo/config/test-suite/test-clustering-metadata-aggressive-clean-archival.properties index 23b95f430..e286e9575 100644 --- a/docker/demo/config/test-suite/test-clustering-metadata-aggressive-clean-archival.properties +++ b/docker/demo/config/test-suite/test-clustering-metadata-aggressive-clean-archival.properties @@ -18,9 +18,10 @@ # under the License. # -hoodie.insert.shuffle.parallelism=100 -hoodie.upsert.shuffle.parallelism=100 -hoodie.bulkinsert.shuffle.parallelism=100 +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 hoodie.cleaner.commits.retained=8 hoodie.keep.min.commits=12 @@ -31,7 +32,6 @@ hoodie.metadata.enable=true hoodie.deltastreamer.source.test.num_partitions=100 hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false hoodie.deltastreamer.source.test.max_unique_records=100000000 -hoodie.embed.timeline.server=false hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector diff --git a/docker/demo/config/test-suite/test-clustering.properties b/docker/demo/config/test-suite/test-clustering.properties index 9aa4843b2..3b3242c5d 100644 --- a/docker/demo/config/test-suite/test-clustering.properties +++ b/docker/demo/config/test-suite/test-clustering.properties @@ -18,14 +18,14 @@ # under the License. # -hoodie.insert.shuffle.parallelism=100 -hoodie.upsert.shuffle.parallelism=100 -hoodie.bulkinsert.shuffle.parallelism=100 +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 hoodie.deltastreamer.source.test.num_partitions=100 hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false hoodie.deltastreamer.source.test.max_unique_records=100000000 -hoodie.embed.timeline.server=false hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector diff --git a/docker/demo/config/test-suite/test-inline-compact.properties b/docker/demo/config/test-suite/test-inline-compact.properties new file mode 100644 index 000000000..03584a14a --- /dev/null +++ b/docker/demo/config/test-suite/test-inline-compact.properties @@ -0,0 +1,53 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.metadata.enable=false +hoodie.compact.inline=true + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-metadata-aggressive-clean-archival-inline-compact.properties b/docker/demo/config/test-suite/test-metadata-aggressive-clean-archival-inline-compact.properties new file mode 100644 index 000000000..e56bfdac6 --- /dev/null +++ b/docker/demo/config/test-suite/test-metadata-aggressive-clean-archival-inline-compact.properties @@ -0,0 +1,57 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.cleaner.commits.retained=8 +hoodie.keep.min.commits=12 +hoodie.keep.max.commits=14 + +hoodie.metadata.enable=true +hoodie.compact.inline=true + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-metadata-aggressive-clean-archival.properties b/docker/demo/config/test-suite/test-metadata-aggressive-clean-archival.properties index 160da8300..c6351d634 100644 --- a/docker/demo/config/test-suite/test-metadata-aggressive-clean-archival.properties +++ b/docker/demo/config/test-suite/test-metadata-aggressive-clean-archival.properties @@ -18,9 +18,10 @@ # under the License. # -hoodie.insert.shuffle.parallelism=100 -hoodie.upsert.shuffle.parallelism=100 -hoodie.bulkinsert.shuffle.parallelism=100 +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 hoodie.cleaner.commits.retained=8 hoodie.keep.min.commits=12 @@ -31,7 +32,6 @@ hoodie.metadata.enable=true hoodie.deltastreamer.source.test.num_partitions=100 hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false hoodie.deltastreamer.source.test.max_unique_records=100000000 -hoodie.embed.timeline.server=false hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector diff --git a/docker/demo/config/test-suite/test-metadata-inline-compact.properties b/docker/demo/config/test-suite/test-metadata-inline-compact.properties new file mode 100644 index 000000000..d4bec244e --- /dev/null +++ b/docker/demo/config/test-suite/test-metadata-inline-compact.properties @@ -0,0 +1,57 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.metadata.enable=true +hoodie.compact.inline=true + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.clustering.plan.strategy.sort.columns=_row_key +hoodie.clustering.plan.strategy.daybased.lookback.partitions=0 +hoodie.clustering.inline.max.commits=1 + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-metadata.properties b/docker/demo/config/test-suite/test-metadata.properties index 48da77c51..2ac645d09 100644 --- a/docker/demo/config/test-suite/test-metadata.properties +++ b/docker/demo/config/test-suite/test-metadata.properties @@ -18,16 +18,16 @@ # under the License. # -hoodie.insert.shuffle.parallelism=100 -hoodie.upsert.shuffle.parallelism=100 -hoodie.bulkinsert.shuffle.parallelism=100 +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 hoodie.metadata.enable=true hoodie.deltastreamer.source.test.num_partitions=100 hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false hoodie.deltastreamer.source.test.max_unique_records=100000000 -hoodie.embed.timeline.server=false hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector diff --git a/docker/demo/config/test-suite/test-nonpartitioned-inline-compact.properties b/docker/demo/config/test-suite/test-nonpartitioned-inline-compact.properties new file mode 100644 index 000000000..b8ed4b62a --- /dev/null +++ b/docker/demo/config/test-suite/test-nonpartitioned-inline-compact.properties @@ -0,0 +1,60 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.metadata.enable=false +hoodie.compact.inline=true + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.NonpartitionedKeyGenerator +hoodie.datasource.write.partitionpath.field= + +hoodie.clustering.inline=true +hoodie.clustering.inline.max.commits=4 +hoodie.clustering.plan.strategy.sort.columns=_hoodie_partition_path,_row_key +hoodie.clustering.plan.strategy.target.file.max.bytes=1073741824 +hoodie.clustering.plan.strategy.small.file.limit=629145600 +hoodie.clustering.execution.strategy.class=org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-nonpartitioned-metadata-inline-compact.properties b/docker/demo/config/test-suite/test-nonpartitioned-metadata-inline-compact.properties new file mode 100644 index 000000000..2e0ca2a46 --- /dev/null +++ b/docker/demo/config/test-suite/test-nonpartitioned-metadata-inline-compact.properties @@ -0,0 +1,60 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.metadata.enable=true +hoodie.compact.inline=true + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.NonpartitionedKeyGenerator +hoodie.datasource.write.partitionpath.field= + +hoodie.clustering.inline=true +hoodie.clustering.inline.max.commits=4 +hoodie.clustering.plan.strategy.sort.columns=_hoodie_partition_path,_row_key +hoodie.clustering.plan.strategy.target.file.max.bytes=1073741824 +hoodie.clustering.plan.strategy.small.file.limit=629145600 +hoodie.clustering.execution.strategy.class=org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-nonpartitioned-metadata.properties b/docker/demo/config/test-suite/test-nonpartitioned-metadata.properties new file mode 100644 index 000000000..487825c1d --- /dev/null +++ b/docker/demo/config/test-suite/test-nonpartitioned-metadata.properties @@ -0,0 +1,59 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.metadata.enable=true + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.NonpartitionedKeyGenerator +hoodie.datasource.write.partitionpath.field= + +hoodie.clustering.inline=true +hoodie.clustering.inline.max.commits=4 +hoodie.clustering.plan.strategy.sort.columns=_hoodie_partition_path,_row_key +hoodie.clustering.plan.strategy.target.file.max.bytes=1073741824 +hoodie.clustering.plan.strategy.small.file.limit=629145600 +hoodie.clustering.execution.strategy.class=org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-nonpartitioned.properties b/docker/demo/config/test-suite/test-nonpartitioned.properties new file mode 100644 index 000000000..0d8b5250a --- /dev/null +++ b/docker/demo/config/test-suite/test-nonpartitioned.properties @@ -0,0 +1,59 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.metadata.enable=false + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.NonpartitionedKeyGenerator +hoodie.datasource.write.partitionpath.field= + +hoodie.clustering.inline=true +hoodie.clustering.inline.max.commits=4 +hoodie.clustering.plan.strategy.sort.columns=_hoodie_partition_path,_row_key +hoodie.clustering.plan.strategy.target.file.max.bytes=1073741824 +hoodie.clustering.plan.strategy.small.file.limit=629145600 +hoodie.clustering.execution.strategy.class=org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test.properties b/docker/demo/config/test-suite/test.properties index 509b9f4ba..dc743e516 100644 --- a/docker/demo/config/test-suite/test.properties +++ b/docker/demo/config/test-suite/test.properties @@ -15,16 +15,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -hoodie.insert.shuffle.parallelism=100 -hoodie.upsert.shuffle.parallelism=100 -hoodie.bulkinsert.shuffle.parallelism=100 +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 hoodie.metadata.enable=false hoodie.deltastreamer.source.test.num_partitions=100 hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false hoodie.deltastreamer.source.test.max_unique_records=100000000 -hoodie.embed.timeline.server=false hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/BaseValidateDatasetNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/BaseValidateDatasetNode.java index de58bf6a1..b5c661cb0 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/BaseValidateDatasetNode.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/BaseValidateDatasetNode.java @@ -81,7 +81,7 @@ public abstract class BaseValidateDatasetNode extends DagNode { SparkSession session = SparkSession.builder().sparkContext(context.getJsc().sc()).getOrCreate(); // todo: Fix partitioning schemes. For now, assumes data based partitioning. String inputPath = context.getHoodieTestSuiteWriter().getCfg().inputBasePath + "/*/*"; - log.warn("Validation using data from input path " + inputPath); + log.info("Validation using data from input path " + inputPath); // listing batches to be validated String inputPathStr = context.getHoodieTestSuiteWriter().getCfg().inputBasePath; if (log.isDebugEnabled()) { @@ -166,7 +166,7 @@ public abstract class BaseValidateDatasetNode extends DagNode { ExpressionEncoder encoder = getEncoder(inputDf.schema()); return inputDf.groupByKey( (MapFunction) value -> - value.getAs(partitionPathField) + "+" + value.getAs(recordKeyField), Encoders.STRING()) + (partitionPathField.isEmpty() ? value.getAs(recordKeyField) : (value.getAs(partitionPathField) + "+" + value.getAs(recordKeyField))), Encoders.STRING()) .reduceGroups((ReduceFunction) (v1, v2) -> { int ts1 = v1.getAs(SchemaUtils.SOURCE_ORDERING_FIELD); int ts2 = v2.getAs(SchemaUtils.SOURCE_ORDERING_FIELD); diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ValidateDatasetNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ValidateDatasetNode.java index cc293ea47..358abb36f 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ValidateDatasetNode.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ValidateDatasetNode.java @@ -20,6 +20,7 @@ package org.apache.hudi.integ.testsuite.dag.nodes; import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.DataSourceWriteOptions; import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config; import org.apache.hudi.integ.testsuite.dag.ExecutionContext; import org.apache.spark.sql.Dataset; @@ -48,8 +49,8 @@ public class ValidateDatasetNode extends BaseValidateDatasetNode { @Override public Dataset getDatasetToValidate(SparkSession session, ExecutionContext context, StructType inputSchema) { - String hudiPath = context.getHoodieTestSuiteWriter().getCfg().targetBasePath + "/*/*/*"; - log.info("Validate data in target hudi path " + hudiPath); + String partitionPathField = context.getWriterContext().getProps().getString(DataSourceWriteOptions.PARTITIONPATH_FIELD().key()); + String hudiPath = context.getHoodieTestSuiteWriter().getCfg().targetBasePath + (partitionPathField.isEmpty() ? "/" : "/*/*/*"); Dataset hudiDf = session.read().option(HoodieMetadataConfig.ENABLE.key(), String.valueOf(config.isEnableMetadataValidate())) .format("hudi").load(hudiPath); return hudiDf.drop(HoodieRecord.COMMIT_TIME_METADATA_FIELD).drop(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD).drop(HoodieRecord.RECORD_KEY_METADATA_FIELD) diff --git a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/SparkDataSourceContinuousIngest.scala b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/SparkDataSourceContinuousIngest.scala index 9ead7f290..635210632 100644 --- a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/SparkDataSourceContinuousIngest.scala +++ b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/SparkDataSourceContinuousIngest.scala @@ -64,16 +64,20 @@ class SparkDataSourceContinuousIngest(val spark: SparkSession, val conf: Configu } } - orderedBatch.foreach(entry => { - log.info("Consuming from batch " + entry) - val pathToConsume = new Path(sourcePath.toString + "/" + entry.getPath.getName) - val df = spark.read.format(sourceFormat).load(pathToConsume.toString) + if (orderedBatch.isEmpty) { + log.info("All batches have been consumed. Exiting.") + } else { + orderedBatch.foreach(entry => { + log.info("Consuming from batch " + entry) + val pathToConsume = new Path(sourcePath.toString + "/" + entry.getPath.getName) + val df = spark.read.format(sourceFormat).load(pathToConsume.toString) - df.write.format("hudi").options(hudiOptions).mode(SaveMode.Append).save(hudiBasePath.toString) - writeToFile(checkpointFile, entry.getPath.getName, checkPointFs) - log.info("Completed batch " + entry + ". Moving to next batch. Sleeping for " + minSyncIntervalSeconds + " secs before next batch") - Thread.sleep(minSyncIntervalSeconds * 1000) - }) + df.write.format("hudi").options(hudiOptions).mode(SaveMode.Append).save(hudiBasePath.toString) + writeToFile(checkpointFile, entry.getPath.getName, checkPointFs) + log.info("Completed batch " + entry + ". Moving to next batch. Sleeping for " + minSyncIntervalSeconds + " secs before next batch") + Thread.sleep(minSyncIntervalSeconds * 1000) + }) + } } def fetchListOfFilesToConsume(fs: FileSystem, basePath: Path, pathFilter: PathFilter): Array[FileStatus] = {