diff --git a/.travis.yml b/.travis.yml index 1454c6dc7..67fa0071f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -27,7 +27,7 @@ jobs: - name: "Functional tests" env: MODE=functional HUDI_QUIETER_LOGGING=1 - name: "Integration tests" - env: MODE=integration + env: MODE=integration HUDI_QUIETER_LOGGING=1 install: true services: - docker diff --git a/docker/demo/config/test-suite/base.properties b/docker/demo/config/test-suite/base.properties new file mode 100644 index 000000000..13b1acc8e --- /dev/null +++ b/docker/demo/config/test-suite/base.properties @@ -0,0 +1,21 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +hoodie.upsert.shuffle.parallelism=2 +hoodie.insert.shuffle.parallelism=2 +hoodie.bulkinsert.shuffle.parallelism=2 +hoodie.datasource.write.partitionpath.field=timestamp diff --git a/docker/demo/config/test-suite/complex-dag-cow.yaml b/docker/demo/config/test-suite/complex-dag-cow.yaml new file mode 100644 index 000000000..5a97688fb --- /dev/null +++ b/docker/demo/config/test-suite/complex-dag-cow.yaml @@ -0,0 +1,97 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +first_insert: + config: + record_size: 70000 + num_insert_partitions: 1 + repeat_count: 5 + num_records_insert: 1000 + type: InsertNode + deps: none +second_insert: + config: + record_size: 70000 + num_insert_partitions: 1 + repeat_count: 5 + num_records_insert: 10000 + deps: first_insert + type: InsertNode +third_insert: + config: + record_size: 70000 + num_insert_partitions: 1 + repeat_count: 2 + num_records_insert: 300 + deps: second_insert + type: InsertNode +first_rollback: + config: + deps: third_insert + type: RollbackNode +first_upsert: + config: + record_size: 70000 + num_insert_partitions: 1 + num_records_insert: 300 + repeat_count: 5 + num_records_upsert: 100 + num_upsert_partitions: 10 + type: UpsertNode + deps: first_rollback +first_hive_sync: + config: + queue_name: "adhoc" + engine: "mr" + type: HiveSyncNode + deps: first_upsert +first_hive_query: + config: + hive_props: + prop1: "set hive.execution.engine=spark" + prop2: "set spark.yarn.queue=" + prop3: "set hive.strict.checks.large.query=false" + prop4: "set hive.stats.autogather=false" + hive_queries: + query1: "select count(*) from testdb1.table1 group by `_row_key` having count(*) > 1" + result1: 0 + query2: "select count(*) from testdb1.table1" + result2: 22100000 + type: HiveQueryNode + deps: first_hive_sync 
+second_upsert: + config: + record_size: 70000 + num_insert_partitions: 1 + num_records_insert: 300 + repeat_count: 5 + num_records_upsert: 100 + num_upsert_partitions: 10 + type: UpsertNode + deps: first_hive_query +second_hive_query: + config: + hive_props: + prop1: "set hive.execution.engine=mr" + prop2: "set mapred.job.queue.name=" + prop3: "set hive.strict.checks.large.query=false" + prop4: "set hive.stats.autogather=false" + hive_queries: + query1: "select count(*) from testdb1.table1 group by `_row_key` having count(*) > 1" + result1: 0 + query2: "select count(*) from testdb1.table1" + result2: 22100 + type: HiveQueryNode + deps: second_upsert \ No newline at end of file diff --git a/docker/demo/config/test-suite/complex-dag-mor.yaml b/docker/demo/config/test-suite/complex-dag-mor.yaml new file mode 100644 index 000000000..3981603e6 --- /dev/null +++ b/docker/demo/config/test-suite/complex-dag-mor.yaml @@ -0,0 +1,119 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+first_insert: + config: + record_size: 70000 + num_insert_partitions: 1 + repeat_count: 5 + num_records_insert: 100 + type: InsertNode + deps: none +second_insert: + config: + record_size: 70000 + num_insert_partitions: 1 + repeat_count: 5 + num_records_insert: 100 + deps: first_insert + type: InsertNode +third_insert: + config: + record_size: 70000 + num_insert_partitions: 1 + repeat_count: 2 + num_records_insert: 300 + deps: second_insert + type: InsertNode +first_rollback: + config: + deps: third_insert + type: RollbackNode +first_upsert: + config: + record_size: 70000 + num_insert_partitions: 1 + num_records_insert: 300 + repeat_count: 5 + num_records_upsert: 100 + num_upsert_partitions: 10 + type: UpsertNode + deps: first_rollback +first_hive_sync: + config: + queue_name: "adhoc" + engine: "mr" + type: HiveSyncNode + deps: first_upsert +first_hive_query: + config: + queue_name: "adhoc" + engine: "mr" + type: HiveQueryNode + deps: first_hive_sync +second_upsert: + config: + record_size: 70000 + num_insert_partitions: 1 + num_records_insert: 300 + repeat_count: 5 + num_records_upsert: 100 + num_upsert_partitions: 10 + type: UpsertNode + deps: first_hive_query +second_hive_query: + config: + queue_name: "adhoc" + engine: "mr" + hive_queries: + query1: "select count(*) from testdb.table1 group by `_row_key` having count(*) > 1" + result1: 0 + query2: "select count(*) from testdb.table1" + result2: 3100 + query3: "select count(*) from testdb.table1_rt group by `_row_key` having count(*) > 1" + result3: 0 + query4: "select count(*) from testdb.table1_rt" + result4: 3100 + type: HiveQueryNode + deps: second_upsert +first_schedule_compact: + config: + type: ScheduleCompactNode + deps: second_hive_query +third_upsert: + config: + record_size: 70000 + num_insert_partitions: 1 + num_records_insert: 300 + repeat_count: 5 + num_records_upsert: 100 + num_upsert_partitions: 10 + type: UpsertNode + deps: first_schedule_compact +first_compact: + config: + type: CompactNode + 
deps: first_schedule_compact +third_hive_query: + config: + queue_name: "adhoc" + engine: "mr" + hive_queries: + query1: "select count(*) from testdb.table1 group by `_row_key` having count(*) > 1" + result1: 0 + query2: "select count(*) from testdb.table1" + result2: 2210 + type: HiveQueryNode + deps: second_upsert \ No newline at end of file diff --git a/docker/demo/config/test-suite/complex-source.avsc b/docker/demo/config/test-suite/complex-source.avsc new file mode 100644 index 000000000..a202d2d87 --- /dev/null +++ b/docker/demo/config/test-suite/complex-source.avsc @@ -0,0 +1,466 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +{ + "name": "COMPLEX", + "fields": [ + { + "default": null, + "type": [ + "null", + { + "items": "string", + "type": "array" + } + ], + "name": "array_of_string_fields1" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field1" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field2" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field3" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field4" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field5" + }, + { + "default": null, + "type": [ + "null", + "boolean" + ], + "name": "boolean_field1" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field6" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field7" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field8" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field9" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field10" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field11" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field12" + }, + { + "default": null, + "type": [ + "null", + "double" + ], + "name": "double_field1" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field13" + }, + { + "default": null, + "type": [ + "null", + "long" + ], + "name": "long_field1" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field14" + }, + { + "default": null, + "type": [ + "null", + "long" + ], + "name": "long_field2" + }, + { + "default": null, + "type": [ + "null", + { + "items": { + "fields": [ + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field15" 
+ }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field16" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field17" + }, + { + "default": null, + "type": [ + "null", + "long" + ], + "name": "long_field3" + }, + { + "default": null, + "type": [ + "null", + "long" + ], + "name": "long_field4" + }, + { + "default": null, + "type": [ + "null", + "double" + ], + "name": "double_field2" + }, + { + "default": null, + "type": [ + "null", + "double" + ], + "name": "double_field3" + } + ], + "type": "record", + "name": "record_field1" + }, + "type": "array" + } + ], + "name": "record_name1" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field18" + }, + { + "default": null, + "type": [ + "null", + "long" + ], + "name": "long_field5" + }, + { + "default": null, + "type": [ + "null", + "double" + ], + "name": "double_field4" + }, + { + "default": null, + "type": [ + "null", + "double" + ], + "name": "double_field5" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field19" + }, + { + "default": null, + "type": [ + "null", + "long" + ], + "name": "long_field6" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field20" + }, + { + "default": null, + "type": [ + "null", + "long" + ], + "name": "long_field7" + }, + { + "default": null, + "type": [ + "null", + "double" + ], + "name": "double_field6" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field21" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field22" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field23" + }, + { + "default": null, + "type": [ + "null", + "long" + ], + "name": "long_field8" + }, + { + "default": null, + "type": [ + "null", + "double" + ], + "name": "double_field7" + }, + { + "default": null, + "type": [ + "null", + "string" + 
], + "name": "string_field24" + }, + { + "default": null, + "type": [ + "null", + "long" + ], + "name": "long_field10" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field25" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field26" + }, + { + "default": null, + "type": [ + "null", + "long" + ], + "name": "long_field11" + }, + { + "default": null, + "type": [ + "null", + "boolean" + ], + "name": "boolean_field3" + }, + { + "default": null, + "type": [ + "null", + "long" + ], + "name": "long_field12" + }, + { + "default": null, + "type": [ + "null", + "double" + ], + "name": "double_field8" + }, + { + "default": null, + "type": [ + "null", + "long" + ], + "name": "long_field13" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field27" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field28" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field29" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field30" + } + ], + "type": "record" +} \ No newline at end of file diff --git a/docker/demo/config/test-suite/source.avsc b/docker/demo/config/test-suite/source.avsc new file mode 100644 index 000000000..c2a8d7c2a --- /dev/null +++ b/docker/demo/config/test-suite/source.avsc @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +{ + "type" : "record", + "name" : "triprec", + "fields" : [ + { + "name" : "timestamp", + "type" : "long" + }, { + "name" : "_row_key", + "type" : "string" + }, { + "name" : "rider", + "type" : "string" + }, { + "name" : "driver", + "type" : "string" + }, { + "name" : "begin_lat", + "type" : "double" + }, { + "name" : "begin_lon", + "type" : "double" + }, { + "name" : "end_lat", + "type" : "double" + }, { + "name" : "end_lon", + "type" : "double" + }, { + "name" : "fare", + "type" : "double" + } ] +} + diff --git a/docker/demo/config/test-suite/target.avsc b/docker/demo/config/test-suite/target.avsc new file mode 100644 index 000000000..11e23a493 --- /dev/null +++ b/docker/demo/config/test-suite/target.avsc @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +{ + "type" : "record", + "name" : "triprec", + "fields" : [ + { + "name" : "timestamp", + "type" : "double" + }, { + "name" : "_row_key", + "type" : "string" + }, { + "name" : "rider", + "type" : "string" + }, { + "name" : "driver", + "type" : "string" + }, { + "name" : "begin_lat", + "type" : "double" + }, { + "name" : "begin_lon", + "type" : "double" + }, { + "name" : "end_lat", + "type" : "double" + }, { + "name" : "end_lon", + "type" : "double" + }, { + "name" : "fare", + "type" : "double" + }, { + "name" : "haversine_distance", + "type" : "double" + }] +} + diff --git a/docker/demo/config/test-suite/test-source.properties b/docker/demo/config/test-suite/test-source.properties new file mode 100644 index 000000000..397f871c8 --- /dev/null +++ b/docker/demo/config/test-suite/test-source.properties @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+hoodie.datasource.write.recordkey.field=_row_key +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-bench/input +hoodie.datasource.write.keygenerator.class=org.apache.hudi.ComplexKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp +hoodie.deltastreamer.schemaprovider.source.schema.file=/var/hoodie/ws/docker/demo/config/bench/source.avsc +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=test_table +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.NonPartitionedExtractor +hoodie.datasource.hive_sync.assume_date_partitioning=true +hoodie.datasource.write.keytranslator.class=org.apache.hudi.DayBasedPartitionPathKeyTranslator +hoodie.deltastreamer.schemaprovider.target.schema.file=/var/hoodie/ws/docker/demo/config/bench/source.avsc \ No newline at end of file diff --git a/docker/hoodie/hadoop/hive_base/Dockerfile b/docker/hoodie/hadoop/hive_base/Dockerfile index b8452d618..8d85fd5b5 100644 --- a/docker/hoodie/hadoop/hive_base/Dockerfile +++ b/docker/hoodie/hadoop/hive_base/Dockerfile @@ -55,6 +55,7 @@ ENV HUDI_HADOOP_BUNDLE=/var/hoodie/ws/docker/hoodie/hadoop/hive_base/target/hood ENV HUDI_HIVE_SYNC_BUNDLE=/var/hoodie/ws/docker/hoodie/hadoop/hive_base/target/hoodie-hive-sync-bundle.jar ENV HUDI_SPARK_BUNDLE=/var/hoodie/ws/docker/hoodie/hadoop/hive_base/target/hoodie-spark-bundle.jar ENV HUDI_UTILITIES_BUNDLE=/var/hoodie/ws/docker/hoodie/hadoop/hive_base/target/hoodie-utilities.jar +ENV HUDI_INTEG_TEST_BUNDLE=/var/hoodie/ws/docker/hoodie/hadoop/hive_base/target/hoodie-integ-test-bundle.jar COPY startup.sh /usr/local/bin/ RUN chmod +x /usr/local/bin/startup.sh diff --git a/docker/hoodie/hadoop/hive_base/pom.xml b/docker/hoodie/hadoop/hive_base/pom.xml index 1a632eb8e..399f7b713 100644 --- a/docker/hoodie/hadoop/hive_base/pom.xml +++ b/docker/hoodie/hadoop/hive_base/pom.xml @@ -59,9 +59,7 @@ - + diff --git 
a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestDataGenerator.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestDataGenerator.java index e23d88586..ed7a4582e 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestDataGenerator.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestDataGenerator.java @@ -68,7 +68,6 @@ import java.util.stream.Collectors; import java.util.stream.IntStream; import java.util.stream.Stream; - /** * Class to be used in tests to keep generating test inserts and updates against a corpus. *

@@ -98,7 +97,8 @@ public class HoodieTestDataGenerator { + "{\"name\": \"amount\",\"type\": \"double\"},{\"name\": \"currency\", \"type\": \"string\"}]}},"; public static final String FARE_FLATTENED_SCHEMA = "{\"name\": \"fare\", \"type\": \"double\"}," + "{\"name\": \"currency\", \"type\": \"string\"},"; - public static final String TIP_NESTED_SCHEMA = "{\"name\": \"tip_history\", \"type\": {\"type\": \"array\", \"items\": {\"type\": \"record\", \"name\": \"tip_history\", \"fields\": [" + public static final String TIP_NESTED_SCHEMA = "{\"name\": \"tip_history\", \"default\": null, \"type\": {\"type\": " + + "\"array\", \"items\": {\"type\": \"record\", \"default\": null, \"name\": \"tip_history\", \"fields\": [" + "{\"name\": \"amount\", \"type\": \"double\"}, {\"name\": \"currency\", \"type\": \"string\"}]}}},"; public static final String MAP_TYPE_SCHEMA = "{\"name\": \"city_to_state\", \"type\": {\"type\": \"map\", \"values\": \"string\"}},"; public static final String EXTRA_TYPE_SCHEMA = "{\"name\": \"distance_in_meters\", \"type\": \"int\"}," @@ -254,7 +254,6 @@ public class HoodieTestDataGenerator { rec.put("begin_lon", RAND.nextDouble()); rec.put("end_lat", RAND.nextDouble()); rec.put("end_lon", RAND.nextDouble()); - if (isFlattened) { rec.put("fare", RAND.nextDouble() * 100); rec.put("currency", "USD"); @@ -730,7 +729,7 @@ public class HoodieTestDataGenerator { public boolean deleteExistingKeyIfPresent(HoodieKey key) { Map existingKeys = existingKeysBySchema.get(TRIP_EXAMPLE_SCHEMA); Integer numExistingKeys = numKeysBySchema.get(TRIP_EXAMPLE_SCHEMA); - for (Map.Entry entry: existingKeys.entrySet()) { + for (Map.Entry entry : existingKeys.entrySet()) { if (entry.getValue().key.equals(key)) { int index = entry.getKey(); existingKeys.put(index, existingKeys.get(numExistingKeys - 1)); @@ -740,10 +739,18 @@ public class HoodieTestDataGenerator { return true; } } - return false; } + public List generateGenericRecords(int numRecords) { + List list = new 
ArrayList<>(); + IntStream.range(0, numRecords).forEach(i -> { + list.add(generateGenericRecord(UUID.randomUUID().toString(), UUID.randomUUID().toString(), UUID.randomUUID() + .toString(), RAND.nextDouble())); + }); + return list; + } + public String[] getPartitionPaths() { return partitionPaths; } diff --git a/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncTool.java b/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncTool.java index 877ba4791..d8292526e 100644 --- a/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncTool.java +++ b/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncTool.java @@ -112,6 +112,15 @@ public class HiveSyncTool { + " of type " + hoodieHiveClient.getTableType()); // Check if the necessary table exists boolean tableExists = hoodieHiveClient.doesTableExist(tableName); + + // check if the database exists else create it + try { + hoodieHiveClient.updateHiveSQL("create database if not exists " + cfg.databaseName); + } catch (Exception e) { + // this is harmless since table creation will fail anyways, creation of DB is needed for in-memory testing + LOG.warn("Unable to create database", e); + } + // Get the parquet schema for this table looking at the latest commit MessageType schema = hoodieHiveClient.getDataSchema(); // Sync schema if needed diff --git a/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HoodieHiveClient.java b/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HoodieHiveClient.java index f1034e3cd..2bb3fd112 100644 --- a/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HoodieHiveClient.java +++ b/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HoodieHiveClient.java @@ -572,4 +572,8 @@ public class HoodieHiveClient { return new PartitionEvent(PartitionEventType.UPDATE, storagePartition); } } + + public IMetaStoreClient getClient() { + return client; + } } \ No newline at end of file diff --git a/hudi-hive-sync/src/test/java/org/apache/hudi/hive/testutils/HiveTestService.java 
b/hudi-hive-sync/src/test/java/org/apache/hudi/hive/testutils/HiveTestService.java index 2f9880304..51a5bf5ed 100644 --- a/hudi-hive-sync/src/test/java/org/apache/hudi/hive/testutils/HiveTestService.java +++ b/hudi-hive-sync/src/test/java/org/apache/hudi/hive/testutils/HiveTestService.java @@ -126,10 +126,21 @@ public class HiveTestService { public void stop() { resetSystemProperties(); if (tServer != null) { - tServer.stop(); + try { + tServer.stop(); + } catch (Exception e) { + LOG.error("Stop meta store failed", e); + } } if (hiveServer != null) { - hiveServer.stop(); + try { + hiveServer.stop(); + } catch (Exception e) { + LOG.error("Stop hive server failed", e); + } + } + if (executorService != null) { + executorService.shutdownNow(); } LOG.info("Hive Minicluster service shut down."); tServer = null; diff --git a/hudi-integ-test/README.md b/hudi-integ-test/README.md new file mode 100644 index 000000000..d87fec3ad --- /dev/null +++ b/hudi-integ-test/README.md @@ -0,0 +1,262 @@ + + +This page describes in detail how to run end to end tests on a hudi dataset that helps in improving our confidence +in a release as well as perform large scale performance benchmarks. + +# Objectives + +1. Test with different versions of core libraries and components such as `hdfs`, `parquet`, `spark`, +`hive` and `avro`. +2. Generate different types of workloads across different dimensions such as `payload size`, `number of updates`, +`number of inserts`, `number of partitions` +3. Perform multiple types of operations such as `insert`, `bulk_insert`, `upsert`, `compact`, `query` +4. Support custom post process actions and validations + +# High Level Design + +The Hudi test suite runs as a long running spark job. The suite is divided into the following high level components : + +## Workload Generation + +This component does the work of generating the workload; `inserts`, `upserts` etc. 
+ +## Workload Scheduling + +Depending on the type of workload generated, data is either ingested into the target hudi +dataset or the corresponding workload operation is executed. For example compaction does not necessarily need a workload +to be generated/ingested but can require an execution. + +## Other actions/operations + +The test suite supports different types of operations besides ingestion such as Hive Query execution, Clean action etc. + +# Usage instructions + + +## Entry class to the test suite + +``` +org.apache.hudi.testsuite.HoodieTestSuiteJob.java - Entry Point of the hudi test suite job. This +class wraps all the functionalities required to run a configurable integration suite. +``` + +## Configurations required to run the job +``` +org.apache.hudi.testsuite.HoodieTestSuiteJob.HoodieTestSuiteConfig - Config class that drives the behavior of the +integration test suite. This class extends from com.uber.hoodie.utilities.DeltaStreamerConfig. Look at +link#HudiDeltaStreamer page to learn about all the available configs applicable to your test suite. +``` + +## Generating a custom Workload Pattern + +There are 2 ways to generate a workload pattern + + 1.Programmatically + +Choose to write up the entire DAG of operations programmatically, take a look at `WorkflowDagGenerator` class. +Once you're ready with the DAG you want to execute, simply pass the class name as follows: + +``` +spark-submit +... +... +--class org.apache.hudi.testsuite.HoodieTestSuiteJob +--workload-generator-classname org.apache.hudi.testsuite.dag.scheduler. +... +``` + + 2.YAML file + +Choose to write up the entire DAG of operations in YAML, take a look at `complex-workload-dag-cow.yaml` or +`complex-workload-dag-mor.yaml`. +Once you're ready with the DAG you want to execute, simply pass the yaml file path as follows: + +``` +spark-submit +... +... +--class org.apache.hudi.testsuite.HoodieTestSuiteJob +--workload-yaml-path /path/to/your-workflow-dag.yaml +...
+``` + +## Building the test suite + +The test suite can be found in the `hudi-integ-test` module. Use the `prepare_integration_suite.sh` script to +build +the test suite, you can provide different parameters to the script. + +``` +shell$ ./prepare_integration_suite.sh --help +Usage: prepare_integration_suite.sh + --spark-command, prints the spark command + -h, hdfs-version + -s, spark version + -p, parquet version + -a, avro version + -s, hive version +``` + +``` +shell$ ./prepare_integration_suite.sh +.... +.... +Final command : mvn clean install -DskipTests +``` + +## Running on the cluster or in your local machine +Copy over the necessary files and jars that are required to your cluster and then run the following spark-submit +command after replacing the correct values for the parameters. +NOTE : The properties-file should have all the necessary information required to ingest into a Hudi dataset. For more + information on what properties need to be set, take a look at the test suite section under demo steps. +``` +shell$ ./prepare_integration_suite.sh --spark-command +spark-submit --packages com.databricks:spark-avro_2.11:4.0.0 --master prepare_integration_suite.sh --deploy-mode +--properties-file --class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob target/hudi-integ-test-0.6 +.0-SNAPSHOT.jar --source-class --source-ordering-field --input-base-path --target-base-path --target-table --props --storage-type --payload-class --workload-yaml-path --input-file-size -- +``` + +## Running through a test-case (local) +Take a look at the `TestHoodieTestSuiteJob` to check how you can run the entire suite using JUnit. 
+ +## Running an end to end test suite in Local Docker environment + +``` +docker exec -it adhoc-2 /bin/bash +# COPY_ON_WRITE tables +========================= +## Run the following command to start the test suite +spark-submit \ +--packages com.databricks:spark-avro_2.11:4.0.0 \ +--conf spark.task.cpus=1 \ +--conf spark.executor.cores=1 \ +--conf spark.task.maxFailures=100 \ +--conf spark.memory.fraction=0.4 \ +--conf spark.rdd.compress=true \ +--conf spark.kryoserializer.buffer.max=2000m \ +--conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ +--conf spark.memory.storageFraction=0.1 \ +--conf spark.shuffle.service.enabled=true \ +--conf spark.sql.hive.convertMetastoreParquet=false \ +--conf spark.ui.port=5555 \ +--conf spark.driver.maxResultSize=12g \ +--conf spark.executor.heartbeatInterval=120s \ +--conf spark.network.timeout=600s \ +--conf spark.eventLog.overwrite=true \ +--conf spark.eventLog.enabled=true \ +--conf spark.yarn.max.executor.failures=10 \ +--conf spark.sql.catalogImplementation=hive \ +--conf spark.sql.shuffle.partitions=1000 \ +--class org.apache.hudi.testsuite.HoodieTestSuiteJob $HUDI_TEST_SUITE_BUNDLE \ +--source-ordering-field timestamp \ +--target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \ +--input-base-path /user/hive/warehouse/hudi-integ-test-suite/input \ +--target-table test_table \ +--props /var/hoodie/ws/docker/demo/config/testsuite/test-source.properties \ +--schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \ +--source-limit 300000 \ +--source-class org.apache.hudi.utilities.sources.AvroDFSSource \ +--input-file-size 125829120 \ +--workload-yaml-path /var/hoodie/ws/docker/demo/config/testsuite/complex-workflow-dag-cow.yaml \ +--storage-type COPY_ON_WRITE \ +--compact-scheduling-minshare 1 \ +--hoodie-conf "hoodie.deltastreamer.source.test.num_partitions=100" \ +--hoodie-conf "hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false" \ 
+--hoodie-conf "hoodie.deltastreamer.source.test.max_unique_records=100000000" \
+--hoodie-conf "hoodie.embed.timeline.server=false" \
+--hoodie-conf "hoodie.datasource.write.recordkey.field=_row_key" \
+--hoodie-conf "hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input" \
+--hoodie-conf "hoodie.datasource.write.keygenerator.class=org.apache.hudi.ComplexKeyGenerator" \
+--hoodie-conf "hoodie.datasource.write.partitionpath.field=timestamp" \
+--hoodie-conf "hoodie.deltastreamer.schemaprovider.source.schema.file=/var/hoodie/ws/docker/demo/config/test-suite/source.avsc" \
+--hoodie-conf "hoodie.datasource.hive_sync.assume_date_partitioning=false" \
+--hoodie-conf "hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/" \
+--hoodie-conf "hoodie.datasource.hive_sync.database=testdb" \
+--hoodie-conf "hoodie.datasource.hive_sync.table=test_table" \
+--hoodie-conf "hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.NonPartitionedExtractor" \
+--hoodie-conf "hoodie.datasource.hive_sync.assume_date_partitioning=false" \
+--hoodie-conf "hoodie.datasource.write.keytranslator.class=org.apache.hudi.DayBasedPartitionPathKeyTranslator" \
+--hoodie-conf "hoodie.deltastreamer.schemaprovider.target.schema.file=/var/hoodie/ws/docker/demo/config/test-suite/source.avsc"
+...
+...
+2019-11-03 05:44:47 INFO DagScheduler:69 - ----------- Finished workloads ----------
+2019-11-03 05:44:47 INFO HoodieTestSuiteJob:138 - Finished scheduling all tasks
+...
+2019-11-03 05:44:48 INFO SparkContext:54 - Successfully stopped SparkContext
+# MERGE_ON_READ tables
+=========================
+## Run the following command to start the test suite
+spark-submit \
+--packages com.databricks:spark-avro_2.11:4.0.0 \
+--conf spark.task.cpus=1 \
+--conf spark.executor.cores=1 \
+--conf spark.task.maxFailures=100 \
+--conf spark.memory.fraction=0.4 \
+--conf spark.rdd.compress=true \
+--conf spark.kryoserializer.buffer.max=2000m \
+--conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
+--conf spark.memory.storageFraction=0.1 \
+--conf spark.shuffle.service.enabled=true \
+--conf spark.sql.hive.convertMetastoreParquet=false \
+--conf spark.ui.port=5555 \
+--conf spark.driver.maxResultSize=12g \
+--conf spark.executor.heartbeatInterval=120s \
+--conf spark.network.timeout=600s \
+--conf spark.eventLog.overwrite=true \
+--conf spark.eventLog.enabled=true \
+--conf spark.yarn.max.executor.failures=10 \
+--conf spark.sql.catalogImplementation=hive \
+--conf spark.sql.shuffle.partitions=1000 \
+--class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob $HUDI_TEST_SUITE_BUNDLE \
+--source-ordering-field timestamp \
+--target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \
+--input-base-path /user/hive/warehouse/hudi-integ-test-suite/input \
+--target-table test_table \
+--props /var/hoodie/ws/docker/demo/config/test-suite/test-source.properties \
+--schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
+--source-limit 300000 \
+--source-class org.apache.hudi.utilities.sources.AvroDFSSource \
+--input-file-size 125829120 \
+--workload-yaml-path /var/hoodie/ws/docker/demo/config/test-suite/complex-dag-mor.yaml \
+--storage-type MERGE_ON_READ \
+--compact-scheduling-minshare 1 \
+--hoodie-conf "hoodie.deltastreamer.source.test.num_partitions=100" \
+--hoodie-conf "hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false" \
+--hoodie-conf "hoodie.deltastreamer.source.test.max_unique_records=100000000" \
+--hoodie-conf "hoodie.embed.timeline.server=false" \
+--hoodie-conf "hoodie.datasource.write.recordkey.field=_row_key" \
+--hoodie-conf "hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input" \
+--hoodie-conf "hoodie.datasource.write.keygenerator.class=org.apache.hudi.ComplexKeyGenerator" \
+--hoodie-conf "hoodie.datasource.write.partitionpath.field=timestamp" \
+--hoodie-conf "hoodie.deltastreamer.schemaprovider.source.schema.file=/var/hoodie/ws/docker/demo/config/test-suite/source.avsc" \
+--hoodie-conf "hoodie.datasource.hive_sync.assume_date_partitioning=false" \
+--hoodie-conf "hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/" \
+--hoodie-conf "hoodie.datasource.hive_sync.database=testdb" \
+--hoodie-conf "hoodie.datasource.hive_sync.table=test_table" \
+--hoodie-conf "hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.NonPartitionedExtractor" \
+--hoodie-conf "hoodie.datasource.hive_sync.assume_date_partitioning=false" \
+--hoodie-conf "hoodie.datasource.write.keytranslator.class=org.apache.hudi.DayBasedPartitionPathKeyTranslator" \
+--hoodie-conf "hoodie.deltastreamer.schemaprovider.target.schema.file=/var/hoodie/ws/docker/demo/config/test-suite/source.avsc"
+...
+...
+2019-11-03 05:44:47 INFO DagScheduler:69 - ----------- Finished workloads ----------
+2019-11-03 05:44:47 INFO HoodieTestSuiteJob:138 - Finished scheduling all tasks
+...
+2019-11-03 05:44:48 INFO SparkContext:54 - Successfully stopped SparkContext +``` + \ No newline at end of file diff --git a/hudi-integ-test/pom.xml b/hudi-integ-test/pom.xml index b9bcc6d96..e81f335ba 100644 --- a/hudi-integ-test/pom.xml +++ b/hudi-integ-test/pom.xml @@ -51,12 +51,69 @@ test + + + + org.eclipse.jetty.aggregate + jetty-all + ${jetty.version} + uber + test + + + + org.apache.spark + spark-sql_${scala.binary.version} + + + org.mortbay.jetty + * + + + javax.servlet.jsp + * + + + javax.servlet + * + + + org.eclipse.jetty + * + + + + org.apache.hudi hudi-spark_${scala.binary.version} ${project.version} + + org.apache.hudi + hudi-utilities_${scala.binary.version} + ${project.version} + provided + + + org.mortbay.jetty + * + + + javax.servlet.jsp + * + + + javax.servlet + * + + + com.databricks + spark-avro_${scala.binary.version} + + + @@ -89,6 +146,67 @@ test-jar test + + org.apache.hudi + hudi-hive-sync + ${project.version} + + + org.mortbay.jetty + * + + + javax.servlet.jsp + * + + + javax.servlet + * + + + tests + test-jar + + + org.apache.hudi + hudi-utilities_${scala.binary.version} + ${project.version} + tests + test-jar + test + + + org.mortbay.jetty + * + + + javax.servlet.jsp + * + + + javax.servlet + * + + + + + org.apache.hudi + hudi-client + ${project.version} + test-jar + + + + + com.fasterxml.jackson.dataformat + jackson-dataformat-yaml + 2.7.4 + + + com.fasterxml.jackson.core + jackson-databind + 2.6.7.3 + @@ -107,6 +225,87 @@ test + + + + org.apache.hadoop + hadoop-common + tests + + + org.mortbay.jetty + * + + + javax.servlet.jsp + * + + + javax.servlet + * + + + + + + org.apache.hadoop + hadoop-hdfs + tests + + + javax.servlet + * + + + netty + io.netty + + + netty-all + io.netty + + + + + + + ${hive.groupid} + hive-exec + ${hive.version} + ${hive.exec.classifier} + + + javax.servlet + servlet-api + + + + + + ${hive.groupid} + hive-jdbc + ${hive.version} + + + org.slf4j + slf4j-api + + + javax.servlet.jsp + * + + + javax.servlet + * + 
+ + org.eclipse.jetty + * + + + test + + org.awaitility awaitility @@ -138,6 +337,12 @@ test + + org.mockito + mockito-junit-jupiter + test + + diff --git a/hudi-integ-test/prepare_integration_suite.sh b/hudi-integ-test/prepare_integration_suite.sh new file mode 100644 index 000000000..06999d9ca --- /dev/null +++ b/hudi-integ-test/prepare_integration_suite.sh @@ -0,0 +1,174 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#!/bin/bash
+
+# Determine the current working directory
+_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+# Preserve the calling directory
+_CALLING_DIR="$(pwd)"
+
+#########################
+# The command line help #
+#########################
+usage() {
+  echo "Usage: $0"
+  echo "   --spark-command, prints the spark command"
+  echo "   -h | --hadoop, hadoop-version"
+  echo "   -s | --spark, spark version"
+  echo "   -p | --parquet, parquet version"
+  echo "   -a | --avro, avro version"
+  echo "   -i | --hive, hive version"
+  echo "   -l | --scala, scala version"
+  exit 1
+}
+
+get_spark_command() {
+if [ -z "$scala" ]
+then
+  scala="2.11"
+else
+  scala=$1
+fi
+echo "spark-submit --packages org.apache.spark:spark-avro_${scala}:2.4.4 \
+--master <master> \
+--deploy-mode <deploy-mode> \
+--properties-file <properties-file> \
+--class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \
+`ls target/hudi-integ-test-*-SNAPSHOT.jar` \
+--source-class <source-class> \
+--source-ordering-field <source-ordering-field> \
+--input-base-path <input-base-path> \
+--target-base-path <target-base-path> \
+--target-table <target-table> \
+--props <props> \
+--storage-type <storage-type> \
+--payload-class <payload-class> \
+--workload-yaml-path <workload-yaml-path> \
+--input-file-size <input-file-size> \
+--"
+}
+
+case "$1" in
+   --help)
+      usage
+      exit 0
+      ;;
+esac
+
+case "$1" in
+   --spark-command)
+      get_spark_command
+      exit 0
+      ;;
+esac
+
+while getopts ":h:s:p:a:i:l:-:" opt; do
+  case $opt in
+    h) hadoop="$OPTARG"
+       printf "Argument hadoop is %s\n" "$hadoop"
+       ;;
+    s) spark="$OPTARG"
+       printf "Argument spark is %s\n" "$spark"
+       ;;
+    p) parquet="$OPTARG"
+       printf "Argument parquet is %s\n" "$parquet"
+       ;;
+    a) avro="$OPTARG"
+       printf "Argument avro is %s\n" "$avro"
+       ;;
+    i) hive="$OPTARG"
+       printf "Argument hive is %s\n" "$hive"
+       ;;
+    l) scala="$OPTARG"
+       printf "Argument scala is %s\n" "$scala"
+       ;;
+    -)
+       case "$OPTARG" in
+         hadoop)
+           hadoop="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
+           printf "Argument hadoop is %s\n" "$hadoop"
+           ;;
+         spark)
+           spark="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
+           printf "Argument spark is %s\n" "$spark"
+           ;;
+         parquet)
+           parquet="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
+           printf "Argument parquet is %s\n" "$parquet"
+           ;;
+         avro)
+           avro="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
+           printf "Argument avro is %s\n" "$avro"
+           ;;
+         hive)
+           hive="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
+           printf "Argument hive is %s\n" "$hive"
+           ;;
+         scala)
+           scala="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
+           printf "Argument scala is %s\n" "$scala"
+           ;;
+         *) echo "Invalid option --$OPTARG" >&2
+           ;;
+       esac ;;
+    \?) echo "Invalid option -$OPTARG" >&2
+       ;;
+  esac
+done
+
+
+get_versions () {
+  base_command=''
+  if [ -z "$hadoop" ]
+  then
+    base_command=$base_command
+  else
+    # use $hadoop parsed from the CLI options; positional args shift when earlier vars are empty
+    base_command+=' -Dhadoop.version='$hadoop
+  fi
+
+  if [ -z "$hive" ]
+  then
+    base_command=$base_command
+  else
+    # use $hive parsed from the CLI options; positional args shift when earlier vars are empty
+    base_command+=' -Dhive.version='$hive
+  fi
+
+  if [ -z "$scala" ]
+  then
+    base_command=$base_command
+  else
+    # use $scala parsed from the CLI options; positional args shift when earlier vars are empty
+    base_command+=' -Dscala-'$scala
+  fi
+  echo $base_command
+}
+
+versions=$(get_versions $hadoop $hive $scala)
+
+final_command='mvn clean install -DskipTests '$versions
+printf "Final command $final_command \n"
+
+# change to the project root directory to run maven command
+move_to_root='cd ..'
+$move_to_root && $final_command
+
+# change back to original working directory
+cd $_CALLING_DIR
+
+printf "A sample spark command to start the integration suite \n"
+get_spark_command $scala
\ No newline at end of file
diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieDeltaStreamerWrapper.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieDeltaStreamerWrapper.java
new file mode 100644
index 000000000..a489bacc3
--- /dev/null
+++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieDeltaStreamerWrapper.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer; +import org.apache.hudi.utilities.schema.SchemaProvider; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; + +/** + * Extends the {@link HoodieDeltaStreamer} to expose certain operations helpful in running the Test Suite. 
+ * This is done to achieve 2 things 1) Leverage some components of {@link HoodieDeltaStreamer} 2) + * Piggyback on the suite to test {@link HoodieDeltaStreamer} + */ +public class HoodieDeltaStreamerWrapper extends HoodieDeltaStreamer { + + public HoodieDeltaStreamerWrapper(Config cfg, JavaSparkContext jssc) throws Exception { + super(cfg, jssc); + } + + public HoodieDeltaStreamerWrapper(Config cfg, JavaSparkContext jssc, FileSystem fs, HiveConf conf) throws Exception { + super(cfg, jssc, fs, conf); + } + + public JavaRDD upsert(Operation operation) throws + Exception { + cfg.operation = operation; + return deltaSyncService.getDeltaSync().syncOnce().getRight(); + } + + public JavaRDD insert() throws Exception { + return upsert(Operation.INSERT); + } + + public JavaRDD bulkInsert() throws + Exception { + return upsert(Operation.BULK_INSERT); + } + + public void scheduleCompact() throws Exception { + // Since we don't support scheduleCompact() operation in delta-streamer, assume upsert without any data that will + // trigger scheduling compaction + upsert(Operation.UPSERT); + } + + public JavaRDD compact() throws Exception { + // Since we don't support compact() operation in delta-streamer, assume upsert without any data that will trigger + // inline compaction + return upsert(Operation.UPSERT); + } + + public Pair>> fetchSource() throws Exception { + return deltaSyncService.getDeltaSync().readFromSource(deltaSyncService.getDeltaSync().getCommitTimelineOpt()); + } + +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieTestSuiteJob.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieTestSuiteJob.java new file mode 100644 index 000000000..be036c16d --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieTestSuiteJob.java @@ -0,0 +1,186 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite; + +import com.beust.jcommander.JCommander; +import com.beust.jcommander.Parameter; +import java.io.IOException; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hudi.DataSourceUtils; +import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.util.ReflectionUtils; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.integ.testsuite.configuration.DFSDeltaConfig; +import org.apache.hudi.integ.testsuite.dag.DagUtils; +import org.apache.hudi.integ.testsuite.dag.WorkflowDag; +import org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator; +import org.apache.hudi.integ.testsuite.dag.scheduler.DagScheduler; +import org.apache.hudi.integ.testsuite.generator.DeltaGenerator; +import org.apache.hudi.integ.testsuite.reader.DeltaInputType; +import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode; +import 
org.apache.hudi.keygen.KeyGenerator; +import org.apache.hudi.utilities.UtilHelpers; +import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer; +import org.apache.hudi.utilities.schema.SchemaProvider; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * This is the entry point for running a Hudi Test Suite. Although this class has similarities with + * {@link HoodieDeltaStreamer} this class does not extend it since do not want to create a dependency on the changes in + * DeltaStreamer. + */ +public class HoodieTestSuiteJob { + + private static volatile Logger log = LoggerFactory.getLogger(HoodieTestSuiteJob.class); + + private final HoodieTestSuiteConfig cfg; + /** + * Bag of properties with source, hoodie client, key generator etc. + */ + TypedProperties props; + /** + * Schema provider that supplies the command for writing out the generated payloads. + */ + private transient SchemaProvider schemaProvider; + /** + * Filesystem used. + */ + private transient FileSystem fs; + /** + * Spark context. + */ + private transient JavaSparkContext jsc; + /** + * Spark Session. + */ + private transient SparkSession sparkSession; + /** + * Hive Config. 
+ */ + private transient HiveConf hiveConf; + + private KeyGenerator keyGenerator; + + public HoodieTestSuiteJob(HoodieTestSuiteConfig cfg, JavaSparkContext jsc) throws IOException { + this.cfg = cfg; + this.jsc = jsc; + this.sparkSession = SparkSession.builder().config(jsc.getConf()).getOrCreate(); + this.fs = FSUtils.getFs(cfg.inputBasePath, jsc.hadoopConfiguration()); + this.props = UtilHelpers.readConfig(fs, new Path(cfg.propsFilePath), cfg.configs).getConfig(); + log.info("Creating workload generator with configs : {}", props.toString()); + this.schemaProvider = UtilHelpers.createSchemaProvider(cfg.schemaProviderClassName, props, jsc); + this.hiveConf = getDefaultHiveConf(jsc.hadoopConfiguration()); + this.keyGenerator = DataSourceUtils.createKeyGenerator(props); + if (!fs.exists(new Path(cfg.targetBasePath))) { + HoodieTableMetaClient.initTableType(jsc.hadoopConfiguration(), cfg.targetBasePath, + HoodieTableType.valueOf(cfg.tableType), cfg.targetTableName, "archived"); + } + } + + private static HiveConf getDefaultHiveConf(Configuration cfg) { + HiveConf hiveConf = new HiveConf(); + hiveConf.addResource(cfg); + return hiveConf; + } + + public static void main(String[] args) throws Exception { + final HoodieTestSuiteConfig cfg = new HoodieTestSuiteConfig(); + JCommander cmd = new JCommander(cfg, args); + if (cfg.help || args.length == 0) { + cmd.usage(); + System.exit(1); + } + + JavaSparkContext jssc = UtilHelpers.buildSparkContext("workload-generator-" + cfg.outputTypeName + + "-" + cfg.inputFormatName, cfg.sparkMaster); + new HoodieTestSuiteJob(cfg, jssc).runTestSuite(); + } + + public void runTestSuite() { + try { + WorkflowDag workflowDag = this.cfg.workloadYamlPath == null ? 
((WorkflowDagGenerator) ReflectionUtils + .loadClass((this.cfg).workloadDagGenerator)).build() + : DagUtils.convertYamlPathToDag(this.fs, this.cfg.workloadYamlPath); + log.info("Workflow Dag => " + DagUtils.convertDagToYaml(workflowDag)); + long startTime = System.currentTimeMillis(); + String schemaStr = schemaProvider.getSourceSchema().toString(); + final HoodieTestSuiteWriter writer = new HoodieTestSuiteWriter(jsc, props, cfg, schemaStr); + final DeltaGenerator deltaGenerator = new DeltaGenerator( + new DFSDeltaConfig(DeltaOutputMode.valueOf(cfg.outputTypeName), DeltaInputType.valueOf(cfg.inputFormatName), + new SerializableConfiguration(jsc.hadoopConfiguration()), cfg.inputBasePath, cfg.targetBasePath, + schemaStr, cfg.limitFileSize), jsc, sparkSession, schemaStr, keyGenerator); + DagScheduler dagScheduler = new DagScheduler(workflowDag, writer, deltaGenerator); + dagScheduler.schedule(); + log.info("Finished scheduling all tasks, Time taken {}", System.currentTimeMillis() - startTime); + } catch (Exception e) { + log.error("Failed to run Test Suite ", e); + throw new HoodieException("Failed to run Test Suite ", e); + } finally { + jsc.stop(); + } + } + + /** + * The Hudi test suite uses {@link HoodieDeltaStreamer} to run some operations hence extend delta streamer config. + */ + public static class HoodieTestSuiteConfig extends HoodieDeltaStreamer.Config { + + @Parameter(names = {"--input-base-path"}, description = "base path for input data" + + "(Will be created if did not exist first time around. 
If exists, more data will be added to that path)", + required = true) + public String inputBasePath; + + @Parameter(names = { + "--workload-generator-classname"}, description = "WorkflowDag of operations to generate the workload", + required = true) + public String workloadDagGenerator = WorkflowDagGenerator.class.getName(); + + @Parameter(names = { + "--workload-yaml-path"}, description = "Workflow Dag yaml path to generate the workload") + public String workloadYamlPath; + + @Parameter(names = {"--delta-output-type"}, description = "Subclass of " + + "org.apache.hudi.testsuite.workload.DeltaOutputMode to readAvro data.") + public String outputTypeName = DeltaOutputMode.DFS.name(); + + @Parameter(names = {"--delta-input-format"}, description = "Subclass of " + + "org.apache.hudi.testsuite.workload.DeltaOutputMode to read avro data.") + public String inputFormatName = DeltaInputType.AVRO.name(); + + @Parameter(names = {"--input-file-size"}, description = "The min/max size of the input files to generate", + required = true) + public Long limitFileSize = 1024 * 1024 * 120L; + + @Parameter(names = {"--use-deltastreamer"}, description = "Choose whether to use HoodieDeltaStreamer to " + + "perform" + + " ingestion. If set to false, HoodieWriteClient will be used") + public Boolean useDeltaStreamer = false; + + } +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieTestSuiteWriter.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieTestSuiteWriter.java new file mode 100644 index 000000000..b22faca22 --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieTestSuiteWriter.java @@ -0,0 +1,219 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Properties; +import java.util.Set; +import org.apache.hadoop.conf.Configuration; +import org.apache.hudi.avro.model.HoodieCompactionPlan; +import org.apache.hudi.client.HoodieReadClient; +import org.apache.hudi.client.HoodieWriteClient; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieCompactionConfig; +import org.apache.hudi.config.HoodieIndexConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.integ.testsuite.dag.nodes.CleanNode; +import org.apache.hudi.integ.testsuite.dag.nodes.DagNode; +import org.apache.hudi.integ.testsuite.dag.nodes.RollbackNode; +import org.apache.hudi.integ.testsuite.dag.nodes.ScheduleCompactNode; +import org.apache.hudi.integ.testsuite.HoodieTestSuiteJob.HoodieTestSuiteConfig; +import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer.Operation; +import org.apache.hudi.utilities.schema.SchemaProvider; +import org.apache.spark.api.java.JavaRDD; +import 
org.apache.spark.api.java.JavaSparkContext; + +/** + * A writer abstraction for the Hudi test suite. This class wraps different implementations of writers used to perform + * write operations into the target hudi dataset. Current supported writers are {@link HoodieDeltaStreamerWrapper} + * and {@link HoodieWriteClient}. + */ +public class HoodieTestSuiteWriter { + + private HoodieDeltaStreamerWrapper deltaStreamerWrapper; + private HoodieWriteClient writeClient; + protected HoodieTestSuiteConfig cfg; + private Option lastCheckpoint; + private HoodieReadClient hoodieReadClient; + private Properties props; + private String schema; + private transient Configuration configuration; + private transient JavaSparkContext sparkContext; + private static Set VALID_DAG_NODES_TO_ALLOW_WRITE_CLIENT_IN_DELTASTREAMER_MODE = new HashSet<>( + Arrays.asList(RollbackNode.class.getName(), CleanNode.class.getName(), ScheduleCompactNode.class.getName())); + + public HoodieTestSuiteWriter(JavaSparkContext jsc, Properties props, HoodieTestSuiteConfig cfg, String schema) throws + Exception { + this(jsc, props, cfg, schema, true); + } + + public HoodieTestSuiteWriter(JavaSparkContext jsc, Properties props, HoodieTestSuiteConfig cfg, String schema, + boolean rollbackInflight) throws Exception { + // We ensure that only 1 instance of HoodieWriteClient is instantiated for a HoodieTestSuiteWriter + // This does not instantiate a HoodieWriteClient until a + // {@link HoodieDeltaStreamer#commit(HoodieWriteClient, JavaRDD, Option)} is invoked. 
+ this.deltaStreamerWrapper = new HoodieDeltaStreamerWrapper(cfg, jsc); + this.hoodieReadClient = new HoodieReadClient(jsc, cfg.targetBasePath); + if (!cfg.useDeltaStreamer) { + this.writeClient = new HoodieWriteClient(jsc, getHoodieClientConfig(cfg, props, schema), rollbackInflight); + } + this.cfg = cfg; + this.configuration = jsc.hadoopConfiguration(); + this.sparkContext = jsc; + this.props = props; + this.schema = schema; + } + + private HoodieWriteConfig getHoodieClientConfig(HoodieTestSuiteConfig cfg, Properties props, String schema) { + HoodieWriteConfig.Builder builder = + HoodieWriteConfig.newBuilder().combineInput(true, true).withPath(cfg.targetBasePath) + .withAutoCommit(false) + .withCompactionConfig(HoodieCompactionConfig.newBuilder().withPayloadClass(cfg.payloadClassName).build()) + .forTable(cfg.targetTableName) + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) + .withProps(props); + builder = builder.withSchema(schema); + return builder.build(); + } + + private boolean allowWriteClientAccess(DagNode dagNode) { + if (VALID_DAG_NODES_TO_ALLOW_WRITE_CLIENT_IN_DELTASTREAMER_MODE.contains(dagNode.getClass().getName())) { + return true; + } + return false; + } + + public Pair>> fetchSource() throws Exception { + return this.deltaStreamerWrapper.fetchSource(); + } + + public Option startCommit() { + if (cfg.useDeltaStreamer) { + return Option.of(HoodieActiveTimeline.createNewInstantTime()); + } else { + return Option.of(writeClient.startCommit()); + } + } + + public JavaRDD upsert(Option instantTime) throws Exception { + if (cfg.useDeltaStreamer) { + return deltaStreamerWrapper.upsert(Operation.UPSERT); + } else { + Pair>> nextBatch = fetchSource(); + lastCheckpoint = Option.of(nextBatch.getValue().getLeft()); + return writeClient.upsert(nextBatch.getRight().getRight(), instantTime.get()); + } + } + + public JavaRDD insert(Option instantTime) throws Exception { + if (cfg.useDeltaStreamer) { + return 
deltaStreamerWrapper.insert();
+    } else {
+      Pair>> nextBatch = fetchSource();
+      lastCheckpoint = Option.of(nextBatch.getValue().getLeft());
+      return writeClient.insert(nextBatch.getRight().getRight(), instantTime.get());
+    }
+  }
+
+  public JavaRDD bulkInsert(Option instantTime) throws Exception {
+    if (cfg.useDeltaStreamer) {
+      return deltaStreamerWrapper.bulkInsert();
+    } else {
+      Pair>> nextBatch = fetchSource();
+      lastCheckpoint = Option.of(nextBatch.getValue().getLeft());
+      return writeClient.bulkInsert(nextBatch.getRight().getRight(), instantTime.get());
+    }
+  }
+
+  public JavaRDD compact(Option instantTime) throws Exception {
+    if (cfg.useDeltaStreamer) {
+      return deltaStreamerWrapper.compact();
+    } else {
+      if (!instantTime.isPresent()) {
+        Option> compactionPlanPair = Option
+            .fromJavaOptional(hoodieReadClient.getPendingCompactions()
+                .stream().findFirst());
+        if (compactionPlanPair.isPresent()) {
+          instantTime = Option.of(compactionPlanPair.get().getLeft());
+        }
+      }
+      if (instantTime.isPresent()) {
+        return writeClient.compact(instantTime.get());
+      } else {
+        return null;
+      }
+    }
+  }
+
+  public Option scheduleCompaction(Option> previousCommitExtraMetadata) throws
+      Exception {
+    if (cfg.useDeltaStreamer) {
+      deltaStreamerWrapper.scheduleCompact();
+      return Option.empty();
+    } else {
+      return writeClient.scheduleCompaction(previousCommitExtraMetadata);
+    }
+  }
+
+  public void commit(JavaRDD records, Option instantTime) {
+    if (!cfg.useDeltaStreamer) {
+      Map extraMetadata = new HashMap<>();
+      /** Store the checkpoint in the commit metadata just like
+       * {@link HoodieDeltaStreamer#commit(HoodieWriteClient, JavaRDD, Option)} **/
+      extraMetadata.put(HoodieDeltaStreamerWrapper.CHECKPOINT_KEY, lastCheckpoint.get());
+      writeClient.commit(instantTime.get(), records, Option.of(extraMetadata));
+    }
+  }
+
+  public HoodieWriteClient getWriteClient(DagNode dagNode) throws IllegalAccessException {
+    if (cfg.useDeltaStreamer && !allowWriteClientAccess(dagNode)) {
+      
throw new IllegalAccessException("cannot access write client when testing in deltastreamer mode"); + } + synchronized (this) { + if (writeClient == null) { + this.writeClient = new HoodieWriteClient(this.sparkContext, getHoodieClientConfig(cfg, props, schema), false); + } + } + return writeClient; + } + + public HoodieDeltaStreamerWrapper getDeltaStreamerWrapper() { + return deltaStreamerWrapper; + } + + public HoodieTestSuiteConfig getCfg() { + return cfg; + } + + public Configuration getConfiguration() { + return configuration; + } + + public JavaSparkContext getSparkContext() { + return sparkContext; + } +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/configuration/DFSDeltaConfig.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/configuration/DFSDeltaConfig.java new file mode 100644 index 000000000..291562808 --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/configuration/DFSDeltaConfig.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.integ.testsuite.configuration; + +import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.integ.testsuite.reader.DeltaInputType; +import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode; + +/** + * Configuration to hold details about a DFS based output type, implements {@link DeltaConfig}. + */ +public class DFSDeltaConfig extends DeltaConfig { + + // The base path where the generated data should be written to. This data will in turn be used to write into a hudi + // dataset + private final String deltaBasePath; + private final String datasetOutputPath; + private final String schemaStr; + // Maximum file size for the files generated + private final Long maxFileSize; + // The current batch id + private Integer batchId; + + public DFSDeltaConfig(DeltaOutputMode deltaOutputMode, DeltaInputType deltaInputType, + SerializableConfiguration configuration, + String deltaBasePath, String targetBasePath, String schemaStr, Long maxFileSize) { + super(deltaOutputMode, deltaInputType, configuration); + this.deltaBasePath = deltaBasePath; + this.schemaStr = schemaStr; + this.maxFileSize = maxFileSize; + this.datasetOutputPath = targetBasePath; + } + + public String getDeltaBasePath() { + return deltaBasePath; + } + + public String getDatasetOutputPath() { + return datasetOutputPath; + } + + public String getSchemaStr() { + return schemaStr; + } + + public Long getMaxFileSize() { + return maxFileSize; + } + + public Integer getBatchId() { + return batchId; + } + + public void setBatchId(Integer batchId) { + this.batchId = batchId; + } +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/configuration/DeltaConfig.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/configuration/DeltaConfig.java new file mode 100644 index 000000000..30fa58442 --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/configuration/DeltaConfig.java @@ -0,0 +1,256 @@ 
+/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.configuration; + +import com.fasterxml.jackson.databind.ObjectMapper; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import org.apache.hadoop.conf.Configuration; +import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.integ.testsuite.reader.DeltaInputType; +import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode; + +/** + * Configuration to hold the delta output type and delta input format. 
+ */ +public class DeltaConfig implements Serializable { + + private final DeltaOutputMode deltaOutputMode; + private final DeltaInputType deltaInputType; + private final SerializableConfiguration configuration; + + public DeltaConfig(DeltaOutputMode deltaOutputMode, DeltaInputType deltaInputType, + SerializableConfiguration configuration) { + this.deltaOutputMode = deltaOutputMode; + this.deltaInputType = deltaInputType; + this.configuration = configuration; + } + + public DeltaOutputMode getDeltaOutputMode() { + return deltaOutputMode; + } + + public DeltaInputType getDeltaInputType() { + return deltaInputType; + } + + public Configuration getConfiguration() { + return configuration.get(); + } + + /** + * Represents any kind of workload operation for new data. Each workload also contains a set of Option sequence of + * actions that can be executed in parallel. + */ + public static class Config { + + public static final String CONFIG_NAME = "config"; + public static final String TYPE = "type"; + public static final String NODE_NAME = "name"; + public static final String DEPENDENCIES = "deps"; + public static final String CHILDREN = "children"; + public static final String HIVE_QUERIES = "hive_queries"; + public static final String HIVE_PROPERTIES = "hive_props"; + private static String NUM_RECORDS_INSERT = "num_records_insert"; + private static String NUM_RECORDS_UPSERT = "num_records_upsert"; + private static String REPEAT_COUNT = "repeat_count"; + private static String RECORD_SIZE = "record_size"; + private static String NUM_PARTITIONS_INSERT = "num_partitions_insert"; + private static String NUM_PARTITIONS_UPSERT = "num_partitions_upsert"; + private static String NUM_FILES_UPSERT = "num_files_upsert"; + private static String FRACTION_UPSERT_PER_FILE = "fraction_upsert_per_file"; + private static String DISABLE_GENERATE = "disable_generate"; + private static String DISABLE_INGEST = "disable_ingest"; + private static String HIVE_LOCAL = "hive_local"; + + private 
Map configsMap; + + public Config(Map configsMap) { + this.configsMap = configsMap; + } + + public static Builder newBuilder() { + return new Builder(); + } + + public long getNumRecordsInsert() { + return Long.valueOf(configsMap.getOrDefault(NUM_RECORDS_INSERT, 0).toString()); + } + + public long getNumRecordsUpsert() { + return Long.valueOf(configsMap.getOrDefault(NUM_RECORDS_UPSERT, 0).toString()); + } + + public int getRecordSize() { + return Integer.valueOf(configsMap.getOrDefault(RECORD_SIZE, 1024).toString()); + } + + public int getNumInsertPartitions() { + return Integer.valueOf(configsMap.getOrDefault(NUM_PARTITIONS_INSERT, 1).toString()); + } + + public int getRepeatCount() { + return Integer.valueOf(configsMap.getOrDefault(REPEAT_COUNT, 1).toString()); + } + + public int getNumUpsertPartitions() { + return Integer.valueOf(configsMap.getOrDefault(NUM_PARTITIONS_UPSERT, 0).toString()); + } + + public int getNumUpsertFiles() { + return Integer.valueOf(configsMap.getOrDefault(NUM_FILES_UPSERT, 1).toString()); + } + + public double getFractionUpsertPerFile() { + return Double.valueOf(configsMap.getOrDefault(FRACTION_UPSERT_PER_FILE, 0.0).toString()); + } + + public boolean isDisableGenerate() { + return Boolean.valueOf(configsMap.getOrDefault(DISABLE_GENERATE, false).toString()); + } + + public boolean isDisableIngest() { + return Boolean.valueOf(configsMap.getOrDefault(DISABLE_INGEST, false).toString()); + } + + public Map getOtherConfigs() { + if (configsMap == null) { + return new HashMap<>(); + } + return configsMap; + } + + public List> getHiveQueries() { + try { + return (List>) this.configsMap.getOrDefault(HIVE_QUERIES, new ArrayList<>()); + } catch (Exception e) { + throw new RuntimeException("unable to get hive queries from configs", e); + } + } + + public boolean isHiveLocal() { + return Boolean.valueOf(configsMap.getOrDefault(HIVE_LOCAL, false).toString()); + } + + public List getHiveProperties() { + return (List) 
this.configsMap.getOrDefault(HIVE_PROPERTIES, new ArrayList<>()); + } + + @Override + public String toString() { + try { + return new ObjectMapper().writeValueAsString(this.configsMap); + } catch (Exception e) { + throw new RuntimeException("unable to generate string representation of config", e); + } + } + + public static class Builder { + + private Map configsMap = new HashMap<>(); + + public Builder() { + } + + public Builder withNumRecordsToInsert(long numRecordsInsert) { + this.configsMap.put(NUM_RECORDS_INSERT, numRecordsInsert); + return this; + } + + public Builder withNumRecordsToUpdate(long numRecordsUpsert) { + this.configsMap.put(NUM_RECORDS_UPSERT, numRecordsUpsert); + return this; + } + + public Builder withNumInsertPartitions(int numInsertPartitions) { + this.configsMap.put(NUM_PARTITIONS_INSERT, numInsertPartitions); + return this; + } + + public Builder withNumUpsertPartitions(int numUpsertPartitions) { + this.configsMap.put(NUM_PARTITIONS_UPSERT, numUpsertPartitions); + return this; + } + + public Builder withNumUpsertFiles(int numUpsertFiles) { + this.configsMap.put(NUM_FILES_UPSERT, numUpsertFiles); + return this; + } + + public Builder withFractionUpsertPerFile(double fractionUpsertPerFile) { + this.configsMap.put(FRACTION_UPSERT_PER_FILE, fractionUpsertPerFile); + return this; + } + + public Builder withNumTimesToRepeat(int repeatCount) { + this.configsMap.put(REPEAT_COUNT, repeatCount); + return this; + } + + public Builder withRecordSize(int recordSize) { + this.configsMap.put(RECORD_SIZE, recordSize); + return this; + } + + public Builder disableGenerate(boolean generate) { + this.configsMap.put(DISABLE_GENERATE, generate); + return this; + } + + public Builder disableIngest(boolean ingest) { + this.configsMap.put(DISABLE_INGEST, ingest); + return this; + } + + public Builder withConfig(String name, Object value) { + this.configsMap.put(name, value); + return this; + } + + public Builder withHiveQueryAndResults(List> hiveQueries) { + 
this.configsMap.put(HIVE_QUERIES, hiveQueries); + return this; + } + + public Builder withHiveLocal(boolean startHiveLocal) { + this.configsMap.put(HIVE_LOCAL, startHiveLocal); + return this; + } + + public Builder withHiveProperties(List hiveProperties) { + this.configsMap.put(HIVE_PROPERTIES, hiveProperties); + return this; + } + + public Builder withConfigsMap(Map configsMap) { + this.configsMap = configsMap; + return this; + } + + public Config build() { + return new Config(configsMap); + } + + } + } +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/converter/Converter.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/converter/Converter.java new file mode 100644 index 000000000..e4ad0a7fa --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/converter/Converter.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.converter; + +import java.io.Serializable; +import org.apache.spark.api.java.JavaRDD; + +/** + * Implementations of {@link Converter} will convert data from one format to another. 
+ * + * @param Input Data Type + * @param Output Data Type + */ +public interface Converter extends Serializable { + + JavaRDD convert(JavaRDD inputRDD); +} \ No newline at end of file diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/converter/UpdateConverter.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/converter/UpdateConverter.java new file mode 100644 index 000000000..24520a362 --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/converter/UpdateConverter.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.converter; + +import java.util.List; +import org.apache.avro.generic.GenericRecord; +import org.apache.hudi.integ.testsuite.generator.LazyRecordGeneratorIterator; +import org.apache.hudi.integ.testsuite.generator.UpdateGeneratorIterator; +import org.apache.spark.api.java.JavaRDD; + +/** + * This converter creates an update {@link GenericRecord} from an existing {@link GenericRecord}. 
+ */ +public class UpdateConverter implements Converter { + + private final String schemaStr; + // The fields that should not be mutated when converting the insert record to an update record, generally the + // record_key + private final List partitionPathFields; + private final List recordKeyFields; + private final int minPayloadSize; + + public UpdateConverter(String schemaStr, int minPayloadSize, List partitionPathFields, + List recordKeyFields) { + this.schemaStr = schemaStr; + this.partitionPathFields = partitionPathFields; + this.recordKeyFields = recordKeyFields; + this.minPayloadSize = minPayloadSize; + } + + @Override + public JavaRDD convert(JavaRDD inputRDD) { + return inputRDD.mapPartitions(recordItr -> new LazyRecordGeneratorIterator(new UpdateGeneratorIterator(recordItr, + schemaStr, partitionPathFields, recordKeyFields, minPayloadSize))); + } + +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/DagUtils.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/DagUtils.java new file mode 100644 index 000000000..288986719 --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/DagUtils.java @@ -0,0 +1,217 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.dag; + +import com.fasterxml.jackson.core.JsonParser; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.node.ObjectNode; +import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; +import com.fasterxml.jackson.dataformat.yaml.YAMLGenerator.Feature; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.stream.Collectors; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.util.ReflectionUtils; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig; +import org.apache.hudi.integ.testsuite.dag.nodes.DagNode; + +/** + * Utility class to SerDe workflow dag. + */ +public class DagUtils { + + static final ObjectMapper MAPPER = new ObjectMapper(); + + /** + * Converts a YAML path to {@link WorkflowDag}. + */ + public static WorkflowDag convertYamlPathToDag(FileSystem fs, String path) throws IOException { + InputStream is = fs.open(new Path(path)); + return convertYamlToDag(toString(is)); + } + + /** + * Converts a YAML representation to {@link WorkflowDag}. 
+ */ + public static WorkflowDag convertYamlToDag(String yaml) throws IOException { + Map allNodes = new HashMap<>(); + final ObjectMapper yamlReader = new ObjectMapper(new YAMLFactory()); + final JsonNode jsonNode = yamlReader.readTree(yaml); + Iterator> itr = jsonNode.fields(); + while (itr.hasNext()) { + Entry dagNode = itr.next(); + allNodes.put(dagNode.getKey(), convertJsonToDagNode(allNodes, dagNode.getValue())); + } + return new WorkflowDag(findRootNodes(allNodes)); + } + + /** + * Converts {@link WorkflowDag} to a YAML representation. + */ + public static String convertDagToYaml(WorkflowDag dag) throws IOException { + final ObjectMapper yamlWriter = new ObjectMapper(new YAMLFactory().disable(Feature.WRITE_DOC_START_MARKER) + .enable(Feature.MINIMIZE_QUOTES).enable(JsonParser.Feature.ALLOW_UNQUOTED_FIELD_NAMES)); + JsonNode yamlNode = MAPPER.createObjectNode(); + convertDagToYaml(yamlNode, dag.getNodeList()); + return yamlWriter.writerWithDefaultPrettyPrinter().writeValueAsString(yamlNode); + } + + private static void convertDagToYaml(JsonNode yamlNode, List dagNodes) throws IOException { + for (DagNode dagNode : dagNodes) { + String name = dagNode.getConfig().getOtherConfigs().getOrDefault(DeltaConfig.Config.NODE_NAME, dagNode.getName()).toString(); + ((ObjectNode) yamlNode).put(name, convertDagNodeToJsonNode(dagNode)); + if (dagNode.getChildNodes().size() > 0) { + convertDagToYaml(yamlNode, dagNode.getChildNodes()); + } + } + } + + private static DagNode convertJsonToDagNode(Map allNodes, JsonNode node) throws IOException { + String type = node.get(DeltaConfig.Config.TYPE).asText(); + final DagNode retNode = convertJsonToDagNode(node, type); + Arrays.asList(node.get(DeltaConfig.Config.DEPENDENCIES).textValue().split(",")).stream().forEach(dep -> { + DagNode parentNode = allNodes.get(dep); + if (parentNode != null) { + parentNode.addChildNode(retNode); + } + }); + return retNode; + } + + private static List findRootNodes(Map allNodes) { + final List 
rootNodes = new ArrayList<>(); + allNodes.entrySet().stream().forEach(entry -> { + if (entry.getValue().getParentNodes().size() < 1) { + rootNodes.add(entry.getValue()); + } + }); + return rootNodes; + } + + private static DagNode convertJsonToDagNode(JsonNode node, String type) { + try { + DeltaConfig.Config config = DeltaConfig.Config.newBuilder().withConfigsMap(convertJsonNodeToMap(node)).build(); + return (DagNode) ReflectionUtils.loadClass(generateFQN(type), config); + } catch (ClassNotFoundException e) { + throw new RuntimeException(e); + } + } + + private static String generateFQN(String name) throws ClassNotFoundException { + return Class.forName(StringUtils.joinUsingDelim(".", + DagNode.class.getName().substring(0, DagNode.class.getName().lastIndexOf(".")), name)).getName(); + } + + private static JsonNode convertDagNodeToJsonNode(DagNode node) throws IOException { + return createJsonNode(node, node.getClass().getSimpleName()); + } + + private static Map convertJsonNodeToMap(JsonNode node) { + Map configsMap = new HashMap<>(); + Iterator> itr = node.get(DeltaConfig.Config.CONFIG_NAME).fields(); + while (itr.hasNext()) { + Entry entry = itr.next(); + switch (entry.getKey()) { + case DeltaConfig.Config.HIVE_QUERIES: + configsMap.put(DeltaConfig.Config.HIVE_QUERIES, getHiveQueries(entry)); + break; + case DeltaConfig.Config.HIVE_PROPERTIES: + configsMap.put(DeltaConfig.Config.HIVE_PROPERTIES, getProperties(entry)); + break; + default: + configsMap.put(entry.getKey(), getValue(entry.getValue())); + break; + } + } + return configsMap; + } + + private static List> getHiveQueries(Entry entry) { + List> queries = new ArrayList<>(); + Iterator> queriesItr = entry.getValue().fields(); + while (queriesItr.hasNext()) { + queries.add(Pair.of(queriesItr.next().getValue().textValue(), queriesItr.next().getValue().asInt())); + } + return queries; + } + + private static List getProperties(Entry entry) { + List properties = new ArrayList<>(); + Iterator> queriesItr = 
entry.getValue().fields(); + while (queriesItr.hasNext()) { + properties.add(queriesItr.next().getValue().textValue()); + } + return properties; + } + + private static Object getValue(JsonNode node) { + if (node.isInt()) { + return node.asInt(); + } else if (node.isLong()) { + return node.asLong(); + } else if (node.isShort()) { + return node.asInt(); + } else if (node.isBoolean()) { + return node.asBoolean(); + } else if (node.isDouble()) { + return node.asDouble(); + } else if (node.isFloat()) { + return node.asDouble(); + } + return node.textValue(); + } + + private static JsonNode createJsonNode(DagNode node, String type) throws IOException { + JsonNode configNode = MAPPER.readTree(node.getConfig().toString()); + JsonNode jsonNode = MAPPER.createObjectNode(); + ((ObjectNode) jsonNode).put(DeltaConfig.Config.CONFIG_NAME, configNode); + ((ObjectNode) jsonNode).put(DeltaConfig.Config.TYPE, type); + ((ObjectNode) jsonNode).put(DeltaConfig.Config.DEPENDENCIES, getDependencyNames(node)); + return jsonNode; + } + + private static String getDependencyNames(DagNode node) { + return node.getParentNodes().stream() + .map(e -> ((DagNode) e).getConfig().getOtherConfigs().getOrDefault(DeltaConfig.Config.NODE_NAME, ((DagNode) e).getName()).toString()) + .collect(Collectors.joining(",")).toString(); + } + + public static String toString(InputStream inputStream) throws IOException { + ByteArrayOutputStream result = new ByteArrayOutputStream(); + byte[] buffer = new byte[1024]; + int length; + while ((length = inputStream.read(buffer)) != -1) { + result.write(buffer, 0, length); + } + return result.toString("utf-8"); + } + +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/ExecutionContext.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/ExecutionContext.java new file mode 100644 index 000000000..eecd763d1 --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/ExecutionContext.java @@ -0,0 +1,54 @@ +/*
+ * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.dag; + +import java.io.Serializable; +import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter; +import org.apache.hudi.integ.testsuite.dag.nodes.DagNode; +import org.apache.hudi.integ.testsuite.generator.DeltaGenerator; +import org.apache.spark.api.java.JavaSparkContext; + +/** + * This wraps the context needed for an execution of + * a {@link DagNode#execute(ExecutionContext)}. 
+ */ +public class ExecutionContext implements Serializable { + + private HoodieTestSuiteWriter hoodieTestSuiteWriter; + private DeltaGenerator deltaGenerator; + private transient JavaSparkContext jsc; + + public ExecutionContext(JavaSparkContext jsc, HoodieTestSuiteWriter hoodieTestSuiteWriter, DeltaGenerator deltaGenerator) { + this.hoodieTestSuiteWriter = hoodieTestSuiteWriter; + this.deltaGenerator = deltaGenerator; + this.jsc = jsc; + } + + public HoodieTestSuiteWriter getHoodieTestSuiteWriter() { + return hoodieTestSuiteWriter; + } + + public DeltaGenerator getDeltaGenerator() { + return deltaGenerator; + } + + public JavaSparkContext getJsc() { + return jsc; + } +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/SimpleWorkflowDagGenerator.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/SimpleWorkflowDagGenerator.java new file mode 100644 index 000000000..ad6e9cb0c --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/SimpleWorkflowDagGenerator.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.integ.testsuite.dag; + +import java.util.ArrayList; +import java.util.List; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig; +import org.apache.hudi.integ.testsuite.dag.nodes.DagNode; +import org.apache.hudi.integ.testsuite.dag.nodes.HiveQueryNode; +import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode; +import org.apache.hudi.integ.testsuite.dag.nodes.UpsertNode; + +/** + * An example of how to generate a workflow dag programmatically. This is also used as the default workflow dag if + * none is provided. + */ +public class SimpleWorkflowDagGenerator implements WorkflowDagGenerator { + + @Override + public WorkflowDag build() { + + DagNode root = new InsertNode(DeltaConfig.Config.newBuilder() + .withNumRecordsToInsert(100) + .withNumInsertPartitions(1) + .withNumTimesToRepeat(2) + .withRecordSize(1000).build()); + + DagNode child1 = new InsertNode(DeltaConfig.Config.newBuilder() + .withNumRecordsToInsert(100) + .withNumInsertPartitions(1) + .withNumTimesToRepeat(2) + .withRecordSize(1000).build()); + + root.addChildNode(child1); + + DagNode child1OfChild1 = new UpsertNode(DeltaConfig.Config.newBuilder() + .withNumRecordsToUpdate(100) + .withNumUpsertPartitions(2) + .withNumTimesToRepeat(1) + .withRecordSize(1000).build()); + + // Tests running 2 nodes in parallel + child1.addChildNode(child1OfChild1); + + List> queryAndResult = new ArrayList<>(); + queryAndResult.add(Pair.of("select " + "count(*) from testdb1.table1 group " + + "by rider having count(*) < 1", 0)); + DagNode child2OfChild1 = new HiveQueryNode(DeltaConfig.Config.newBuilder() + .withHiveQueryAndResults(queryAndResult).withHiveLocal(true).build()); + child1.addChildNode(child2OfChild1); + + List rootNodes = new ArrayList<>(); + rootNodes.add(root); + + return new WorkflowDag(rootNodes); + } + +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/WorkflowDag.java 
b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/WorkflowDag.java new file mode 100644 index 000000000..e9171fc47 --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/WorkflowDag.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.dag; + +import java.util.List; +import org.apache.hudi.integ.testsuite.dag.nodes.DagNode; + +/** + * Workflow dag that encapsulates all execute nodes. + */ +public class WorkflowDag { + + private List> nodeList; + + public WorkflowDag(List> nodeList) { + this.nodeList = nodeList; + } + + public List> getNodeList() { + return nodeList; + } + +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/WorkflowDagGenerator.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/WorkflowDagGenerator.java new file mode 100644 index 000000000..98b086e4f --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/WorkflowDagGenerator.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.dag; + +/** + * A interface which represents a workflow dag generator. + */ +public interface WorkflowDagGenerator { + + /** + * Builds a {@link WorkflowDag}. + */ + WorkflowDag build(); + +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/BulkInsertNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/BulkInsertNode.java new file mode 100644 index 000000000..7a8f40550 --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/BulkInsertNode.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.dag.nodes; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter; +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config; +import org.apache.spark.api.java.JavaRDD; + +public class BulkInsertNode extends InsertNode { + + public BulkInsertNode(Config config) { + super(config); + } + + @Override + protected JavaRDD ingest(HoodieTestSuiteWriter hoodieTestSuiteWriter, Option commitTime) + throws Exception { + log.info("Execute bulk ingest node {}", this.getName()); + return hoodieTestSuiteWriter.bulkInsert(commitTime); + } + +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/CleanNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/CleanNode.java new file mode 100644 index 000000000..7083b47cb --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/CleanNode.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
package org.apache.hudi.integ.testsuite.dag.nodes;

import org.apache.hudi.integ.testsuite.dag.ExecutionContext;

/**
 * DAG node that triggers a Hudi clean on the target table through the write client.
 */
public class CleanNode extends DagNode {

  // NOTE: unlike the other nodes, no Config is ever attached to a CleanNode.
  public CleanNode() {
  }

  /**
   * Runs the cleaner via a freshly obtained write client.
   *
   * @param executionContext carries the test-suite writer used to obtain the client.
   * @throws Exception if the clean operation fails.
   */
  @Override
  public void execute(ExecutionContext executionContext) throws Exception {
    // NOTE(review): getName() reads this.config, which this class never sets —
    // looks like a potential NPE; confirm DagNode.getName() tolerates a null config.
    log.info("Executing clean node {}", this.getName());
    executionContext.getHoodieTestSuiteWriter().getWriteClient(this).clean();
  }

}
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.dag.nodes; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config; +import org.apache.hudi.integ.testsuite.dag.ExecutionContext; +import org.apache.spark.api.java.JavaRDD; + +public class CompactNode extends DagNode> { + + public CompactNode(Config config) { + this.config = config; + } + + @Override + public void execute(ExecutionContext executionContext) throws Exception { + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(executionContext.getHoodieTestSuiteWriter().getConfiguration(), + executionContext.getHoodieTestSuiteWriter().getCfg().targetBasePath); + Option lastInstant = metaClient.getActiveTimeline() + .getCommitsAndCompactionTimeline().filterPendingCompactionTimeline().lastInstant(); + if (lastInstant.isPresent()) { + log.info("Compacting instant {}", lastInstant.get()); + this.result = executionContext.getHoodieTestSuiteWriter().compact(Option.of(lastInstant.get().getTimestamp())); + } + } + +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/DagNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/DagNode.java new file mode 100644 index 000000000..aa54dc913 --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/DagNode.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.dag.nodes; + +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.List; +import java.util.Objects; +import java.util.UUID; +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config; +import org.apache.hudi.integ.testsuite.dag.ExecutionContext; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Represents a Node in the DAG of operations for a workflow. 
+ */ +public abstract class DagNode implements Comparable> { + + protected static Logger log = LoggerFactory.getLogger(DagNode.class); + + protected List> childNodes; + protected List> parentNodes; + protected O result; + protected Config config; + private boolean isCompleted; + + public DagNode addChildNode(DagNode childNode) { + childNode.getParentNodes().add(this); + getChildNodes().add(childNode); + return this; + } + + public DagNode addParentNode(DagNode parentNode) { + if (!this.getParentNodes().contains(parentNode)) { + this.getParentNodes().add(parentNode); + } + return this; + } + + public O getResult() { + return result; + } + + public List> getChildNodes() { + if (childNodes == null) { + childNodes = new LinkedList<>(); + } + return childNodes; + } + + public List> getParentNodes() { + if (parentNodes == null) { + this.parentNodes = new ArrayList<>(); + } + return this.parentNodes; + } + + public void setParentNodes(List> parentNodes) { + this.parentNodes = parentNodes; + } + + public abstract void execute(ExecutionContext context) throws Exception; + + public boolean isCompleted() { + return isCompleted; + } + + public void setCompleted(boolean completed) { + isCompleted = completed; + } + + public Config getConfig() { + return config; + } + + public String getName() { + Object name = this.config.getOtherConfigs().get(Config.NODE_NAME); + if (name == null) { + String randomName = UUID.randomUUID().toString(); + this.config.getOtherConfigs().put(Config.NODE_NAME, randomName); + return randomName; + } + return name.toString(); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + DagNode dagNode = (DagNode) o; + return getName() == dagNode.getName(); + } + + @Override + public int hashCode() { + return Objects.hash(getName()); + } + + @Override + public int compareTo(DagNode thatNode) { + return this.hashCode() - thatNode.hashCode(); + } + +} diff 
--git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/HiveQueryNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/HiveQueryNode.java new file mode 100644 index 000000000..04f8c2e73 --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/HiveQueryNode.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
package org.apache.hudi.integ.testsuite.dag.nodes;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import org.apache.hudi.DataSourceUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
import org.apache.hudi.integ.testsuite.helpers.HiveServiceProvider;

/**
 * DAG node that syncs the dataset to Hive and validates a configured set of Hive
 * queries, each expected to return a single integer value.
 */
public class HiveQueryNode extends DagNode<Boolean> {

  private HiveServiceProvider hiveServiceProvider;

  public HiveQueryNode(DeltaConfig.Config config) {
    this.config = config;
    this.hiveServiceProvider = new HiveServiceProvider(config);
  }

  /**
   * Starts/syncs the local Hive service if needed, applies the configured hive
   * properties, then runs each configured query and compares its first column of the
   * first row against the expected value.
   *
   * @throws AssertionError if a query result does not match its expected value.
   * @throws Exception      on connection/sync failures.
   */
  @Override
  public void execute(ExecutionContext executionContext) throws Exception {
    log.info("Executing hive query node {}", this.getName());
    this.hiveServiceProvider.startLocalHiveServiceIfNeeded(executionContext.getHoodieTestSuiteWriter().getConfiguration());
    HiveSyncConfig hiveSyncConfig = DataSourceUtils
        .buildHiveSyncConfig(executionContext.getHoodieTestSuiteWriter().getDeltaStreamerWrapper()
                .getDeltaSyncService().getDeltaSync().getProps(),
            executionContext.getHoodieTestSuiteWriter().getDeltaStreamerWrapper()
                .getDeltaSyncService().getDeltaSync().getCfg().targetBasePath,
            executionContext.getHoodieTestSuiteWriter().getDeltaStreamerWrapper()
                .getDeltaSyncService().getDeltaSync().getCfg().baseFileFormat);
    this.hiveServiceProvider.syncToLocalHiveIfNeeded(executionContext.getHoodieTestSuiteWriter());
    // Fix: the original never closed the Connection/Statement/ResultSet (resource
    // leak) and never stopped the local Hive service when a validation failed.
    try (Connection con = DriverManager.getConnection(hiveSyncConfig.jdbcUrl, hiveSyncConfig.hiveUser,
        hiveSyncConfig.hivePass);
        Statement stmt = con.createStatement()) {
      stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat");
      for (String hiveProperty : this.config.getHiveProperties()) {
        executeStatement(hiveProperty, stmt);
      }
      for (Pair<String, Integer> queryAndResult : this.config.getHiveQueries()) {
        log.info("Running {}", queryAndResult.getLeft());
        try (ResultSet res = stmt.executeQuery(queryAndResult.getLeft())) {
          if (!res.next()) {
            log.info("res.next() was False - typically this means the query returned no rows.");
            // Fix: this check used a bare `assert`, which is a no-op unless the JVM
            // runs with -ea; throw explicitly, matching the non-empty-result branch.
            if (!Integer.valueOf(0).equals(queryAndResult.getRight())) {
              throw new AssertionError(
                  "QUERY: " + queryAndResult.getLeft()
                      + " | EXPECTED RESULT = " + queryAndResult.getRight()
                      + " | ACTUAL RESULT = no rows");
            }
          } else {
            Integer result = res.getInt(1);
            if (!queryAndResult.getRight().equals(result)) {
              throw new AssertionError(
                  "QUERY: " + queryAndResult.getLeft()
                      + " | EXPECTED RESULT = " + queryAndResult.getRight()
                      + " | ACTUAL RESULT = " + result
              );
            }
          }
        }
        log.info("Successfully validated query!");
      }
    } finally {
      this.hiveServiceProvider.stopLocalHiveServiceIfNeeded();
    }
  }

  private void executeStatement(String query, Statement stmt) throws SQLException {
    // Fix: the original logged stmt.toString() (the JDBC object), not the statement text.
    log.info("Executing statement {}", query);
    stmt.execute(query);
  }

}
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.dag.nodes; + +import org.apache.hudi.integ.testsuite.helpers.HiveServiceProvider; +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config; +import org.apache.hudi.integ.testsuite.dag.ExecutionContext; + +public class HiveSyncNode extends DagNode { + + private HiveServiceProvider hiveServiceProvider; + + public HiveSyncNode(Config config) { + this.config = config; + this.hiveServiceProvider = new HiveServiceProvider(config); + } + + @Override + public void execute(ExecutionContext executionContext) throws Exception { + log.info("Executing hive sync node"); + this.hiveServiceProvider.startLocalHiveServiceIfNeeded(executionContext.getHoodieTestSuiteWriter().getConfiguration()); + this.hiveServiceProvider.syncToLocalHiveIfNeeded(executionContext.getHoodieTestSuiteWriter()); + executionContext.getHoodieTestSuiteWriter().getDeltaStreamerWrapper().getDeltaSyncService().getDeltaSync().syncHive(); + this.hiveServiceProvider.stopLocalHiveServiceIfNeeded(); + } + + public HiveServiceProvider getHiveServiceProvider() { + return hiveServiceProvider; + } +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/InsertNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/InsertNode.java new file mode 100644 index 000000000..23cb28574 --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/InsertNode.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.dag.nodes; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter; +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config; +import org.apache.hudi.integ.testsuite.dag.ExecutionContext; +import org.apache.hudi.integ.testsuite.generator.DeltaGenerator; +import org.apache.spark.api.java.JavaRDD; + +public class InsertNode extends DagNode> { + + public InsertNode(Config config) { + this.config = config; + } + + @Override + public void execute(ExecutionContext executionContext) throws Exception { + generate(executionContext.getDeltaGenerator()); + log.info("Configs : {}", this.config); + if (!config.isDisableIngest()) { + log.info("Inserting input data {}", this.getName()); + Option commitTime = executionContext.getHoodieTestSuiteWriter().startCommit(); + JavaRDD writeStatus = ingest(executionContext.getHoodieTestSuiteWriter(), commitTime); + executionContext.getHoodieTestSuiteWriter().commit(writeStatus, commitTime); + this.result = writeStatus; + } + } + + protected void generate(DeltaGenerator deltaGenerator) throws Exception { + if (!config.isDisableGenerate()) { + log.info("Generating input data for node {}", this.getName()); + deltaGenerator.writeRecords(deltaGenerator.generateInserts(config)).count(); + } + } + + protected JavaRDD ingest(HoodieTestSuiteWriter hoodieTestSuiteWriter, + Option commitTime) throws Exception { + return hoodieTestSuiteWriter.insert(commitTime); + } + +} diff --git 
a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/RollbackNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/RollbackNode.java new file mode 100644 index 000000000..b6d828a10 --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/RollbackNode.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.integ.testsuite.dag.nodes; + +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config; +import org.apache.hudi.integ.testsuite.dag.ExecutionContext; + +public class RollbackNode extends DagNode> { + + public RollbackNode(Config config) { + this.config = config; + } + + @Override + public void execute(ExecutionContext executionContext) throws Exception { + log.info("Executing rollback node {}", this.getName()); + // Can only be done with an instantiation of a new WriteClient hence cannot be done during DeltaStreamer + // testing for now + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(executionContext.getHoodieTestSuiteWriter().getConfiguration(), + executionContext.getHoodieTestSuiteWriter().getCfg().targetBasePath); + Option lastInstant = metaClient.getActiveTimeline().getCommitsTimeline().lastInstant(); + if (lastInstant.isPresent()) { + log.info("Rolling back last instant {}", lastInstant.get()); + executionContext.getHoodieTestSuiteWriter().getWriteClient(this).rollback(lastInstant.get().getTimestamp()); + this.result = lastInstant; + } + } + +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ScheduleCompactNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ScheduleCompactNode.java new file mode 100644 index 000000000..93502aee1 --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ScheduleCompactNode.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.dag.nodes; + +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config; +import org.apache.hudi.integ.testsuite.dag.ExecutionContext; + +public class ScheduleCompactNode extends DagNode> { + + public ScheduleCompactNode(Config config) { + this.config = config; + } + + @Override + public void execute(ExecutionContext executionContext) throws Exception { + log.info("Executing schedule compact node {}", this.getName()); + // Can only be done with an instantiation of a new WriteClient hence cannot be done during DeltaStreamer + // testing for now + // Find the last commit and extra the extra metadata to be passed to the schedule compaction. 
This is + // done to ensure the CHECKPOINT is correctly passed from commit to commit + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(executionContext.getHoodieTestSuiteWriter().getConfiguration(), + executionContext.getHoodieTestSuiteWriter().getCfg().targetBasePath); + Option lastInstant = metaClient.getActiveTimeline().getCommitsTimeline().lastInstant(); + if (lastInstant.isPresent()) { + HoodieCommitMetadata metadata = org.apache.hudi.common.model.HoodieCommitMetadata.fromBytes(metaClient + .getActiveTimeline().getInstantDetails(lastInstant.get()).get(), HoodieCommitMetadata.class); + Option scheduledInstant = executionContext.getHoodieTestSuiteWriter().scheduleCompaction(Option.of(metadata + .getExtraMetadata())); + if (scheduledInstant.isPresent()) { + log.info("Scheduling compaction instant {}", scheduledInstant.get()); + } + this.result = scheduledInstant; + } + } + +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/SparkSQLQueryNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/SparkSQLQueryNode.java new file mode 100644 index 000000000..a8a1b72bb --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/SparkSQLQueryNode.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.dag.nodes; + +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.integ.testsuite.helpers.HiveServiceProvider; +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config; +import org.apache.hudi.integ.testsuite.dag.ExecutionContext; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; + +public class SparkSQLQueryNode extends DagNode { + + HiveServiceProvider hiveServiceProvider; + + public SparkSQLQueryNode(Config config) { + this.config = config; + this.hiveServiceProvider = new HiveServiceProvider(config); + } + + @Override + public void execute(ExecutionContext executionContext) throws Exception { + log.info("Executing spark sql query node"); + this.hiveServiceProvider.startLocalHiveServiceIfNeeded(executionContext.getHoodieTestSuiteWriter().getConfiguration()); + this.hiveServiceProvider.syncToLocalHiveIfNeeded(executionContext.getHoodieTestSuiteWriter()); + SparkSession session = SparkSession.builder().sparkContext(executionContext.getJsc().sc()).getOrCreate(); + for (String hiveProperty : this.config.getHiveProperties()) { + session.sql(hiveProperty).count(); + } + for (Pair queryAndResult : this.config.getHiveQueries()) { + log.info("Running {}", queryAndResult.getLeft()); + Dataset res = session.sql(queryAndResult.getLeft()); + if (res.count() == 0) { + assert 0 == queryAndResult.getRight(); + } else { + assert ((Row[]) res.collect())[0].getInt(0) == queryAndResult.getRight(); + } + log.info("Successfully validated query!"); + } + this.hiveServiceProvider.stopLocalHiveServiceIfNeeded(); + this.result = true; + } + +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/UpsertNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/UpsertNode.java 
new file mode 100644 index 000000000..3872f767b --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/UpsertNode.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.dag.nodes; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter; +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config; +import org.apache.hudi.integ.testsuite.generator.DeltaGenerator; +import org.apache.spark.api.java.JavaRDD; + +public class UpsertNode extends InsertNode { + + public UpsertNode(Config config) { + super(config); + } + + @Override + protected void generate(DeltaGenerator deltaGenerator) throws Exception { + if (!config.isDisableGenerate()) { + log.info("Generating input data {}", this.getName()); + deltaGenerator.writeRecords(deltaGenerator.generateUpdates(config)).count(); + } + } + + @Override + protected JavaRDD ingest(HoodieTestSuiteWriter hoodieTestSuiteWriter, Option commitTime) + throws Exception { + if (!config.isDisableIngest()) { + log.info("Upserting input data {}", this.getName()); + this.result = 
hoodieTestSuiteWriter.upsert(commitTime); + } + return this.result; + } + +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ValidateNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ValidateNode.java new file mode 100644 index 000000000..5ba0249a6 --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ValidateNode.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
package org.apache.hudi.integ.testsuite.dag.nodes;

import java.util.List;
import java.util.function.Function;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;

/**
 * DAG node that applies a user-supplied validation function to this node's parents.
 *
 * @param <R> result type produced by the validation function.
 */
public class ValidateNode<R> extends DagNode<R> {

  protected Function<List<DagNode>, R> function;

  public ValidateNode(Config config, Function<List<DagNode>, R> function) {
    this.function = function;
    this.config = config;
  }

  /**
   * Runs the validation function over the parent nodes. Unless the WAIT_FOR_PARENTS
   * config is explicitly set to false, every parent must already be complete.
   *
   * @throws RuntimeException if a parent has not finished and waiting is required.
   */
  @Override
  public void execute(ExecutionContext executionContext) {
    boolean mustWaitForParents =
        (Boolean) this.config.getOtherConfigs().getOrDefault("WAIT_FOR_PARENTS", true);
    if (!this.getParentNodes().isEmpty() && mustWaitForParents) {
      for (DagNode parent : (List<DagNode>) this.getParentNodes()) {
        if (!parent.isCompleted()) {
          throw new RuntimeException("cannot validate before parent nodes are complete");
        }
      }
    }
    this.result = this.function.apply((List<DagNode>) this.getParentNodes());
  }

}
package org.apache.hudi.integ.testsuite.dag.scheduler;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.PriorityQueue;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
import org.apache.hudi.integ.testsuite.dag.WorkflowDag;
import org.apache.hudi.integ.testsuite.generator.DeltaGenerator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Walks a {@link WorkflowDag} level by level: nodes of the same level run in parallel
 * on a small thread pool, and a level must finish before its children are scheduled.
 */
public class DagScheduler {

  private static Logger log = LoggerFactory.getLogger(DagScheduler.class);
  private WorkflowDag workflowDag;
  private ExecutionContext executionContext;

  public DagScheduler(WorkflowDag workflowDag, HoodieTestSuiteWriter hoodieTestSuiteWriter, DeltaGenerator deltaGenerator) {
    this.workflowDag = workflowDag;
    this.executionContext = new ExecutionContext(null, hoodieTestSuiteWriter, deltaGenerator);
  }

  /**
   * Runs the whole DAG on a 2-thread pool and shuts the pool down afterwards.
   *
   * @throws Exception if any node fails (propagated from {@link #execute}).
   */
  public void schedule() throws Exception {
    ExecutorService service = Executors.newFixedThreadPool(2);
    try {
      execute(service, workflowDag.getNodeList());
      service.shutdown();
      // Fix: the original never awaited termination, so in-flight tasks could be
      // abandoned silently; all futures were already joined, so this returns quickly.
      service.awaitTermination(1, TimeUnit.MINUTES);
    } finally {
      if (!service.isShutdown()) {
        log.info("Forcing shutdown of executor service, this might kill running tasks");
        service.shutdownNow();
      }
    }
  }

  /**
   * Breadth-first execution: drains the current level into the pool, waits for every
   * future, then enqueues the accumulated child nodes as the next level.
   */
  private void execute(ExecutorService service, List<DagNode> nodes) throws Exception {
    // Nodes at the same level are executed in parallel
    Queue<DagNode> queue = new PriorityQueue<>(nodes);
    log.info("Running workloads");
    do {
      List<Future> futures = new ArrayList<>();
      Set<DagNode> childNodes = new HashSet<>();
      while (queue.size() > 0) {
        DagNode nodeToExecute = queue.poll();
        futures.add(service.submit(() -> executeNode(nodeToExecute)));
        if (nodeToExecute.getChildNodes().size() > 0) {
          childNodes.addAll(nodeToExecute.getChildNodes());
        }
      }
      queue.addAll(childNodes);
      childNodes.clear();
      for (Future future : futures) {
        future.get(1, TimeUnit.HOURS);
      }
    } while (queue.size() > 0);
    log.info("Finished workloads");
  }

  /**
   * Executes a single node, marking it complete on success.
   *
   * @throws HoodieException wrapping any failure from the node.
   */
  private void executeNode(DagNode node) {
    if (node.isCompleted()) {
      throw new RuntimeException("DagNode already completed! Cannot re-execute");
    }
    try {
      node.execute(executionContext);
      node.setCompleted(true);
      log.info("Finished executing {}", node.getName());
    } catch (Exception e) {
      // Fix: the original logged only a fixed message, hiding the failure cause;
      // pass the throwable so the stack trace reaches the logs.
      log.error("Exception executing node", e);
      throw new HoodieException(e);
    }
  }
}
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.generator; + +import java.io.IOException; +import java.io.Serializable; +import java.io.UncheckedIOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.stream.StreamSupport; +import org.apache.avro.generic.GenericRecord; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.integ.testsuite.converter.Converter; +import org.apache.hudi.integ.testsuite.converter.UpdateConverter; +import org.apache.hudi.integ.testsuite.reader.DFSAvroDeltaInputReader; +import org.apache.hudi.integ.testsuite.reader.DFSHoodieDatasetInputReader; +import org.apache.hudi.integ.testsuite.reader.DeltaInputReader; +import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode; +import org.apache.hudi.integ.testsuite.writer.DeltaWriteStats; +import org.apache.hudi.integ.testsuite.writer.DeltaWriterAdapter; +import org.apache.hudi.integ.testsuite.writer.DeltaWriterFactory; +import org.apache.hudi.keygen.ComplexKeyGenerator; +import org.apache.hudi.keygen.KeyGenerator; +import org.apache.hudi.keygen.SimpleKeyGenerator; +import org.apache.hudi.integ.testsuite.configuration.DFSDeltaConfig; +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig; +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; 
+import org.apache.spark.storage.StorageLevel; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import scala.Tuple2; + +/** + * The delta generator generates all types of workloads (insert, update) for the given configs. + */ +public class DeltaGenerator implements Serializable { + + private static Logger log = LoggerFactory.getLogger(DFSHoodieDatasetInputReader.class); + + private DeltaConfig deltaOutputConfig; + private transient JavaSparkContext jsc; + private transient SparkSession sparkSession; + private String schemaStr; + private List recordRowKeyFieldNames; + private List partitionPathFieldNames; + private int batchId; + + public DeltaGenerator(DeltaConfig deltaOutputConfig, JavaSparkContext jsc, SparkSession sparkSession, + String schemaStr, + KeyGenerator keyGenerator) { + this.deltaOutputConfig = deltaOutputConfig; + this.jsc = jsc; + this.sparkSession = sparkSession; + this.schemaStr = schemaStr; + this.recordRowKeyFieldNames = keyGenerator instanceof ComplexKeyGenerator ? ((ComplexKeyGenerator) keyGenerator) + .getRecordKeyFields() : Arrays.asList(((SimpleKeyGenerator) keyGenerator).getRecordKeyField()); + this.partitionPathFieldNames = keyGenerator instanceof ComplexKeyGenerator ? 
((ComplexKeyGenerator) keyGenerator) + .getPartitionPathFields() : Arrays.asList(((SimpleKeyGenerator) keyGenerator).getPartitionPathField()); + } + + public JavaRDD writeRecords(JavaRDD records) { + // The following creates a new anonymous function for iterator and hence results in serialization issues + JavaRDD ws = records.mapPartitions(itr -> { + try { + DeltaWriterAdapter deltaWriterAdapter = DeltaWriterFactory + .getDeltaWriterAdapter(deltaOutputConfig, batchId); + return Collections.singletonList(deltaWriterAdapter.write(itr)).iterator(); + } catch (IOException io) { + throw new UncheckedIOException(io); + } + }).flatMap(List::iterator); + batchId++; + return ws; + } + + public JavaRDD generateInserts(Config operation) { + long recordsPerPartition = operation.getNumRecordsInsert(); + int minPayloadSize = operation.getRecordSize(); + JavaRDD inputBatch = jsc.parallelize(Collections.EMPTY_LIST) + .repartition(operation.getNumInsertPartitions()).mapPartitions(p -> { + return new LazyRecordGeneratorIterator(new FlexibleSchemaRecordGenerationIterator(recordsPerPartition, + minPayloadSize, schemaStr, partitionPathFieldNames)); + }); + return inputBatch; + } + + public JavaRDD generateUpdates(Config config) throws IOException { + if (deltaOutputConfig.getDeltaOutputMode() == DeltaOutputMode.DFS) { + JavaRDD inserts = null; + if (config.getNumRecordsInsert() > 0) { + inserts = generateInserts(config); + } + DeltaInputReader deltaInputReader = null; + JavaRDD adjustedRDD = null; + if (config.getNumUpsertPartitions() < 1) { + // randomly generate updates for a given number of records without regard to partitions and files + deltaInputReader = new DFSAvroDeltaInputReader(sparkSession, schemaStr, + ((DFSDeltaConfig) deltaOutputConfig).getDeltaBasePath(), Option.empty(), Option.empty()); + adjustedRDD = deltaInputReader.read(config.getNumRecordsUpsert()); + adjustedRDD = adjustRDDToGenerateExactNumUpdates(adjustedRDD, jsc, config.getNumRecordsUpsert()); + } else { + 
deltaInputReader = + new DFSHoodieDatasetInputReader(jsc, ((DFSDeltaConfig) deltaOutputConfig).getDatasetOutputPath(), + schemaStr); + if (config.getFractionUpsertPerFile() > 0) { + adjustedRDD = deltaInputReader.read(config.getNumUpsertPartitions(), config.getNumUpsertFiles(), + config.getFractionUpsertPerFile()); + } else { + adjustedRDD = deltaInputReader.read(config.getNumUpsertPartitions(), config.getNumUpsertFiles(), config + .getNumRecordsUpsert()); + } + } + log.info("Repartitioning records"); + // persist this since we will make multiple passes over this + adjustedRDD = adjustedRDD.repartition(jsc.defaultParallelism()); + log.info("Repartitioning records done"); + Converter converter = new UpdateConverter(schemaStr, config.getRecordSize(), + partitionPathFieldNames, recordRowKeyFieldNames); + JavaRDD updates = converter.convert(adjustedRDD); + log.info("Records converted"); + updates.persist(StorageLevel.DISK_ONLY()); + return inserts != null ? inserts.union(updates) : updates; + // TODO : Generate updates for only N partitions. 
+ } else { + throw new IllegalArgumentException("Other formats are not supported at the moment"); + } + } + + public Map getPartitionToCountMap(JavaRDD records) { + // Requires us to keep the partitioner the same + return records.mapPartitionsWithIndex((index, itr) -> { + Iterable newIterable = () -> itr; + // parallelize counting for speed + long count = StreamSupport.stream(newIterable.spliterator(), true).count(); + return Arrays.asList(new Tuple2<>(index, count)).iterator(); + }, true).mapToPair(i -> i).collectAsMap(); + } + + public Map getAdjustedPartitionsCount(Map partitionCountMap, long + recordsToRemove) { + long remainingRecordsToRemove = recordsToRemove; + Iterator> iterator = partitionCountMap.entrySet().iterator(); + Map adjustedPartitionCountMap = new HashMap<>(); + while (iterator.hasNext()) { + Map.Entry entry = iterator.next(); + if (entry.getValue() < remainingRecordsToRemove) { + remainingRecordsToRemove -= entry.getValue(); + adjustedPartitionCountMap.put(entry.getKey(), 0L); + } else { + long newValue = entry.getValue() - remainingRecordsToRemove; + remainingRecordsToRemove = 0; + adjustedPartitionCountMap.put(entry.getKey(), newValue); + } + if (remainingRecordsToRemove == 0) { + break; + } + } + return adjustedPartitionCountMap; + } + + public JavaRDD adjustRDDToGenerateExactNumUpdates(JavaRDD updates, JavaSparkContext + jsc, long totalRecordsRequired) { + Map actualPartitionCountMap = getPartitionToCountMap(updates); + long totalRecordsGenerated = actualPartitionCountMap.values().stream().mapToLong(Long::longValue).sum(); + if (isSafeToTake(totalRecordsRequired, totalRecordsGenerated)) { + // Generate totalRecordsRequired - totalRecordsGenerated new records and union the RDD's + // NOTE : This performs poorly when totalRecordsRequired >> totalRecordsGenerated. 
Hence, always + // ensure that enough inserts are created before hand (this needs to be noted during the WorkflowDag creation) + long sizeOfUpdateRDD = totalRecordsGenerated; + while (totalRecordsRequired != sizeOfUpdateRDD) { + long recordsToTake = (totalRecordsRequired - sizeOfUpdateRDD) > sizeOfUpdateRDD + ? sizeOfUpdateRDD : (totalRecordsRequired - sizeOfUpdateRDD); + if ((totalRecordsRequired - sizeOfUpdateRDD) > recordsToTake && recordsToTake <= sizeOfUpdateRDD) { + updates = updates.union(updates); + sizeOfUpdateRDD *= 2; + } else { + List remainingUpdates = updates.take((int) (recordsToTake)); + updates = updates.union(jsc.parallelize(remainingUpdates)); + sizeOfUpdateRDD = sizeOfUpdateRDD + recordsToTake; + } + } + return updates; + } else if (totalRecordsRequired < totalRecordsGenerated) { + final Map adjustedPartitionCountMap = getAdjustedPartitionsCount(actualPartitionCountMap, + totalRecordsGenerated - totalRecordsRequired); + // limit counts across partitions to meet the exact number of updates required + JavaRDD trimmedRecords = updates.mapPartitionsWithIndex((index, itr) -> { + int counter = 1; + List entriesToKeep = new ArrayList<>(); + if (!adjustedPartitionCountMap.containsKey(index)) { + return itr; + } else { + long recordsToKeepForThisPartition = adjustedPartitionCountMap.get(index); + while (counter <= recordsToKeepForThisPartition && itr.hasNext()) { + entriesToKeep.add(itr.next()); + counter++; + } + return entriesToKeep.iterator(); + } + }, true); + return trimmedRecords; + } + return updates; + } + + private boolean isSafeToTake(long totalRecords, long totalRecordsGenerated) { + // TODO : Ensure that the difference between totalRecords and totalRecordsGenerated is not too big, if yes, + // then there are fewer number of records on disk, hence we need to find another way to generate updates when + // requiredUpdates >> insertedRecords + return totalRecords > totalRecordsGenerated; + } + +} diff --git 
a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/FlexibleSchemaRecordGenerationIterator.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/FlexibleSchemaRecordGenerationIterator.java new file mode 100644 index 000000000..614add52c --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/FlexibleSchemaRecordGenerationIterator.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.generator; + +import java.util.Iterator; +import java.util.List; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; + +/** + * A GenericRecordGeneratorIterator for the custom schema of the workload. Implements {@link Iterator} to allow for + * iteration semantics. + */ +public class FlexibleSchemaRecordGenerationIterator implements Iterator { + + // Stores how many records to generate as part of this iterator. Ideally, one iterator is started per spark + // partition. 
+ private long counter; + // Use the full payload generator as default + private GenericRecordFullPayloadGenerator generator; + // Store last record for the partition path of the first payload to be used for all subsequent generated payloads + private GenericRecord lastRecord; + // Partition path field name + private List partitionPathFieldNames; + + public FlexibleSchemaRecordGenerationIterator(long maxEntriesToProduce, String schema) { + this(maxEntriesToProduce, GenericRecordFullPayloadGenerator.DEFAULT_PAYLOAD_SIZE, schema, null); + } + + public FlexibleSchemaRecordGenerationIterator(long maxEntriesToProduce, int minPayloadSize, String schemaStr, + List partitionPathFieldNames) { + this.counter = maxEntriesToProduce; + this.partitionPathFieldNames = partitionPathFieldNames; + Schema schema = new Schema.Parser().parse(schemaStr); + this.generator = new GenericRecordFullPayloadGenerator(schema, minPayloadSize); + } + + @Override + public boolean hasNext() { + return this.counter > 0; + } + + @Override + public GenericRecord next() { + this.counter--; + if (lastRecord == null) { + GenericRecord record = this.generator.getNewPayload(); + lastRecord = record; + return record; + } else { + return this.generator.randomize(lastRecord, this.partitionPathFieldNames); + } + } +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/GenericRecordFullPayloadGenerator.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/GenericRecordFullPayloadGenerator.java new file mode 100644 index 000000000..df1a4c7cd --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/GenericRecordFullPayloadGenerator.java @@ -0,0 +1,285 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.generator; + +import java.io.Serializable; +import java.nio.ByteBuffer; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.UUID; +import org.apache.avro.LogicalType; +import org.apache.avro.LogicalTypes; +import org.apache.avro.Schema; +import org.apache.avro.Schema.Type; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericData.Fixed; +import org.apache.avro.generic.GenericFixed; +import org.apache.avro.generic.GenericRecord; +import org.apache.hudi.common.util.collection.Pair; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * This is a GenericRecord payload generator that generates full generic records {@link GenericRecord}. + * Every field of a generic record created using this generator contains a random value. 
+ */ +public class GenericRecordFullPayloadGenerator implements Serializable { + + public static final int DEFAULT_PAYLOAD_SIZE = 1024 * 10; // 10 KB + private static Logger log = LoggerFactory.getLogger(GenericRecordFullPayloadGenerator.class); + protected final Random random = new Random(); + // The source schema used to generate a payload + private final transient Schema baseSchema; + // Used to validate a generic record + private final transient GenericData genericData = new GenericData(); + // Number of more bytes to add based on the estimated full record payload size and min payload size + private int numberOfBytesToAdd; + // If more elements should be packed to meet the minPayloadSize + private boolean shouldAddMore; + // How many complex fields have we visited that can help us pack more entries and increase the size of the record + private int numberOfComplexFields; + // The size of a full record where every field of a generic record created contains 1 random value + private int estimatedFullPayloadSize; + // LogicalTypes in Avro 1.8.2 + private static final String DECIMAL = "decimal"; + private static final String UUID_NAME = "uuid"; + private static final String DATE = "date"; + private static final String TIME_MILLIS = "time-millis"; + private static final String TIME_MICROS = "time-micros"; + private static final String TIMESTAMP_MILLIS = "timestamp-millis"; + private static final String TIMESTAMP_MICROS = "timestamp-micros"; + + public GenericRecordFullPayloadGenerator(Schema schema) { + this(schema, DEFAULT_PAYLOAD_SIZE); + } + + public GenericRecordFullPayloadGenerator(Schema schema, int minPayloadSize) { + Pair sizeInfo = new GenericRecordFullPayloadSizeEstimator(schema) + .typeEstimateAndNumComplexFields(); + this.estimatedFullPayloadSize = sizeInfo.getLeft(); + this.numberOfComplexFields = sizeInfo.getRight(); + this.baseSchema = schema; + this.shouldAddMore = estimatedFullPayloadSize < minPayloadSize; + if (this.shouldAddMore) { + 
this.numberOfBytesToAdd = minPayloadSize - estimatedFullPayloadSize; + if (numberOfComplexFields < 1) { + log.warn("The schema does not have any collections/complex fields. Cannot achieve minPayloadSize : {}", + minPayloadSize); + } + } + } + + protected static boolean isPrimitive(Schema localSchema) { + if (localSchema.getType() != Type.ARRAY + && localSchema.getType() != Type.MAP + && localSchema.getType() != Type.RECORD + && localSchema.getType() != Type.UNION) { + return true; + } else { + return false; + } + } + + public GenericRecord getNewPayload() { + return convert(baseSchema); + } + + public GenericRecord getUpdatePayload(GenericRecord record, List blacklistFields) { + return randomize(record, blacklistFields); + } + + protected GenericRecord convert(Schema schema) { + GenericRecord result = new GenericData.Record(schema); + for (Schema.Field f : schema.getFields()) { + result.put(f.name(), typeConvert(f.schema())); + } + return result; + } + + protected GenericRecord convertPartial(Schema schema) { + GenericRecord result = new GenericData.Record(schema); + for (Schema.Field f : schema.getFields()) { + boolean setNull = random.nextBoolean(); + if (!setNull) { + result.put(f.name(), typeConvert(f.schema())); + } else { + result.put(f.name(), null); + } + } + // TODO : pack remaining bytes into a complex field + return result; + } + + protected GenericRecord randomize(GenericRecord record, List blacklistFields) { + for (Schema.Field f : record.getSchema().getFields()) { + if (blacklistFields == null || !blacklistFields.contains(f.name())) { + record.put(f.name(), typeConvert(f.schema())); + } + } + return record; + } + + private Object typeConvert(Schema schema) { + Schema localSchema = schema; + if (isOption(schema)) { + localSchema = getNonNull(schema); + } + switch (localSchema.getType()) { + case BOOLEAN: + return random.nextBoolean(); + case DOUBLE: + return random.nextDouble(); + case FLOAT: + return random.nextFloat(); + case INT: + return 
random.nextInt(); + case LONG: + return random.nextLong(); + case STRING: + return UUID.randomUUID().toString(); + case ENUM: + List enumSymbols = localSchema.getEnumSymbols(); + return new GenericData.EnumSymbol(localSchema, enumSymbols.get(random.nextInt(enumSymbols.size() - 1))); + case RECORD: + return convert(localSchema); + case ARRAY: + Schema elementSchema = localSchema.getElementType(); + List listRes = new ArrayList(); + if (isPrimitive(elementSchema) && this.shouldAddMore) { + int numEntriesToAdd = numEntriesToAdd(elementSchema); + while (numEntriesToAdd > 0) { + listRes.add(typeConvert(elementSchema)); + numEntriesToAdd--; + } + } else { + listRes.add(typeConvert(elementSchema)); + } + return listRes; + case MAP: + Schema valueSchema = localSchema.getValueType(); + Map mapRes = new HashMap(); + if (isPrimitive(valueSchema) && this.shouldAddMore) { + int numEntriesToAdd = numEntriesToAdd(valueSchema); + while (numEntriesToAdd > 0) { + mapRes.put(UUID.randomUUID().toString(), typeConvert(valueSchema)); + numEntriesToAdd--; + } + } else { + mapRes.put(UUID.randomUUID().toString(), typeConvert(valueSchema)); + } + return mapRes; + case BYTES: + return ByteBuffer.wrap(UUID.randomUUID().toString().getBytes(Charset.defaultCharset())); + case FIXED: + return generateFixedType(localSchema); + default: + throw new IllegalArgumentException( + "Cannot handle type: " + localSchema.getType()); + } + } + + private Object generateFixedType(Schema localSchema) { + // TODO: Need to implement valid data generation for fixed type + GenericFixed genericFixed = new GenericData.Fixed(localSchema); + switch (localSchema.getLogicalType().getName()) { + case UUID_NAME: + ((Fixed) genericFixed).bytes(UUID.randomUUID().toString().getBytes()); + return genericFixed; + case DECIMAL: + return genericFixed; + case DATE: + return genericFixed; + case TIME_MILLIS: + return genericFixed; + default: + throw new IllegalArgumentException( + "Cannot handle type: " + 
localSchema.getLogicalType()); + } + } + + public boolean validate(GenericRecord record) { + return genericData.validate(baseSchema, record); + } + + protected boolean isOption(Schema schema) { + return schema.getType().equals(Schema.Type.UNION) + && schema.getTypes().size() == 2 + && (schema.getTypes().get(0).getType().equals(Schema.Type.NULL) + || schema.getTypes().get(1).getType().equals(Schema.Type.NULL)); + } + + protected Schema getNonNull(Schema schema) { + List types = schema.getTypes(); + return types.get(0).getType().equals(Schema.Type.NULL) ? types.get(1) : types.get(0); + } + + public int getEstimatedFullPayloadSize() { + return estimatedFullPayloadSize; + } + + private int getSize(Schema elementSchema) { + switch (elementSchema.getType()) { + case BOOLEAN: + return 1; + case DOUBLE: + return Double.BYTES; + case FLOAT: + return Float.BYTES; + case INT: + return Integer.BYTES; + case LONG: + return Long.BYTES; + case STRING: + return UUID.randomUUID().toString().length(); + case ENUM: + return 1; + case BYTES: + return UUID.randomUUID().toString().length(); + case FIXED: + return elementSchema.getFixedSize(); + default: + throw new RuntimeException("Unknown type " + elementSchema.getType()); + } + } + + private int numEntriesToAdd(Schema elementSchema) { + // Find the size of the primitive data type in bytes + int primitiveDataTypeSize = getSize(elementSchema); + int numEntriesToAdd = numberOfBytesToAdd / primitiveDataTypeSize; + // If more than 10 entries are being added for this same complex field and there are still more complex fields to + // be visited in the schema, reduce the number of entries to add by a factor of 10 to allow for other complex + // fields to pack some entries + if (numEntriesToAdd % 10 > 0 && this.numberOfComplexFields > 1) { + numEntriesToAdd = numEntriesToAdd / 10; + numberOfBytesToAdd -= numEntriesToAdd * primitiveDataTypeSize; + this.shouldAddMore = true; + } else { + this.numberOfBytesToAdd = 0; + this.shouldAddMore = 
false; + } + this.numberOfComplexFields -= 1; + return numEntriesToAdd; + } +} + diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/GenericRecordFullPayloadSizeEstimator.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/GenericRecordFullPayloadSizeEstimator.java new file mode 100644 index 000000000..c6a8f4e31 --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/GenericRecordFullPayloadSizeEstimator.java @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.generator; + +import java.io.Serializable; +import java.util.List; +import java.util.UUID; +import java.util.concurrent.atomic.AtomicInteger; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.hudi.common.util.collection.Pair; + +/** + * This is a GenericRecord payload estimator estimates the size of a full generic record {@link GenericRecord}. 
+ * A full record is defined as "Every field of a generic record created contains 1 random value" + */ +public class GenericRecordFullPayloadSizeEstimator implements Serializable { + + private final transient Schema baseSchema; + + // This variable is used to track the number of complex/collection fields with primitive data types at their leaf. + // This is used to figure out how many entries can be packed in such a collection field to meet the min payload + // size requested + private final transient AtomicInteger counter = new AtomicInteger(0); + + public GenericRecordFullPayloadSizeEstimator(Schema schema) { + this.baseSchema = schema; + } + + public Pair typeEstimateAndNumComplexFields() { + int size = estimate(baseSchema); + return Pair.of(size, counter.get()); + } + + /** + * This method estimates the size of the payload if all entries of this payload were populated with one value. + * For eg. A primitive data type such as String will be populated with {@link UUID} so the length if 36 bytes + * whereas a complex data type such as an Array of type Int, will be populated with exactly 1 Integer value. 
+ */ + protected int estimate(Schema schema) { + long size = 0; + for (Schema.Field f : schema.getFields()) { + size += typeEstimate(f.schema()); + } + return (int) size; + } + + private long typeEstimate(Schema schema) { + Schema localSchema = schema; + if (isOption(schema)) { + localSchema = getNonNull(schema); + } + switch (localSchema.getType()) { + case BOOLEAN: + return 1; + case DOUBLE: + return 8; + case FLOAT: + return 4; + case INT: + return 4; + case LONG: + return 8; + case STRING: + return UUID.randomUUID().toString().length(); + case ENUM: + return 1; + case RECORD: + return estimate(localSchema); + case ARRAY: + if (GenericRecordFullPayloadGenerator.isPrimitive(localSchema.getElementType())) { + counter.addAndGet(1); + } + Schema elementSchema = localSchema.getElementType(); + return typeEstimate(elementSchema); + case MAP: + if (GenericRecordFullPayloadGenerator.isPrimitive(localSchema.getValueType())) { + counter.addAndGet(1); + } + Schema valueSchema = localSchema.getValueType(); + return UUID.randomUUID().toString().length() + typeEstimate(valueSchema); + case BYTES: + return UUID.randomUUID().toString().length(); + case FIXED: + return localSchema.getFixedSize(); + default: + throw new IllegalArgumentException( + "Cannot handle type: " + localSchema.getType()); + } + } + + protected boolean isOption(Schema schema) { + return schema.getType().equals(Schema.Type.UNION) + && schema.getTypes().size() == 2 + && (schema.getTypes().get(0).getType().equals(Schema.Type.NULL) + || schema.getTypes().get(1).getType().equals(Schema.Type.NULL)); + } + + protected Schema getNonNull(Schema schema) { + List types = schema.getTypes(); + return types.get(0).getType().equals(Schema.Type.NULL) ? 
types.get(1) : types.get(0); + } + +} + diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/GenericRecordPartialPayloadGenerator.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/GenericRecordPartialPayloadGenerator.java new file mode 100644 index 000000000..f7e4174e6 --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/GenericRecordPartialPayloadGenerator.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.generator; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; + +/** + * This is a GenericRecord payload generator that generates partial generic records {@link GenericRecord}. A partial + * records is one that has some fields of the schema NULL or NOT PRESENT. This payload enables us to simulate + * creation of partial records which are possible in many cases, especially for database change logs. 
+ */ +public class GenericRecordPartialPayloadGenerator extends GenericRecordFullPayloadGenerator { + + public GenericRecordPartialPayloadGenerator(Schema schema) { + super(schema); + } + + public GenericRecordPartialPayloadGenerator(Schema schema, int minPayloadSize) { + super(schema, minPayloadSize); + } + + @Override + protected GenericRecord convert(Schema schema) { + GenericRecord record = super.convertPartial(schema); + return record; + } + + private void setNull(GenericRecord record) { + for (Schema.Field field : record.getSchema().getFields()) { + // A random boolean decides whether this field of the generic record should be present or absent. + // Using this we can set only a handful of fields in the record and generate partial records + boolean setNull = random.nextBoolean(); + if (setNull) { // TODO : DO NOT SET THE RECORD KEY FIELDS TO NULL + record.put(field.name(), null); + } else { + if (record.get(field.name()) instanceof GenericData.Record) { + setNull((GenericData.Record) record.get(field.name())); + } + } + } + } + + @Override + public boolean validate(GenericRecord record) { + return validate((Object) record); + } + + // Atleast 1 entry should be null + private boolean validate(Object object) { + if (object == null) { + return true; + } else if (object instanceof GenericRecord) { + for (Schema.Field field : ((GenericRecord) object).getSchema().getFields()) { + boolean ret = validate(((GenericRecord) object).get(field.name())); + if (ret) { + return ret; + } + } + } + return false; + } + +} + diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/LazyRecordGeneratorIterator.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/LazyRecordGeneratorIterator.java new file mode 100644 index 000000000..a775d37c5 --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/LazyRecordGeneratorIterator.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software 
Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.generator; + +import java.util.Iterator; +import org.apache.avro.generic.GenericRecord; +import org.apache.hudi.client.utils.LazyIterableIterator; + +/** + * A lazy record generator to generate {@link GenericRecord}s lazily and not hold a list of records in memory. + */ +public class LazyRecordGeneratorIterator extends + LazyIterableIterator { + + public LazyRecordGeneratorIterator(Iterator inputItr) { + super(inputItr); + } + + @Override + protected void start() { + } + + @Override + protected GenericRecord computeNext() { + return inputItr.next(); + } + + @Override + protected void end() { + + } +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/UpdateGeneratorIterator.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/UpdateGeneratorIterator.java new file mode 100644 index 000000000..685a5c77a --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/UpdateGeneratorIterator.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.generator; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; + +public class UpdateGeneratorIterator implements Iterator { + + // Use the full payload generator as default + private GenericRecordFullPayloadGenerator generator; + private List blackListedFields; + // iterator + private Iterator itr; + + public UpdateGeneratorIterator(Iterator itr, String schemaStr, List partitionPathFieldNames, + List recordKeyFieldNames, int minPayloadSize) { + this.itr = itr; + this.blackListedFields = new ArrayList<>(); + this.blackListedFields.addAll(partitionPathFieldNames); + this.blackListedFields.addAll(recordKeyFieldNames); + Schema schema = new Schema.Parser().parse(schemaStr); + this.generator = new GenericRecordFullPayloadGenerator(schema, minPayloadSize); + } + + @Override + public boolean hasNext() { + return itr.hasNext(); + } + + @Override + public GenericRecord next() { + GenericRecord newRecord = itr.next(); + return this.generator.randomize(newRecord, this.blackListedFields); + } + +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/helpers/DFSTestSuitePathSelector.java 
b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/helpers/DFSTestSuitePathSelector.java new file mode 100644 index 000000000..b67e21fe2 --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/helpers/DFSTestSuitePathSelector.java @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.helpers; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Collectors; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.ImmutablePair; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.utilities.sources.helpers.DFSPathSelector; + +/** + * A custom dfs path selector used only for the hudi test suite. To be used only if workload is not run inline. 
+ */ +public class DFSTestSuitePathSelector extends DFSPathSelector { + + public DFSTestSuitePathSelector(TypedProperties props, Configuration hadoopConf) { + super(props, hadoopConf); + } + + @Override + public Pair, String> getNextFilePathsAndMaxModificationTime( + Option lastCheckpointStr, long sourceLimit) { + + Integer lastBatchId; + Integer nextBatchId; + try { + if (lastCheckpointStr.isPresent()) { + lastBatchId = Integer.parseInt(lastCheckpointStr.get()); + nextBatchId = lastBatchId + 1; + } else { + lastBatchId = -1; + nextBatchId = 0; + } + // obtain all eligible files for the batch + List eligibleFiles = new ArrayList<>(); + FileStatus[] fileStatuses = fs.globStatus( + new Path(props.getString(Config.ROOT_INPUT_PATH_PROP), "*")); + for (FileStatus fileStatus : fileStatuses) { + if (!fileStatus.isDirectory() || IGNORE_FILEPREFIX_LIST.stream() + .anyMatch(pfx -> fileStatus.getPath().getName().startsWith(pfx))) { + continue; + } else if (fileStatus.getPath().getName().compareTo(lastBatchId.toString()) > 0 && fileStatus.getPath() + .getName().compareTo(nextBatchId.toString()) <= 0) { + RemoteIterator files = fs.listFiles(fileStatus.getPath(), true); + while (files.hasNext()) { + eligibleFiles.add(files.next()); + } + } + } + // no data to readAvro + if (eligibleFiles.size() == 0) { + return new ImmutablePair<>(Option.empty(), + lastCheckpointStr.orElseGet(() -> String.valueOf(Long.MIN_VALUE))); + } + // readAvro the files out. 
+ String pathStr = eligibleFiles.stream().map(f -> f.getPath().toString()) + .collect(Collectors.joining(",")); + + return new ImmutablePair<>(Option.ofNullable(pathStr), String.valueOf(nextBatchId)); + } catch (IOException ioe) { + throw new HoodieIOException( + "Unable to readAvro from source from checkpoint: " + lastCheckpointStr, ioe); + } + } + +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/helpers/HiveServiceProvider.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/helpers/HiveServiceProvider.java new file mode 100644 index 000000000..88022ed1c --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/helpers/HiveServiceProvider.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.integ.testsuite.helpers; + +import java.io.IOException; +import java.net.BindException; +import org.apache.hadoop.conf.Configuration; +import org.apache.hive.service.server.HiveServer2; +import org.apache.hudi.hive.testutils.HiveTestService; +import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter; +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.thrift.transport.TTransportException; + +/** + * Hive Service provider. + */ +public class HiveServiceProvider { + + private HiveTestService hiveService; + private HiveServer2 hiveServer; + private Config config; + + private static final Logger LOG = LogManager.getLogger(HiveServiceProvider.class); + + public HiveServiceProvider(Config config) { + this.config = config; + } + + public void startLocalHiveServiceIfNeeded(Configuration configuration) throws IOException { + if (config.isHiveLocal()) { + hiveService = new HiveTestService(configuration); + hiveServer = hiveService.start(); + } + } + + public void syncToLocalHiveIfNeeded(HoodieTestSuiteWriter writer) { + if (this.config.isHiveLocal()) { + writer.getDeltaStreamerWrapper().getDeltaSyncService().getDeltaSync() + .syncHive(getLocalHiveServer().getHiveConf()); + } else { + writer.getDeltaStreamerWrapper().getDeltaSyncService().getDeltaSync().syncHive(); + } + } + + public void stopLocalHiveServiceIfNeeded() throws IOException { + if (config.isHiveLocal()) { + if (hiveService != null) { + hiveService.stop(); + } + } + } + + public HiveServer2 getLocalHiveServer() { + return hiveServer; + } +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/DFSAvroDeltaInputReader.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/DFSAvroDeltaInputReader.java new file mode 100644 index 000000000..f1bb02a69 --- /dev/null +++ 
b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/DFSAvroDeltaInputReader.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.reader; + +import java.io.IOException; +import java.util.Arrays; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.fs.PathFilter; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.integ.testsuite.writer.AvroFileDeltaInputWriter; +import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.sql.SparkSession; + +/** + * A reader of {@link DeltaOutputMode#DFS} and {@link DeltaInputType#AVRO}. 
+ */ +public class DFSAvroDeltaInputReader extends DFSDeltaInputReader { + + private final SparkSession sparkSession; + private final String schemaStr; + private final String basePath; + private final Option structName; + private final Option nameSpace; + protected PathFilter filter = (path) -> { + if (path.toUri().toString().contains(AvroFileDeltaInputWriter.AVRO_EXTENSION)) { + return true; + } else { + return false; + } + }; + + public DFSAvroDeltaInputReader(SparkSession sparkSession, String schemaStr, String basePath, + Option structName, + Option nameSpace) { + this.sparkSession = sparkSession; + this.schemaStr = schemaStr; + this.basePath = basePath; + this.structName = structName; + this.nameSpace = nameSpace; + } + + @Override + public JavaRDD read(long totalRecordsToRead) throws IOException { + return SparkBasedReader.readAvro(sparkSession, schemaStr, getFilePathsToRead(basePath, filter, totalRecordsToRead), + structName, nameSpace); + } + + @Override + public JavaRDD read(int numPartitions, long approxNumRecords) throws IOException { + throw new UnsupportedOperationException("cannot generate updates"); + } + + @Override + public JavaRDD read(int numPartitions, int numFiles, long approxNumRecords) throws IOException { + throw new UnsupportedOperationException("cannot generate updates"); + } + + @Override + public JavaRDD read(int numPartitions, int numFiles, double percentageRecordsPerFile) + throws IOException { + throw new UnsupportedOperationException("cannot generate updates"); + } + + @Override + protected long analyzeSingleFile(String filePath) { + JavaRDD recordsFromOneFile = SparkBasedReader + .readAvro(sparkSession, schemaStr, Arrays.asList(filePath), + structName, nameSpace); + return recordsFromOneFile.count(); + } + +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/DFSDeltaInputReader.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/DFSDeltaInputReader.java new file mode 100644 
index 000000000..da9268161 --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/DFSDeltaInputReader.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.reader; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathFilter; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.util.collection.Pair; + +/** + * This class helps to estimate the number of files to read a given number of total records. 
+ * Use this class for all DFS based implementations of {@link DeltaInputReader} + */ +public abstract class DFSDeltaInputReader implements DeltaInputReader { + + protected List getFilePathsToRead(String basePath, PathFilter filter, long totalRecordsToRead) throws + IOException { + FileSystem fs = FSUtils.getFs(basePath, new Configuration()); + // TODO : Sort list by file size and take the median file status to ensure fair calculation and change to remote + // iterator + List fileStatuses = Arrays.asList(fs.globStatus(new Path(basePath, "*/*"), filter)); + if (fileStatuses.size() > 0) { + FileStatus status = fileStatuses.get(0); + long avgNumRecordsPerFile = analyzeSingleFile(status.getPath().toString()); + long numFilesToMatchExpectedRecords = (long) Math.ceil((double) totalRecordsToRead / (double) + avgNumRecordsPerFile); + long avgSizeOfEachFile = status.getLen(); + long totalSizeToRead = avgSizeOfEachFile * numFilesToMatchExpectedRecords; + // choose N files with that length + Pair fileStatusIndexRange = getFileStatusIndexRange(fileStatuses, avgSizeOfEachFile, + totalSizeToRead); + int startIndex = fileStatusIndexRange.getLeft(); + List filePaths = new ArrayList<>(); + while (startIndex == 0 || startIndex < fileStatusIndexRange.getRight()) { + filePaths.add(fileStatuses.get(startIndex).getPath().toString()); + startIndex++; + } + return filePaths; + } + return Collections.emptyList(); + } + + protected Pair getFileStatusIndexRange(List fileStatuses, long averageFileSize, long + totalSizeToRead) { + long totalSizeOfFilesPresent = 0; + int startOffset = 0; + int endOffset = 0; + for (FileStatus fileStatus : fileStatuses) { + // If current file length is greater than averageFileSize, increment by averageFileSize since our + // totalSizeToRead calculation is based on the averageRecordSize * numRecordsToRead. 
+ if (fileStatus.getLen() > averageFileSize) { + totalSizeOfFilesPresent += averageFileSize; + } else { + totalSizeOfFilesPresent += fileStatus.getLen(); + } + if (totalSizeOfFilesPresent <= totalSizeToRead) { + endOffset++; + continue; + } else { + return Pair.of(startOffset, endOffset); + } + } + return Pair.of(startOffset, endOffset); + } + + /** + * Implementation of {@link DeltaInputReader}s to provide a way to read a single file on DFS and provide an + * average number of records across N files. + */ + protected long analyzeSingleFile(String filePath) { + throw new UnsupportedOperationException("No implementation found"); + } +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/DFSHoodieDatasetInputReader.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/DFSHoodieDatasetInputReader.java new file mode 100644 index 000000000..209aa469c --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/DFSHoodieDatasetInputReader.java @@ -0,0 +1,338 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.integ.testsuite.reader; + +import static java.util.Map.Entry.comparingByValue; +import static java.util.stream.Collectors.toMap; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.NoSuchElementException; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner; +import org.apache.hudi.common.table.view.HoodieTableFileSystemView; +import org.apache.hudi.common.table.view.TableFileSystemView; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ParquetReaderIterator; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.config.HoodieMemoryConfig; +import org.apache.parquet.avro.AvroParquetReader; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import scala.Tuple2; + +/** + * This class helps to generate updates from an already existing hoodie dataset. It supports generating updates in + * across partitions, files and records. 
+ */ +public class DFSHoodieDatasetInputReader extends DFSDeltaInputReader { + + private static Logger log = LoggerFactory.getLogger(DFSHoodieDatasetInputReader.class); + + private transient JavaSparkContext jsc; + private String schemaStr; + private HoodieTableMetaClient metaClient; + + public DFSHoodieDatasetInputReader(JavaSparkContext jsc, String basePath, String schemaStr) { + this.jsc = jsc; + this.schemaStr = schemaStr; + this.metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); + } + + protected List getPartitions(Option partitionsLimit) throws IOException { + List partitionPaths = FSUtils + .getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(), false); + // Sort partition so we can pick last N partitions by default + Collections.sort(partitionPaths); + if (!partitionPaths.isEmpty()) { + ValidationUtils.checkArgument(partitionPaths.size() >= partitionsLimit.get(), + "Cannot generate updates for more partitions " + "than present in the dataset, partitions " + + "requested " + partitionsLimit.get() + ", partitions present " + partitionPaths.size()); + return partitionPaths.subList(0, partitionsLimit.get()); + } + return partitionPaths; + + } + + private JavaPairRDD> getPartitionToFileSlice(HoodieTableMetaClient metaClient, + List partitionPaths) { + TableFileSystemView.SliceView fileSystemView = new HoodieTableFileSystemView(metaClient, + metaClient.getCommitsAndCompactionTimeline().filterCompletedInstants()); + // pass num partitions to another method + JavaPairRDD> partitionToFileSliceList = jsc.parallelize(partitionPaths).mapToPair(p -> { + return new Tuple2<>(p, fileSystemView.getLatestFileSlices(p).iterator()); + }); + return partitionToFileSliceList; + } + + @Override + protected long analyzeSingleFile(String filePath) { + return SparkBasedReader.readParquet(new SparkSession(jsc.sc()), Arrays.asList(filePath), + Option.empty(), Option.empty()).count(); + } + + private JavaRDD fetchAnyRecordsFromDataset(Option 
numRecordsToUpdate) throws IOException { + return fetchRecordsFromDataset(Option.empty(), Option.empty(), numRecordsToUpdate, Option.empty()); + } + + private JavaRDD fetchAnyRecordsFromDataset(Option numRecordsToUpdate, Option + numPartitions) throws IOException { + return fetchRecordsFromDataset(numPartitions, Option.empty(), numRecordsToUpdate, Option.empty()); + } + + private JavaRDD fetchPercentageRecordsFromDataset(Option numPartitions, Option + numFiles, Option percentageRecordsPerFile) throws IOException { + return fetchRecordsFromDataset(numPartitions, numFiles, Option.empty(), percentageRecordsPerFile); + } + + private JavaRDD fetchRecordsFromDataset(Option numPartitions, Option + numFiles, Option numRecordsToUpdate) throws IOException { + return fetchRecordsFromDataset(numPartitions, numFiles, numRecordsToUpdate, Option.empty()); + } + + private JavaRDD fetchRecordsFromDataset(Option numPartitions, Option numFiles, + Option numRecordsToUpdate, Option percentageRecordsPerFile) throws IOException { + log.info("NumPartitions : {}, NumFiles : {}, numRecordsToUpdate : {}, percentageRecordsPerFile : {}", + numPartitions, numFiles, numRecordsToUpdate, percentageRecordsPerFile); + List partitionPaths = getPartitions(numPartitions); + // Read all file slices in the partition + JavaPairRDD> partitionToFileSlice = getPartitionToFileSlice(metaClient, + partitionPaths); + // TODO : read record count from metadata + // Read the records in a single file + long recordsInSingleFile = iteratorSize(readParquetOrLogFiles(getSingleSliceFromRDD(partitionToFileSlice))); + int numFilesToUpdate; + long numRecordsToUpdatePerFile; + if (!numFiles.isPresent() || numFiles.get() == 0) { + // If num files are not passed, find the number of files to update based on total records to update and records + // per file + numFilesToUpdate = (int) (numRecordsToUpdate.get() / recordsInSingleFile); + log.info("Files to update {}", numFilesToUpdate); + numRecordsToUpdatePerFile = 
recordsInSingleFile; + } else { + // If num files is passed, find the number of records per file based on either percentage or total records to + // update and num files passed + numFilesToUpdate = numFiles.get(); + numRecordsToUpdatePerFile = percentageRecordsPerFile.isPresent() ? (long) (recordsInSingleFile + * percentageRecordsPerFile.get()) : numRecordsToUpdate.get() / numFilesToUpdate; + } + // Adjust the number of files to read per partition based on the requested partition & file counts + Map adjustedPartitionToFileIdCountMap = getFilesToReadPerPartition(partitionToFileSlice, + getPartitions(numPartitions).size(), numFilesToUpdate); + JavaRDD updates = projectSchema(generateUpdates(adjustedPartitionToFileIdCountMap, + partitionToFileSlice, numFilesToUpdate, (int) numRecordsToUpdatePerFile)); + if (numRecordsToUpdate.isPresent() && numFiles.isPresent() && numFiles.get() != 0 && numRecordsToUpdate.get() + != numRecordsToUpdatePerFile * numFiles.get()) { + long remainingRecordsToAdd = (numRecordsToUpdate.get() - (numRecordsToUpdatePerFile * numFiles.get())); + updates = updates.union(projectSchema(jsc.parallelize(generateUpdates(adjustedPartitionToFileIdCountMap, + partitionToFileSlice, numFilesToUpdate, (int) remainingRecordsToAdd).take((int) remainingRecordsToAdd)))); + } + log.info("Finished generating updates"); + return updates; + } + + private JavaRDD projectSchema(JavaRDD updates) { + // The records read from the hoodie dataset have the hoodie record fields, rewrite the record to eliminate them + return updates + .map(r -> HoodieAvroUtils.rewriteRecordWithOnlyNewSchemaFields(r, new Schema.Parser().parse(schemaStr))); + } + + private JavaRDD generateUpdates(Map adjustedPartitionToFileIdCountMap, + JavaPairRDD> partitionToFileSlice, int numFiles, int numRecordsToReadPerFile) { + return partitionToFileSlice.map(p -> { + int maxFilesToRead = adjustedPartitionToFileIdCountMap.get(p._1); + return iteratorLimit(p._2, maxFilesToRead); + }).flatMap(p -> 
p).repartition(numFiles).map(fileSlice -> { + if (numRecordsToReadPerFile > 0) { + return iteratorLimit(readParquetOrLogFiles(fileSlice), numRecordsToReadPerFile); + } else { + return readParquetOrLogFiles(fileSlice); + } + }).flatMap(p -> p).map(i -> (GenericRecord) i); + } + + private Map getFilesToReadPerPartition(JavaPairRDD> + partitionToFileSlice, Integer numPartitions, Integer numFiles) { + int numFilesPerPartition = (int) Math.ceil(numFiles / numPartitions); + Map partitionToFileIdCountMap = partitionToFileSlice + .mapToPair(p -> new Tuple2<>(p._1, iteratorSize(p._2))).collectAsMap(); + long totalExistingFilesCount = partitionToFileIdCountMap.values().stream().reduce((a, b) -> a + b).get(); + ValidationUtils.checkArgument(totalExistingFilesCount >= numFiles, "Cannot generate updates " + + "for more files than present in the dataset, file requested " + numFiles + ", files present " + + totalExistingFilesCount); + Map partitionToFileIdCountSortedMap = partitionToFileIdCountMap + .entrySet() + .stream() + .sorted(comparingByValue()) + .collect(toMap(Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e2, + LinkedHashMap::new)); + // Limit files to be read per partition + Map adjustedPartitionToFileIdCountMap = new HashMap<>(); + partitionToFileIdCountSortedMap.entrySet().stream().forEach(e -> { + if (e.getValue() <= numFilesPerPartition) { + adjustedPartitionToFileIdCountMap.put(e.getKey(), e.getValue()); + } else { + adjustedPartitionToFileIdCountMap.put(e.getKey(), numFilesPerPartition); + } + }); + return adjustedPartitionToFileIdCountMap; + } + + private FileSlice getSingleSliceFromRDD(JavaPairRDD> partitionToFileSlice) { + return partitionToFileSlice.map(f -> { + FileSlice slice = f._2.next(); + FileSlice newSlice = new FileSlice(slice.getFileGroupId(), slice.getBaseInstantTime()); + if (slice.getBaseFile().isPresent()) { + newSlice.setBaseFile(slice.getBaseFile().get()); + } else { + slice.getLogFiles().forEach(l -> { + newSlice.addLogFile(l); + }); + } 
+ return newSlice; + }).take(1).get(0); + } + + private Iterator readParquetOrLogFiles(FileSlice fileSlice) throws IOException { + if (fileSlice.getBaseFile().isPresent()) { + Iterator itr = + new ParquetReaderIterator(AvroParquetReader.builder(new + Path(fileSlice.getBaseFile().get().getPath())).withConf(metaClient.getHadoopConf()).build()); + return itr; + } else { + // If there is no data file, fall back to reading log files + HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(metaClient.getFs(), + metaClient.getBasePath(), + fileSlice.getLogFiles().map(l -> l.getPath().getName()).collect(Collectors.toList()), + new Schema.Parser().parse(schemaStr), metaClient.getActiveTimeline().getCommitsTimeline() + .filterCompletedInstants().lastInstant().get().getTimestamp(), + HoodieMemoryConfig.DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES, true, false, + HoodieMemoryConfig.DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE, + HoodieMemoryConfig.DEFAULT_SPILLABLE_MAP_BASE_PATH); + // readAvro log files + Iterable> iterable = () -> scanner.iterator(); + Schema schema = new Schema.Parser().parse(schemaStr); + return StreamSupport.stream(iterable.spliterator(), false) + .map(e -> { + try { + return (IndexedRecord) e.getData().getInsertValue(schema).get(); + } catch (IOException io) { + throw new UncheckedIOException(io); + } + }).iterator(); + } + } + + /** + * Returns the number of elements remaining in {@code iterator}. The iterator + * will be left exhausted: its {@code hasNext()} method will return + * {@code false}. + */ + private static int iteratorSize(Iterator iterator) { + int count = 0; + while (iterator.hasNext()) { + iterator.next(); + count++; + } + return count; + } + + /** + * Creates an iterator returning the first {@code limitSize} elements of the + * given iterator. If the original iterator does not contain that many + * elements, the returned iterator will have the same behavior as the original + * iterator. 
The returned iterator supports {@code remove()} if the original + * iterator does. + * + * @param iterator the iterator to limit + * @param limitSize the maximum number of elements in the returned iterator + * @throws IllegalArgumentException if {@code limitSize} is negative + */ + private static Iterator iteratorLimit( + final Iterator iterator, final int limitSize) { + ValidationUtils.checkArgument(iterator != null, "iterator is null"); + ValidationUtils.checkArgument(limitSize >= 0, "limit is negative"); + return new Iterator() { + private int count; + + @Override + public boolean hasNext() { + return count < limitSize && iterator.hasNext(); + } + + @Override + public T next() { + if (!hasNext()) { + throw new NoSuchElementException(); + } + count++; + return iterator.next(); + } + + @Override + public void remove() { + iterator.remove(); + } + }; + } + + @Override + public JavaRDD read(long numRecords) throws IOException { + return fetchAnyRecordsFromDataset(Option.of(numRecords)); + } + + @Override + public JavaRDD read(int numPartitions, long approxNumRecords) throws IOException { + return fetchAnyRecordsFromDataset(Option.of(approxNumRecords), Option.of(numPartitions)); + } + + @Override + public JavaRDD read(int numPartitions, int numFiles, long numRecords) throws IOException { + return fetchRecordsFromDataset(Option.of(numPartitions), Option.of(numFiles), Option.of(numRecords)); + } + + @Override + public JavaRDD read(int numPartitions, int numFiles, double percentageRecordsPerFile) + throws IOException { + return fetchPercentageRecordsFromDataset(Option.of(numPartitions), Option.of(numFiles), + Option.of(percentageRecordsPerFile)); + } +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/DeltaInputReader.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/DeltaInputReader.java new file mode 100644 index 000000000..3ba404153 --- /dev/null +++ 
b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/DeltaInputReader.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.reader; + +import java.io.IOException; +import java.io.Serializable; +import org.apache.spark.api.java.JavaRDD; + +/** + * Implementations of {@link DeltaInputReader} will read the configured input type and provide an RDD of records to the + * client. + * + * @param Read result data type + */ +public interface DeltaInputReader extends Serializable { + + /** + * Attempts to reads an approximate number of records close to approxNumRecords. + * This highly depends on the number of records already present in the input. + */ + JavaRDD read(long approxNumRecords) throws IOException; + + /** + * @throws IOException Attempts to read approx number of records (exact if equal or more records available) + * across requested number of + * partitions. + */ + JavaRDD read(int numPartitions, long approxNumRecords) throws IOException; + + /** + * @throws IOException Attempts to read approx number of records (exact if equal or more records available) + * across requested number of + * partitions and number of files. + * 1. 
Find numFiles across numPartitions + * 2. numRecordsToReadPerFile = approxNumRecords / numFiles + */ + JavaRDD read(int numPartitions, int numFiles, long approxNumRecords) throws IOException; + + /** + * @throws IOException Attempts to a % of records per file across requested number of partitions and number of files. + * 1. Find numFiles across numPartitions + * 2. numRecordsToReadPerFile = approxNumRecordsPerFile * percentageRecordsPerFile + */ + JavaRDD read(int numPartitions, int numFiles, double percentageRecordsPerFile) throws IOException; + +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/DeltaInputType.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/DeltaInputType.java new file mode 100644 index 000000000..137a503ee --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/DeltaInputType.java @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.reader; + +/** + * Supported delta input data types. 
+ */ +public enum DeltaInputType { + AVRO, PARQUET +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/SparkBasedReader.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/SparkBasedReader.java new file mode 100644 index 000000000..16a5259ee --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/SparkBasedReader.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.reader; + +import java.util.List; +import org.apache.avro.generic.GenericRecord; +import org.apache.hudi.AvroConversionUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.utilities.schema.RowBasedSchemaProvider; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import scala.collection.JavaConverters; + + +/** + * Helper class to read avro and/or parquet files and generate a RDD of {@link GenericRecord}. 
+ */ +public class SparkBasedReader { + + public static final String SPARK_AVRO_FORMAT = "avro"; + public static final String SPARK_PARQUET_FORMAT = "com.databricks.spark.parquet"; + private static final String AVRO_SCHEMA_OPTION_KEY = "avroSchema"; + + // Spark anyways globs the path and gets all the paths in memory so take the List as an argument. + // https://github.com/apache/spark/.../org/apache/spark/sql/execution/datasources/DataSource.scala#L251 + public static JavaRDD readAvro(SparkSession sparkSession, String schemaStr, List listOfPaths, + Option structName, Option nameSpace) { + + Dataset dataSet = sparkSession.read() + .format(SPARK_AVRO_FORMAT) + .option(AVRO_SCHEMA_OPTION_KEY, schemaStr) + .load(JavaConverters.asScalaIteratorConverter(listOfPaths.iterator()).asScala().toSeq()); + + return AvroConversionUtils + .createRdd(dataSet.toDF(), structName.orElse(RowBasedSchemaProvider.HOODIE_RECORD_STRUCT_NAME), + nameSpace.orElse(RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE)) + .toJavaRDD(); + } + + public static JavaRDD readParquet(SparkSession sparkSession, List + listOfPaths, Option structName, Option nameSpace) { + + Dataset dataSet = sparkSession.read() + .parquet((JavaConverters.asScalaIteratorConverter(listOfPaths.iterator()).asScala().toSeq())); + + return AvroConversionUtils + .createRdd(dataSet.toDF(), structName.orElse(RowBasedSchemaProvider.HOODIE_RECORD_STRUCT_NAME), + RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE) + .toJavaRDD(); + } + +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/writer/AvroFileDeltaInputWriter.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/writer/AvroFileDeltaInputWriter.java new file mode 100644 index 000000000..24181527c --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/writer/AvroFileDeltaInputWriter.java @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.writer; + +import java.io.IOException; +import java.io.OutputStream; +import java.util.UUID; +import org.apache.avro.Schema; +import org.apache.avro.file.DataFileWriter; +import org.apache.avro.generic.GenericDatumWriter; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.avro.io.DatumWriter; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.fs.HoodieWrapperFileSystem; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Implementation of {@link DeltaInputWriter} that writes avro records to the result file. 
+ */ +public class AvroFileDeltaInputWriter implements DeltaInputWriter { + + public static final String AVRO_EXTENSION = ".avro"; + private static Logger log = LoggerFactory.getLogger(AvroFileDeltaInputWriter.class); + // The maximum file size for an avro file before being rolled over to a new one + private final Long maxFileSize; + private final Configuration configuration; + private HoodieWrapperFileSystem fs; + // Path of the actual avro file + private Path file; + // Base input path to write avro files under + // TODO : Make this bucketed so don't have a large number of files in a single directory + private String basePath; + private DatumWriter writer; + private DataFileWriter dataFileWriter; + private OutputStream output; + private Schema schema; + private DeltaWriteStats deltaWriteStats; + private long recordsWritten = 0; + + // TODO : Handle failure case which may leave behind tons of small corrupt files + public AvroFileDeltaInputWriter(Configuration configuration, String basePath, String schemaStr, Long maxFileSize) + throws IOException { + this.schema = Schema.parse(schemaStr); + this.maxFileSize = maxFileSize; + this.configuration = configuration; + this.basePath = basePath; + Path path = new Path(basePath, new Path(UUID.randomUUID().toString() + AVRO_EXTENSION)); + this.file = HoodieWrapperFileSystem.convertToHoodiePath(path, configuration); + this.fs = (HoodieWrapperFileSystem) this.file + .getFileSystem(FSUtils.registerFileSystem(path, configuration)); + this.output = this.fs.create(this.file); + this.writer = new GenericDatumWriter(schema); + this.dataFileWriter = new DataFileWriter<>(writer).create(schema, output); + this.deltaWriteStats = new DeltaWriteStats(); + } + + @Override + public void writeData(GenericRecord iData) throws IOException { + this.dataFileWriter.append(iData); + recordsWritten++; + } + + @Override + public boolean canWrite() { + return fs.getBytesWritten(file) < maxFileSize; + } + + @Override + public void close() throws 
IOException { + this.deltaWriteStats.setBytesWritten(this.fs.getBytesWritten(this.file)); + this.deltaWriteStats.setRecordsWritten(this.recordsWritten); + this.deltaWriteStats.setFilePath(this.file.toUri().getPath()); + this.dataFileWriter.close(); + log.info("New Avro File : {}", getPath()); + } + + @Override + public DeltaInputWriter getNewWriter() throws IOException { + AvroFileDeltaInputWriter avroFileDeltaInputWriter = new AvroFileDeltaInputWriter(this.configuration, this.basePath, this + .schema.toString(), this.maxFileSize); + return avroFileDeltaInputWriter; + } + + public FileSystem getFs() { + return fs; + } + + public Path getPath() { + return this.file; + } + + @Override + public DeltaWriteStats getDeltaWriteStats() { + return this.deltaWriteStats; + } +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/writer/DFSDeltaWriterAdapter.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/writer/DFSDeltaWriterAdapter.java new file mode 100644 index 000000000..4c70ac564 --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/writer/DFSDeltaWriterAdapter.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.integ.testsuite.writer; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import org.apache.avro.generic.GenericRecord; + +/** + * {@link org.apache.hadoop.hdfs.DistributedFileSystem} (or {@link org.apache.hadoop.fs.LocalFileSystem}) based delta + * generator. + */ +public class DFSDeltaWriterAdapter implements DeltaWriterAdapter { + + private DeltaInputWriter deltaInputGenerator; + private List metrics = new ArrayList<>(); + + public DFSDeltaWriterAdapter(DeltaInputWriter deltaInputGenerator) { + this.deltaInputGenerator = deltaInputGenerator; + } + + @Override + public List write(Iterator input) throws IOException { + while (input.hasNext()) { + if (this.deltaInputGenerator.canWrite()) { + this.deltaInputGenerator.writeData(input.next()); + } else if (input.hasNext()) { + rollOver(); + } + } + close(); + return this.metrics; + } + + public void rollOver() throws IOException { + close(); + this.deltaInputGenerator = this.deltaInputGenerator.getNewWriter(); + } + + private void close() throws IOException { + this.deltaInputGenerator.close(); + this.metrics.add(this.deltaInputGenerator.getDeltaWriteStats()); + } +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/writer/DFSSparkAvroDeltaWriter.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/writer/DFSSparkAvroDeltaWriter.java new file mode 100644 index 000000000..74cbe5ddc --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/writer/DFSSparkAvroDeltaWriter.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.writer; + +import java.io.IOException; +import org.apache.avro.generic.GenericRecord; +import org.apache.hudi.integ.testsuite.writer.DeltaWriterAdapter.SparkBasedDeltaWriter; +import org.apache.spark.api.java.JavaRDD; + +/** + * NEED TO IMPLEMENT A CUSTOM SPARK PARTITIONER TO ENSURE WE WRITE LARGE ENOUGH AVRO FILES. + */ +public class DFSSparkAvroDeltaWriter implements SparkBasedDeltaWriter> { + + private DeltaInputWriter> deltaInputWriter; + + public DFSSparkAvroDeltaWriter(DeltaInputWriter> deltaInputWriter) { + this.deltaInputWriter = deltaInputWriter; + } + + @Override + public JavaRDD write(JavaRDD input) throws IOException { + this.deltaInputWriter.writeData(input); + return null; + } +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/writer/DeltaInputWriter.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/writer/DeltaInputWriter.java new file mode 100644 index 000000000..da52434ff --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/writer/DeltaInputWriter.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.writer; + +import java.io.Closeable; +import java.io.IOException; + +/** + * Implementations of {@link DeltaInputWriter} will be able to generate data. + * + * @param Data type to be generated. + */ +public interface DeltaInputWriter extends Closeable { + + /** + * Generate any type of data. + */ + void writeData(I iData) throws IOException; + + /** + * Check whether more data can/should be written. + */ + boolean canWrite(); + + /** + * Return the statistics of data written. + */ + DeltaWriteStats getDeltaWriteStats(); + + /** + * Return a new instance of this writer. + * @return + * @throws IOException + */ + DeltaInputWriter getNewWriter() throws IOException; + +} \ No newline at end of file diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/writer/DeltaOutputMode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/writer/DeltaOutputMode.java new file mode 100644 index 000000000..daed0fbc3 --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/writer/DeltaOutputMode.java @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.writer; + +/** + * Supported output destination types for the generated delta workload. + */ +public enum DeltaOutputMode { + KAFKA, DFS +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/writer/DeltaWriteStats.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/writer/DeltaWriteStats.java new file mode 100644 index 000000000..8ccd69f27 --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/writer/DeltaWriteStats.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.integ.testsuite.writer; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; +import org.apache.hudi.common.util.collection.Pair; + +/** + * This class holds the write statistics for {@link DeltaInputWriter}. + */ +public class DeltaWriteStats implements Serializable { + + // The file path (if any) for the data written + private String filePath; + // Number of bytes written before being closed + private long bytesWritten; + // Number of records written before being closed + private long recordsWritten; + + private List> partitionPathRecordKey = new ArrayList<>(); + + public String getFilePath() { + return filePath; + } + + public void setFilePath(String filePath) { + this.filePath = filePath; + } + + public long getBytesWritten() { + return bytesWritten; + } + + public void setBytesWritten(long bytesWritten) { + this.bytesWritten = bytesWritten; + } + + public List> getPartitionPathRecordKey() { + return partitionPathRecordKey; + } + + public void setPartitionPathRecordKey(List> partitionPathRecordKey) { + this.partitionPathRecordKey = partitionPathRecordKey; + } + + public long getRecordsWritten() { + return recordsWritten; + } + + public void setRecordsWritten(long recordsWritten) { + this.recordsWritten = recordsWritten; + } + +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/writer/DeltaWriterAdapter.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/writer/DeltaWriterAdapter.java new file mode 100644 index 000000000..c941458fc --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/writer/DeltaWriterAdapter.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.writer; + +import java.io.IOException; +import java.util.Iterator; +import java.util.List; +import org.apache.spark.api.java.JavaRDD; + +public interface DeltaWriterAdapter { + + List write(Iterator input) throws IOException; + + interface SparkBasedDeltaWriter { + + JavaRDD write(J input) throws IOException; + } +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/writer/DeltaWriterFactory.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/writer/DeltaWriterFactory.java new file mode 100644 index 000000000..b4d9b9f89 --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/writer/DeltaWriterFactory.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.writer; + +import java.io.IOException; +import org.apache.avro.generic.GenericRecord; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.integ.testsuite.configuration.DFSDeltaConfig; +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig; +import org.apache.hudi.integ.testsuite.reader.DeltaInputType; + +/** + * A factory to help instantiate different {@link DeltaWriterAdapter}s depending on the {@link DeltaOutputMode} and + * {@link DeltaInputType}. + */ +public class DeltaWriterFactory { + + private DeltaWriterFactory() { + } + + public static DeltaWriterAdapter getDeltaWriterAdapter(DeltaConfig config, Integer batchId) throws IOException { + switch (config.getDeltaOutputMode()) { + case DFS: + switch (config.getDeltaInputType()) { + case AVRO: + DFSDeltaConfig dfsDeltaConfig = (DFSDeltaConfig) config; + dfsDeltaConfig.setBatchId(batchId); + DeltaInputWriter fileDeltaInputGenerator = new AvroFileDeltaInputWriter( + dfsDeltaConfig.getConfiguration(), + StringUtils + .join(new String[]{dfsDeltaConfig.getDeltaBasePath(), dfsDeltaConfig.getBatchId().toString()}, + "/"), dfsDeltaConfig.getSchemaStr(), dfsDeltaConfig.getMaxFileSize()); + return new DFSDeltaWriterAdapter(fileDeltaInputGenerator); + default: + throw new IllegalArgumentException("Invalid delta input format " + config.getDeltaInputType()); + } + default: + throw new IllegalArgumentException("Invalid delta input type " + config.getDeltaOutputMode()); + } + } +} diff --git 
a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/writer/SparkAvroDeltaInputWriter.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/writer/SparkAvroDeltaInputWriter.java new file mode 100644 index 000000000..920f06ea4 --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/writer/SparkAvroDeltaInputWriter.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.writer; + +import java.io.IOException; +import org.apache.avro.generic.GenericRecord; +import org.apache.hudi.AvroConversionUtils; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.sql.SparkSession; + +/** + * Spark based avro delta input writer. We don't use this yet since we cannot control result file size. 
+ */ +public class SparkAvroDeltaInputWriter implements DeltaInputWriter> { + + private static final String AVRO_FORMAT_PACKAGE = "avro"; + public SparkSession sparkSession; + private String schemaStr; + // TODO : the base path has to be a new path every time for spark avro + private String basePath; + + public SparkAvroDeltaInputWriter(SparkSession sparkSession, String schemaStr, String basePath) { + this.sparkSession = sparkSession; + this.schemaStr = schemaStr; + this.basePath = basePath; + } + + @Override + public void writeData(JavaRDD iData) throws IOException { + AvroConversionUtils.createDataFrame(iData.rdd(), schemaStr, sparkSession).write() + .format(AVRO_FORMAT_PACKAGE).save(basePath); + } + + @Override + public boolean canWrite() { + throw new UnsupportedOperationException("not applicable for spark based writer"); + } + + @Override + public void close() throws IOException { + } + + @Override + public DeltaWriteStats getDeltaWriteStats() { + throw new UnsupportedOperationException("not applicable for spark based writer"); + } + + @Override + public DeltaInputWriter getNewWriter() throws IOException { + throw new UnsupportedOperationException("not applicable for spark based writer"); + } + +} diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/TestDFSHoodieTestSuiteWriterAdapter.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/TestDFSHoodieTestSuiteWriterAdapter.java new file mode 100644 index 000000000..f6d907388 --- /dev/null +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/TestDFSHoodieTestSuiteWriterAdapter.java @@ -0,0 +1,138 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite; + +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.when; + +import java.io.IOException; +import java.util.Iterator; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.integ.testsuite.configuration.DFSDeltaConfig; +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig; +import org.apache.hudi.integ.testsuite.generator.FlexibleSchemaRecordGenerationIterator; +import org.apache.hudi.integ.testsuite.reader.DeltaInputType; +import org.apache.hudi.integ.testsuite.utils.TestUtils; +import org.apache.hudi.integ.testsuite.writer.AvroFileDeltaInputWriter; +import org.apache.hudi.integ.testsuite.writer.DFSDeltaWriterAdapter; +import org.apache.hudi.integ.testsuite.writer.DeltaInputWriter; +import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode; +import org.apache.hudi.integ.testsuite.writer.DeltaWriteStats; +import org.apache.hudi.integ.testsuite.writer.DeltaWriterAdapter; +import org.apache.hudi.integ.testsuite.writer.DeltaWriterFactory; +import org.apache.hudi.utilities.schema.FilebasedSchemaProvider; 
+import org.apache.hudi.utilities.testutils.UtilitiesTestBase; +import org.apache.spark.api.java.JavaRDD; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.Mockito; + +public class TestDFSHoodieTestSuiteWriterAdapter extends UtilitiesTestBase { + + private FilebasedSchemaProvider schemaProvider; + private static final String COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/"; + + @BeforeAll + public static void initClass() throws Exception { + UtilitiesTestBase.initClass(); + } + + @AfterAll + public static void cleanupClass() { + UtilitiesTestBase.cleanupClass(); + } + + @BeforeEach + public void setup() throws Exception { + super.setup(); + schemaProvider = new FilebasedSchemaProvider(Helpers.setupSchemaOnDFSWithAbsoluteScope( + System.getProperty("user.dir") + "/.." + COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH, + "complex-source.avsc"), jsc); + } + + @AfterEach + public void teardown() throws Exception { + super.teardown(); + } + + @Test + public void testDFSOneFileWrite() throws IOException { + + DeltaInputWriter mockFileSinkWriter = Mockito.mock(AvroFileDeltaInputWriter.class); + DeltaWriteStats mockDeltaWriteStats = Mockito.mock(DeltaWriteStats.class); + when(mockFileSinkWriter.getNewWriter()).thenReturn(mockFileSinkWriter); + when(mockFileSinkWriter.canWrite()).thenReturn(true); + when(mockFileSinkWriter.getDeltaWriteStats()).thenReturn(mockDeltaWriteStats); + + DeltaWriterAdapter dfsDeltaWriterAdapter = new DFSDeltaWriterAdapter(mockFileSinkWriter); + + JavaRDD records = TestUtils.makeRDD(jsc, 10); + + dfsDeltaWriterAdapter.write(records.collect().iterator()); + Mockito.verify(mockFileSinkWriter, times(10)).canWrite(); + Mockito.verify(mockFileSinkWriter, times(1)).close(); + } + + @Test + public void testDFSTwoFilesWriteWithRollover() throws 
IOException { + + DeltaInputWriter mockFileSinkWriter = Mockito.mock(AvroFileDeltaInputWriter.class); + DeltaWriteStats mockDeltaWriteStats = Mockito.mock(DeltaWriteStats.class); + when(mockFileSinkWriter.getNewWriter()).thenReturn(mockFileSinkWriter); + when(mockFileSinkWriter.canWrite()).thenReturn(false, true); + when(mockFileSinkWriter.getDeltaWriteStats()).thenReturn(mockDeltaWriteStats); + + DeltaWriterAdapter dfsDeltaWriterAdapter = new DFSDeltaWriterAdapter(mockFileSinkWriter); + + Iterator mockIterator = Mockito.mock(Iterator.class); + when(mockIterator.hasNext()).thenReturn(true, true, true, false); + + dfsDeltaWriterAdapter.write(mockIterator); + Mockito.verify(mockFileSinkWriter, times(2)).canWrite(); + Mockito.verify(mockFileSinkWriter, times(1)).getNewWriter(); + Mockito.verify(mockFileSinkWriter, times(2)).close(); + } + + @Test + public void testDFSWorkloadSinkWithMultipleFilesFunctional() throws IOException { + DeltaConfig dfsSinkConfig = new DFSDeltaConfig(DeltaOutputMode.DFS, DeltaInputType.AVRO, + new SerializableConfiguration(jsc.hadoopConfiguration()), dfsBasePath, dfsBasePath, + schemaProvider.getSourceSchema().toString(), 10240L); + DeltaWriterAdapter dfsDeltaWriterAdapter = DeltaWriterFactory + .getDeltaWriterAdapter(dfsSinkConfig, 1); + FlexibleSchemaRecordGenerationIterator itr = new FlexibleSchemaRecordGenerationIterator(1000, + schemaProvider.getSourceSchema().toString()); + dfsDeltaWriterAdapter.write(itr); + FileSystem fs = FSUtils.getFs(dfsBasePath, jsc.hadoopConfiguration()); + FileStatus[] fileStatuses = fs.listStatus(new Path(dfsBasePath)); + // Since maxFileSize was 10240L and we produced 1K records each close to 1K size, we should produce more than + // 1 file + assertTrue(fileStatuses.length > 0); + } + +} diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/TestFileDeltaInputWriter.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/TestFileDeltaInputWriter.java new file mode 100644 index 
000000000..f5dbd540f --- /dev/null +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/TestFileDeltaInputWriter.java @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.integ.testsuite.reader.SparkBasedReader; +import org.apache.hudi.integ.testsuite.writer.AvroFileDeltaInputWriter; +import org.apache.hudi.integ.testsuite.writer.DeltaInputWriter; +import org.apache.hudi.integ.testsuite.writer.DeltaWriteStats; +import org.apache.hudi.integ.testsuite.generator.GenericRecordFullPayloadGenerator; +import 
org.apache.hudi.integ.testsuite.reader.SparkBasedReader; +import org.apache.hudi.integ.testsuite.writer.AvroFileDeltaInputWriter; +import org.apache.hudi.integ.testsuite.writer.DeltaInputWriter; +import org.apache.hudi.integ.testsuite.writer.DeltaWriteStats; +import org.apache.hudi.utilities.schema.FilebasedSchemaProvider; +import org.apache.hudi.utilities.testutils.UtilitiesTestBase; +import org.apache.spark.api.java.JavaRDD; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +public class TestFileDeltaInputWriter extends UtilitiesTestBase { + + private FilebasedSchemaProvider schemaProvider; + private static final String COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/"; + + @BeforeAll + public static void initClass() throws Exception { + UtilitiesTestBase.initClass(); + } + + @AfterAll + public static void cleanupClass() { + UtilitiesTestBase.cleanupClass(); + } + + @BeforeEach + public void setup() throws Exception { + super.setup(); + schemaProvider = new FilebasedSchemaProvider(Helpers.setupSchemaOnDFSWithAbsoluteScope(System.getProperty("user.dir") + "/.." + + COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH, "complex-source.avsc"), jsc); + } + + @AfterEach + public void teardown() throws Exception { + super.teardown(); + } + + @Test + public void testAvroFileSinkWriter() throws IOException { + // 1. Create a Avro File Sink Writer + DeltaInputWriter fileSinkWriter = + new AvroFileDeltaInputWriter(jsc.hadoopConfiguration(), dfsBasePath + "/input", schemaProvider.getSourceSchema() + .toString(), 1024 * 1024L); + GenericRecordFullPayloadGenerator payloadGenerator = + new GenericRecordFullPayloadGenerator(schemaProvider.getSourceSchema()); + // 2. 
Generate 100 avro payloads and write them to an avro file + IntStream.range(0, 100).forEach(a -> { + try { + fileSinkWriter.writeData(payloadGenerator.getNewPayload()); + } catch (IOException io) { + throw new UncheckedIOException(io); + } + }); + fileSinkWriter.close(); + DeltaWriteStats deltaWriteStats = fileSinkWriter.getDeltaWriteStats(); + FileSystem fs = FSUtils.getFs(dfsBasePath, jsc.hadoopConfiguration()); + FileStatus[] fileStatuses = fs.listStatus(new Path(deltaWriteStats.getFilePath())); + // Atleast 1 file was written + assertEquals(1, fileStatuses.length); + // File length should be greater than 0 + assertTrue(fileStatuses[0].getLen() > 0); + // File length should be the same as the number of bytes written + assertTrue(deltaWriteStats.getBytesWritten() > 0); + List paths = Arrays.asList(fs.globStatus(new Path(dfsBasePath + "/*/*.avro"))) + .stream().map(f -> f.getPath().toString()).collect(Collectors.toList()); + JavaRDD writtenRecords = + SparkBasedReader.readAvro(sparkSession, schemaProvider.getSourceSchema().toString(), paths, Option.empty(), + Option.empty()); + // Number of records written should be 100 + assertEquals(writtenRecords.count(), 100); + // Number of records in file should match with the stats + assertEquals(writtenRecords.count(), deltaWriteStats.getRecordsWritten()); + } + + @Test + public void testAvroFileSinkCreateNewWriter() throws IOException { + // 1. Create a Avro File Sink Writer + DeltaInputWriter fileSinkWriter = + new AvroFileDeltaInputWriter(jsc.hadoopConfiguration(), dfsBasePath, + schemaProvider.getSourceSchema().toString(), + 1024 * 1024L); + GenericRecordFullPayloadGenerator payloadGenerator = + new GenericRecordFullPayloadGenerator(schemaProvider.getSourceSchema()); + // 2. 
Generate 100 avro payloads and write them to an avro file + IntStream.range(0, 100).forEach(a -> { + try { + fileSinkWriter.writeData(payloadGenerator.getNewPayload()); + } catch (IOException io) { + throw new UncheckedIOException(io); + } + }); + fileSinkWriter.close(); + String oldFilePath = fileSinkWriter.getDeltaWriteStats().getFilePath(); + assertFalse(oldFilePath == null); + DeltaInputWriter newFileSinkWriter = fileSinkWriter.getNewWriter(); + newFileSinkWriter.close(); + DeltaWriteStats newStats = newFileSinkWriter.getDeltaWriteStats(); + assertEquals(newStats.getBytesWritten(), 3674); + assertEquals(newStats.getRecordsWritten(), 0); + assertTrue(newStats.getFilePath() != null); + } + +} diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/configuration/TestWorkflowBuilder.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/configuration/TestWorkflowBuilder.java new file mode 100644 index 000000000..0ffe8a9b2 --- /dev/null +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/configuration/TestWorkflowBuilder.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.integ.testsuite.configuration; + +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.hudi.integ.testsuite.dag.nodes.DagNode; +import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode; +import org.apache.hudi.integ.testsuite.dag.nodes.UpsertNode; +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config; +import org.apache.hudi.integ.testsuite.dag.WorkflowDag; +import org.apache.hudi.integ.testsuite.dag.nodes.DagNode; +import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode; +import org.apache.hudi.integ.testsuite.dag.nodes.UpsertNode; +import org.junit.jupiter.api.Test; + +public class TestWorkflowBuilder { + + @Test + public void testWorkloadOperationSequenceBuilder() { + + DagNode root = new InsertNode(DeltaConfig.Config.newBuilder() + .withNumRecordsToInsert(10000) + .withNumInsertPartitions(1) + .withNumTimesToRepeat(2) + .withRecordSize(1000).build()); + + DagNode child1 = new UpsertNode(DeltaConfig.Config.newBuilder() + .withNumRecordsToUpdate(10000) + .withNumInsertPartitions(1) + .withNumTimesToRepeat(2) + .withRecordSize(1000).build()); + + root.addChildNode(child1); + child1.addParentNode(root); + List rootNodes = new ArrayList<>(); + rootNodes.add(root); + WorkflowDag workflowDag = new WorkflowDag(rootNodes); + + assertEquals(workflowDag.getNodeList().size(), 1); + assertEquals(((DagNode) workflowDag.getNodeList().get(0)).getChildNodes().size(), 1); + DagNode dagNode = (DagNode) workflowDag.getNodeList().get(0); + assertTrue(dagNode instanceof InsertNode); + DeltaConfig.Config config = dagNode.getConfig(); + assertEquals(config.getNumInsertPartitions(), 1); + assertEquals(config.getRecordSize(), 1000); + assertEquals(config.getRepeatCount(), 2); + assertEquals(config.getNumRecordsInsert(), 10000); + assertEquals(config.getNumRecordsUpsert(), 0); + dagNode = (DagNode)
((DagNode) workflowDag.getNodeList().get(0)).getChildNodes().get(0); + assertTrue(dagNode instanceof UpsertNode); + config = dagNode.getConfig(); + assertEquals(config.getNumInsertPartitions(), 1); + assertEquals(config.getRecordSize(), 1000); + assertEquals(config.getRepeatCount(), 2); + assertEquals(config.getNumRecordsInsert(), 0); + assertEquals(config.getNumRecordsUpsert(), 10000); + } + +} diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/converter/TestUpdateConverter.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/converter/TestUpdateConverter.java new file mode 100644 index 000000000..98e09915c --- /dev/null +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/converter/TestUpdateConverter.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.integ.testsuite.converter; + +import static junit.framework.TestCase.assertTrue; + +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import org.apache.avro.Schema.Field; +import org.apache.avro.generic.GenericRecord; +import org.apache.hudi.integ.testsuite.utils.TestUtils; +import org.apache.hudi.integ.testsuite.utils.TestUtils; +import org.apache.hudi.utilities.UtilHelpers; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import scala.Tuple2; + +public class TestUpdateConverter { + + private JavaSparkContext jsc; + + @BeforeEach + public void setup() throws Exception { + jsc = UtilHelpers.buildSparkContext(this.getClass().getName() + "-hoodie", "local[1]"); + + } + + @AfterEach + public void teardown() { + jsc.stop(); + } + + @Test + public void testGenerateUpdateRecordsFromInputRecords() throws Exception { + JavaRDD inputRDD = TestUtils.makeRDD(jsc, 10); + String schemaStr = inputRDD.take(1).get(0).getSchema().toString(); + int minPayloadSize = 1000; + // 2. 
DFS converter reads existing records and generates random updates for the same row keys + UpdateConverter updateConverter = new UpdateConverter(schemaStr, minPayloadSize, + Arrays.asList("timestamp"), Arrays.asList("_row_key")); + List insertRowKeys = inputRDD.map(r -> r.get("_row_key").toString()).collect(); + assertTrue(inputRDD.count() == 10); + JavaRDD outputRDD = updateConverter.convert(inputRDD); + List updateRowKeys = outputRDD.map(row -> row.get("_row_key").toString()).collect(); + // The insert row keys should be the same as update row keys + assertTrue(insertRowKeys.containsAll(updateRowKeys)); + Map inputRecords = inputRDD.mapToPair(r -> new Tuple2<>(r.get("_row_key").toString(), r)) + .collectAsMap(); + List updateRecords = outputRDD.collect(); + updateRecords.stream().forEach(updateRecord -> { + GenericRecord inputRecord = inputRecords.get(updateRecord.get("_row_key").toString()); + assertTrue(areRecordsDifferent(inputRecord, updateRecord)); + }); + + } + + /** + * Checks if even a single field in the 2 records is different (except the row key which is the same for an update). + */ + private boolean areRecordsDifferent(GenericRecord in, GenericRecord up) { + for (Field field : in.getSchema().getFields()) { + if ("_row_key".equals(field.name())) { + continue; + } else { + // Just convert all types to string for now since all are primitive + if (!in.get(field.name()).toString().equals(up.get(field.name()).toString())) { + return true; + } + } + } + return false; + } +} diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/dag/ComplexDagGenerator.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/dag/ComplexDagGenerator.java new file mode 100644 index 000000000..653a9e452 --- /dev/null +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/dag/ComplexDagGenerator.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.dag; + +import java.util.ArrayList; +import java.util.List; +import java.util.function.Function; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.integ.testsuite.dag.nodes.DagNode; +import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode; +import org.apache.hudi.integ.testsuite.dag.nodes.UpsertNode; +import org.apache.hudi.integ.testsuite.dag.nodes.ValidateNode; +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config; +import org.apache.hudi.integ.testsuite.dag.nodes.DagNode; +import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode; +import org.apache.hudi.integ.testsuite.dag.nodes.UpsertNode; +import org.apache.hudi.integ.testsuite.dag.nodes.ValidateNode; +import org.apache.spark.api.java.JavaRDD; + +public class ComplexDagGenerator implements WorkflowDagGenerator { + + @Override + public WorkflowDag build() { + DagNode root = new InsertNode(Config.newBuilder() + .withNumRecordsToInsert(1000) + .withNumInsertPartitions(3) + .withRecordSize(1000).build()); + + DagNode child1 = new UpsertNode(Config.newBuilder() + .withNumRecordsToUpdate(999) + .withNumRecordsToInsert(1000) + .withNumUpsertFiles(1) + .withNumUpsertPartitions(1) + .withNumInsertPartitions(1) + 
.withRecordSize(10000).build()); + + Function>>, Boolean> function = (dagNodes) -> { + DagNode> parent1 = dagNodes.get(0); + List statuses = parent1.getResult().collect(); + long totalRecordsTouched = statuses.stream().map(st -> st.getStat().getNumUpdateWrites() + st.getStat() + .getNumInserts()).reduce((a, b) -> a + b).get(); + boolean b1 = totalRecordsTouched == parent1.getConfig().getNumRecordsInsert() + + parent1.getConfig().getNumRecordsUpsert(); + boolean b2 = statuses.size() > parent1.getConfig().getNumUpsertFiles(); + + DagNode> parent2 = parent1.getParentNodes().get(0); + statuses = parent2.getResult().collect(); + totalRecordsTouched = statuses.stream().map(st -> st.getStat().getNumUpdateWrites() + st.getStat() + .getNumInserts()).reduce((a, b) -> a + b).get(); + boolean b3 = totalRecordsTouched == parent2.getConfig().getNumRecordsInsert() + * parent2.getConfig().getNumInsertPartitions() + parent2.getConfig().getNumRecordsUpsert(); + return b1 & b2 & b3; + }; + DagNode child2 = new ValidateNode(Config.newBuilder().build(), function); + + root.addChildNode(child1); + // child1.addParentNode(root); + child1.addChildNode(child2); + // child2.addParentNode(child1); + List rootNodes = new ArrayList<>(); + rootNodes.add(root); + return new WorkflowDag(rootNodes); + } + +} diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/dag/HiveSyncDagGenerator.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/dag/HiveSyncDagGenerator.java new file mode 100644 index 000000000..79eb24688 --- /dev/null +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/dag/HiveSyncDagGenerator.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.dag; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.integ.testsuite.dag.nodes.DagNode; +import org.apache.hudi.integ.testsuite.dag.nodes.HiveSyncNode; +import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode; +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config; +import org.apache.hudi.integ.testsuite.dag.nodes.DagNode; +import org.apache.hudi.integ.testsuite.dag.nodes.HiveQueryNode; +import org.apache.hudi.integ.testsuite.dag.nodes.HiveSyncNode; +import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode; + +public class HiveSyncDagGenerator implements WorkflowDagGenerator { + + @Override + public WorkflowDag build() { + DagNode root = new InsertNode(Config.newBuilder() + .withNumRecordsToInsert(100) + .withNumInsertPartitions(1) + .withNumTimesToRepeat(1) + .withRecordSize(1000).build()); + + DagNode child1 = new HiveSyncNode(Config.newBuilder().withHiveLocal(true).build()); + + root.addChildNode(child1); + + DagNode child2 = new HiveQueryNode(Config.newBuilder().withHiveLocal(true).withHiveQueryAndResults(Arrays + .asList(Pair.of("select " + "count(*) from testdb1.table1 group " + "by rider having count(*) < 1", 0))) + .build()); + child1.addChildNode(child2); + + List rootNodes = new ArrayList<>(); + 
rootNodes.add(root); + return new WorkflowDag(rootNodes); + } + +} diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/dag/HiveSyncDagGeneratorMOR.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/dag/HiveSyncDagGeneratorMOR.java new file mode 100644 index 000000000..1f9f7a474 --- /dev/null +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/dag/HiveSyncDagGeneratorMOR.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.integ.testsuite.dag; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.integ.testsuite.dag.nodes.DagNode; +import org.apache.hudi.integ.testsuite.dag.nodes.HiveSyncNode; +import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode; +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config; +import org.apache.hudi.integ.testsuite.dag.nodes.DagNode; +import org.apache.hudi.integ.testsuite.dag.nodes.HiveQueryNode; +import org.apache.hudi.integ.testsuite.dag.nodes.HiveSyncNode; +import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode; + +public class HiveSyncDagGeneratorMOR implements WorkflowDagGenerator { + + @Override + public WorkflowDag build() { + DagNode root = new InsertNode(Config.newBuilder() + .withNumRecordsToInsert(100) + .withNumInsertPartitions(1) + .withNumTimesToRepeat(1) + .withRecordSize(1000).build()); + + DagNode child1 = new HiveSyncNode(Config.newBuilder().withHiveLocal(true).build()); + + root.addChildNode(child1); + + DagNode child2 = new HiveQueryNode(Config.newBuilder().withHiveLocal(true).withHiveQueryAndResults(Arrays + .asList(Pair.of("select " + "count(*) from testdb1.table1_rt group " + "by rider having count(*) < 1", 0))) + .build()); + child1.addChildNode(child2); + + List rootNodes = new ArrayList<>(); + rootNodes.add(root); + return new WorkflowDag(rootNodes); + } + +} diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/dag/TestDagUtils.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/dag/TestDagUtils.java new file mode 100644 index 000000000..b5a0cd39f --- /dev/null +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/dag/TestDagUtils.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.dag; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.util.ArrayList; +import java.util.List; +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config; +import org.apache.hudi.integ.testsuite.dag.nodes.DagNode; +import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode; +import org.apache.hudi.utilities.testutils.UtilitiesTestBase; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +public class TestDagUtils { + + private static final String COW_DAG_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/complex-dag-cow.yaml"; + + @Test + public void testConvertDagToYaml() throws Exception { + ComplexDagGenerator dag = new ComplexDagGenerator(); + String yaml = DagUtils.convertDagToYaml(dag.build()); + System.out.println(yaml); + } + + @Test + public void testConvertYamlToDag() throws Exception { + WorkflowDag dag = DagUtils.convertYamlToDag(UtilitiesTestBase.Helpers + .readFileFromAbsolutePath((System.getProperty("user.dir") + "/.." 
+ COW_DAG_DOCKER_DEMO_RELATIVE_PATH))); + assertEquals(dag.getNodeList().size(), 1); + Assertions.assertEquals(((DagNode) dag.getNodeList().get(0)).getParentNodes().size(), 0); + assertEquals(((DagNode) dag.getNodeList().get(0)).getChildNodes().size(), 1); + DagNode firstChild = (DagNode) ((DagNode) dag.getNodeList().get(0)).getChildNodes().get(0); + assertEquals(firstChild.getParentNodes().size(), 1); + assertEquals(firstChild.getChildNodes().size(), 1); + assertEquals(((DagNode) firstChild.getChildNodes().get(0)).getChildNodes().size(), 1); + } + + public static class ComplexDagGenerator implements WorkflowDagGenerator { + + @Override + public WorkflowDag build() { + DagNode root = new InsertNode(Config.newBuilder() + .withNumRecordsToInsert(1000000) + .withNumInsertPartitions(1) + .withNumTimesToRepeat(2) + .withRecordSize(1000).build()); + + DagNode child1 = new InsertNode(Config.newBuilder() + .withNumRecordsToInsert(1000000) + .withNumInsertPartitions(1) + .withNumTimesToRepeat(2) + .withRecordSize(1000).build()); + + DagNode child2 = new InsertNode(Config.newBuilder() + .withNumRecordsToInsert(1000000) + .withNumInsertPartitions(1) + .withNumTimesToRepeat(2) + .withRecordSize(1000).build()); + + root.addChildNode(child1); + root.addChildNode(child2); + + DagNode child3 = new InsertNode(Config.newBuilder() + .withNumRecordsToInsert(1000000) + .withNumInsertPartitions(1) + .withNumTimesToRepeat(2) + .withRecordSize(1000).build()); + + child2.addChildNode(child3); + List rootNodes = new ArrayList<>(); + rootNodes.add(root); + + return new WorkflowDag(rootNodes); + } + } + +} diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/generator/TestGenericRecordPayloadEstimator.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/generator/TestGenericRecordPayloadEstimator.java new file mode 100644 index 000000000..85d535840 --- /dev/null +++ 
b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/generator/TestGenericRecordPayloadEstimator.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.generator; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.apache.avro.Schema; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.utilities.testutils.UtilitiesTestBase; +import org.junit.jupiter.api.Test; + +public class TestGenericRecordPayloadEstimator { + + private static final String SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/source.avsc"; + private static final String COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH = + "/docker/demo/config/test-suite/complex-source.avsc"; + + @Test + public void testSimpleSchemaSize() throws Exception { + Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers + .readFileFromAbsolutePath(System.getProperty("user.dir") + "/.."
+ SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH)); + GenericRecordFullPayloadSizeEstimator estimator = + new GenericRecordFullPayloadSizeEstimator(schema); + Pair estimateAndNumComplexFields = estimator.typeEstimateAndNumComplexFields(); + assertEquals(estimateAndNumComplexFields.getRight().intValue(), 0); + assertEquals(estimateAndNumComplexFields.getLeft().intValue(), 156); + } + + @Test + public void testComplexSchemaSize() throws Exception { + Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers.readFileFromAbsolutePath( + System.getProperty("user.dir") + "/.." + COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH)); + GenericRecordFullPayloadSizeEstimator estimator = + new GenericRecordFullPayloadSizeEstimator(schema); + Pair estimateAndNumComplexFields = estimator.typeEstimateAndNumComplexFields(); + assertEquals(estimateAndNumComplexFields.getRight().intValue(), 1); + assertEquals(estimateAndNumComplexFields.getLeft().intValue(), 1278); + } + +} diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/generator/TestGenericRecordPayloadGenerator.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/generator/TestGenericRecordPayloadGenerator.java new file mode 100644 index 000000000..886fb160c --- /dev/null +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/generator/TestGenericRecordPayloadGenerator.java @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.generator; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.stream.IntStream; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.utilities.testutils.UtilitiesTestBase; +import org.junit.jupiter.api.Test; + +public class TestGenericRecordPayloadGenerator { + + private static final String SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/source.avsc"; + private static final String COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH = + "/docker/demo/config/test-suite/complex-source.avsc"; + + @Test + public void testSimplePayload() throws Exception { + Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers + .readFileFromAbsolutePath(System.getProperty("user.dir") + "/.." + SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH)); + GenericRecordFullPayloadGenerator payloadGenerator = new GenericRecordFullPayloadGenerator(schema); + GenericRecord record = payloadGenerator.getNewPayload(); + // The generated payload should validate with the provided schema + payloadGenerator.validate(record); + } + + @Test + public void testComplexPayload() throws IOException { + Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers + .readFileFromAbsolutePath(System.getProperty("user.dir") + "/.." 
+ + COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH)); + GenericRecordFullPayloadGenerator payloadGenerator = new GenericRecordFullPayloadGenerator(schema); + GenericRecord record = payloadGenerator.getNewPayload(); + // The generated payload should validate with the provided schema + assertTrue(payloadGenerator.validate(record)); + } + + @Test + public void testComplexPartialPayload() throws IOException { + Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers + .readFileFromAbsolutePath(System.getProperty("user.dir") + "/.." + + COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH)); + GenericRecordPartialPayloadGenerator payloadGenerator = new GenericRecordPartialPayloadGenerator(schema); + IntStream.range(0, 10).forEach(a -> { + GenericRecord record = payloadGenerator.getNewPayload(); + // The generated payload should validate with the provided schema + assertTrue(payloadGenerator.validate(record)); + }); + } + + @Test + public void testUpdatePayloadGenerator() throws IOException { + Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers + .readFileFromAbsolutePath(System.getProperty("user.dir") + "/.." 
+ SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH)); + GenericRecordFullPayloadGenerator payloadGenerator = new GenericRecordFullPayloadGenerator(schema); + List insertRowKeys = new ArrayList<>(); + List updateRowKeys = new ArrayList<>(); + List insertTimeStamps = new ArrayList<>(); + List updateTimeStamps = new ArrayList<>(); + List records = new ArrayList<>(); + // Generate 10 new records + IntStream.range(0, 10).forEach(a -> { + GenericRecord record = payloadGenerator.getNewPayload(); + records.add(record); + insertRowKeys.add(record.get("_row_key").toString()); + insertTimeStamps.add((Long) record.get("timestamp")); + }); + List blacklistFields = Arrays.asList("_row_key"); + records.stream().forEach(a -> { + // Generate 10 updated records + GenericRecord record = payloadGenerator.getUpdatePayload(a, blacklistFields); + updateRowKeys.add(record.get("_row_key").toString()); + updateTimeStamps.add((Long) record.get("timestamp")); + }); + // The row keys from insert payloads should match all the row keys from the update payloads + assertTrue(insertRowKeys.containsAll(updateRowKeys)); + // The timestamp field for the insert payloads should not all match with the update payloads + assertFalse(insertTimeStamps.containsAll(updateTimeStamps)); + } + + @Test + public void testSimplePayloadWithLargeMinSize() throws Exception { + Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers + .readFileFromAbsolutePath(System.getProperty("user.dir") + "/.." 
+ SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH)); + int minPayloadSize = 1000; + GenericRecordFullPayloadGenerator payloadGenerator = new GenericRecordFullPayloadGenerator(schema, + minPayloadSize); + GenericRecord record = payloadGenerator.getNewPayload(); + // The payload generated is less than minPayloadSize due to no collections present + assertTrue(HoodieAvroUtils.avroToBytes(record).length < minPayloadSize); + } + + @Test + public void testComplexPayloadWithLargeMinSize() throws Exception { + Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers + .readFileFromAbsolutePath(System.getProperty("user.dir") + "/.." + + COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH)); + int minPayloadSize = 10000; + GenericRecordFullPayloadGenerator payloadGenerator = new GenericRecordFullPayloadGenerator( + schema, minPayloadSize); + GenericRecord record = payloadGenerator.getNewPayload(); + // The payload generated should be within 10% extra of the minPayloadSize + assertTrue(HoodieAvroUtils.avroToBytes(record).length < minPayloadSize + 0.1 * minPayloadSize); + } + +} diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/job/TestHoodieTestSuiteJob.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/job/TestHoodieTestSuiteJob.java new file mode 100644 index 000000000..9d0d104b9 --- /dev/null +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/job/TestHoodieTestSuiteJob.java @@ -0,0 +1,224 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.job; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.util.UUID; +import java.util.stream.Stream; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.DataSourceWriteOptions; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.integ.testsuite.HoodieTestSuiteJob; +import org.apache.hudi.integ.testsuite.HoodieTestSuiteJob.HoodieTestSuiteConfig; +import org.apache.hudi.integ.testsuite.dag.ComplexDagGenerator; +import org.apache.hudi.integ.testsuite.dag.HiveSyncDagGenerator; +import org.apache.hudi.integ.testsuite.dag.HiveSyncDagGeneratorMOR; +import org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator; +import org.apache.hudi.integ.testsuite.reader.DeltaInputType; +import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode; +import org.apache.hudi.keygen.TimestampBasedKeyGenerator; +import org.apache.hudi.utilities.schema.FilebasedSchemaProvider; +import org.apache.hudi.utilities.sources.AvroDFSSource; +import org.apache.hudi.utilities.testutils.UtilitiesTestBase; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.params.ParameterizedTest; +import 
org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +public class TestHoodieTestSuiteJob extends UtilitiesTestBase { + + private static final String TEST_NAME_WITH_PARAMS = "[{index}] Test with useDeltaStreamer={0}, tableType={1}"; + private static final String BASE_PROPERTIES_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/base" + + ".properties"; + private static final String SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/source.avsc"; + private static final String TARGET_SCHEMA_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/target.avsc"; + private static final String COW_DAG_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/complex-dag-cow.yaml"; + private static final String MOR_DAG_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/complex-dag-mor.yaml"; + + public static Stream configParams() { + Object[][] data = + new Object[][] {{false, "COPY_ON_WRITE"}}; + return Stream.of(data).map(Arguments::of); + } + + @BeforeAll + public static void initClass() throws Exception { + UtilitiesTestBase.initClass(); + // prepare the configs. + UtilitiesTestBase.Helpers.copyToDFSFromAbsolutePath(System.getProperty("user.dir") + "/.." + + BASE_PROPERTIES_DOCKER_DEMO_RELATIVE_PATH, dfs, dfsBasePath + "/base.properties"); + UtilitiesTestBase.Helpers.copyToDFSFromAbsolutePath(System.getProperty("user.dir") + "/.." + + SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH, dfs, dfsBasePath + "/source.avsc"); + UtilitiesTestBase.Helpers.copyToDFSFromAbsolutePath(System.getProperty("user.dir") + "/.." + + TARGET_SCHEMA_DOCKER_DEMO_RELATIVE_PATH, dfs, dfsBasePath + "/target.avsc"); + + UtilitiesTestBase.Helpers.copyToDFSFromAbsolutePath(System.getProperty("user.dir") + "/.." + + COW_DAG_DOCKER_DEMO_RELATIVE_PATH, dfs, dfsBasePath + "/complex-dag-cow.yaml"); + UtilitiesTestBase.Helpers.copyToDFSFromAbsolutePath(System.getProperty("user.dir") + "/.." 
+ + MOR_DAG_DOCKER_DEMO_RELATIVE_PATH, dfs, dfsBasePath + "/complex-dag-mor.yaml"); + + TypedProperties props = new TypedProperties(); + props.setProperty("hoodie.datasource.write.recordkey.field", "_row_key"); + props.setProperty("hoodie.datasource.write.partitionpath.field", "timestamp"); + props.setProperty("hoodie.deltastreamer.keygen.timebased.timestamp.type", "UNIX_TIMESTAMP"); + props.setProperty("hoodie.deltastreamer.keygen.timebased.output.dateformat", "yyyy/MM/dd"); + props.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/source.avsc"); + props.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/source.avsc"); + props.setProperty("hoodie.deltastreamer.source.dfs.root", dfsBasePath + "/input"); + props.setProperty("hoodie.datasource.hive_sync.assume_date_partitioning", "true"); + props.setProperty("hoodie.datasource.write.keytranslator.class", "org.apache.hudi" + + ".DayBasedPartitionPathKeyTranslator"); + props.setProperty("hoodie.compact.inline.max.delta.commits", "3"); + props.setProperty("hoodie.parquet.max.file.size", "1024000"); + props.setProperty("hoodie.compact.inline.max.delta.commits", "0"); + // Hive Configs + props.setProperty(DataSourceWriteOptions.HIVE_URL_OPT_KEY(), "jdbc:hive2://127.0.0.1:9999/"); + props.setProperty(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY(), "testdb1"); + props.setProperty(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY(), "table1"); + props.setProperty(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "datestr"); + props.setProperty(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY(), TimestampBasedKeyGenerator.class.getName()); + UtilitiesTestBase.Helpers.savePropsToDFS(props, dfs, dfsBasePath + "/test-source" + + ".properties"); + + // Properties used for the delta-streamer which incrementally pulls from upstream DFS Avro source and + // writes to downstream hudi table + TypedProperties downstreamProps = new TypedProperties(); + 
downstreamProps.setProperty("include", "base.properties"); + downstreamProps.setProperty("hoodie.datasource.write.recordkey.field", "_row_key"); + downstreamProps.setProperty("hoodie.datasource.write.partitionpath.field", "timestamp"); + + // Source schema is the target schema of upstream table + downstreamProps.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/source.avsc"); + downstreamProps.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/source.avsc"); + UtilitiesTestBase.Helpers.savePropsToDFS(downstreamProps, dfs, + dfsBasePath + "/test-downstream-source.properties"); + // these tests cause a lot of log verbosity from spark, turning it down + Logger.getLogger("org.apache.spark").setLevel(Level.WARN); + } + + @AfterAll + public static void cleanupClass() { + UtilitiesTestBase.cleanupClass(); + } + + @BeforeEach + public void setup() throws Exception { + super.setup(); + } + + @AfterEach + public void teardown() throws Exception { + super.teardown(); + } + + // Tests in this class add to the test build time significantly. Since this is a Integration Test (end to end), we + // would like to run this as a nightly build which is a TODO. 
+ // TODO : Clean up input / result paths after each test + @MethodSource("configParams") + public void testDagWithInsertUpsertAndValidate(boolean useDeltaStreamer, String tableType) throws Exception { + dfs.delete(new Path(dfsBasePath + "/input"), true); + dfs.delete(new Path(dfsBasePath + "/result"), true); + String inputBasePath = dfsBasePath + "/input/" + UUID.randomUUID().toString(); + String outputBasePath = dfsBasePath + "/result/" + UUID.randomUUID().toString(); + HoodieTestSuiteConfig cfg = makeConfig(inputBasePath, outputBasePath, useDeltaStreamer, tableType); + cfg.workloadDagGenerator = ComplexDagGenerator.class.getName(); + HoodieTestSuiteJob hoodieTestSuiteJob = new HoodieTestSuiteJob(cfg, jsc); + hoodieTestSuiteJob.runTestSuite(); + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(new Configuration(), cfg.targetBasePath); + assertEquals(metaClient.getActiveTimeline().getCommitsTimeline().getInstants().count(), 2); + } + + @MethodSource("configParams") + public void testHiveSync(boolean useDeltaStreamer, String tableType) throws Exception { + dfs.delete(new Path(dfsBasePath + "/input"), true); + dfs.delete(new Path(dfsBasePath + "/result"), true); + String inputBasePath = dfsBasePath + "/input"; + String outputBasePath = dfsBasePath + "/result"; + HoodieTestSuiteConfig cfg = makeConfig(inputBasePath, outputBasePath, useDeltaStreamer, tableType); + if (tableType == HoodieTableType.COPY_ON_WRITE.name()) { + cfg.workloadDagGenerator = HiveSyncDagGenerator.class.getName(); + } else { + cfg.workloadDagGenerator = HiveSyncDagGeneratorMOR.class.getName(); + } + HoodieTestSuiteJob hoodieTestSuiteJob = new HoodieTestSuiteJob(cfg, jsc); + hoodieTestSuiteJob.runTestSuite(); + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(new Configuration(), cfg.targetBasePath); + assertEquals(metaClient.getActiveTimeline().getCommitsTimeline().getInstants().count(), 1); + } + + @MethodSource("configParams") + public void testCOWFullDagFromYaml(boolean 
useDeltaStreamer, String tableType) throws Exception { + dfs.delete(new Path(dfsBasePath + "/input"), true); + dfs.delete(new Path(dfsBasePath + "/result"), true); + String inputBasePath = dfsBasePath + "/input"; + String outputBasePath = dfsBasePath + "/result"; + HoodieTestSuiteConfig cfg = makeConfig(inputBasePath, outputBasePath, useDeltaStreamer, HoodieTableType + .COPY_ON_WRITE.name()); + cfg.workloadYamlPath = dfsBasePath + "/complex-dag-cow.yaml"; + HoodieTestSuiteJob hoodieTestSuiteJob = new HoodieTestSuiteJob(cfg, jsc); + hoodieTestSuiteJob.runTestSuite(); + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(new Configuration(), cfg.targetBasePath); + assertEquals(metaClient.getActiveTimeline().getCommitsTimeline().getInstants().count(), 5); + } + + @MethodSource("configParams") + public void testMORFullDagFromYaml(boolean useDeltaStreamer, String tableType) throws Exception { + dfs.delete(new Path(dfsBasePath + "/input"), true); + dfs.delete(new Path(dfsBasePath + "/result"), true); + String inputBasePath = dfsBasePath + "/input"; + String outputBasePath = dfsBasePath + "/result"; + HoodieTestSuiteConfig cfg = makeConfig(inputBasePath, outputBasePath, useDeltaStreamer, HoodieTableType + .MERGE_ON_READ.name()); + cfg.workloadYamlPath = dfsBasePath + "/complex-dag-mor.yaml"; + HoodieTestSuiteJob hoodieTestSuiteJob = new HoodieTestSuiteJob(cfg, jsc); + hoodieTestSuiteJob.runTestSuite(); + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(new Configuration(), cfg.targetBasePath); + assertEquals(metaClient.getActiveTimeline().getCommitsTimeline().getInstants().count(), 7); + } + + protected HoodieTestSuiteConfig makeConfig(String inputBasePath, String outputBasePath, boolean useDeltaStream, + String tableType) { + HoodieTestSuiteConfig cfg = new HoodieTestSuiteConfig(); + cfg.targetBasePath = outputBasePath; + cfg.inputBasePath = inputBasePath; + cfg.targetTableName = "table1"; + cfg.tableType = tableType; + cfg.sourceClassName = 
AvroDFSSource.class.getName(); + cfg.sourceOrderingField = "timestamp"; + cfg.propsFilePath = dfsBasePath + "/test-source.properties"; + cfg.outputTypeName = DeltaOutputMode.DFS.name(); + cfg.inputFormatName = DeltaInputType.AVRO.name(); + cfg.limitFileSize = 1024 * 1024L; + cfg.sourceLimit = 20000000; + cfg.workloadDagGenerator = WorkflowDagGenerator.class.getName(); + cfg.schemaProviderClassName = FilebasedSchemaProvider.class.getName(); + cfg.useDeltaStreamer = useDeltaStream; + return cfg; + } + +} diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/reader/TestDFSAvroDeltaInputReader.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/reader/TestDFSAvroDeltaInputReader.java new file mode 100644 index 000000000..498cc62c2 --- /dev/null +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/reader/TestDFSAvroDeltaInputReader.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.integ.testsuite.reader; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.IOException; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.integ.testsuite.utils.TestUtils; +import org.apache.hudi.integ.testsuite.utils.TestUtils; +import org.apache.hudi.utilities.testutils.UtilitiesTestBase; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; + +public class TestDFSAvroDeltaInputReader extends UtilitiesTestBase { + + @BeforeAll + public static void initClass() throws Exception { + UtilitiesTestBase.initClass(); + } + + @AfterAll + public static void cleanupClass() { + UtilitiesTestBase.cleanupClass(); + } + + @BeforeEach + public void setup() throws Exception { + super.setup(); + } + + @Test + @Disabled + public void testDFSSinkReader() throws IOException { + FileSystem fs = FSUtils.getFs(dfsBasePath, new Configuration()); + // Create 10 avro files with 10 records each + TestUtils.createAvroFiles(jsc, sparkSession, dfsBasePath, 10, 10); + FileStatus[] statuses = fs.globStatus(new Path(dfsBasePath + "/*/*.avro")); + DFSAvroDeltaInputReader reader = + new DFSAvroDeltaInputReader(sparkSession, TestUtils.getSchema().toString(), dfsBasePath, Option.empty(), + Option.empty()); + assertEquals(reader.analyzeSingleFile(statuses[0].getPath().toString()), 5); + assertEquals(reader.read(100).count(), 100); + assertEquals(reader.read(1000).count(), 100); + assertEquals(reader.read(10).count(), 10); + assertTrue(reader.read(11).count() > 11); + } + +} diff --git 
a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/reader/TestDFSHoodieDatasetInputReader.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/reader/TestDFSHoodieDatasetInputReader.java new file mode 100644 index 000000000..89e586eeb --- /dev/null +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/reader/TestDFSHoodieDatasetInputReader.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.integ.testsuite.reader; + +import static junit.framework.TestCase.assertEquals; +import static junit.framework.TestCase.assertTrue; + +import java.util.HashSet; +import java.util.List; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.client.HoodieWriteClient; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.testutils.HoodieTestUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.utilities.schema.FilebasedSchemaProvider; +import org.apache.hudi.utilities.testutils.UtilitiesTestBase; +import org.apache.spark.api.java.JavaRDD; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +public class TestDFSHoodieDatasetInputReader extends UtilitiesTestBase { + + @BeforeAll + public static void initClass() throws Exception { + UtilitiesTestBase.initClass(); + } + + @AfterAll + public static void cleanupClass() { + UtilitiesTestBase.cleanupClass(); + } + + @BeforeEach + public void setup() throws Exception { + super.setup(); + HoodieTestUtils.init(jsc.hadoopConfiguration(), dfsBasePath); + } + + @AfterEach + public void teardown() throws Exception { + super.teardown(); + } + + @Test + public void testSimpleHoodieDatasetReader() throws Exception { + + HoodieWriteConfig config = makeHoodieClientConfig(); + HoodieWriteClient client = new HoodieWriteClient(jsc, config); + String commitTime = client.startCommit(); + HoodieTestDataGenerator generator = new HoodieTestDataGenerator(); + // Insert 100 records across 3 partitions + List inserts = generator.generateInserts(commitTime, 100); 
+ JavaRDD writeStatuses = client.upsert(jsc.parallelize(inserts), commitTime); + writeStatuses.count(); + + DFSHoodieDatasetInputReader reader = new DFSHoodieDatasetInputReader(jsc, config.getBasePath(), + HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema())).toString()); + // Try to read 100 records for the same partition path and same file ID + JavaRDD records = reader.read(1, 1, 100L); + assertTrue(records.count() <= 100); + assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).collect()).size(), + 1); + assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.FILENAME_METADATA_FIELD)).collect()).size(), + 1); + + // Try to read 100 records for 3 partition paths and 3 different file ids + records = reader.read(3, 3, 100L); + assertTrue(records.count() <= 100); + assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).collect()).size(), + 3); + assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.FILENAME_METADATA_FIELD)).collect()).size(), + 3); + + // Try to read 100 records for 3 partition paths and 50% records from each file + records = reader.read(3, 3, 0.5); + assertTrue(records.count() <= 100); + assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).collect()).size(), + 3); + assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.FILENAME_METADATA_FIELD)).collect()).size(), + 3); + } + + private HoodieWriteConfig makeHoodieClientConfig() throws Exception { + return makeHoodieClientConfigBuilder().build(); + } + + private HoodieWriteConfig.Builder makeHoodieClientConfigBuilder() throws Exception { + // Prepare the AvroParquetIO + return HoodieWriteConfig.newBuilder().withPath(dfsBasePath) + .withParallelism(2, 2) + .withSchema(HoodieTestDataGenerator + .TRIP_EXAMPLE_SCHEMA); + } + +} diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/utils/TestUtils.java 
b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/utils/TestUtils.java new file mode 100644 index 000000000..4699324fd --- /dev/null +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/utils/TestUtils.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.utils; + +import java.util.List; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.hudi.AvroConversionUtils; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.utilities.schema.RowBasedSchemaProvider; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; + +public class TestUtils { + + /** + * Create a RDD of generic records for testing purposes. + */ + public static JavaRDD makeRDD(JavaSparkContext jsc, int numRecords) { + return jsc.parallelize(generateGenericRecords(numRecords)); + } + + /** + * Generate generic records. 
+ */ + public static List generateGenericRecords(int numRecords) { + HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(); + return dataGenerator.generateGenericRecords(numRecords); + } + + public static void createAvroFiles(JavaSparkContext jsc, SparkSession sparkSession, String basePath, int numFiles, + int numRecordsPerFile) { + Schema schema = HoodieTestDataGenerator.AVRO_SCHEMA; + for (int i = 0; i < numFiles; i++) { + JavaRDD rdd = makeRDD(jsc, numRecordsPerFile); + AvroConversionUtils.createDataFrame(rdd.rdd(), schema.toString(), sparkSession).write() + .format("avro").option("recordName", RowBasedSchemaProvider.HOODIE_RECORD_STRUCT_NAME) + .option("recordNamespace", RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE).save(basePath + "/" + i); + } + } + + public static Schema getSchema() { + return HoodieTestDataGenerator.AVRO_SCHEMA; + } + +} diff --git a/hudi-integ-test/src/test/resources/log4j-surefire-quiet.properties b/hudi-integ-test/src/test/resources/log4j-surefire-quiet.properties index b21b5d407..61fbf78d1 100644 --- a/hudi-integ-test/src/test/resources/log4j-surefire-quiet.properties +++ b/hudi-integ-test/src/test/resources/log4j-surefire-quiet.properties @@ -15,8 +15,9 @@ # See the License for the specific language governing permissions and # limitations under the License. ### -log4j.rootLogger=WARN, CONSOLE -log4j.logger.org.apache.hudi=DEBUG +log4j.rootLogger=ERROR, CONSOLE +log4j.logger.org.apache.hudi=ERROR +log4j.category.org.apache.spark=ERROR # CONSOLE is set to be a ConsoleAppender. 
log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender @@ -25,5 +26,5 @@ log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout log4j.appender.CONSOLE.layout.ConversionPattern=[%-5p] %d %c %x - %m%n log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true -log4j.appender.CONSOLE.filter.a.LevelMin=WARN +log4j.appender.CONSOLE.filter.a.LevelMin=ERROR log4j.appender.CONSOLE.filter.a.LevelMax=FATAL diff --git a/hudi-spark/src/main/java/org/apache/hudi/QuickstartUtils.java b/hudi-spark/src/main/java/org/apache/hudi/QuickstartUtils.java index 064e37c49..513bb59ed 100644 --- a/hudi-spark/src/main/java/org/apache/hudi/QuickstartUtils.java +++ b/hudi-spark/src/main/java/org/apache/hudi/QuickstartUtils.java @@ -95,7 +95,7 @@ public class QuickstartUtils { } public static GenericRecord generateGenericRecord(String rowKey, String riderName, String driverName, - double timestamp) { + double timestamp) { GenericRecord rec = new GenericData.Record(avroSchema); rec.put("uuid", rowKey); rec.put("ts", timestamp); @@ -221,4 +221,4 @@ public class QuickstartUtils { demoConfigs.put("hoodie.upsert.shuffle.parallelism", "2"); return demoConfigs; } -} +} \ No newline at end of file diff --git a/hudi-spark/src/main/java/org/apache/hudi/keygen/ComplexKeyGenerator.java b/hudi-spark/src/main/java/org/apache/hudi/keygen/ComplexKeyGenerator.java index b3ab3d065..b6431fe9a 100644 --- a/hudi-spark/src/main/java/org/apache/hudi/keygen/ComplexKeyGenerator.java +++ b/hudi-spark/src/main/java/org/apache/hudi/keygen/ComplexKeyGenerator.java @@ -37,6 +37,7 @@ public class ComplexKeyGenerator extends KeyGenerator { private static final String DEFAULT_PARTITION_PATH = "default"; private static final String DEFAULT_PARTITION_PATH_SEPARATOR = "/"; + public static final String DEFAULT_RECORD_KEY_SEPARATOR = ":"; protected static final String NULL_RECORDKEY_PLACEHOLDER = "__null__"; protected static final String 
EMPTY_RECORDKEY_PLACEHOLDER = "__empty__"; @@ -98,8 +99,16 @@ public class ComplexKeyGenerator extends KeyGenerator { recordKey.deleteCharAt(recordKey.length() - 1); if (keyIsNullEmpty) { throw new HoodieKeyException("recordKey values: \"" + recordKey + "\" for fields: " - + recordKeyFields.toString() + " cannot be entirely null or empty."); + + recordKeyFields.toString() + " cannot be entirely null or empty."); } return recordKey.toString(); } -} + + public List getRecordKeyFields() { + return recordKeyFields; + } + + public List getPartitionPathFields() { + return partitionPathFields; + } +} \ No newline at end of file diff --git a/hudi-spark/src/main/java/org/apache/hudi/keygen/KeyGenerator.java b/hudi-spark/src/main/java/org/apache/hudi/keygen/KeyGenerator.java index 7dde32679..b4d609dcf 100644 --- a/hudi-spark/src/main/java/org/apache/hudi/keygen/KeyGenerator.java +++ b/hudi-spark/src/main/java/org/apache/hudi/keygen/KeyGenerator.java @@ -40,4 +40,4 @@ public abstract class KeyGenerator implements Serializable { * Generate a Hoodie Key out of provided generic record. 
*/ public abstract HoodieKey getKey(GenericRecord record); -} +} \ No newline at end of file diff --git a/hudi-spark/src/main/java/org/apache/hudi/keygen/NonpartitionedKeyGenerator.java b/hudi-spark/src/main/java/org/apache/hudi/keygen/NonpartitionedKeyGenerator.java index e790b4693..bb2642cc6 100644 --- a/hudi-spark/src/main/java/org/apache/hudi/keygen/NonpartitionedKeyGenerator.java +++ b/hudi-spark/src/main/java/org/apache/hudi/keygen/NonpartitionedKeyGenerator.java @@ -48,4 +48,4 @@ public class NonpartitionedKeyGenerator extends KeyGenerator { } return new HoodieKey(recordKey, EMPTY_PARTITION); } -} +} \ No newline at end of file diff --git a/hudi-spark/src/main/java/org/apache/hudi/keygen/SimpleKeyGenerator.java b/hudi-spark/src/main/java/org/apache/hudi/keygen/SimpleKeyGenerator.java index e9b9396b2..a9048600e 100644 --- a/hudi-spark/src/main/java/org/apache/hudi/keygen/SimpleKeyGenerator.java +++ b/hudi-spark/src/main/java/org/apache/hudi/keygen/SimpleKeyGenerator.java @@ -73,4 +73,12 @@ public class SimpleKeyGenerator extends KeyGenerator { } return recordKey; } -} + + public String getRecordKeyField() { + return recordKeyField; + } + + public String getPartitionPathField() { + return partitionPathField; + } +} \ No newline at end of file diff --git a/hudi-spark/src/main/java/org/apache/hudi/keygen/TimestampBasedKeyGenerator.java b/hudi-spark/src/main/java/org/apache/hudi/keygen/TimestampBasedKeyGenerator.java index 7c34ef734..894df433a 100644 --- a/hudi-spark/src/main/java/org/apache/hudi/keygen/TimestampBasedKeyGenerator.java +++ b/hudi-spark/src/main/java/org/apache/hudi/keygen/TimestampBasedKeyGenerator.java @@ -149,7 +149,6 @@ public class TimestampBasedKeyGenerator extends SimpleKeyGenerator { "Unexpected type for partition field: " + partitionVal.getClass().getName()); } DateTime timestamp = new DateTime(timeMs, outputDateTimeZone); - return hiveStylePartitioning ? 
partitionPathField + "=" + timestamp.toString(partitionFormatter) : timestamp.toString(partitionFormatter); } catch (Exception e) { diff --git a/hudi-spark/src/test/java/TestComplexKeyGenerator.java b/hudi-spark/src/test/java/TestComplexKeyGenerator.java new file mode 100644 index 000000000..f0671fa71 --- /dev/null +++ b/hudi-spark/src/test/java/TestComplexKeyGenerator.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import static junit.framework.TestCase.assertEquals; + +import org.apache.avro.generic.GenericRecord; +import org.apache.hudi.DataSourceWriteOptions; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.keygen.ComplexKeyGenerator; +import org.junit.jupiter.api.Test; + +public class TestComplexKeyGenerator { + + @Test + public void testSingleValueKeyGenerator() { + TypedProperties properties = new TypedProperties(); + properties.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key"); + properties.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "timestamp"); + ComplexKeyGenerator compositeKeyGenerator = new ComplexKeyGenerator(properties); + assertEquals(compositeKeyGenerator.getRecordKeyFields().size(), 1); + assertEquals(compositeKeyGenerator.getPartitionPathFields().size(), 1); + HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(); + GenericRecord record = dataGenerator.generateGenericRecords(1).get(0); + String rowKey = record.get("_row_key").toString(); + String partitionPath = record.get("timestamp").toString(); + HoodieKey hoodieKey = compositeKeyGenerator.getKey(record); + assertEquals("_row_key:" + rowKey, hoodieKey.getRecordKey()); + assertEquals(partitionPath, hoodieKey.getPartitionPath()); + } + + @Test + public void testMultipleValueKeyGenerator() { + TypedProperties properties = new TypedProperties(); + properties.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key,timestamp"); + properties.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "rider,driver"); + ComplexKeyGenerator compositeKeyGenerator = new ComplexKeyGenerator(properties); + assertEquals(compositeKeyGenerator.getRecordKeyFields().size(), 2); + assertEquals(compositeKeyGenerator.getPartitionPathFields().size(), 2); + HoodieTestDataGenerator dataGenerator 
= new HoodieTestDataGenerator(); + GenericRecord record = dataGenerator.generateGenericRecords(1).get(0); + String rowKey = + "_row_key" + ComplexKeyGenerator.DEFAULT_RECORD_KEY_SEPARATOR + record.get("_row_key").toString() + "," + + "timestamp" + ComplexKeyGenerator.DEFAULT_RECORD_KEY_SEPARATOR + record.get("timestamp").toString(); + String partitionPath = record.get("rider").toString() + "/" + record.get("driver").toString(); + HoodieKey hoodieKey = compositeKeyGenerator.getKey(record); + assertEquals(rowKey, hoodieKey.getRecordKey()); + assertEquals(partitionPath, hoodieKey.getPartitionPath()); + } + +} \ No newline at end of file diff --git a/hudi-utilities/pom.xml b/hudi-utilities/pom.xml index e320d6747..742f5a13b 100644 --- a/hudi-utilities/pom.xml +++ b/hudi-utilities/pom.xml @@ -131,6 +131,12 @@ log4j + + org.slf4j + slf4j-api + ${slf4j.version} + + com.fasterxml.jackson.module @@ -163,6 +169,10 @@ javax.servlet * + + org.slf4j + slf4j-api + diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/UtilHelpers.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/UtilHelpers.java index 64b0238eb..1dffd07ba 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/UtilHelpers.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/UtilHelpers.java @@ -18,6 +18,7 @@ package org.apache.hudi.utilities; +import org.apache.hadoop.conf.Configuration; import org.apache.hudi.AvroConversionUtils; import org.apache.hudi.client.HoodieWriteClient; import org.apache.hudi.client.WriteStatus; @@ -35,6 +36,7 @@ import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.utilities.checkpointing.InitialCheckPointProvider; import org.apache.hudi.utilities.schema.SchemaProvider; import org.apache.hudi.utilities.sources.Source; +import org.apache.hudi.utilities.sources.helpers.DFSPathSelector; import org.apache.hudi.utilities.transform.ChainedTransformer; import org.apache.hudi.utilities.transform.Transformer; @@ -348,4 +350,15 @@ 
public class UtilHelpers { throw new HoodieException(String.format("%s table does not exists!", table)); } } + + public static DFSPathSelector createSourceSelector(String sourceSelectorClass, TypedProperties props, + Configuration conf) throws IOException { + try { + return (DFSPathSelector) ReflectionUtils.loadClass(sourceSelectorClass, + new Class[]{TypedProperties.class, Configuration.class}, + props, conf); + } catch (Throwable e) { + throw new IOException("Could not load source selector class " + sourceSelectorClass, e); + } + } } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java index 41efc5075..e7226fbe6 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java @@ -40,6 +40,7 @@ import org.apache.hudi.hive.HiveSyncTool; import org.apache.hudi.keygen.KeyGenerator; import org.apache.hudi.utilities.UtilHelpers; import org.apache.hudi.exception.HoodieDeltaStreamerException; +import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer.Config; import org.apache.hudi.utilities.schema.DelegatingSchemaProvider; import org.apache.hudi.utilities.schema.RowBasedSchemaProvider; import org.apache.hudi.utilities.schema.SchemaProvider; @@ -212,8 +213,8 @@ public class DeltaSync implements Serializable { /** * Run one round of delta sync and return new compaction instant if one got scheduled. 
*/ - public Option syncOnce() throws Exception { - Option scheduledCompaction = Option.empty(); + public Pair, JavaRDD> syncOnce() throws Exception { + Pair, JavaRDD> result = null; HoodieDeltaStreamerMetrics metrics = new HoodieDeltaStreamerMetrics(getHoodieClientConfig(schemaProvider)); Timer.Context overallTimerContext = metrics.getOverallTimerContext(); @@ -231,13 +232,13 @@ public class DeltaSync implements Serializable { setupWriteClient(); } - scheduledCompaction = writeToSink(srcRecordsWithCkpt.getRight().getRight(), + result = writeToSink(srcRecordsWithCkpt.getRight().getRight(), srcRecordsWithCkpt.getRight().getLeft(), metrics, overallTimerContext); } // Clear persistent RDDs jssc.getPersistentRDDs().values().forEach(JavaRDD::unpersist); - return scheduledCompaction; + return result; } /** @@ -248,7 +249,7 @@ public class DeltaSync implements Serializable { * of schemaProvider, checkpointStr and hoodieRecord * @throws Exception in case of any Exception */ - private Pair>> readFromSource( + public Pair>> readFromSource( Option commitTimelineOpt) throws Exception { // Retrieve the previous round checkpoints, if any Option resumeCheckpointStr = Option.empty(); @@ -355,8 +356,8 @@ public class DeltaSync implements Serializable { * @param overallTimerContext Timer Context * @return Option Compaction instant if one is scheduled */ - private Option writeToSink(JavaRDD records, String checkpointStr, - HoodieDeltaStreamerMetrics metrics, Timer.Context overallTimerContext) throws Exception { + private Pair, JavaRDD> writeToSink(JavaRDD records, String checkpointStr, + HoodieDeltaStreamerMetrics metrics, Timer.Context overallTimerContext) throws Exception { Option scheduledCompactionInstant = Option.empty(); // filter dupes if needed @@ -413,7 +414,7 @@ public class DeltaSync implements Serializable { if (!isEmpty) { // Sync to hive if enabled Timer.Context hiveSyncContext = metrics.getHiveSyncTimerContext(); - syncHive(); + syncHiveIfNeeded(); hiveSyncTimeMs = 
hiveSyncContext != null ? hiveSyncContext.stop() : 0; } } else { @@ -438,7 +439,7 @@ public class DeltaSync implements Serializable { // Send DeltaStreamer Metrics metrics.updateDeltaStreamerMetrics(overallTimeMs, hiveSyncTimeMs); - return scheduledCompactionInstant; + return Pair.of(scheduledCompactionInstant, writeStatusRDD); } /** @@ -472,15 +473,27 @@ public class DeltaSync implements Serializable { /** * Sync to Hive. */ - private void syncHive() { + public void syncHiveIfNeeded() throws ClassNotFoundException { if (cfg.enableHiveSync) { - HiveSyncConfig hiveSyncConfig = DataSourceUtils.buildHiveSyncConfig(props, cfg.targetBasePath, cfg.baseFileFormat); - LOG.info("Syncing target hoodie table with hive table(" + hiveSyncConfig.tableName + "). Hive metastore URL :" - + hiveSyncConfig.jdbcUrl + ", basePath :" + cfg.targetBasePath); - new HiveSyncTool(hiveSyncConfig, new HiveConf(conf, HiveConf.class), fs).syncHoodieTable(); + syncHive(); } } + public void syncHive() { + HiveSyncConfig hiveSyncConfig = DataSourceUtils.buildHiveSyncConfig(props, cfg.targetBasePath, cfg.baseFileFormat); + LOG.info("Syncing target hoodie table with hive table(" + hiveSyncConfig.tableName + "). Hive metastore URL :" + + hiveSyncConfig.jdbcUrl + ", basePath :" + cfg.targetBasePath); + HiveConf hiveConf = new HiveConf(conf, HiveConf.class); + LOG.info("Hive Conf => " + hiveConf.getAllProperties().toString()); + LOG.info("Hive Sync Conf => " + hiveSyncConfig.toString()); + new HiveSyncTool(hiveSyncConfig, hiveConf, fs).syncHoodieTable(); + } + + public void syncHive(HiveConf conf) { + this.conf = conf; + syncHive(); + } + /** * Note that depending on configs and source-type, schemaProvider could either be eagerly or lazily created. * SchemaProvider creation is a precursor to HoodieWriteClient and AsyncCompactor creation. 
This method takes care of @@ -558,4 +571,20 @@ public class DeltaSync implements Serializable { writeClient = null; } } + + public FileSystem getFs() { + return fs; + } + + public TypedProperties getProps() { + return props; + } + + public Config getCfg() { + return cfg; + } + + public Option getCommitTimelineOpt() { + return commitTimelineOpt; + } } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamer.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamer.java index 729042949..15624554c 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamer.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamer.java @@ -18,8 +18,9 @@ package org.apache.hudi.utilities.deltastreamer; -import org.apache.hudi.client.HoodieWriteClient; import org.apache.hudi.async.AbstractAsyncService; +import org.apache.hudi.client.HoodieWriteClient; +import org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieTableType; @@ -49,6 +50,7 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; +import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.SparkSession; @@ -83,9 +85,9 @@ public class HoodieDeltaStreamer implements Serializable { public static String CHECKPOINT_KEY = "deltastreamer.checkpoint.key"; - private final transient Config cfg; + protected final transient Config cfg; - private transient DeltaSyncService deltaSyncService; + protected transient DeltaSyncService deltaSyncService; public HoodieDeltaStreamer(Config cfg, JavaSparkContext jssc) throws IOException { this(cfg, jssc, FSUtils.getFs(cfg.targetBasePath, jssc.hadoopConfiguration()), @@ 
-435,11 +437,11 @@ public class HoodieDeltaStreamer implements Serializable { while (!isShutdownRequested()) { try { long start = System.currentTimeMillis(); - Option scheduledCompactionInstant = deltaSync.syncOnce(); - if (scheduledCompactionInstant.isPresent()) { - LOG.info("Enqueuing new pending compaction instant (" + scheduledCompactionInstant + ")"); + Pair, JavaRDD> scheduledCompactionInstantAndRDD = deltaSync.syncOnce(); + if (scheduledCompactionInstantAndRDD.getLeft().isPresent()) { + LOG.info("Enqueuing new pending compaction instant (" + scheduledCompactionInstantAndRDD.getLeft() + ")"); asyncCompactService.enqueuePendingCompaction(new HoodieInstant(State.REQUESTED, - HoodieTimeline.COMPACTION_ACTION, scheduledCompactionInstant.get())); + HoodieTimeline.COMPACTION_ACTION, scheduledCompactionInstantAndRDD.getLeft().get())); asyncCompactService.waitTillPendingCompactionsReducesTo(cfg.maxPendingCompactions); } long toSleepMs = cfg.minSyncIntervalSeconds * 1000 - (System.currentTimeMillis() - start); @@ -623,4 +625,8 @@ public class HoodieDeltaStreamer implements Serializable { }, executor)).toArray(CompletableFuture[]::new)), executor); } } + + public DeltaSyncService getDeltaSyncService() { + return deltaSyncService; + } } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamerMetrics.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamerMetrics.java index a054b277a..38ec91e9f 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamerMetrics.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamerMetrics.java @@ -23,15 +23,17 @@ import org.apache.hudi.metrics.Metrics; import com.codahale.metrics.Timer; -public class HoodieDeltaStreamerMetrics { +import java.io.Serializable; + +public class HoodieDeltaStreamerMetrics implements Serializable { private HoodieWriteConfig config; private 
String tableName; public String overallTimerName = null; public String hiveSyncTimerName = null; - private Timer overallTimer = null; - public Timer hiveSyncTimer = null; + private transient Timer overallTimer = null; + public transient Timer hiveSyncTimer = null; public HoodieDeltaStreamerMetrics(HoodieWriteConfig config) { this.config = config; diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/SourceFormatAdapter.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/SourceFormatAdapter.java index 9c0be88dc..7fd23b869 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/SourceFormatAdapter.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/SourceFormatAdapter.java @@ -71,11 +71,11 @@ public final class SourceFormatAdapter { // pass in the schema for the Row-to-Avro conversion // to avoid nullability mismatch between Avro schema and Row schema ? AvroConversionUtils.createRdd( - rdd, r.getSchemaProvider().getSourceSchema(), - HOODIE_RECORD_STRUCT_NAME, HOODIE_RECORD_NAMESPACE).toJavaRDD() + rdd, r.getSchemaProvider().getSourceSchema(), + HOODIE_RECORD_STRUCT_NAME, HOODIE_RECORD_NAMESPACE).toJavaRDD() : AvroConversionUtils.createRdd( rdd, HOODIE_RECORD_STRUCT_NAME, HOODIE_RECORD_NAMESPACE).toJavaRDD() - )) + )) .orElse(null)), r.getCheckpointForNextBatch(), r.getSchemaProvider()); } default: @@ -116,4 +116,4 @@ public final class SourceFormatAdapter { throw new IllegalArgumentException("Unknown source type (" + source.getSourceType() + ")"); } } -} +} \ No newline at end of file diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/AvroDFSSource.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/AvroDFSSource.java index a2f7df90f..b5ce96fa8 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/AvroDFSSource.java +++ 
b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/AvroDFSSource.java @@ -21,6 +21,7 @@ package org.apache.hudi.utilities.sources; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.utilities.UtilHelpers; import org.apache.hudi.utilities.schema.SchemaProvider; import org.apache.hudi.utilities.sources.helpers.DFSPathSelector; @@ -33,6 +34,8 @@ import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.SparkSession; +import java.io.IOException; + /** * DFS Source that reads avro data. */ @@ -41,9 +44,11 @@ public class AvroDFSSource extends AvroSource { private final DFSPathSelector pathSelector; public AvroDFSSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession, - SchemaProvider schemaProvider) { + SchemaProvider schemaProvider) throws IOException { super(props, sparkContext, sparkSession, schemaProvider); - this.pathSelector = new DFSPathSelector(props, sparkContext.hadoopConfiguration()); + this.pathSelector = UtilHelpers + .createSourceSelector(DFSPathSelector.class.getName(), props, sparkContext + .hadoopConfiguration()); } @Override diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/DFSPathSelector.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/DFSPathSelector.java index cb0100a24..59263e40a 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/DFSPathSelector.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/DFSPathSelector.java @@ -32,6 +32,8 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.LocatedFileStatus; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.RemoteIterator; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; import java.io.IOException; 
import java.util.ArrayList; @@ -42,18 +44,20 @@ import java.util.stream.Collectors; public class DFSPathSelector { + protected static volatile Logger log = LogManager.getLogger(DFSPathSelector.class); + /** * Configs supported. */ - static class Config { + public static class Config { - private static final String ROOT_INPUT_PATH_PROP = "hoodie.deltastreamer.source.dfs.root"; + public static final String ROOT_INPUT_PATH_PROP = "hoodie.deltastreamer.source.dfs.root"; } - private static final List IGNORE_FILEPREFIX_LIST = Arrays.asList(".", "_"); + protected static final List IGNORE_FILEPREFIX_LIST = Arrays.asList(".", "_"); - private final transient FileSystem fs; - private final TypedProperties props; + protected final transient FileSystem fs; + protected final TypedProperties props; public DFSPathSelector(TypedProperties props, Configuration hadoopConf) { DataSourceUtils.checkRequiredProperties(props, Arrays.asList(Config.ROOT_INPUT_PATH_PROP)); @@ -66,6 +70,7 @@ public class DFSPathSelector { try { // obtain all eligible files under root folder. + log.info("Root path => " + props.getString(Config.ROOT_INPUT_PATH_PROP) + " source limit => " + sourceLimit); List eligibleFiles = new ArrayList<>(); RemoteIterator fitr = fs.listFiles(new Path(props.getString(Config.ROOT_INPUT_PATH_PROP)), true); @@ -79,7 +84,6 @@ public class DFSPathSelector { } // sort them by modification time. 
eligibleFiles.sort(Comparator.comparingLong(FileStatus::getModificationTime)); - // Filter based on checkpoint & input size, if needed long currentBytes = 0; long maxModificationTime = Long.MIN_VALUE; diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestCsvDFSSource.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestCsvDFSSource.java index 8fe99d86c..7b8eead14 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestCsvDFSSource.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestCsvDFSSource.java @@ -42,7 +42,7 @@ public class TestCsvDFSSource extends AbstractDFSSourceTestBase { this.fileSuffix = ".json"; this.useFlattenedSchema = true; this.schemaProvider = new FilebasedSchemaProvider( - Helpers.setupSchemaOnDFS("source-flattened.avsc"), jsc); + Helpers.setupSchemaOnDFS("delta-streamer-config", "source-flattened.avsc"), jsc); } @Override diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/UtilitiesTestBase.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/UtilitiesTestBase.java index e8a20d81e..4fbbe5b91 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/UtilitiesTestBase.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/UtilitiesTestBase.java @@ -18,6 +18,7 @@ package org.apache.hudi.utilities.testutils; +import java.io.FileInputStream; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieTableType; @@ -189,6 +190,17 @@ public class UtilitiesTestBase { return sb.toString(); } + public static String readFileFromAbsolutePath(String absolutePathForResource) throws IOException { + BufferedReader reader = + new BufferedReader(new InputStreamReader(new FileInputStream(absolutePathForResource))); + StringBuffer sb = new StringBuffer(); + String line; + while ((line = 
reader.readLine()) != null) { + sb.append(line + "\n"); + } + return sb.toString(); + } + public static void copyToDFS(String testResourcePath, FileSystem fs, String targetPath) throws IOException { PrintStream os = new PrintStream(fs.create(new Path(targetPath), true)); os.print(readFile(testResourcePath)); @@ -196,6 +208,14 @@ public class UtilitiesTestBase { os.close(); } + public static void copyToDFSFromAbsolutePath(String absolutePathForResource, FileSystem fs, String targetPath) + throws IOException { + PrintStream os = new PrintStream(fs.create(new Path(targetPath), true)); + os.print(readFileFromAbsolutePath(absolutePathForResource)); + os.flush(); + os.close(); + } + public static void savePropsToDFS(TypedProperties props, FileSystem fs, String targetPath) throws IOException { String[] lines = props.keySet().stream().map(k -> String.format("%s=%s", k, props.get(k))).toArray(String[]::new); saveStringsToDFS(lines, fs, targetPath); @@ -258,11 +278,18 @@ public class UtilitiesTestBase { } public static TypedProperties setupSchemaOnDFS() throws IOException { - return setupSchemaOnDFS("source.avsc"); + return setupSchemaOnDFS("delta-streamer-config", "source.avsc"); } - public static TypedProperties setupSchemaOnDFS(String filename) throws IOException { - UtilitiesTestBase.Helpers.copyToDFS("delta-streamer-config/" + filename, dfs, dfsBasePath + "/" + filename); + public static TypedProperties setupSchemaOnDFS(String scope, String filename) throws IOException { + UtilitiesTestBase.Helpers.copyToDFS(scope + "/" + filename, dfs, dfsBasePath + "/" + filename); + TypedProperties props = new TypedProperties(); + props.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/" + filename); + return props; + } + + public static TypedProperties setupSchemaOnDFSWithAbsoluteScope(String scope, String filename) throws IOException { + UtilitiesTestBase.Helpers.copyToDFSFromAbsolutePath(scope + "/" + filename, dfs, dfsBasePath + "/" + 
filename); TypedProperties props = new TypedProperties(); props.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/" + filename); return props; @@ -278,7 +305,7 @@ public class UtilitiesTestBase { } public static List toGenericRecords(List hoodieRecords) { - List records = new ArrayList(); + List records = new ArrayList<>(); for (HoodieRecord hoodieRecord : hoodieRecords) { records.add(toGenericRecord(hoodieRecord)); } diff --git a/hudi-utilities/src/test/resources/delta-streamer-config/complex-source.avsc b/hudi-utilities/src/test/resources/delta-streamer-config/complex-source.avsc new file mode 100644 index 000000000..56a3bf04b --- /dev/null +++ b/hudi-utilities/src/test/resources/delta-streamer-config/complex-source.avsc @@ -0,0 +1,466 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + { + "name": "COMPLEX", + "fields": [ + { + "default": null, + "type": [ + "null", + { + "items": "string", + "type": "array" + } + ], + "name": "array_of_string_fields1" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field1" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field2" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field3" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field4" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field5" + }, + { + "default": null, + "type": [ + "null", + "boolean" + ], + "name": "boolean_field1" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field6" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field7" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field8" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field9" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field10" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field11" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field12" + }, + { + "default": null, + "type": [ + "null", + "double" + ], + "name": "double_field1" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field13" + }, + { + "default": null, + "type": [ + "null", + "long" + ], + "name": "long_field1" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field14" + }, + { + "default": null, + "type": [ + "null", + "long" + ], + "name": "long_field2" + }, + { + "default": null, + "type": [ + "null", + { + "items": { + "fields": [ + { + "default": null, + "type": [ + "null", + "string" + ], + "name": 
"string_field15" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field16" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field17" + }, + { + "default": null, + "type": [ + "null", + "long" + ], + "name": "long_field3" + }, + { + "default": null, + "type": [ + "null", + "long" + ], + "name": "long_field4" + }, + { + "default": null, + "type": [ + "null", + "double" + ], + "name": "double_field2" + }, + { + "default": null, + "type": [ + "null", + "double" + ], + "name": "double_field3" + } + ], + "type": "record", + "name": "record_field1" + }, + "type": "array" + } + ], + "name": "record_name1" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field18" + }, + { + "default": null, + "type": [ + "null", + "long" + ], + "name": "long_field5" + }, + { + "default": null, + "type": [ + "null", + "double" + ], + "name": "double_field4" + }, + { + "default": null, + "type": [ + "null", + "double" + ], + "name": "double_field5" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field19" + }, + { + "default": null, + "type": [ + "null", + "long" + ], + "name": "long_field6" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field20" + }, + { + "default": null, + "type": [ + "null", + "long" + ], + "name": "long_field7" + }, + { + "default": null, + "type": [ + "null", + "double" + ], + "name": "double_field6" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field21" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field22" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field23" + }, + { + "default": null, + "type": [ + "null", + "long" + ], + "name": "long_field8" + }, + { + "default": null, + "type": [ + "null", + "double" + ], + "name": "double_field7" + }, + { + "default": null, + "type": [ + 
"null", + "string" + ], + "name": "string_field24" + }, + { + "default": null, + "type": [ + "null", + "long" + ], + "name": "long_field10" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field25" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field26" + }, + { + "default": null, + "type": [ + "null", + "long" + ], + "name": "long_field11" + }, + { + "default": null, + "type": [ + "null", + "boolean" + ], + "name": "boolean_field3" + }, + { + "default": null, + "type": [ + "null", + "long" + ], + "name": "long_field12" + }, + { + "default": null, + "type": [ + "null", + "double" + ], + "name": "double_field8" + }, + { + "default": null, + "type": [ + "null", + "long" + ], + "name": "long_field13" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field27" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field28" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field29" + }, + { + "default": null, + "type": [ + "null", + "string" + ], + "name": "string_field30" + } + ], + "type": "record" +} \ No newline at end of file diff --git a/packaging/hudi-integ-test-bundle/pom.xml b/packaging/hudi-integ-test-bundle/pom.xml new file mode 100644 index 000000000..9ce67fd3a --- /dev/null +++ b/packaging/hudi-integ-test-bundle/pom.xml @@ -0,0 +1,565 @@ + + + + + hudi + org.apache.hudi + 0.6.0-SNAPSHOT + ../../pom.xml + + 4.0.0 + hudi-integ-test-bundle + jar + + + true + ${project.parent.basedir} + + + + + + org.jacoco + jacoco-maven-plugin + + + org.apache.maven.plugins + maven-shade-plugin + ${maven-shade-plugin.version} + + + package + + shade + + + true + ${project.build.directory}/dependency-reduced-pom.xml + + + + + + true + + + META-INF/LICENSE + target/classes/META-INF/LICENSE + + + + + commons-codec:commons-codec + commons-dbcp:commons-dbcp + commons-lang:commons-lang + commons-pool:commons-pool + 
org.apache.hudi:hudi-common + org.apache.hudi:hudi-client + org.apache.hudi:hudi-utilities_${scala.binary.version} + org.apache.hudi:hudi-spark_${scala.binary.version} + org.apache.hudi:hudi-hive-sync + org.apache.hudi:hudi-hadoop-mr + org.apache.hudi:hudi-timeline-service + org.apache.hudi:hudi-integ-test + com.beust:jcommander + com.twitter:bijection-avro_${scala.binary.version} + com.twitter:bijection-core_${scala.binary.version} + org.apache.parquet:parquet-avro + com.twitter:parquet-avro + com.twitter.common:objectsize + io.confluent:kafka-avro-serializer + io.confluent:common-config + io.confluent:common-utils + io.confluent:kafka-schema-registry-client + io.dropwizard.metrics:metrics-core + io.dropwizard.metrics:metrics-graphite + org.apache.spark:spark-streaming-kafka-0-10_${scala.binary.version} + org.apache.kafka:kafka_${scala.binary.version} + com.101tec:zkclient + org.apache.kafka:kafka-clients + org.apache.hive:hive-common + org.apache.hive:hive-service + org.apache.hive:hive-metastore + org.apache.hive:hive-jdbc + org.apache.hive:hive-exec + com.esotericsoftware:kryo-shaded + org.objenesis:objenesis + com.esotericsoftware:minlog + com.yammer.metrics:metrics-core + org.apache.thrift:libfb303 + org.apache.thrift:libthrift + com.fasterxml.jackson.dataformat:jackson-dataformat-yaml + + + + + com.beust.jcommander. + org.apache.hudi.com.beust.jcommander. + + + org.apache.commons.dbcp. + org.apache.hudi.org.apache.commons.dbcp. + + + org.apache.commons.lang. + org.apache.hudi.org.apache.commons.lang. + + + org.apache.commons.pool. + org.apache.hudi.org.apache.commons.pool. + + + org.apache.hive.jdbc. + org.apache.hudi.org.apache.hive.jdbc. + + + org.apache.hive.common. + org.apache.hudi.org.apache.hive.common. + + + org.apache.hadoop.hive.common. + org.apache.hudi.org.apache.hadoop.hive.common. + + + org.apache.hadoop.hive.shims. + org.apache.hudi.org.apache.hadoop.hive.shims. + + + org.apache.hadoop.hive.thrift. 
+ org.apache.hudi.org.apache.hadoop.hive.thrift. + + + org.apache.hadoop.hive.serde2. + org.apache.hudi.org.apache.hadoop.hive.serde2. + + + org.apache.hadoop.hive.io. + org.apache.hudi.org.apache.hadoop.hive.io. + + + org.apache.hive.service. + org.apache.hudi.org.apache.hive.service. + + + org.apache.hadoop.hive.service. + org.apache.hudi.org.apache.hadoop_hive.service. + + + org.apache.hive.org.apache.thrift. + org.apache.hudi.org.apache.hive.org.apache.thrift. + + + org.apache.thrift. + org.apache.hudi.org.apache.thrift. + + + com.facebook.fb303. + org.apache.hudi.com.facebook.fb303. + + + org.apache.hive.jdbc. + org.apache.hudi.org.apache.hive.jdbc. + + + com.esotericsoftware.kryo. + org.apache.hudi.com.esotericsoftware.kryo. + + + org.objenesis. + org.apache.hudi.org.objenesis. + + + com.esotericsoftware.minlog. + org.apache.hudi.com.esotericsoftware.minlog. + + + com.codahale.metrics. + org.apache.hudi.com.codahale.metrics. + + + org.apache.commons.codec. + org.apache.hudi.org.apache.commons.codec. + + + com.fasterxml.jackson.dataformat. + org.apache.hudi.com.fasterxml.jackson.dataformat. + + + org.apache.parquet.avro. + org.apache.hudi.org.apache.parquet.avro. 
+ + + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + META-INF/NOTICE* + META-INF/LICENSE* + + + + + + + + + + + + src/main/resources + + + src/test/resources + + + + + + + confluent + https://packages.confluent.io/maven/ + + + + + + io.javalin + javalin + 2.4.0 + + + + io.dropwizard.metrics + metrics-core + + + + com.fasterxml.jackson.module + jackson-module-scala_${scala.binary.version} + + + + com.fasterxml.jackson.dataformat + jackson-dataformat-yaml + 2.7.4 + + + + org.apache.hudi + hudi-common + ${project.version} + + + + org.apache.hudi + hudi-common + ${project.version} + tests + test-jar + test + + + + org.apache.hudi + hudi-hive-sync + ${project.version} + tests + test-jar + + + + org.apache.hudi + hudi-spark_${scala.binary.version} + ${project.version} + + + + org.apache.hadoop + hadoop-hdfs + tests + + + org.apache.hadoop + hadoop-common + tests + + + org.mortbay.jetty + * + + + javax.servlet.jsp + * + + + javax.servlet + * + + + + + + ${hive.groupid} + hive-exec + ${hive.version} + ${hive.exec.classifier} + + + + ${hive.groupid} + hive-jdbc + ${hive.version} + standalone + + + org.slf4j + slf4j-api + + + javax.servlet + servlet-api + + + + + + ${hive.groupid} + hive-jdbc + ${hive.version} + + + org.slf4j + slf4j-api + + + javax.servlet + servlet-api + + + + + + ${hive.groupid} + hive-common + ${hive.version} + compile + + + + org.apache.hudi + hudi-hive-sync + ${project.version} + + + javax.servlet + servlet-api + + + + + + org.apache.hudi + hudi-client + ${project.version} + + + + org.apache.hudi + hudi-utilities_${scala.binary.version} + ${project.version} + + + + org.apache.hudi + hudi-integ-test + ${project.version} + + + + org.apache.hudi + hudi-client + ${project.version} + tests + test-jar + test + + + + commons-codec + commons-codec + 1.4 + + + + commons-dbcp + commons-dbcp + 1.4 + + + + commons-pool + commons-pool + 1.4 + + + + org.apache.httpcomponents + httpcore + + + + log4j + log4j + + + org.slf4j + slf4j-api + 
${slf4j.version} + + + + org.apache.hadoop + hadoop-mapreduce-client-common + + + javax.servlet + servlet-api + + + + + + org.apache.hadoop + hadoop-client + + + + org.apache.spark + spark-core_${scala.binary.version} + + + + org.apache.spark + spark-sql_${scala.binary.version} + + + + com.yammer.metrics + metrics-core + 2.2.0 + + + + org.apache.spark + spark-streaming_${scala.binary.version} + ${spark.version} + provided + + + + org.apache.spark + spark-streaming-kafka-0-10_${scala.binary.version} + ${spark.version} + + + + + org.antlr + stringtemplate + 4.0.2 + + + + com.beust + jcommander + + + + org.mockito + mockito-all + 1.10.19 + test + + + org.apache.avro + avro-mapred + 1.7.7 + + + + + org.apache.parquet + parquet-avro + compile + + + + org.apache.parquet + parquet-hadoop + ${parquet.version} + + + + com.twitter + bijection-avro_${scala.binary.version} + 0.9.3 + + + + io.confluent + kafka-avro-serializer + 3.0.0 + + + + io.confluent + common-config + 3.0.0 + + + + io.confluent + common-utils + 3.0.0 + + + + io.confluent + kafka-schema-registry-client + 3.0.0 + + + + org.apache.thrift + libfb303 + 0.9.3 + + + org.apache.thrift + libthrift + 0.9.3 + + + + + diff --git a/packaging/hudi-integ-test-bundle/src/main/java/org/apache/hudi/testsuite/bundle/Main.java b/packaging/hudi-integ-test-bundle/src/main/java/org/apache/hudi/testsuite/bundle/Main.java new file mode 100644 index 000000000..268d2d06c --- /dev/null +++ b/packaging/hudi-integ-test-bundle/src/main/java/org/apache/hudi/testsuite/bundle/Main.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.testsuite.bundle; + +import org.apache.hudi.common.util.ReflectionUtils; + +/** + * A simple main class to dump all classes loaded in current classpath + * + * This is a workaround for generating sources and javadoc jars for packaging modules. The maven plugins for generating + * javadoc and sources plugins do not generate corresponding jars if there are no source files. + * + * This class does not have anything to do with Hudi but is there to keep mvn javadocs/source plugin happy. + */ +public class Main { + + public static void main(String[] args) { + ReflectionUtils.getTopLevelClassesInClasspath(Main.class).forEach(System.out::println); + } +} diff --git a/pom.xml b/pom.xml index 05907f348..76ae34ab5 100644 --- a/pom.xml +++ b/pom.xml @@ -51,6 +51,7 @@ packaging/hudi-timeline-server-bundle docker/hoodie/hadoop hudi-integ-test + packaging/hudi-integ-test-bundle hudi-examples @@ -89,7 +90,7 @@ 1.7.0-M1 3.3.3 1.2.17 - 1.7.5 + 1.7.15 2.9.9 2.7.3 org.apache.hive @@ -995,6 +996,7 @@ **/*FunctionalTestSuite.java **/IT*.java + **/testsuite/**/Test*.java @@ -1096,6 +1098,7 @@ ${skipITs} + **/testsuite/**/Test*.java **/IT*.java