1
0

[HUDI-394] Provide a basic implementation of test suite

This commit is contained in:
Nishith Agarwal
2019-11-01 12:40:09 -07:00
committed by n3nash
parent d5b593b7d9
commit 2fc2b01d86
102 changed files with 8633 additions and 64 deletions

View File

@@ -27,7 +27,7 @@ jobs:
- name: "Functional tests"
env: MODE=functional HUDI_QUIETER_LOGGING=1
- name: "Integration tests"
env: MODE=integration
env: MODE=integration HUDI_QUIETER_LOGGING=1
install: true
services:
- docker

View File

@@ -0,0 +1,21 @@
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
hoodie.upsert.shuffle.parallelism=2
hoodie.insert.shuffle.parallelism=2
hoodie.bulkinsert.shuffle.parallelism=2
hoodie.datasource.write.partitionpath.field=timestamp

View File

@@ -0,0 +1,97 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
first_insert:
config:
record_size: 70000
num_insert_partitions: 1
repeat_count: 5
num_records_insert: 1000
type: InsertNode
deps: none
second_insert:
config:
record_size: 70000
num_insert_partitions: 1
repeat_count: 5
num_records_insert: 10000
deps: first_insert
type: InsertNode
third_insert:
config:
record_size: 70000
num_insert_partitions: 1
repeat_count: 2
num_records_insert: 300
deps: second_insert
type: InsertNode
first_rollback:
config:
deps: third_insert
type: RollbackNode
first_upsert:
config:
record_size: 70000
num_insert_partitions: 1
num_records_insert: 300
repeat_count: 5
num_records_upsert: 100
num_upsert_partitions: 10
type: UpsertNode
deps: first_rollback
first_hive_sync:
config:
queue_name: "adhoc"
engine: "mr"
type: HiveSyncNode
deps: first_upsert
first_hive_query:
config:
hive_props:
prop1: "set hive.execution.engine=spark"
prop2: "set spark.yarn.queue="
prop3: "set hive.strict.checks.large.query=false"
prop4: "set hive.stats.autogather=false"
hive_queries:
query1: "select count(*) from testdb1.table1 group by `_row_key` having count(*) > 1"
result1: 0
query2: "select count(*) from testdb1.table1"
result2: 22100000
type: HiveQueryNode
deps: first_hive_sync
second_upsert:
config:
record_size: 70000
num_insert_partitions: 1
num_records_insert: 300
repeat_count: 5
num_records_upsert: 100
num_upsert_partitions: 10
type: UpsertNode
deps: first_hive_query
second_hive_query:
config:
hive_props:
prop1: "set hive.execution.engine=mr"
prop2: "set mapred.job.queue.name="
prop3: "set hive.strict.checks.large.query=false"
prop4: "set hive.stats.autogather=false"
hive_queries:
query1: "select count(*) from testdb1.table1 group by `_row_key` having count(*) > 1"
result1: 0
query2: "select count(*) from testdb1.table1"
result2: 22100
type: HiveQueryNode
deps: second_upsert

View File

@@ -0,0 +1,119 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
first_insert:
config:
record_size: 70000
num_insert_partitions: 1
repeat_count: 5
num_records_insert: 100
type: InsertNode
deps: none
second_insert:
config:
record_size: 70000
num_insert_partitions: 1
repeat_count: 5
num_records_insert: 100
deps: first_insert
type: InsertNode
third_insert:
config:
record_size: 70000
num_insert_partitions: 1
repeat_count: 2
num_records_insert: 300
deps: second_insert
type: InsertNode
first_rollback:
config:
deps: third_insert
type: RollbackNode
first_upsert:
config:
record_size: 70000
num_insert_partitions: 1
num_records_insert: 300
repeat_count: 5
num_records_upsert: 100
num_upsert_partitions: 10
type: UpsertNode
deps: first_rollback
first_hive_sync:
config:
queue_name: "adhoc"
engine: "mr"
type: HiveSyncNode
deps: first_upsert
first_hive_query:
config:
queue_name: "adhoc"
engine: "mr"
type: HiveQueryNode
deps: first_hive_sync
second_upsert:
config:
record_size: 70000
num_insert_partitions: 1
num_records_insert: 300
repeat_count: 5
num_records_upsert: 100
num_upsert_partitions: 10
type: UpsertNode
deps: first_hive_query
second_hive_query:
config:
queue_name: "adhoc"
engine: "mr"
hive_queries:
query1: "select count(*) from testdb.table1 group by `_row_key` having count(*) > 1"
result1: 0
query2: "select count(*) from testdb.table1"
result2: 3100
query3: "select count(*) from testdb.table1_rt group by `_row_key` having count(*) > 1"
result3: 0
query4: "select count(*) from testdb.table1_rt"
result4: 3100
type: HiveQueryNode
deps: second_upsert
first_schedule_compact:
config:
type: ScheduleCompactNode
deps: second_hive_query
third_upsert:
config:
record_size: 70000
num_insert_partitions: 1
num_records_insert: 300
repeat_count: 5
num_records_upsert: 100
num_upsert_partitions: 10
type: UpsertNode
deps: first_schedule_compact
first_compact:
config:
type: CompactNode
deps: first_schedule_compact
third_hive_query:
config:
queue_name: "adhoc"
engine: "mr"
hive_queries:
query1: "select count(*) from testdb.table1 group by `_row_key` having count(*) > 1"
result1: 0
query2: "select count(*) from testdb.table1"
result2: 2210
type: HiveQueryNode
deps: second_upsert

View File

@@ -0,0 +1,466 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
{
"name": "COMPLEX",
"fields": [
{
"default": null,
"type": [
"null",
{
"items": "string",
"type": "array"
}
],
"name": "array_of_string_fields1"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field1"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field2"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field3"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field4"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field5"
},
{
"default": null,
"type": [
"null",
"boolean"
],
"name": "boolean_field1"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field6"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field7"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field8"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field9"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field10"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field11"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field12"
},
{
"default": null,
"type": [
"null",
"double"
],
"name": "double_field1"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field13"
},
{
"default": null,
"type": [
"null",
"long"
],
"name": "long_field1"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field14"
},
{
"default": null,
"type": [
"null",
"long"
],
"name": "long_field2"
},
{
"default": null,
"type": [
"null",
{
"items": {
"fields": [
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field15"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field16"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field17"
},
{
"default": null,
"type": [
"null",
"long"
],
"name": "long_field3"
},
{
"default": null,
"type": [
"null",
"long"
],
"name": "long_field4"
},
{
"default": null,
"type": [
"null",
"double"
],
"name": "double_field2"
},
{
"default": null,
"type": [
"null",
"double"
],
"name": "double_field3"
}
],
"type": "record",
"name": "record_field1"
},
"type": "array"
}
],
"name": "record_name1"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field18"
},
{
"default": null,
"type": [
"null",
"long"
],
"name": "long_field5"
},
{
"default": null,
"type": [
"null",
"double"
],
"name": "double_field4"
},
{
"default": null,
"type": [
"null",
"double"
],
"name": "double_field5"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field19"
},
{
"default": null,
"type": [
"null",
"long"
],
"name": "long_field6"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field20"
},
{
"default": null,
"type": [
"null",
"long"
],
"name": "long_field7"
},
{
"default": null,
"type": [
"null",
"double"
],
"name": "double_field6"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field21"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field22"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field23"
},
{
"default": null,
"type": [
"null",
"long"
],
"name": "long_field8"
},
{
"default": null,
"type": [
"null",
"double"
],
"name": "double_field7"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field24"
},
{
"default": null,
"type": [
"null",
"long"
],
"name": "long_field10"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field25"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field26"
},
{
"default": null,
"type": [
"null",
"long"
],
"name": "long_field11"
},
{
"default": null,
"type": [
"null",
"boolean"
],
"name": "boolean_field3"
},
{
"default": null,
"type": [
"null",
"long"
],
"name": "long_field12"
},
{
"default": null,
"type": [
"null",
"double"
],
"name": "double_field8"
},
{
"default": null,
"type": [
"null",
"long"
],
"name": "long_field13"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field27"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field28"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field29"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field30"
}
],
"type": "record"
}

View File

@@ -0,0 +1,51 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
{
"type" : "record",
"name" : "triprec",
"fields" : [
{
"name" : "timestamp",
"type" : "long"
}, {
"name" : "_row_key",
"type" : "string"
}, {
"name" : "rider",
"type" : "string"
}, {
"name" : "driver",
"type" : "string"
}, {
"name" : "begin_lat",
"type" : "double"
}, {
"name" : "begin_lon",
"type" : "double"
}, {
"name" : "end_lat",
"type" : "double"
}, {
"name" : "end_lon",
"type" : "double"
}, {
"name" : "fare",
"type" : "double"
} ]
}

View File

@@ -0,0 +1,54 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
{
"type" : "record",
"name" : "triprec",
"fields" : [
{
"name" : "timestamp",
"type" : "double"
}, {
"name" : "_row_key",
"type" : "string"
}, {
"name" : "rider",
"type" : "string"
}, {
"name" : "driver",
"type" : "string"
}, {
"name" : "begin_lat",
"type" : "double"
}, {
"name" : "begin_lon",
"type" : "double"
}, {
"name" : "end_lat",
"type" : "double"
}, {
"name" : "end_lon",
"type" : "double"
}, {
"name" : "fare",
"type" : "double"
}, {
"name" : "haversine_distance",
"type" : "double"
}]
}

View File

@@ -0,0 +1,27 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
hoodie.datasource.write.recordkey.field=_row_key
hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-bench/input
hoodie.datasource.write.keygenerator.class=org.apache.hudi.ComplexKeyGenerator
hoodie.datasource.write.partitionpath.field=timestamp
hoodie.deltastreamer.schemaprovider.source.schema.file=/var/hoodie/ws/docker/demo/config/bench/source.avsc
hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/
hoodie.datasource.hive_sync.database=testdb
hoodie.datasource.hive_sync.table=test_table
hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.NonPartitionedExtractor
hoodie.datasource.hive_sync.assume_date_partitioning=true
hoodie.datasource.write.keytranslator.class=org.apache.hudi.DayBasedPartitionPathKeyTranslator
hoodie.deltastreamer.schemaprovider.target.schema.file=/var/hoodie/ws/docker/demo/config/bench/source.avsc

View File

@@ -55,6 +55,7 @@ ENV HUDI_HADOOP_BUNDLE=/var/hoodie/ws/docker/hoodie/hadoop/hive_base/target/hood
ENV HUDI_HIVE_SYNC_BUNDLE=/var/hoodie/ws/docker/hoodie/hadoop/hive_base/target/hoodie-hive-sync-bundle.jar
ENV HUDI_SPARK_BUNDLE=/var/hoodie/ws/docker/hoodie/hadoop/hive_base/target/hoodie-spark-bundle.jar
ENV HUDI_UTILITIES_BUNDLE=/var/hoodie/ws/docker/hoodie/hadoop/hive_base/target/hoodie-utilities.jar
ENV HUDI_INTEG_TEST_BUNDLE=/var/hoodie/ws/docker/hoodie/hadoop/hive_base/target/hoodie-integ-test-bundle.jar
COPY startup.sh /usr/local/bin/
RUN chmod +x /usr/local/bin/startup.sh

View File

@@ -59,9 +59,7 @@
<copy file="${project.basedir}/../../../../packaging/hudi-hadoop-mr-bundle/target/hudi-hadoop-mr-bundle-${project.version}.jar" tofile="target/hoodie-hadoop-mr-bundle.jar" />
<copy file="${project.basedir}/../../../../packaging/hudi-hive-sync-bundle/target/hudi-hive-sync-bundle-${project.version}.jar" tofile="target/hoodie-hive-sync-bundle.jar" />
<copy file="${project.basedir}/../../../../packaging/hudi-spark-bundle/target/hudi-spark-bundle_${scala.binary.version}-${project.version}.jar" tofile="target/hoodie-spark-bundle.jar" />
<copy
file="${project.basedir}/../../../../packaging/hudi-utilities-bundle/target/hudi-utilities-bundle_${scala.binary.version}-${project.version}.jar"
tofile="target/hoodie-utilities.jar"/>
<copy file="${project.basedir}/../../../../packaging/hudi-utilities-bundle/target/hudi-utilities-bundle_${scala.binary.version}-${project.version}.jar" tofile="target/hoodie-utilities.jar"/>
</tasks>
</configuration>
<goals>

View File

@@ -68,7 +68,6 @@ import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
/**
* Class to be used in tests to keep generating test inserts and updates against a corpus.
* <p>
@@ -98,7 +97,8 @@ public class HoodieTestDataGenerator {
+ "{\"name\": \"amount\",\"type\": \"double\"},{\"name\": \"currency\", \"type\": \"string\"}]}},";
public static final String FARE_FLATTENED_SCHEMA = "{\"name\": \"fare\", \"type\": \"double\"},"
+ "{\"name\": \"currency\", \"type\": \"string\"},";
public static final String TIP_NESTED_SCHEMA = "{\"name\": \"tip_history\", \"type\": {\"type\": \"array\", \"items\": {\"type\": \"record\", \"name\": \"tip_history\", \"fields\": ["
public static final String TIP_NESTED_SCHEMA = "{\"name\": \"tip_history\", \"default\": null, \"type\": {\"type\": "
+ "\"array\", \"items\": {\"type\": \"record\", \"default\": null, \"name\": \"tip_history\", \"fields\": ["
+ "{\"name\": \"amount\", \"type\": \"double\"}, {\"name\": \"currency\", \"type\": \"string\"}]}}},";
public static final String MAP_TYPE_SCHEMA = "{\"name\": \"city_to_state\", \"type\": {\"type\": \"map\", \"values\": \"string\"}},";
public static final String EXTRA_TYPE_SCHEMA = "{\"name\": \"distance_in_meters\", \"type\": \"int\"},"
@@ -254,7 +254,6 @@ public class HoodieTestDataGenerator {
rec.put("begin_lon", RAND.nextDouble());
rec.put("end_lat", RAND.nextDouble());
rec.put("end_lon", RAND.nextDouble());
if (isFlattened) {
rec.put("fare", RAND.nextDouble() * 100);
rec.put("currency", "USD");
@@ -730,7 +729,7 @@ public class HoodieTestDataGenerator {
public boolean deleteExistingKeyIfPresent(HoodieKey key) {
Map<Integer, KeyPartition> existingKeys = existingKeysBySchema.get(TRIP_EXAMPLE_SCHEMA);
Integer numExistingKeys = numKeysBySchema.get(TRIP_EXAMPLE_SCHEMA);
for (Map.Entry<Integer, KeyPartition> entry: existingKeys.entrySet()) {
for (Map.Entry<Integer, KeyPartition> entry : existingKeys.entrySet()) {
if (entry.getValue().key.equals(key)) {
int index = entry.getKey();
existingKeys.put(index, existingKeys.get(numExistingKeys - 1));
@@ -740,10 +739,18 @@ public class HoodieTestDataGenerator {
return true;
}
}
return false;
}
public List<GenericRecord> generateGenericRecords(int numRecords) {
List<GenericRecord> list = new ArrayList<>();
IntStream.range(0, numRecords).forEach(i -> {
list.add(generateGenericRecord(UUID.randomUUID().toString(), UUID.randomUUID().toString(), UUID.randomUUID()
.toString(), RAND.nextDouble()));
});
return list;
}
public String[] getPartitionPaths() {
return partitionPaths;
}

View File

@@ -112,6 +112,15 @@ public class HiveSyncTool {
+ " of type " + hoodieHiveClient.getTableType());
// Check if the necessary table exists
boolean tableExists = hoodieHiveClient.doesTableExist(tableName);
// check if the database exists else create it
try {
hoodieHiveClient.updateHiveSQL("create database if not exists " + cfg.databaseName);
} catch (Exception e) {
// this is harmless since table creation will fail anyways, creation of DB is needed for in-memory testing
LOG.warn("Unable to create database", e);
}
// Get the parquet schema for this table looking at the latest commit
MessageType schema = hoodieHiveClient.getDataSchema();
// Sync schema if needed

View File

@@ -572,4 +572,8 @@ public class HoodieHiveClient {
return new PartitionEvent(PartitionEventType.UPDATE, storagePartition);
}
}
public IMetaStoreClient getClient() {
return client;
}
}

View File

@@ -126,10 +126,21 @@ public class HiveTestService {
public void stop() {
resetSystemProperties();
if (tServer != null) {
tServer.stop();
try {
tServer.stop();
} catch (Exception e) {
LOG.error("Stop meta store failed", e);
}
}
if (hiveServer != null) {
hiveServer.stop();
try {
hiveServer.stop();
} catch (Exception e) {
LOG.error("Stop hive server failed", e);
}
}
if (executorService != null) {
executorService.shutdownNow();
}
LOG.info("Hive Minicluster service shut down.");
tServer = null;

262
hudi-integ-test/README.md Normal file
View File

@@ -0,0 +1,262 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
This page describes in detail how to run end to end tests on a hudi dataset that helps in improving our confidence
in a release as well as perform large scale performance benchmarks.
# Objectives
1. Test with different versions of core libraries and components such as `hdfs`, `parquet`, `spark`,
`hive` and `avro`.
2. Generate different types of workloads across different dimensions such as `payload size`, `number of updates`,
`number of inserts`, `number of partitions`
3. Perform multiple types of operations such as `insert`, `bulk_insert`, `upsert`, `compact`, `query`
4. Support custom post process actions and validations
# High Level Design
The Hudi test suite runs as a long running spark job. The suite is divided into the following high level components :
## Workload Generation
This component does the work of generating the workload; `inserts`, `upserts` etc.
## Workload Scheduling
Depending on the type of workload generated, data is either ingested into the target hudi
dataset or the corresponding workload operation is executed. For example compaction does not necessarily need a workload
to be generated/ingested but can require an execution.
## Other actions/operatons
The test suite supports different types of operations besides ingestion such as Hive Query execution, Clean action etc.
# Usage instructions
## Entry class to the test suite
```
org.apache.hudi.testsuite.HoodieTestSuiteJob.java - Entry Point of the hudi test suite job. This
class wraps all the functionalities required to run a configurable integration suite.
```
## Configurations required to run the job
```
org.apache.hudi.testsuite.HoodieTestSuiteJob.HoodieTestSuiteConfig - Config class that drives the behavior of the
integration test suite. This class extends from com.uber.hoodie.utilities.DeltaStreamerConfig. Look at
link#HudiDeltaStreamer page to learn about all the available configs applicable to your test suite.
```
## Generating a custom Workload Pattern
There are 2 ways to generate a workload pattern
1.Programatically
Choose to write up the entire DAG of operations programatically, take a look at `WorkflowDagGenerator` class.
Once you're ready with the DAG you want to execute, simply pass the class name as follows:
```
spark-submit
...
...
--class org.apache.hudi.testsuite.HoodieTestSuiteJob
--workload-generator-classname org.apache.hudi.testsuite.dag.scheduler.<your_workflowdaggenerator>
...
```
2.YAML file
Choose to write up the entire DAG of operations in YAML, take a look at `complex-workload-dag-cow.yaml` or
`complex-workload-dag-mor.yaml`.
Once you're ready with the DAG you want to execute, simply pass the yaml file path as follows:
```
spark-submit
...
...
--class org.apache.hudi.testsuite.HoodieTestSuiteJob
--workload-yaml-path /path/to/your-workflow-dag.yaml
...
```
## Building the test suite
The test suite can be found in the `hudi-integ-test` module. Use the `prepare_integration_suite.sh` script to
build
the test suite, you can provide different parameters to the script.
```
shell$ ./prepare_integration_suite.sh --help
Usage: prepare_integration_suite.sh
--spark-command, prints the spark command
-h, hdfs-version
-s, spark version
-p, parquet version
-a, avro version
-s, hive version
```
```
shell$ ./prepare_integration_suite.sh
....
....
Final command : mvn clean install -DskipTests
```
## Running on the cluster or in your local machine
Copy over the necessary files and jars that are required to your cluster and then run the following spark-submit
command after replacing the correct values for the parameters.
NOTE : The properties-file should have all the necessary information required to ingest into a Hudi dataset. For more
information on what properties need to be set, take a look at the test suite section under demo steps.
```
shell$ ./prepare_integration_suite.sh --spark-command
spark-submit --packages com.databricks:spark-avro_2.11:4.0.0 --master prepare_integration_suite.sh --deploy-mode
--properties-file --class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob target/hudi-integ-test-0.6
.0-SNAPSHOT.jar --source-class --source-ordering-field --input-base-path --target-base-path --target-table --props --storage-type --payload-class --workload-yaml-path --input-file-size --<deltastreamer-ingest>
```
## Running through a test-case (local)
Take a look at the `TestHoodieTestSuiteJob` to check how you can run the entire suite using JUnit.
## Running an end to end test suite in Local Docker environment
```
docker exec -it adhoc-2 /bin/bash
# COPY_ON_WRITE tables
=========================
## Run the following command to start the test suite
spark-submit \
--packages com.databricks:spark-avro_2.11:4.0.0 \
--conf spark.task.cpus=1 \
--conf spark.executor.cores=1 \
--conf spark.task.maxFailures=100 \
--conf spark.memory.fraction=0.4 \
--conf spark.rdd.compress=true \
--conf spark.kryoserializer.buffer.max=2000m \
--conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
--conf spark.memory.storageFraction=0.1 \
--conf spark.shuffle.service.enabled=true \
--conf spark.sql.hive.convertMetastoreParquet=false \
--conf spark.ui.port=5555 \
--conf spark.driver.maxResultSize=12g \
--conf spark.executor.heartbeatInterval=120s \
--conf spark.network.timeout=600s \
--conf spark.eventLog.overwrite=true \
--conf spark.eventLog.enabled=true \
--conf spark.yarn.max.executor.failures=10 \
--conf spark.sql.catalogImplementation=hive \
--conf spark.sql.shuffle.partitions=1000 \
--class org.apache.hudi.testsuite.HoodieTestSuiteJob $HUDI_TEST_SUITE_BUNDLE \
--source-ordering-field timestamp \
--target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \
--input-base-path /user/hive/warehouse/hudi-integ-test-suite/input \
--target-table test_table \
--props /var/hoodie/ws/docker/demo/config/testsuite/test-source.properties \
--schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
--source-limit 300000 \
--source-class org.apache.hudi.utilities.sources.AvroDFSSource \
--input-file-size 125829120 \
--workload-yaml-path /var/hoodie/ws/docker/demo/config/testsuite/complex-workflow-dag-cow.yaml \
--storage-type COPY_ON_WRITE \
--compact-scheduling-minshare 1 \
--hoodie-conf "hoodie.deltastreamer.source.test.num_partitions=100" \
--hoodie-conf "hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false" \
--hoodie-conf "hoodie.deltastreamer.source.test.max_unique_records=100000000" \
--hoodie-conf "hoodie.embed.timeline.server=false" \
--hoodie-conf "hoodie.datasource.write.recordkey.field=_row_key" \
--hoodie-conf "hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input" \
--hoodie-conf "hoodie.datasource.write.keygenerator.class=org.apache.hudi.ComplexKeyGenerator" \
--hoodie-conf "hoodie.datasource.write.partitionpath.field=timestamp" \
--hoodie-conf "hoodie.deltastreamer.schemaprovider.source.schema.file=/var/hoodie/ws/docker/demo/config/testsuite/source.avsc" \
--hoodie-conf "hoodie.datasource.hive_sync.assume_date_partitioning=false" \
--hoodie-conf "hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/" \
--hoodie-conf "hoodie.datasource.hive_sync.database=testdb" \
--hoodie-conf "hoodie.datasource.hive_sync.table=test_table" \
--hoodie-conf "hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.NonPartitionedExtractor" \
--hoodie-conf "hoodie.datasource.hive_sync.assume_date_partitioning=true" \
--hoodie-conf "hoodie.datasource.write.keytranslator.class=org.apache.hudi.DayBasedPartitionPathKeyTranslator" \
--hoodie-conf "hoodie.deltastreamer.schemaprovider.target.schema.file=/var/hoodie/ws/docker/demo/config/testsuite/source.avsc"
...
...
2019-11-03 05:44:47 INFO DagScheduler:69 - ----------- Finished workloads ----------
2019-11-03 05:44:47 INFO HoodieTestSuiteJob:138 - Finished scheduling all tasks
...
2019-11-03 05:44:48 INFO SparkContext:54 - Successfully stopped SparkContext
# MERGE_ON_READ tables
=========================
## Run the following command to start the test suite
spark-submit \
--packages com.databricks:spark-avro_2.11:4.0.0 \
--conf spark.task.cpus=1 \
--conf spark.executor.cores=1 \
--conf spark.task.maxFailures=100 \
--conf spark.memory.fraction=0.4 \
--conf spark.rdd.compress=true \
--conf spark.kryoserializer.buffer.max=2000m \
--conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
--conf spark.memory.storageFraction=0.1 \
--conf spark.shuffle.service.enabled=true \
--conf spark.sql.hive.convertMetastoreParquet=false \
--conf spark.ui.port=5555 \
--conf spark.driver.maxResultSize=12g \
--conf spark.executor.heartbeatInterval=120s \
--conf spark.network.timeout=600s \
--conf spark.eventLog.overwrite=true \
--conf spark.eventLog.enabled=true \
--conf spark.yarn.max.executor.failures=10 \
--conf spark.sql.catalogImplementation=hive \
--conf spark.sql.shuffle.partitions=1000 \
--class org.apache.hudi.testsuite.HoodieTestSuiteJob $HUDI_TEST_SUITE_BUNDLE \
--source-ordering-field timestamp \
--target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \
--input-base-path /user/hive/warehouse/hudi-integ-test-suite/input \
--target-table test_table \
--props /var/hoodie/ws/docker/demo/config/testsuite/test-source.properties \
--schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
--source-limit 300000 \
--source-class org.apache.hudi.utilities.sources.AvroDFSSource \
--input-file-size 125829120 \
--workload-yaml-path /var/hoodie/ws/docker/demo/config/testsuite/complex-workflow-dag-mor.yaml \
--storage-type MERGE_ON_READ \
--compact-scheduling-minshare 1 \
--hoodie-conf "hoodie.deltastreamer.source.test.num_partitions=100" \
--hoodie-conf "hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false" \
--hoodie-conf "hoodie.deltastreamer.source.test.max_unique_records=100000000" \
--hoodie-conf "hoodie.embed.timeline.server=false" \
--hoodie-conf "hoodie.datasource.write.recordkey.field=_row_key" \
--hoodie-conf "hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input" \
--hoodie-conf "hoodie.datasource.write.keygenerator.class=org.apache.hudi.ComplexKeyGenerator" \
--hoodie-conf "hoodie.datasource.write.partitionpath.field=timestamp" \
--hoodie-conf "hoodie.deltastreamer.schemaprovider.source.schema.file=/var/hoodie/ws/docker/demo/config/testsuite/source.avsc" \
--hoodie-conf "hoodie.datasource.hive_sync.assume_date_partitioning=false" \
--hoodie-conf "hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/" \
--hoodie-conf "hoodie.datasource.hive_sync.database=testdb" \
--hoodie-conf "hoodie.datasource.hive_sync.table=test_table" \
--hoodie-conf "hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.NonPartitionedExtractor" \
--hoodie-conf "hoodie.datasource.hive_sync.assume_date_partitioning=true" \
--hoodie-conf "hoodie.datasource.write.keytranslator.class=org.apache.hudi.DayBasedPartitionPathKeyTranslator" \
--hoodie-conf "hoodie.deltastreamer.schemaprovider.target.schema.file=/var/hoodie/ws/docker/demo/config/testsuite/source.avsc"
...
...
2019-11-03 05:44:47 INFO DagScheduler:69 - ----------- Finished workloads ----------
2019-11-03 05:44:47 INFO HoodieTestSuiteJob:138 - Finished scheduling all tasks
...
2019-11-03 05:44:48 INFO SparkContext:54 - Successfully stopped SparkContext
```

View File

@@ -51,12 +51,69 @@
<scope>test</scope>
</dependency>
<!-- Jetty -->
<dependency>
<!-- Needs to be at the top to ensure we get the correct dependency versions for jetty-server -->
<groupId>org.eclipse.jetty.aggregate</groupId>
<artifactId>jetty-all</artifactId>
<version>${jetty.version}</version>
<classifier>uber</classifier>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
<exclusions>
<exclusion>
<groupId>org.mortbay.jetty</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>javax.servlet.jsp</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>javax.servlet</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>org.eclipse.jetty</groupId>
<artifactId>*</artifactId>
</exclusion>
</exclusions>
</dependency>
<!-- Hoodie -->
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-spark_${scala.binary.version}</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-utilities_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<scope>provided</scope>
<exclusions>
<exclusion>
<groupId>org.mortbay.jetty</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>javax.servlet.jsp</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>javax.servlet</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>com.databricks</groupId>
<artifactId>spark-avro_${scala.binary.version}</artifactId>
</exclusion>
</exclusions>
</dependency>
<!-- Logging -->
<dependency>
@@ -89,6 +146,67 @@
<type>test-jar</type>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-hive-sync</artifactId>
<version>${project.version}</version>
<exclusions>
<exclusion>
<groupId>org.mortbay.jetty</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>javax.servlet.jsp</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>javax.servlet</groupId>
<artifactId>*</artifactId>
</exclusion>
</exclusions>
<classifier>tests</classifier>
<type>test-jar</type>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-utilities_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<classifier>tests</classifier>
<type>test-jar</type>
<scope>test</scope>
<exclusions>
<exclusion>
<groupId>org.mortbay.jetty</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>javax.servlet.jsp</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>javax.servlet</groupId>
<artifactId>*</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-client</artifactId>
<version>${project.version}</version>
<type>test-jar</type>
</dependency>
<!-- Fasterxml -->
<dependency>
<groupId>com.fasterxml.jackson.dataformat</groupId>
<artifactId>jackson-dataformat-yaml</artifactId>
<version>2.7.4</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.6.7.3</version>
</dependency>
<!-- Fasterxml - Test-->
<dependency>
@@ -107,6 +225,87 @@
<scope>test</scope>
</dependency>
<!-- HDFS test dependencies -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<classifier>tests</classifier>
<exclusions>
<exclusion>
<groupId>org.mortbay.jetty</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>javax.servlet.jsp</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>javax.servlet</groupId>
<artifactId>*</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<classifier>tests</classifier>
<exclusions>
<exclusion>
<groupId>javax.servlet</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<artifactId>netty</artifactId>
<groupId>io.netty</groupId>
</exclusion>
<exclusion>
<artifactId>netty-all</artifactId>
<groupId>io.netty</groupId>
</exclusion>
</exclusions>
</dependency>
<!-- Hive dependencies -->
<dependency>
<groupId>${hive.groupid}</groupId>
<artifactId>hive-exec</artifactId>
<version>${hive.version}</version>
<classifier>${hive.exec.classifier}</classifier>
<exclusions>
<exclusion>
<groupId>javax.servlet</groupId>
<artifactId>servlet-api</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>${hive.groupid}</groupId>
<artifactId>hive-jdbc</artifactId>
<version>${hive.version}</version>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</exclusion>
<exclusion>
<groupId>javax.servlet.jsp</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>javax.servlet</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>org.eclipse.jetty</groupId>
<artifactId>*</artifactId>
</exclusion>
</exclusions>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.awaitility</groupId>
<artifactId>awaitility</artifactId>
@@ -138,6 +337,12 @@
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-junit-jupiter</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
<properties>

View File

@@ -0,0 +1,174 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#!/bin/bash
# Determine the current working directory
_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
# Preserve the calling directory
_CALLING_DIR="$(pwd)"
#########################
# The command line help #
#########################
usage() {
echo "Usage: $0"
echo " --spark-command, prints the spark command"
echo " -h | --hadoop, hadoop-version"
echo " -s | --spark, spark version"
echo " -p | --parquet, parquet version"
echo " -a | --avro, avro version"
echo " -i | --hive, hive version"
echo " -l | --scala, scala version"
exit 1
}
get_spark_command() {
if [ -z "$scala" ]
then
scala="2.11"
else
scala=$1
fi
echo "spark-submit --packages org.apache.spark:spark-avro_${scala}:2.4.4 \
--master $0 \
--deploy-mode $1 \
--properties-file $2 \
--class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \
`ls target/hudi-integ-test-*-SNAPSHOT.jar` \
--source-class $3 \
--source-ordering-field $4 \
--input-base-path $5 \
--target-base-path $6 \
--target-table $7 \
--props $8 \
--storage-type $9 \
--payload-class "${10}" \
--workload-yaml-path "${11}" \
--input-file-size "${12}" \
--<use-deltastreamer>"
}
case "$1" in
--help)
usage
exit 0
;;
esac
case "$1" in
--spark-command)
get_spark_command
exit 0
;;
esac
while getopts ":h:s:p:a:i:l:-:" opt; do
case $opt in
h) hadoop="$OPTARG"
printf "Argument hadoop is %s\n" "$hadoop"
;;
s) spark="$OPTARG"
printf "Argument spark is %s\n" "$spark"
;;
p) parquet="$OPTARG"
printf "Argument parquet is %s\n" "$parquet"
;;
a) avro="$OPTARG"
printf "Argument avro is %s\n" "$avro"
;;
i) hive="$OPTARG"
printf "Argument hive is %s\n" "$hive"
;;
l) scala="$OPTARG"
printf "Argument scala is %s\n" "$scala"
;;
-)
case "$OPTARG" in
hadoop)
hadoop="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
printf "Argument hadoop is %s\n" "$hadoop"
;;
spark)
spark="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
printf "Argument spark is %s\n" "$spark"
;;
parquet)
parquet="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
printf "Argument parquet is %s\n" "$parquet"
;;
avro)
avro="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
printf "Argument avro is %s\n" "$avro"
;;
hive)
hive="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
printf "Argument hive is %s\n" "$hive"
;;
scala)
scala="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
printf "Argument scala is %s\n" "$scala"
;;
*) echo "Invalid option --$OPTARG" >&2
;;
esac ;;
\?) echo "Invalid option -$OPTARG" >&2
;;
esac
done
get_versions () {
base_command=''
if [ -z "$hadoop" ]
then
base_command=$base_command
else
hadoop=$1
base_command+=' -Dhadoop.version='$hadoop
fi
if [ -z "$hive" ]
then
base_command=$base_command
else
hive=$2
base_command+=' -Dhive.version='$hive
fi
if [ -z "$scala" ]
then
base_command=$base_command
else
scala=$3
base_command+=' -Dscala-'$scala
fi
echo $base_command
}
versions=$(get_versions $hadoop $hive $scala)
final_command='mvn clean install -DskipTests '$versions
printf "Final command $final_command \n"
# change to the project root directory to run maven command
move_to_root='cd ..'
$move_to_root && $final_command
# change back to original working directory
cd $_CALLING_DIR
printf "A sample spark command to start the integration suite \n"
get_spark_command $scala

View File

@@ -0,0 +1,77 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer;
import org.apache.hudi.utilities.schema.SchemaProvider;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
/**
* Extends the {@link HoodieDeltaStreamer} to expose certain operations helpful in running the Test Suite.
* This is done to achieve 2 things 1) Leverage some components of {@link HoodieDeltaStreamer} 2)
* Piggyback on the suite to test {@link HoodieDeltaStreamer}
*/
public class HoodieDeltaStreamerWrapper extends HoodieDeltaStreamer {
public HoodieDeltaStreamerWrapper(Config cfg, JavaSparkContext jssc) throws Exception {
super(cfg, jssc);
}
public HoodieDeltaStreamerWrapper(Config cfg, JavaSparkContext jssc, FileSystem fs, HiveConf conf) throws Exception {
super(cfg, jssc, fs, conf);
}
public JavaRDD<WriteStatus> upsert(Operation operation) throws
Exception {
cfg.operation = operation;
return deltaSyncService.getDeltaSync().syncOnce().getRight();
}
public JavaRDD<WriteStatus> insert() throws Exception {
return upsert(Operation.INSERT);
}
public JavaRDD<WriteStatus> bulkInsert() throws
Exception {
return upsert(Operation.BULK_INSERT);
}
public void scheduleCompact() throws Exception {
// Since we don't support scheduleCompact() operation in delta-streamer, assume upsert without any data that will
// trigger scheduling compaction
upsert(Operation.UPSERT);
}
public JavaRDD<WriteStatus> compact() throws Exception {
// Since we don't support compact() operation in delta-streamer, assume upsert without any data that will trigger
// inline compaction
return upsert(Operation.UPSERT);
}
public Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> fetchSource() throws Exception {
return deltaSyncService.getDeltaSync().readFromSource(deltaSyncService.getDeltaSync().getCommitTimelineOpt());
}
}

View File

@@ -0,0 +1,186 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite;
import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hudi.DataSourceUtils;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.integ.testsuite.configuration.DFSDeltaConfig;
import org.apache.hudi.integ.testsuite.dag.DagUtils;
import org.apache.hudi.integ.testsuite.dag.WorkflowDag;
import org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator;
import org.apache.hudi.integ.testsuite.dag.scheduler.DagScheduler;
import org.apache.hudi.integ.testsuite.generator.DeltaGenerator;
import org.apache.hudi.integ.testsuite.reader.DeltaInputType;
import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode;
import org.apache.hudi.keygen.KeyGenerator;
import org.apache.hudi.utilities.UtilHelpers;
import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer;
import org.apache.hudi.utilities.schema.SchemaProvider;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* This is the entry point for running a Hudi Test Suite. Although this class has similarities with
* {@link HoodieDeltaStreamer} this class does not extend it since do not want to create a dependency on the changes in
* DeltaStreamer.
*/
public class HoodieTestSuiteJob {
private static volatile Logger log = LoggerFactory.getLogger(HoodieTestSuiteJob.class);
private final HoodieTestSuiteConfig cfg;
/**
* Bag of properties with source, hoodie client, key generator etc.
*/
TypedProperties props;
/**
* Schema provider that supplies the command for writing out the generated payloads.
*/
private transient SchemaProvider schemaProvider;
/**
* Filesystem used.
*/
private transient FileSystem fs;
/**
* Spark context.
*/
private transient JavaSparkContext jsc;
/**
* Spark Session.
*/
private transient SparkSession sparkSession;
/**
* Hive Config.
*/
private transient HiveConf hiveConf;
private KeyGenerator keyGenerator;
public HoodieTestSuiteJob(HoodieTestSuiteConfig cfg, JavaSparkContext jsc) throws IOException {
this.cfg = cfg;
this.jsc = jsc;
this.sparkSession = SparkSession.builder().config(jsc.getConf()).getOrCreate();
this.fs = FSUtils.getFs(cfg.inputBasePath, jsc.hadoopConfiguration());
this.props = UtilHelpers.readConfig(fs, new Path(cfg.propsFilePath), cfg.configs).getConfig();
log.info("Creating workload generator with configs : {}", props.toString());
this.schemaProvider = UtilHelpers.createSchemaProvider(cfg.schemaProviderClassName, props, jsc);
this.hiveConf = getDefaultHiveConf(jsc.hadoopConfiguration());
this.keyGenerator = DataSourceUtils.createKeyGenerator(props);
if (!fs.exists(new Path(cfg.targetBasePath))) {
HoodieTableMetaClient.initTableType(jsc.hadoopConfiguration(), cfg.targetBasePath,
HoodieTableType.valueOf(cfg.tableType), cfg.targetTableName, "archived");
}
}
private static HiveConf getDefaultHiveConf(Configuration cfg) {
HiveConf hiveConf = new HiveConf();
hiveConf.addResource(cfg);
return hiveConf;
}
public static void main(String[] args) throws Exception {
final HoodieTestSuiteConfig cfg = new HoodieTestSuiteConfig();
JCommander cmd = new JCommander(cfg, args);
if (cfg.help || args.length == 0) {
cmd.usage();
System.exit(1);
}
JavaSparkContext jssc = UtilHelpers.buildSparkContext("workload-generator-" + cfg.outputTypeName
+ "-" + cfg.inputFormatName, cfg.sparkMaster);
new HoodieTestSuiteJob(cfg, jssc).runTestSuite();
}
public void runTestSuite() {
try {
WorkflowDag workflowDag = this.cfg.workloadYamlPath == null ? ((WorkflowDagGenerator) ReflectionUtils
.loadClass((this.cfg).workloadDagGenerator)).build()
: DagUtils.convertYamlPathToDag(this.fs, this.cfg.workloadYamlPath);
log.info("Workflow Dag => " + DagUtils.convertDagToYaml(workflowDag));
long startTime = System.currentTimeMillis();
String schemaStr = schemaProvider.getSourceSchema().toString();
final HoodieTestSuiteWriter writer = new HoodieTestSuiteWriter(jsc, props, cfg, schemaStr);
final DeltaGenerator deltaGenerator = new DeltaGenerator(
new DFSDeltaConfig(DeltaOutputMode.valueOf(cfg.outputTypeName), DeltaInputType.valueOf(cfg.inputFormatName),
new SerializableConfiguration(jsc.hadoopConfiguration()), cfg.inputBasePath, cfg.targetBasePath,
schemaStr, cfg.limitFileSize), jsc, sparkSession, schemaStr, keyGenerator);
DagScheduler dagScheduler = new DagScheduler(workflowDag, writer, deltaGenerator);
dagScheduler.schedule();
log.info("Finished scheduling all tasks, Time taken {}", System.currentTimeMillis() - startTime);
} catch (Exception e) {
log.error("Failed to run Test Suite ", e);
throw new HoodieException("Failed to run Test Suite ", e);
} finally {
jsc.stop();
}
}
/**
* The Hudi test suite uses {@link HoodieDeltaStreamer} to run some operations hence extend delta streamer config.
*/
public static class HoodieTestSuiteConfig extends HoodieDeltaStreamer.Config {
@Parameter(names = {"--input-base-path"}, description = "base path for input data"
+ "(Will be created if did not exist first time around. If exists, more data will be added to that path)",
required = true)
public String inputBasePath;
@Parameter(names = {
"--workload-generator-classname"}, description = "WorkflowDag of operations to generate the workload",
required = true)
public String workloadDagGenerator = WorkflowDagGenerator.class.getName();
@Parameter(names = {
"--workload-yaml-path"}, description = "Workflow Dag yaml path to generate the workload")
public String workloadYamlPath;
@Parameter(names = {"--delta-output-type"}, description = "Subclass of "
+ "org.apache.hudi.testsuite.workload.DeltaOutputMode to readAvro data.")
public String outputTypeName = DeltaOutputMode.DFS.name();
@Parameter(names = {"--delta-input-format"}, description = "Subclass of "
+ "org.apache.hudi.testsuite.workload.DeltaOutputMode to read avro data.")
public String inputFormatName = DeltaInputType.AVRO.name();
@Parameter(names = {"--input-file-size"}, description = "The min/max size of the input files to generate",
required = true)
public Long limitFileSize = 1024 * 1024 * 120L;
@Parameter(names = {"--use-deltastreamer"}, description = "Choose whether to use HoodieDeltaStreamer to "
+ "perform"
+ " ingestion. If set to false, HoodieWriteClient will be used")
public Boolean useDeltaStreamer = false;
}
}

View File

@@ -0,0 +1,219 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.avro.model.HoodieCompactionPlan;
import org.apache.hudi.client.HoodieReadClient;
import org.apache.hudi.client.HoodieWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieCompactionConfig;
import org.apache.hudi.config.HoodieIndexConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.integ.testsuite.dag.nodes.CleanNode;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.dag.nodes.RollbackNode;
import org.apache.hudi.integ.testsuite.dag.nodes.ScheduleCompactNode;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteJob.HoodieTestSuiteConfig;
import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer.Operation;
import org.apache.hudi.utilities.schema.SchemaProvider;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
/**
* A writer abstraction for the Hudi test suite. This class wraps different implementations of writers used to perform
* write operations into the target hudi dataset. Current supported writers are {@link HoodieDeltaStreamerWrapper}
* and {@link HoodieWriteClient}.
*/
public class HoodieTestSuiteWriter {
private HoodieDeltaStreamerWrapper deltaStreamerWrapper;
private HoodieWriteClient writeClient;
protected HoodieTestSuiteConfig cfg;
private Option<String> lastCheckpoint;
private HoodieReadClient hoodieReadClient;
private Properties props;
private String schema;
private transient Configuration configuration;
private transient JavaSparkContext sparkContext;
private static Set<String> VALID_DAG_NODES_TO_ALLOW_WRITE_CLIENT_IN_DELTASTREAMER_MODE = new HashSet<>(
Arrays.asList(RollbackNode.class.getName(), CleanNode.class.getName(), ScheduleCompactNode.class.getName()));
public HoodieTestSuiteWriter(JavaSparkContext jsc, Properties props, HoodieTestSuiteConfig cfg, String schema) throws
Exception {
this(jsc, props, cfg, schema, true);
}
public HoodieTestSuiteWriter(JavaSparkContext jsc, Properties props, HoodieTestSuiteConfig cfg, String schema,
boolean rollbackInflight) throws Exception {
// We ensure that only 1 instance of HoodieWriteClient is instantiated for a HoodieTestSuiteWriter
// This does not instantiate a HoodieWriteClient until a
// {@link HoodieDeltaStreamer#commit(HoodieWriteClient, JavaRDD, Option)} is invoked.
this.deltaStreamerWrapper = new HoodieDeltaStreamerWrapper(cfg, jsc);
this.hoodieReadClient = new HoodieReadClient(jsc, cfg.targetBasePath);
if (!cfg.useDeltaStreamer) {
this.writeClient = new HoodieWriteClient(jsc, getHoodieClientConfig(cfg, props, schema), rollbackInflight);
}
this.cfg = cfg;
this.configuration = jsc.hadoopConfiguration();
this.sparkContext = jsc;
this.props = props;
this.schema = schema;
}
private HoodieWriteConfig getHoodieClientConfig(HoodieTestSuiteConfig cfg, Properties props, String schema) {
HoodieWriteConfig.Builder builder =
HoodieWriteConfig.newBuilder().combineInput(true, true).withPath(cfg.targetBasePath)
.withAutoCommit(false)
.withCompactionConfig(HoodieCompactionConfig.newBuilder().withPayloadClass(cfg.payloadClassName).build())
.forTable(cfg.targetTableName)
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
.withProps(props);
builder = builder.withSchema(schema);
return builder.build();
}
private boolean allowWriteClientAccess(DagNode dagNode) {
if (VALID_DAG_NODES_TO_ALLOW_WRITE_CLIENT_IN_DELTASTREAMER_MODE.contains(dagNode.getClass().getName())) {
return true;
}
return false;
}
public Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> fetchSource() throws Exception {
return this.deltaStreamerWrapper.fetchSource();
}
public Option<String> startCommit() {
if (cfg.useDeltaStreamer) {
return Option.of(HoodieActiveTimeline.createNewInstantTime());
} else {
return Option.of(writeClient.startCommit());
}
}
public JavaRDD<WriteStatus> upsert(Option<String> instantTime) throws Exception {
if (cfg.useDeltaStreamer) {
return deltaStreamerWrapper.upsert(Operation.UPSERT);
} else {
Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> nextBatch = fetchSource();
lastCheckpoint = Option.of(nextBatch.getValue().getLeft());
return writeClient.upsert(nextBatch.getRight().getRight(), instantTime.get());
}
}
public JavaRDD<WriteStatus> insert(Option<String> instantTime) throws Exception {
if (cfg.useDeltaStreamer) {
return deltaStreamerWrapper.insert();
} else {
Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> nextBatch = fetchSource();
lastCheckpoint = Option.of(nextBatch.getValue().getLeft());
return writeClient.insert(nextBatch.getRight().getRight(), instantTime.get());
}
}
public JavaRDD<WriteStatus> bulkInsert(Option<String> instantTime) throws Exception {
if (cfg.useDeltaStreamer) {
return deltaStreamerWrapper.bulkInsert();
} else {
Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> nextBatch = fetchSource();
lastCheckpoint = Option.of(nextBatch.getValue().getLeft());
return writeClient.bulkInsert(nextBatch.getRight().getRight(), instantTime.get());
}
}
public JavaRDD<WriteStatus> compact(Option<String> instantTime) throws Exception {
if (cfg.useDeltaStreamer) {
return deltaStreamerWrapper.compact();
} else {
if (!instantTime.isPresent()) {
Option<Pair<String, HoodieCompactionPlan>> compactionPlanPair = Option
.fromJavaOptional(hoodieReadClient.getPendingCompactions()
.stream().findFirst());
if (compactionPlanPair.isPresent()) {
instantTime = Option.of(compactionPlanPair.get().getLeft());
}
}
if (instantTime.isPresent()) {
return writeClient.compact(instantTime.get());
} else {
return null;
}
}
}
public Option<String> scheduleCompaction(Option<Map<String, String>> previousCommitExtraMetadata) throws
Exception {
if (!cfg.useDeltaStreamer) {
deltaStreamerWrapper.scheduleCompact();
return Option.empty();
} else {
return writeClient.scheduleCompaction(previousCommitExtraMetadata);
}
}
public void commit(JavaRDD<WriteStatus> records, Option<String> instantTime) {
if (!cfg.useDeltaStreamer) {
Map<String, String> extraMetadata = new HashMap<>();
/** Store the checkpoint in the commit metadata just like
* {@link HoodieDeltaStreamer#commit(HoodieWriteClient, JavaRDD, Option)} **/
extraMetadata.put(HoodieDeltaStreamerWrapper.CHECKPOINT_KEY, lastCheckpoint.get());
writeClient.commit(instantTime.get(), records, Option.of(extraMetadata));
}
}
public HoodieWriteClient getWriteClient(DagNode dagNode) throws IllegalAccessException {
if (cfg.useDeltaStreamer & !allowWriteClientAccess(dagNode)) {
throw new IllegalAccessException("cannot access write client when testing in deltastreamer mode");
}
synchronized (this) {
if (writeClient == null) {
this.writeClient = new HoodieWriteClient(this.sparkContext, getHoodieClientConfig(cfg, props, schema), false);
}
}
return writeClient;
}
public HoodieDeltaStreamerWrapper getDeltaStreamerWrapper() {
return deltaStreamerWrapper;
}
public HoodieTestSuiteConfig getCfg() {
return cfg;
}
public Configuration getConfiguration() {
return configuration;
}
public JavaSparkContext getSparkContext() {
return sparkContext;
}
}

View File

@@ -0,0 +1,73 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.configuration;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.integ.testsuite.reader.DeltaInputType;
import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode;
/**
* Configuration to hold details about a DFS based output type, implements {@link DeltaConfig}.
*/
public class DFSDeltaConfig extends DeltaConfig {
// The base path where the generated data should be written to. This data will in turn be used to write into a hudi
// dataset
private final String deltaBasePath;
private final String datasetOutputPath;
private final String schemaStr;
// Maximum file size for the files generated
private final Long maxFileSize;
// The current batch id
private Integer batchId;
public DFSDeltaConfig(DeltaOutputMode deltaOutputMode, DeltaInputType deltaInputType,
SerializableConfiguration configuration,
String deltaBasePath, String targetBasePath, String schemaStr, Long maxFileSize) {
super(deltaOutputMode, deltaInputType, configuration);
this.deltaBasePath = deltaBasePath;
this.schemaStr = schemaStr;
this.maxFileSize = maxFileSize;
this.datasetOutputPath = targetBasePath;
}
public String getDeltaBasePath() {
return deltaBasePath;
}
public String getDatasetOutputPath() {
return datasetOutputPath;
}
public String getSchemaStr() {
return schemaStr;
}
public Long getMaxFileSize() {
return maxFileSize;
}
public Integer getBatchId() {
return batchId;
}
public void setBatchId(Integer batchId) {
this.batchId = batchId;
}
}

View File

@@ -0,0 +1,256 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.configuration;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.integ.testsuite.reader.DeltaInputType;
import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode;
/**
* Configuration to hold the delta output type and delta input format.
*/
public class DeltaConfig implements Serializable {
private final DeltaOutputMode deltaOutputMode;
private final DeltaInputType deltaInputType;
private final SerializableConfiguration configuration;
public DeltaConfig(DeltaOutputMode deltaOutputMode, DeltaInputType deltaInputType,
SerializableConfiguration configuration) {
this.deltaOutputMode = deltaOutputMode;
this.deltaInputType = deltaInputType;
this.configuration = configuration;
}
public DeltaOutputMode getDeltaOutputMode() {
return deltaOutputMode;
}
public DeltaInputType getDeltaInputType() {
return deltaInputType;
}
public Configuration getConfiguration() {
return configuration.get();
}
/**
* Represents any kind of workload operation for new data. Each workload also contains a set of Option sequence of
* actions that can be executed in parallel.
*/
public static class Config {
public static final String CONFIG_NAME = "config";
public static final String TYPE = "type";
public static final String NODE_NAME = "name";
public static final String DEPENDENCIES = "deps";
public static final String CHILDREN = "children";
public static final String HIVE_QUERIES = "hive_queries";
public static final String HIVE_PROPERTIES = "hive_props";
private static String NUM_RECORDS_INSERT = "num_records_insert";
private static String NUM_RECORDS_UPSERT = "num_records_upsert";
private static String REPEAT_COUNT = "repeat_count";
private static String RECORD_SIZE = "record_size";
private static String NUM_PARTITIONS_INSERT = "num_partitions_insert";
private static String NUM_PARTITIONS_UPSERT = "num_partitions_upsert";
private static String NUM_FILES_UPSERT = "num_files_upsert";
private static String FRACTION_UPSERT_PER_FILE = "fraction_upsert_per_file";
private static String DISABLE_GENERATE = "disable_generate";
private static String DISABLE_INGEST = "disable_ingest";
private static String HIVE_LOCAL = "hive_local";
private Map<String, Object> configsMap;
public Config(Map<String, Object> configsMap) {
this.configsMap = configsMap;
}
public static Builder newBuilder() {
return new Builder();
}
public long getNumRecordsInsert() {
return Long.valueOf(configsMap.getOrDefault(NUM_RECORDS_INSERT, 0).toString());
}
public long getNumRecordsUpsert() {
return Long.valueOf(configsMap.getOrDefault(NUM_RECORDS_UPSERT, 0).toString());
}
public int getRecordSize() {
return Integer.valueOf(configsMap.getOrDefault(RECORD_SIZE, 1024).toString());
}
public int getNumInsertPartitions() {
return Integer.valueOf(configsMap.getOrDefault(NUM_PARTITIONS_INSERT, 1).toString());
}
public int getRepeatCount() {
return Integer.valueOf(configsMap.getOrDefault(REPEAT_COUNT, 1).toString());
}
public int getNumUpsertPartitions() {
return Integer.valueOf(configsMap.getOrDefault(NUM_PARTITIONS_UPSERT, 0).toString());
}
public int getNumUpsertFiles() {
return Integer.valueOf(configsMap.getOrDefault(NUM_FILES_UPSERT, 1).toString());
}
public double getFractionUpsertPerFile() {
return Double.valueOf(configsMap.getOrDefault(FRACTION_UPSERT_PER_FILE, 0.0).toString());
}
public boolean isDisableGenerate() {
return Boolean.valueOf(configsMap.getOrDefault(DISABLE_GENERATE, false).toString());
}
public boolean isDisableIngest() {
return Boolean.valueOf(configsMap.getOrDefault(DISABLE_INGEST, false).toString());
}
public Map<String, Object> getOtherConfigs() {
if (configsMap == null) {
return new HashMap<>();
}
return configsMap;
}
public List<Pair<String, Integer>> getHiveQueries() {
try {
return (List<Pair<String, Integer>>) this.configsMap.getOrDefault("hive_queries", new ArrayList<>());
} catch (Exception e) {
throw new RuntimeException("unable to get hive queries from configs");
}
}
public boolean isHiveLocal() {
return Boolean.valueOf(configsMap.getOrDefault(HIVE_LOCAL, false).toString());
}
public List<String> getHiveProperties() {
return (List<String>) this.configsMap.getOrDefault(HIVE_PROPERTIES, new ArrayList<>());
}
@Override
public String toString() {
try {
return new ObjectMapper().writeValueAsString(this.configsMap);
} catch (Exception e) {
throw new RuntimeException("unable to generate string representation of config");
}
}
public static class Builder {
private Map<String, Object> configsMap = new HashMap<>();
public Builder() {
}
public Builder withNumRecordsToInsert(long numRecordsInsert) {
this.configsMap.put(NUM_RECORDS_INSERT, numRecordsInsert);
return this;
}
public Builder withNumRecordsToUpdate(long numRecordsUpsert) {
this.configsMap.put(NUM_RECORDS_UPSERT, numRecordsUpsert);
return this;
}
public Builder withNumInsertPartitions(int numInsertPartitions) {
this.configsMap.put(NUM_PARTITIONS_INSERT, numInsertPartitions);
return this;
}
public Builder withNumUpsertPartitions(int numUpsertPartitions) {
this.configsMap.put(NUM_PARTITIONS_UPSERT, numUpsertPartitions);
return this;
}
public Builder withNumUpsertFiles(int numUpsertFiles) {
this.configsMap.put(NUM_FILES_UPSERT, numUpsertFiles);
return this;
}
public Builder withFractionUpsertPerFile(double fractionUpsertPerFile) {
this.configsMap.put(FRACTION_UPSERT_PER_FILE, fractionUpsertPerFile);
return this;
}
public Builder withNumTimesToRepeat(int repeatCount) {
this.configsMap.put(REPEAT_COUNT, repeatCount);
return this;
}
public Builder withRecordSize(int recordSize) {
this.configsMap.put(RECORD_SIZE, recordSize);
return this;
}
public Builder disableGenerate(boolean generate) {
this.configsMap.put(DISABLE_GENERATE, generate);
return this;
}
public Builder disableIngest(boolean ingest) {
this.configsMap.put(DISABLE_INGEST, ingest);
return this;
}
public Builder withConfig(String name, Object value) {
this.configsMap.put(name, value);
return this;
}
public Builder withHiveQueryAndResults(List<Pair<String, Integer>> hiveQueries) {
this.configsMap.put(HIVE_QUERIES, hiveQueries);
return this;
}
public Builder withHiveLocal(boolean startHiveLocal) {
this.configsMap.put(HIVE_LOCAL, startHiveLocal);
return this;
}
public Builder withHiveProperties(List<String> hiveProperties) {
this.configsMap.put(HIVE_PROPERTIES, hiveProperties);
return this;
}
public Builder withConfigsMap(Map<String, Object> configsMap) {
this.configsMap = configsMap;
return this;
}
public Config build() {
return new Config(configsMap);
}
}
}
}

View File

@@ -0,0 +1,33 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.converter;
import java.io.Serializable;
import org.apache.spark.api.java.JavaRDD;
/**
* Implementations of {@link Converter} will convert data from one format to another.
*
* @param <I> Input Data Type
* @param <O> Output Data Type
*/
public interface Converter<I, O> extends Serializable {
JavaRDD<O> convert(JavaRDD<I> inputRDD);
}

View File

@@ -0,0 +1,53 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.converter;
import java.util.List;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.integ.testsuite.generator.LazyRecordGeneratorIterator;
import org.apache.hudi.integ.testsuite.generator.UpdateGeneratorIterator;
import org.apache.spark.api.java.JavaRDD;
/**
* This converter creates an update {@link GenericRecord} from an existing {@link GenericRecord}.
*/
public class UpdateConverter implements Converter<GenericRecord, GenericRecord> {
private final String schemaStr;
// The fields that should not be mutated when converting the insert record to an update record, generally the
// record_key
private final List<String> partitionPathFields;
private final List<String> recordKeyFields;
private final int minPayloadSize;
public UpdateConverter(String schemaStr, int minPayloadSize, List<String> partitionPathFields,
List<String> recordKeyFields) {
this.schemaStr = schemaStr;
this.partitionPathFields = partitionPathFields;
this.recordKeyFields = recordKeyFields;
this.minPayloadSize = minPayloadSize;
}
@Override
public JavaRDD<GenericRecord> convert(JavaRDD<GenericRecord> inputRDD) {
return inputRDD.mapPartitions(recordItr -> new LazyRecordGeneratorIterator(new UpdateGeneratorIterator(recordItr,
schemaStr, partitionPathFields, recordKeyFields, minPayloadSize)));
}
}

View File

@@ -0,0 +1,217 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.fasterxml.jackson.dataformat.yaml.YAMLFactory;
import com.fasterxml.jackson.dataformat.yaml.YAMLGenerator.Feature;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
/**
* Utility class to SerDe workflow dag.
*/
public class DagUtils {
static final ObjectMapper MAPPER = new ObjectMapper();
/**
* Converts a YAML path to {@link WorkflowDag}.
*/
public static WorkflowDag convertYamlPathToDag(FileSystem fs, String path) throws IOException {
InputStream is = fs.open(new Path(path));
return convertYamlToDag(toString(is));
}
/**
* Converts a YAML representation to {@link WorkflowDag}.
*/
public static WorkflowDag convertYamlToDag(String yaml) throws IOException {
Map<String, DagNode> allNodes = new HashMap<>();
final ObjectMapper yamlReader = new ObjectMapper(new YAMLFactory());
final JsonNode jsonNode = yamlReader.readTree(yaml);
Iterator<Entry<String, JsonNode>> itr = jsonNode.fields();
while (itr.hasNext()) {
Entry<String, JsonNode> dagNode = itr.next();
allNodes.put(dagNode.getKey(), convertJsonToDagNode(allNodes, dagNode.getValue()));
}
return new WorkflowDag(findRootNodes(allNodes));
}
/**
* Converts {@link WorkflowDag} to a YAML representation.
*/
public static String convertDagToYaml(WorkflowDag dag) throws IOException {
final ObjectMapper yamlWriter = new ObjectMapper(new YAMLFactory().disable(Feature.WRITE_DOC_START_MARKER)
.enable(Feature.MINIMIZE_QUOTES).enable(JsonParser.Feature.ALLOW_UNQUOTED_FIELD_NAMES));
JsonNode yamlNode = MAPPER.createObjectNode();
convertDagToYaml(yamlNode, dag.getNodeList());
return yamlWriter.writerWithDefaultPrettyPrinter().writeValueAsString(yamlNode);
}
private static void convertDagToYaml(JsonNode yamlNode, List<DagNode> dagNodes) throws IOException {
for (DagNode dagNode : dagNodes) {
String name = dagNode.getConfig().getOtherConfigs().getOrDefault(DeltaConfig.Config.NODE_NAME, dagNode.getName()).toString();
((ObjectNode) yamlNode).put(name, convertDagNodeToJsonNode(dagNode));
if (dagNode.getChildNodes().size() > 0) {
convertDagToYaml(yamlNode, dagNode.getChildNodes());
}
}
}
private static DagNode convertJsonToDagNode(Map<String, DagNode> allNodes, JsonNode node) throws IOException {
String type = node.get(DeltaConfig.Config.TYPE).asText();
final DagNode retNode = convertJsonToDagNode(node, type);
Arrays.asList(node.get(DeltaConfig.Config.DEPENDENCIES).textValue().split(",")).stream().forEach(dep -> {
DagNode parentNode = allNodes.get(dep);
if (parentNode != null) {
parentNode.addChildNode(retNode);
}
});
return retNode;
}
private static List<DagNode> findRootNodes(Map<String, DagNode> allNodes) {
final List<DagNode> rootNodes = new ArrayList<>();
allNodes.entrySet().stream().forEach(entry -> {
if (entry.getValue().getParentNodes().size() < 1) {
rootNodes.add(entry.getValue());
}
});
return rootNodes;
}
private static DagNode convertJsonToDagNode(JsonNode node, String type) {
try {
DeltaConfig.Config config = DeltaConfig.Config.newBuilder().withConfigsMap(convertJsonNodeToMap(node)).build();
return (DagNode) ReflectionUtils.loadClass(generateFQN(type), config);
} catch (ClassNotFoundException e) {
throw new RuntimeException(e);
}
}
private static String generateFQN(String name) throws ClassNotFoundException {
return Class.forName(StringUtils.joinUsingDelim(".",
DagNode.class.getName().substring(0, DagNode.class.getName().lastIndexOf(".")), name)).getName();
}
private static JsonNode convertDagNodeToJsonNode(DagNode node) throws IOException {
return createJsonNode(node, node.getClass().getSimpleName());
}
private static Map<String, Object> convertJsonNodeToMap(JsonNode node) {
Map<String, Object> configsMap = new HashMap<>();
Iterator<Entry<String, JsonNode>> itr = node.get(DeltaConfig.Config.CONFIG_NAME).fields();
while (itr.hasNext()) {
Entry<String, JsonNode> entry = itr.next();
switch (entry.getKey()) {
case DeltaConfig.Config.HIVE_QUERIES:
configsMap.put(DeltaConfig.Config.HIVE_QUERIES, getHiveQueries(entry));
break;
case DeltaConfig.Config.HIVE_PROPERTIES:
configsMap.put(DeltaConfig.Config.HIVE_PROPERTIES, getProperties(entry));
break;
default:
configsMap.put(entry.getKey(), getValue(entry.getValue()));
break;
}
}
return configsMap;
}
private static List<Pair<String, Integer>> getHiveQueries(Entry<String, JsonNode> entry) {
List<Pair<String, Integer>> queries = new ArrayList<>();
Iterator<Entry<String, JsonNode>> queriesItr = entry.getValue().fields();
while (queriesItr.hasNext()) {
queries.add(Pair.of(queriesItr.next().getValue().textValue(), queriesItr.next().getValue().asInt()));
}
return queries;
}
private static List<String> getProperties(Entry<String, JsonNode> entry) {
List<String> properties = new ArrayList<>();
Iterator<Entry<String, JsonNode>> queriesItr = entry.getValue().fields();
while (queriesItr.hasNext()) {
properties.add(queriesItr.next().getValue().textValue());
}
return properties;
}
private static Object getValue(JsonNode node) {
if (node.isInt()) {
return node.asInt();
} else if (node.isLong()) {
return node.asLong();
} else if (node.isShort()) {
return node.asInt();
} else if (node.isBoolean()) {
return node.asBoolean();
} else if (node.isDouble()) {
return node.asDouble();
} else if (node.isFloat()) {
return node.asDouble();
}
return node.textValue();
}
private static JsonNode createJsonNode(DagNode node, String type) throws IOException {
JsonNode configNode = MAPPER.readTree(node.getConfig().toString());
JsonNode jsonNode = MAPPER.createObjectNode();
((ObjectNode) jsonNode).put(DeltaConfig.Config.CONFIG_NAME, configNode);
((ObjectNode) jsonNode).put(DeltaConfig.Config.TYPE, type);
((ObjectNode) jsonNode).put(DeltaConfig.Config.DEPENDENCIES, getDependencyNames(node));
return jsonNode;
}
private static String getDependencyNames(DagNode node) {
return node.getParentNodes().stream()
.map(e -> ((DagNode) e).getConfig().getOtherConfigs().getOrDefault(DeltaConfig.Config.NODE_NAME, node.getName()).toString())
.collect(Collectors.joining(",")).toString();
}
public static String toString(InputStream inputStream) throws IOException {
ByteArrayOutputStream result = new ByteArrayOutputStream();
byte[] buffer = new byte[1024];
int length;
while ((length = inputStream.read(buffer)) != -1) {
result.write(buffer, 0, length);
}
return result.toString("utf-8");
}
}

View File

@@ -0,0 +1,54 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag;
import java.io.Serializable;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.generator.DeltaGenerator;
import org.apache.spark.api.java.JavaSparkContext;
/**
* This wraps the context needed for an execution of
* a {@link DagNode#execute(ExecutionContext)}.
*/
public class ExecutionContext implements Serializable {
private HoodieTestSuiteWriter hoodieTestSuiteWriter;
private DeltaGenerator deltaGenerator;
private transient JavaSparkContext jsc;
public ExecutionContext(JavaSparkContext jsc, HoodieTestSuiteWriter hoodieTestSuiteWriter, DeltaGenerator deltaGenerator) {
this.hoodieTestSuiteWriter = hoodieTestSuiteWriter;
this.deltaGenerator = deltaGenerator;
this.jsc = jsc;
}
public HoodieTestSuiteWriter getHoodieTestSuiteWriter() {
return hoodieTestSuiteWriter;
}
public DeltaGenerator getDeltaGenerator() {
return deltaGenerator;
}
public JavaSparkContext getJsc() {
return jsc;
}
}

View File

@@ -0,0 +1,75 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag;
import java.util.ArrayList;
import java.util.List;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.dag.nodes.HiveQueryNode;
import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode;
import org.apache.hudi.integ.testsuite.dag.nodes.UpsertNode;
/**
* An example of how to generate a workflow dag programmatically. This is also used as the default workflow dag if
* none is provided.
*/
public class SimpleWorkflowDagGenerator implements WorkflowDagGenerator {
@Override
public WorkflowDag build() {
DagNode root = new InsertNode(DeltaConfig.Config.newBuilder()
.withNumRecordsToInsert(100)
.withNumInsertPartitions(1)
.withNumTimesToRepeat(2)
.withRecordSize(1000).build());
DagNode child1 = new InsertNode(DeltaConfig.Config.newBuilder()
.withNumRecordsToInsert(100)
.withNumInsertPartitions(1)
.withNumTimesToRepeat(2)
.withRecordSize(1000).build());
root.addChildNode(child1);
DagNode child1OfChild1 = new UpsertNode(DeltaConfig.Config.newBuilder()
.withNumRecordsToUpdate(100)
.withNumUpsertPartitions(2)
.withNumTimesToRepeat(1)
.withRecordSize(1000).build());
// Tests running 2 nodes in parallel
child1.addChildNode(child1OfChild1);
List<Pair<String, Integer>> queryAndResult = new ArrayList<>();
queryAndResult.add(Pair.of("select " + "count(*) from testdb1.table1 group "
+ "by rider having count(*) < 1", 0));
DagNode child2OfChild1 = new HiveQueryNode(DeltaConfig.Config.newBuilder()
.withHiveQueryAndResults(queryAndResult).withHiveLocal(true).build());
child1.addChildNode(child2OfChild1);
List<DagNode> rootNodes = new ArrayList<>();
rootNodes.add(root);
return new WorkflowDag(rootNodes);
}
}

View File

@@ -0,0 +1,39 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag;
import java.util.List;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
/**
* Workflow dag that encapsulates all execute nodes.
*/
public class WorkflowDag<O> {
private List<DagNode<O>> nodeList;
public WorkflowDag(List<DagNode<O>> nodeList) {
this.nodeList = nodeList;
}
public List<DagNode<O>> getNodeList() {
return nodeList;
}
}

View File

@@ -0,0 +1,31 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag;
/**
* A interface which represents a workflow dag generator.
*/
public interface WorkflowDagGenerator {
/**
* Builds a {@link WorkflowDag}.
*/
WorkflowDag build();
}

View File

@@ -0,0 +1,40 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag.nodes;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.spark.api.java.JavaRDD;
public class BulkInsertNode extends InsertNode {
public BulkInsertNode(Config config) {
super(config);
}
@Override
protected JavaRDD<WriteStatus> ingest(HoodieTestSuiteWriter hoodieTestSuiteWriter, Option<String> commitTime)
throws Exception {
log.info("Execute bulk ingest node {}", this.getName());
return hoodieTestSuiteWriter.bulkInsert(commitTime);
}
}

View File

@@ -0,0 +1,34 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag.nodes;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
public class CleanNode extends DagNode<Boolean> {
public CleanNode() {
}
@Override
public void execute(ExecutionContext executionContext) throws Exception {
log.info("Executing clean node {}", this.getName());
executionContext.getHoodieTestSuiteWriter().getWriteClient(this).clean();
}
}

View File

@@ -0,0 +1,47 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag.nodes;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
import org.apache.spark.api.java.JavaRDD;
public class CompactNode extends DagNode<JavaRDD<WriteStatus>> {
public CompactNode(Config config) {
this.config = config;
}
@Override
public void execute(ExecutionContext executionContext) throws Exception {
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(executionContext.getHoodieTestSuiteWriter().getConfiguration(),
executionContext.getHoodieTestSuiteWriter().getCfg().targetBasePath);
Option<HoodieInstant> lastInstant = metaClient.getActiveTimeline()
.getCommitsAndCompactionTimeline().filterPendingCompactionTimeline().lastInstant();
if (lastInstant.isPresent()) {
log.info("Compacting instant {}", lastInstant.get());
this.result = executionContext.getHoodieTestSuiteWriter().compact(Option.of(lastInstant.get().getTimestamp()));
}
}
}

View File

@@ -0,0 +1,125 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag.nodes;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Objects;
import java.util.UUID;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Represents a Node in the DAG of operations for a workflow.
*/
public abstract class DagNode<O> implements Comparable<DagNode<O>> {
protected static Logger log = LoggerFactory.getLogger(DagNode.class);
protected List<DagNode<O>> childNodes;
protected List<DagNode<O>> parentNodes;
protected O result;
protected Config config;
private boolean isCompleted;
public DagNode<O> addChildNode(DagNode childNode) {
childNode.getParentNodes().add(this);
getChildNodes().add(childNode);
return this;
}
public DagNode<O> addParentNode(DagNode parentNode) {
if (!this.getParentNodes().contains(parentNode)) {
this.getParentNodes().add(parentNode);
}
return this;
}
public O getResult() {
return result;
}
public List<DagNode<O>> getChildNodes() {
if (childNodes == null) {
childNodes = new LinkedList<>();
}
return childNodes;
}
public List<DagNode<O>> getParentNodes() {
if (parentNodes == null) {
this.parentNodes = new ArrayList<>();
}
return this.parentNodes;
}
public void setParentNodes(List<DagNode<O>> parentNodes) {
this.parentNodes = parentNodes;
}
public abstract void execute(ExecutionContext context) throws Exception;
public boolean isCompleted() {
return isCompleted;
}
public void setCompleted(boolean completed) {
isCompleted = completed;
}
public Config getConfig() {
return config;
}
public String getName() {
Object name = this.config.getOtherConfigs().get(Config.NODE_NAME);
if (name == null) {
String randomName = UUID.randomUUID().toString();
this.config.getOtherConfigs().put(Config.NODE_NAME, randomName);
return randomName;
}
return name.toString();
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
DagNode<?> dagNode = (DagNode<?>) o;
return getName() == dagNode.getName();
}
@Override
public int hashCode() {
return Objects.hash(getName());
}
@Override
public int compareTo(DagNode<O> thatNode) {
return this.hashCode() - thatNode.hashCode();
}
}

View File

@@ -0,0 +1,87 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag.nodes;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import org.apache.hudi.DataSourceUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
import org.apache.hudi.integ.testsuite.helpers.HiveServiceProvider;
public class HiveQueryNode extends DagNode<Boolean> {
private HiveServiceProvider hiveServiceProvider;
public HiveQueryNode(DeltaConfig.Config config) {
this.config = config;
this.hiveServiceProvider = new HiveServiceProvider(config);
}
@Override
public void execute(ExecutionContext executionContext) throws Exception {
log.info("Executing hive query node {}", this.getName());
this.hiveServiceProvider.startLocalHiveServiceIfNeeded(executionContext.getHoodieTestSuiteWriter().getConfiguration());
HiveSyncConfig hiveSyncConfig = DataSourceUtils
.buildHiveSyncConfig(executionContext.getHoodieTestSuiteWriter().getDeltaStreamerWrapper()
.getDeltaSyncService().getDeltaSync().getProps(),
executionContext.getHoodieTestSuiteWriter().getDeltaStreamerWrapper()
.getDeltaSyncService().getDeltaSync().getCfg().targetBasePath,
executionContext.getHoodieTestSuiteWriter().getDeltaStreamerWrapper()
.getDeltaSyncService().getDeltaSync().getCfg().baseFileFormat);
this.hiveServiceProvider.syncToLocalHiveIfNeeded(executionContext.getHoodieTestSuiteWriter());
Connection con = DriverManager.getConnection(hiveSyncConfig.jdbcUrl, hiveSyncConfig.hiveUser,
hiveSyncConfig.hivePass);
Statement stmt = con.createStatement();
stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat");
for (String hiveProperty : this.config.getHiveProperties()) {
executeStatement(hiveProperty, stmt);
}
for (Pair<String, Integer> queryAndResult : this.config.getHiveQueries()) {
log.info("Running {}", queryAndResult.getLeft());
ResultSet res = stmt.executeQuery(queryAndResult.getLeft());
if (!res.next()) {
log.info("res.next() was False - typically this means the query returned no rows.");
assert 0 == queryAndResult.getRight();
} else {
Integer result = res.getInt(1);
if (!queryAndResult.getRight().equals(result)) {
throw new AssertionError(
"QUERY: " + queryAndResult.getLeft()
+ " | EXPECTED RESULT = " + queryAndResult.getRight()
+ " | ACTUAL RESULT = " + result
);
}
}
log.info("Successfully validated query!");
}
this.hiveServiceProvider.stopLocalHiveServiceIfNeeded();
}
private void executeStatement(String query, Statement stmt) throws SQLException {
log.info("Executing statement {}", stmt.toString());
stmt.execute(query);
}
}

View File

@@ -0,0 +1,46 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag.nodes;
import org.apache.hudi.integ.testsuite.helpers.HiveServiceProvider;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
public class HiveSyncNode extends DagNode<Boolean> {
private HiveServiceProvider hiveServiceProvider;
public HiveSyncNode(Config config) {
this.config = config;
this.hiveServiceProvider = new HiveServiceProvider(config);
}
@Override
public void execute(ExecutionContext executionContext) throws Exception {
log.info("Executing hive sync node");
this.hiveServiceProvider.startLocalHiveServiceIfNeeded(executionContext.getHoodieTestSuiteWriter().getConfiguration());
this.hiveServiceProvider.syncToLocalHiveIfNeeded(executionContext.getHoodieTestSuiteWriter());
executionContext.getHoodieTestSuiteWriter().getDeltaStreamerWrapper().getDeltaSyncService().getDeltaSync().syncHive();
this.hiveServiceProvider.stopLocalHiveServiceIfNeeded();
}
public HiveServiceProvider getHiveServiceProvider() {
return hiveServiceProvider;
}
}

View File

@@ -0,0 +1,60 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag.nodes;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
import org.apache.hudi.integ.testsuite.generator.DeltaGenerator;
import org.apache.spark.api.java.JavaRDD;
public class InsertNode extends DagNode<JavaRDD<WriteStatus>> {
public InsertNode(Config config) {
this.config = config;
}
@Override
public void execute(ExecutionContext executionContext) throws Exception {
generate(executionContext.getDeltaGenerator());
log.info("Configs : {}", this.config);
if (!config.isDisableIngest()) {
log.info("Inserting input data {}", this.getName());
Option<String> commitTime = executionContext.getHoodieTestSuiteWriter().startCommit();
JavaRDD<WriteStatus> writeStatus = ingest(executionContext.getHoodieTestSuiteWriter(), commitTime);
executionContext.getHoodieTestSuiteWriter().commit(writeStatus, commitTime);
this.result = writeStatus;
}
}
protected void generate(DeltaGenerator deltaGenerator) throws Exception {
if (!config.isDisableGenerate()) {
log.info("Generating input data for node {}", this.getName());
deltaGenerator.writeRecords(deltaGenerator.generateInserts(config)).count();
}
}
protected JavaRDD<WriteStatus> ingest(HoodieTestSuiteWriter hoodieTestSuiteWriter,
Option<String> commitTime) throws Exception {
return hoodieTestSuiteWriter.insert(commitTime);
}
}

View File

@@ -0,0 +1,48 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag.nodes;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
public class RollbackNode extends DagNode<Option<HoodieInstant>> {
public RollbackNode(Config config) {
this.config = config;
}
@Override
public void execute(ExecutionContext executionContext) throws Exception {
log.info("Executing rollback node {}", this.getName());
// Can only be done with an instantiation of a new WriteClient hence cannot be done during DeltaStreamer
// testing for now
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(executionContext.getHoodieTestSuiteWriter().getConfiguration(),
executionContext.getHoodieTestSuiteWriter().getCfg().targetBasePath);
Option<HoodieInstant> lastInstant = metaClient.getActiveTimeline().getCommitsTimeline().lastInstant();
if (lastInstant.isPresent()) {
log.info("Rolling back last instant {}", lastInstant.get());
executionContext.getHoodieTestSuiteWriter().getWriteClient(this).rollback(lastInstant.get().getTimestamp());
this.result = lastInstant;
}
}
}

View File

@@ -0,0 +1,56 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag.nodes;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
public class ScheduleCompactNode extends DagNode<Option<String>> {
public ScheduleCompactNode(Config config) {
this.config = config;
}
@Override
public void execute(ExecutionContext executionContext) throws Exception {
log.info("Executing schedule compact node {}", this.getName());
// Can only be done with an instantiation of a new WriteClient hence cannot be done during DeltaStreamer
// testing for now
// Find the last commit and extra the extra metadata to be passed to the schedule compaction. This is
// done to ensure the CHECKPOINT is correctly passed from commit to commit
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(executionContext.getHoodieTestSuiteWriter().getConfiguration(),
executionContext.getHoodieTestSuiteWriter().getCfg().targetBasePath);
Option<HoodieInstant> lastInstant = metaClient.getActiveTimeline().getCommitsTimeline().lastInstant();
if (lastInstant.isPresent()) {
HoodieCommitMetadata metadata = org.apache.hudi.common.model.HoodieCommitMetadata.fromBytes(metaClient
.getActiveTimeline().getInstantDetails(lastInstant.get()).get(), HoodieCommitMetadata.class);
Option<String> scheduledInstant = executionContext.getHoodieTestSuiteWriter().scheduleCompaction(Option.of(metadata
.getExtraMetadata()));
if (scheduledInstant.isPresent()) {
log.info("Scheduling compaction instant {}", scheduledInstant.get());
}
this.result = scheduledInstant;
}
}
}

View File

@@ -0,0 +1,61 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag.nodes;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.integ.testsuite.helpers.HiveServiceProvider;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
public class SparkSQLQueryNode extends DagNode<Boolean> {
HiveServiceProvider hiveServiceProvider;
public SparkSQLQueryNode(Config config) {
this.config = config;
this.hiveServiceProvider = new HiveServiceProvider(config);
}
@Override
public void execute(ExecutionContext executionContext) throws Exception {
log.info("Executing spark sql query node");
this.hiveServiceProvider.startLocalHiveServiceIfNeeded(executionContext.getHoodieTestSuiteWriter().getConfiguration());
this.hiveServiceProvider.syncToLocalHiveIfNeeded(executionContext.getHoodieTestSuiteWriter());
SparkSession session = SparkSession.builder().sparkContext(executionContext.getJsc().sc()).getOrCreate();
for (String hiveProperty : this.config.getHiveProperties()) {
session.sql(hiveProperty).count();
}
for (Pair<String, Integer> queryAndResult : this.config.getHiveQueries()) {
log.info("Running {}", queryAndResult.getLeft());
Dataset<Row> res = session.sql(queryAndResult.getLeft());
if (res.count() == 0) {
assert 0 == queryAndResult.getRight();
} else {
assert ((Row[]) res.collect())[0].getInt(0) == queryAndResult.getRight();
}
log.info("Successfully validated query!");
}
this.hiveServiceProvider.stopLocalHiveServiceIfNeeded();
this.result = true;
}
}

View File

@@ -0,0 +1,52 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag.nodes;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.generator.DeltaGenerator;
import org.apache.spark.api.java.JavaRDD;
public class UpsertNode extends InsertNode {
public UpsertNode(Config config) {
super(config);
}
@Override
protected void generate(DeltaGenerator deltaGenerator) throws Exception {
if (!config.isDisableGenerate()) {
log.info("Generating input data {}", this.getName());
deltaGenerator.writeRecords(deltaGenerator.generateUpdates(config)).count();
}
}
@Override
protected JavaRDD<WriteStatus> ingest(HoodieTestSuiteWriter hoodieTestSuiteWriter, Option<String> commitTime)
throws Exception {
if (!config.isDisableIngest()) {
log.info("Upserting input data {}", this.getName());
this.result = hoodieTestSuiteWriter.upsert(commitTime);
}
return this.result;
}
}

View File

@@ -0,0 +1,48 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag.nodes;
import java.util.List;
import java.util.function.Function;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
public class ValidateNode<R> extends DagNode {
protected Function<List<DagNode>, R> function;
public ValidateNode(Config config, Function<List<DagNode>, R> function) {
this.function = function;
this.config = config;
}
@Override
public void execute(ExecutionContext executionContext) {
if (this.getParentNodes().size() > 0 && (Boolean) this.config.getOtherConfigs().getOrDefault("WAIT_FOR_PARENTS",
true)) {
for (DagNode node : (List<DagNode>) this.getParentNodes()) {
if (!node.isCompleted()) {
throw new RuntimeException("cannot validate before parent nodes are complete");
}
}
}
this.result = this.function.apply((List<DagNode>) this.getParentNodes());
}
}

View File

@@ -0,0 +1,100 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag.scheduler;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.PriorityQueue;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
import org.apache.hudi.integ.testsuite.dag.WorkflowDag;
import org.apache.hudi.integ.testsuite.generator.DeltaGenerator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class DagScheduler {
private static Logger log = LoggerFactory.getLogger(DagScheduler.class);
private WorkflowDag workflowDag;
private ExecutionContext executionContext;
public DagScheduler(WorkflowDag workflowDag, HoodieTestSuiteWriter hoodieTestSuiteWriter, DeltaGenerator deltaGenerator) {
this.workflowDag = workflowDag;
this.executionContext = new ExecutionContext(null, hoodieTestSuiteWriter, deltaGenerator);
}
public void schedule() throws Exception {
ExecutorService service = Executors.newFixedThreadPool(2);
try {
execute(service, workflowDag.getNodeList());
service.shutdown();
} finally {
if (!service.isShutdown()) {
log.info("Forcing shutdown of executor service, this might kill running tasks");
service.shutdownNow();
}
}
}
private void execute(ExecutorService service, List<DagNode> nodes) throws Exception {
// Nodes at the same level are executed in parallel
Queue<DagNode> queue = new PriorityQueue<>(nodes);
log.info("Running workloads");
do {
List<Future> futures = new ArrayList<>();
Set<DagNode> childNodes = new HashSet<>();
while (queue.size() > 0) {
DagNode nodeToExecute = queue.poll();
futures.add(service.submit(() -> executeNode(nodeToExecute)));
if (nodeToExecute.getChildNodes().size() > 0) {
childNodes.addAll(nodeToExecute.getChildNodes());
}
}
queue.addAll(childNodes);
childNodes.clear();
for (Future future : futures) {
future.get(1, TimeUnit.HOURS);
}
} while (queue.size() > 0);
log.info("Finished workloads");
}
private void executeNode(DagNode node) {
if (node.isCompleted()) {
throw new RuntimeException("DagNode already completed! Cannot re-execute");
}
try {
node.execute(executionContext);
node.setCompleted(true);
log.info("Finished executing {}", node.getName());
} catch (Exception e) {
log.error("Exception executing node");
throw new HoodieException(e);
}
}
}

View File

@@ -0,0 +1,237 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.generator;
import java.io.IOException;
import java.io.Serializable;
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.stream.StreamSupport;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.converter.Converter;
import org.apache.hudi.integ.testsuite.converter.UpdateConverter;
import org.apache.hudi.integ.testsuite.reader.DFSAvroDeltaInputReader;
import org.apache.hudi.integ.testsuite.reader.DFSHoodieDatasetInputReader;
import org.apache.hudi.integ.testsuite.reader.DeltaInputReader;
import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode;
import org.apache.hudi.integ.testsuite.writer.DeltaWriteStats;
import org.apache.hudi.integ.testsuite.writer.DeltaWriterAdapter;
import org.apache.hudi.integ.testsuite.writer.DeltaWriterFactory;
import org.apache.hudi.keygen.ComplexKeyGenerator;
import org.apache.hudi.keygen.KeyGenerator;
import org.apache.hudi.keygen.SimpleKeyGenerator;
import org.apache.hudi.integ.testsuite.configuration.DFSDeltaConfig;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.storage.StorageLevel;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;
/**
* The delta generator generates all types of workloads (insert, update) for the given configs.
*/
public class DeltaGenerator implements Serializable {
private static Logger log = LoggerFactory.getLogger(DFSHoodieDatasetInputReader.class);
private DeltaConfig deltaOutputConfig;
private transient JavaSparkContext jsc;
private transient SparkSession sparkSession;
private String schemaStr;
private List<String> recordRowKeyFieldNames;
private List<String> partitionPathFieldNames;
private int batchId;
public DeltaGenerator(DeltaConfig deltaOutputConfig, JavaSparkContext jsc, SparkSession sparkSession,
String schemaStr,
KeyGenerator keyGenerator) {
this.deltaOutputConfig = deltaOutputConfig;
this.jsc = jsc;
this.sparkSession = sparkSession;
this.schemaStr = schemaStr;
this.recordRowKeyFieldNames = keyGenerator instanceof ComplexKeyGenerator ? ((ComplexKeyGenerator) keyGenerator)
.getRecordKeyFields() : Arrays.asList(((SimpleKeyGenerator) keyGenerator).getRecordKeyField());
this.partitionPathFieldNames = keyGenerator instanceof ComplexKeyGenerator ? ((ComplexKeyGenerator) keyGenerator)
.getPartitionPathFields() : Arrays.asList(((SimpleKeyGenerator) keyGenerator).getPartitionPathField());
}
public JavaRDD<DeltaWriteStats> writeRecords(JavaRDD<GenericRecord> records) {
// The following creates a new anonymous function for iterator and hence results in serialization issues
JavaRDD<DeltaWriteStats> ws = records.mapPartitions(itr -> {
try {
DeltaWriterAdapter<GenericRecord> deltaWriterAdapter = DeltaWriterFactory
.getDeltaWriterAdapter(deltaOutputConfig, batchId);
return Collections.singletonList(deltaWriterAdapter.write(itr)).iterator();
} catch (IOException io) {
throw new UncheckedIOException(io);
}
}).flatMap(List::iterator);
batchId++;
return ws;
}
public JavaRDD<GenericRecord> generateInserts(Config operation) {
long recordsPerPartition = operation.getNumRecordsInsert();
int minPayloadSize = operation.getRecordSize();
JavaRDD<GenericRecord> inputBatch = jsc.parallelize(Collections.EMPTY_LIST)
.repartition(operation.getNumInsertPartitions()).mapPartitions(p -> {
return new LazyRecordGeneratorIterator(new FlexibleSchemaRecordGenerationIterator(recordsPerPartition,
minPayloadSize, schemaStr, partitionPathFieldNames));
});
return inputBatch;
}
public JavaRDD<GenericRecord> generateUpdates(Config config) throws IOException {
if (deltaOutputConfig.getDeltaOutputMode() == DeltaOutputMode.DFS) {
JavaRDD<GenericRecord> inserts = null;
if (config.getNumRecordsInsert() > 0) {
inserts = generateInserts(config);
}
DeltaInputReader deltaInputReader = null;
JavaRDD<GenericRecord> adjustedRDD = null;
if (config.getNumUpsertPartitions() < 1) {
// randomly generate updates for a given number of records without regard to partitions and files
deltaInputReader = new DFSAvroDeltaInputReader(sparkSession, schemaStr,
((DFSDeltaConfig) deltaOutputConfig).getDeltaBasePath(), Option.empty(), Option.empty());
adjustedRDD = deltaInputReader.read(config.getNumRecordsUpsert());
adjustedRDD = adjustRDDToGenerateExactNumUpdates(adjustedRDD, jsc, config.getNumRecordsUpsert());
} else {
deltaInputReader =
new DFSHoodieDatasetInputReader(jsc, ((DFSDeltaConfig) deltaOutputConfig).getDatasetOutputPath(),
schemaStr);
if (config.getFractionUpsertPerFile() > 0) {
adjustedRDD = deltaInputReader.read(config.getNumUpsertPartitions(), config.getNumUpsertFiles(),
config.getFractionUpsertPerFile());
} else {
adjustedRDD = deltaInputReader.read(config.getNumUpsertPartitions(), config.getNumUpsertFiles(), config
.getNumRecordsUpsert());
}
}
log.info("Repartitioning records");
// persist this since we will make multiple passes over this
adjustedRDD = adjustedRDD.repartition(jsc.defaultParallelism());
log.info("Repartitioning records done");
Converter converter = new UpdateConverter(schemaStr, config.getRecordSize(),
partitionPathFieldNames, recordRowKeyFieldNames);
JavaRDD<GenericRecord> updates = converter.convert(adjustedRDD);
log.info("Records converted");
updates.persist(StorageLevel.DISK_ONLY());
return inserts != null ? inserts.union(updates) : updates;
// TODO : Generate updates for only N partitions.
} else {
throw new IllegalArgumentException("Other formats are not supported at the moment");
}
}
public Map<Integer, Long> getPartitionToCountMap(JavaRDD<GenericRecord> records) {
// Requires us to keep the partitioner the same
return records.mapPartitionsWithIndex((index, itr) -> {
Iterable<GenericRecord> newIterable = () -> itr;
// parallelize counting for speed
long count = StreamSupport.stream(newIterable.spliterator(), true).count();
return Arrays.asList(new Tuple2<>(index, count)).iterator();
}, true).mapToPair(i -> i).collectAsMap();
}
public Map<Integer, Long> getAdjustedPartitionsCount(Map<Integer, Long> partitionCountMap, long
recordsToRemove) {
long remainingRecordsToRemove = recordsToRemove;
Iterator<Map.Entry<Integer, Long>> iterator = partitionCountMap.entrySet().iterator();
Map<Integer, Long> adjustedPartitionCountMap = new HashMap<>();
while (iterator.hasNext()) {
Map.Entry<Integer, Long> entry = iterator.next();
if (entry.getValue() < remainingRecordsToRemove) {
remainingRecordsToRemove -= entry.getValue();
adjustedPartitionCountMap.put(entry.getKey(), 0L);
} else {
long newValue = entry.getValue() - remainingRecordsToRemove;
remainingRecordsToRemove = 0;
adjustedPartitionCountMap.put(entry.getKey(), newValue);
}
if (remainingRecordsToRemove == 0) {
break;
}
}
return adjustedPartitionCountMap;
}
public JavaRDD<GenericRecord> adjustRDDToGenerateExactNumUpdates(JavaRDD<GenericRecord> updates, JavaSparkContext
jsc, long totalRecordsRequired) {
Map<Integer, Long> actualPartitionCountMap = getPartitionToCountMap(updates);
long totalRecordsGenerated = actualPartitionCountMap.values().stream().mapToLong(Long::longValue).sum();
if (isSafeToTake(totalRecordsRequired, totalRecordsGenerated)) {
// Generate totalRecordsRequired - totalRecordsGenerated new records and union the RDD's
// NOTE : This performs poorly when totalRecordsRequired >> totalRecordsGenerated. Hence, always
// ensure that enough inserts are created before hand (this needs to be noted during the WorkflowDag creation)
long sizeOfUpdateRDD = totalRecordsGenerated;
while (totalRecordsRequired != sizeOfUpdateRDD) {
long recordsToTake = (totalRecordsRequired - sizeOfUpdateRDD) > sizeOfUpdateRDD
? sizeOfUpdateRDD : (totalRecordsRequired - sizeOfUpdateRDD);
if ((totalRecordsRequired - sizeOfUpdateRDD) > recordsToTake && recordsToTake <= sizeOfUpdateRDD) {
updates = updates.union(updates);
sizeOfUpdateRDD *= 2;
} else {
List<GenericRecord> remainingUpdates = updates.take((int) (recordsToTake));
updates = updates.union(jsc.parallelize(remainingUpdates));
sizeOfUpdateRDD = sizeOfUpdateRDD + recordsToTake;
}
}
return updates;
} else if (totalRecordsRequired < totalRecordsGenerated) {
final Map<Integer, Long> adjustedPartitionCountMap = getAdjustedPartitionsCount(actualPartitionCountMap,
totalRecordsGenerated - totalRecordsRequired);
// limit counts across partitions to meet the exact number of updates required
JavaRDD<GenericRecord> trimmedRecords = updates.mapPartitionsWithIndex((index, itr) -> {
int counter = 1;
List<GenericRecord> entriesToKeep = new ArrayList<>();
if (!adjustedPartitionCountMap.containsKey(index)) {
return itr;
} else {
long recordsToKeepForThisPartition = adjustedPartitionCountMap.get(index);
while (counter <= recordsToKeepForThisPartition && itr.hasNext()) {
entriesToKeep.add(itr.next());
counter++;
}
return entriesToKeep.iterator();
}
}, true);
return trimmedRecords;
}
return updates;
}
private boolean isSafeToTake(long totalRecords, long totalRecordsGenerated) {
// TODO : Ensure that the difference between totalRecords and totalRecordsGenerated is not too big, if yes,
// then there are fewer number of records on disk, hence we need to find another way to generate updates when
// requiredUpdates >> insertedRecords
return totalRecords > totalRecordsGenerated;
}
}

View File

@@ -0,0 +1,70 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.generator;
import java.util.Iterator;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
/**
* A GenericRecordGeneratorIterator for the custom schema of the workload. Implements {@link Iterator} to allow for
* iteration semantics.
*/
public class FlexibleSchemaRecordGenerationIterator implements Iterator<GenericRecord> {
// Stores how many records to generate as part of this iterator. Ideally, one iterator is started per spark
// partition.
private long counter;
// Use the full payload generator as default
private GenericRecordFullPayloadGenerator generator;
// Store last record for the partition path of the first payload to be used for all subsequent generated payloads
private GenericRecord lastRecord;
// Partition path field name
private List<String> partitionPathFieldNames;
public FlexibleSchemaRecordGenerationIterator(long maxEntriesToProduce, String schema) {
this(maxEntriesToProduce, GenericRecordFullPayloadGenerator.DEFAULT_PAYLOAD_SIZE, schema, null);
}
public FlexibleSchemaRecordGenerationIterator(long maxEntriesToProduce, int minPayloadSize, String schemaStr,
List<String> partitionPathFieldNames) {
this.counter = maxEntriesToProduce;
this.partitionPathFieldNames = partitionPathFieldNames;
Schema schema = new Schema.Parser().parse(schemaStr);
this.generator = new GenericRecordFullPayloadGenerator(schema, minPayloadSize);
}
@Override
public boolean hasNext() {
return this.counter > 0;
}
@Override
public GenericRecord next() {
this.counter--;
if (lastRecord == null) {
GenericRecord record = this.generator.getNewPayload();
lastRecord = record;
return record;
} else {
return this.generator.randomize(lastRecord, this.partitionPathFieldNames);
}
}
}

View File

@@ -0,0 +1,285 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.generator;
import java.io.Serializable;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.UUID;
import org.apache.avro.LogicalType;
import org.apache.avro.LogicalTypes;
import org.apache.avro.Schema;
import org.apache.avro.Schema.Type;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericData.Fixed;
import org.apache.avro.generic.GenericFixed;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.util.collection.Pair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* This is a GenericRecord payload generator that generates full generic records {@link GenericRecord}.
* Every field of a generic record created using this generator contains a random value.
*/
public class GenericRecordFullPayloadGenerator implements Serializable {
public static final int DEFAULT_PAYLOAD_SIZE = 1024 * 10; // 10 KB
private static Logger log = LoggerFactory.getLogger(GenericRecordFullPayloadGenerator.class);
protected final Random random = new Random();
// The source schema used to generate a payload
private final transient Schema baseSchema;
// Used to validate a generic record
private final transient GenericData genericData = new GenericData();
// Number of more bytes to add based on the estimated full record payload size and min payload size
private int numberOfBytesToAdd;
// If more elements should be packed to meet the minPayloadSize
private boolean shouldAddMore;
// How many complex fields have we visited that can help us pack more entries and increase the size of the record
private int numberOfComplexFields;
// The size of a full record where every field of a generic record created contains 1 random value
private int estimatedFullPayloadSize;
// LogicalTypes in Avro 1.8.2
private static final String DECIMAL = "decimal";
private static final String UUID_NAME = "uuid";
private static final String DATE = "date";
private static final String TIME_MILLIS = "time-millis";
private static final String TIME_MICROS = "time-micros";
private static final String TIMESTAMP_MILLIS = "timestamp-millis";
private static final String TIMESTAMP_MICROS = "timestamp-micros";
public GenericRecordFullPayloadGenerator(Schema schema) {
this(schema, DEFAULT_PAYLOAD_SIZE);
}
public GenericRecordFullPayloadGenerator(Schema schema, int minPayloadSize) {
Pair<Integer, Integer> sizeInfo = new GenericRecordFullPayloadSizeEstimator(schema)
.typeEstimateAndNumComplexFields();
this.estimatedFullPayloadSize = sizeInfo.getLeft();
this.numberOfComplexFields = sizeInfo.getRight();
this.baseSchema = schema;
this.shouldAddMore = estimatedFullPayloadSize < minPayloadSize;
if (this.shouldAddMore) {
this.numberOfBytesToAdd = minPayloadSize - estimatedFullPayloadSize;
if (numberOfComplexFields < 1) {
log.warn("The schema does not have any collections/complex fields. Cannot achieve minPayloadSize : {}",
minPayloadSize);
}
}
}
protected static boolean isPrimitive(Schema localSchema) {
if (localSchema.getType() != Type.ARRAY
&& localSchema.getType() != Type.MAP
&& localSchema.getType() != Type.RECORD
&& localSchema.getType() != Type.UNION) {
return true;
} else {
return false;
}
}
public GenericRecord getNewPayload() {
return convert(baseSchema);
}
public GenericRecord getUpdatePayload(GenericRecord record, List<String> blacklistFields) {
return randomize(record, blacklistFields);
}
protected GenericRecord convert(Schema schema) {
GenericRecord result = new GenericData.Record(schema);
for (Schema.Field f : schema.getFields()) {
result.put(f.name(), typeConvert(f.schema()));
}
return result;
}
protected GenericRecord convertPartial(Schema schema) {
GenericRecord result = new GenericData.Record(schema);
for (Schema.Field f : schema.getFields()) {
boolean setNull = random.nextBoolean();
if (!setNull) {
result.put(f.name(), typeConvert(f.schema()));
} else {
result.put(f.name(), null);
}
}
// TODO : pack remaining bytes into a complex field
return result;
}
protected GenericRecord randomize(GenericRecord record, List<String> blacklistFields) {
for (Schema.Field f : record.getSchema().getFields()) {
if (blacklistFields == null || !blacklistFields.contains(f.name())) {
record.put(f.name(), typeConvert(f.schema()));
}
}
return record;
}
private Object typeConvert(Schema schema) {
Schema localSchema = schema;
if (isOption(schema)) {
localSchema = getNonNull(schema);
}
switch (localSchema.getType()) {
case BOOLEAN:
return random.nextBoolean();
case DOUBLE:
return random.nextDouble();
case FLOAT:
return random.nextFloat();
case INT:
return random.nextInt();
case LONG:
return random.nextLong();
case STRING:
return UUID.randomUUID().toString();
case ENUM:
List<String> enumSymbols = localSchema.getEnumSymbols();
return new GenericData.EnumSymbol(localSchema, enumSymbols.get(random.nextInt(enumSymbols.size() - 1)));
case RECORD:
return convert(localSchema);
case ARRAY:
Schema elementSchema = localSchema.getElementType();
List listRes = new ArrayList();
if (isPrimitive(elementSchema) && this.shouldAddMore) {
int numEntriesToAdd = numEntriesToAdd(elementSchema);
while (numEntriesToAdd > 0) {
listRes.add(typeConvert(elementSchema));
numEntriesToAdd--;
}
} else {
listRes.add(typeConvert(elementSchema));
}
return listRes;
case MAP:
Schema valueSchema = localSchema.getValueType();
Map<String, Object> mapRes = new HashMap<String, Object>();
if (isPrimitive(valueSchema) && this.shouldAddMore) {
int numEntriesToAdd = numEntriesToAdd(valueSchema);
while (numEntriesToAdd > 0) {
mapRes.put(UUID.randomUUID().toString(), typeConvert(valueSchema));
numEntriesToAdd--;
}
} else {
mapRes.put(UUID.randomUUID().toString(), typeConvert(valueSchema));
}
return mapRes;
case BYTES:
return ByteBuffer.wrap(UUID.randomUUID().toString().getBytes(Charset.defaultCharset()));
case FIXED:
return generateFixedType(localSchema);
default:
throw new IllegalArgumentException(
"Cannot handle type: " + localSchema.getType());
}
}
private Object generateFixedType(Schema localSchema) {
// TODO: Need to implement valid data generation for fixed type
GenericFixed genericFixed = new GenericData.Fixed(localSchema);
switch (localSchema.getLogicalType().getName()) {
case UUID_NAME:
((Fixed) genericFixed).bytes(UUID.randomUUID().toString().getBytes());
return genericFixed;
case DECIMAL:
return genericFixed;
case DATE:
return genericFixed;
case TIME_MILLIS:
return genericFixed;
default:
throw new IllegalArgumentException(
"Cannot handle type: " + localSchema.getLogicalType());
}
}
public boolean validate(GenericRecord record) {
return genericData.validate(baseSchema, record);
}
protected boolean isOption(Schema schema) {
return schema.getType().equals(Schema.Type.UNION)
&& schema.getTypes().size() == 2
&& (schema.getTypes().get(0).getType().equals(Schema.Type.NULL)
|| schema.getTypes().get(1).getType().equals(Schema.Type.NULL));
}
protected Schema getNonNull(Schema schema) {
List<Schema> types = schema.getTypes();
return types.get(0).getType().equals(Schema.Type.NULL) ? types.get(1) : types.get(0);
}
public int getEstimatedFullPayloadSize() {
return estimatedFullPayloadSize;
}
private int getSize(Schema elementSchema) {
switch (elementSchema.getType()) {
case BOOLEAN:
return 1;
case DOUBLE:
return Double.BYTES;
case FLOAT:
return Float.BYTES;
case INT:
return Integer.BYTES;
case LONG:
return Long.BYTES;
case STRING:
return UUID.randomUUID().toString().length();
case ENUM:
return 1;
case BYTES:
return UUID.randomUUID().toString().length();
case FIXED:
return elementSchema.getFixedSize();
default:
throw new RuntimeException("Unknown type " + elementSchema.getType());
}
}
private int numEntriesToAdd(Schema elementSchema) {
// Find the size of the primitive data type in bytes
int primitiveDataTypeSize = getSize(elementSchema);
int numEntriesToAdd = numberOfBytesToAdd / primitiveDataTypeSize;
// If more than 10 entries are being added for this same complex field and there are still more complex fields to
// be visited in the schema, reduce the number of entries to add by a factor of 10 to allow for other complex
// fields to pack some entries
if (numEntriesToAdd % 10 > 0 && this.numberOfComplexFields > 1) {
numEntriesToAdd = numEntriesToAdd / 10;
numberOfBytesToAdd -= numEntriesToAdd * primitiveDataTypeSize;
this.shouldAddMore = true;
} else {
this.numberOfBytesToAdd = 0;
this.shouldAddMore = false;
}
this.numberOfComplexFields -= 1;
return numEntriesToAdd;
}
}

View File

@@ -0,0 +1,121 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.generator;
import java.io.Serializable;
import java.util.List;
import java.util.UUID;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.util.collection.Pair;
/**
* This is a GenericRecord payload estimator estimates the size of a full generic record {@link GenericRecord}.
* A full record is defined as "Every field of a generic record created contains 1 random value"
*/
public class GenericRecordFullPayloadSizeEstimator implements Serializable {
private final transient Schema baseSchema;
// This variable is used to track the number of complex/collection fields with primitive data types at their leaf.
// This is used to figure out how many entries can be packed in such a collection field to meet the min payload
// size requested
private final transient AtomicInteger counter = new AtomicInteger(0);
public GenericRecordFullPayloadSizeEstimator(Schema schema) {
this.baseSchema = schema;
}
public Pair<Integer, Integer> typeEstimateAndNumComplexFields() {
int size = estimate(baseSchema);
return Pair.of(size, counter.get());
}
/**
* This method estimates the size of the payload if all entries of this payload were populated with one value.
* For eg. A primitive data type such as String will be populated with {@link UUID} so the length if 36 bytes
* whereas a complex data type such as an Array of type Int, will be populated with exactly 1 Integer value.
*/
protected int estimate(Schema schema) {
long size = 0;
for (Schema.Field f : schema.getFields()) {
size += typeEstimate(f.schema());
}
return (int) size;
}
private long typeEstimate(Schema schema) {
Schema localSchema = schema;
if (isOption(schema)) {
localSchema = getNonNull(schema);
}
switch (localSchema.getType()) {
case BOOLEAN:
return 1;
case DOUBLE:
return 8;
case FLOAT:
return 4;
case INT:
return 4;
case LONG:
return 8;
case STRING:
return UUID.randomUUID().toString().length();
case ENUM:
return 1;
case RECORD:
return estimate(localSchema);
case ARRAY:
if (GenericRecordFullPayloadGenerator.isPrimitive(localSchema.getElementType())) {
counter.addAndGet(1);
}
Schema elementSchema = localSchema.getElementType();
return typeEstimate(elementSchema);
case MAP:
if (GenericRecordFullPayloadGenerator.isPrimitive(localSchema.getValueType())) {
counter.addAndGet(1);
}
Schema valueSchema = localSchema.getValueType();
return UUID.randomUUID().toString().length() + typeEstimate(valueSchema);
case BYTES:
return UUID.randomUUID().toString().length();
case FIXED:
return localSchema.getFixedSize();
default:
throw new IllegalArgumentException(
"Cannot handle type: " + localSchema.getType());
}
}
protected boolean isOption(Schema schema) {
return schema.getType().equals(Schema.Type.UNION)
&& schema.getTypes().size() == 2
&& (schema.getTypes().get(0).getType().equals(Schema.Type.NULL)
|| schema.getTypes().get(1).getType().equals(Schema.Type.NULL));
}
protected Schema getNonNull(Schema schema) {
List<Schema> types = schema.getTypes();
return types.get(0).getType().equals(Schema.Type.NULL) ? types.get(1) : types.get(0);
}
}

View File

@@ -0,0 +1,82 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.generator;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
/**
* This is a GenericRecord payload generator that generates partial generic records {@link GenericRecord}. A partial
* records is one that has some fields of the schema NULL or NOT PRESENT. This payload enables us to simulate
* creation of partial records which are possible in many cases, especially for database change logs.
*/
public class GenericRecordPartialPayloadGenerator extends GenericRecordFullPayloadGenerator {
public GenericRecordPartialPayloadGenerator(Schema schema) {
super(schema);
}
public GenericRecordPartialPayloadGenerator(Schema schema, int minPayloadSize) {
super(schema, minPayloadSize);
}
@Override
protected GenericRecord convert(Schema schema) {
GenericRecord record = super.convertPartial(schema);
return record;
}
private void setNull(GenericRecord record) {
for (Schema.Field field : record.getSchema().getFields()) {
// A random boolean decides whether this field of the generic record should be present or absent.
// Using this we can set only a handful of fields in the record and generate partial records
boolean setNull = random.nextBoolean();
if (setNull) { // TODO : DO NOT SET THE RECORD KEY FIELDS TO NULL
record.put(field.name(), null);
} else {
if (record.get(field.name()) instanceof GenericData.Record) {
setNull((GenericData.Record) record.get(field.name()));
}
}
}
}
@Override
public boolean validate(GenericRecord record) {
return validate((Object) record);
}
// Atleast 1 entry should be null
private boolean validate(Object object) {
if (object == null) {
return true;
} else if (object instanceof GenericRecord) {
for (Schema.Field field : ((GenericRecord) object).getSchema().getFields()) {
boolean ret = validate(((GenericRecord) object).get(field.name()));
if (ret) {
return ret;
}
}
}
return false;
}
}

View File

@@ -0,0 +1,48 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.generator;
import java.util.Iterator;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.client.utils.LazyIterableIterator;
/**
* A lazy record generator to generate {@link GenericRecord}s lazily and not hold a list of records in memory.
*/
public class LazyRecordGeneratorIterator extends
LazyIterableIterator<GenericRecord, GenericRecord> {
public LazyRecordGeneratorIterator(Iterator<GenericRecord> inputItr) {
super(inputItr);
}
@Override
protected void start() {
}
@Override
protected GenericRecord computeNext() {
return inputItr.next();
}
@Override
protected void end() {
}
}

View File

@@ -0,0 +1,56 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.generator;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
public class UpdateGeneratorIterator implements Iterator<GenericRecord> {
// Use the full payload generator as default
private GenericRecordFullPayloadGenerator generator;
private List<String> blackListedFields;
// iterator
private Iterator<GenericRecord> itr;
public UpdateGeneratorIterator(Iterator<GenericRecord> itr, String schemaStr, List<String> partitionPathFieldNames,
List<String> recordKeyFieldNames, int minPayloadSize) {
this.itr = itr;
this.blackListedFields = new ArrayList<>();
this.blackListedFields.addAll(partitionPathFieldNames);
this.blackListedFields.addAll(recordKeyFieldNames);
Schema schema = new Schema.Parser().parse(schemaStr);
this.generator = new GenericRecordFullPayloadGenerator(schema, minPayloadSize);
}
@Override
public boolean hasNext() {
return itr.hasNext();
}
@Override
public GenericRecord next() {
GenericRecord newRecord = itr.next();
return this.generator.randomize(newRecord, this.blackListedFields);
}
}

View File

@@ -0,0 +1,92 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.helpers;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.ImmutablePair;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.utilities.sources.helpers.DFSPathSelector;
/**
* A custom dfs path selector used only for the hudi test suite. To be used only if workload is not run inline.
*/
public class DFSTestSuitePathSelector extends DFSPathSelector {
public DFSTestSuitePathSelector(TypedProperties props, Configuration hadoopConf) {
super(props, hadoopConf);
}
@Override
public Pair<Option<String>, String> getNextFilePathsAndMaxModificationTime(
Option<String> lastCheckpointStr, long sourceLimit) {
Integer lastBatchId;
Integer nextBatchId;
try {
if (lastCheckpointStr.isPresent()) {
lastBatchId = Integer.parseInt(lastCheckpointStr.get());
nextBatchId = lastBatchId + 1;
} else {
lastBatchId = -1;
nextBatchId = 0;
}
// obtain all eligible files for the batch
List<FileStatus> eligibleFiles = new ArrayList<>();
FileStatus[] fileStatuses = fs.globStatus(
new Path(props.getString(Config.ROOT_INPUT_PATH_PROP), "*"));
for (FileStatus fileStatus : fileStatuses) {
if (!fileStatus.isDirectory() || IGNORE_FILEPREFIX_LIST.stream()
.anyMatch(pfx -> fileStatus.getPath().getName().startsWith(pfx))) {
continue;
} else if (fileStatus.getPath().getName().compareTo(lastBatchId.toString()) > 0 && fileStatus.getPath()
.getName().compareTo(nextBatchId.toString()) <= 0) {
RemoteIterator<LocatedFileStatus> files = fs.listFiles(fileStatus.getPath(), true);
while (files.hasNext()) {
eligibleFiles.add(files.next());
}
}
}
// no data to readAvro
if (eligibleFiles.size() == 0) {
return new ImmutablePair<>(Option.empty(),
lastCheckpointStr.orElseGet(() -> String.valueOf(Long.MIN_VALUE)));
}
// readAvro the files out.
String pathStr = eligibleFiles.stream().map(f -> f.getPath().toString())
.collect(Collectors.joining(","));
return new ImmutablePair<>(Option.ofNullable(pathStr), String.valueOf(nextBatchId));
} catch (IOException ioe) {
throw new HoodieIOException(
"Unable to readAvro from source from checkpoint: " + lastCheckpointStr, ioe);
}
}
}

View File

@@ -0,0 +1,74 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.helpers;
import java.io.IOException;
import java.net.BindException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hive.service.server.HiveServer2;
import org.apache.hudi.hive.testutils.HiveTestService;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.thrift.transport.TTransportException;
/**
* Hive Service provider.
*/
public class HiveServiceProvider {
private HiveTestService hiveService;
private HiveServer2 hiveServer;
private Config config;
private static final Logger LOG = LogManager.getLogger(HiveServiceProvider.class);
public HiveServiceProvider(Config config) {
this.config = config;
}
public void startLocalHiveServiceIfNeeded(Configuration configuration) throws IOException {
if (config.isHiveLocal()) {
hiveService = new HiveTestService(configuration);
hiveServer = hiveService.start();
}
}
public void syncToLocalHiveIfNeeded(HoodieTestSuiteWriter writer) {
if (this.config.isHiveLocal()) {
writer.getDeltaStreamerWrapper().getDeltaSyncService().getDeltaSync()
.syncHive(getLocalHiveServer().getHiveConf());
} else {
writer.getDeltaStreamerWrapper().getDeltaSyncService().getDeltaSync().syncHive();
}
}
public void stopLocalHiveServiceIfNeeded() throws IOException {
if (config.isHiveLocal()) {
if (hiveService != null) {
hiveService.stop();
}
}
}
public HiveServer2 getLocalHiveServer() {
return hiveServer;
}
}

View File

@@ -0,0 +1,89 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.reader;
import java.io.IOException;
import java.util.Arrays;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.writer.AvroFileDeltaInputWriter;
import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.SparkSession;
/**
* A reader of {@link DeltaOutputMode#DFS} and {@link DeltaInputType#AVRO}.
*/
public class DFSAvroDeltaInputReader extends DFSDeltaInputReader {
private final SparkSession sparkSession;
private final String schemaStr;
private final String basePath;
private final Option<String> structName;
private final Option<String> nameSpace;
protected PathFilter filter = (path) -> {
if (path.toUri().toString().contains(AvroFileDeltaInputWriter.AVRO_EXTENSION)) {
return true;
} else {
return false;
}
};
public DFSAvroDeltaInputReader(SparkSession sparkSession, String schemaStr, String basePath,
Option<String> structName,
Option<String> nameSpace) {
this.sparkSession = sparkSession;
this.schemaStr = schemaStr;
this.basePath = basePath;
this.structName = structName;
this.nameSpace = nameSpace;
}
@Override
public JavaRDD<GenericRecord> read(long totalRecordsToRead) throws IOException {
return SparkBasedReader.readAvro(sparkSession, schemaStr, getFilePathsToRead(basePath, filter, totalRecordsToRead),
structName, nameSpace);
}
@Override
public JavaRDD<GenericRecord> read(int numPartitions, long approxNumRecords) throws IOException {
throw new UnsupportedOperationException("cannot generate updates");
}
@Override
public JavaRDD<GenericRecord> read(int numPartitions, int numFiles, long approxNumRecords) throws IOException {
throw new UnsupportedOperationException("cannot generate updates");
}
@Override
public JavaRDD<GenericRecord> read(int numPartitions, int numFiles, double percentageRecordsPerFile)
throws IOException {
throw new UnsupportedOperationException("cannot generate updates");
}
@Override
protected long analyzeSingleFile(String filePath) {
JavaRDD<GenericRecord> recordsFromOneFile = SparkBasedReader
.readAvro(sparkSession, schemaStr, Arrays.asList(filePath),
structName, nameSpace);
return recordsFromOneFile.count();
}
}

View File

@@ -0,0 +1,98 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.reader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.util.collection.Pair;
/**
* This class helps to estimate the number of files to read a given number of total records.
* Use this class for all DFS based implementations of {@link DeltaInputReader}
*/
public abstract class DFSDeltaInputReader implements DeltaInputReader<GenericRecord> {
protected List<String> getFilePathsToRead(String basePath, PathFilter filter, long totalRecordsToRead) throws
IOException {
FileSystem fs = FSUtils.getFs(basePath, new Configuration());
// TODO : Sort list by file size and take the median file status to ensure fair calculation and change to remote
// iterator
List<FileStatus> fileStatuses = Arrays.asList(fs.globStatus(new Path(basePath, "*/*"), filter));
if (fileStatuses.size() > 0) {
FileStatus status = fileStatuses.get(0);
long avgNumRecordsPerFile = analyzeSingleFile(status.getPath().toString());
long numFilesToMatchExpectedRecords = (long) Math.ceil((double) totalRecordsToRead / (double)
avgNumRecordsPerFile);
long avgSizeOfEachFile = status.getLen();
long totalSizeToRead = avgSizeOfEachFile * numFilesToMatchExpectedRecords;
// choose N files with that length
Pair<Integer, Integer> fileStatusIndexRange = getFileStatusIndexRange(fileStatuses, avgSizeOfEachFile,
totalSizeToRead);
int startIndex = fileStatusIndexRange.getLeft();
List<String> filePaths = new ArrayList<>();
while (startIndex == 0 || startIndex < fileStatusIndexRange.getRight()) {
filePaths.add(fileStatuses.get(startIndex).getPath().toString());
startIndex++;
}
return filePaths;
}
return Collections.emptyList();
}
protected Pair<Integer, Integer> getFileStatusIndexRange(List<FileStatus> fileStatuses, long averageFileSize, long
totalSizeToRead) {
long totalSizeOfFilesPresent = 0;
int startOffset = 0;
int endOffset = 0;
for (FileStatus fileStatus : fileStatuses) {
// If current file length is greater than averageFileSize, increment by averageFileSize since our
// totalSizeToRead calculation is based on the averageRecordSize * numRecordsToRead.
if (fileStatus.getLen() > averageFileSize) {
totalSizeOfFilesPresent += averageFileSize;
} else {
totalSizeOfFilesPresent += fileStatus.getLen();
}
if (totalSizeOfFilesPresent <= totalSizeToRead) {
endOffset++;
continue;
} else {
return Pair.of(startOffset, endOffset);
}
}
return Pair.of(startOffset, endOffset);
}
/**
* Implementation of {@link DeltaInputReader}s to provide a way to read a single file on DFS and provide an
* average number of records across N files.
*/
protected long analyzeSingleFile(String filePath) {
throw new UnsupportedOperationException("No implementation found");
}
}

View File

@@ -0,0 +1,338 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.reader;
import static java.util.Map.Entry.comparingByValue;
import static java.util.stream.Collectors.toMap;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner;
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
import org.apache.hudi.common.table.view.TableFileSystemView;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ParquetReaderIterator;
import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.config.HoodieMemoryConfig;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;
/**
* This class helps to generate updates from an already existing hoodie dataset. It supports generating updates in
* across partitions, files and records.
*/
public class DFSHoodieDatasetInputReader extends DFSDeltaInputReader {
private static Logger log = LoggerFactory.getLogger(DFSHoodieDatasetInputReader.class);
private transient JavaSparkContext jsc;
private String schemaStr;
private HoodieTableMetaClient metaClient;
public DFSHoodieDatasetInputReader(JavaSparkContext jsc, String basePath, String schemaStr) {
this.jsc = jsc;
this.schemaStr = schemaStr;
this.metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
}
protected List<String> getPartitions(Option<Integer> partitionsLimit) throws IOException {
List<String> partitionPaths = FSUtils
.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(), false);
// Sort partition so we can pick last N partitions by default
Collections.sort(partitionPaths);
if (!partitionPaths.isEmpty()) {
ValidationUtils.checkArgument(partitionPaths.size() >= partitionsLimit.get(),
"Cannot generate updates for more partitions " + "than present in the dataset, partitions "
+ "requested " + partitionsLimit.get() + ", partitions present " + partitionPaths.size());
return partitionPaths.subList(0, partitionsLimit.get());
}
return partitionPaths;
}
private JavaPairRDD<String, Iterator<FileSlice>> getPartitionToFileSlice(HoodieTableMetaClient metaClient,
List<String> partitionPaths) {
TableFileSystemView.SliceView fileSystemView = new HoodieTableFileSystemView(metaClient,
metaClient.getCommitsAndCompactionTimeline().filterCompletedInstants());
// pass num partitions to another method
JavaPairRDD<String, Iterator<FileSlice>> partitionToFileSliceList = jsc.parallelize(partitionPaths).mapToPair(p -> {
return new Tuple2<>(p, fileSystemView.getLatestFileSlices(p).iterator());
});
return partitionToFileSliceList;
}
@Override
protected long analyzeSingleFile(String filePath) {
return SparkBasedReader.readParquet(new SparkSession(jsc.sc()), Arrays.asList(filePath),
Option.empty(), Option.empty()).count();
}
private JavaRDD<GenericRecord> fetchAnyRecordsFromDataset(Option<Long> numRecordsToUpdate) throws IOException {
return fetchRecordsFromDataset(Option.empty(), Option.empty(), numRecordsToUpdate, Option.empty());
}
private JavaRDD<GenericRecord> fetchAnyRecordsFromDataset(Option<Long> numRecordsToUpdate, Option<Integer>
numPartitions) throws IOException {
return fetchRecordsFromDataset(numPartitions, Option.empty(), numRecordsToUpdate, Option.empty());
}
private JavaRDD<GenericRecord> fetchPercentageRecordsFromDataset(Option<Integer> numPartitions, Option<Integer>
numFiles, Option<Double> percentageRecordsPerFile) throws IOException {
return fetchRecordsFromDataset(numPartitions, numFiles, Option.empty(), percentageRecordsPerFile);
}
private JavaRDD<GenericRecord> fetchRecordsFromDataset(Option<Integer> numPartitions, Option<Integer>
numFiles, Option<Long> numRecordsToUpdate) throws IOException {
return fetchRecordsFromDataset(numPartitions, numFiles, numRecordsToUpdate, Option.empty());
}
private JavaRDD<GenericRecord> fetchRecordsFromDataset(Option<Integer> numPartitions, Option<Integer> numFiles,
Option<Long> numRecordsToUpdate, Option<Double> percentageRecordsPerFile) throws IOException {
log.info("NumPartitions : {}, NumFiles : {}, numRecordsToUpdate : {}, percentageRecordsPerFile : {}",
numPartitions, numFiles, numRecordsToUpdate, percentageRecordsPerFile);
List<String> partitionPaths = getPartitions(numPartitions);
// Read all file slices in the partition
JavaPairRDD<String, Iterator<FileSlice>> partitionToFileSlice = getPartitionToFileSlice(metaClient,
partitionPaths);
// TODO : read record count from metadata
// Read the records in a single file
long recordsInSingleFile = iteratorSize(readParquetOrLogFiles(getSingleSliceFromRDD(partitionToFileSlice)));
int numFilesToUpdate;
long numRecordsToUpdatePerFile;
if (!numFiles.isPresent() || numFiles.get() == 0) {
// If num files are not passed, find the number of files to update based on total records to update and records
// per file
numFilesToUpdate = (int) (numRecordsToUpdate.get() / recordsInSingleFile);
log.info("Files to update {}", numFilesToUpdate);
numRecordsToUpdatePerFile = recordsInSingleFile;
} else {
// If num files is passed, find the number of records per file based on either percentage or total records to
// update and num files passed
numFilesToUpdate = numFiles.get();
numRecordsToUpdatePerFile = percentageRecordsPerFile.isPresent() ? (long) (recordsInSingleFile
* percentageRecordsPerFile.get()) : numRecordsToUpdate.get() / numFilesToUpdate;
}
// Adjust the number of files to read per partition based on the requested partition & file counts
Map<String, Integer> adjustedPartitionToFileIdCountMap = getFilesToReadPerPartition(partitionToFileSlice,
getPartitions(numPartitions).size(), numFilesToUpdate);
JavaRDD<GenericRecord> updates = projectSchema(generateUpdates(adjustedPartitionToFileIdCountMap,
partitionToFileSlice, numFilesToUpdate, (int) numRecordsToUpdatePerFile));
if (numRecordsToUpdate.isPresent() && numFiles.isPresent() && numFiles.get() != 0 && numRecordsToUpdate.get()
!= numRecordsToUpdatePerFile * numFiles.get()) {
long remainingRecordsToAdd = (numRecordsToUpdate.get() - (numRecordsToUpdatePerFile * numFiles.get()));
updates = updates.union(projectSchema(jsc.parallelize(generateUpdates(adjustedPartitionToFileIdCountMap,
partitionToFileSlice, numFilesToUpdate, (int) remainingRecordsToAdd).take((int) remainingRecordsToAdd))));
}
log.info("Finished generating updates");
return updates;
}
private JavaRDD<GenericRecord> projectSchema(JavaRDD<GenericRecord> updates) {
// The records read from the hoodie dataset have the hoodie record fields, rewrite the record to eliminate them
return updates
.map(r -> HoodieAvroUtils.rewriteRecordWithOnlyNewSchemaFields(r, new Schema.Parser().parse(schemaStr)));
}
private JavaRDD<GenericRecord> generateUpdates(Map<String, Integer> adjustedPartitionToFileIdCountMap,
JavaPairRDD<String, Iterator<FileSlice>> partitionToFileSlice, int numFiles, int numRecordsToReadPerFile) {
return partitionToFileSlice.map(p -> {
int maxFilesToRead = adjustedPartitionToFileIdCountMap.get(p._1);
return iteratorLimit(p._2, maxFilesToRead);
}).flatMap(p -> p).repartition(numFiles).map(fileSlice -> {
if (numRecordsToReadPerFile > 0) {
return iteratorLimit(readParquetOrLogFiles(fileSlice), numRecordsToReadPerFile);
} else {
return readParquetOrLogFiles(fileSlice);
}
}).flatMap(p -> p).map(i -> (GenericRecord) i);
}
private Map<String, Integer> getFilesToReadPerPartition(JavaPairRDD<String, Iterator<FileSlice>>
partitionToFileSlice, Integer numPartitions, Integer numFiles) {
int numFilesPerPartition = (int) Math.ceil(numFiles / numPartitions);
Map<String, Integer> partitionToFileIdCountMap = partitionToFileSlice
.mapToPair(p -> new Tuple2<>(p._1, iteratorSize(p._2))).collectAsMap();
long totalExistingFilesCount = partitionToFileIdCountMap.values().stream().reduce((a, b) -> a + b).get();
ValidationUtils.checkArgument(totalExistingFilesCount >= numFiles, "Cannot generate updates "
+ "for more files than present in the dataset, file requested " + numFiles + ", files present "
+ totalExistingFilesCount);
Map<String, Integer> partitionToFileIdCountSortedMap = partitionToFileIdCountMap
.entrySet()
.stream()
.sorted(comparingByValue())
.collect(toMap(Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e2,
LinkedHashMap::new));
// Limit files to be read per partition
Map<String, Integer> adjustedPartitionToFileIdCountMap = new HashMap<>();
partitionToFileIdCountSortedMap.entrySet().stream().forEach(e -> {
if (e.getValue() <= numFilesPerPartition) {
adjustedPartitionToFileIdCountMap.put(e.getKey(), e.getValue());
} else {
adjustedPartitionToFileIdCountMap.put(e.getKey(), numFilesPerPartition);
}
});
return adjustedPartitionToFileIdCountMap;
}
private FileSlice getSingleSliceFromRDD(JavaPairRDD<String, Iterator<FileSlice>> partitionToFileSlice) {
return partitionToFileSlice.map(f -> {
FileSlice slice = f._2.next();
FileSlice newSlice = new FileSlice(slice.getFileGroupId(), slice.getBaseInstantTime());
if (slice.getBaseFile().isPresent()) {
newSlice.setBaseFile(slice.getBaseFile().get());
} else {
slice.getLogFiles().forEach(l -> {
newSlice.addLogFile(l);
});
}
return newSlice;
}).take(1).get(0);
}
private Iterator<IndexedRecord> readParquetOrLogFiles(FileSlice fileSlice) throws IOException {
if (fileSlice.getBaseFile().isPresent()) {
Iterator<IndexedRecord> itr =
new ParquetReaderIterator<IndexedRecord>(AvroParquetReader.<IndexedRecord>builder(new
Path(fileSlice.getBaseFile().get().getPath())).withConf(metaClient.getHadoopConf()).build());
return itr;
} else {
// If there is no data file, fall back to reading log files
HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(metaClient.getFs(),
metaClient.getBasePath(),
fileSlice.getLogFiles().map(l -> l.getPath().getName()).collect(Collectors.toList()),
new Schema.Parser().parse(schemaStr), metaClient.getActiveTimeline().getCommitsTimeline()
.filterCompletedInstants().lastInstant().get().getTimestamp(),
HoodieMemoryConfig.DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES, true, false,
HoodieMemoryConfig.DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE,
HoodieMemoryConfig.DEFAULT_SPILLABLE_MAP_BASE_PATH);
// readAvro log files
Iterable<HoodieRecord<? extends HoodieRecordPayload>> iterable = () -> scanner.iterator();
Schema schema = new Schema.Parser().parse(schemaStr);
return StreamSupport.stream(iterable.spliterator(), false)
.map(e -> {
try {
return (IndexedRecord) e.getData().getInsertValue(schema).get();
} catch (IOException io) {
throw new UncheckedIOException(io);
}
}).iterator();
}
}
/**
* Returns the number of elements remaining in {@code iterator}. The iterator
* will be left exhausted: its {@code hasNext()} method will return
* {@code false}.
*/
private static int iteratorSize(Iterator<?> iterator) {
int count = 0;
while (iterator.hasNext()) {
iterator.next();
count++;
}
return count;
}
/**
* Creates an iterator returning the first {@code limitSize} elements of the
* given iterator. If the original iterator does not contain that many
* elements, the returned iterator will have the same behavior as the original
* iterator. The returned iterator supports {@code remove()} if the original
* iterator does.
*
* @param iterator the iterator to limit
* @param limitSize the maximum number of elements in the returned iterator
* @throws IllegalArgumentException if {@code limitSize} is negative
*/
private static <T> Iterator<T> iteratorLimit(
final Iterator<T> iterator, final int limitSize) {
ValidationUtils.checkArgument(iterator != null, "iterator is null");
ValidationUtils.checkArgument(limitSize >= 0, "limit is negative");
return new Iterator<T>() {
private int count;
@Override
public boolean hasNext() {
return count < limitSize && iterator.hasNext();
}
@Override
public T next() {
if (!hasNext()) {
throw new NoSuchElementException();
}
count++;
return iterator.next();
}
@Override
public void remove() {
iterator.remove();
}
};
}
@Override
public JavaRDD<GenericRecord> read(long numRecords) throws IOException {
return fetchAnyRecordsFromDataset(Option.of(numRecords));
}
@Override
public JavaRDD<GenericRecord> read(int numPartitions, long approxNumRecords) throws IOException {
return fetchAnyRecordsFromDataset(Option.of(approxNumRecords), Option.of(numPartitions));
}
@Override
public JavaRDD<GenericRecord> read(int numPartitions, int numFiles, long numRecords) throws IOException {
return fetchRecordsFromDataset(Option.of(numPartitions), Option.of(numFiles), Option.of(numRecords));
}
@Override
public JavaRDD<GenericRecord> read(int numPartitions, int numFiles, double percentageRecordsPerFile)
throws IOException {
return fetchPercentageRecordsFromDataset(Option.of(numPartitions), Option.of(numFiles),
Option.of(percentageRecordsPerFile));
}
}

View File

@@ -0,0 +1,62 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.reader;
import java.io.IOException;
import java.io.Serializable;
import org.apache.spark.api.java.JavaRDD;
/**
* Implementations of {@link DeltaInputReader} will read the configured input type and provide an RDD of records to the
* client.
*
* @param <O> Read result data type
*/
public interface DeltaInputReader<O> extends Serializable {
/**
* Attempts to reads an approximate number of records close to approxNumRecords.
* This highly depends on the number of records already present in the input.
*/
JavaRDD<O> read(long approxNumRecords) throws IOException;
/**
* @throws IOException Attempts to read approx number of records (exact if equal or more records available)
* across requested number of
* partitions.
*/
JavaRDD<O> read(int numPartitions, long approxNumRecords) throws IOException;
/**
* @throws IOException Attempts to read approx number of records (exact if equal or more records available)
* across requested number of
* partitions and number of files.
* 1. Find numFiles across numPartitions
* 2. numRecordsToReadPerFile = approxNumRecords / numFiles
*/
JavaRDD<O> read(int numPartitions, int numFiles, long approxNumRecords) throws IOException;
/**
* @throws IOException Attempts to a % of records per file across requested number of partitions and number of files.
* 1. Find numFiles across numPartitions
* 2. numRecordsToReadPerFile = approxNumRecordsPerFile * percentageRecordsPerFile
*/
JavaRDD<O> read(int numPartitions, int numFiles, double percentageRecordsPerFile) throws IOException;
}

View File

@@ -0,0 +1,26 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.reader;
/**
* Supported delta input data types.
*/
public enum DeltaInputType {
AVRO, PARQUET
}

View File

@@ -0,0 +1,70 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.reader;
import java.util.List;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.AvroConversionUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.utilities.schema.RowBasedSchemaProvider;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import scala.collection.JavaConverters;
/**
* Helper class to read avro and/or parquet files and generate a RDD of {@link GenericRecord}.
*/
public class SparkBasedReader {
public static final String SPARK_AVRO_FORMAT = "avro";
public static final String SPARK_PARQUET_FORMAT = "com.databricks.spark.parquet";
private static final String AVRO_SCHEMA_OPTION_KEY = "avroSchema";
// Spark anyways globs the path and gets all the paths in memory so take the List<filePaths> as an argument.
// https://github.com/apache/spark/.../org/apache/spark/sql/execution/datasources/DataSource.scala#L251
public static JavaRDD<GenericRecord> readAvro(SparkSession sparkSession, String schemaStr, List<String> listOfPaths,
Option<String> structName, Option<String> nameSpace) {
Dataset<Row> dataSet = sparkSession.read()
.format(SPARK_AVRO_FORMAT)
.option(AVRO_SCHEMA_OPTION_KEY, schemaStr)
.load(JavaConverters.asScalaIteratorConverter(listOfPaths.iterator()).asScala().toSeq());
return AvroConversionUtils
.createRdd(dataSet.toDF(), structName.orElse(RowBasedSchemaProvider.HOODIE_RECORD_STRUCT_NAME),
nameSpace.orElse(RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE))
.toJavaRDD();
}
public static JavaRDD<GenericRecord> readParquet(SparkSession sparkSession, List<String>
listOfPaths, Option<String> structName, Option<String> nameSpace) {
Dataset<Row> dataSet = sparkSession.read()
.parquet((JavaConverters.asScalaIteratorConverter(listOfPaths.iterator()).asScala().toSeq()));
return AvroConversionUtils
.createRdd(dataSet.toDF(), structName.orElse(RowBasedSchemaProvider.HOODIE_RECORD_STRUCT_NAME),
RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE)
.toJavaRDD();
}
}

View File

@@ -0,0 +1,117 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.writer;
import java.io.IOException;
import java.io.OutputStream;
import java.util.UUID;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.avro.io.DatumWriter;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.fs.HoodieWrapperFileSystem;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Implementation of {@link DeltaInputWriter} that writes avro records to the result file.
*/
public class AvroFileDeltaInputWriter implements DeltaInputWriter<GenericRecord> {
public static final String AVRO_EXTENSION = ".avro";
private static Logger log = LoggerFactory.getLogger(AvroFileDeltaInputWriter.class);
// The maximum file size for an avro file before being rolled over to a new one
private final Long maxFileSize;
private final Configuration configuration;
private HoodieWrapperFileSystem fs;
// Path of the actual avro file
private Path file;
// Base input path to write avro files under
// TODO : Make this bucketed so don't have a large number of files in a single directory
private String basePath;
private DatumWriter<IndexedRecord> writer;
private DataFileWriter<IndexedRecord> dataFileWriter;
private OutputStream output;
private Schema schema;
private DeltaWriteStats deltaWriteStats;
private long recordsWritten = 0;
// TODO : Handle failure case which may leave behind tons of small corrupt files
public AvroFileDeltaInputWriter(Configuration configuration, String basePath, String schemaStr, Long maxFileSize)
throws IOException {
this.schema = Schema.parse(schemaStr);
this.maxFileSize = maxFileSize;
this.configuration = configuration;
this.basePath = basePath;
Path path = new Path(basePath, new Path(UUID.randomUUID().toString() + AVRO_EXTENSION));
this.file = HoodieWrapperFileSystem.convertToHoodiePath(path, configuration);
this.fs = (HoodieWrapperFileSystem) this.file
.getFileSystem(FSUtils.registerFileSystem(path, configuration));
this.output = this.fs.create(this.file);
this.writer = new GenericDatumWriter(schema);
this.dataFileWriter = new DataFileWriter<>(writer).create(schema, output);
this.deltaWriteStats = new DeltaWriteStats();
}
@Override
public void writeData(GenericRecord iData) throws IOException {
this.dataFileWriter.append(iData);
recordsWritten++;
}
@Override
public boolean canWrite() {
return fs.getBytesWritten(file) < maxFileSize;
}
@Override
public void close() throws IOException {
this.deltaWriteStats.setBytesWritten(this.fs.getBytesWritten(this.file));
this.deltaWriteStats.setRecordsWritten(this.recordsWritten);
this.deltaWriteStats.setFilePath(this.file.toUri().getPath());
this.dataFileWriter.close();
log.info("New Avro File : {}", getPath());
}
@Override
public DeltaInputWriter getNewWriter() throws IOException {
AvroFileDeltaInputWriter avroFileDeltaInputWriter = new AvroFileDeltaInputWriter(this.configuration, this.basePath, this
.schema.toString(), this.maxFileSize);
return avroFileDeltaInputWriter;
}
public FileSystem getFs() {
return fs;
}
public Path getPath() {
return this.file;
}
@Override
public DeltaWriteStats getDeltaWriteStats() {
return this.deltaWriteStats;
}
}

View File

@@ -0,0 +1,62 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.writer;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.avro.generic.GenericRecord;
/**
* {@link org.apache.hadoop.hdfs.DistributedFileSystem} (or {@link org.apache.hadoop.fs.LocalFileSystem}) based delta
* generator.
*/
public class DFSDeltaWriterAdapter implements DeltaWriterAdapter<GenericRecord> {
private DeltaInputWriter deltaInputGenerator;
private List<DeltaWriteStats> metrics = new ArrayList<>();
public DFSDeltaWriterAdapter(DeltaInputWriter<GenericRecord> deltaInputGenerator) {
this.deltaInputGenerator = deltaInputGenerator;
}
@Override
public List<DeltaWriteStats> write(Iterator<GenericRecord> input) throws IOException {
while (input.hasNext()) {
if (this.deltaInputGenerator.canWrite()) {
this.deltaInputGenerator.writeData(input.next());
} else if (input.hasNext()) {
rollOver();
}
}
close();
return this.metrics;
}
public void rollOver() throws IOException {
close();
this.deltaInputGenerator = this.deltaInputGenerator.getNewWriter();
}
private void close() throws IOException {
this.deltaInputGenerator.close();
this.metrics.add(this.deltaInputGenerator.getDeltaWriteStats());
}
}

View File

@@ -0,0 +1,42 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.writer;
import java.io.IOException;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.integ.testsuite.writer.DeltaWriterAdapter.SparkBasedDeltaWriter;
import org.apache.spark.api.java.JavaRDD;
/**
* NEED TO IMPLEMENT A CUSTOM SPARK PARTITIONER TO ENSURE WE WRITE LARGE ENOUGH AVRO FILES.
*/
public class DFSSparkAvroDeltaWriter implements SparkBasedDeltaWriter<JavaRDD<GenericRecord>> {
private DeltaInputWriter<JavaRDD<GenericRecord>> deltaInputWriter;
public DFSSparkAvroDeltaWriter(DeltaInputWriter<JavaRDD<GenericRecord>> deltaInputWriter) {
this.deltaInputWriter = deltaInputWriter;
}
@Override
public JavaRDD<DeltaWriteStats> write(JavaRDD<GenericRecord> input) throws IOException {
this.deltaInputWriter.writeData(input);
return null;
}
}

View File

@@ -0,0 +1,53 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.writer;
import java.io.Closeable;
import java.io.IOException;
/**
* Implementations of {@link DeltaInputWriter} will be able to generate data.
*
* @param <I> Data type to be generated.
*/
public interface DeltaInputWriter<I> extends Closeable {
/**
* Generate any type of data.
*/
void writeData(I iData) throws IOException;
/**
* Check whether more data can/should be written.
*/
boolean canWrite();
/**
* Return the statistics of data written.
*/
DeltaWriteStats getDeltaWriteStats();
/**
* Return a new instance of this writer.
* @return
* @throws IOException
*/
DeltaInputWriter getNewWriter() throws IOException;
}

View File

@@ -0,0 +1,26 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.writer;
/**
* Supported output destination types for the generated delta workload.
*/
public enum DeltaOutputMode {
KAFKA, DFS
}

View File

@@ -0,0 +1,72 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.writer;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import org.apache.hudi.common.util.collection.Pair;
/**
* This class holds the write statistics for {@link DeltaInputWriter}.
*/
public class DeltaWriteStats implements Serializable {
// The file path (if any) for the data written
private String filePath;
// Number of bytes written before being closed
private long bytesWritten;
// Number of records written before being closed
private long recordsWritten;
private List<Pair<String, String>> partitionPathRecordKey = new ArrayList<>();
public String getFilePath() {
return filePath;
}
public void setFilePath(String filePath) {
this.filePath = filePath;
}
public long getBytesWritten() {
return bytesWritten;
}
public void setBytesWritten(long bytesWritten) {
this.bytesWritten = bytesWritten;
}
public List<Pair<String, String>> getPartitionPathRecordKey() {
return partitionPathRecordKey;
}
public void setPartitionPathRecordKey(List<Pair<String, String>> partitionPathRecordKey) {
this.partitionPathRecordKey = partitionPathRecordKey;
}
public long getRecordsWritten() {
return recordsWritten;
}
public void setRecordsWritten(long recordsWritten) {
this.recordsWritten = recordsWritten;
}
}

View File

@@ -0,0 +1,34 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.writer;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import org.apache.spark.api.java.JavaRDD;
public interface DeltaWriterAdapter<I> {
List<DeltaWriteStats> write(Iterator<I> input) throws IOException;
interface SparkBasedDeltaWriter<J> {
JavaRDD<DeltaWriteStats> write(J input) throws IOException;
}
}

View File

@@ -0,0 +1,57 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.writer;
import java.io.IOException;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.integ.testsuite.configuration.DFSDeltaConfig;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig;
import org.apache.hudi.integ.testsuite.reader.DeltaInputType;
/**
* A factory to help instantiate different {@link DeltaWriterAdapter}s depending on the {@link DeltaOutputMode} and
* {@link DeltaInputType}.
*/
public class DeltaWriterFactory {
private DeltaWriterFactory() {
}
public static DeltaWriterAdapter getDeltaWriterAdapter(DeltaConfig config, Integer batchId) throws IOException {
switch (config.getDeltaOutputMode()) {
case DFS:
switch (config.getDeltaInputType()) {
case AVRO:
DFSDeltaConfig dfsDeltaConfig = (DFSDeltaConfig) config;
dfsDeltaConfig.setBatchId(batchId);
DeltaInputWriter<GenericRecord> fileDeltaInputGenerator = new AvroFileDeltaInputWriter(
dfsDeltaConfig.getConfiguration(),
StringUtils
.join(new String[]{dfsDeltaConfig.getDeltaBasePath(), dfsDeltaConfig.getBatchId().toString()},
"/"), dfsDeltaConfig.getSchemaStr(), dfsDeltaConfig.getMaxFileSize());
return new DFSDeltaWriterAdapter(fileDeltaInputGenerator);
default:
throw new IllegalArgumentException("Invalid delta input format " + config.getDeltaInputType());
}
default:
throw new IllegalArgumentException("Invalid delta input type " + config.getDeltaOutputMode());
}
}
}

View File

@@ -0,0 +1,69 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.writer;
import java.io.IOException;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.AvroConversionUtils;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.SparkSession;
/**
* Spark based avro delta input writer. We don't use this yet since we cannot control result file size.
*/
public class SparkAvroDeltaInputWriter implements DeltaInputWriter<JavaRDD<GenericRecord>> {
private static final String AVRO_FORMAT_PACKAGE = "avro";
public SparkSession sparkSession;
private String schemaStr;
// TODO : the base path has to be a new path every time for spark avro
private String basePath;
public SparkAvroDeltaInputWriter(SparkSession sparkSession, String schemaStr, String basePath) {
this.sparkSession = sparkSession;
this.schemaStr = schemaStr;
this.basePath = basePath;
}
@Override
public void writeData(JavaRDD<GenericRecord> iData) throws IOException {
AvroConversionUtils.createDataFrame(iData.rdd(), schemaStr, sparkSession).write()
.format(AVRO_FORMAT_PACKAGE).save(basePath);
}
@Override
public boolean canWrite() {
throw new UnsupportedOperationException("not applicable for spark based writer");
}
@Override
public void close() throws IOException {
}
@Override
public DeltaWriteStats getDeltaWriteStats() {
throw new UnsupportedOperationException("not applicable for spark based writer");
}
@Override
public DeltaInputWriter getNewWriter() throws IOException {
throw new UnsupportedOperationException("not applicable for spark based writer");
}
}

View File

@@ -0,0 +1,138 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.when;
import java.io.IOException;
import java.util.Iterator;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.integ.testsuite.configuration.DFSDeltaConfig;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig;
import org.apache.hudi.integ.testsuite.generator.FlexibleSchemaRecordGenerationIterator;
import org.apache.hudi.integ.testsuite.reader.DeltaInputType;
import org.apache.hudi.integ.testsuite.utils.TestUtils;
import org.apache.hudi.integ.testsuite.writer.AvroFileDeltaInputWriter;
import org.apache.hudi.integ.testsuite.writer.DFSDeltaWriterAdapter;
import org.apache.hudi.integ.testsuite.writer.DeltaInputWriter;
import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode;
import org.apache.hudi.integ.testsuite.writer.DeltaWriteStats;
import org.apache.hudi.integ.testsuite.writer.DeltaWriterAdapter;
import org.apache.hudi.integ.testsuite.writer.DeltaWriterFactory;
import org.apache.hudi.utilities.schema.FilebasedSchemaProvider;
import org.apache.hudi.utilities.testutils.UtilitiesTestBase;
import org.apache.spark.api.java.JavaRDD;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;
public class TestDFSHoodieTestSuiteWriterAdapter extends UtilitiesTestBase {
private FilebasedSchemaProvider schemaProvider;
private static final String COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/";
@BeforeAll
public static void initClass() throws Exception {
UtilitiesTestBase.initClass();
}
@AfterAll
public static void cleanupClass() {
UtilitiesTestBase.cleanupClass();
}
@BeforeEach
public void setup() throws Exception {
super.setup();
schemaProvider = new FilebasedSchemaProvider(Helpers.setupSchemaOnDFSWithAbsoluteScope(
System.getProperty("user.dir") + "/.." + COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH,
"complex-source.avsc"), jsc);
}
@AfterEach
public void teardown() throws Exception {
super.teardown();
}
@Test
public void testDFSOneFileWrite() throws IOException {
DeltaInputWriter<GenericRecord> mockFileSinkWriter = Mockito.mock(AvroFileDeltaInputWriter.class);
DeltaWriteStats mockDeltaWriteStats = Mockito.mock(DeltaWriteStats.class);
when(mockFileSinkWriter.getNewWriter()).thenReturn(mockFileSinkWriter);
when(mockFileSinkWriter.canWrite()).thenReturn(true);
when(mockFileSinkWriter.getDeltaWriteStats()).thenReturn(mockDeltaWriteStats);
DeltaWriterAdapter<GenericRecord> dfsDeltaWriterAdapter = new DFSDeltaWriterAdapter(mockFileSinkWriter);
JavaRDD<GenericRecord> records = TestUtils.makeRDD(jsc, 10);
dfsDeltaWriterAdapter.write(records.collect().iterator());
Mockito.verify(mockFileSinkWriter, times(10)).canWrite();
Mockito.verify(mockFileSinkWriter, times(1)).close();
}
@Test
public void testDFSTwoFilesWriteWithRollover() throws IOException {
DeltaInputWriter<GenericRecord> mockFileSinkWriter = Mockito.mock(AvroFileDeltaInputWriter.class);
DeltaWriteStats mockDeltaWriteStats = Mockito.mock(DeltaWriteStats.class);
when(mockFileSinkWriter.getNewWriter()).thenReturn(mockFileSinkWriter);
when(mockFileSinkWriter.canWrite()).thenReturn(false, true);
when(mockFileSinkWriter.getDeltaWriteStats()).thenReturn(mockDeltaWriteStats);
DeltaWriterAdapter<GenericRecord> dfsDeltaWriterAdapter = new DFSDeltaWriterAdapter(mockFileSinkWriter);
Iterator<GenericRecord> mockIterator = Mockito.mock(Iterator.class);
when(mockIterator.hasNext()).thenReturn(true, true, true, false);
dfsDeltaWriterAdapter.write(mockIterator);
Mockito.verify(mockFileSinkWriter, times(2)).canWrite();
Mockito.verify(mockFileSinkWriter, times(1)).getNewWriter();
Mockito.verify(mockFileSinkWriter, times(2)).close();
}
@Test
public void testDFSWorkloadSinkWithMultipleFilesFunctional() throws IOException {
DeltaConfig dfsSinkConfig = new DFSDeltaConfig(DeltaOutputMode.DFS, DeltaInputType.AVRO,
new SerializableConfiguration(jsc.hadoopConfiguration()), dfsBasePath, dfsBasePath,
schemaProvider.getSourceSchema().toString(), 10240L);
DeltaWriterAdapter<GenericRecord> dfsDeltaWriterAdapter = DeltaWriterFactory
.getDeltaWriterAdapter(dfsSinkConfig, 1);
FlexibleSchemaRecordGenerationIterator itr = new FlexibleSchemaRecordGenerationIterator(1000,
schemaProvider.getSourceSchema().toString());
dfsDeltaWriterAdapter.write(itr);
FileSystem fs = FSUtils.getFs(dfsBasePath, jsc.hadoopConfiguration());
FileStatus[] fileStatuses = fs.listStatus(new Path(dfsBasePath));
// Since maxFileSize was 10240L and we produced 1K records each close to 1K size, we should produce more than
// 1 file
assertTrue(fileStatuses.length > 0);
}
}

View File

@@ -0,0 +1,147 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.reader.SparkBasedReader;
import org.apache.hudi.integ.testsuite.writer.AvroFileDeltaInputWriter;
import org.apache.hudi.integ.testsuite.writer.DeltaInputWriter;
import org.apache.hudi.integ.testsuite.writer.DeltaWriteStats;
import org.apache.hudi.integ.testsuite.generator.GenericRecordFullPayloadGenerator;
import org.apache.hudi.integ.testsuite.reader.SparkBasedReader;
import org.apache.hudi.integ.testsuite.writer.AvroFileDeltaInputWriter;
import org.apache.hudi.integ.testsuite.writer.DeltaInputWriter;
import org.apache.hudi.integ.testsuite.writer.DeltaWriteStats;
import org.apache.hudi.utilities.schema.FilebasedSchemaProvider;
import org.apache.hudi.utilities.testutils.UtilitiesTestBase;
import org.apache.spark.api.java.JavaRDD;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
public class TestFileDeltaInputWriter extends UtilitiesTestBase {
private FilebasedSchemaProvider schemaProvider;
private static final String COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/";
@BeforeAll
public static void initClass() throws Exception {
UtilitiesTestBase.initClass();
}
@AfterAll
public static void cleanupClass() {
UtilitiesTestBase.cleanupClass();
}
@BeforeEach
public void setup() throws Exception {
super.setup();
schemaProvider = new FilebasedSchemaProvider(Helpers.setupSchemaOnDFSWithAbsoluteScope(System.getProperty("user.dir") + "/.."
+ COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH, "complex-source.avsc"), jsc);
}
@AfterEach
public void teardown() throws Exception {
super.teardown();
}
@Test
public void testAvroFileSinkWriter() throws IOException {
// 1. Create a Avro File Sink Writer
DeltaInputWriter<GenericRecord> fileSinkWriter =
new AvroFileDeltaInputWriter(jsc.hadoopConfiguration(), dfsBasePath + "/input", schemaProvider.getSourceSchema()
.toString(), 1024 * 1024L);
GenericRecordFullPayloadGenerator payloadGenerator =
new GenericRecordFullPayloadGenerator(schemaProvider.getSourceSchema());
// 2. Generate 100 avro payloads and write them to an avro file
IntStream.range(0, 100).forEach(a -> {
try {
fileSinkWriter.writeData(payloadGenerator.getNewPayload());
} catch (IOException io) {
throw new UncheckedIOException(io);
}
});
fileSinkWriter.close();
DeltaWriteStats deltaWriteStats = fileSinkWriter.getDeltaWriteStats();
FileSystem fs = FSUtils.getFs(dfsBasePath, jsc.hadoopConfiguration());
FileStatus[] fileStatuses = fs.listStatus(new Path(deltaWriteStats.getFilePath()));
// Atleast 1 file was written
assertEquals(1, fileStatuses.length);
// File length should be greater than 0
assertTrue(fileStatuses[0].getLen() > 0);
// File length should be the same as the number of bytes written
assertTrue(deltaWriteStats.getBytesWritten() > 0);
List<String> paths = Arrays.asList(fs.globStatus(new Path(dfsBasePath + "/*/*.avro")))
.stream().map(f -> f.getPath().toString()).collect(Collectors.toList());
JavaRDD<GenericRecord> writtenRecords =
SparkBasedReader.readAvro(sparkSession, schemaProvider.getSourceSchema().toString(), paths, Option.empty(),
Option.empty());
// Number of records written should be 100
assertEquals(writtenRecords.count(), 100);
// Number of records in file should match with the stats
assertEquals(writtenRecords.count(), deltaWriteStats.getRecordsWritten());
}
@Test
public void testAvroFileSinkCreateNewWriter() throws IOException {
// 1. Create a Avro File Sink Writer
DeltaInputWriter<GenericRecord> fileSinkWriter =
new AvroFileDeltaInputWriter(jsc.hadoopConfiguration(), dfsBasePath,
schemaProvider.getSourceSchema().toString(),
1024 * 1024L);
GenericRecordFullPayloadGenerator payloadGenerator =
new GenericRecordFullPayloadGenerator(schemaProvider.getSourceSchema());
// 2. Generate 100 avro payloads and write them to an avro file
IntStream.range(0, 100).forEach(a -> {
try {
fileSinkWriter.writeData(payloadGenerator.getNewPayload());
} catch (IOException io) {
throw new UncheckedIOException(io);
}
});
fileSinkWriter.close();
String oldFilePath = fileSinkWriter.getDeltaWriteStats().getFilePath();
assertFalse(oldFilePath == null);
DeltaInputWriter<GenericRecord> newFileSinkWriter = fileSinkWriter.getNewWriter();
newFileSinkWriter.close();
DeltaWriteStats newStats = newFileSinkWriter.getDeltaWriteStats();
assertEquals(newStats.getBytesWritten(), 3674);
assertEquals(newStats.getRecordsWritten(), 0);
assertTrue(newStats.getFilePath() != null);
}
}

View File

@@ -0,0 +1,80 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.configuration;
import static junit.framework.Assert.assertTrue;
import static junit.framework.TestCase.assertEquals;
import java.util.ArrayList;
import java.util.List;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode;
import org.apache.hudi.integ.testsuite.dag.nodes.UpsertNode;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.WorkflowDag;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode;
import org.apache.hudi.integ.testsuite.dag.nodes.UpsertNode;
import org.junit.jupiter.api.Test;
public class TestWorkflowBuilder {
@Test
public void testWorkloadOperationSequenceBuilder() {
DagNode root = new InsertNode(DeltaConfig.Config.newBuilder()
.withNumRecordsToInsert(10000)
.withNumInsertPartitions(1)
.withNumTimesToRepeat(2)
.withRecordSize(1000).build());
DagNode child1 = new UpsertNode(DeltaConfig.Config.newBuilder()
.withNumRecordsToUpdate(10000)
.withNumInsertPartitions(1)
.withNumTimesToRepeat(2)
.withRecordSize(1000).build());
root.addChildNode(child1);
child1.addParentNode(root);
List<DagNode> rootNodes = new ArrayList<>();
rootNodes.add(root);
WorkflowDag workflowDag = new WorkflowDag(rootNodes);
assertEquals(workflowDag.getNodeList().size(), 1);
assertEquals(((DagNode) workflowDag.getNodeList().get(0)).getChildNodes().size(), 1);
DagNode dagNode = (DagNode) workflowDag.getNodeList().get(0);
assertTrue(dagNode instanceof InsertNode);
DeltaConfig.Config config = dagNode.getConfig();
assertEquals(config.getNumInsertPartitions(), 1);
assertEquals(config.getRecordSize(), 1000);
assertEquals(config.getRepeatCount(), 2);
assertEquals(config.getNumRecordsInsert(), 10000);
assertEquals(config.getNumRecordsUpsert(), 0);
dagNode = (DagNode) ((DagNode) workflowDag.getNodeList().get(0)).getChildNodes().get(0);
assertTrue(dagNode instanceof UpsertNode);
config = dagNode.getConfig();
assertEquals(config.getNumInsertPartitions(), 1);
assertEquals(config.getRecordSize(), 1000);
assertEquals(config.getRepeatCount(), 2);
assertEquals(config.getNumRecordsInsert(), 0);
assertEquals(config.getNumRecordsUpsert(), 10000);
}
}

View File

@@ -0,0 +1,93 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.converter;
import static junit.framework.TestCase.assertTrue;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import org.apache.avro.Schema.Field;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.integ.testsuite.utils.TestUtils;
import org.apache.hudi.integ.testsuite.utils.TestUtils;
import org.apache.hudi.utilities.UtilHelpers;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import scala.Tuple2;
public class TestUpdateConverter {
private JavaSparkContext jsc;
@BeforeEach
public void setup() throws Exception {
jsc = UtilHelpers.buildSparkContext(this.getClass().getName() + "-hoodie", "local[1]");
}
@AfterEach
public void teardown() {
jsc.stop();
}
@Test
public void testGenerateUpdateRecordsFromInputRecords() throws Exception {
JavaRDD<GenericRecord> inputRDD = TestUtils.makeRDD(jsc, 10);
String schemaStr = inputRDD.take(1).get(0).getSchema().toString();
int minPayloadSize = 1000;
// 2. DFS converter reads existing records and generates random updates for the same row keys
UpdateConverter updateConverter = new UpdateConverter(schemaStr, minPayloadSize,
Arrays.asList("timestamp"), Arrays.asList("_row_key"));
List<String> insertRowKeys = inputRDD.map(r -> r.get("_row_key").toString()).collect();
assertTrue(inputRDD.count() == 10);
JavaRDD<GenericRecord> outputRDD = updateConverter.convert(inputRDD);
List<String> updateRowKeys = outputRDD.map(row -> row.get("_row_key").toString()).collect();
// The insert row keys should be the same as update row keys
assertTrue(insertRowKeys.containsAll(updateRowKeys));
Map<String, GenericRecord> inputRecords = inputRDD.mapToPair(r -> new Tuple2<>(r.get("_row_key").toString(), r))
.collectAsMap();
List<GenericRecord> updateRecords = outputRDD.collect();
updateRecords.stream().forEach(updateRecord -> {
GenericRecord inputRecord = inputRecords.get(updateRecord.get("_row_key").toString());
assertTrue(areRecordsDifferent(inputRecord, updateRecord));
});
}
/**
* Checks if even a single field in the 2 records is different (except the row key which is the same for an update).
*/
private boolean areRecordsDifferent(GenericRecord in, GenericRecord up) {
for (Field field : in.getSchema().getFields()) {
if (field.name() == "_row_key") {
continue;
} else {
// Just convert all types to string for now since all are primitive
if (in.get(field.name()).toString() != up.get(field.name()).toString()) {
return true;
}
}
}
return false;
}
}

View File

@@ -0,0 +1,81 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Function;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode;
import org.apache.hudi.integ.testsuite.dag.nodes.UpsertNode;
import org.apache.hudi.integ.testsuite.dag.nodes.ValidateNode;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode;
import org.apache.hudi.integ.testsuite.dag.nodes.UpsertNode;
import org.apache.hudi.integ.testsuite.dag.nodes.ValidateNode;
import org.apache.spark.api.java.JavaRDD;
public class ComplexDagGenerator implements WorkflowDagGenerator {
@Override
public WorkflowDag build() {
DagNode root = new InsertNode(Config.newBuilder()
.withNumRecordsToInsert(1000)
.withNumInsertPartitions(3)
.withRecordSize(1000).build());
DagNode child1 = new UpsertNode(Config.newBuilder()
.withNumRecordsToUpdate(999)
.withNumRecordsToInsert(1000)
.withNumUpsertFiles(1)
.withNumUpsertPartitions(1)
.withNumInsertPartitions(1)
.withRecordSize(10000).build());
Function<List<DagNode<JavaRDD<WriteStatus>>>, Boolean> function = (dagNodes) -> {
DagNode<JavaRDD<WriteStatus>> parent1 = dagNodes.get(0);
List<WriteStatus> statuses = parent1.getResult().collect();
long totalRecordsTouched = statuses.stream().map(st -> st.getStat().getNumUpdateWrites() + st.getStat()
.getNumInserts()).reduce((a, b) -> a + b).get();
boolean b1 = totalRecordsTouched == parent1.getConfig().getNumRecordsInsert()
+ parent1.getConfig().getNumRecordsUpsert();
boolean b2 = statuses.size() > parent1.getConfig().getNumUpsertFiles();
DagNode<JavaRDD<WriteStatus>> parent2 = parent1.getParentNodes().get(0);
statuses = parent2.getResult().collect();
totalRecordsTouched = statuses.stream().map(st -> st.getStat().getNumUpdateWrites() + st.getStat()
.getNumInserts()).reduce((a, b) -> a + b).get();
boolean b3 = totalRecordsTouched == parent2.getConfig().getNumRecordsInsert()
* parent2.getConfig().getNumInsertPartitions() + parent2.getConfig().getNumRecordsUpsert();
return b1 & b2 & b3;
};
DagNode child2 = new ValidateNode(Config.newBuilder().build(), function);
root.addChildNode(child1);
// child1.addParentNode(root);
child1.addChildNode(child2);
// child2.addParentNode(child1);
List<DagNode> rootNodes = new ArrayList<>();
rootNodes.add(root);
return new WorkflowDag(rootNodes);
}
}

View File

@@ -0,0 +1,58 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.dag.nodes.HiveSyncNode;
import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.dag.nodes.HiveQueryNode;
import org.apache.hudi.integ.testsuite.dag.nodes.HiveSyncNode;
import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode;
public class HiveSyncDagGenerator implements WorkflowDagGenerator {
@Override
public WorkflowDag build() {
DagNode root = new InsertNode(Config.newBuilder()
.withNumRecordsToInsert(100)
.withNumInsertPartitions(1)
.withNumTimesToRepeat(1)
.withRecordSize(1000).build());
DagNode child1 = new HiveSyncNode(Config.newBuilder().withHiveLocal(true).build());
root.addChildNode(child1);
DagNode child2 = new HiveQueryNode(Config.newBuilder().withHiveLocal(true).withHiveQueryAndResults(Arrays
.asList(Pair.of("select " + "count(*) from testdb1.table1 group " + "by rider having count(*) < 1", 0)))
.build());
child1.addChildNode(child2);
List<DagNode> rootNodes = new ArrayList<>();
rootNodes.add(root);
return new WorkflowDag(rootNodes);
}
}

View File

@@ -0,0 +1,58 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.dag.nodes.HiveSyncNode;
import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.dag.nodes.HiveQueryNode;
import org.apache.hudi.integ.testsuite.dag.nodes.HiveSyncNode;
import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode;
public class HiveSyncDagGeneratorMOR implements WorkflowDagGenerator {
@Override
public WorkflowDag build() {
DagNode root = new InsertNode(Config.newBuilder()
.withNumRecordsToInsert(100)
.withNumInsertPartitions(1)
.withNumTimesToRepeat(1)
.withRecordSize(1000).build());
DagNode child1 = new HiveSyncNode(Config.newBuilder().withHiveLocal(true).build());
root.addChildNode(child1);
DagNode child2 = new HiveQueryNode(Config.newBuilder().withHiveLocal(true).withHiveQueryAndResults(Arrays
.asList(Pair.of("select " + "count(*) from testdb1.table1_rt group " + "by rider having count(*) < 1", 0)))
.build());
child1.addChildNode(child2);
List<DagNode> rootNodes = new ArrayList<>();
rootNodes.add(root);
return new WorkflowDag(rootNodes);
}
}

View File

@@ -0,0 +1,95 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.util.ArrayList;
import java.util.List;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode;
import org.apache.hudi.utilities.testutils.UtilitiesTestBase;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
public class TestDagUtils {
private static final String COW_DAG_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/complex-dag-cow.yaml";
@Test
public void testConvertDagToYaml() throws Exception {
ComplexDagGenerator dag = new ComplexDagGenerator();
String yaml = DagUtils.convertDagToYaml(dag.build());
System.out.println(yaml);
}
@Test
public void testConvertYamlToDag() throws Exception {
WorkflowDag dag = DagUtils.convertYamlToDag(UtilitiesTestBase.Helpers
.readFileFromAbsolutePath((System.getProperty("user.dir") + "/.." + COW_DAG_DOCKER_DEMO_RELATIVE_PATH)));
assertEquals(dag.getNodeList().size(), 1);
Assertions.assertEquals(((DagNode) dag.getNodeList().get(0)).getParentNodes().size(), 0);
assertEquals(((DagNode) dag.getNodeList().get(0)).getChildNodes().size(), 1);
DagNode firstChild = (DagNode) ((DagNode) dag.getNodeList().get(0)).getChildNodes().get(0);
assertEquals(firstChild.getParentNodes().size(), 1);
assertEquals(firstChild.getChildNodes().size(), 1);
assertEquals(((DagNode) firstChild.getChildNodes().get(0)).getChildNodes().size(), 1);
}
public static class ComplexDagGenerator implements WorkflowDagGenerator {
@Override
public WorkflowDag build() {
DagNode root = new InsertNode(Config.newBuilder()
.withNumRecordsToInsert(1000000)
.withNumInsertPartitions(1)
.withNumTimesToRepeat(2)
.withRecordSize(1000).build());
DagNode child1 = new InsertNode(Config.newBuilder()
.withNumRecordsToInsert(1000000)
.withNumInsertPartitions(1)
.withNumTimesToRepeat(2)
.withRecordSize(1000).build());
DagNode child2 = new InsertNode(Config.newBuilder()
.withNumRecordsToInsert(1000000)
.withNumInsertPartitions(1)
.withNumTimesToRepeat(2)
.withRecordSize(1000).build());
root.addChildNode(child1);
root.addChildNode(child2);
DagNode child3 = new InsertNode(Config.newBuilder()
.withNumRecordsToInsert(1000000)
.withNumInsertPartitions(1)
.withNumTimesToRepeat(2)
.withRecordSize(1000).build());
child2.addChildNode(child3);
List<DagNode> rootNodes = new ArrayList<>();
rootNodes.add(root);
return new WorkflowDag(rootNodes);
}
}
}

View File

@@ -0,0 +1,56 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.generator;
import static junit.framework.TestCase.assertEquals;
import org.apache.avro.Schema;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.utilities.testutils.UtilitiesTestBase;
import org.junit.jupiter.api.Test;
public class TestGenericRecordPayloadEstimator {
private static final String SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/source.avsc";
private static final String COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH =
"/docker/demo/config/test-suite/complex-source.avsc";
@Test
public void testSimpleSchemaSize() throws Exception {
Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers
.readFileFromAbsolutePath(System.getProperty("user.dir") + "/.." + SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH));
GenericRecordFullPayloadSizeEstimator estimator =
new GenericRecordFullPayloadSizeEstimator(schema);
Pair<Integer, Integer> estimateAndNumComplexFields = estimator.typeEstimateAndNumComplexFields();
assertEquals(estimateAndNumComplexFields.getRight().intValue(), 0);
assertEquals(estimateAndNumComplexFields.getLeft().intValue(), 156);
}
@Test
public void testComplexSchemaSize() throws Exception {
Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers.readFileFromAbsolutePath(
System.getProperty("user.dir") + "/.." + COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH));
GenericRecordFullPayloadSizeEstimator estimator =
new GenericRecordFullPayloadSizeEstimator(schema);
Pair<Integer, Integer> estimateAndNumComplexFields = estimator.typeEstimateAndNumComplexFields();
assertEquals(estimateAndNumComplexFields.getRight().intValue(), 1);
assertEquals(estimateAndNumComplexFields.getLeft().intValue(), 1278);
}
}

View File

@@ -0,0 +1,130 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.generator;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.IntStream;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.utilities.testutils.UtilitiesTestBase;
import org.junit.jupiter.api.Test;
public class TestGenericRecordPayloadGenerator {
private static final String SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/source.avsc";
private static final String COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH =
"/docker/demo/config/test-suite/complex-source.avsc";
@Test
public void testSimplePayload() throws Exception {
Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers
.readFileFromAbsolutePath(System.getProperty("user.dir") + "/.." + SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH));
GenericRecordFullPayloadGenerator payloadGenerator = new GenericRecordFullPayloadGenerator(schema);
GenericRecord record = payloadGenerator.getNewPayload();
// The generated payload should validate with the provided schema
payloadGenerator.validate(record);
}
@Test
public void testComplexPayload() throws IOException {
Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers
.readFileFromAbsolutePath(System.getProperty("user.dir") + "/.." +
COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH));
GenericRecordFullPayloadGenerator payloadGenerator = new GenericRecordFullPayloadGenerator(schema);
GenericRecord record = payloadGenerator.getNewPayload();
// The generated payload should validate with the provided schema
assertTrue(payloadGenerator.validate(record));
}
@Test
public void testComplexPartialPayload() throws IOException {
Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers
.readFileFromAbsolutePath(System.getProperty("user.dir") + "/.." +
COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH));
GenericRecordPartialPayloadGenerator payloadGenerator = new GenericRecordPartialPayloadGenerator(schema);
IntStream.range(0, 10).forEach(a -> {
GenericRecord record = payloadGenerator.getNewPayload();
// The generated payload should validate with the provided schema
assertTrue(payloadGenerator.validate(record));
});
}
@Test
public void testUpdatePayloadGenerator() throws IOException {
Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers
.readFileFromAbsolutePath(System.getProperty("user.dir") + "/.." + SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH));
GenericRecordFullPayloadGenerator payloadGenerator = new GenericRecordFullPayloadGenerator(schema);
List<String> insertRowKeys = new ArrayList<>();
List<String> updateRowKeys = new ArrayList<>();
List<Long> insertTimeStamps = new ArrayList<>();
List<Long> updateTimeStamps = new ArrayList<>();
List<GenericRecord> records = new ArrayList<>();
// Generate 10 new records
IntStream.range(0, 10).forEach(a -> {
GenericRecord record = payloadGenerator.getNewPayload();
records.add(record);
insertRowKeys.add(record.get("_row_key").toString());
insertTimeStamps.add((Long) record.get("timestamp"));
});
List<String> blacklistFields = Arrays.asList("_row_key");
records.stream().forEach(a -> {
// Generate 10 updated records
GenericRecord record = payloadGenerator.getUpdatePayload(a, blacklistFields);
updateRowKeys.add(record.get("_row_key").toString());
updateTimeStamps.add((Long) record.get("timestamp"));
});
// The row keys from insert payloads should match all the row keys from the update payloads
assertTrue(insertRowKeys.containsAll(updateRowKeys));
// The timestamp field for the insert payloads should not all match with the update payloads
assertFalse(insertTimeStamps.containsAll(updateTimeStamps));
}
@Test
public void testSimplePayloadWithLargeMinSize() throws Exception {
Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers
.readFileFromAbsolutePath(System.getProperty("user.dir") + "/.." + SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH));
int minPayloadSize = 1000;
GenericRecordFullPayloadGenerator payloadGenerator = new GenericRecordFullPayloadGenerator(schema,
minPayloadSize);
GenericRecord record = payloadGenerator.getNewPayload();
// The payload generated is less than minPayloadSize due to no collections present
assertTrue(HoodieAvroUtils.avroToBytes(record).length < minPayloadSize);
}
@Test
public void testComplexPayloadWithLargeMinSize() throws Exception {
Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers
.readFileFromAbsolutePath(System.getProperty("user.dir") + "/.." +
COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH));
int minPayloadSize = 10000;
GenericRecordFullPayloadGenerator payloadGenerator = new GenericRecordFullPayloadGenerator(
schema, minPayloadSize);
GenericRecord record = payloadGenerator.getNewPayload();
// The payload generated should be within 10% extra of the minPayloadSize
assertTrue(HoodieAvroUtils.avroToBytes(record).length < minPayloadSize + 0.1 * minPayloadSize);
}
}

View File

@@ -0,0 +1,224 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.job;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.util.UUID;
import java.util.stream.Stream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.DataSourceWriteOptions;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteJob;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteJob.HoodieTestSuiteConfig;
import org.apache.hudi.integ.testsuite.dag.ComplexDagGenerator;
import org.apache.hudi.integ.testsuite.dag.HiveSyncDagGenerator;
import org.apache.hudi.integ.testsuite.dag.HiveSyncDagGeneratorMOR;
import org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator;
import org.apache.hudi.integ.testsuite.reader.DeltaInputType;
import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode;
import org.apache.hudi.keygen.TimestampBasedKeyGenerator;
import org.apache.hudi.utilities.schema.FilebasedSchemaProvider;
import org.apache.hudi.utilities.sources.AvroDFSSource;
import org.apache.hudi.utilities.testutils.UtilitiesTestBase;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;
public class TestHoodieTestSuiteJob extends UtilitiesTestBase {
private static final String TEST_NAME_WITH_PARAMS = "[{index}] Test with useDeltaStreamer={0}, tableType={1}";
private static final String BASE_PROPERTIES_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/base"
+ ".properties";
private static final String SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/source.avsc";
private static final String TARGET_SCHEMA_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/target.avsc";
private static final String COW_DAG_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/complex-dag-cow.yaml";
private static final String MOR_DAG_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/complex-dag-mor.yaml";
public static Stream<Arguments> configParams() {
Object[][] data =
new Object[][] {{false, "COPY_ON_WRITE"}};
return Stream.of(data).map(Arguments::of);
}
@BeforeAll
public static void initClass() throws Exception {
UtilitiesTestBase.initClass();
// prepare the configs.
UtilitiesTestBase.Helpers.copyToDFSFromAbsolutePath(System.getProperty("user.dir") + "/.."
+ BASE_PROPERTIES_DOCKER_DEMO_RELATIVE_PATH, dfs, dfsBasePath + "/base.properties");
UtilitiesTestBase.Helpers.copyToDFSFromAbsolutePath(System.getProperty("user.dir") + "/.."
+ SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH, dfs, dfsBasePath + "/source.avsc");
UtilitiesTestBase.Helpers.copyToDFSFromAbsolutePath(System.getProperty("user.dir") + "/.."
+ TARGET_SCHEMA_DOCKER_DEMO_RELATIVE_PATH, dfs, dfsBasePath + "/target.avsc");
UtilitiesTestBase.Helpers.copyToDFSFromAbsolutePath(System.getProperty("user.dir") + "/.."
+ COW_DAG_DOCKER_DEMO_RELATIVE_PATH, dfs, dfsBasePath + "/complex-dag-cow.yaml");
UtilitiesTestBase.Helpers.copyToDFSFromAbsolutePath(System.getProperty("user.dir") + "/.."
+ MOR_DAG_DOCKER_DEMO_RELATIVE_PATH, dfs, dfsBasePath + "/complex-dag-mor.yaml");
TypedProperties props = new TypedProperties();
props.setProperty("hoodie.datasource.write.recordkey.field", "_row_key");
props.setProperty("hoodie.datasource.write.partitionpath.field", "timestamp");
props.setProperty("hoodie.deltastreamer.keygen.timebased.timestamp.type", "UNIX_TIMESTAMP");
props.setProperty("hoodie.deltastreamer.keygen.timebased.output.dateformat", "yyyy/MM/dd");
props.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/source.avsc");
props.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/source.avsc");
props.setProperty("hoodie.deltastreamer.source.dfs.root", dfsBasePath + "/input");
props.setProperty("hoodie.datasource.hive_sync.assume_date_partitioning", "true");
props.setProperty("hoodie.datasource.write.keytranslator.class", "org.apache.hudi"
+ ".DayBasedPartitionPathKeyTranslator");
props.setProperty("hoodie.compact.inline.max.delta.commits", "3");
props.setProperty("hoodie.parquet.max.file.size", "1024000");
props.setProperty("hoodie.compact.inline.max.delta.commits", "0");
// Hive Configs
props.setProperty(DataSourceWriteOptions.HIVE_URL_OPT_KEY(), "jdbc:hive2://127.0.0.1:9999/");
props.setProperty(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY(), "testdb1");
props.setProperty(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY(), "table1");
props.setProperty(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "datestr");
props.setProperty(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY(), TimestampBasedKeyGenerator.class.getName());
UtilitiesTestBase.Helpers.savePropsToDFS(props, dfs, dfsBasePath + "/test-source"
+ ".properties");
// Properties used for the delta-streamer which incrementally pulls from upstream DFS Avro source and
// writes to downstream hudi table
TypedProperties downstreamProps = new TypedProperties();
downstreamProps.setProperty("include", "base.properties");
downstreamProps.setProperty("hoodie.datasource.write.recordkey.field", "_row_key");
downstreamProps.setProperty("hoodie.datasource.write.partitionpath.field", "timestamp");
// Source schema is the target schema of upstream table
downstreamProps.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/source.avsc");
downstreamProps.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/source.avsc");
UtilitiesTestBase.Helpers.savePropsToDFS(downstreamProps, dfs,
dfsBasePath + "/test-downstream-source.properties");
// these tests cause a lot of log verbosity from spark, turning it down
Logger.getLogger("org.apache.spark").setLevel(Level.WARN);
}
@AfterAll
public static void cleanupClass() {
UtilitiesTestBase.cleanupClass();
}
@BeforeEach
public void setup() throws Exception {
super.setup();
}
@AfterEach
public void teardown() throws Exception {
super.teardown();
}
// Tests in this class add to the test build time significantly. Since this is a Integration Test (end to end), we
// would like to run this as a nightly build which is a TODO.
// TODO : Clean up input / result paths after each test
@MethodSource("configParams")
public void testDagWithInsertUpsertAndValidate(boolean useDeltaStreamer, String tableType) throws Exception {
dfs.delete(new Path(dfsBasePath + "/input"), true);
dfs.delete(new Path(dfsBasePath + "/result"), true);
String inputBasePath = dfsBasePath + "/input/" + UUID.randomUUID().toString();
String outputBasePath = dfsBasePath + "/result/" + UUID.randomUUID().toString();
HoodieTestSuiteConfig cfg = makeConfig(inputBasePath, outputBasePath, useDeltaStreamer, tableType);
cfg.workloadDagGenerator = ComplexDagGenerator.class.getName();
HoodieTestSuiteJob hoodieTestSuiteJob = new HoodieTestSuiteJob(cfg, jsc);
hoodieTestSuiteJob.runTestSuite();
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(new Configuration(), cfg.targetBasePath);
assertEquals(metaClient.getActiveTimeline().getCommitsTimeline().getInstants().count(), 2);
}
@MethodSource("configParams")
public void testHiveSync(boolean useDeltaStreamer, String tableType) throws Exception {
dfs.delete(new Path(dfsBasePath + "/input"), true);
dfs.delete(new Path(dfsBasePath + "/result"), true);
String inputBasePath = dfsBasePath + "/input";
String outputBasePath = dfsBasePath + "/result";
HoodieTestSuiteConfig cfg = makeConfig(inputBasePath, outputBasePath, useDeltaStreamer, tableType);
if (tableType == HoodieTableType.COPY_ON_WRITE.name()) {
cfg.workloadDagGenerator = HiveSyncDagGenerator.class.getName();
} else {
cfg.workloadDagGenerator = HiveSyncDagGeneratorMOR.class.getName();
}
HoodieTestSuiteJob hoodieTestSuiteJob = new HoodieTestSuiteJob(cfg, jsc);
hoodieTestSuiteJob.runTestSuite();
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(new Configuration(), cfg.targetBasePath);
assertEquals(metaClient.getActiveTimeline().getCommitsTimeline().getInstants().count(), 1);
}
@MethodSource("configParams")
public void testCOWFullDagFromYaml(boolean useDeltaStreamer, String tableType) throws Exception {
dfs.delete(new Path(dfsBasePath + "/input"), true);
dfs.delete(new Path(dfsBasePath + "/result"), true);
String inputBasePath = dfsBasePath + "/input";
String outputBasePath = dfsBasePath + "/result";
HoodieTestSuiteConfig cfg = makeConfig(inputBasePath, outputBasePath, useDeltaStreamer, HoodieTableType
.COPY_ON_WRITE.name());
cfg.workloadYamlPath = dfsBasePath + "/complex-dag-cow.yaml";
HoodieTestSuiteJob hoodieTestSuiteJob = new HoodieTestSuiteJob(cfg, jsc);
hoodieTestSuiteJob.runTestSuite();
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(new Configuration(), cfg.targetBasePath);
assertEquals(metaClient.getActiveTimeline().getCommitsTimeline().getInstants().count(), 5);
}
@MethodSource("configParams")
public void testMORFullDagFromYaml(boolean useDeltaStreamer, String tableType) throws Exception {
dfs.delete(new Path(dfsBasePath + "/input"), true);
dfs.delete(new Path(dfsBasePath + "/result"), true);
String inputBasePath = dfsBasePath + "/input";
String outputBasePath = dfsBasePath + "/result";
HoodieTestSuiteConfig cfg = makeConfig(inputBasePath, outputBasePath, useDeltaStreamer, HoodieTableType
.MERGE_ON_READ.name());
cfg.workloadYamlPath = dfsBasePath + "/complex-dag-mor.yaml";
HoodieTestSuiteJob hoodieTestSuiteJob = new HoodieTestSuiteJob(cfg, jsc);
hoodieTestSuiteJob.runTestSuite();
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(new Configuration(), cfg.targetBasePath);
assertEquals(metaClient.getActiveTimeline().getCommitsTimeline().getInstants().count(), 7);
}
protected HoodieTestSuiteConfig makeConfig(String inputBasePath, String outputBasePath, boolean useDeltaStream,
String tableType) {
HoodieTestSuiteConfig cfg = new HoodieTestSuiteConfig();
cfg.targetBasePath = outputBasePath;
cfg.inputBasePath = inputBasePath;
cfg.targetTableName = "table1";
cfg.tableType = tableType;
cfg.sourceClassName = AvroDFSSource.class.getName();
cfg.sourceOrderingField = "timestamp";
cfg.propsFilePath = dfsBasePath + "/test-source.properties";
cfg.outputTypeName = DeltaOutputMode.DFS.name();
cfg.inputFormatName = DeltaInputType.AVRO.name();
cfg.limitFileSize = 1024 * 1024L;
cfg.sourceLimit = 20000000;
cfg.workloadDagGenerator = WorkflowDagGenerator.class.getName();
cfg.schemaProviderClassName = FilebasedSchemaProvider.class.getName();
cfg.useDeltaStreamer = useDeltaStream;
return cfg;
}
}

View File

@@ -0,0 +1,74 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.reader;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.utils.TestUtils;
import org.apache.hudi.integ.testsuite.utils.TestUtils;
import org.apache.hudi.utilities.testutils.UtilitiesTestBase;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
public class TestDFSAvroDeltaInputReader extends UtilitiesTestBase {
@BeforeAll
public static void initClass() throws Exception {
UtilitiesTestBase.initClass();
}
@AfterAll
public static void cleanupClass() {
UtilitiesTestBase.cleanupClass();
}
@BeforeEach
public void setup() throws Exception {
super.setup();
}
@Test
@Disabled
public void testDFSSinkReader() throws IOException {
FileSystem fs = FSUtils.getFs(dfsBasePath, new Configuration());
// Create 10 avro files with 10 records each
TestUtils.createAvroFiles(jsc, sparkSession, dfsBasePath, 10, 10);
FileStatus[] statuses = fs.globStatus(new Path(dfsBasePath + "/*/*.avro"));
DFSAvroDeltaInputReader reader =
new DFSAvroDeltaInputReader(sparkSession, TestUtils.getSchema().toString(), dfsBasePath, Option.empty(),
Option.empty());
assertEquals(reader.analyzeSingleFile(statuses[0].getPath().toString()), 5);
assertEquals(reader.read(100).count(), 100);
assertEquals(reader.read(1000).count(), 100);
assertEquals(reader.read(10).count(), 10);
assertTrue(reader.read(11).count() > 11);
}
}

View File

@@ -0,0 +1,119 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.reader;
import static junit.framework.TestCase.assertEquals;
import static junit.framework.TestCase.assertTrue;
import java.util.HashSet;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.client.HoodieWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
import org.apache.hudi.common.testutils.HoodieTestUtils;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.utilities.schema.FilebasedSchemaProvider;
import org.apache.hudi.utilities.testutils.UtilitiesTestBase;
import org.apache.spark.api.java.JavaRDD;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
public class TestDFSHoodieDatasetInputReader extends UtilitiesTestBase {
@BeforeAll
public static void initClass() throws Exception {
UtilitiesTestBase.initClass();
}
@AfterAll
public static void cleanupClass() {
UtilitiesTestBase.cleanupClass();
}
@BeforeEach
public void setup() throws Exception {
super.setup();
HoodieTestUtils.init(jsc.hadoopConfiguration(), dfsBasePath);
}
@AfterEach
public void teardown() throws Exception {
super.teardown();
}
@Test
public void testSimpleHoodieDatasetReader() throws Exception {
HoodieWriteConfig config = makeHoodieClientConfig();
HoodieWriteClient client = new HoodieWriteClient(jsc, config);
String commitTime = client.startCommit();
HoodieTestDataGenerator generator = new HoodieTestDataGenerator();
// Insert 100 records across 3 partitions
List<HoodieRecord> inserts = generator.generateInserts(commitTime, 100);
JavaRDD<WriteStatus> writeStatuses = client.upsert(jsc.parallelize(inserts), commitTime);
writeStatuses.count();
DFSHoodieDatasetInputReader reader = new DFSHoodieDatasetInputReader(jsc, config.getBasePath(),
HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema())).toString());
// Try to read 100 records for the same partition path and same file ID
JavaRDD<GenericRecord> records = reader.read(1, 1, 100L);
assertTrue(records.count() <= 100);
assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).collect()).size(),
1);
assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.FILENAME_METADATA_FIELD)).collect()).size(),
1);
// Try to read 100 records for 3 partition paths and 3 different file ids
records = reader.read(3, 3, 100L);
assertTrue(records.count() <= 100);
assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).collect()).size(),
3);
assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.FILENAME_METADATA_FIELD)).collect()).size(),
3);
// Try to read 100 records for 3 partition paths and 50% records from each file
records = reader.read(3, 3, 0.5);
assertTrue(records.count() <= 100);
assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).collect()).size(),
3);
assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.FILENAME_METADATA_FIELD)).collect()).size(),
3);
}
private HoodieWriteConfig makeHoodieClientConfig() throws Exception {
return makeHoodieClientConfigBuilder().build();
}
private HoodieWriteConfig.Builder makeHoodieClientConfigBuilder() throws Exception {
// Prepare the AvroParquetIO
return HoodieWriteConfig.newBuilder().withPath(dfsBasePath)
.withParallelism(2, 2)
.withSchema(HoodieTestDataGenerator
.TRIP_EXAMPLE_SCHEMA);
}
}

View File

@@ -0,0 +1,63 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.utils;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.AvroConversionUtils;
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
import org.apache.hudi.utilities.schema.RowBasedSchemaProvider;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
public class TestUtils {
/**
* Create a RDD of generic records for testing purposes.
*/
public static JavaRDD<GenericRecord> makeRDD(JavaSparkContext jsc, int numRecords) {
return jsc.parallelize(generateGenericRecords(numRecords));
}
/**
* Generate generic records.
*/
public static List<GenericRecord> generateGenericRecords(int numRecords) {
HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
return dataGenerator.generateGenericRecords(numRecords);
}
public static void createAvroFiles(JavaSparkContext jsc, SparkSession sparkSession, String basePath, int numFiles,
int numRecordsPerFile) {
Schema schema = HoodieTestDataGenerator.AVRO_SCHEMA;
for (int i = 0; i < numFiles; i++) {
JavaRDD<GenericRecord> rdd = makeRDD(jsc, numRecordsPerFile);
AvroConversionUtils.createDataFrame(rdd.rdd(), schema.toString(), sparkSession).write()
.format("avro").option("recordName", RowBasedSchemaProvider.HOODIE_RECORD_STRUCT_NAME)
.option("recordNamespace", RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE).save(basePath + "/" + i);
}
}
public static Schema getSchema() {
return HoodieTestDataGenerator.AVRO_SCHEMA;
}
}

View File

@@ -15,8 +15,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
###
log4j.rootLogger=WARN, CONSOLE
log4j.logger.org.apache.hudi=DEBUG
log4j.rootLogger=ERROR, CONSOLE
log4j.logger.org.apache.hudi=ERROR
log4j.category.org.apache.spark=ERROR
# CONSOLE is set to be a ConsoleAppender.
log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender
@@ -25,5 +26,5 @@ log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout
log4j.appender.CONSOLE.layout.ConversionPattern=[%-5p] %d %c %x - %m%n
log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter
log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true
log4j.appender.CONSOLE.filter.a.LevelMin=WARN
log4j.appender.CONSOLE.filter.a.LevelMin=ERROR
log4j.appender.CONSOLE.filter.a.LevelMax=FATAL

View File

@@ -95,7 +95,7 @@ public class QuickstartUtils {
}
public static GenericRecord generateGenericRecord(String rowKey, String riderName, String driverName,
double timestamp) {
double timestamp) {
GenericRecord rec = new GenericData.Record(avroSchema);
rec.put("uuid", rowKey);
rec.put("ts", timestamp);
@@ -221,4 +221,4 @@ public class QuickstartUtils {
demoConfigs.put("hoodie.upsert.shuffle.parallelism", "2");
return demoConfigs;
}
}
}

View File

@@ -37,6 +37,7 @@ public class ComplexKeyGenerator extends KeyGenerator {
private static final String DEFAULT_PARTITION_PATH = "default";
private static final String DEFAULT_PARTITION_PATH_SEPARATOR = "/";
public static final String DEFAULT_RECORD_KEY_SEPARATOR = ":";
protected static final String NULL_RECORDKEY_PLACEHOLDER = "__null__";
protected static final String EMPTY_RECORDKEY_PLACEHOLDER = "__empty__";
@@ -98,8 +99,16 @@ public class ComplexKeyGenerator extends KeyGenerator {
recordKey.deleteCharAt(recordKey.length() - 1);
if (keyIsNullEmpty) {
throw new HoodieKeyException("recordKey values: \"" + recordKey + "\" for fields: "
+ recordKeyFields.toString() + " cannot be entirely null or empty.");
+ recordKeyFields.toString() + " cannot be entirely null or empty.");
}
return recordKey.toString();
}
}
public List<String> getRecordKeyFields() {
return recordKeyFields;
}
public List<String> getPartitionPathFields() {
return partitionPathFields;
}
}

View File

@@ -40,4 +40,4 @@ public abstract class KeyGenerator implements Serializable {
* Generate a Hoodie Key out of provided generic record.
*/
public abstract HoodieKey getKey(GenericRecord record);
}
}

View File

@@ -48,4 +48,4 @@ public class NonpartitionedKeyGenerator extends KeyGenerator {
}
return new HoodieKey(recordKey, EMPTY_PARTITION);
}
}
}

View File

@@ -73,4 +73,12 @@ public class SimpleKeyGenerator extends KeyGenerator {
}
return recordKey;
}
}
public String getRecordKeyField() {
return recordKeyField;
}
public String getPartitionPathField() {
return partitionPathField;
}
}

View File

@@ -149,7 +149,6 @@ public class TimestampBasedKeyGenerator extends SimpleKeyGenerator {
"Unexpected type for partition field: " + partitionVal.getClass().getName());
}
DateTime timestamp = new DateTime(timeMs, outputDateTimeZone);
return hiveStylePartitioning ? partitionPathField + "=" + timestamp.toString(partitionFormatter)
: timestamp.toString(partitionFormatter);
} catch (Exception e) {

View File

@@ -0,0 +1,67 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import static junit.framework.TestCase.assertEquals;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.DataSourceWriteOptions;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
import org.apache.hudi.keygen.ComplexKeyGenerator;
import org.junit.jupiter.api.Test;
public class TestComplexKeyGenerator {
@Test
public void testSingleValueKeyGenerator() {
TypedProperties properties = new TypedProperties();
properties.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key");
properties.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "timestamp");
ComplexKeyGenerator compositeKeyGenerator = new ComplexKeyGenerator(properties);
assertEquals(compositeKeyGenerator.getRecordKeyFields().size(), 1);
assertEquals(compositeKeyGenerator.getPartitionPathFields().size(), 1);
HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
GenericRecord record = dataGenerator.generateGenericRecords(1).get(0);
String rowKey = record.get("_row_key").toString();
String partitionPath = record.get("timestamp").toString();
HoodieKey hoodieKey = compositeKeyGenerator.getKey(record);
assertEquals("_row_key:" + rowKey, hoodieKey.getRecordKey());
assertEquals(partitionPath, hoodieKey.getPartitionPath());
}
@Test
public void testMultipleValueKeyGenerator() {
TypedProperties properties = new TypedProperties();
properties.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key,timestamp");
properties.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "rider,driver");
ComplexKeyGenerator compositeKeyGenerator = new ComplexKeyGenerator(properties);
assertEquals(compositeKeyGenerator.getRecordKeyFields().size(), 2);
assertEquals(compositeKeyGenerator.getPartitionPathFields().size(), 2);
HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
GenericRecord record = dataGenerator.generateGenericRecords(1).get(0);
String rowKey =
"_row_key" + ComplexKeyGenerator.DEFAULT_RECORD_KEY_SEPARATOR + record.get("_row_key").toString() + ","
+ "timestamp" + ComplexKeyGenerator.DEFAULT_RECORD_KEY_SEPARATOR + record.get("timestamp").toString();
String partitionPath = record.get("rider").toString() + "/" + record.get("driver").toString();
HoodieKey hoodieKey = compositeKeyGenerator.getKey(record);
assertEquals(rowKey, hoodieKey.getRecordKey());
assertEquals(partitionPath, hoodieKey.getPartitionPath());
}
}

View File

@@ -131,6 +131,12 @@
<artifactId>log4j</artifactId>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>${slf4j.version}</version>
</dependency>
<!-- Fasterxml -->
<dependency>
<groupId>com.fasterxml.jackson.module</groupId>
@@ -163,6 +169,10 @@
<groupId>javax.servlet</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</exclusion>
</exclusions>
</dependency>

View File

@@ -18,6 +18,7 @@
package org.apache.hudi.utilities;
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.AvroConversionUtils;
import org.apache.hudi.client.HoodieWriteClient;
import org.apache.hudi.client.WriteStatus;
@@ -35,6 +36,7 @@ import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.utilities.checkpointing.InitialCheckPointProvider;
import org.apache.hudi.utilities.schema.SchemaProvider;
import org.apache.hudi.utilities.sources.Source;
import org.apache.hudi.utilities.sources.helpers.DFSPathSelector;
import org.apache.hudi.utilities.transform.ChainedTransformer;
import org.apache.hudi.utilities.transform.Transformer;
@@ -348,4 +350,15 @@ public class UtilHelpers {
throw new HoodieException(String.format("%s table does not exists!", table));
}
}
public static DFSPathSelector createSourceSelector(String sourceSelectorClass, TypedProperties props,
Configuration conf) throws IOException {
try {
return (DFSPathSelector) ReflectionUtils.loadClass(sourceSelectorClass,
new Class<?>[]{TypedProperties.class, Configuration.class},
props, conf);
} catch (Throwable e) {
throw new IOException("Could not load source selector class " + sourceSelectorClass, e);
}
}
}

View File

@@ -40,6 +40,7 @@ import org.apache.hudi.hive.HiveSyncTool;
import org.apache.hudi.keygen.KeyGenerator;
import org.apache.hudi.utilities.UtilHelpers;
import org.apache.hudi.exception.HoodieDeltaStreamerException;
import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer.Config;
import org.apache.hudi.utilities.schema.DelegatingSchemaProvider;
import org.apache.hudi.utilities.schema.RowBasedSchemaProvider;
import org.apache.hudi.utilities.schema.SchemaProvider;
@@ -212,8 +213,8 @@ public class DeltaSync implements Serializable {
/**
* Run one round of delta sync and return new compaction instant if one got scheduled.
*/
public Option<String> syncOnce() throws Exception {
Option<String> scheduledCompaction = Option.empty();
public Pair<Option<String>, JavaRDD<WriteStatus>> syncOnce() throws Exception {
Pair<Option<String>, JavaRDD<WriteStatus>> result = null;
HoodieDeltaStreamerMetrics metrics = new HoodieDeltaStreamerMetrics(getHoodieClientConfig(schemaProvider));
Timer.Context overallTimerContext = metrics.getOverallTimerContext();
@@ -231,13 +232,13 @@ public class DeltaSync implements Serializable {
setupWriteClient();
}
scheduledCompaction = writeToSink(srcRecordsWithCkpt.getRight().getRight(),
result = writeToSink(srcRecordsWithCkpt.getRight().getRight(),
srcRecordsWithCkpt.getRight().getLeft(), metrics, overallTimerContext);
}
// Clear persistent RDDs
jssc.getPersistentRDDs().values().forEach(JavaRDD::unpersist);
return scheduledCompaction;
return result;
}
/**
@@ -248,7 +249,7 @@ public class DeltaSync implements Serializable {
* of schemaProvider, checkpointStr and hoodieRecord
* @throws Exception in case of any Exception
*/
private Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> readFromSource(
public Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> readFromSource(
Option<HoodieTimeline> commitTimelineOpt) throws Exception {
// Retrieve the previous round checkpoints, if any
Option<String> resumeCheckpointStr = Option.empty();
@@ -355,8 +356,8 @@ public class DeltaSync implements Serializable {
* @param overallTimerContext Timer Context
* @return Option Compaction instant if one is scheduled
*/
private Option<String> writeToSink(JavaRDD<HoodieRecord> records, String checkpointStr,
HoodieDeltaStreamerMetrics metrics, Timer.Context overallTimerContext) throws Exception {
private Pair<Option<String>, JavaRDD<WriteStatus>> writeToSink(JavaRDD<HoodieRecord> records, String checkpointStr,
HoodieDeltaStreamerMetrics metrics, Timer.Context overallTimerContext) throws Exception {
Option<String> scheduledCompactionInstant = Option.empty();
// filter dupes if needed
@@ -413,7 +414,7 @@ public class DeltaSync implements Serializable {
if (!isEmpty) {
// Sync to hive if enabled
Timer.Context hiveSyncContext = metrics.getHiveSyncTimerContext();
syncHive();
syncHiveIfNeeded();
hiveSyncTimeMs = hiveSyncContext != null ? hiveSyncContext.stop() : 0;
}
} else {
@@ -438,7 +439,7 @@ public class DeltaSync implements Serializable {
// Send DeltaStreamer Metrics
metrics.updateDeltaStreamerMetrics(overallTimeMs, hiveSyncTimeMs);
return scheduledCompactionInstant;
return Pair.of(scheduledCompactionInstant, writeStatusRDD);
}
/**
@@ -472,15 +473,27 @@ public class DeltaSync implements Serializable {
/**
* Sync to Hive.
*/
private void syncHive() {
public void syncHiveIfNeeded() throws ClassNotFoundException {
if (cfg.enableHiveSync) {
HiveSyncConfig hiveSyncConfig = DataSourceUtils.buildHiveSyncConfig(props, cfg.targetBasePath, cfg.baseFileFormat);
LOG.info("Syncing target hoodie table with hive table(" + hiveSyncConfig.tableName + "). Hive metastore URL :"
+ hiveSyncConfig.jdbcUrl + ", basePath :" + cfg.targetBasePath);
new HiveSyncTool(hiveSyncConfig, new HiveConf(conf, HiveConf.class), fs).syncHoodieTable();
syncHive();
}
}
public void syncHive() {
HiveSyncConfig hiveSyncConfig = DataSourceUtils.buildHiveSyncConfig(props, cfg.targetBasePath, cfg.baseFileFormat);
LOG.info("Syncing target hoodie table with hive table(" + hiveSyncConfig.tableName + "). Hive metastore URL :"
+ hiveSyncConfig.jdbcUrl + ", basePath :" + cfg.targetBasePath);
HiveConf hiveConf = new HiveConf(conf, HiveConf.class);
LOG.info("Hive Conf => " + hiveConf.getAllProperties().toString());
LOG.info("Hive Sync Conf => " + hiveSyncConfig.toString());
new HiveSyncTool(hiveSyncConfig, hiveConf, fs).syncHoodieTable();
}
public void syncHive(HiveConf conf) {
this.conf = conf;
syncHive();
}
/**
* Note that depending on configs and source-type, schemaProvider could either be eagerly or lazily created.
* SchemaProvider creation is a precursor to HoodieWriteClient and AsyncCompactor creation. This method takes care of
@@ -558,4 +571,20 @@ public class DeltaSync implements Serializable {
writeClient = null;
}
}
public FileSystem getFs() {
return fs;
}
public TypedProperties getProps() {
return props;
}
public Config getCfg() {
return cfg;
}
public Option<HoodieTimeline> getCommitTimelineOpt() {
return commitTimelineOpt;
}
}

View File

@@ -18,8 +18,9 @@
package org.apache.hudi.utilities.deltastreamer;
import org.apache.hudi.client.HoodieWriteClient;
import org.apache.hudi.async.AbstractAsyncService;
import org.apache.hudi.client.HoodieWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieTableType;
@@ -49,6 +50,7 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
@@ -83,9 +85,9 @@ public class HoodieDeltaStreamer implements Serializable {
public static String CHECKPOINT_KEY = "deltastreamer.checkpoint.key";
private final transient Config cfg;
protected final transient Config cfg;
private transient DeltaSyncService deltaSyncService;
protected transient DeltaSyncService deltaSyncService;
public HoodieDeltaStreamer(Config cfg, JavaSparkContext jssc) throws IOException {
this(cfg, jssc, FSUtils.getFs(cfg.targetBasePath, jssc.hadoopConfiguration()),
@@ -435,11 +437,11 @@ public class HoodieDeltaStreamer implements Serializable {
while (!isShutdownRequested()) {
try {
long start = System.currentTimeMillis();
Option<String> scheduledCompactionInstant = deltaSync.syncOnce();
if (scheduledCompactionInstant.isPresent()) {
LOG.info("Enqueuing new pending compaction instant (" + scheduledCompactionInstant + ")");
Pair<Option<String>, JavaRDD<WriteStatus>> scheduledCompactionInstantAndRDD = deltaSync.syncOnce();
if (scheduledCompactionInstantAndRDD.getLeft().isPresent()) {
LOG.info("Enqueuing new pending compaction instant (" + scheduledCompactionInstantAndRDD.getLeft() + ")");
asyncCompactService.enqueuePendingCompaction(new HoodieInstant(State.REQUESTED,
HoodieTimeline.COMPACTION_ACTION, scheduledCompactionInstant.get()));
HoodieTimeline.COMPACTION_ACTION, scheduledCompactionInstantAndRDD.getLeft().get()));
asyncCompactService.waitTillPendingCompactionsReducesTo(cfg.maxPendingCompactions);
}
long toSleepMs = cfg.minSyncIntervalSeconds * 1000 - (System.currentTimeMillis() - start);
@@ -623,4 +625,8 @@ public class HoodieDeltaStreamer implements Serializable {
}, executor)).toArray(CompletableFuture[]::new)), executor);
}
}
public DeltaSyncService getDeltaSyncService() {
return deltaSyncService;
}
}

View File

@@ -23,15 +23,17 @@ import org.apache.hudi.metrics.Metrics;
import com.codahale.metrics.Timer;
public class HoodieDeltaStreamerMetrics {
import java.io.Serializable;
public class HoodieDeltaStreamerMetrics implements Serializable {
private HoodieWriteConfig config;
private String tableName;
public String overallTimerName = null;
public String hiveSyncTimerName = null;
private Timer overallTimer = null;
public Timer hiveSyncTimer = null;
private transient Timer overallTimer = null;
public transient Timer hiveSyncTimer = null;
public HoodieDeltaStreamerMetrics(HoodieWriteConfig config) {
this.config = config;

View File

@@ -71,11 +71,11 @@ public final class SourceFormatAdapter {
// pass in the schema for the Row-to-Avro conversion
// to avoid nullability mismatch between Avro schema and Row schema
? AvroConversionUtils.createRdd(
rdd, r.getSchemaProvider().getSourceSchema(),
HOODIE_RECORD_STRUCT_NAME, HOODIE_RECORD_NAMESPACE).toJavaRDD()
rdd, r.getSchemaProvider().getSourceSchema(),
HOODIE_RECORD_STRUCT_NAME, HOODIE_RECORD_NAMESPACE).toJavaRDD()
: AvroConversionUtils.createRdd(
rdd, HOODIE_RECORD_STRUCT_NAME, HOODIE_RECORD_NAMESPACE).toJavaRDD()
))
))
.orElse(null)), r.getCheckpointForNextBatch(), r.getSchemaProvider());
}
default:
@@ -116,4 +116,4 @@ public final class SourceFormatAdapter {
throw new IllegalArgumentException("Unknown source type (" + source.getSourceType() + ")");
}
}
}
}

View File

@@ -21,6 +21,7 @@ package org.apache.hudi.utilities.sources;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.utilities.UtilHelpers;
import org.apache.hudi.utilities.schema.SchemaProvider;
import org.apache.hudi.utilities.sources.helpers.DFSPathSelector;
@@ -33,6 +34,8 @@ import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import java.io.IOException;
/**
* DFS Source that reads avro data.
*/
@@ -41,9 +44,11 @@ public class AvroDFSSource extends AvroSource {
private final DFSPathSelector pathSelector;
public AvroDFSSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession,
SchemaProvider schemaProvider) {
SchemaProvider schemaProvider) throws IOException {
super(props, sparkContext, sparkSession, schemaProvider);
this.pathSelector = new DFSPathSelector(props, sparkContext.hadoopConfiguration());
this.pathSelector = UtilHelpers
.createSourceSelector(DFSPathSelector.class.getName(), props, sparkContext
.hadoopConfiguration());
}
@Override

View File

@@ -32,6 +32,8 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import java.io.IOException;
import java.util.ArrayList;
@@ -42,18 +44,20 @@ import java.util.stream.Collectors;
public class DFSPathSelector {
protected static volatile Logger log = LogManager.getLogger(DFSPathSelector.class);
/**
* Configs supported.
*/
static class Config {
public static class Config {
private static final String ROOT_INPUT_PATH_PROP = "hoodie.deltastreamer.source.dfs.root";
public static final String ROOT_INPUT_PATH_PROP = "hoodie.deltastreamer.source.dfs.root";
}
private static final List<String> IGNORE_FILEPREFIX_LIST = Arrays.asList(".", "_");
protected static final List<String> IGNORE_FILEPREFIX_LIST = Arrays.asList(".", "_");
private final transient FileSystem fs;
private final TypedProperties props;
protected final transient FileSystem fs;
protected final TypedProperties props;
public DFSPathSelector(TypedProperties props, Configuration hadoopConf) {
DataSourceUtils.checkRequiredProperties(props, Arrays.asList(Config.ROOT_INPUT_PATH_PROP));
@@ -66,6 +70,7 @@ public class DFSPathSelector {
try {
// obtain all eligible files under root folder.
log.info("Root path => " + props.getString(Config.ROOT_INPUT_PATH_PROP) + " source limit => " + sourceLimit);
List<FileStatus> eligibleFiles = new ArrayList<>();
RemoteIterator<LocatedFileStatus> fitr =
fs.listFiles(new Path(props.getString(Config.ROOT_INPUT_PATH_PROP)), true);
@@ -79,7 +84,6 @@ public class DFSPathSelector {
}
// sort them by modification time.
eligibleFiles.sort(Comparator.comparingLong(FileStatus::getModificationTime));
// Filter based on checkpoint & input size, if needed
long currentBytes = 0;
long maxModificationTime = Long.MIN_VALUE;

View File

@@ -42,7 +42,7 @@ public class TestCsvDFSSource extends AbstractDFSSourceTestBase {
this.fileSuffix = ".json";
this.useFlattenedSchema = true;
this.schemaProvider = new FilebasedSchemaProvider(
Helpers.setupSchemaOnDFS("source-flattened.avsc"), jsc);
Helpers.setupSchemaOnDFS("delta-streamer-config", "source-flattened.avsc"), jsc);
}
@Override

View File

@@ -18,6 +18,7 @@
package org.apache.hudi.utilities.testutils;
import java.io.FileInputStream;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieTableType;
@@ -189,6 +190,17 @@ public class UtilitiesTestBase {
return sb.toString();
}
public static String readFileFromAbsolutePath(String absolutePathForResource) throws IOException {
BufferedReader reader =
new BufferedReader(new InputStreamReader(new FileInputStream(absolutePathForResource)));
StringBuffer sb = new StringBuffer();
String line;
while ((line = reader.readLine()) != null) {
sb.append(line + "\n");
}
return sb.toString();
}
public static void copyToDFS(String testResourcePath, FileSystem fs, String targetPath) throws IOException {
PrintStream os = new PrintStream(fs.create(new Path(targetPath), true));
os.print(readFile(testResourcePath));
@@ -196,6 +208,14 @@ public class UtilitiesTestBase {
os.close();
}
public static void copyToDFSFromAbsolutePath(String absolutePathForResource, FileSystem fs, String targetPath)
throws IOException {
PrintStream os = new PrintStream(fs.create(new Path(targetPath), true));
os.print(readFileFromAbsolutePath(absolutePathForResource));
os.flush();
os.close();
}
public static void savePropsToDFS(TypedProperties props, FileSystem fs, String targetPath) throws IOException {
String[] lines = props.keySet().stream().map(k -> String.format("%s=%s", k, props.get(k))).toArray(String[]::new);
saveStringsToDFS(lines, fs, targetPath);
@@ -258,11 +278,18 @@ public class UtilitiesTestBase {
}
public static TypedProperties setupSchemaOnDFS() throws IOException {
return setupSchemaOnDFS("source.avsc");
return setupSchemaOnDFS("delta-streamer-config", "source.avsc");
}
public static TypedProperties setupSchemaOnDFS(String filename) throws IOException {
UtilitiesTestBase.Helpers.copyToDFS("delta-streamer-config/" + filename, dfs, dfsBasePath + "/" + filename);
public static TypedProperties setupSchemaOnDFS(String scope, String filename) throws IOException {
UtilitiesTestBase.Helpers.copyToDFS(scope + "/" + filename, dfs, dfsBasePath + "/" + filename);
TypedProperties props = new TypedProperties();
props.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/" + filename);
return props;
}
public static TypedProperties setupSchemaOnDFSWithAbsoluteScope(String scope, String filename) throws IOException {
UtilitiesTestBase.Helpers.copyToDFSFromAbsolutePath(scope + "/" + filename, dfs, dfsBasePath + "/" + filename);
TypedProperties props = new TypedProperties();
props.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/" + filename);
return props;
@@ -278,7 +305,7 @@ public class UtilitiesTestBase {
}
public static List<GenericRecord> toGenericRecords(List<HoodieRecord> hoodieRecords) {
List<GenericRecord> records = new ArrayList<GenericRecord>();
List<GenericRecord> records = new ArrayList<>();
for (HoodieRecord hoodieRecord : hoodieRecords) {
records.add(toGenericRecord(hoodieRecord));
}

View File

@@ -0,0 +1,466 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
{
"name": "COMPLEX",
"fields": [
{
"default": null,
"type": [
"null",
{
"items": "string",
"type": "array"
}
],
"name": "array_of_string_fields1"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field1"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field2"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field3"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field4"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field5"
},
{
"default": null,
"type": [
"null",
"boolean"
],
"name": "boolean_field1"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field6"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field7"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field8"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field9"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field10"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field11"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field12"
},
{
"default": null,
"type": [
"null",
"double"
],
"name": "double_field1"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field13"
},
{
"default": null,
"type": [
"null",
"long"
],
"name": "long_field1"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field14"
},
{
"default": null,
"type": [
"null",
"long"
],
"name": "long_field2"
},
{
"default": null,
"type": [
"null",
{
"items": {
"fields": [
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field15"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field16"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field17"
},
{
"default": null,
"type": [
"null",
"long"
],
"name": "long_field3"
},
{
"default": null,
"type": [
"null",
"long"
],
"name": "long_field4"
},
{
"default": null,
"type": [
"null",
"double"
],
"name": "double_field2"
},
{
"default": null,
"type": [
"null",
"double"
],
"name": "double_field3"
}
],
"type": "record",
"name": "record_field1"
},
"type": "array"
}
],
"name": "record_name1"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field18"
},
{
"default": null,
"type": [
"null",
"long"
],
"name": "long_field5"
},
{
"default": null,
"type": [
"null",
"double"
],
"name": "double_field4"
},
{
"default": null,
"type": [
"null",
"double"
],
"name": "double_field5"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field19"
},
{
"default": null,
"type": [
"null",
"long"
],
"name": "long_field6"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field20"
},
{
"default": null,
"type": [
"null",
"long"
],
"name": "long_field7"
},
{
"default": null,
"type": [
"null",
"double"
],
"name": "double_field6"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field21"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field22"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field23"
},
{
"default": null,
"type": [
"null",
"long"
],
"name": "long_field8"
},
{
"default": null,
"type": [
"null",
"double"
],
"name": "double_field7"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field24"
},
{
"default": null,
"type": [
"null",
"long"
],
"name": "long_field10"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field25"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field26"
},
{
"default": null,
"type": [
"null",
"long"
],
"name": "long_field11"
},
{
"default": null,
"type": [
"null",
"boolean"
],
"name": "boolean_field3"
},
{
"default": null,
"type": [
"null",
"long"
],
"name": "long_field12"
},
{
"default": null,
"type": [
"null",
"double"
],
"name": "double_field8"
},
{
"default": null,
"type": [
"null",
"long"
],
"name": "long_field13"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field27"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field28"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field29"
},
{
"default": null,
"type": [
"null",
"string"
],
"name": "string_field30"
}
],
"type": "record"
}

View File

@@ -0,0 +1,565 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>hudi</artifactId>
<groupId>org.apache.hudi</groupId>
<version>0.6.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>hudi-integ-test-bundle</artifactId>
<packaging>jar</packaging>
<properties>
<checkstyle.skip>true</checkstyle.skip>
<main.basedir>${project.parent.basedir}</main.basedir>
</properties>
<build>
<plugins>
<plugin>
<groupId>org.jacoco</groupId>
<artifactId>jacoco-maven-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>${maven-shade-plugin.version}</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<createSourcesJar>true</createSourcesJar>
<dependencyReducedPomLocation>${project.build.directory}/dependency-reduced-pom.xml
</dependencyReducedPomLocation>
<transformers>
<transformer implementation="org.apache.maven.plugins.shade.resource.ApacheLicenseResourceTransformer">
</transformer>
<transformer implementation="org.apache.maven.plugins.shade.resource.ApacheNoticeResourceTransformer">
<addHeader>true</addHeader>
</transformer>
<transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
<resource>META-INF/LICENSE</resource>
<file>target/classes/META-INF/LICENSE</file>
</transformer>
</transformers>
<artifactSet>
<includes>
<include>commons-codec:commons-codec</include>
<include>commons-dbcp:commons-dbcp</include>
<include>commons-lang:commons-lang</include>
<include>commons-pool:commons-pool</include>
<include>org.apache.hudi:hudi-common</include>
<include>org.apache.hudi:hudi-client</include>
<include>org.apache.hudi:hudi-utilities_${scala.binary.version}</include>
<include>org.apache.hudi:hudi-spark_${scala.binary.version}</include>
<include>org.apache.hudi:hudi-hive-sync</include>
<include>org.apache.hudi:hudi-hadoop-mr</include>
<include>org.apache.hudi:hudi-timeline-service</include>
<include>org.apache.hudi:hudi-integ-test</include>
<include>com.beust:jcommander</include>
<include>com.twitter:bijection-avro_${scala.binary.version}</include>
<include>com.twitter:bijection-core_${scala.binary.version}</include>
<include>org.apache.parquet:parquet-avro</include>
<include>com.twitter:parquet-avro</include>
<include>com.twitter.common:objectsize</include>
<include>io.confluent:kafka-avro-serializer</include>
<include>io.confluent:common-config</include>
<include>io.confluent:common-utils</include>
<include>io.confluent:kafka-schema-registry-client</include>
<include>io.dropwizard.metrics:metrics-core</include>
<include>io.dropwizard.metrics:metrics-graphite</include>
<include>org.apache.spark:spark-streaming-kafka-0-10_${scala.binary.version}</include>
<include>org.apache.kafka:kafka_${scala.binary.version}</include>
<include>com.101tec:zkclient</include>
<include>org.apache.kafka:kafka-clients</include>
<include>org.apache.hive:hive-common</include>
<include>org.apache.hive:hive-service</include>
<include>org.apache.hive:hive-metastore</include>
<include>org.apache.hive:hive-jdbc</include>
<include>org.apache.hive:hive-exec</include>
<include>com.esotericsoftware:kryo-shaded</include>
<include>org.objenesis:objenesis</include>
<include>com.esotericsoftware:minlog</include>
<include>com.yammer.metrics:metrics-core</include>
<include>org.apache.thrift:libfb303</include>
<include>org.apache.thrift:libthrift</include>
<include>com.fasterxml.jackson.dataformat:jackson-dataformat-yaml</include>
</includes>
</artifactSet>
<relocations>
<relocation>
<pattern>com.beust.jcommander.</pattern>
<shadedPattern>org.apache.hudi.com.beust.jcommander.</shadedPattern>
</relocation>
<relocation>
<pattern>org.apache.commons.dbcp.</pattern>
<shadedPattern>org.apache.hudi.org.apache.commons.dbcp.</shadedPattern>
</relocation>
<relocation>
<pattern>org.apache.commons.lang.</pattern>
<shadedPattern>org.apache.hudi.org.apache.commons.lang.</shadedPattern>
</relocation>
<relocation>
<pattern>org.apache.commons.pool.</pattern>
<shadedPattern>org.apache.hudi.org.apache.commons.pool.</shadedPattern>
</relocation>
<relocation>
<pattern>org.apache.hive.jdbc.</pattern>
<shadedPattern>org.apache.hudi.org.apache.hive.jdbc.</shadedPattern>
</relocation>
<relocation>
<pattern>org.apache.hive.common.</pattern>
<shadedPattern>org.apache.hudi.org.apache.hive.common.</shadedPattern>
</relocation>
<relocation>
<pattern>org.apache.hadoop.hive.common.</pattern>
<shadedPattern>org.apache.hudi.org.apache.hadoop.hive.common.</shadedPattern>
</relocation>
<relocation>
<pattern>org.apache.hadoop.hive.shims.</pattern>
<shadedPattern>org.apache.hudi.org.apache.hadoop.hive.shims.</shadedPattern>
</relocation>
<relocation>
<pattern>org.apache.hadoop.hive.thrift.</pattern>
<shadedPattern>org.apache.hudi.org.apache.hadoop.hive.thrift.</shadedPattern>
</relocation>
<relocation>
<pattern>org.apache.hadoop.hive.serde2.</pattern>
<shadedPattern>org.apache.hudi.org.apache.hadoop.hive.serde2.</shadedPattern>
</relocation>
<relocation>
<pattern>org.apache.hadoop.hive.io.</pattern>
<shadedPattern>org.apache.hudi.org.apache.hadoop.hive.io.</shadedPattern>
</relocation>
<relocation>
<pattern>org.apache.hive.service.</pattern>
<shadedPattern>org.apache.hudi.org.apache.hive.service.</shadedPattern>
</relocation>
<relocation>
<pattern>org.apache.hadoop.hive.service.</pattern>
<shadedPattern>org.apache.hudi.org.apache.hadoop_hive.service.</shadedPattern>
</relocation>
<relocation>
<pattern>org.apache.hive.org.apache.thrift.</pattern>
<shadedPattern>org.apache.hudi.org.apache.hive.org.apache.thrift.</shadedPattern>
</relocation>
<relocation>
<pattern>org.apache.thrift.</pattern>
<shadedPattern>org.apache.hudi.org.apache.thrift.</shadedPattern>
</relocation>
<relocation>
<pattern>com.facebook.fb303.</pattern>
<shadedPattern>org.apache.hudi.com.facebook.fb303.</shadedPattern>
</relocation>
<relocation>
<pattern>org.apache.hive.jdbc.</pattern>
<shadedPattern>org.apache.hudi.org.apache.hive.jdbc.</shadedPattern>
</relocation>
<relocation>
<pattern>com.esotericsoftware.kryo.</pattern>
<shadedPattern>org.apache.hudi.com.esotericsoftware.kryo.</shadedPattern>
</relocation>
<relocation>
<pattern>org.objenesis.</pattern>
<shadedPattern>org.apache.hudi.org.objenesis.</shadedPattern>
</relocation>
<relocation>
<pattern>com.esotericsoftware.minlog.</pattern>
<shadedPattern>org.apache.hudi.com.esotericsoftware.minlog.</shadedPattern>
</relocation>
<relocation>
<pattern>com.codahale.metrics.</pattern>
<shadedPattern>org.apache.hudi.com.codahale.metrics.</shadedPattern>
</relocation>
<relocation>
<pattern>org.apache.commons.codec.</pattern>
<shadedPattern>org.apache.hudi.org.apache.commons.codec.</shadedPattern>
</relocation>
<relocation>
<pattern>com.fasterxml.jackson.dataformat.</pattern>
<shadedPattern>org.apache.hudi.com.fasterxml.jackson.dataformat.</shadedPattern>
</relocation>
<relocation>
<pattern>org.apache.parquet.avro.</pattern>
<shadedPattern>org.apache.hudi.org.apache.parquet.avro.</shadedPattern>
</relocation>
</relocations>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
<!-- Use this jar's NOTICE and license file -->
<exclude>META-INF/NOTICE*</exclude>
<exclude>META-INF/LICENSE*</exclude>
</excludes>
</filter>
</filters>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
<resources>
<resource>
<directory>src/main/resources</directory>
</resource>
<resource>
<directory>src/test/resources</directory>
</resource>
</resources>
</build>
<repositories>
<repository>
<id>confluent</id>
<url>https://packages.confluent.io/maven/</url>
</repository>
</repositories>
<dependencies>
<dependency>
<groupId>io.javalin</groupId>
<artifactId>javalin</artifactId>
<version>2.4.0</version>
</dependency>
<dependency>
<groupId>io.dropwizard.metrics</groupId>
<artifactId>metrics-core</artifactId>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.module</groupId>
<artifactId>jackson-module-scala_${scala.binary.version}</artifactId>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.dataformat</groupId>
<artifactId>jackson-dataformat-yaml</artifactId>
<version>2.7.4</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-common</artifactId>
<version>${project.version}</version>
<classifier>tests</classifier>
<type>test-jar</type>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-hive-sync</artifactId>
<version>${project.version}</version>
<classifier>tests</classifier>
<type>test-jar</type>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-spark_${scala.binary.version}</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<classifier>tests</classifier>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<classifier>tests</classifier>
<exclusions>
<exclusion>
<groupId>org.mortbay.jetty</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>javax.servlet.jsp</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>javax.servlet</groupId>
<artifactId>*</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>${hive.groupid}</groupId>
<artifactId>hive-exec</artifactId>
<version>${hive.version}</version>
<classifier>${hive.exec.classifier}</classifier>
</dependency>
<dependency>
<groupId>${hive.groupid}</groupId>
<artifactId>hive-jdbc</artifactId>
<version>${hive.version}</version>
<classifier>standalone</classifier>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</exclusion>
<exclusion>
<groupId>javax.servlet</groupId>
<artifactId>servlet-api</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>${hive.groupid}</groupId>
<artifactId>hive-jdbc</artifactId>
<version>${hive.version}</version>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</exclusion>
<exclusion>
<groupId>javax.servlet</groupId>
<artifactId>servlet-api</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>${hive.groupid}</groupId>
<artifactId>hive-common</artifactId>
<version>${hive.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-hive-sync</artifactId>
<version>${project.version}</version>
<exclusions>
<exclusion>
<groupId>javax.servlet</groupId>
<artifactId>servlet-api</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-client</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-utilities_${scala.binary.version}</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-integ-test</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-client</artifactId>
<version>${project.version}</version>
<classifier>tests</classifier>
<type>test-jar</type>
<scope>test</scope>
</dependency>
<dependency>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
<version>1.4</version>
</dependency>
<dependency>
<groupId>commons-dbcp</groupId>
<artifactId>commons-dbcp</artifactId>
<version>1.4</version>
</dependency>
<dependency>
<groupId>commons-pool</groupId>
<artifactId>commons-pool</artifactId>
<version>1.4</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpcore</artifactId>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>${slf4j.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-common</artifactId>
<exclusions>
<exclusion>
<groupId>javax.servlet</groupId>
<artifactId>servlet-api</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.binary.version}</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
</dependency>
<dependency>
<groupId>com.yammer.metrics</groupId>
<artifactId>metrics-core</artifactId>
<version>2.2.0</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- Used for SQL templating -->
<dependency>
<groupId>org.antlr</groupId>
<artifactId>stringtemplate</artifactId>
<version>4.0.2</version>
</dependency>
<dependency>
<groupId>com.beust</groupId>
<artifactId>jcommander</artifactId>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-all</artifactId>
<version>1.10.19</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.avro</groupId>
<artifactId>avro-mapred</artifactId>
<version>1.7.7</version>
</dependency>
<!-- Parquet -->
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-avro</artifactId>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-hadoop</artifactId>
<version>${parquet.version}</version>
</dependency>
<dependency>
<groupId>com.twitter</groupId>
<artifactId>bijection-avro_${scala.binary.version}</artifactId>
<version>0.9.3</version>
</dependency>
<dependency>
<groupId>io.confluent</groupId>
<artifactId>kafka-avro-serializer</artifactId>
<version>3.0.0</version>
</dependency>
<dependency>
<groupId>io.confluent</groupId>
<artifactId>common-config</artifactId>
<version>3.0.0</version>
</dependency>
<dependency>
<groupId>io.confluent</groupId>
<artifactId>common-utils</artifactId>
<version>3.0.0</version>
</dependency>
<dependency>
<groupId>io.confluent</groupId>
<artifactId>kafka-schema-registry-client</artifactId>
<version>3.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.thrift</groupId>
<artifactId>libfb303</artifactId>
<version>0.9.3</version>
</dependency>
<dependency>
<groupId>org.apache.thrift</groupId>
<artifactId>libthrift</artifactId>
<version>0.9.3</version>
</dependency>
</dependencies>
</project>

Some files were not shown because too many files have changed in this diff Show More