From 6edf0b9defe8c82b0ef0076a6300c00c1641a6e6 Mon Sep 17 00:00:00 2001 From: vinoth chandar Date: Thu, 22 Aug 2019 20:18:50 -0700 Subject: [PATCH] [HUDI-68] Pom cleanup & demo automation (#846) - [HUDI-172] Cleanup Maven POM/Classpath - Fix ordering of dependencies in poms, to enable better resolution - Idea is to place more specific ones at the top - And place dependencies which use them below them - [HUDI-68]: Automate demo steps on docker setup - Move hive queries from hive cli to beeline - Standardize on taking query input from text command files - Deltastreamer ingest also does hive sync in a single step - Spark Incremental Query materialized as a derived Hive table using datasource - Fix flakiness in HDFS spin up and output comparison - Code cleanup around streamlining and LOC reduction - Also fixed pom to not shade some hive classes in spark, to enable hive sync --- .gitignore | 4 +- .travis.yml | 12 +- docker/demo/compaction.commands | 5 + docker/demo/config/dfs-source.properties | 27 + docker/demo/get_min_commit_time.sh | 21 + docker/demo/hive-batch1.commands | 11 + .../hive-batch2-after-compaction.commands | 9 + docker/demo/hive-incremental.commands | 10 + docker/demo/hive-table-check.commands | 10 + docker/demo/sparksql-batch1.commands | 30 + docker/demo/sparksql-batch2.commands | 29 + docker/demo/sparksql-incremental.commands | 59 ++ hudi-cli/hudi-cli.sh | 5 +- hudi-cli/pom.xml | 98 +-- .../hudi/cli/commands/CompactionCommand.java | 15 +- hudi-client/pom.xml | 177 ++--- hudi-common/pom.xml | 153 +++-- hudi-hadoop-mr/pom.xml | 99 +-- hudi-hive/pom.xml | 203 +++--- hudi-integ-test/pom.xml | 141 ++-- .../org/apache/hudi/integ/ITTestBase.java | 141 +++- .../apache/hudi/integ/ITTestHoodieDemo.java | 271 ++++++++ .../apache/hudi/integ/ITTestHoodieSanity.java | 141 ++-- hudi-spark/pom.xml | 194 +++--- hudi-timeline-service/pom.xml | 96 +-- hudi-utilities/pom.xml | 397 ++++++----- .../deltastreamer/HoodieDeltaStreamer.java | 2 +- packaging/hudi-hadoop-mr-bundle/pom.xml | 104 +-- packaging/hudi-hive-bundle/pom.xml | 185 ++--- packaging/hudi-presto-bundle/pom.xml | 120 ++-- packaging/hudi-spark-bundle/pom.xml | 243 ++++--- packaging/hudi-utilities-bundle/pom.xml | 396 ++++++----- pom.xml | 638 +++++++++--------- tools/run_travis_tests.sh | 17 + 34 files changed, 2382 insertions(+), 1681 deletions(-) create mode 100644 docker/demo/compaction.commands create mode 100644 docker/demo/config/dfs-source.properties create mode 100755 docker/demo/get_min_commit_time.sh create mode 100644 docker/demo/hive-batch1.commands create mode 100644 docker/demo/hive-batch2-after-compaction.commands create mode 100644 docker/demo/hive-incremental.commands create mode 100644 docker/demo/hive-table-check.commands create mode 100644 docker/demo/sparksql-batch1.commands create mode 100644 docker/demo/sparksql-batch2.commands create mode 100644 docker/demo/sparksql-incremental.commands create mode 100644 hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestHoodieDemo.java create mode 100755 tools/run_travis_tests.sh diff --git a/.gitignore b/.gitignore index f4f680ade..9e726f4f6 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ target/ *.war *.ear *.db +*.patch ###################### # OSX ###################### @@ -74,5 +75,4 @@ dependency-reduced-pom.xml ####################################### # Docker ####################################### -hoodie-integ-test/compose_env - +hudi-integ-test/compose_env diff --git a/.travis.yml b/.travis.yml index 2cad58f17..e34d3565c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,16
+1,20 @@ language: java jdk: -- oraclejdk8 + - oraclejdk8 sudo: required env: -- HUDI_QUIETER_LOGGING=1 + - HUDI_QUIETER_LOGGING=1 TEST_SUITE=unit + - TEST_SUITE=integration +install: true services: -- docker + - docker cache: directories: - - "$HOME/.m2" + - "$HOME/.m2" notifications: slack: rooms: - secure: WNIZPBY//xf/xTJL1YUPzvPUDwjawaMM4IJ6IqxjRGcZCmuhNVu2XTJ3aL1g6X7ZcJKxJuwoU/TbSO8Dl6rgWSo/2OfyzBd4ks+hgeCsdycccTcvO8giQO1DOUGUSRdvUzOvKjWVK7iARYzQhoZawAYwI09UJLlwhYRCJ1IKc1ZksrEt964GeEmPyJbwMoZOJVUU84jJIAZPIpOFGTKM652FMermg9yaY2W5oSjDXaV98z0/mJV4Ry++J2v0fvoDs5HxkXYhZJP+dpWR82KDr6Q6LGL5/IlJ+b+IH3pF8LyKR4nCH6l1EZ8KpoFZapyYWYQpXMfQoF2K/JEQkpz1EqBCeEDSJ2+j1PPLhOWXd7ok4DsS26S8BP2ImvyXwua51THN1/r1fCGSIdxiQ5C8aeYmPCSr+oLChCVivEG2eeU34Z1nQJ5aDymNGeFE9qUUpjS0ETfFcjI/WQaA+FiYiPkDfeAoT1+6ySdY7l9gJhMygupILjq57IHbqx4nEr/8AB3Rqb8iIDTWDXgUBI9xKmty36zjIGcVOsCT/SGPccxvEJBXQk8uQqs/rDhaA/ErJPMLX/2b7ElSSObKFdjpMaxVvZIE6wvMLJpIYfChDoXwgfhN6zlAFZrEib7PFI4dGkS8u4wkkHkBS7C+uz2e92EhsAB+BIhUR1M3NQ33+Is= on_pull_requests: false +script: + tools/run_travis_tests.sh $TEST_SUITE \ No newline at end of file diff --git a/docker/demo/compaction.commands b/docker/demo/compaction.commands new file mode 100644 index 000000000..dd8ffae31 --- /dev/null +++ b/docker/demo/compaction.commands @@ -0,0 +1,5 @@ +connect --path /user/hive/warehouse/stock_ticks_mor +compactions show all +compaction schedule +compaction run --parallelism 2 --sparkMemory 1G --schemaFilePath /var/demo/config/schema.avsc --retry 1 + diff --git a/docker/demo/config/dfs-source.properties b/docker/demo/config/dfs-source.properties new file mode 100644 index 000000000..3e17661da --- /dev/null +++ b/docker/demo/config/dfs-source.properties @@ -0,0 +1,27 @@ +################################################################################ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +include=base.properties +# Key fields, for kafka example +hoodie.datasource.write.recordkey.field=key +hoodie.datasource.write.partitionpath.field=date +# Schema provider props (change to absolute path based on your installation) +hoodie.deltastreamer.schemaprovider.source.schema.file=/var/demo/config/schema.avsc +hoodie.deltastreamer.schemaprovider.target.schema.file=/var/demo/config/schema.avsc +# DFS Source +hoodie.deltastreamer.source.dfs.root=/usr/hive/data/input/ diff --git a/docker/demo/get_min_commit_time.sh b/docker/demo/get_min_commit_time.sh new file mode 100755 index 000000000..11bc773c5 --- /dev/null +++ b/docker/demo/get_min_commit_time.sh @@ -0,0 +1,21 @@ +#!/bin/bash +################################################################################ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +MIN_COMMIT_TIME=`hdfs dfs -ls -t /user/hive/warehouse/stock_ticks_cow/.hoodie/*.commit | head -1 | awk -F'/' ' { print $7 } ' | awk -F'.' 
' { print $1 } '` +echo $MIN_COMMIT_TIME; diff --git a/docker/demo/hive-batch1.commands b/docker/demo/hive-batch1.commands new file mode 100644 index 000000000..3176dea91 --- /dev/null +++ b/docker/demo/hive-batch1.commands @@ -0,0 +1,11 @@ +add jar ${hudi.hadoop.bundle}; + +select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG'; +select symbol, max(ts) from stock_ticks_mor group by symbol HAVING symbol = 'GOOG'; +select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG'; + +select symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG'; +select symbol, ts, volume, open, close from stock_ticks_mor where symbol = 'GOOG'; +select symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG'; + +!quit diff --git a/docker/demo/hive-batch2-after-compaction.commands b/docker/demo/hive-batch2-after-compaction.commands new file mode 100644 index 000000000..4cf6eee00 --- /dev/null +++ b/docker/demo/hive-batch2-after-compaction.commands @@ -0,0 +1,9 @@ +add jar ${hudi.hadoop.bundle}; + +select symbol, max(ts) from stock_ticks_mor group by symbol HAVING symbol = 'GOOG'; +select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG'; + +select symbol, ts, volume, open, close from stock_ticks_mor where symbol = 'GOOG'; +select symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG'; + +!quit diff --git a/docker/demo/hive-incremental.commands b/docker/demo/hive-incremental.commands new file mode 100644 index 000000000..3c764ad9c --- /dev/null +++ b/docker/demo/hive-incremental.commands @@ -0,0 +1,10 @@ +add jar ${hudi.hadoop.bundle}; + +set hoodie.stock_ticks_cow.consume.mode=INCREMENTAL; +set hoodie.stock_ticks_cow.consume.max.commits=3; +set hoodie.stock_ticks_cow.consume.start.timestamp=${min.commit.time}; + +select symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG' and `_hoodie_commit_time` > '${min.commit.time}'; + +!quit + diff --git a/docker/demo/hive-table-check.commands b/docker/demo/hive-table-check.commands new file mode 100644 index 000000000..4484ec43d --- /dev/null +++ b/docker/demo/hive-table-check.commands @@ -0,0 +1,10 @@ +add jar ${hudi.hadoop.bundle}; +show tables; + +show partitions stock_ticks_cow; +show partitions stock_ticks_mor; +show partitions stock_ticks_mor_rt; + +!quit + + diff --git a/docker/demo/sparksql-batch1.commands b/docker/demo/sparksql-batch1.commands new file mode 100644 index 000000000..e220cb0bd --- /dev/null +++ b/docker/demo/sparksql-batch1.commands @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +spark.sql("show tables").show(100, false) +// Copy-On-Write table +spark.sql("select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG'").show(100, false) +spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG'").show(100, false) + +// Merge-On-Read table +spark.sql("select symbol, max(ts) from stock_ticks_mor group by symbol HAVING symbol = 'GOOG'").show(100, false) +spark.sql("select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG'").show(100, false) +spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor where symbol = 'GOOG'").show(100, false) +spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG'").show(100, false) + +System.exit(0) diff --git a/docker/demo/sparksql-batch2.commands b/docker/demo/sparksql-batch2.commands new file mode 100644 index 000000000..521a4ebe6 --- /dev/null +++ b/docker/demo/sparksql-batch2.commands @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + // Copy-On-Write table +spark.sql("select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG'").show(100, false) +spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG'").show(100, false) + +// Merge-On-Read table +spark.sql("select symbol, max(ts) from stock_ticks_mor group by symbol HAVING symbol = 'GOOG'").show(100, false) +spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor where symbol = 'GOOG'").show(100, false) +spark.sql("select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG'").show(100, false) +spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG'").show(100, false) + +System.exit(0) diff --git a/docker/demo/sparksql-incremental.commands b/docker/demo/sparksql-incremental.commands new file mode 100644 index 000000000..ceec945f9 --- /dev/null +++ b/docker/demo/sparksql-incremental.commands @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.hudi.DataSourceReadOptions; +import org.apache.hudi.DataSourceWriteOptions; +import org.apache.spark.sql.SaveMode; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.HoodieDataSourceHelpers; +import org.apache.hadoop.fs.FileSystem; + +val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration) +val beginInstantTime = HoodieDataSourceHelpers.listCommitsSince(fs, "/user/hive/warehouse/stock_ticks_cow", "00000").get(0) +val hoodieIncViewDF = spark.read.format("org.apache.hudi"). + option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY, DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL). + option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY, beginInstantTime). + load("/user/hive/warehouse/stock_ticks_cow"); +hoodieIncViewDF.registerTempTable("stock_ticks_cow_incr") +spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow_incr where symbol = 'GOOG'").show(100, false); + +spark.sql("select key, `_hoodie_partition_path` as datestr, symbol, ts, open, close from stock_ticks_cow_incr"). + write.format("org.apache.hudi"). + option("hoodie.insert.shuffle.parallelism", "2"). + option("hoodie.upsert.shuffle.parallelism","2"). + option(DataSourceWriteOptions.STORAGE_TYPE_OPT_KEY, DataSourceWriteOptions.MOR_STORAGE_TYPE_OPT_VAL). + option(DataSourceWriteOptions.OPERATION_OPT_KEY, DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL). + option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, "key"). + option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY, "datestr"). + option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY, "ts"). + option(HoodieWriteConfig.TABLE_NAME, "stock_ticks_derived_mor"). + option(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY, "stock_ticks_derived_mor"). + option(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY, "default"). + option(DataSourceWriteOptions.HIVE_URL_OPT_KEY, "jdbc:hive2://hiveserver:10000"). + option(DataSourceWriteOptions.HIVE_USER_OPT_KEY, "hive"). + option(DataSourceWriteOptions.HIVE_PASS_OPT_KEY, "hive"). + option(DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY, "true"). + option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY, "datestr"). + mode(SaveMode.Overwrite). 
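+  // NOTE: with HIVE_SYNC_ENABLED_OPT_KEY set to "true" above, this write also
+  // registers the derived table in the Hive metastore (stock_ticks_derived_mor
+  // and its stock_ticks_derived_mor_rt view), which the "show tables" and count
+  // queries below rely on. SaveMode.Overwrite replaces the output of any
+  // previous run of this script.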
+ save("/user/hive/warehouse/stock_ticks_derived_mor"); + +spark.sql("show tables").show(20, false) +spark.sql("select count(*) from stock_ticks_derived_mor").show(20, false) +spark.sql("select count(*) from stock_ticks_derived_mor_rt").show(20, false) + +System.exit(0); \ No newline at end of file diff --git a/hudi-cli/hudi-cli.sh b/hudi-cli/hudi-cli.sh index 188c16331..8b12f6e61 100755 --- a/hudi-cli/hudi-cli.sh +++ b/hudi-cli/hudi-cli.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" -HUDI_JAR=`ls $DIR/target/hudi-cli-*-SNAPSHOT.jar | grep -v source | grep -v javadoc` +HOODIE_JAR=`ls $DIR/target/hudi-cli-*-SNAPSHOT.jar | grep -v source | grep -v javadoc` if [ -z "$HADOOP_CONF_DIR" ]; then echo "setting hadoop conf dir" HADOOP_CONF_DIR="/etc/hadoop/conf" @@ -13,5 +13,4 @@ fi if [ -z "$CLIENT_JAR" ]; then echo "client jar location not set" fi -echo "java -cp ${HADOOP_CONF_DIR}:${SPARK_CONF_DIR}:$DIR/target/lib/*:$HUDI_JAR:${CLIENT_JAR} -DSPARK_CONF_DIR=${SPARK_CONF_DIR} -DHADOOP_CONF_DIR=${HADOOP_CONF_DIR} org.springframework.shell.Bootstrap" -java -cp ${HADOOP_CONF_DIR}:${SPARK_CONF_DIR}:$DIR/target/lib/*:$HUDI_JAR:${CLIENT_JAR} -DSPARK_CONF_DIR=${SPARK_CONF_DIR} -DHADOOP_CONF_DIR=${HADOOP_CONF_DIR} org.springframework.shell.Bootstrap +java -cp ${HADOOP_CONF_DIR}:${SPARK_CONF_DIR}:$DIR/target/lib/*:$HOODIE_JAR:${CLIENT_JAR} -DSPARK_CONF_DIR=${SPARK_CONF_DIR} -DHADOOP_CONF_DIR=${HADOOP_CONF_DIR} org.springframework.shell.Bootstrap $@ diff --git a/hudi-cli/pom.xml b/hudi-cli/pom.xml index 0ba33dcba..3b66ff2f2 100644 --- a/hudi-cli/pom.xml +++ b/hudi-cli/pom.xml @@ -29,8 +29,6 @@ 1.2.0.RELEASE org.springframework.shell.Bootstrap - 1.2.17 - 4.10 ${project.basedir}/src/main/resources/META-INF @@ -61,7 +59,7 @@ net.alchim31.maven scala-maven-plugin - 3.2.1 + ${scala-maven-plugin.version} @@ -133,23 +131,42 @@ - + org.scala-lang scala-library ${scala.version} + - org.springframework.shell - spring-shell - ${spring.shell.version} + org.apache.hudi + hudi-client + ${project.version} - de.vandermeer - asciitable - 0.2.5 + org.apache.hudi + hudi-common + ${project.version} + + org.apache.hudi + hudi-hive + ${project.version} + + + org.apache.hudi + hudi-utilities + ${project.version} + + + + + log4j + log4j + + + org.apache.spark spark-core_2.11 @@ -159,6 +176,24 @@ spark-sql_2.11 + + + commons-dbcp + commons-dbcp + + + + org.springframework.shell + spring-shell + ${spring.shell.version} + + + + de.vandermeer + asciitable + 0.2.5 + + com.jakewharton.fliptables fliptables @@ -166,60 +201,25 @@ - log4j - log4j - ${log4j.version} - - - - org.apache.hudi - hudi-hive - ${project.version} - - - - org.apache.hudi - hudi-client - ${project.version} + joda-time + joda-time + org.apache.hadoop hadoop-common - org.apache.hadoop hadoop-hdfs - - org.apache.hudi - hudi-common - ${project.version} - junit junit-dep - ${junit.version} + ${junit-dep.version} test - - - commons-dbcp - commons-dbcp - - - joda-time - joda-time - 2.9.6 - - - org.apache.hudi - hudi-utilities - ${project.version} - - - diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/CompactionCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/CompactionCommand.java index 592d52864..6a32edc99 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/CompactionCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/CompactionCommand.java @@ -217,12 +217,23 @@ public class CompactionCommand implements CommandMarker { final String sparkMemory, @CliOption(key = 
"retry", unspecifiedDefaultValue = "1", help = "Number of retries") final String retry, - @CliOption(key = "compactionInstant", mandatory = true, help = "Base path for the target hoodie dataset") - final String compactionInstantTime) throws Exception { + @CliOption(key = "compactionInstant", mandatory = false, help = "Base path for the target hoodie dataset") + String compactionInstantTime) throws Exception { boolean initialized = HoodieCLI.initConf(); HoodieCLI.initFS(initialized); if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) { + if (null == compactionInstantTime) { + // pick outstanding one with lowest timestamp + Option firstPendingInstant = HoodieCLI.tableMetadata.reloadActiveTimeline() + .filterCompletedAndCompactionInstants().filter(instant -> instant.getAction() + .equals(HoodieTimeline.COMPACTION_ACTION)).firstInstant().map(HoodieInstant::getTimestamp); + if (!firstPendingInstant.isPresent()) { + return "NO PENDING COMPACTION TO RUN"; + } + compactionInstantTime = firstPendingInstant.get(); + } + String sparkPropertiesPath = Utils.getDefaultPropertiesFile( scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())); SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); diff --git a/hudi-client/pom.xml b/hudi-client/pom.xml index 95959ef5c..5e1512d8d 100644 --- a/hudi-client/pom.xml +++ b/hudi-client/pom.xml @@ -68,6 +68,7 @@ + org.apache.hudi hudi-common @@ -78,6 +79,71 @@ hudi-timeline-service ${project.version} + + + + log4j + log4j + + + + + org.apache.parquet + parquet-avro + + + org.apache.parquet + parquet-hadoop + + + + + org.apache.spark + spark-core_2.11 + + + org.apache.spark + spark-sql_2.11 + + + + + io.dropwizard.metrics + metrics-graphite + + + io.dropwizard.metrics + metrics-core + + + + com.google.guava + guava + + + + com.beust + jcommander + 1.48 + + + + org.htrace + htrace-core + 3.0.4 + + + + + org.apache.hadoop + hadoop-client + + + javax.servlet + * + + + org.apache.hadoop hadoop-hdfs @@ -117,6 +183,14 @@ + + + + org.apache.hbase + hbase-client + + + org.apache.hudi hudi-common @@ -126,61 +200,35 @@ test - io.dropwizard.metrics - metrics-graphite - - - io.dropwizard.metrics - metrics-core - - - com.beust - jcommander - 1.48 + org.apache.hudi + hudi-hadoop-mr + ${project.version} + test - + - log4j - log4j - - - - org.apache.hadoop - hadoop-client + org.apache.hbase + hbase-testing-util + ${hbase.version} + test - javax.servlet + org.codehaus.jackson + jackson-mapper-asl + + + org.codehaus.jackson + jackson-core-asl + + + javax.xml.bind * - - org.apache.parquet - parquet-avro - - - - org.apache.parquet - parquet-hadoop - - - - com.google.guava - guava - - - - org.apache.spark - spark-core_2.11 - - - - org.apache.spark - spark-sql_2.11 - - + ${hive.groupid} hive-exec @@ -191,46 +239,7 @@ org.mockito mockito-all - 1.10.19 test - - org.apache.hudi - hudi-hadoop-mr - ${project.version} - test - - - - org.apache.hbase - hbase-client - - - - org.htrace - htrace-core - 3.0.4 - - - - org.apache.hbase - hbase-testing-util - 1.2.3 - test - - - org.codehaus.jackson - jackson-mapper-asl - - - org.codehaus.jackson - jackson-core-asl - - - javax.xml.bind - * - - - diff --git a/hudi-common/pom.xml b/hudi-common/pom.xml index fdde6022f..f418ab64d 100644 --- a/hudi-common/pom.xml +++ b/hudi-common/pom.xml @@ -43,7 +43,7 @@ org.apache.maven.plugins maven-jar-plugin - 2.5 + ${maven-jar-plugin.version} @@ -79,77 +79,20 @@ - - org.rocksdb - rocksdbjni - - - org.apache.avro - avro - - - org.apache.hadoop - 
hadoop-client - - - javax.servlet - * - - - - - junit - junit - ${junit.version} - test - + com.fasterxml.jackson.core jackson-annotations - ${fasterxml.version} com.fasterxml.jackson.core jackson-databind + + - org.apache.parquet - parquet-avro - ${parquet.version} - - - org.mockito - mockito-all - 1.10.19 - test - - - org.apache.hadoop - hadoop-hdfs - tests - - - org.apache.hadoop - hadoop-common - tests - - - org.apache.httpcomponents - httpclient - 4.5.4 - - - commons-codec - commons-codec - - - org.apache.httpcomponents - fluent-hc - 4.5.4 - - - com.esotericsoftware - kryo + org.apache.avro + avro org.apache.avro @@ -161,21 +104,91 @@ + + - com.github.stefanbirkner - system-rules - 1.16.0 - test + org.apache.parquet + parquet-avro + + + + + com.twitter.common + objectsize + 0.0.12 + + + + + commons-codec + commons-codec + + + + + org.apache.httpcomponents + fluent-hc - com.twitter.common - objectsize - 0.0.12 + org.apache.httpcomponents + httpclient + + + + org.rocksdb + rocksdbjni + + + + + org.apache.hadoop + hadoop-client + + + javax.servlet + * + + + + + org.apache.hadoop + hadoop-common + tests + + + org.apache.hadoop + hadoop-hdfs + tests + + + + junit + junit + test + + + + org.mockito + mockito-all + test + + + + com.esotericsoftware + kryo + test com.esotericsoftware kryo-shaded 4.0.2 + + + com.github.stefanbirkner + system-rules + 1.16.0 + test + diff --git a/hudi-hadoop-mr/pom.xml b/hudi-hadoop-mr/pom.xml index cc6cfa2c7..66f2fe65a 100644 --- a/hudi-hadoop-mr/pom.xml +++ b/hudi-hadoop-mr/pom.xml @@ -30,32 +30,52 @@ + org.apache.hudi hudi-common ${project.version} + + - org.apache.hudi - hudi-common - ${project.version} - tests - test-jar - test + org.apache.avro + avro + + + + + org.apache.parquet + parquet-avro + + + + + com.twitter + parquet-avro + + + com.twitter + parquet-hadoop-bundle + + + + + commons-logging + commons-logging + + com.twitter.common + objectsize + 0.0.12 + + + org.apache.hadoop hadoop-common - - org.apache.hadoop - hadoop-mapreduce-client-core - - - org.apache.hadoop - hadoop-mapreduce-client-common - org.apache.hadoop hadoop-auth @@ -64,52 +84,41 @@ org.apache.hadoop hadoop-hdfs + + org.apache.hadoop + hadoop-mapreduce-client-core + + + org.apache.hadoop + hadoop-mapreduce-client-common + + + ${hive.groupid} hive-jdbc - ${hive.version} - - - commons-logging - commons-logging - - ${hive.groupid} hive-exec - ${hive.version} + + - commons-logging - commons-logging - - - org.apache.parquet - parquet-avro - - - com.twitter - parquet-avro - - - com.twitter - parquet-hadoop-bundle - - - com.twitter.common - objectsize - 0.0.12 - - - org.apache.avro - avro + org.apache.hudi + hudi-common + ${project.version} + tests + test-jar + test + com.esotericsoftware kryo test + junit junit diff --git a/hudi-hive/pom.xml b/hudi-hive/pom.xml index b9f6e01f7..758860d34 100644 --- a/hudi-hive/pom.xml +++ b/hudi-hive/pom.xml @@ -31,6 +31,81 @@ + + + + org.apache.hudi + hudi-common + ${project.version} + + + org.apache.hudi + hudi-hadoop-mr + ${project.version} + + + + + org.slf4j + slf4j-api + + + org.slf4j + slf4j-log4j12 + + + + + com.twitter + parquet-avro + + + + com.google.guava + guava + + + + org.apache.thrift + libthrift + ${thrift.version} + + + + joda-time + joda-time + + + + + commons-dbcp + commons-dbcp + + + commons-pool + commons-pool + + + commons-io + commons-io + + + + com.beust + jcommander + + + + + org.apache.httpcomponents + httpcore + + + org.apache.httpcomponents + httpclient + + + org.apache.hadoop hadoop-common @@ -48,66 +123,17 @@ hadoop-auth - 
com.google.guava - guava + org.apache.hadoop + hadoop-common + tests - org.apache.thrift - libthrift - ${thrift.version} + org.apache.hadoop + hadoop-hdfs + tests - - joda-time - joda-time - - - - - commons-dbcp - commons-dbcp - - - - commons-pool - commons-pool - - - - commons-io - commons-io - - - - - org.slf4j - slf4j-api - - - org.slf4j - slf4j-log4j12 - - - - com.beust - jcommander - - - - org.apache.httpcomponents - httpcore - - - - org.apache.httpcomponents - httpclient - - - - - junit - junit - + ${hive.groupid} hive-service @@ -139,64 +165,53 @@ ${hive.version} + - org.apache.hadoop - hadoop-hdfs + org.apache.hudi + hudi-common + ${project.version} tests - - - org.apache.hadoop - hadoop-common - tests - - - org.apache.hadoop - hadoop-mapreduce-client-common - test - - - org.apache.hadoop - hadoop-mapreduce-client-core test + org.mockito mockito-all test - - com.twitter - parquet-avro - - - org.apache.hudi - hudi-hadoop-mr - ${project.version} - - - org.apache.hudi - hudi-common - ${project.version} - - - org.apache.hudi - hudi-common - ${project.version} - tests - test - com.esotericsoftware.kryo kryo 2.21 test + + + junit + junit + test + + org.eclipse.jetty.aggregate jetty-all test + + + + org.apache.hadoop + hadoop-mapreduce-client-common + test + + + + org.apache.hadoop + hadoop-mapreduce-client-core + test + + @@ -213,7 +228,7 @@ org.apache.maven.plugins maven-jar-plugin - 2.5 + ${maven-jar-plugin.version} diff --git a/hudi-integ-test/pom.xml b/hudi-integ-test/pom.xml index 6b7e29e78..30dd42c9b 100644 --- a/hudi-integ-test/pom.xml +++ b/hudi-integ-test/pom.xml @@ -9,32 +9,57 @@ hudi-integ-test 4.0.0 - - org.glassfish.jersey.connectors - jersey-apache-connector - 2.17 - + + org.glassfish.jersey.core jersey-server - 2.17 + + + org.glassfish.jersey.connectors + jersey-apache-connector org.glassfish.jersey.containers jersey-container-servlet-core - 2.17 + + + + + com.github.docker-java + docker-java + 3.1.2 + test + + + org.apache.hudi hudi-spark ${project.version} - - - org.glassfish.** - * - - + + + + org.slf4j + slf4j-api + + + org.slf4j + slf4j-log4j12 + + + + + org.apache.hudi + hudi-hadoop-sparkworker-docker + ${project.version} + pom + import + + + org.apache.hudi hudi-common @@ -43,12 +68,6 @@ test-jar test - - org.awaitility - awaitility - 3.1.2 - test - org.apache.hudi hudi-spark @@ -56,56 +75,39 @@ tests test-jar test - - - org.glassfish.** - * - - + + + + com.fasterxml.jackson.core + jackson-annotations + test + + + com.fasterxml.jackson.core + jackson-databind + test + + + com.fasterxml.jackson.datatype + jackson-datatype-guava + test + + + + org.awaitility + awaitility + 3.1.2 + test + + com.google.guava guava 20.0 test - - com.fasterxml.jackson.core - jackson-annotations - 2.6.4 - test - - - com.fasterxml.jackson.core - jackson-databind - 2.6.4 - test - - - com.fasterxml.jackson.datatype - jackson-datatype-guava - 2.9.4 - test - - - com.github.docker-java - docker-java - 3.1.0-rc-3 - test - - - org.glassfish.** - * - - - - - org.apache.hudi - hudi-hadoop-sparkworker-docker - ${project.version} - pom - import - + junit junit @@ -150,12 +152,25 @@ org.apache.maven.plugins maven-failsafe-plugin 2.22.0 + + + **/ITT*.java + + + integration-test integration-test + + verify + verify + + verify + + @@ -179,7 +194,7 @@ down - integration-test + post-integration-test down diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestBase.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestBase.java index 9751c11ff..c12cf6d05 100644 --- 
a/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestBase.java +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestBase.java @@ -31,14 +31,14 @@ import com.github.dockerjava.core.DockerClientBuilder; import com.github.dockerjava.core.DockerClientConfig; import com.github.dockerjava.core.command.ExecStartResultCallback; import com.github.dockerjava.jaxrs.JerseyDockerCmdExecFactory; -import com.google.common.collect.ImmutableList; import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Map; import java.util.stream.Collectors; -import org.apache.commons.lang3.tuple.Pair; +import org.apache.hudi.common.util.collection.Pair; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.junit.Assert; @@ -52,38 +52,64 @@ public abstract class ITTestBase { protected static final String ADHOC_2_CONTAINER = "/adhoc-2"; protected static final String HIVESERVER = "/hiveserver"; protected static final String HOODIE_WS_ROOT = "/var/hoodie/ws"; - protected static final String HOODIE_JAVA_APP = HOODIE_WS_ROOT + "/hoodie-spark/run_hoodie_app.sh"; + protected static final String HOODIE_JAVA_APP = HOODIE_WS_ROOT + "/hudi-spark/run_hoodie_app.sh"; protected static final String HUDI_HADOOP_BUNDLE = HOODIE_WS_ROOT + "/docker/hoodie/hadoop/hive_base/target/hoodie-hadoop-mr-bundle.jar"; protected static final String HUDI_HIVE_BUNDLE = HOODIE_WS_ROOT + "/docker/hoodie/hadoop/hive_base/target/hoodie-hive-bundle.jar"; protected static final String HUDI_SPARK_BUNDLE = HOODIE_WS_ROOT + "/docker/hoodie/hadoop/hive_base/target/hoodie-spark-bundle.jar"; + protected static final String HUDI_UTILITIES_BUNDLE = + HOODIE_WS_ROOT + "/docker/hoodie/hadoop/hive_base/target/hoodie-utilities.jar"; protected static final String HIVE_SERVER_JDBC_URL = "jdbc:hive2://hiveserver:10000"; + protected static final String HADOOP_CONF_DIR = "/etc/hadoop"; + // Skip these lines when capturing output from hive - protected static final Integer SLF4J_WARNING_LINE_COUNT_IN_HIVE_CMD = 9; private static final String DEFAULT_DOCKER_HOST = "unix:///var/run/docker.sock"; private static final String OVERRIDDEN_DOCKER_HOST = System.getenv("DOCKER_HOST"); protected DockerClient dockerClient; protected Map runningContainers; - protected static String[] getHiveConsoleCommand(String rawCommand) { + static String[] getHiveConsoleCommand(String rawCommand) { String jarCommand = "add jar " + HUDI_HADOOP_BUNDLE + ";"; String fullCommand = jarCommand + rawCommand; - List cmd = new ImmutableList.Builder().add("hive") - .add("--hiveconf") - .add("hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat") - .add("--hiveconf") - .add("hive.stats.autogather=false") - .add("-e") - .add("\"" + fullCommand + "\"") - .build(); + List cmd = new ArrayList<>(); + cmd.add("hive"); + cmd.add("--hiveconf"); + cmd.add("hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat"); + cmd.add("--hiveconf"); + cmd.add("hive.stats.autogather=false"); + cmd.add("-e"); + cmd.add("\"" + fullCommand + "\""); return cmd.stream().toArray(String[]::new); } + private static String getHiveConsoleCommandFile(String commandFile, String additionalVar) { + StringBuilder builder = new StringBuilder() + .append("beeline -u " + HIVE_SERVER_JDBC_URL) + .append(" --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat ") + .append(" --hiveconf hive.stats.autogather=false ") + .append(" --hivevar hudi.hadoop.bundle=" + 
HUDI_HADOOP_BUNDLE); + + if (additionalVar != null) { + builder.append(" --hivevar " + additionalVar + " "); + } + return builder.append(" -f ").append(commandFile).toString(); + } + + static String getSparkShellCommand(String commandFile) { + return new StringBuilder() + .append("spark-shell --jars ").append(HUDI_SPARK_BUNDLE) + .append(" --master local[2] --driver-class-path ").append(HADOOP_CONF_DIR) + .append(" --conf spark.sql.hive.convertMetastoreParquet=false --deploy-mode client --driver-memory 1G --executor-memory 1G --num-executors 1 ") + .append(" --packages com.databricks:spark-avro_2.11:4.0.0 ") + .append(" -i ").append(commandFile) + .toString(); + } + @Before - public void init() throws IOException { + public void init() { String dockerHost = (OVERRIDDEN_DOCKER_HOST != null) ? OVERRIDDEN_DOCKER_HOST : DEFAULT_DOCKER_HOST; //Assuming insecure docker engine DockerClientConfig config = DefaultDockerClientConfig.createDefaultConfigBuilder() @@ -104,7 +130,7 @@ public abstract class ITTestBase { List containerList = dockerClient.listContainersCmd().exec(); for (Container c : containerList) { if (!c.getState().equalsIgnoreCase("running")) { - System.out.println("Container : " + Arrays.toString(c.getNames()) + LOG.info("Container : " + Arrays.toString(c.getNames()) + "not in running state, Curr State :" + c.getState()); return false; } @@ -114,10 +140,12 @@ public abstract class ITTestBase { return true; } - protected TestExecStartResultCallback executeCommandInDocker(String containerName, String[] command, - boolean expectedToSucceed) - throws Exception { - LOG.info("Executing command (" + Arrays.toString(command) + ") in container " + containerName); + private String singleSpace(String str) { + return str.replaceAll("[\\s]+"," "); + } + + private TestExecStartResultCallback executeCommandInDocker(String containerName, String[] command, + boolean expectedToSucceed) throws Exception { Container sparkWorkerContainer = runningContainers.get(containerName); ExecCreateCmd cmd = dockerClient.execCreateCmd(sparkWorkerContainer.getId()) .withCmd(command).withAttachStdout(true).withAttachStderr(true); @@ -128,12 +156,10 @@ public abstract class ITTestBase { dockerClient.execStartCmd(createCmdResponse.getId()).withDetach(false).withTty(false) .exec(callback).awaitCompletion(); int exitCode = dockerClient.inspectExecCmd(createCmdResponse.getId()).exec().getExitCode(); - LOG.info("Exit code for command (" + Arrays.toString(command) + ") is " + exitCode); - if (exitCode != 0) { - LOG.error("Command (" + Arrays.toString(command) + ") failed."); - LOG.error("Stdout is :" + callback.getStdout().toString()); - LOG.error("Stderr is :" + callback.getStderr().toString()); - } + LOG.info("Exit code for command : " + exitCode); + LOG.error("\n\n ###### Stdout #######\n" + callback.getStdout().toString()); + LOG.error("\n\n ###### Stderr #######\n" + callback.getStderr().toString()); + if (expectedToSucceed) { Assert.assertTrue("Command (" + Arrays.toString(command) + ") expected to succeed. 
Exit (" + exitCode + ")", exitCode == 0); @@ -145,6 +171,71 @@ public abstract class ITTestBase { return callback; } + void executeCommandStringsInDocker(String containerName, List commands) throws Exception { + for (String cmd : commands) { + executeCommandStringInDocker(containerName, cmd, true); + } + } + + TestExecStartResultCallback executeCommandStringInDocker(String containerName, String cmd, + boolean expectedToSucceed) throws Exception { + LOG.info("\n\n#################################################################################################"); + LOG.info("Container : " + containerName + ", Running command :" + cmd); + LOG.info("\n#################################################################################################"); + + String[] cmdSplits = singleSpace(cmd).split(" "); + return executeCommandInDocker(containerName, cmdSplits, expectedToSucceed); + } + + Pair executeHiveCommand(String hiveCommand) throws Exception { + + LOG.info("\n\n#################################################################################################"); + LOG.info("Running hive command :" + hiveCommand); + LOG.info("\n#################################################################################################"); + + String[] hiveCmd = getHiveConsoleCommand(hiveCommand); + TestExecStartResultCallback callback = executeCommandInDocker(HIVESERVER, hiveCmd, true); + return Pair.of(callback.getStdout().toString().trim(), callback.getStderr().toString().trim()); + } + + Pair executeHiveCommandFile(String commandFile) throws Exception { + return executeHiveCommandFile(commandFile, null); + } + + Pair executeHiveCommandFile(String commandFile, String additionalVar) throws Exception { + String hiveCmd = getHiveConsoleCommandFile(commandFile, additionalVar); + TestExecStartResultCallback callback = executeCommandStringInDocker(HIVESERVER, hiveCmd, true); + return Pair.of(callback.getStdout().toString().trim(), callback.getStderr().toString().trim()); + } + + Pair executeSparkSQLCommand(String commandFile, boolean expectedToSucceed) throws Exception { + String sparkShellCmd = getSparkShellCommand(commandFile); + TestExecStartResultCallback callback = executeCommandStringInDocker(ADHOC_1_CONTAINER, + sparkShellCmd, expectedToSucceed); + return Pair.of(callback.getStdout().toString(), callback.getStderr().toString()); + } + + void assertStdOutContains(Pair stdOutErr, String expectedOutput) { + assertStdOutContains(stdOutErr, expectedOutput, 1); + } + + void assertStdOutContains(Pair stdOutErr, String expectedOutput, int times) { + // this is so that changes in padding don't affect comparison + String stdOutSingleSpaced = singleSpace(stdOutErr.getLeft()).replaceAll(" ", ""); + expectedOutput = singleSpace(expectedOutput).replaceAll(" ", ""); + + int lastIndex = 0; + int count = 0; + while(lastIndex != -1){ + lastIndex = stdOutSingleSpaced.indexOf(expectedOutput, lastIndex); + if(lastIndex != -1){ + count ++; + lastIndex += expectedOutput.length(); + } + } + Assert.assertEquals("Did not find output the expected number of times", times, count); + } + public class TestExecStartResultCallback extends ExecStartResultCallback { // Storing the reference in subclass to expose to clients diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestHoodieDemo.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestHoodieDemo.java new file mode 100644 index 000000000..ee3855a83 --- /dev/null +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestHoodieDemo.java @@ -0,0 
+1,271 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ; + +import com.google.common.collect.ImmutableList; +import org.apache.hudi.common.util.collection.Pair; +import java.util.List; +import org.junit.Test; + +/** + * Goes through steps described in https://hudi.incubator.apache.org/docker_demo.html + * + * To run this as a standalone test in the IDE or command line, first bring up the demo setup using + * `docker/setup_demo.sh` and then run the test class as you normally would. + */ +public class ITTestHoodieDemo extends ITTestBase { + + private static String HDFS_DATA_DIR = "/usr/hive/data/input"; + private static String HDFS_BATCH_PATH1 = HDFS_DATA_DIR + "/" + "batch_1.json"; + private static String HDFS_BATCH_PATH2 = HDFS_DATA_DIR + "/" + "batch_2.json"; + + private static String INPUT_BATCH_PATH1 = HOODIE_WS_ROOT + + "/docker/demo/data/batch_1.json"; + private static String INPUT_BATCH_PATH2 = HOODIE_WS_ROOT + + "/docker/demo/data/batch_2.json"; + + private static String COW_BASE_PATH = "/user/hive/warehouse/stock_ticks_cow"; + private static String MOR_BASE_PATH = "/user/hive/warehouse/stock_ticks_mor"; + private static String COW_TABLE_NAME = "stock_ticks_cow"; + private static String MOR_TABLE_NAME = "stock_ticks_mor"; + + private static String DEMO_CONTAINER_SCRIPT = HOODIE_WS_ROOT + "/docker/demo/setup_demo_container.sh"; + private static String MIN_COMMIT_TIME_SCRIPT = HOODIE_WS_ROOT + "/docker/demo/get_min_commit_time.sh"; + private static String HUDI_CLI_TOOL = HOODIE_WS_ROOT + "/hudi-cli/hudi-cli.sh"; + private static String COMPACTION_COMMANDS = HOODIE_WS_ROOT + "/docker/demo/compaction.commands"; + private static String SPARKSQL_BATCH1_COMMANDS = HOODIE_WS_ROOT + "/docker/demo/sparksql-batch1.commands"; + private static String SPARKSQL_BATCH2_COMMANDS = HOODIE_WS_ROOT + "/docker/demo/sparksql-batch2.commands"; + private static String SPARKSQL_INCREMENTAL_COMMANDS = HOODIE_WS_ROOT + "/docker/demo/sparksql-incremental.commands"; + private static String HIVE_TBLCHECK_COMMANDS = HOODIE_WS_ROOT + "/docker/demo/hive-table-check.commands"; + private static String HIVE_BATCH1_COMMANDS = HOODIE_WS_ROOT + "/docker/demo/hive-batch1.commands"; + private static String HIVE_BATCH2_COMMANDS = HOODIE_WS_ROOT + "/docker/demo/hive-batch2-after-compaction.commands"; + private static String HIVE_INCREMENTAL_COMMANDS = HOODIE_WS_ROOT + "/docker/demo/hive-incremental.commands"; + + + private static String HIVE_SYNC_CMD_FMT = " --enable-hive-sync " + + " --hoodie-conf hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000 " + + " --hoodie-conf hoodie.datasource.hive_sync.username=hive " + + " --hoodie-conf hoodie.datasource.hive_sync.password=hive " + + " --hoodie-conf
hoodie.datasource.hive_sync.partition_fields=%s " + + " --hoodie-conf hoodie.datasource.hive_sync.database=default " + + " --hoodie-conf hoodie.datasource.hive_sync.table=%s"; + + + @Test + public void testDemo() throws Exception { + setupDemo(); + + // batch 1 + ingestFirstBatchAndHiveSync(); + testHiveAfterFirstBatch(); + testSparkSQLAfterFirstBatch(); + + // batch 2 + ingestSecondBatchAndHiveSync(); + testHiveAfterSecondBatch(); + testSparkSQLAfterSecondBatch(); + testIncrementalHiveQuery(); + testIncrementalSparkSQLQuery(); + + // compaction + scheduleAndRunCompaction(); + testHiveAfterSecondBatchAfterCompaction(); + testIncrementalHiveQueryAfterCompaction(); + } + + private void setupDemo() throws Exception { + List cmds = new ImmutableList.Builder() + .add("hdfs dfsadmin -safemode wait") // handle NN going into safe mode at times + .add("hdfs dfs -mkdir -p " + HDFS_DATA_DIR) + .add("hdfs dfs -copyFromLocal -f " + INPUT_BATCH_PATH1 + " " + HDFS_BATCH_PATH1) + .add("/bin/bash " + DEMO_CONTAINER_SCRIPT) + .build(); + executeCommandStringsInDocker(ADHOC_1_CONTAINER, cmds); + } + + private void ingestFirstBatchAndHiveSync() throws Exception { + List cmds = new ImmutableList.Builder() + .add("spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer " + + HUDI_UTILITIES_BUNDLE + " --storage-type COPY_ON_WRITE " + + " --source-class org.apache.hudi.utilities.sources.JsonDFSSource --source-ordering-field ts " + + " --target-base-path " + COW_BASE_PATH + " --target-table " + COW_TABLE_NAME + + " --props /var/demo/config/dfs-source.properties " + + " --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider " + + String.format(HIVE_SYNC_CMD_FMT, "dt", COW_TABLE_NAME)) + .add("spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer " + + HUDI_UTILITIES_BUNDLE + " --storage-type MERGE_ON_READ " + + " --source-class org.apache.hudi.utilities.sources.JsonDFSSource --source-ordering-field ts " + + " --target-base-path " + MOR_BASE_PATH + " --target-table " + MOR_TABLE_NAME + + " --props /var/demo/config/dfs-source.properties " + + " --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider " + + " --disable-compaction " + + String.format(HIVE_SYNC_CMD_FMT, "dt", MOR_TABLE_NAME)) + .build(); + + executeCommandStringsInDocker(ADHOC_1_CONTAINER, cmds); + } + + private void testHiveAfterFirstBatch() throws Exception { + Pair stdOutErrPair = executeHiveCommandFile(HIVE_TBLCHECK_COMMANDS); + assertStdOutContains(stdOutErrPair, "| stock_ticks_cow |"); + assertStdOutContains(stdOutErrPair, "| stock_ticks_mor |"); + assertStdOutContains(stdOutErrPair, "| stock_ticks_mor_rt |"); + + assertStdOutContains(stdOutErrPair, + "| partition |\n" + + "+----------------+\n" + + "| dt=2018-08-31 |\n" + + "+----------------+\n", 3); + + stdOutErrPair = executeHiveCommandFile(HIVE_BATCH1_COMMANDS); + assertStdOutContains(stdOutErrPair, "| symbol | _c1 |\n" + + "+---------+----------------------+\n" + + "| GOOG | 2018-08-31 10:29:00 |\n", 3); + assertStdOutContains(stdOutErrPair, + "| symbol | ts | volume | open | close |\n" + + "+---------+----------------------+---------+------------+-----------+\n" + + "| GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |\n" + + "| GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 |\n", 3); + } + + private void testSparkSQLAfterFirstBatch() throws Exception { + Pair stdOutErrPair = executeSparkSQLCommand(SPARKSQL_BATCH1_COMMANDS, true); + assertStdOutContains(stdOutErrPair, + 
"|default |stock_ticks_cow |false |\n" + + "|default |stock_ticks_mor |false |\n" + + "|default |stock_ticks_mor_rt |false |"); + assertStdOutContains(stdOutErrPair, + "+------+-------------------+\n" + + "|GOOG |2018-08-31 10:29:00|\n" + + "+------+-------------------+", 3); + assertStdOutContains(stdOutErrPair, "|GOOG |2018-08-31 09:59:00|6330 |1230.5 |1230.02 |", 3); + assertStdOutContains(stdOutErrPair, "|GOOG |2018-08-31 10:29:00|3391 |1230.1899|1230.085|", 3); + } + + private void ingestSecondBatchAndHiveSync() throws Exception { + List cmds = new ImmutableList.Builder() + .add("hdfs dfs -copyFromLocal -f " + INPUT_BATCH_PATH2 + " " + HDFS_BATCH_PATH2) + .add("spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer " + + HUDI_UTILITIES_BUNDLE + " --storage-type COPY_ON_WRITE " + + " --source-class org.apache.hudi.utilities.sources.JsonDFSSource --source-ordering-field ts " + + " --target-base-path " + COW_BASE_PATH + " --target-table " + COW_TABLE_NAME + + " --props /var/demo/config/dfs-source.properties " + + " --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider " + + String.format(HIVE_SYNC_CMD_FMT, "dt", COW_TABLE_NAME)) + .add("spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer " + + HUDI_UTILITIES_BUNDLE + " --storage-type MERGE_ON_READ " + + " --source-class org.apache.hudi.utilities.sources.JsonDFSSource --source-ordering-field ts " + + " --target-base-path " + MOR_BASE_PATH + " --target-table " + MOR_TABLE_NAME + + " --props /var/demo/config/dfs-source.properties " + + " --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider " + + " --disable-compaction " + + String.format(HIVE_SYNC_CMD_FMT, "dt", MOR_TABLE_NAME)) + .build(); + executeCommandStringsInDocker(ADHOC_1_CONTAINER, cmds); + } + + private void testHiveAfterSecondBatch() throws Exception { + Pair stdOutErrPair = executeHiveCommandFile(HIVE_BATCH1_COMMANDS); + assertStdOutContains(stdOutErrPair, + "| symbol | _c1 |\n" + + "+---------+----------------------+\n" + + "| GOOG | 2018-08-31 10:29:00 |\n"); + assertStdOutContains(stdOutErrPair, + "| symbol | _c1 |\n" + + "+---------+----------------------+\n" + + "| GOOG | 2018-08-31 10:59:00 |\n", 2); + assertStdOutContains(stdOutErrPair, + "| symbol | ts | volume | open | close |\n" + + "+---------+----------------------+---------+------------+-----------+\n" + + "| GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |\n" + + "| GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 |\n"); + assertStdOutContains(stdOutErrPair, + "| symbol | ts | volume | open | close |\n" + + "+---------+----------------------+---------+------------+-----------+\n" + + "| GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |\n" + + "| GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |\n", 2); + } + + private void testHiveAfterSecondBatchAfterCompaction() throws Exception { + Pair stdOutErrPair = executeHiveCommandFile(HIVE_BATCH2_COMMANDS); + assertStdOutContains(stdOutErrPair, + "| symbol | _c1 |\n" + + "+---------+----------------------+\n" + + "| GOOG | 2018-08-31 10:59:00 |", 2); + assertStdOutContains(stdOutErrPair, "| symbol | ts | volume | open | close |\n" + + "+---------+----------------------+---------+------------+-----------+\n" + + "| GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |\n" + + "| GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |", 2); + } + + private void testSparkSQLAfterSecondBatch() throws Exception { + Pair stdOutErrPair = 
executeSparkSQLCommand(SPARKSQL_BATCH2_COMMANDS, true); + assertStdOutContains(stdOutErrPair, + "+------+-------------------+\n" + + "|GOOG |2018-08-31 10:59:00|\n" + + "+------+-------------------+", 2); + + assertStdOutContains(stdOutErrPair, "|GOOG |2018-08-31 09:59:00|6330 |1230.5 |1230.02 |", 3); + assertStdOutContains(stdOutErrPair, "|GOOG |2018-08-31 10:59:00|9021 |1227.1993|1227.215|", 2); + assertStdOutContains(stdOutErrPair, + "+------+-------------------+\n" + + "|GOOG |2018-08-31 10:29:00|\n" + + "+------+-------------------+"); + assertStdOutContains(stdOutErrPair, "|GOOG |2018-08-31 10:29:00|3391 |1230.1899|1230.085|"); + } + + private void testIncrementalHiveQuery() throws Exception { + String minCommitTime = executeCommandStringInDocker(ADHOC_2_CONTAINER, MIN_COMMIT_TIME_SCRIPT, true) + .getStdout().toString(); + Pair stdOutErrPair = executeHiveCommandFile(HIVE_INCREMENTAL_COMMANDS, + "min.commit.time=" + minCommitTime +"`"); + assertStdOutContains(stdOutErrPair, "| GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |"); + } + + private void testIncrementalHiveQueryAfterCompaction() throws Exception { + String minCommitTime = executeCommandStringInDocker(ADHOC_2_CONTAINER, MIN_COMMIT_TIME_SCRIPT, true) + .getStdout().toString(); + Pair stdOutErrPair = executeHiveCommandFile(HIVE_INCREMENTAL_COMMANDS, + "min.commit.time=" + minCommitTime +"`"); + assertStdOutContains(stdOutErrPair, + "| symbol | ts | volume | open | close |\n" + + "+---------+----------------------+---------+------------+-----------+\n" + + "| GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |"); + } + + private void testIncrementalSparkSQLQuery() throws Exception { + Pair stdOutErrPair = executeSparkSQLCommand(SPARKSQL_INCREMENTAL_COMMANDS, true); + assertStdOutContains(stdOutErrPair, "|GOOG |2018-08-31 10:59:00|9021 |1227.1993|1227.215|"); + assertStdOutContains(stdOutErrPair, + "|default |stock_ticks_cow |false |\n" + + "|default |stock_ticks_derived_mor |false |\n" + + "|default |stock_ticks_derived_mor_rt|false |\n" + + "|default |stock_ticks_mor |false |\n" + + "|default |stock_ticks_mor_rt |false |\n" + + "| |stock_ticks_cow_incr |true |"); + assertStdOutContains(stdOutErrPair, + "|count(1)|\n" + + "+--------+\n" + + "|99 |", 2); + } + + private void scheduleAndRunCompaction() throws Exception { + executeCommandStringInDocker(ADHOC_1_CONTAINER, HUDI_CLI_TOOL + " --cmdfile " + COMPACTION_COMMANDS, true); + } +} \ No newline at end of file diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestHoodieSanity.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestHoodieSanity.java index 89eb56960..c8e112624 100644 --- a/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestHoodieSanity.java +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestHoodieSanity.java @@ -18,7 +18,7 @@ package org.apache.hudi.integ; -import java.util.Arrays; +import org.apache.hudi.common.util.collection.Pair; import org.junit.Assert; import org.junit.Test; @@ -33,17 +33,6 @@ public class ITTestHoodieSanity extends ITTestBase { NON_PARTITIONED, } - @Test - public void testRunEcho() throws Exception { - String[] cmd = new String[]{"echo", "Happy Testing"}; - TestExecStartResultCallback callback = executeCommandInDocker(ADHOC_1_CONTAINER, - cmd, true); - String stdout = callback.getStdout().toString(); - String stderr = callback.getStderr().toString(); - LOG.info("Got output for (" + Arrays.toString(cmd) + ") :" + stdout); - LOG.info("Got error output for (" + 
Arrays.toString(cmd) + ") :" + stderr); - } - @Test /** * A basic integration test that runs HoodieJavaApp to create a sample COW Hoodie with single partition key @@ -53,6 +42,7 @@ public class ITTestHoodieSanity extends ITTestBase { public void testRunHoodieJavaAppOnSinglePartitionKeyCOWTable() throws Exception { String hiveTableName = "docker_hoodie_single_partition_key_cow_test"; testRunHoodieJavaAppOnCOWTable(hiveTableName, PartitionType.SINGLE_KEY_PARTITIONED); + executeHiveCommand("drop table if exists " + hiveTableName); } @Test @@ -64,6 +54,7 @@ public class ITTestHoodieSanity extends ITTestBase { public void testRunHoodieJavaAppOnMultiPartitionKeysCOWTable() throws Exception { String hiveTableName = "docker_hoodie_multi_partition_key_cow_test"; testRunHoodieJavaAppOnCOWTable(hiveTableName, PartitionType.MULTI_KEYS_PARTITIONED); + executeHiveCommand("drop table if exists " + hiveTableName); } @Test @@ -75,6 +66,7 @@ public class ITTestHoodieSanity extends ITTestBase { public void testRunHoodieJavaAppOnNonPartitionedCOWTable() throws Exception { String hiveTableName = "docker_hoodie_non_partition_key_cow_test"; testRunHoodieJavaAppOnCOWTable(hiveTableName, PartitionType.NON_PARTITIONED); + executeHiveCommand("drop table if exists " + hiveTableName); } /** @@ -89,109 +81,54 @@ public class ITTestHoodieSanity extends ITTestBase { String hdfsUrl = "hdfs://namenode" + hdfsPath; // Drop Table if it exists - { - String[] hiveDropCmd = getHiveConsoleCommand("drop table if exists " + hiveTableName); - executeCommandInDocker(HIVESERVER, hiveDropCmd, true); + String hiveDropCmd = "drop table if exists " + hiveTableName; + try { + executeHiveCommand(hiveDropCmd); + } catch (AssertionError ex) { + // In travis, sometimes, the hivemetastore is not ready even though we wait for the port to be up + // Workaround to sleep for 5 secs and retry + Thread.sleep(5000); + executeHiveCommand(hiveDropCmd); } // Ensure table does not exist - { - String[] hiveTableCheck = getHiveConsoleCommand("show tables like '" + hiveTableName + "'"); - TestExecStartResultCallback callback = - executeCommandInDocker(HIVESERVER, hiveTableCheck, true); - String stderr = callback.getStderr().toString(); - String stdout = callback.getStdout().toString(); - LOG.info("Got output for (" + Arrays.toString(hiveTableCheck) + ") :" + stdout); - LOG.info("Got error output for (" + Arrays.toString(hiveTableCheck) + ") :" + stderr); - Assert.assertTrue("Result :" + callback.getStdout().toString(), stdout.trim().isEmpty()); - } + Pair stdOutErr = executeHiveCommand("show tables like '" + hiveTableName + "'"); + Assert.assertTrue("Dropped table " + hiveTableName + " exists!", stdOutErr.getLeft().isEmpty()); // Run Hoodie Java App - { - String[] cmd = null; - if (partitionType == PartitionType.SINGLE_KEY_PARTITIONED) { - cmd = new String[]{ - HOODIE_JAVA_APP, - "--hive-sync", - "--table-path", hdfsUrl, - "--hive-url", HIVE_SERVER_JDBC_URL, - "--hive-table", hiveTableName - }; - } else if (partitionType == PartitionType.MULTI_KEYS_PARTITIONED) { - cmd = new String[]{ - HOODIE_JAVA_APP, - "--hive-sync", - "--table-path", hdfsUrl, - "--hive-url", HIVE_SERVER_JDBC_URL, - "--use-multi-partition-keys", - "--hive-table", hiveTableName - }; - } else { - cmd = new String[]{ - HOODIE_JAVA_APP, - "--hive-sync", - "--table-path", hdfsUrl, - "--hive-url", HIVE_SERVER_JDBC_URL, - "--non-partitioned", - "--hive-table", hiveTableName - }; - } - TestExecStartResultCallback callback = executeCommandInDocker(ADHOC_1_CONTAINER, - cmd, true); - String stdout 
= callback.getStdout().toString().trim(); - String stderr = callback.getStderr().toString().trim(); - LOG.info("Got output for (" + Arrays.toString(cmd) + ") :" + stdout); - LOG.info("Got error output for (" + Arrays.toString(cmd) + ") :" + stderr); + String cmd; + if (partitionType == PartitionType.SINGLE_KEY_PARTITIONED) { + cmd = HOODIE_JAVA_APP + " --hive-sync --table-path " + hdfsUrl + + " --hive-url " + HIVE_SERVER_JDBC_URL + " --hive-table " + hiveTableName; + } else if (partitionType == PartitionType.MULTI_KEYS_PARTITIONED) { + cmd = HOODIE_JAVA_APP + " --hive-sync --table-path " + hdfsUrl + + " --hive-url " + HIVE_SERVER_JDBC_URL + " --hive-table " + hiveTableName + + " --use-multi-partition-keys"; + } else { + cmd = HOODIE_JAVA_APP + " --hive-sync --table-path " + hdfsUrl + + " --hive-url " + HIVE_SERVER_JDBC_URL + " --hive-table " + hiveTableName + + " --non-partitioned"; } + executeCommandStringInDocker(ADHOC_1_CONTAINER, cmd, true); // Ensure table does exist - { - String[] hiveTableCheck = getHiveConsoleCommand("show tables like '" + hiveTableName + "'"); - TestExecStartResultCallback callback = - executeCommandInDocker(HIVESERVER, hiveTableCheck, true); - String stderr = callback.getStderr().toString().trim(); - String stdout = callback.getStdout().toString().trim(); - LOG.info("Got output for (" + Arrays.toString(hiveTableCheck) + ") : (" + stdout + ")"); - LOG.info("Got error output for (" + Arrays.toString(hiveTableCheck) + ") : (" + stderr + ")"); - Assert.assertEquals("Table exists", hiveTableName, stdout); - } + stdOutErr = executeHiveCommand("show tables like '" + hiveTableName + "'"); + Assert.assertEquals("Table exists", hiveTableName, stdOutErr.getLeft()); // Ensure row count is 100 (without duplicates) - { - String[] hiveTableCheck = getHiveConsoleCommand("select count(1) from " + hiveTableName); - TestExecStartResultCallback callback = - executeCommandInDocker(ADHOC_1_CONTAINER, hiveTableCheck, true); - String stderr = callback.getStderr().toString().trim(); - String stdout = callback.getStdout().toString().trim(); - LOG.info("Got output for (" + Arrays.toString(hiveTableCheck) + ") : (" + stdout + ")"); - LOG.info("Got error output for (" + Arrays.toString(hiveTableCheck) + ") : (" + stderr + ")"); - Assert.assertEquals("Expecting 100 rows to be present in the new table", 100, - Integer.parseInt(stdout.trim())); - } + stdOutErr = executeHiveCommand("select count(1) from " + hiveTableName); + Assert.assertEquals("Expecting 100 rows to be present in the new table", 100, + Integer.parseInt(stdOutErr.getLeft().trim())); // Make the HDFS dataset non-hoodie and run the same query // Checks for interoperability with non-hoodie tables - { - // Delete Hoodie directory to make it non-hoodie dataset - String[] cmd = new String[]{ - "hadoop", "fs", "-rm", "-r", hdfsPath + "/.hoodie" - }; - TestExecStartResultCallback callback = - executeCommandInDocker(ADHOC_1_CONTAINER, cmd, true); - String stderr = callback.getStderr().toString().trim(); - String stdout = callback.getStdout().toString().trim(); - LOG.info("Got output for (" + Arrays.toString(cmd) + ") : (" + stdout + ")"); - LOG.info("Got error output for (" + Arrays.toString(cmd) + ") : (" + stderr + ")"); - // Run the count query again. Without Hoodie, all versions are included. 
So we get a wrong count - String[] hiveTableCheck = getHiveConsoleCommand("select count(1) from " + hiveTableName); - callback = executeCommandInDocker(ADHOC_1_CONTAINER, hiveTableCheck, true); - stderr = callback.getStderr().toString().trim(); - stdout = callback.getStdout().toString().trim(); - LOG.info("Got output for (" + Arrays.toString(hiveTableCheck) + ") : (" + stdout + ")"); - LOG.info("Got error output for (" + Arrays.toString(hiveTableCheck) + ") : (" + stderr + ")"); - Assert.assertEquals("Expecting 200 rows to be present in the new table", 200, - Integer.parseInt(stdout.trim())); - } + // Delete Hoodie directory to make it non-hoodie dataset + executeCommandStringInDocker(ADHOC_1_CONTAINER, "hdfs dfs -rm -r " + hdfsPath + "/.hoodie", true); + + // Run the count query again. Without Hoodie, all versions are included. So we get a wrong count + stdOutErr = executeHiveCommand("select count(1) from " + hiveTableName); + Assert.assertEquals("Expecting 200 rows to be present in the new table", 200, + Integer.parseInt(stdOutErr.getLeft().trim())); } } diff --git a/hudi-spark/pom.xml b/hudi-spark/pom.xml index 2ca54201d..60686f65a 100644 --- a/hudi-spark/pom.xml +++ b/hudi-spark/pom.xml @@ -23,13 +23,10 @@ 4.0.0 - org.apache.hudi hudi-spark jar - 1.2.17 - 4.10 ${project.basedir}/src/main/resources/META-INF @@ -52,12 +49,11 @@ net.alchim31.maven scala-maven-plugin - 3.3.1 + ${scala-maven-plugin.version} org.apache.maven.plugins maven-compiler-plugin - 2.0.2 @@ -156,95 +152,14 @@ + org.scala-lang scala-library ${scala.version} - - org.scalatest - scalatest_2.11 - 3.0.1 - test - - - org.apache.spark - spark-core_2.11 - - - org.apache.spark - spark-sql_2.11 - - - com.databricks - spark-avro_2.11 - 4.0.0 - - - com.fasterxml.jackson.core - jackson-annotations - - - com.fasterxml.jackson.module - jackson-module-scala_2.11 - - - org.apache.hadoop - hadoop-client - - - javax.servlet - * - - - provided - - - - org.apache.hadoop - hadoop-common - provided - - - - log4j - log4j - ${log4j.version} - - - org.apache.avro - avro - - - - org.apache.commons - commons-configuration2 - - - - ${hive.groupid} - hive-service - ${hive.version} - - - - ${hive.groupid} - hive-jdbc - ${hive.version} - - - - ${hive.groupid} - hive-metastore - ${hive.version} - - - - ${hive.groupid} - hive-common - ${hive.version} - + org.apache.hudi hudi-client @@ -265,12 +180,93 @@ hudi-hive ${project.version} + + - junit - junit-dep - ${junit.version} - test + log4j + log4j + + + + com.fasterxml.jackson.core + jackson-annotations + + + com.fasterxml.jackson.module + jackson-module-scala_2.11 + + + + + org.apache.avro + avro + + + + + org.apache.spark + spark-core_2.11 + + + org.apache.spark + spark-sql_2.11 + + + + + com.databricks + spark-avro_2.11 + 4.0.0 + + + + + org.apache.commons + commons-configuration2 + + + + + org.apache.hadoop + hadoop-client + + + javax.servlet + * + + + provided + + + org.apache.hadoop + hadoop-common + provided + + + + + ${hive.groupid} + hive-service + ${hive.version} + + + ${hive.groupid} + hive-jdbc + ${hive.version} + + + ${hive.groupid} + hive-metastore + ${hive.version} + + + ${hive.groupid} + hive-common + ${hive.version} + + + org.apache.hudi hudi-client @@ -287,5 +283,19 @@ test-jar test + + + org.scalatest + scalatest_2.11 + ${scalatest.version} + test + + + + junit + junit-dep + ${junit-dep.version} + test + diff --git a/hudi-timeline-service/pom.xml b/hudi-timeline-service/pom.xml index f15833880..353a40f6f 100644 --- a/hudi-timeline-service/pom.xml +++ b/hudi-timeline-service/pom.xml 
@@ -121,40 +121,57 @@ - - io.javalin - javalin - 2.8.0 - - - org.apache.httpcomponents - fluent-hc - 4.3.2 - + org.apache.hudi hudi-common ${project.version} + + + + log4j + log4j + + + com.fasterxml.jackson.core jackson-annotations - 2.9.7 com.fasterxml.jackson.core jackson-core - 2.9.7 com.fasterxml.jackson.core jackson-databind - 2.9.7 + + + + org.apache.httpcomponents + fluent-hc + + + + io.javalin + javalin + 2.8.0 + + + + com.beust + jcommander + 1.48 + + org.rocksdb rocksdbjni + + org.apache.hadoop hadoop-hdfs @@ -194,25 +211,6 @@ - - org.apache.hudi - hudi-common - ${project.version} - tests - test-jar - test - - - com.beust - jcommander - 1.48 - - - - - log4j - log4j - org.apache.hadoop hadoop-client @@ -223,20 +221,34 @@ + + + + org.apache.hudi + hudi-common + ${project.version} + tests + test-jar + test + + + + junit + junit + test + + + + org.mockito + mockito-all + test + + com.esotericsoftware kryo test - - org.mockito - mockito-all - 1.10.19 - test - - - junit - junit - + diff --git a/hudi-utilities/pom.xml b/hudi-utilities/pom.xml index a571345b2..612afdb63 100644 --- a/hudi-utilities/pom.xml +++ b/hudi-utilities/pom.xml @@ -76,6 +76,124 @@ + + + + org.eclipse.jetty.aggregate + jetty-all + 7.6.0.v20120127 + + + + org.eclipse.jetty + jetty-server + 7.6.0.v20120127 + + + + + org.apache.hudi + hudi-client + ${project.version} + + + org.apache.hudi + hudi-common + ${project.version} + + + org.apache.hudi + hudi-hive + ${project.version} + + + javax.servlet + servlet-api + + + + + org.apache.hudi + hudi-spark + ${project.version} + + + javax.servlet + servlet-api + + + + + + + log4j + log4j + + + org.slf4j + slf4j-api + + + + + com.fasterxml.jackson.module + jackson-module-scala_2.11 + + + + + org.apache.avro + avro-mapred + + + + + org.apache.parquet + parquet-avro + + + org.apache.parquet + parquet-hadoop + + + + + org.apache.spark + spark-core_2.11 + + + javax.servlet + * + + + + + org.apache.spark + spark-sql_2.11 + + + javax.servlet + * + + + + + org.apache.spark + spark-streaming_2.11 + ${spark.version} + + + org.apache.spark + spark-streaming-kafka-0-8_2.11 + ${spark.version} + + + + + io.dropwizard.metrics + metrics-core + + io.javalin javalin @@ -89,43 +207,83 @@ - io.dropwizard.metrics + com.yammer.metrics metrics-core + 2.2.0 + + + + + org.antlr + stringtemplate + 4.0.2 - com.fasterxml.jackson.module - jackson-module-scala_2.11 + com.beust + jcommander - org.apache.hudi - hudi-common - ${project.version} + com.twitter + bijection-avro_2.11 + 0.9.2 + - org.apache.hudi - hudi-common - ${project.version} - tests - test-jar - test + io.confluent + kafka-avro-serializer + 3.0.0 + + + io.confluent + common-config + 3.0.0 + + + io.confluent + common-utils + 3.0.0 + + + io.confluent + kafka-schema-registry-client + 3.0.0 + - org.apache.hudi - hudi-hive - ${project.version} - tests - test-jar - test + commons-codec + commons-codec + + + commons-dbcp + commons-dbcp + + + commons-lang + commons-lang + + + commons-pool + commons-pool + - org.apache.hudi - hudi-spark - ${project.version} + org.apache.httpcomponents + httpcore + + + + + org.apache.hadoop + hadoop-client + + + org.apache.hadoop + hadoop-mapreduce-client-common javax.servlet @@ -133,13 +291,6 @@ - - - org.eclipse.jetty - jetty-server - 7.6.0.v20120127 - - org.apache.hadoop hadoop-hdfs @@ -165,6 +316,7 @@ + ${hive.groupid} hive-jdbc @@ -180,38 +332,13 @@ - - - ${hive.groupid} - hive-exec - ${hive.version} - test - - ${hive.groupid} hive-service ${hive.version} - - org.apache.hudi - hudi-hive - ${project.version} - - - 
javax.servlet - servlet-api - - - - - - org.apache.hudi - hudi-client - ${project.version} - - + org.apache.hudi hudi-client @@ -220,163 +347,35 @@ test-jar test - - commons-codec - commons-codec + org.apache.hudi + hudi-common + ${project.version} + tests + test-jar + test - commons-dbcp - commons-dbcp - - - commons-lang - commons-lang - - - commons-pool - commons-pool - - - org.apache.httpcomponents - httpcore + org.apache.hudi + hudi-hive + ${project.version} + tests + test-jar + test + - log4j - log4j - - - org.slf4j - slf4j-api - - - - org.apache.hadoop - hadoop-mapreduce-client-common - - - javax.servlet - servlet-api - - - - - - org.apache.hadoop - hadoop-client - - - - org.apache.spark - spark-core_2.11 - - - javax.servlet - * - - - - - - org.apache.spark - spark-sql_2.11 - - - javax.servlet - * - - - - - - com.yammer.metrics - metrics-core - 2.2.0 - - - - org.apache.spark - spark-streaming_2.11 - ${spark.version} - provided - - - - org.apache.spark - spark-streaming-kafka-0-8_2.11 - ${spark.version} - - - - - org.antlr - stringtemplate - 4.0.2 - - - - com.beust - jcommander + ${hive.groupid} + hive-exec + ${hive.version} + test org.mockito mockito-all - 1.10.19 test - - org.apache.avro - avro-mapred - 1.7.7 - - - - org.apache.parquet - parquet-avro - - - - org.apache.parquet - parquet-hadoop - - - - com.twitter - bijection-avro_2.11 - 0.9.2 - - - - io.confluent - kafka-avro-serializer - 3.0.0 - - - - io.confluent - common-config - 3.0.0 - - - - io.confluent - common-utils - 3.0.0 - - - - io.confluent - kafka-schema-registry-client - 3.0.0 - - - - org.eclipse.jetty.aggregate - jetty-all - test - - diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamer.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamer.java index cff6a8bfd..434638875 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamer.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamer.java @@ -166,7 +166,7 @@ public class HoodieDeltaStreamer implements Serializable { public String propsFilePath = "file://" + System.getProperty("user.dir") + "/src/test/resources/delta-streamer-config/dfs-source.properties"; - @Parameter(names = {"--hudi-conf"}, description = "Any configuration that can be set in the properties file " + @Parameter(names = {"--hoodie-conf"}, description = "Any configuration that can be set in the properties file " + "(using the CLI parameter \"--propsFilePath\") can also be passed command line using this parameter") public List configs = new ArrayList<>(); diff --git a/packaging/hudi-hadoop-mr-bundle/pom.xml b/packaging/hudi-hadoop-mr-bundle/pom.xml index 64a3bbff2..1b68296b7 100644 --- a/packaging/hudi-hadoop-mr-bundle/pom.xml +++ b/packaging/hudi-hadoop-mr-bundle/pom.xml @@ -27,45 +27,90 @@ hudi-hadoop-mr-bundle + org.apache.hudi hudi-common ${project.version} - org.apache.hudi hudi-hadoop-mr ${project.version} - + org.apache.hudi * + + + org.apache.avro + avro + + + + + org.apache.parquet + parquet-avro + + + + + com.twitter + parquet-avro + + + com.twitter + parquet-hadoop-bundle + + + + + commons-logging + commons-logging + + + commons-io + commons-io + + + commons-codec + commons-codec + + + + com.twitter.common + objectsize + 0.0.12 + + + org.apache.hadoop hadoop-common - org.apache.hadoop hadoop-mapreduce-client-core - org.apache.hadoop hadoop-mapreduce-client-common - org.apache.hadoop hadoop-auth + + org.apache.hadoop + 
hadoop-hdfs + + ${hive.groupid} hive-jdbc @@ -92,7 +137,7 @@ hive-shims ${hive.version} - + ${hive.groupid} hive-serde ${hive.version} @@ -109,49 +154,9 @@ - org.apache.hadoop - hadoop-hdfs - - - - commons-logging - commons-logging - - - - commons-io - commons-io - - - - commons-codec - commons-codec - - - - org.apache.parquet - parquet-avro - - - - com.twitter - parquet-avro - - - - com.twitter - parquet-hadoop-bundle - - - - com.twitter.common - objectsize - 0.0.12 - - - - org.apache.avro - avro + com.esotericsoftware + kryo + test @@ -159,6 +164,7 @@ junit test + diff --git a/packaging/hudi-hive-bundle/pom.xml b/packaging/hudi-hive-bundle/pom.xml index 4565b39bc..29594f4fe 100644 --- a/packaging/hudi-hive-bundle/pom.xml +++ b/packaging/hudi-hive-bundle/pom.xml @@ -28,71 +28,28 @@ jar + - org.apache.hadoop - hadoop-common + org.apache.hudi + hudi-common + ${project.version} - org.apache.hadoop - hadoop-client + org.apache.hudi + hudi-hadoop-mr-bundle + ${project.version} - org.apache.hadoop - hadoop-hdfs - - - org.apache.hadoop - hadoop-auth - - - ${hive.groupid} - hive-service - ${hive.version} - - - ${hive.groupid} - hive-jdbc - ${hive.version} - - - ${hive.groupid} - hive-metastore - ${hive.version} - - - ${hive.groupid} - hive-common - ${hive.version} - - - com.google.guava - guava - - - org.apache.thrift - libthrift - ${thrift.version} - - - org.apache.thrift - libfb303 - 0.9.3 - - - - joda-time - joda-time - - - - - commons-dbcp - commons-dbcp - - - - commons-io - commons-io + org.apache.hudi + hudi-hive + ${project.version} + + + + org.apache.hudi + * + + @@ -105,43 +62,93 @@ slf4j-log4j12 - - com.beust - jcommander - - - - org.apache.httpcomponents - httpcore - - - - org.apache.httpcomponents - httpclient - - + com.twitter parquet-avro + - org.apache.hudi - hudi-hadoop-mr-bundle - ${project.version} + org.apache.thrift + libthrift + ${thrift.version} - - org.apache.hudi - hudi-hive - ${project.version} - - - - org.apache.hudi - * - - + org.apache.thrift + libfb303 + 0.9.3 + + + + com.google.guava + guava + + + + joda-time + joda-time + + + + com.beust + jcommander + + + + + commons-dbcp + commons-dbcp + + + commons-io + commons-io + + + + + org.apache.httpcomponents + httpcore + + + org.apache.httpcomponents + httpclient + + + + + org.apache.hadoop + hadoop-client + + + org.apache.hadoop + hadoop-common + + + org.apache.hadoop + hadoop-hdfs + + + org.apache.hadoop + hadoop-auth + + + + + ${hive.groupid} + hive-service + + + ${hive.groupid} + hive-jdbc + + + ${hive.groupid} + hive-metastore + + + ${hive.groupid} + hive-common diff --git a/packaging/hudi-presto-bundle/pom.xml b/packaging/hudi-presto-bundle/pom.xml index b9b3a38fa..54b966d32 100644 --- a/packaging/hudi-presto-bundle/pom.xml +++ b/packaging/hudi-presto-bundle/pom.xml @@ -28,46 +28,16 @@ jar + - org.apache.hadoop - hadoop-common + org.apache.hudi + hudi-common + ${project.version} - org.apache.hadoop - hadoop-client - - - org.apache.hadoop - hadoop-hdfs - - - org.apache.hadoop - hadoop-auth - - - com.google.guava - guava - - - org.apache.thrift - libthrift - ${thrift.version} - - - - joda-time - joda-time - - - - - commons-dbcp - commons-dbcp - - - - commons-io - commons-io + org.apache.hudi + hudi-hadoop-mr-bundle + ${project.version} @@ -80,30 +50,70 @@ slf4j-log4j12 - - com.beust - jcommander - - - - org.apache.httpcomponents - httpcore - - - - org.apache.httpcomponents - httpclient - - + com.twitter parquet-avro + - org.apache.hudi - hudi-hadoop-mr-bundle - ${project.version} + org.apache.thrift + libthrift + 
${thrift.version} + + + + joda-time + joda-time + + + + com.google.guava + guava + + + + + commons-dbcp + commons-dbcp + + + commons-io + commons-io + + + + com.beust + jcommander + + + + + org.apache.httpcomponents + httpcore + + + org.apache.httpcomponents + httpclient + + + + + org.apache.hadoop + hadoop-client + + + org.apache.hadoop + hadoop-common + + + org.apache.hadoop + hadoop-hdfs + + + org.apache.hadoop + hadoop-auth diff --git a/packaging/hudi-spark-bundle/pom.xml b/packaging/hudi-spark-bundle/pom.xml index 304cea40d..f46b94b17 100644 --- a/packaging/hudi-spark-bundle/pom.xml +++ b/packaging/hudi-spark-bundle/pom.xml @@ -127,6 +127,8 @@ parquet.schema org.apache.hudi.parquet.schema + + com.esotericsoftware.kryo. org.apache.hudi.com.esotericsoftware.kryo. @@ -177,9 +184,8 @@ org.apache.derby:derby org.apache.hadoop:* org.apache.hbase:* - + org.apache.hive:hive-exec - org.apache.hive:hive-serde org.apache.hive:hive-shims org.apache.spark:* @@ -206,102 +212,18 @@ - - com.beust - jcommander - - - commons-dbcp - commons-dbcp - - - org.apache.avro - avro - - - commons-codec - commons-codec - + org.scala-lang scala-library ${scala.version} + + - org.scalatest - scalatest_2.11 - 3.0.1 - test - - - org.apache.spark - spark-core_2.11 - - - org.apache.spark - spark-sql_2.11 - - - com.databricks - spark-avro_2.11 - 4.0.0 - - - com.fasterxml.jackson.core - jackson-annotations - - - org.apache.hadoop - hadoop-client - - - javax.servlet - * - - - provided - - - org.apache.hadoop - hadoop-common - provided - - - log4j - log4j - ${log4j.version} - - - org.apache.avro - avro - - - ${hive.groupid} - hive-service - ${hive.version} - compile - - - ${hive.groupid} - hive-jdbc - ${hive.version} - compile - - - ${hive.groupid} - hive-metastore - ${hive.version} - compile - - - ${hive.groupid} - hive-common - ${hive.version} - compile - - - org.apache.commons - commons-configuration2 + org.apache.hudi + hudi-client + ${project.version} org.apache.hudi @@ -318,16 +240,141 @@ hudi-hive ${project.version} - - org.apache.hudi - hudi-client - ${project.version} - org.apache.hudi hudi-spark ${project.version} - + + + + + log4j + log4j + ${log4j.version} + + + + + com.fasterxml.jackson.core + jackson-annotations + + + + + org.apache.avro + avro + + + + + org.apache.spark + spark-core_2.11 + + + org.apache.spark + spark-sql_2.11 + + + + + com.databricks + spark-avro_2.11 + 4.0.0 + + + + com.beust + jcommander + + + + + commons-codec + commons-codec + + + commons-dbcp + commons-dbcp + + + org.apache.commons + commons-configuration2 + + + + + org.apache.hadoop + hadoop-client + + + javax.servlet + * + + + provided + + + org.apache.hadoop + hadoop-common + provided + + + + + ${hive.groupid} + hive-jdbc + ${hive.version} + standalone + + + org.slf4j + slf4j-api + + + javax.servlet + servlet-api + + + + + + ${hive.groupid} + hive-service + ${hive.version} + compile + + + ${hive.groupid} + hive-jdbc + ${hive.version} + compile + + + ${hive.groupid} + hive-serde + ${hive.version} + compile + + + ${hive.groupid} + hive-metastore + ${hive.version} + compile + + + ${hive.groupid} + hive-common + ${hive.version} + compile + + + + + org.scalatest + scalatest_2.11 + ${scalatest.version} + test + diff --git a/packaging/hudi-utilities-bundle/pom.xml b/packaging/hudi-utilities-bundle/pom.xml index c8dc51890..530b12fba 100644 --- a/packaging/hudi-utilities-bundle/pom.xml +++ b/packaging/hudi-utilities-bundle/pom.xml @@ -28,8 +28,6 @@ jar - 1.2.17 - 4.10 true ${project.basedir}/src/main/resources/META-INF HUDI_NOTICE.txt @@ 
-205,6 +203,98 @@ + + + org.apache.hudi + hudi-client + ${project.version} + + + org.apache.hudi + hudi-common + ${project.version} + + + org.apache.hudi + hudi-hive + ${project.version} + + + javax.servlet + servlet-api + + + + + org.apache.hudi + hudi-spark + ${project.version} + + + org.apache.hudi + hudi-utilities + ${project.version} + + + + + log4j + log4j + + + org.slf4j + slf4j-api + + + + + com.fasterxml.jackson.module + jackson-module-scala_2.11 + + + + + org.apache.avro + avro-mapred + + + + + org.apache.parquet + parquet-avro + + + org.apache.parquet + parquet-hadoop + + + + + org.apache.spark + spark-core_2.11 + + + org.apache.spark + spark-sql_2.11 + + + org.apache.spark + spark-streaming_2.11 + ${spark.version} + provided + + + org.apache.spark + spark-streaming-kafka-0-8_2.11 + ${spark.version} + + + + + io.dropwizard.metrics + metrics-core + + io.javalin javalin @@ -212,21 +302,114 @@ - io.dropwizard.metrics + com.yammer.metrics metrics-core + 2.2.0 + + + + + org.antlr + stringtemplate + 4.0.2 - com.fasterxml.jackson.module - jackson-module-scala_2.11 + com.beust + jcommander + + com.twitter + bijection-avro_2.11 + 0.9.2 + + + + + io.confluent + kafka-avro-serializer + 3.0.0 + + + io.confluent + common-config + 3.0.0 + + + io.confluent + common-utils + 3.0.0 + + + io.confluent + kafka-schema-registry-client + 3.0.0 + + + + + commons-codec + commons-codec + + + commons-dbcp + commons-dbcp + + + commons-pool + commons-pool + + + + + org.apache.httpcomponents + httpcore + + + + + org.apache.hadoop + hadoop-client + + + org.apache.hadoop + hadoop-mapreduce-client-common + + + javax.servlet + servlet-api + + + + + + + ${hive.groupid} + hive-jdbc + ${hive.version} + standalone + + + org.slf4j + slf4j-api + + + javax.servlet + servlet-api + + + + + org.apache.hudi - hudi-common + hudi-client ${project.version} + tests + test-jar + test - org.apache.hudi hudi-common @@ -235,7 +418,6 @@ test-jar test - org.apache.hudi hudi-hive @@ -245,17 +427,7 @@ test - - org.apache.hudi - hudi-spark - ${project.version} - - - - org.apache.hadoop - hadoop-hdfs - tests - + org.apache.hadoop hadoop-common @@ -275,7 +447,13 @@ + + org.apache.hadoop + hadoop-hdfs + tests + + ${hive.groupid} hive-exec @@ -283,191 +461,11 @@ test - - ${hive.groupid} - hive-jdbc - ${hive.version} - standalone - - - org.slf4j - slf4j-api - - - javax.servlet - servlet-api - - - - - - org.apache.hudi - hudi-hive - ${project.version} - - - javax.servlet - servlet-api - - - - - - org.apache.hudi - hudi-client - ${project.version} - - - - org.apache.hudi - hudi-utilities - ${project.version} - - - - org.apache.hudi - hudi-client - ${project.version} - tests - test-jar - test - - - - commons-codec - commons-codec - - - commons-dbcp - commons-dbcp - - - commons-pool - commons-pool - - - - org.apache.httpcomponents - httpcore - - - - log4j - log4j - - - org.slf4j - slf4j-api - - - - org.apache.hadoop - hadoop-mapreduce-client-common - - - javax.servlet - servlet-api - - - - - - org.apache.hadoop - hadoop-client - - - - org.apache.spark - spark-core_2.11 - - - - org.apache.spark - spark-sql_2.11 - - - - com.yammer.metrics - metrics-core - 2.2.0 - - - - org.apache.spark - spark-streaming_2.11 - ${spark.version} - provided - - - - org.apache.spark - spark-streaming-kafka-0-8_2.11 - ${spark.version} - - - - - org.antlr - stringtemplate - 4.0.2 - - - - com.beust - jcommander - - org.mockito mockito-all - 1.10.19 test - - org.apache.avro - avro-mapred - 1.7.7 - - - - org.apache.parquet - parquet-avro - - - - org.apache.parquet - 
parquet-hadoop - - - - com.twitter - bijection-avro_2.11 - 0.9.2 - - - - io.confluent - kafka-avro-serializer - 3.0.0 - - - - io.confluent - common-config - 3.0.0 - - - - io.confluent - common-utils - 3.0.0 - - - - io.confluent - kafka-schema-registry-client - 3.0.0 - diff --git a/pom.xml b/pom.xml index 5671d11b8..45b10b600 100644 --- a/pom.xml +++ b/pom.xml @@ -24,11 +24,9 @@ hudi pom 0.5.0-SNAPSHOT - Hoodie is a Apache Spark library that provides the ability to efficiently do - incremental processing on datasets in HDFS - - https://github.com/uber/hudi - Hoodie + Apache Hudi brings stream style processing on big data + https://github.com/apache/incubator-hudi + Hudi hudi-common @@ -58,15 +56,15 @@ - Uber Technologies Inc. - http://www.uber.com/ + The Apache Software Foundation + https://www.apache.org vinothchandar Vinoth Chandar - Uber + Confluent Inc prasannarajaperumal @@ -129,10 +127,13 @@ 2.19.1 3.8.1 2.6.7 + 2.17 1.8.1 4.11 - 1.9.5 + 4.10 + 1.10.19 1.2.17 + 1.7.5 2.9.9 2.7.3 org.apache.hive @@ -142,12 +143,17 @@ 1.7.7 2.11.8 2.11 + 3.3.1 + 3.0.1 file://${project.basedir}/src/test/resources/log4j-surefire.properties 0.12.0 1.2.3 1.9.13 ${project.basedir} NOTICE.txt + false + ${skipTests} + ${skipTests} @@ -232,32 +238,32 @@ - org.scalastyle - scalastyle-maven-plugin - 1.0.0 - - false - true - true - false - ${project.basedir}/src/main/scala - ${project.basedir}/src/test/scala - style/scalastyle-config.xml - UTF-8 - - - - compile - - check - - - + org.scalastyle + scalastyle-maven-plugin + 1.0.0 + + false + true + true + false + ${project.basedir}/src/main/scala + ${project.basedir}/src/test/scala + style/scalastyle-config.xml + UTF-8 + + + + compile + + check + + + org.apache.maven.plugins maven-compiler-plugin - 3.8.1 + ${maven-compiler-plugin.version} 1.8 1.8 @@ -292,6 +298,7 @@ maven-surefire-plugin ${maven-surefire-plugin.version} + ${skipUTs} ${surefireArgLine} @@ -452,18 +459,152 @@ + - com.google.code.gson - gson - 2.3.1 - test + log4j + log4j + ${log4j.version} + + + org.slf4j + slf4j-api + ${slf4j.version} + + + org.slf4j + slf4j-log4j12 + ${slf4j.version} + - junit - junit - ${junit.version} + com.fasterxml.jackson.core + jackson-annotations + ${fasterxml.version} + + + com.fasterxml.jackson.core + jackson-core + ${fasterxml.version} + + + com.fasterxml.jackson.core + jackson-databind + ${fasterxml.version}.1 + + + com.fasterxml.jackson.datatype + jackson-datatype-guava + ${fasterxml.version} + + + com.fasterxml.jackson.module + jackson-module-scala_2.11 + ${fasterxml.version} + + + + + org.glassfish.jersey.core + jersey-server + ${glassfish.version} + + + org.glassfish.jersey.connectors + jersey-apache-connector + ${glassfish.version} + + + org.glassfish.jersey.containers + jersey-container-servlet-core + ${glassfish.version} + + + + org.eclipse.jetty.aggregate + jetty-all test + 7.6.0.v20120127 + + + + + org.apache.avro + avro + ${avro.version} + + + org.apache.avro + avro-mapred + ${avro.version} + + + + + org.apache.parquet + parquet-avro + ${parquet.version} + + + org.apache.parquet + parquet-hadoop + ${parquet.version} + + + org.apache.parquet + parquet-hive-bundle + ${parquet.version} + + + + + + com.twitter + parquet-hadoop-bundle + 1.6.0 + + + com.twitter + parquet-hive-bundle + 1.6.0 + + + com.twitter + parquet-avro + 1.6.0 + + + + + org.apache.spark + spark-core_2.11 + ${spark.version} + provided + + + org.apache.spark + spark-sql_2.11 + ${spark.version} + provided + + + + + com.databricks + spark-avro_2.11 + 4.0.0 + + + + + io.dropwizard.metrics + 
metrics-graphite + ${metrics.version} + + + io.dropwizard.metrics + metrics-core + ${metrics.version} @@ -473,28 +614,119 @@ - log4j - log4j - ${log4j.version} - - - - joda-time joda-time ${joda.version} + + + com.google.guava + guava + 15.0 + + + + xerces + xercesImpl + 2.9.1 + + + + xalan + xalan + 2.7.1 + + + + org.rocksdb + rocksdbjni + 5.17.2 + + + + + commons-codec + commons-codec + 1.4 + + + commons-io + commons-io + 2.6 + + + commons-lang + commons-lang + 2.6 + + + commons-logging + commons-logging + 1.2 + + + commons-dbcp + commons-dbcp + 1.4 + + + commons-pool + commons-pool + 1.4 + + + org.apache.commons + commons-configuration2 + 2.1.1 + + + + + org.apache.httpcomponents + fluent-hc + 4.3.2 + + + org.apache.httpcomponents + httpcore + 4.3.2 + + + org.apache.httpcomponents + httpclient + 4.3.2 + + + + + org.codehaus.jackson + jackson-core-asl + ${codehaus-jackson.version} + + + org.codehaus.jackson + jackson-mapper-asl + ${codehaus-jackson.version} + + + org.codehaus.jackson + jackson-jaxrs + ${codehaus-jackson.version} + + + org.codehaus.jackson + jackson-xc + ${codehaus-jackson.version} + + + org.apache.hadoop hadoop-client ${hadoop.version} provided - - com.fasterxml.jackson.* - * - javax.servlet servlet-api @@ -505,48 +737,11 @@ - - - org.apache.parquet - parquet-avro - ${parquet.version} - - - - org.apache.parquet - parquet-hadoop - ${parquet.version} - - - - org.apache.avro - avro-mapred - ${avro.version} - - - - - com.google.guava - guava - 15.0 - - - org.apache.hadoop hadoop-common ${hadoop.version} provided - - - jdk.tools - jdk.tools - - - javax.xml.bind - jaxb-api - - org.apache.hadoop @@ -559,12 +754,6 @@ hadoop-auth ${hadoop.version} provided - - - com.fasterxml.jackson.* - * - - org.apache.hadoop @@ -591,214 +780,36 @@ - org.rocksdb - rocksdbjni - 5.17.2 + org.apache.hadoop + hadoop-hdfs + tests + ${hadoop.version} - commons-codec - commons-codec - 1.4 - - - commons-lang - commons-lang - 2.6 - - - commons-logging - commons-logging - 1.2 - - - commons-io - commons-io - 2.6 - - - - - com.twitter - parquet-hadoop-bundle - 1.6.0 - - - com.twitter - parquet-hive-bundle - 1.6.0 - - - com.twitter - parquet-avro - 1.6.0 - - - - org.apache.parquet - parquet-hive-bundle - ${parquet.version} - - - - org.apache.spark - spark-core_2.11 - ${spark.version} - provided + org.apache.hadoop + hadoop-common + tests + ${hadoop.version} - com.fasterxml.jackson.** - * + jdk.tools + jdk.tools - javax.servlet - servlet-api - - - - - org.apache.spark - spark-sql_2.11 - ${spark.version} - provided - - - com.fasterxml.jackson.** - * + javax.xml.bind + jaxb-api + org.apache.hbase hbase-client ${hbase.version} - - org.apache.avro - avro - ${avro.version} - - - org.slf4j - slf4j-api - - - - - - - io.dropwizard.metrics - metrics-graphite - ${metrics.version} - - - io.dropwizard.metrics - metrics-core - ${metrics.version} - - - - xerces - xercesImpl - 2.9.1 - - - xalan - xalan - 2.7.1 - - - - commons-dbcp - commons-dbcp - 1.4 - - - commons-pool - commons-pool - 1.4 - - - org.apache.httpcomponents - fluent-hc - 4.3.2 - - - org.apache.httpcomponents - httpcore - 4.3.2 - - - org.apache.httpcomponents - httpclient - 4.3.6 - - - org.slf4j - slf4j-api - 1.7.5 - - - org.slf4j - slf4j-log4j12 - 1.7.5 - - - - org.apache.commons - commons-configuration2 - 2.1.1 - - - - com.fasterxml.jackson.core - jackson-annotations - ${fasterxml.version} - - - - com.fasterxml.jackson.core - jackson-core - ${fasterxml.version} - - - - com.fasterxml.jackson.core - jackson-databind - ${fasterxml.version}.1 - - - - 
com.fasterxml.jackson.module - jackson-module-scala_2.11 - ${fasterxml.version} - - - - org.codehaus.jackson - jackson-core-asl - ${codehaus-jackson.version} - - - - org.codehaus.jackson - jackson-mapper-asl - ${codehaus-jackson.version} - - - - org.codehaus.jackson - jackson-jaxrs - ${codehaus-jackson.version} - - - - org.codehaus.jackson - jackson-xc - ${codehaus-jackson.version} - - + ${hive.groupid} hive-service @@ -919,34 +930,28 @@ + - org.apache.hadoop - hadoop-hdfs - tests - ${hadoop.version} + com.google.code.gson + gson + 2.3.1 + test + - org.apache.hadoop - hadoop-common - tests - ${hadoop.version} - - - jdk.tools - jdk.tools - - - javax.xml.bind - jaxb-api - - + junit + junit + ${junit.version} + test - + + org.mockito mockito-all test - 1.10.19 + ${mockito.version} + com.esotericsoftware @@ -954,24 +959,29 @@ 4.0.0 test - - - org.eclipse.jetty.aggregate - jetty-all - test - 7.6.0.v20120127 - - - Maven repository - https://central.maven.org/maven2/ + Maven Central + Maven Repository + https://repo.maven.apache.org/maven2 + + true + + + false + cloudera-repo-releases https://repository.cloudera.com/artifactory/public/ + + true + + + false + diff --git a/tools/run_travis_tests.sh b/tools/run_travis_tests.sh new file mode 100755 index 000000000..51a34e4b9 --- /dev/null +++ b/tools/run_travis_tests.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +mode=$1 + +if [ "$mode" = "unit" ]; +then + echo "Running Unit Tests" + mvn test -DskipITs=true -B +elif [ "$mode" = "integration" ]; +then + echo "Running Integration Tests" + mvn verify -DskipUTs=true -B +else + echo "Unknown mode $mode" + exit 1; +fi +
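
Notes for reviewers on the pieces above, with small illustrative sketches (Java).

1. Test helper consolidation. The ITTestHoodieSanity rewrite replaces the repeated getHiveConsoleCommand / executeCommandInDocker / LOG boilerplate with a single executeHiveCommand helper that returns stdout and stderr as a Pair. The real helper lives in ITTestBase and is not part of the hunks shown here; the sketch below is only the shape of the pattern, and the beeline wiring plus the exact Pair class are assumptions rather than the committed code:

    // Sketch (not the committed ITTestBase code): run a Hive statement inside the
    // hiveserver container and surface (stdout, stderr) for assertions.
    // Pair is whichever pair utility ITTestBase imports (commons-lang3 shown here).
    protected Pair<String, String> executeHiveCommand(String hiveCommand) throws Exception {
      // Assumption: the helper shells out to beeline with the shared JDBC url;
      // the real command construction may differ.
      String consoleCmd = "beeline -u " + HIVE_SERVER_JDBC_URL + " -e \"" + hiveCommand + "\"";
      TestExecStartResultCallback callback =
          executeCommandStringInDocker(HIVESERVER, consoleCmd, true);
      String stdout = callback.getStdout().toString().trim();
      String stderr = callback.getStderr().toString().trim();
      LOG.info("Got output for (" + consoleCmd + ") : (" + stdout + ")");
      LOG.info("Got error output for (" + consoleCmd + ") : (" + stderr + ")");
      return Pair.of(stdout, stderr);
    }

Each call site then collapses to one line plus an assertion on stdOutErr.getLeft(), which is what removes roughly ten lines per query in the test.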
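2. On the interoperability check: once the .hoodie directory is deleted, Hive treats the path as a plain parquet table and scans every file version instead of only the latest file slice. The dataset at that point presumably carries two versions of each file (the Java app both inserts and updates the 100 records), which is why the same count query returns 200, the deliberately wrong count the inline comment calls out.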
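3. Flag rename in HoodieDeltaStreamer: --hudi-conf becomes --hoodie-conf, so existing invocations using the old spelling will fail JCommander parsing. For reference, the binding after this patch reads as follows (the generic type on the field is restored here; the description string is as in the hunk):

    @Parameter(names = {"--hoodie-conf"}, description = "Any configuration that can be set in the properties file "
        + "(using the CLI parameter \"--propsFilePath\") can also be passed command line using this parameter")
    public List<String> configs = new ArrayList<>();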
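4. Travis split: tools/run_travis_tests.sh keys off the new skipUTs / skipITs properties in the root pom (both default to ${skipTests}, and surefire is configured with ${skipUTs}). Invoking the script with "unit" runs mvn test -DskipITs=true -B, so only unit tests execute; "integration" runs mvn verify -DskipUTs=true -B, so only the docker-based integration tests execute; a plain -DskipTests still disables both at once.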