[HUDI-238] Make Hudi support Scala 2.12 (#1226)
* [HUDI-238] Rename scala related artifactId & add maven profile to support Scala 2.12
This commit is contained in:
committed by
Balaji Varadarajan
parent
923e2b4a1e
commit
292c1e2ff4
2
LICENSE
2
LICENSE
@@ -245,6 +245,8 @@ This product includes code from Apache Spark
|
|||||||
|
|
||||||
* org.apache.hudi.AvroConversionHelper copied from classes in org/apache/spark/sql/avro package
|
* org.apache.hudi.AvroConversionHelper copied from classes in org/apache/spark/sql/avro package
|
||||||
|
|
||||||
|
* dev/change-scala-version.sh copied from https://github.com/apache/spark/blob/branch-2.4/dev/change-scala-version.sh
|
||||||
|
|
||||||
Copyright: 2014 and onwards The Apache Software Foundation
|
Copyright: 2014 and onwards The Apache Software Foundation
|
||||||
Home page: http://spark.apache.org/
|
Home page: http://spark.apache.org/
|
||||||
License: http://www.apache.org/licenses/LICENSE-2.0
|
License: http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|||||||
66
dev/change-scala-version.sh
Executable file
66
dev/change-scala-version.sh
Executable file
@@ -0,0 +1,66 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
#
|
||||||
|
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
# contributor license agreements. See the NOTICE file distributed with
|
||||||
|
# this work for additional information regarding copyright ownership.
|
||||||
|
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
# (the "License"); you may not use this file except in compliance with
|
||||||
|
# the License. You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
VALID_VERSIONS=( 2.11 2.12 )
|
||||||
|
|
||||||
|
# Print the command-line help text and terminate the script.
# Globals:   VALID_VERSIONS (read) - versions listed in the help text.
# Outputs:   the usage message, written to stderr.
# Returns:   never returns; always exits with status 1.
usage() {
  # "$0" is quoted so that a script path containing spaces or glob
  # characters reaches basename as one argument.
  echo "Usage: $(basename "$0") [-h|--help] <version>
where :
-h| --help Display this help text
valid version values : ${VALID_VERSIONS[*]}
" 1>&2
  exit 1
}
|
||||||
|
|
||||||
|
# Exactly one argument is accepted. Any other argument count, or an explicit
# request for help, prints the usage text (usage terminates the script).
if (( $# != 1 )); then
  usage
fi
case "$1" in
  --help|-h) usage ;;
esac

# The single remaining argument is the Scala version to switch to.
TO_VERSION=$1
|
||||||
|
|
||||||
|
# Verify that the requested Scala version is one of the supported ones.
# Globals:   VALID_VERSIONS (read) - array of accepted version strings.
# Arguments: $1 - the version string to validate.
# Outputs:   an error message on stderr when the version is not accepted.
# Returns:   0 when $1 is listed in VALID_VERSIONS; otherwise exits the
#            script with status 1.
check_scala_version() {
  local candidate
  # Iterate over the array element by element ("${arr[@]}") and compare
  # with [[ ]] so neither side is word-split or glob-expanded.
  for candidate in "${VALID_VERSIONS[@]}"; do
    if [[ "$candidate" == "$1" ]]; then
      return 0
    fi
  done
  echo "Invalid Scala version: $1. Valid versions: ${VALID_VERSIONS[*]}" 1>&2
  exit 1
}
|
||||||
|
|
||||||
|
check_scala_version "$TO_VERSION"
|
||||||
|
|
||||||
|
# Only two versions are handled, so the version being replaced is simply the
# one that was not requested. [[ ]] with a quoted operand avoids the
# "unary operator expected" failure that an empty TO_VERSION causes in [ ].
if [[ "$TO_VERSION" == "2.11" ]]; then
  FROM_VERSION="2.12"
else
  FROM_VERSION="2.11"
fi
|
||||||
|
|
||||||
|
# Edit a file "in place" without relying on the non-portable `sed -i` flag:
# the result is written to a sibling "<file>.tmp" and then moved over the
# original.
# Arguments: $1 - sed expression to apply.
#            $2 - path of the file to rewrite.
# Returns:   0 on success; otherwise the failing command's exit status.
#            When sed fails the original file is left untouched and the
#            temporary file is removed instead of being left behind.
sed_i() {
  local expr=$1
  local file=$2
  if sed -e "$expr" -- "$file" > "$file.tmp"; then
    mv -- "$file.tmp" "$file"
  else
    local rc=$?
    rm -f -- "$file.tmp"
    return "$rc"
  fi
}
|
||||||
|
|
||||||
|
export -f sed_i
|
||||||
|
|
||||||
|
# Parent of the directory that holds this script. "$0" is quoted so that
# checkouts living under a path with spaces still resolve correctly.
BASEDIR="$(dirname "$0")/.."

# Rewrite the "_<scala version>" suffix on artifactId lines in every pom.xml
# outside of build output ("target") directories. sed_i is a shell function,
# so it is invoked through a child bash (it has to be exported with
# `export -f`). The sed expression and the file name are handed over as
# positional parameters rather than being spliced into the command string,
# so a path containing spaces or shell metacharacters is never re-parsed.
find "$BASEDIR" -name 'pom.xml' -not -path '*target*' -print \
  -exec bash -c 'sed_i "$1" "$2"' _ \
    "s/\(artifactId.*\)_${FROM_VERSION}/\1_${TO_VERSION}/g" {} \;
|
||||||
|
|
||||||
|
# The parent POM additionally carries a <scala.binary.version> property that
# has to follow the selected version. The pattern accepts any "N.N" value,
# not just the previous one, so running the script again is harmless. The
# "1,/re/" address restricts the substitution to the lines from the top of
# the file through the first line matching the pattern.
scala_prop='<scala\.binary\.version>[0-9]*\.[0-9]*<'
sed_i "1,/${scala_prop}/s/${scala_prop}/<scala.binary.version>${TO_VERSION}</" \
  "$BASEDIR/pom.xml"
|
||||||
@@ -26,5 +26,5 @@ hoodie.deltastreamer.schemaprovider.target.schema.file=/var/demo/config/schema.a
|
|||||||
# Kafka Source
|
# Kafka Source
|
||||||
hoodie.deltastreamer.source.kafka.topic=stock_ticks
|
hoodie.deltastreamer.source.kafka.topic=stock_ticks
|
||||||
#Kafka props
|
#Kafka props
|
||||||
metadata.broker.list=kafkabroker:9092
|
bootstrap.servers=kafkabroker:9092
|
||||||
auto.offset.reset=smallest
|
auto.offset.reset=earliest
|
||||||
|
|||||||
@@ -57,9 +57,9 @@
|
|||||||
<tasks>
|
<tasks>
|
||||||
<copy file="${project.basedir}/../../../../packaging/hudi-hadoop-mr-bundle/target/hudi-hadoop-mr-bundle-${project.version}.jar" tofile="target/hoodie-hadoop-mr-bundle.jar" />
|
<copy file="${project.basedir}/../../../../packaging/hudi-hadoop-mr-bundle/target/hudi-hadoop-mr-bundle-${project.version}.jar" tofile="target/hoodie-hadoop-mr-bundle.jar" />
|
||||||
<copy file="${project.basedir}/../../../../packaging/hudi-hive-bundle/target/hudi-hive-bundle-${project.version}.jar" tofile="target/hoodie-hive-bundle.jar" />
|
<copy file="${project.basedir}/../../../../packaging/hudi-hive-bundle/target/hudi-hive-bundle-${project.version}.jar" tofile="target/hoodie-hive-bundle.jar" />
|
||||||
<copy file="${project.basedir}/../../../../packaging/hudi-spark-bundle/target/hudi-spark-bundle-${project.version}.jar" tofile="target/hoodie-spark-bundle.jar" />
|
<copy file="${project.basedir}/../../../../packaging/hudi-spark-bundle/target/hudi-spark-bundle_${scala.binary.version}-${project.version}.jar" tofile="target/hoodie-spark-bundle.jar" />
|
||||||
<copy
|
<copy
|
||||||
file="${project.basedir}/../../../../packaging/hudi-utilities-bundle/target/hudi-utilities-bundle-${project.version}.jar"
|
file="${project.basedir}/../../../../packaging/hudi-utilities-bundle/target/hudi-utilities-bundle_${scala.binary.version}-${project.version}.jar"
|
||||||
tofile="target/hoodie-utilities.jar"/>
|
tofile="target/hoodie-utilities.jar"/>
|
||||||
</tasks>
|
</tasks>
|
||||||
</configuration>
|
</configuration>
|
||||||
|
|||||||
@@ -42,7 +42,7 @@
|
|||||||
<dependencies>
|
<dependencies>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.hudi</groupId>
|
<groupId>org.apache.hudi</groupId>
|
||||||
<artifactId>hudi-spark-bundle</artifactId>
|
<artifactId>hudi-spark-bundle_${scala.binary.version}</artifactId>
|
||||||
<version>${project.version}</version>
|
<version>${project.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|||||||
@@ -147,7 +147,7 @@
|
|||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.hudi</groupId>
|
<groupId>org.apache.hudi</groupId>
|
||||||
<artifactId>hudi-utilities</artifactId>
|
<artifactId>hudi-utilities_${scala.binary.version}</artifactId>
|
||||||
<version>${project.version}</version>
|
<version>${project.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
@@ -160,16 +160,22 @@
|
|||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.parquet</groupId>
|
<groupId>org.apache.parquet</groupId>
|
||||||
<artifactId>parquet-avro</artifactId>
|
<artifactId>parquet-avro</artifactId>
|
||||||
|
<exclusions>
|
||||||
|
<exclusion>
|
||||||
|
<groupId>com.thoughtworks.paranamer</groupId>
|
||||||
|
<artifactId>paranamer</artifactId>
|
||||||
|
</exclusion>
|
||||||
|
</exclusions>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<!-- Spark -->
|
<!-- Spark -->
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-core_2.11</artifactId>
|
<artifactId>spark-core_${scala.binary.version}</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-sql_2.11</artifactId>
|
<artifactId>spark-sql_${scala.binary.version}</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
|
|||||||
@@ -95,11 +95,11 @@
|
|||||||
<!-- Spark -->
|
<!-- Spark -->
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-core_2.11</artifactId>
|
<artifactId>spark-core_${scala.binary.version}</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-sql_2.11</artifactId>
|
<artifactId>spark-sql_${scala.binary.version}</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<!-- Dropwizard Metrics -->
|
<!-- Dropwizard Metrics -->
|
||||||
|
|||||||
@@ -54,7 +54,7 @@
|
|||||||
<!-- Hoodie -->
|
<!-- Hoodie -->
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.hudi</groupId>
|
<groupId>org.apache.hudi</groupId>
|
||||||
<artifactId>hudi-spark</artifactId>
|
<artifactId>hudi-spark_${scala.binary.version}</artifactId>
|
||||||
<version>${project.version}</version>
|
<version>${project.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
@@ -83,7 +83,7 @@
|
|||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.hudi</groupId>
|
<groupId>org.apache.hudi</groupId>
|
||||||
<artifactId>hudi-spark</artifactId>
|
<artifactId>hudi-spark_${scala.binary.version}</artifactId>
|
||||||
<version>${project.version}</version>
|
<version>${project.version}</version>
|
||||||
<classifier>tests</classifier>
|
<classifier>tests</classifier>
|
||||||
<type>test-jar</type>
|
<type>test-jar</type>
|
||||||
|
|||||||
@@ -23,7 +23,7 @@
|
|||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
<artifactId>hudi-spark</artifactId>
|
<artifactId>hudi-spark_2.11</artifactId>
|
||||||
<packaging>jar</packaging>
|
<packaging>jar</packaging>
|
||||||
|
|
||||||
<properties>
|
<properties>
|
||||||
@@ -190,13 +190,20 @@
|
|||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.fasterxml.jackson.module</groupId>
|
<groupId>com.fasterxml.jackson.module</groupId>
|
||||||
<artifactId>jackson-module-scala_2.11</artifactId>
|
<artifactId>jackson-module-scala_${scala.binary.version}</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<!-- Avro -->
|
<!-- Avro -->
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.avro</groupId>
|
<groupId>org.apache.avro</groupId>
|
||||||
<artifactId>avro</artifactId>
|
<artifactId>avro</artifactId>
|
||||||
|
<exclusions>
|
||||||
|
<exclusion>
|
||||||
|
<!-- this version conflicts with spark-core_2.12 -->
|
||||||
|
<groupId>com.thoughtworks.paranamer</groupId>
|
||||||
|
<artifactId>paranamer</artifactId>
|
||||||
|
</exclusion>
|
||||||
|
</exclusions>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<!-- Parquet -->
|
<!-- Parquet -->
|
||||||
@@ -208,17 +215,17 @@
|
|||||||
<!-- Spark -->
|
<!-- Spark -->
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-core_2.11</artifactId>
|
<artifactId>spark-core_${scala.binary.version}</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-sql_2.11</artifactId>
|
<artifactId>spark-sql_${scala.binary.version}</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<!-- Spark (Packages) -->
|
<!-- Spark (Packages) -->
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-avro_2.11</artifactId>
|
<artifactId>spark-avro_${scala.binary.version}</artifactId>
|
||||||
<scope>provided</scope>
|
<scope>provided</scope>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
@@ -293,7 +300,7 @@
|
|||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.scalatest</groupId>
|
<groupId>org.scalatest</groupId>
|
||||||
<artifactId>scalatest_2.11</artifactId>
|
<artifactId>scalatest_${scala.binary.version}</artifactId>
|
||||||
<version>${scalatest.version}</version>
|
<version>${scalatest.version}</version>
|
||||||
<scope>test</scope>
|
<scope>test</scope>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|||||||
@@ -25,7 +25,7 @@ function error_exit {
|
|||||||
|
|
||||||
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
|
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
|
||||||
#Ensure we pick the right jar even for hive11 builds
|
#Ensure we pick the right jar even for hive11 builds
|
||||||
HUDI_JAR=`ls -c $DIR/../packaging/hudi-spark-bundle/target/hudi-spark-bundle-*.jar | grep -v source | head -1`
|
HUDI_JAR=`ls -c $DIR/../packaging/hudi-spark-bundle/target/hudi-spark-bundle*.jar | grep -v source | head -1`
|
||||||
|
|
||||||
if [ -z "$HADOOP_CONF_DIR" ]; then
|
if [ -z "$HADOOP_CONF_DIR" ]; then
|
||||||
echo "setting hadoop conf dir"
|
echo "setting hadoop conf dir"
|
||||||
|
|||||||
@@ -23,7 +23,7 @@
|
|||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
<artifactId>hudi-utilities</artifactId>
|
<artifactId>hudi-utilities_2.11</artifactId>
|
||||||
<packaging>jar</packaging>
|
<packaging>jar</packaging>
|
||||||
|
|
||||||
<properties>
|
<properties>
|
||||||
@@ -109,7 +109,7 @@
|
|||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.hudi</groupId>
|
<groupId>org.apache.hudi</groupId>
|
||||||
<artifactId>hudi-spark</artifactId>
|
<artifactId>hudi-spark_${scala.binary.version}</artifactId>
|
||||||
<version>${project.version}</version>
|
<version>${project.version}</version>
|
||||||
<exclusions>
|
<exclusions>
|
||||||
<exclusion>
|
<exclusion>
|
||||||
@@ -128,19 +128,25 @@
|
|||||||
<!-- Fasterxml -->
|
<!-- Fasterxml -->
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.fasterxml.jackson.module</groupId>
|
<groupId>com.fasterxml.jackson.module</groupId>
|
||||||
<artifactId>jackson-module-scala_2.11</artifactId>
|
<artifactId>jackson-module-scala_${scala.binary.version}</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<!-- Parquet -->
|
<!-- Parquet -->
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.parquet</groupId>
|
<groupId>org.apache.parquet</groupId>
|
||||||
<artifactId>parquet-avro</artifactId>
|
<artifactId>parquet-avro</artifactId>
|
||||||
|
<exclusions>
|
||||||
|
<exclusion>
|
||||||
|
<groupId>com.thoughtworks.paranamer</groupId>
|
||||||
|
<artifactId>paranamer</artifactId>
|
||||||
|
</exclusion>
|
||||||
|
</exclusions>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<!-- Spark -->
|
<!-- Spark -->
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-core_2.11</artifactId>
|
<artifactId>spark-core_${scala.binary.version}</artifactId>
|
||||||
<exclusions>
|
<exclusions>
|
||||||
<exclusion>
|
<exclusion>
|
||||||
<groupId>javax.servlet</groupId>
|
<groupId>javax.servlet</groupId>
|
||||||
@@ -151,7 +157,7 @@
|
|||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-sql_2.11</artifactId>
|
<artifactId>spark-sql_${scala.binary.version}</artifactId>
|
||||||
<exclusions>
|
<exclusions>
|
||||||
<exclusion>
|
<exclusion>
|
||||||
<groupId>javax.servlet</groupId>
|
<groupId>javax.servlet</groupId>
|
||||||
@@ -162,20 +168,26 @@
|
|||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-avro_2.11</artifactId>
|
<artifactId>spark-avro_${scala.binary.version}</artifactId>
|
||||||
<scope>provided</scope>
|
<scope>provided</scope>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-streaming_2.11</artifactId>
|
<artifactId>spark-streaming_${scala.binary.version}</artifactId>
|
||||||
<version>${spark.version}</version>
|
<version>${spark.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-streaming-kafka-0-8_2.11</artifactId>
|
<artifactId>spark-streaming-kafka-0-10_${scala.binary.version}</artifactId>
|
||||||
<version>${spark.version}</version>
|
<version>${spark.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.spark</groupId>
|
||||||
|
<artifactId>spark-streaming-kafka-0-10_${scala.binary.version}</artifactId>
|
||||||
|
<version>${spark.version}</version>
|
||||||
|
<classifier>tests</classifier>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<!-- Dropwizard Metrics -->
|
<!-- Dropwizard Metrics -->
|
||||||
<dependency>
|
<dependency>
|
||||||
@@ -197,8 +209,8 @@
|
|||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.twitter</groupId>
|
<groupId>com.twitter</groupId>
|
||||||
<artifactId>bijection-avro_2.11</artifactId>
|
<artifactId>bijection-avro_${scala.binary.version}</artifactId>
|
||||||
<version>0.9.2</version>
|
<version>0.9.3</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<!-- Kafka -->
|
<!-- Kafka -->
|
||||||
@@ -223,6 +235,13 @@
|
|||||||
<version>3.0.0</version>
|
<version>3.0.0</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.kafka</groupId>
|
||||||
|
<artifactId>kafka_${scala.binary.version}</artifactId>
|
||||||
|
<version>${kafka.version}</version>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<!-- Httpcomponents -->
|
<!-- Httpcomponents -->
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.httpcomponents</groupId>
|
<groupId>org.apache.httpcomponents</groupId>
|
||||||
|
|||||||
@@ -24,16 +24,17 @@ import org.apache.hudi.utilities.schema.SchemaProvider;
|
|||||||
import org.apache.hudi.utilities.sources.helpers.KafkaOffsetGen;
|
import org.apache.hudi.utilities.sources.helpers.KafkaOffsetGen;
|
||||||
import org.apache.hudi.utilities.sources.helpers.KafkaOffsetGen.CheckpointUtils;
|
import org.apache.hudi.utilities.sources.helpers.KafkaOffsetGen.CheckpointUtils;
|
||||||
|
|
||||||
import io.confluent.kafka.serializers.KafkaAvroDecoder;
|
import io.confluent.kafka.serializers.KafkaAvroDeserializer;
|
||||||
import kafka.serializer.StringDecoder;
|
|
||||||
import org.apache.avro.generic.GenericRecord;
|
import org.apache.avro.generic.GenericRecord;
|
||||||
|
import org.apache.kafka.common.serialization.StringDeserializer;
|
||||||
import org.apache.log4j.LogManager;
|
import org.apache.log4j.LogManager;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
import org.apache.spark.sql.SparkSession;
|
import org.apache.spark.sql.SparkSession;
|
||||||
import org.apache.spark.streaming.kafka.KafkaUtils;
|
import org.apache.spark.streaming.kafka010.KafkaUtils;
|
||||||
import org.apache.spark.streaming.kafka.OffsetRange;
|
import org.apache.spark.streaming.kafka010.LocationStrategies;
|
||||||
|
import org.apache.spark.streaming.kafka010.OffsetRange;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Reads avro serialized Kafka data, based on the confluent schema-registry.
|
* Reads avro serialized Kafka data, based on the confluent schema-registry.
|
||||||
@@ -47,6 +48,8 @@ public class AvroKafkaSource extends AvroSource {
|
|||||||
public AvroKafkaSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession,
|
public AvroKafkaSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession,
|
||||||
SchemaProvider schemaProvider) {
|
SchemaProvider schemaProvider) {
|
||||||
super(props, sparkContext, sparkSession, schemaProvider);
|
super(props, sparkContext, sparkSession, schemaProvider);
|
||||||
|
props.put("key.deserializer", StringDeserializer.class);
|
||||||
|
props.put("value.deserializer", KafkaAvroDeserializer.class);
|
||||||
offsetGen = new KafkaOffsetGen(props);
|
offsetGen = new KafkaOffsetGen(props);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -64,9 +67,7 @@ public class AvroKafkaSource extends AvroSource {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private JavaRDD<GenericRecord> toRDD(OffsetRange[] offsetRanges) {
|
private JavaRDD<GenericRecord> toRDD(OffsetRange[] offsetRanges) {
|
||||||
JavaRDD<GenericRecord> recordRDD =
|
return KafkaUtils.createRDD(sparkContext, offsetGen.getKafkaParams(), offsetRanges,
|
||||||
KafkaUtils.createRDD(sparkContext, String.class, Object.class, StringDecoder.class, KafkaAvroDecoder.class,
|
LocationStrategies.PreferConsistent()).map(obj -> (GenericRecord) obj.value());
|
||||||
offsetGen.getKafkaParams(), offsetRanges).values().map(obj -> (GenericRecord) obj);
|
|
||||||
return recordRDD;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -24,14 +24,15 @@ import org.apache.hudi.utilities.schema.SchemaProvider;
|
|||||||
import org.apache.hudi.utilities.sources.helpers.KafkaOffsetGen;
|
import org.apache.hudi.utilities.sources.helpers.KafkaOffsetGen;
|
||||||
import org.apache.hudi.utilities.sources.helpers.KafkaOffsetGen.CheckpointUtils;
|
import org.apache.hudi.utilities.sources.helpers.KafkaOffsetGen.CheckpointUtils;
|
||||||
|
|
||||||
import kafka.serializer.StringDecoder;
|
import org.apache.kafka.common.serialization.StringDeserializer;
|
||||||
import org.apache.log4j.LogManager;
|
import org.apache.log4j.LogManager;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
import org.apache.spark.sql.SparkSession;
|
import org.apache.spark.sql.SparkSession;
|
||||||
import org.apache.spark.streaming.kafka.KafkaUtils;
|
import org.apache.spark.streaming.kafka010.KafkaUtils;
|
||||||
import org.apache.spark.streaming.kafka.OffsetRange;
|
import org.apache.spark.streaming.kafka010.LocationStrategies;
|
||||||
|
import org.apache.spark.streaming.kafka010.OffsetRange;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Read json kafka data.
|
* Read json kafka data.
|
||||||
@@ -45,6 +46,8 @@ public class JsonKafkaSource extends JsonSource {
|
|||||||
public JsonKafkaSource(TypedProperties properties, JavaSparkContext sparkContext, SparkSession sparkSession,
|
public JsonKafkaSource(TypedProperties properties, JavaSparkContext sparkContext, SparkSession sparkSession,
|
||||||
SchemaProvider schemaProvider) {
|
SchemaProvider schemaProvider) {
|
||||||
super(properties, sparkContext, sparkSession, schemaProvider);
|
super(properties, sparkContext, sparkSession, schemaProvider);
|
||||||
|
properties.put("key.deserializer", StringDeserializer.class);
|
||||||
|
properties.put("value.deserializer", StringDeserializer.class);
|
||||||
offsetGen = new KafkaOffsetGen(properties);
|
offsetGen = new KafkaOffsetGen(properties);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -61,7 +64,7 @@ public class JsonKafkaSource extends JsonSource {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private JavaRDD<String> toRDD(OffsetRange[] offsetRanges) {
|
private JavaRDD<String> toRDD(OffsetRange[] offsetRanges) {
|
||||||
return KafkaUtils.createRDD(sparkContext, String.class, String.class, StringDecoder.class, StringDecoder.class,
|
return KafkaUtils.createRDD(sparkContext, offsetGen.getKafkaParams(), offsetRanges,
|
||||||
offsetGen.getKafkaParams(), offsetRanges).values();
|
LocationStrategies.PreferConsistent()).map(x -> (String) x.value());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -22,30 +22,24 @@ import org.apache.hudi.DataSourceUtils;
|
|||||||
import org.apache.hudi.common.util.Option;
|
import org.apache.hudi.common.util.Option;
|
||||||
import org.apache.hudi.common.util.TypedProperties;
|
import org.apache.hudi.common.util.TypedProperties;
|
||||||
import org.apache.hudi.exception.HoodieNotSupportedException;
|
import org.apache.hudi.exception.HoodieNotSupportedException;
|
||||||
import org.apache.hudi.utilities.exception.HoodieDeltaStreamerException;
|
|
||||||
|
|
||||||
import kafka.common.TopicAndPartition;
|
import org.apache.kafka.clients.consumer.KafkaConsumer;
|
||||||
|
import org.apache.kafka.common.PartitionInfo;
|
||||||
|
import org.apache.kafka.common.TopicPartition;
|
||||||
import org.apache.log4j.LogManager;
|
import org.apache.log4j.LogManager;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.apache.spark.streaming.kafka.KafkaCluster;
|
import org.apache.spark.streaming.kafka010.OffsetRange;
|
||||||
import org.apache.spark.streaming.kafka.KafkaCluster.LeaderOffset;
|
|
||||||
import org.apache.spark.streaming.kafka.OffsetRange;
|
|
||||||
|
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import scala.Predef;
|
|
||||||
import scala.collection.JavaConverters;
|
|
||||||
import scala.collection.immutable.Map;
|
|
||||||
import scala.collection.immutable.Set;
|
|
||||||
import scala.collection.mutable.ArrayBuffer;
|
|
||||||
import scala.collection.mutable.StringBuilder;
|
|
||||||
import scala.util.Either;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Source to read data from Kafka, incrementally.
|
* Source to read data from Kafka, incrementally.
|
||||||
*/
|
*/
|
||||||
@@ -58,8 +52,8 @@ public class KafkaOffsetGen {
|
|||||||
/**
|
/**
|
||||||
* Reconstruct checkpoint from string.
|
* Reconstruct checkpoint from string.
|
||||||
*/
|
*/
|
||||||
public static HashMap<TopicAndPartition, KafkaCluster.LeaderOffset> strToOffsets(String checkpointStr) {
|
public static HashMap<TopicPartition, Long> strToOffsets(String checkpointStr) {
|
||||||
HashMap<TopicAndPartition, KafkaCluster.LeaderOffset> offsetMap = new HashMap<>();
|
HashMap<TopicPartition, Long> offsetMap = new HashMap<>();
|
||||||
if (checkpointStr.length() == 0) {
|
if (checkpointStr.length() == 0) {
|
||||||
return offsetMap;
|
return offsetMap;
|
||||||
}
|
}
|
||||||
@@ -67,8 +61,7 @@ public class KafkaOffsetGen {
|
|||||||
String topic = splits[0];
|
String topic = splits[0];
|
||||||
for (int i = 1; i < splits.length; i++) {
|
for (int i = 1; i < splits.length; i++) {
|
||||||
String[] subSplits = splits[i].split(":");
|
String[] subSplits = splits[i].split(":");
|
||||||
offsetMap.put(new TopicAndPartition(topic, Integer.parseInt(subSplits[0])),
|
offsetMap.put(new TopicPartition(topic, Integer.parseInt(subSplits[0])), Long.parseLong(subSplits[1]));
|
||||||
new KafkaCluster.LeaderOffset("", -1, Long.parseLong(subSplits[1])));
|
|
||||||
}
|
}
|
||||||
return offsetMap;
|
return offsetMap;
|
||||||
}
|
}
|
||||||
@@ -83,7 +76,7 @@ public class KafkaOffsetGen {
|
|||||||
// at least 1 partition will be present.
|
// at least 1 partition will be present.
|
||||||
sb.append(ranges[0].topic() + ",");
|
sb.append(ranges[0].topic() + ",");
|
||||||
sb.append(Arrays.stream(ranges).map(r -> String.format("%s:%d", r.partition(), r.untilOffset()))
|
sb.append(Arrays.stream(ranges).map(r -> String.format("%s:%d", r.partition(), r.untilOffset()))
|
||||||
.collect(Collectors.joining(",")));
|
.collect(Collectors.joining(",")));
|
||||||
return sb.toString();
|
return sb.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -94,32 +87,32 @@ public class KafkaOffsetGen {
|
|||||||
* @param toOffsetMap offsets of where each partitions is currently at
|
* @param toOffsetMap offsets of where each partitions is currently at
|
||||||
* @param numEvents maximum number of events to read.
|
* @param numEvents maximum number of events to read.
|
||||||
*/
|
*/
|
||||||
public static OffsetRange[] computeOffsetRanges(HashMap<TopicAndPartition, LeaderOffset> fromOffsetMap,
|
public static OffsetRange[] computeOffsetRanges(Map<TopicPartition, Long> fromOffsetMap,
|
||||||
HashMap<TopicAndPartition, LeaderOffset> toOffsetMap, long numEvents) {
|
Map<TopicPartition, Long> toOffsetMap, long numEvents) {
|
||||||
|
|
||||||
Comparator<OffsetRange> byPartition = Comparator.comparing(OffsetRange::partition);
|
Comparator<OffsetRange> byPartition = Comparator.comparing(OffsetRange::partition);
|
||||||
|
|
||||||
// Create initial offset ranges for each 'to' partition, with from = to offsets.
|
// Create initial offset ranges for each 'to' partition, with from = to offsets.
|
||||||
OffsetRange[] ranges = new OffsetRange[toOffsetMap.size()];
|
OffsetRange[] ranges = new OffsetRange[toOffsetMap.size()];
|
||||||
toOffsetMap.entrySet().stream().map(e -> {
|
toOffsetMap.entrySet().stream().map(e -> {
|
||||||
TopicAndPartition tp = e.getKey();
|
TopicPartition tp = e.getKey();
|
||||||
long fromOffset = fromOffsetMap.getOrDefault(tp, new LeaderOffset("", -1, 0)).offset();
|
long fromOffset = fromOffsetMap.getOrDefault(tp, 0L);
|
||||||
return OffsetRange.create(tp, fromOffset, fromOffset);
|
return OffsetRange.create(tp, fromOffset, fromOffset);
|
||||||
}).sorted(byPartition).collect(Collectors.toList()).toArray(ranges);
|
}).sorted(byPartition).collect(Collectors.toList()).toArray(ranges);
|
||||||
|
|
||||||
long allocedEvents = 0;
|
long allocedEvents = 0;
|
||||||
java.util.Set<Integer> exhaustedPartitions = new HashSet<>();
|
Set<Integer> exhaustedPartitions = new HashSet<>();
|
||||||
// keep going until we have events to allocate and partitions still not exhausted.
|
// keep going until we have events to allocate and partitions still not exhausted.
|
||||||
while (allocedEvents < numEvents && exhaustedPartitions.size() < toOffsetMap.size()) {
|
while (allocedEvents < numEvents && exhaustedPartitions.size() < toOffsetMap.size()) {
|
||||||
long remainingEvents = numEvents - allocedEvents;
|
long remainingEvents = numEvents - allocedEvents;
|
||||||
long eventsPerPartition =
|
long eventsPerPartition =
|
||||||
(long) Math.ceil((1.0 * remainingEvents) / (toOffsetMap.size() - exhaustedPartitions.size()));
|
(long) Math.ceil((1.0 * remainingEvents) / (toOffsetMap.size() - exhaustedPartitions.size()));
|
||||||
|
|
||||||
// Allocate the remaining events to non-exhausted partitions, in round robin fashion
|
// Allocate the remaining events to non-exhausted partitions, in round robin fashion
|
||||||
for (int i = 0; i < ranges.length; i++) {
|
for (int i = 0; i < ranges.length; i++) {
|
||||||
OffsetRange range = ranges[i];
|
OffsetRange range = ranges[i];
|
||||||
if (!exhaustedPartitions.contains(range.partition())) {
|
if (!exhaustedPartitions.contains(range.partition())) {
|
||||||
long toOffsetMax = toOffsetMap.get(range.topicAndPartition()).offset();
|
long toOffsetMax = toOffsetMap.get(range.topicPartition());
|
||||||
long toOffset = Math.min(toOffsetMax, range.untilOffset() + eventsPerPartition);
|
long toOffset = Math.min(toOffsetMax, range.untilOffset() + eventsPerPartition);
|
||||||
if (toOffset == toOffsetMax) {
|
if (toOffset == toOffsetMax) {
|
||||||
exhaustedPartitions.add(range.partition());
|
exhaustedPartitions.add(range.partition());
|
||||||
@@ -130,7 +123,7 @@ public class KafkaOffsetGen {
|
|||||||
long offsetsToAdd = Math.min(eventsPerPartition, (numEvents - allocedEvents));
|
long offsetsToAdd = Math.min(eventsPerPartition, (numEvents - allocedEvents));
|
||||||
toOffset = Math.min(toOffsetMax, toOffset + offsetsToAdd);
|
toOffset = Math.min(toOffsetMax, toOffset + offsetsToAdd);
|
||||||
}
|
}
|
||||||
ranges[i] = OffsetRange.create(range.topicAndPartition(), range.fromOffset(), toOffset);
|
ranges[i] = OffsetRange.create(range.topicPartition(), range.fromOffset(), toOffset);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -143,29 +136,11 @@ public class KafkaOffsetGen {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Helpers to deal with tricky scala <=> java conversions. (oh my!)
|
|
||||||
*/
|
|
||||||
static class ScalaHelpers {
|
|
||||||
|
|
||||||
public static <K, V> Map<K, V> toScalaMap(HashMap<K, V> m) {
|
|
||||||
return JavaConverters.mapAsScalaMapConverter(m).asScala().toMap(Predef.conforms());
|
|
||||||
}
|
|
||||||
|
|
||||||
public static Set<String> toScalaSet(HashSet<String> s) {
|
|
||||||
return JavaConverters.asScalaSetConverter(s).asScala().toSet();
|
|
||||||
}
|
|
||||||
|
|
||||||
public static <K, V> java.util.Map<K, V> toJavaMap(Map<K, V> m) {
|
|
||||||
return JavaConverters.mapAsJavaMapConverter(m).asJava();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Kafka reset offset strategies.
|
* Kafka reset offset strategies.
|
||||||
*/
|
*/
|
||||||
enum KafkaResetOffsetStrategies {
|
enum KafkaResetOffsetStrategies {
|
||||||
LARGEST, SMALLEST
|
LATEST, EARLIEST
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -175,20 +150,20 @@ public class KafkaOffsetGen {
|
|||||||
|
|
||||||
private static final String KAFKA_TOPIC_NAME = "hoodie.deltastreamer.source.kafka.topic";
|
private static final String KAFKA_TOPIC_NAME = "hoodie.deltastreamer.source.kafka.topic";
|
||||||
private static final String MAX_EVENTS_FROM_KAFKA_SOURCE_PROP = "hoodie.deltastreamer.kafka.source.maxEvents";
|
private static final String MAX_EVENTS_FROM_KAFKA_SOURCE_PROP = "hoodie.deltastreamer.kafka.source.maxEvents";
|
||||||
private static final KafkaResetOffsetStrategies DEFAULT_AUTO_RESET_OFFSET = KafkaResetOffsetStrategies.LARGEST;
|
private static final KafkaResetOffsetStrategies DEFAULT_AUTO_RESET_OFFSET = KafkaResetOffsetStrategies.LATEST;
|
||||||
public static final long DEFAULT_MAX_EVENTS_FROM_KAFKA_SOURCE = 5000000;
|
public static final long DEFAULT_MAX_EVENTS_FROM_KAFKA_SOURCE = 5000000;
|
||||||
public static long maxEventsFromKafkaSource = DEFAULT_MAX_EVENTS_FROM_KAFKA_SOURCE;
|
public static long maxEventsFromKafkaSource = DEFAULT_MAX_EVENTS_FROM_KAFKA_SOURCE;
|
||||||
}
|
}
|
||||||
|
|
||||||
private final HashMap<String, String> kafkaParams;
|
private final HashMap<String, Object> kafkaParams;
|
||||||
private final TypedProperties props;
|
private final TypedProperties props;
|
||||||
protected final String topicName;
|
protected final String topicName;
|
||||||
|
|
||||||
public KafkaOffsetGen(TypedProperties props) {
|
public KafkaOffsetGen(TypedProperties props) {
|
||||||
this.props = props;
|
this.props = props;
|
||||||
kafkaParams = new HashMap<String, String>();
|
kafkaParams = new HashMap<>();
|
||||||
for (Object prop : props.keySet()) {
|
for (Object prop : props.keySet()) {
|
||||||
kafkaParams.put(prop.toString(), props.getString(prop.toString()));
|
kafkaParams.put(prop.toString(), props.get(prop.toString()));
|
||||||
}
|
}
|
||||||
DataSourceUtils.checkRequiredProperties(props, Collections.singletonList(Config.KAFKA_TOPIC_NAME));
|
DataSourceUtils.checkRequiredProperties(props, Collections.singletonList(Config.KAFKA_TOPIC_NAME));
|
||||||
topicName = props.getString(Config.KAFKA_TOPIC_NAME);
|
topicName = props.getString(Config.KAFKA_TOPIC_NAME);
|
||||||
@@ -197,31 +172,25 @@ public class KafkaOffsetGen {
|
|||||||
public OffsetRange[] getNextOffsetRanges(Option<String> lastCheckpointStr, long sourceLimit) {
|
public OffsetRange[] getNextOffsetRanges(Option<String> lastCheckpointStr, long sourceLimit) {
|
||||||
|
|
||||||
// Obtain current metadata for the topic
|
// Obtain current metadata for the topic
|
||||||
KafkaCluster cluster = new KafkaCluster(ScalaHelpers.toScalaMap(kafkaParams));
|
KafkaConsumer consumer = new KafkaConsumer(kafkaParams);
|
||||||
Either<ArrayBuffer<Throwable>, Set<TopicAndPartition>> either =
|
List<PartitionInfo> partitionInfoList;
|
||||||
cluster.getPartitions(ScalaHelpers.toScalaSet(new HashSet<>(Collections.singletonList(topicName))));
|
partitionInfoList = consumer.partitionsFor(topicName);
|
||||||
if (either.isLeft()) {
|
Set<TopicPartition> topicPartitions = partitionInfoList.stream()
|
||||||
// log errors. and bail out.
|
.map(x -> new TopicPartition(x.topic(), x.partition())).collect(Collectors.toSet());
|
||||||
throw new HoodieDeltaStreamerException("Error obtaining partition metadata", either.left().get().head());
|
|
||||||
}
|
|
||||||
Set<TopicAndPartition> topicPartitions = either.right().get();
|
|
||||||
|
|
||||||
// Determine the offset ranges to read from
|
// Determine the offset ranges to read from
|
||||||
HashMap<TopicAndPartition, KafkaCluster.LeaderOffset> fromOffsets;
|
Map<TopicPartition, Long> fromOffsets;
|
||||||
HashMap<TopicAndPartition, KafkaCluster.LeaderOffset> checkpointOffsets;
|
|
||||||
if (lastCheckpointStr.isPresent()) {
|
if (lastCheckpointStr.isPresent()) {
|
||||||
fromOffsets = checkupValidOffsets(cluster, lastCheckpointStr, topicPartitions);
|
fromOffsets = checkupValidOffsets(consumer, lastCheckpointStr, topicPartitions);
|
||||||
} else {
|
} else {
|
||||||
KafkaResetOffsetStrategies autoResetValue = KafkaResetOffsetStrategies
|
KafkaResetOffsetStrategies autoResetValue = KafkaResetOffsetStrategies
|
||||||
.valueOf(props.getString("auto.offset.reset", Config.DEFAULT_AUTO_RESET_OFFSET.toString()).toUpperCase());
|
.valueOf(props.getString("auto.offset.reset", Config.DEFAULT_AUTO_RESET_OFFSET.toString()).toUpperCase());
|
||||||
switch (autoResetValue) {
|
switch (autoResetValue) {
|
||||||
case SMALLEST:
|
case EARLIEST:
|
||||||
fromOffsets =
|
fromOffsets = consumer.beginningOffsets(topicPartitions);
|
||||||
new HashMap(ScalaHelpers.toJavaMap(cluster.getEarliestLeaderOffsets(topicPartitions).right().get()));
|
|
||||||
break;
|
break;
|
||||||
case LARGEST:
|
case LATEST:
|
||||||
fromOffsets =
|
fromOffsets = consumer.endOffsets(topicPartitions);
|
||||||
new HashMap(ScalaHelpers.toJavaMap(cluster.getLatestLeaderOffsets(topicPartitions).right().get()));
|
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
throw new HoodieNotSupportedException("Auto reset value must be one of 'smallest' or 'largest' ");
|
throw new HoodieNotSupportedException("Auto reset value must be one of 'smallest' or 'largest' ");
|
||||||
@@ -229,8 +198,7 @@ public class KafkaOffsetGen {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Obtain the latest offsets.
|
// Obtain the latest offsets.
|
||||||
HashMap<TopicAndPartition, KafkaCluster.LeaderOffset> toOffsets =
|
Map<TopicPartition, Long> toOffsets = consumer.endOffsets(topicPartitions);
|
||||||
new HashMap(ScalaHelpers.toJavaMap(cluster.getLatestLeaderOffsets(topicPartitions).right().get()));
|
|
||||||
|
|
||||||
// Come up with final set of OffsetRanges to read (account for new partitions, limit number of events)
|
// Come up with final set of OffsetRanges to read (account for new partitions, limit number of events)
|
||||||
long maxEventsToReadFromKafka = props.getLong(Config.MAX_EVENTS_FROM_KAFKA_SOURCE_PROP,
|
long maxEventsToReadFromKafka = props.getLong(Config.MAX_EVENTS_FROM_KAFKA_SOURCE_PROP,
|
||||||
@@ -245,15 +213,13 @@ public class KafkaOffsetGen {
|
|||||||
|
|
||||||
// check up checkpoint offsets is valid or not, if true, return checkpoint offsets,
|
// check up checkpoint offsets is valid or not, if true, return checkpoint offsets,
|
||||||
// else return earliest offsets
|
// else return earliest offsets
|
||||||
private HashMap<TopicAndPartition, KafkaCluster.LeaderOffset> checkupValidOffsets(KafkaCluster cluster,
|
private Map<TopicPartition, Long> checkupValidOffsets(KafkaConsumer consumer,
|
||||||
Option<String> lastCheckpointStr, Set<TopicAndPartition> topicPartitions) {
|
Option<String> lastCheckpointStr, Set<TopicPartition> topicPartitions) {
|
||||||
HashMap<TopicAndPartition, KafkaCluster.LeaderOffset> checkpointOffsets =
|
Map<TopicPartition, Long> checkpointOffsets = CheckpointUtils.strToOffsets(lastCheckpointStr.get());
|
||||||
CheckpointUtils.strToOffsets(lastCheckpointStr.get());
|
Map<TopicPartition, Long> earliestOffsets = consumer.beginningOffsets(topicPartitions);
|
||||||
HashMap<TopicAndPartition, KafkaCluster.LeaderOffset> earliestOffsets =
|
|
||||||
new HashMap(ScalaHelpers.toJavaMap(cluster.getEarliestLeaderOffsets(topicPartitions).right().get()));
|
|
||||||
|
|
||||||
boolean checkpointOffsetReseter = checkpointOffsets.entrySet().stream()
|
boolean checkpointOffsetReseter = checkpointOffsets.entrySet().stream()
|
||||||
.anyMatch(offset -> offset.getValue().offset() < earliestOffsets.get(offset.getKey()).offset());
|
.anyMatch(offset -> offset.getValue() < earliestOffsets.get(offset.getKey()));
|
||||||
return checkpointOffsetReseter ? earliestOffsets : checkpointOffsets;
|
return checkpointOffsetReseter ? earliestOffsets : checkpointOffsets;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -261,7 +227,7 @@ public class KafkaOffsetGen {
|
|||||||
return topicName;
|
return topicName;
|
||||||
}
|
}
|
||||||
|
|
||||||
public HashMap<String, String> getKafkaParams() {
|
public HashMap<String, Object> getKafkaParams() {
|
||||||
return kafkaParams;
|
return kafkaParams;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -28,14 +28,14 @@ import org.apache.hudi.utilities.schema.FilebasedSchemaProvider;
|
|||||||
import org.apache.hudi.utilities.sources.helpers.KafkaOffsetGen.CheckpointUtils;
|
import org.apache.hudi.utilities.sources.helpers.KafkaOffsetGen.CheckpointUtils;
|
||||||
import org.apache.hudi.utilities.sources.helpers.KafkaOffsetGen.Config;
|
import org.apache.hudi.utilities.sources.helpers.KafkaOffsetGen.Config;
|
||||||
|
|
||||||
import kafka.common.TopicAndPartition;
|
|
||||||
import org.apache.avro.generic.GenericRecord;
|
import org.apache.avro.generic.GenericRecord;
|
||||||
|
import org.apache.kafka.clients.consumer.ConsumerConfig;
|
||||||
|
import org.apache.kafka.common.TopicPartition;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.sql.Dataset;
|
||||||
import org.apache.spark.sql.Row;
|
import org.apache.spark.sql.Row;
|
||||||
import org.apache.spark.streaming.kafka.KafkaCluster.LeaderOffset;
|
import org.apache.spark.streaming.kafka010.KafkaTestUtils;
|
||||||
import org.apache.spark.streaming.kafka.KafkaTestUtils;
|
import org.apache.spark.streaming.kafka010.OffsetRange;
|
||||||
import org.apache.spark.streaming.kafka.OffsetRange;
|
|
||||||
import org.junit.After;
|
import org.junit.After;
|
||||||
import org.junit.AfterClass;
|
import org.junit.AfterClass;
|
||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
@@ -44,6 +44,7 @@ import org.junit.Test;
|
|||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
import java.util.UUID;
|
||||||
|
|
||||||
import static org.junit.Assert.assertEquals;
|
import static org.junit.Assert.assertEquals;
|
||||||
|
|
||||||
@@ -84,13 +85,12 @@ public class TestKafkaSource extends UtilitiesTestBase {
|
|||||||
private TypedProperties createPropsForJsonSource(Long maxEventsToReadFromKafkaSource) {
|
private TypedProperties createPropsForJsonSource(Long maxEventsToReadFromKafkaSource) {
|
||||||
TypedProperties props = new TypedProperties();
|
TypedProperties props = new TypedProperties();
|
||||||
props.setProperty("hoodie.deltastreamer.source.kafka.topic", TEST_TOPIC_NAME);
|
props.setProperty("hoodie.deltastreamer.source.kafka.topic", TEST_TOPIC_NAME);
|
||||||
props.setProperty("metadata.broker.list", testUtils.brokerAddress());
|
props.setProperty("bootstrap.servers", testUtils.brokerAddress());
|
||||||
props.setProperty("auto.offset.reset", "smallest");
|
props.setProperty("auto.offset.reset", "earliest");
|
||||||
props.setProperty("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
|
|
||||||
props.setProperty("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
|
|
||||||
props.setProperty("hoodie.deltastreamer.kafka.source.maxEvents",
|
props.setProperty("hoodie.deltastreamer.kafka.source.maxEvents",
|
||||||
maxEventsToReadFromKafkaSource != null ? String.valueOf(maxEventsToReadFromKafkaSource) :
|
maxEventsToReadFromKafkaSource != null ? String.valueOf(maxEventsToReadFromKafkaSource) :
|
||||||
String.valueOf(Config.maxEventsFromKafkaSource));
|
String.valueOf(Config.maxEventsFromKafkaSource));
|
||||||
|
props.setProperty(ConsumerConfig.GROUP_ID_CONFIG, UUID.randomUUID().toString());
|
||||||
return props;
|
return props;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -214,10 +214,10 @@ public class TestKafkaSource extends UtilitiesTestBase {
|
|||||||
assertEquals(Option.empty(), fetch6.getBatch());
|
assertEquals(Option.empty(), fetch6.getBatch());
|
||||||
}
|
}
|
||||||
|
|
||||||
private static HashMap<TopicAndPartition, LeaderOffset> makeOffsetMap(int[] partitions, long[] offsets) {
|
private static HashMap<TopicPartition, Long> makeOffsetMap(int[] partitions, long[] offsets) {
|
||||||
HashMap<TopicAndPartition, LeaderOffset> map = new HashMap<>();
|
HashMap<TopicPartition, Long> map = new HashMap<>();
|
||||||
for (int i = 0; i < partitions.length; i++) {
|
for (int i = 0; i < partitions.length; i++) {
|
||||||
map.put(new TopicAndPartition(TEST_TOPIC_NAME, partitions[i]), new LeaderOffset("", -1, offsets[i]));
|
map.put(new TopicPartition(TEST_TOPIC_NAME, partitions[i]), offsets[i]);
|
||||||
}
|
}
|
||||||
return map;
|
return map;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -25,6 +25,6 @@ hoodie.deltastreamer.schemaprovider.registry.url=http://localhost:8081/subjects/
|
|||||||
#hoodie.deltastreamer.source.kafka.topic=uber_trips
|
#hoodie.deltastreamer.source.kafka.topic=uber_trips
|
||||||
hoodie.deltastreamer.source.kafka.topic=impressions
|
hoodie.deltastreamer.source.kafka.topic=impressions
|
||||||
#Kafka props
|
#Kafka props
|
||||||
metadata.broker.list=localhost:9092
|
bootstrap.servers=localhost:9092
|
||||||
auto.offset.reset=smallest
|
auto.offset.reset=earliest
|
||||||
schema.registry.url=http://localhost:8081
|
schema.registry.url=http://localhost:8081
|
||||||
|
|||||||
@@ -23,7 +23,7 @@
|
|||||||
<relativePath>../../pom.xml</relativePath>
|
<relativePath>../../pom.xml</relativePath>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
<artifactId>hudi-spark-bundle</artifactId>
|
<artifactId>hudi-spark-bundle_2.11</artifactId>
|
||||||
<packaging>jar</packaging>
|
<packaging>jar</packaging>
|
||||||
|
|
||||||
<properties>
|
<properties>
|
||||||
@@ -32,7 +32,7 @@
|
|||||||
</properties>
|
</properties>
|
||||||
|
|
||||||
<build>
|
<build>
|
||||||
<plugins>
|
<plugins>
|
||||||
<plugin>
|
<plugin>
|
||||||
<groupId>org.apache.rat</groupId>
|
<groupId>org.apache.rat</groupId>
|
||||||
<artifactId>apache-rat-plugin</artifactId>
|
<artifactId>apache-rat-plugin</artifactId>
|
||||||
@@ -66,7 +66,7 @@
|
|||||||
<includes>
|
<includes>
|
||||||
<include>org.apache.hudi:hudi-common</include>
|
<include>org.apache.hudi:hudi-common</include>
|
||||||
<include>org.apache.hudi:hudi-client</include>
|
<include>org.apache.hudi:hudi-client</include>
|
||||||
<include>org.apache.hudi:hudi-spark</include>
|
<include>org.apache.hudi:hudi-spark_${scala.binary.version}</include>
|
||||||
<include>org.apache.hudi:hudi-hive</include>
|
<include>org.apache.hudi:hudi-hive</include>
|
||||||
<include>org.apache.hudi:hudi-hadoop-mr</include>
|
<include>org.apache.hudi:hudi-hadoop-mr</include>
|
||||||
<include>org.apache.hudi:hudi-timeline-service</include>
|
<include>org.apache.hudi:hudi-timeline-service</include>
|
||||||
@@ -83,8 +83,8 @@
|
|||||||
<include>org.antlr:stringtemplate</include>
|
<include>org.antlr:stringtemplate</include>
|
||||||
<include>org.apache.parquet:parquet-avro</include>
|
<include>org.apache.parquet:parquet-avro</include>
|
||||||
|
|
||||||
<include>com.twitter:bijection-avro_2.11</include>
|
<include>com.twitter:bijection-avro_${scala.binary.version}</include>
|
||||||
<include>com.twitter:bijection-core_2.11</include>
|
<include>com.twitter:bijection-core_${scala.binary.version}</include>
|
||||||
<include>io.dropwizard.metrics:metrics-core</include>
|
<include>io.dropwizard.metrics:metrics-core</include>
|
||||||
<include>io.dropwizard.metrics:metrics-graphite</include>
|
<include>io.dropwizard.metrics:metrics-graphite</include>
|
||||||
<include>com.yammer.metrics:metrics-core</include>
|
<include>com.yammer.metrics:metrics-core</include>
|
||||||
@@ -190,7 +190,7 @@
|
|||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.hudi</groupId>
|
<groupId>org.apache.hudi</groupId>
|
||||||
<artifactId>hudi-spark</artifactId>
|
<artifactId>hudi-spark_${scala.binary.version}</artifactId>
|
||||||
<version>${project.version}</version>
|
<version>${project.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
|||||||
@@ -24,7 +24,7 @@
|
|||||||
<relativePath>../../pom.xml</relativePath>
|
<relativePath>../../pom.xml</relativePath>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
<artifactId>hudi-utilities-bundle</artifactId>
|
<artifactId>hudi-utilities-bundle_2.11</artifactId>
|
||||||
<packaging>jar</packaging>
|
<packaging>jar</packaging>
|
||||||
|
|
||||||
<properties>
|
<properties>
|
||||||
@@ -67,8 +67,8 @@
|
|||||||
<includes>
|
<includes>
|
||||||
<include>org.apache.hudi:hudi-common</include>
|
<include>org.apache.hudi:hudi-common</include>
|
||||||
<include>org.apache.hudi:hudi-client</include>
|
<include>org.apache.hudi:hudi-client</include>
|
||||||
<include>org.apache.hudi:hudi-utilities</include>
|
<include>org.apache.hudi:hudi-utilities_${scala.binary.version}</include>
|
||||||
<include>org.apache.hudi:hudi-spark</include>
|
<include>org.apache.hudi:hudi-spark_${scala.binary.version}</include>
|
||||||
<include>org.apache.hudi:hudi-hive</include>
|
<include>org.apache.hudi:hudi-hive</include>
|
||||||
<include>org.apache.hudi:hudi-hadoop-mr</include>
|
<include>org.apache.hudi:hudi-hadoop-mr</include>
|
||||||
<include>org.apache.hudi:hudi-timeline-service</include>
|
<include>org.apache.hudi:hudi-timeline-service</include>
|
||||||
@@ -85,8 +85,8 @@
|
|||||||
<include>org.antlr:stringtemplate</include>
|
<include>org.antlr:stringtemplate</include>
|
||||||
<include>org.apache.parquet:parquet-avro</include>
|
<include>org.apache.parquet:parquet-avro</include>
|
||||||
|
|
||||||
<include>com.twitter:bijection-avro_2.11</include>
|
<include>com.twitter:bijection-avro_${scala.binary.version}</include>
|
||||||
<include>com.twitter:bijection-core_2.11</include>
|
<include>com.twitter:bijection-core_${scala.binary.version}</include>
|
||||||
<include>io.confluent:kafka-avro-serializer</include>
|
<include>io.confluent:kafka-avro-serializer</include>
|
||||||
<include>io.confluent:common-config</include>
|
<include>io.confluent:common-config</include>
|
||||||
<include>io.confluent:common-utils</include>
|
<include>io.confluent:common-utils</include>
|
||||||
@@ -94,8 +94,8 @@
|
|||||||
<include>io.dropwizard.metrics:metrics-core</include>
|
<include>io.dropwizard.metrics:metrics-core</include>
|
||||||
<include>io.dropwizard.metrics:metrics-graphite</include>
|
<include>io.dropwizard.metrics:metrics-graphite</include>
|
||||||
<include>com.yammer.metrics:metrics-core</include>
|
<include>com.yammer.metrics:metrics-core</include>
|
||||||
<include>org.apache.spark:spark-streaming-kafka-0-8_2.11</include>
|
<include>org.apache.spark:spark-streaming-kafka-0-10_${scala.binary.version}</include>
|
||||||
<include>org.apache.kafka:kafka_2.11</include>
|
<include>org.apache.kafka:kafka_${scala.binary.version}</include>
|
||||||
<include>com.101tec:zkclient</include>
|
<include>com.101tec:zkclient</include>
|
||||||
<include>org.apache.kafka:kafka-clients</include>
|
<include>org.apache.kafka:kafka-clients</include>
|
||||||
|
|
||||||
@@ -200,12 +200,12 @@
|
|||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.hudi</groupId>
|
<groupId>org.apache.hudi</groupId>
|
||||||
<artifactId>hudi-spark</artifactId>
|
<artifactId>hudi-spark_${scala.binary.version}</artifactId>
|
||||||
<version>${project.version}</version>
|
<version>${project.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.hudi</groupId>
|
<groupId>org.apache.hudi</groupId>
|
||||||
<artifactId>hudi-utilities</artifactId>
|
<artifactId>hudi-utilities_${scala.binary.version}</artifactId>
|
||||||
<version>${project.version}</version>
|
<version>${project.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
|||||||
50
pom.xml
50
pom.xml
@@ -75,6 +75,7 @@
|
|||||||
|
|
||||||
<java.version>1.8</java.version>
|
<java.version>1.8</java.version>
|
||||||
<fasterxml.version>2.6.7</fasterxml.version>
|
<fasterxml.version>2.6.7</fasterxml.version>
|
||||||
|
<kafka.version>2.0.0</kafka.version>
|
||||||
<glassfish.version>2.17</glassfish.version>
|
<glassfish.version>2.17</glassfish.version>
|
||||||
<parquet.version>1.10.1</parquet.version>
|
<parquet.version>1.10.1</parquet.version>
|
||||||
<junit.version>4.11</junit.version>
|
<junit.version>4.11</junit.version>
|
||||||
@@ -91,7 +92,7 @@
|
|||||||
<spark.version>2.4.4</spark.version>
|
<spark.version>2.4.4</spark.version>
|
||||||
<avro.version>1.8.2</avro.version>
|
<avro.version>1.8.2</avro.version>
|
||||||
<scala.version>2.11.8</scala.version>
|
<scala.version>2.11.8</scala.version>
|
||||||
<scala.libversion>2.11</scala.libversion>
|
<scala.binary.version>2.11</scala.binary.version>
|
||||||
<apache-rat-plugin.version>0.12</apache-rat-plugin.version>
|
<apache-rat-plugin.version>0.12</apache-rat-plugin.version>
|
||||||
<scala-maven-plugin.version>3.3.1</scala-maven-plugin.version>
|
<scala-maven-plugin.version>3.3.1</scala-maven-plugin.version>
|
||||||
<scalatest.version>3.0.1</scalatest.version>
|
<scalatest.version>3.0.1</scalatest.version>
|
||||||
@@ -423,8 +424,8 @@
|
|||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.fasterxml.jackson.module</groupId>
|
<groupId>com.fasterxml.jackson.module</groupId>
|
||||||
<artifactId>jackson-module-scala_2.11</artifactId>
|
<artifactId>jackson-module-scala_${scala.binary.version}</artifactId>
|
||||||
<version>${fasterxml.version}</version>
|
<version>${fasterxml.version}.1</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<!-- Glassfish -->
|
<!-- Glassfish -->
|
||||||
@@ -470,13 +471,13 @@
|
|||||||
<!-- Spark -->
|
<!-- Spark -->
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-core_2.11</artifactId>
|
<artifactId>spark-core_${scala.binary.version}</artifactId>
|
||||||
<version>${spark.version}</version>
|
<version>${spark.version}</version>
|
||||||
<scope>provided</scope>
|
<scope>provided</scope>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-sql_2.11</artifactId>
|
<artifactId>spark-sql_${scala.binary.version}</artifactId>
|
||||||
<version>${spark.version}</version>
|
<version>${spark.version}</version>
|
||||||
<scope>provided</scope>
|
<scope>provided</scope>
|
||||||
</dependency>
|
</dependency>
|
||||||
@@ -484,7 +485,7 @@
|
|||||||
<!-- Spark (Packages) -->
|
<!-- Spark (Packages) -->
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-avro_2.11</artifactId>
|
<artifactId>spark-avro_${scala.binary.version}</artifactId>
|
||||||
<version>${spark.version}</version>
|
<version>${spark.version}</version>
|
||||||
<scope>provided</scope>
|
<scope>provided</scope>
|
||||||
</dependency>
|
</dependency>
|
||||||
@@ -1040,6 +1041,43 @@
|
|||||||
</plugins>
|
</plugins>
|
||||||
</build>
|
</build>
|
||||||
</profile>
|
</profile>
|
||||||
|
<!-- Exists for backwards compatibility; profile doesn't do anything -->
|
||||||
|
<profile>
|
||||||
|
<id>scala-2.11</id>
|
||||||
|
</profile>
|
||||||
|
|
||||||
|
<profile>
|
||||||
|
<id>scala-2.12</id>
|
||||||
|
<properties>
|
||||||
|
<scala.version>2.12.10</scala.version>
|
||||||
|
<scala.binary.version>2.12</scala.binary.version>
|
||||||
|
</properties>
|
||||||
|
<build>
|
||||||
|
<plugins>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-enforcer-plugin</artifactId>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<id>enforce-versions</id>
|
||||||
|
<goals>
|
||||||
|
<goal>enforce</goal>
|
||||||
|
</goals>
|
||||||
|
<configuration>
|
||||||
|
<rules>
|
||||||
|
<bannedDependencies>
|
||||||
|
<excludes combine.children="append">
|
||||||
|
<exclude>*:*_2.11</exclude>
|
||||||
|
</excludes>
|
||||||
|
</bannedDependencies>
|
||||||
|
</rules>
|
||||||
|
</configuration>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
</profile>
|
||||||
</profiles>
|
</profiles>
|
||||||
|
|
||||||
<issueManagement>
|
<issueManagement>
|
||||||
|
|||||||
Reference in New Issue
Block a user