[HUDI-4023] Decouple hudi-spark from hudi-utilities-slim-bundle (#5641)
This commit is contained in:
@@ -605,8 +605,6 @@ public class DeltaSync implements Serializable {
|
||||
long totalErrorRecords = writeStatusRDD.mapToDouble(WriteStatus::getTotalErrorRecords).sum().longValue();
|
||||
long totalRecords = writeStatusRDD.mapToDouble(WriteStatus::getTotalRecords).sum().longValue();
|
||||
boolean hasErrors = totalErrorRecords > 0;
|
||||
long hiveSyncTimeMs = 0;
|
||||
long metaSyncTimeMs = 0;
|
||||
if (!hasErrors || cfg.commitOnErrors) {
|
||||
HashMap<String, String> checkpointCommitMetadata = new HashMap<>();
|
||||
if (checkpointStr != null) {
|
||||
|
||||
@@ -17,6 +17,89 @@
|
||||
|
||||
# Usage of hudi-utilities-slim-bundle
|
||||
|
||||
Starting from versions 0.11, Hudi provides hudi-utilities-slim-bundle which excludes hudi-spark-datasource modules.
|
||||
This new bundle is intended to be used with Hudi Spark bundle together, if using hudi-utilities-bundle solely
|
||||
introduces problems for a specific Spark version.
|
||||
Starting from versions 0.11, Hudi provides hudi-utilities-slim-bundle which excludes hudi-spark-datasource modules. This new bundle is intended to be used with Hudi Spark bundle together, if using
|
||||
hudi-utilities-bundle solely introduces problems for a specific Spark version.
|
||||
|
||||
## Example with Spark 2.4.7
|
||||
|
||||
* Build Hudi: `mvn clean install -DskipTests`
|
||||
* Run deltastreamer
|
||||
|
||||
```
|
||||
bin/spark-submit \
|
||||
--driver-memory 4g --executor-memory 2g --num-executors 3 --executor-cores 1 \
|
||||
--conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
|
||||
--conf spark.sql.catalogImplementation=hive \
|
||||
--conf spark.driver.maxResultSize=1g \
|
||||
--conf spark.ui.port=6679 \
|
||||
--packages org.apache.spark:spark-avro_2.11:2.4.7 \
|
||||
--jars /path/to/hudi/packaging/hudi-spark-bundle/target/hudi-spark-bundle_2.11-0.12.0-SNAPSHOT.jar \
|
||||
--class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer `ls /path/to/hudi/packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.11-0.12.0-SNAPSHOT.jar` \
|
||||
--props `ls /path/to/hudi/dfs-source.properties` \
|
||||
--source-class org.apache.hudi.utilities.sources.ParquetDFSSource \
|
||||
--schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
|
||||
--source-ordering-field tpep_dropoff_datetime \
|
||||
--table-type COPY_ON_WRITE \
|
||||
--target-base-path file:\/\/\/tmp/hudi-ny-taxi-spark24/ \
|
||||
--target-table ny_hudi_tbl \
|
||||
--op UPSERT \
|
||||
--continuous \
|
||||
--source-limit 5000000 \
|
||||
--min-sync-interval-seconds 60
|
||||
```
|
||||
|
||||
## Example with Spark 3.1.2
|
||||
|
||||
* Build Hudi: `mvn clean install -DskipTests -Dspark3.1 -Dscala-2.12`
|
||||
* Run deltastreamer
|
||||
|
||||
```
|
||||
bin/spark-submit \
|
||||
--driver-memory 4g --executor-memory 2g --num-executors 3 --executor-cores 1 \
|
||||
--conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
|
||||
--conf spark.sql.catalogImplementation=hive \
|
||||
--conf spark.driver.maxResultSize=1g \
|
||||
--conf spark.ui.port=6679 \
|
||||
--packages org.apache.spark:spark-avro_2.12:3.1.2 \
|
||||
--jars /path/to/hudi/packaging/hudi-spark-bundle/target/hudi-spark3.1-bundle_2.12-0.12.0-SNAPSHOT.jar \
|
||||
--class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer `ls /path/to/hudi/packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.12-0.12.0-SNAPSHOT.jar` \
|
||||
--props `ls /path/to/hudi/dfs-source.properties` \
|
||||
--source-class org.apache.hudi.utilities.sources.ParquetDFSSource \
|
||||
--schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
|
||||
--source-ordering-field tpep_dropoff_datetime \
|
||||
--table-type COPY_ON_WRITE \
|
||||
--target-base-path file:\/\/\/tmp/hudi-ny-taxi-spark31/ \
|
||||
--target-table ny_hudi_tbl \
|
||||
--op UPSERT \
|
||||
--continuous \
|
||||
--source-limit 5000000 \
|
||||
--min-sync-interval-seconds 60
|
||||
```
|
||||
|
||||
## Example with Spark 3.2.0
|
||||
|
||||
* Build Hudi: `mvn clean install -DskipTests -Dspark3.2 -Dscala-2.12`
|
||||
* Run deltastreamer
|
||||
|
||||
```
|
||||
bin/spark-submit \
|
||||
--driver-memory 4g --executor-memory 2g --num-executors 3 --executor-cores 1 \
|
||||
--conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
|
||||
--conf spark.sql.catalogImplementation=hive \
|
||||
--conf spark.driver.maxResultSize=1g \
|
||||
--conf spark.ui.port=6679 \
|
||||
--packages org.apache.spark:spark-avro_2.12:3.2.0 \
|
||||
--jars /path/to/hudi/packaging/hudi-spark-bundle/target/hudi-spark3.2-bundle_2.12-0.12.0-SNAPSHOT.jar \
|
||||
--class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer `ls /path/to/hudi/packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.12-0.12.0-SNAPSHOT.jar` \
|
||||
--props `ls /path/to/hudi/dfs-source.properties` \
|
||||
--source-class org.apache.hudi.utilities.sources.ParquetDFSSource \
|
||||
--schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
|
||||
--source-ordering-field tpep_dropoff_datetime \
|
||||
--table-type COPY_ON_WRITE \
|
||||
--target-base-path file:\/\/\/tmp/hudi-ny-taxi-spark32/ \
|
||||
--target-table ny_hudi_tbl \
|
||||
--op UPSERT \
|
||||
--continuous \
|
||||
--source-limit 5000000 \
|
||||
--min-sync-interval-seconds 60
|
||||
```
|
||||
|
||||
@@ -77,7 +77,7 @@
|
||||
<transformer implementation="org.apache.maven.plugins.shade.resource.ApacheLicenseResourceTransformer">
|
||||
</transformer>
|
||||
<transformer implementation="org.apache.maven.plugins.shade.resource.ApacheNoticeResourceTransformer">
|
||||
<addHeader>true</addHeader>
|
||||
<addHeader>true</addHeader>
|
||||
</transformer>
|
||||
<transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
|
||||
<resource>META-INF/LICENSE</resource>
|
||||
@@ -92,10 +92,7 @@
|
||||
<includes>
|
||||
<include>org.apache.hudi:hudi-common</include>
|
||||
<include>org.apache.hudi:hudi-client-common</include>
|
||||
<include>org.apache.hudi:hudi-spark-client</include>
|
||||
<include>org.apache.hudi:hudi-utilities_${scala.binary.version}</include>
|
||||
<include>org.apache.hudi:hudi-hive-sync</include>
|
||||
<include>org.apache.hudi:hudi-sync-common</include>
|
||||
<include>org.apache.hudi:hudi-hadoop-mr</include>
|
||||
<include>org.apache.hudi:hudi-timeline-service</include>
|
||||
<include>org.apache.hudi:hudi-aws</include>
|
||||
@@ -136,13 +133,6 @@
|
||||
<include>org.apache.kafka:kafka_${scala.binary.version}</include>
|
||||
<include>com.101tec:zkclient</include>
|
||||
<include>org.apache.kafka:kafka-clients</include>
|
||||
|
||||
<include>org.apache.hive:hive-common</include>
|
||||
<include>org.apache.hive:hive-service</include>
|
||||
<include>org.apache.hive:hive-service-rpc</include>
|
||||
<include>org.apache.hive:hive-metastore</include>
|
||||
<include>org.apache.hive:hive-jdbc</include>
|
||||
|
||||
<include>org.apache.hbase:hbase-client</include>
|
||||
<include>org.apache.hbase:hbase-common</include>
|
||||
<include>org.apache.hbase:hbase-hadoop-compat</include>
|
||||
@@ -178,10 +168,6 @@
|
||||
<pattern>com.beust.jcommander.</pattern>
|
||||
<shadedPattern>org.apache.hudi.com.beust.jcommander.</shadedPattern>
|
||||
</relocation>
|
||||
<relocation>
|
||||
<pattern>org.apache.hive.jdbc.</pattern>
|
||||
<shadedPattern>${utilities.bundle.hive.shade.prefix}org.apache.hive.jdbc.</shadedPattern>
|
||||
</relocation>
|
||||
<relocation>
|
||||
<pattern>org.apache.commons.io.</pattern>
|
||||
<shadedPattern>org.apache.hudi.org.apache.commons.io.</shadedPattern>
|
||||
@@ -205,10 +191,6 @@
|
||||
<pattern>org.apache.hadoop.hive.metastore.</pattern>
|
||||
<shadedPattern>${utilities.bundle.hive.shade.prefix}org.apache.hadoop.hive.metastore.</shadedPattern>
|
||||
</relocation>
|
||||
<relocation>
|
||||
<pattern>org.apache.hive.common.</pattern>
|
||||
<shadedPattern>${utilities.bundle.hive.shade.prefix}org.apache.hive.common.</shadedPattern>
|
||||
</relocation>
|
||||
<relocation>
|
||||
<pattern>org.apache.hadoop.hive.common.</pattern>
|
||||
<shadedPattern>${utilities.bundle.hive.shade.prefix}org.apache.hadoop.hive.common.</shadedPattern>
|
||||
@@ -217,10 +199,6 @@
|
||||
<pattern>org.apache.hadoop.hive.conf.</pattern>
|
||||
<shadedPattern>${utilities.bundle.hive.shade.prefix}org.apache.hadoop.hive.conf.</shadedPattern>
|
||||
</relocation>
|
||||
<relocation>
|
||||
<pattern>org.apache.hive.service.</pattern>
|
||||
<shadedPattern>${utilities.bundle.hive.shade.prefix}org.apache.hive.service.</shadedPattern>
|
||||
</relocation>
|
||||
<relocation>
|
||||
<pattern>org.apache.hadoop.hive.service.</pattern>
|
||||
<shadedPattern>${utilities.bundle.hive.shade.prefix}org.apache.hadoop.hive.service.</shadedPattern>
|
||||
@@ -344,116 +322,27 @@
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<artifactId>hudi-client-common</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<artifactId>hudi-spark-client</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<artifactId>hudi-hive-sync</artifactId>
|
||||
<artifactId>hudi-utilities_${scala.binary.version}</artifactId>
|
||||
<version>${project.version}</version>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<groupId>javax.servlet</groupId>
|
||||
<artifactId>servlet-api</artifactId>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<artifactId>hudi-spark-common_${scala.binary.version}</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<artifactId>hudi-spark_${scala.binary.version}</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<artifactId>${hudi.spark.module}_${scala.binary.version}</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<artifactId>${hudi.spark.common.module}</artifactId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<artifactId>hudi-spark-common_${scala.binary.version}</artifactId>
|
||||
<version>${project.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<artifactId>hudi-spark_${scala.binary.version}</artifactId>
|
||||
<version>${project.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<artifactId>${hudi.spark.module}_${scala.binary.version}</artifactId>
|
||||
<version>${project.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<artifactId>${hudi.spark.common.module}</artifactId>
|
||||
<version>${project.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<artifactId>hudi-utilities_${scala.binary.version}</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
<!-- Hive -->
|
||||
<dependency>
|
||||
<groupId>${hive.groupid}</groupId>
|
||||
<artifactId>hive-service</artifactId>
|
||||
<version>${hive.version}</version>
|
||||
<scope>${utilities.bundle.hive.scope}</scope>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>${hive.groupid}</groupId>
|
||||
<artifactId>hive-service-rpc</artifactId>
|
||||
<version>${hive.version}</version>
|
||||
<scope>${utilities.bundle.hive.scope}</scope>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>${hive.groupid}</groupId>
|
||||
<artifactId>hive-jdbc</artifactId>
|
||||
<version>${hive.version}</version>
|
||||
<scope>${utilities.bundle.hive.scope}</scope>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>${hive.groupid}</groupId>
|
||||
<artifactId>hive-metastore</artifactId>
|
||||
<version>${hive.version}</version>
|
||||
<scope>${utilities.bundle.hive.scope}</scope>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>${hive.groupid}</groupId>
|
||||
<artifactId>hive-common</artifactId>
|
||||
<version>${hive.version}</version>
|
||||
<scope>${utilities.bundle.hive.scope}</scope>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.htrace</groupId>
|
||||
<artifactId>htrace-core</artifactId>
|
||||
<version>${htrace.version}</version>
|
||||
<scope>compile</scope>
|
||||
</dependency>
|
||||
|
||||
<!-- zookeeper -->
|
||||
<dependency>
|
||||
<groupId>org.apache.curator</groupId>
|
||||
<artifactId>curator-framework</artifactId>
|
||||
<version>${zk-curator.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.curator</groupId>
|
||||
<artifactId>curator-client</artifactId>
|
||||
<version>${zk-curator.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.curator</groupId>
|
||||
<artifactId>curator-recipes</artifactId>
|
||||
<version>${zk-curator.version}</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<profiles>
|
||||
|
||||
7
pom.xml
7
pom.xml
@@ -99,6 +99,7 @@
|
||||
<pulsar.version>2.8.1</pulsar.version>
|
||||
<confluent.version>5.3.4</confluent.version>
|
||||
<glassfish.version>2.17</glassfish.version>
|
||||
<glassfish.el.version>3.0.1-b12</glassfish.el.version>
|
||||
<parquet.version>1.10.1</parquet.version>
|
||||
<junit.jupiter.version>5.7.0-M1</junit.jupiter.version>
|
||||
<junit.vintage.version>5.7.0-M1</junit.vintage.version>
|
||||
@@ -556,6 +557,12 @@
|
||||
<artifactId>jersey-container-servlet-core</artifactId>
|
||||
<version>${glassfish.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.glassfish</groupId>
|
||||
<artifactId>javax.el</artifactId>
|
||||
<version>${glassfish.el.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
|
||||
<!-- Avro -->
|
||||
<dependency>
|
||||
|
||||
Reference in New Issue
Block a user