[HUDI-4023] Decouple hudi-spark from hudi-utilities-slim-bundle (#5641)

Author: Sagar Sumit
Date: 2022-05-26 11:28:49 +05:30
Committed by: GitHub
Parent: 98c5c6c654
Commit: 31e13db1f0
4 changed files with 109 additions and 132 deletions

View File

@@ -605,8 +605,6 @@ public class DeltaSync implements Serializable {
     long totalErrorRecords = writeStatusRDD.mapToDouble(WriteStatus::getTotalErrorRecords).sum().longValue();
     long totalRecords = writeStatusRDD.mapToDouble(WriteStatus::getTotalRecords).sum().longValue();
     boolean hasErrors = totalErrorRecords > 0;
-    long hiveSyncTimeMs = 0;
-    long metaSyncTimeMs = 0;
     if (!hasErrors || cfg.commitOnErrors) {
       HashMap<String, String> checkpointCommitMetadata = new HashMap<>();
       if (checkpointStr != null) {

View File

@@ -17,6 +17,89 @@
 # Usage of hudi-utilities-slim-bundle
-Starting from versions 0.11, Hudi provides hudi-utilities-slim-bundle which excludes hudi-spark-datasource modules.
-This new bundle is intended to be used with Hudi Spark bundle together, if using hudi-utilities-bundle solely
-introduces problems for a specific Spark version.
+Starting from version 0.11, Hudi provides hudi-utilities-slim-bundle, which excludes the hudi-spark-datasource modules. This new bundle is intended to be used together with the Hudi Spark bundle, for cases where using hudi-utilities-bundle alone causes problems for a specific Spark version.
+## Example with Spark 2.4.7
+* Build Hudi: `mvn clean install -DskipTests`
+* Run deltastreamer
+```
+bin/spark-submit \
+--driver-memory 4g --executor-memory 2g --num-executors 3 --executor-cores 1 \
+--conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
+--conf spark.sql.catalogImplementation=hive \
+--conf spark.driver.maxResultSize=1g \
+--conf spark.ui.port=6679 \
+--packages org.apache.spark:spark-avro_2.11:2.4.7 \
+--jars /path/to/hudi/packaging/hudi-spark-bundle/target/hudi-spark-bundle_2.11-0.12.0-SNAPSHOT.jar \
+--class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer `ls /path/to/hudi/packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.11-0.12.0-SNAPSHOT.jar` \
+--props `ls /path/to/hudi/dfs-source.properties` \
+--source-class org.apache.hudi.utilities.sources.ParquetDFSSource \
+--schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
+--source-ordering-field tpep_dropoff_datetime \
+--table-type COPY_ON_WRITE \
+--target-base-path file:///tmp/hudi-ny-taxi-spark24/ \
+--target-table ny_hudi_tbl \
+--op UPSERT \
+--continuous \
+--source-limit 5000000 \
+--min-sync-interval-seconds 60
+```
+## Example with Spark 3.1.2
+* Build Hudi: `mvn clean install -DskipTests -Dspark3.1 -Dscala-2.12`
+* Run deltastreamer
+```
+bin/spark-submit \
+--driver-memory 4g --executor-memory 2g --num-executors 3 --executor-cores 1 \
+--conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
+--conf spark.sql.catalogImplementation=hive \
+--conf spark.driver.maxResultSize=1g \
+--conf spark.ui.port=6679 \
+--packages org.apache.spark:spark-avro_2.12:3.1.2 \
+--jars /path/to/hudi/packaging/hudi-spark-bundle/target/hudi-spark3.1-bundle_2.12-0.12.0-SNAPSHOT.jar \
+--class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer `ls /path/to/hudi/packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.12-0.12.0-SNAPSHOT.jar` \
+--props `ls /path/to/hudi/dfs-source.properties` \
+--source-class org.apache.hudi.utilities.sources.ParquetDFSSource \
+--schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
+--source-ordering-field tpep_dropoff_datetime \
+--table-type COPY_ON_WRITE \
+--target-base-path file:///tmp/hudi-ny-taxi-spark31/ \
+--target-table ny_hudi_tbl \
+--op UPSERT \
+--continuous \
+--source-limit 5000000 \
+--min-sync-interval-seconds 60
+```
+## Example with Spark 3.2.0
+* Build Hudi: `mvn clean install -DskipTests -Dspark3.2 -Dscala-2.12`
+* Run deltastreamer
+```
+bin/spark-submit \
+--driver-memory 4g --executor-memory 2g --num-executors 3 --executor-cores 1 \
+--conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
+--conf spark.sql.catalogImplementation=hive \
+--conf spark.driver.maxResultSize=1g \
+--conf spark.ui.port=6679 \
+--packages org.apache.spark:spark-avro_2.12:3.2.0 \
+--jars /path/to/hudi/packaging/hudi-spark-bundle/target/hudi-spark3.2-bundle_2.12-0.12.0-SNAPSHOT.jar \
+--class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer `ls /path/to/hudi/packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.12-0.12.0-SNAPSHOT.jar` \
+--props `ls /path/to/hudi/dfs-source.properties` \
+--source-class org.apache.hudi.utilities.sources.ParquetDFSSource \
+--schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
+--source-ordering-field tpep_dropoff_datetime \
+--table-type COPY_ON_WRITE \
+--target-base-path file:///tmp/hudi-ny-taxi-spark32/ \
+--target-table ny_hudi_tbl \
+--op UPSERT \
+--continuous \
+--source-limit 5000000 \
+--min-sync-interval-seconds 60
+```
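All three invocations above point `--props` at a `dfs-source.properties` file whose contents are not part of this commit. For orientation only, here is a minimal sketch of what such a file could contain for the ParquetDFSSource + FilebasedSchemaProvider combination used above; every path and field name is a placeholder, not something this commit prescribes:

```
# dfs-source.properties -- illustrative sketch only; all paths/fields are placeholders.
# Directory that ParquetDFSSource scans for input parquet files:
hoodie.deltastreamer.source.dfs.root=file:///tmp/ny-taxi-input
# Avro schema files read by FilebasedSchemaProvider:
hoodie.deltastreamer.schemaprovider.source.schema.file=file:///path/to/source.avsc
hoodie.deltastreamer.schemaprovider.target.schema.file=file:///path/to/target.avsc
# Record key and partition path for the target table:
hoodie.datasource.write.recordkey.field=trip_id
hoodie.datasource.write.partitionpath.field=partition_col
```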

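Once a run completes, a quick way to sanity-check the spark-bundle/slim-bundle pairing is to read the table back with only the Spark bundle on the classpath. A minimal sketch, assuming the Spark 3.1 output path from the example above:

```
# Illustrative only: reads should work with just the Spark bundle, no utilities bundle.
echo 'spark.read.format("hudi").load("file:///tmp/hudi-ny-taxi-spark31").count' | \
  bin/spark-shell --jars /path/to/hudi/packaging/hudi-spark-bundle/target/hudi-spark3.1-bundle_2.12-0.12.0-SNAPSHOT.jar
```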
View File

@@ -92,10 +92,7 @@
   <includes>
     <include>org.apache.hudi:hudi-common</include>
     <include>org.apache.hudi:hudi-client-common</include>
-    <include>org.apache.hudi:hudi-spark-client</include>
     <include>org.apache.hudi:hudi-utilities_${scala.binary.version}</include>
-    <include>org.apache.hudi:hudi-hive-sync</include>
-    <include>org.apache.hudi:hudi-sync-common</include>
     <include>org.apache.hudi:hudi-hadoop-mr</include>
     <include>org.apache.hudi:hudi-timeline-service</include>
     <include>org.apache.hudi:hudi-aws</include>
@@ -136,13 +133,6 @@
     <include>org.apache.kafka:kafka_${scala.binary.version}</include>
     <include>com.101tec:zkclient</include>
     <include>org.apache.kafka:kafka-clients</include>
-    <include>org.apache.hive:hive-common</include>
-    <include>org.apache.hive:hive-service</include>
-    <include>org.apache.hive:hive-service-rpc</include>
-    <include>org.apache.hive:hive-metastore</include>
-    <include>org.apache.hive:hive-jdbc</include>
     <include>org.apache.hbase:hbase-client</include>
     <include>org.apache.hbase:hbase-common</include>
     <include>org.apache.hbase:hbase-hadoop-compat</include>
@@ -178,10 +168,6 @@
     <pattern>com.beust.jcommander.</pattern>
     <shadedPattern>org.apache.hudi.com.beust.jcommander.</shadedPattern>
   </relocation>
-  <relocation>
-    <pattern>org.apache.hive.jdbc.</pattern>
-    <shadedPattern>${utilities.bundle.hive.shade.prefix}org.apache.hive.jdbc.</shadedPattern>
-  </relocation>
   <relocation>
     <pattern>org.apache.commons.io.</pattern>
     <shadedPattern>org.apache.hudi.org.apache.commons.io.</shadedPattern>
@@ -205,10 +191,6 @@
     <pattern>org.apache.hadoop.hive.metastore.</pattern>
     <shadedPattern>${utilities.bundle.hive.shade.prefix}org.apache.hadoop.hive.metastore.</shadedPattern>
   </relocation>
-  <relocation>
-    <pattern>org.apache.hive.common.</pattern>
-    <shadedPattern>${utilities.bundle.hive.shade.prefix}org.apache.hive.common.</shadedPattern>
-  </relocation>
   <relocation>
     <pattern>org.apache.hadoop.hive.common.</pattern>
     <shadedPattern>${utilities.bundle.hive.shade.prefix}org.apache.hadoop.hive.common.</shadedPattern>
@@ -217,10 +199,6 @@
     <pattern>org.apache.hadoop.hive.conf.</pattern>
     <shadedPattern>${utilities.bundle.hive.shade.prefix}org.apache.hadoop.hive.conf.</shadedPattern>
   </relocation>
-  <relocation>
-    <pattern>org.apache.hive.service.</pattern>
-    <shadedPattern>${utilities.bundle.hive.shade.prefix}org.apache.hive.service.</shadedPattern>
-  </relocation>
   <relocation>
     <pattern>org.apache.hadoop.hive.service.</pattern>
     <shadedPattern>${utilities.bundle.hive.shade.prefix}org.apache.hadoop.hive.service.</shadedPattern>
@@ -344,115 +322,26 @@
     </dependency>
     <dependency>
       <groupId>org.apache.hudi</groupId>
-      <artifactId>hudi-client-common</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.hudi</groupId>
-      <artifactId>hudi-spark-client</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.hudi</groupId>
-      <artifactId>hudi-hive-sync</artifactId>
+      <artifactId>hudi-utilities_${scala.binary.version}</artifactId>
       <version>${project.version}</version>
       <exclusions>
         <exclusion>
-          <groupId>javax.servlet</groupId>
-          <artifactId>servlet-api</artifactId>
-        </exclusion>
-      </exclusions>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.hudi</groupId>
-      <artifactId>hudi-spark-common_${scala.binary.version}</artifactId>
-      <version>${project.version}</version>
-      <scope>provided</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.hudi</groupId>
-      <artifactId>hudi-spark_${scala.binary.version}</artifactId>
-      <version>${project.version}</version>
-      <scope>provided</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.hudi</groupId>
-      <artifactId>${hudi.spark.module}_${scala.binary.version}</artifactId>
-      <version>${project.version}</version>
-      <scope>provided</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.hudi</groupId>
-      <artifactId>${hudi.spark.common.module}</artifactId>
-      <version>${project.version}</version>
-      <scope>provided</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.hudi</groupId>
-      <artifactId>hudi-utilities_${scala.binary.version}</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    <!-- Hive -->
-    <dependency>
-      <groupId>${hive.groupid}</groupId>
-      <artifactId>hive-service</artifactId>
-      <version>${hive.version}</version>
-      <scope>${utilities.bundle.hive.scope}</scope>
-    </dependency>
-    <dependency>
-      <groupId>${hive.groupid}</groupId>
-      <artifactId>hive-service-rpc</artifactId>
-      <version>${hive.version}</version>
-      <scope>${utilities.bundle.hive.scope}</scope>
-    </dependency>
-    <dependency>
-      <groupId>${hive.groupid}</groupId>
-      <artifactId>hive-jdbc</artifactId>
-      <version>${hive.version}</version>
-      <scope>${utilities.bundle.hive.scope}</scope>
-    </dependency>
-    <dependency>
-      <groupId>${hive.groupid}</groupId>
-      <artifactId>hive-metastore</artifactId>
-      <version>${hive.version}</version>
-      <scope>${utilities.bundle.hive.scope}</scope>
-    </dependency>
-    <dependency>
-      <groupId>${hive.groupid}</groupId>
-      <artifactId>hive-common</artifactId>
-      <version>${hive.version}</version>
-      <scope>${utilities.bundle.hive.scope}</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.htrace</groupId>
-      <artifactId>htrace-core</artifactId>
-      <version>${htrace.version}</version>
-      <scope>compile</scope>
-    </dependency>
-    <!-- zookeeper -->
-    <dependency>
-      <groupId>org.apache.curator</groupId>
-      <artifactId>curator-framework</artifactId>
-      <version>${zk-curator.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.curator</groupId>
-      <artifactId>curator-client</artifactId>
-      <version>${zk-curator.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.curator</groupId>
-      <artifactId>curator-recipes</artifactId>
-      <version>${zk-curator.version}</version>
+          <groupId>org.apache.hudi</groupId>
+          <artifactId>hudi-spark-common_${scala.binary.version}</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.apache.hudi</groupId>
+          <artifactId>hudi-spark_${scala.binary.version}</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.apache.hudi</groupId>
+          <artifactId>${hudi.spark.module}_${scala.binary.version}</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.apache.hudi</groupId>
+          <artifactId>${hudi.spark.common.module}</artifactId>
+        </exclusion>
+      </exclusions>
     </dependency>
   </dependencies>
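The net effect of the pom change above: rather than bundling hudi-spark-client, hudi-hive-sync, and the Hive, htrace, and curator jars itself, the slim bundle now depends on hudi-utilities with the four hudi-spark-datasource modules excluded, presumably leaving those pieces to the paired Spark bundle and the runtime environment. A quick, illustrative check after a local build; the grep patterns are assumptions about package layout, not part of this commit:

```
# Hive JDBC classes should no longer ship in the slim bundle (expect 0), while the
# Spark bundle still carries hive-sync classes (expect > 0). `grep -c` exits
# non-zero when it counts 0 matches, hence the `|| true`.
jar tf packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.12-0.12.0-SNAPSHOT.jar \
  | grep -c 'org/apache/hive/jdbc' || true
jar tf packaging/hudi-spark-bundle/target/hudi-spark3.1-bundle_2.12-0.12.0-SNAPSHOT.jar \
  | grep -c 'org/apache/hudi/hive/' || true
```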

View File

@@ -99,6 +99,7 @@
     <pulsar.version>2.8.1</pulsar.version>
     <confluent.version>5.3.4</confluent.version>
     <glassfish.version>2.17</glassfish.version>
+    <glassfish.el.version>3.0.1-b12</glassfish.el.version>
     <parquet.version>1.10.1</parquet.version>
     <junit.jupiter.version>5.7.0-M1</junit.jupiter.version>
     <junit.vintage.version>5.7.0-M1</junit.vintage.version>
@@ -556,6 +557,12 @@
       <artifactId>jersey-container-servlet-core</artifactId>
       <version>${glassfish.version}</version>
     </dependency>
+    <dependency>
+      <groupId>org.glassfish</groupId>
+      <artifactId>javax.el</artifactId>
+      <version>${glassfish.el.version}</version>
+      <scope>provided</scope>
+    </dependency>
     <!-- Avro -->
     <dependency>