
[HUDI-91][HUDI-12] Migrate to Spark 2.4.4, switch from databricks-avro to the spark-avro library, and add support for Decimal/Date types

- Upgrade Spark to 2.4.4, Parquet to 1.10.1, and Avro to 1.8.2
- Remove spark-avro from hudi-spark-bundle. Users now need to pass --packages org.apache.spark:spark-avro_2.11:2.4.4 when running spark-shell or spark-submit (see the example invocation below)
- Replace com.databricks:spark-avro with org.apache.spark:spark-avro
- Shade Avro in hudi-hadoop-mr-bundle so that it does not conflict with Hive's Avro version
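
For example, pulling spark-avro into a shell session (spark-submit takes the same --packages flag):

  spark-shell --packages org.apache.spark:spark-avro_2.11:2.4.4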
Udit Mehrotra
2020-01-12 15:03:11 -08:00
committed by Balaji Varadarajan
parent d9675c4ec0
commit ad50008a59
11 changed files with 128 additions and 79 deletions

LICENSE

@@ -241,23 +241,13 @@ This product includes code from https://github.com/twitter/commons/blob/master/s
 limitations under the License.
 =================================================================================================
-This product includes code from Databricks spark-avro with the below license
-
-  * org.apache.hudi.AvroConversionHelper copied from classes in com/databricks/spark/avro package
-
-Copyright 2014 Databricks
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
+This product includes code from Apache Spark
+
+  * org.apache.hudi.AvroConversionHelper copied from classes in org/apache/spark/sql/avro package
+
+Copyright: 2014 and onwards The Apache Software Foundation
+Home page: http://spark.apache.org/
+License: http://www.apache.org/licenses/LICENSE-2.0
 --------------------------------------------------------------------------------

SchemaUtil.java

@@ -174,6 +174,8 @@ public class SchemaUtil {
       final DecimalMetadata decimalMetadata = parquetType.asPrimitiveType().getDecimalMetadata();
       return field.append("DECIMAL(").append(decimalMetadata.getPrecision()).append(" , ")
           .append(decimalMetadata.getScale()).append(")").toString();
+    } else if (originalType == OriginalType.DATE) {
+      return field.append("DATE").toString();
     }
     // TODO - fix the method naming here
     return parquetPrimitiveTypeName.convert(new PrimitiveType.PrimitiveTypeNameConverter<String, RuntimeException>() {

ITTestBase.java

@@ -107,7 +107,7 @@ public abstract class ITTestBase {
         .append(" --master local[2] --driver-class-path ").append(HADOOP_CONF_DIR)
         .append(
             " --conf spark.sql.hive.convertMetastoreParquet=false --deploy-mode client --driver-memory 1G --executor-memory 1G --num-executors 1 ")
-        .append(" --packages com.databricks:spark-avro_2.11:4.0.0 ").append(" -i ").append(commandFile).toString();
+        .append(" --packages org.apache.spark:spark-avro_2.11:2.4.4 ").append(" -i ").append(commandFile).toString();
   }

   static String getPrestoConsoleCommand(String commandFile) {


@@ -213,9 +213,9 @@
     <!-- Spark (Packages) -->
     <dependency>
-      <groupId>com.databricks</groupId>
+      <groupId>org.apache.spark</groupId>
       <artifactId>spark-avro_2.11</artifactId>
-      <version>4.0.0</version>
+      <scope>provided</scope>
     </dependency>

     <!-- Hadoop -->
@@ -239,8 +239,19 @@
     <!-- Hive -->
     <dependency>
       <groupId>${hive.groupid}</groupId>
-      <artifactId>hive-service</artifactId>
+      <artifactId>hive-exec</artifactId>
       <version>${hive.version}</version>
+      <classifier>${hive.exec.classifier}</classifier>
+      <exclusions>
+        <exclusion>
+          <groupId>javax.mail</groupId>
+          <artifactId>mail</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.eclipse.jetty.aggregate</groupId>
+          <artifactId>*</artifactId>
+        </exclusion>
+      </exclusions>
     </dependency>
     <dependency>
       <groupId>${hive.groupid}</groupId>

AvroConversionHelper.scala

@@ -1,11 +1,10 @@
 /*
- * This code is copied from com.databricks:spark-avro with following license
- *
- * Copyright 2014 Databricks
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
  *
  *    http://www.apache.org/licenses/LICENSE-2.0
  *
@@ -22,14 +21,15 @@ import java.nio.ByteBuffer
 import java.sql.{Date, Timestamp}
 import java.util

-import com.databricks.spark.avro.SchemaConverters
-import com.databricks.spark.avro.SchemaConverters.IncompatibleSchemaException
-import org.apache.avro.{Schema, SchemaBuilder}
+import org.apache.avro.Conversions.DecimalConversion
+import org.apache.avro.LogicalTypes.{TimestampMicros, TimestampMillis}
+import org.apache.avro.{LogicalTypes, Schema}
 import org.apache.avro.Schema.Type._
 import org.apache.avro.generic.GenericData.{Fixed, Record}
-import org.apache.avro.generic.{GenericData, GenericRecord}
+import org.apache.avro.generic.{GenericData, GenericFixed, GenericRecord}
 import org.apache.hudi.AvroConversionUtils.getNewRecordNamespace
 import org.apache.spark.sql.Row
+import org.apache.spark.sql.avro.{IncompatibleSchemaException, SchemaConverters}
 import org.apache.spark.sql.catalyst.expressions.GenericRow
 import org.apache.spark.sql.types._
@@ -37,6 +37,16 @@ import scala.collection.JavaConverters._

 object AvroConversionHelper {

+  private def createDecimal(decimal: java.math.BigDecimal, precision: Int, scale: Int): Decimal = {
+    if (precision <= Decimal.MAX_LONG_DIGITS) {
+      // Constructs a `Decimal` with an unscaled `Long` value if possible.
+      Decimal(decimal.unscaledValue().longValue(), precision, scale)
+    } else {
+      // Otherwise, resorts to an unscaled `BigInteger` instead.
+      Decimal(decimal, precision, scale)
+    }
+  }
+
   /**
     *
     * Returns a converter function to convert row in avro format to GenericRow of catalyst.
@@ -76,7 +86,50 @@ object AvroConversionHelper {
             byteBuffer.get(bytes)
             bytes
           }
+      case (d: DecimalType, FIXED) =>
+        (item: AnyRef) =>
+          if (item == null) {
+            null
+          } else {
+            val decimalConversion = new DecimalConversion
+            val bigDecimal = decimalConversion.fromFixed(item.asInstanceOf[GenericFixed], avroSchema,
+              LogicalTypes.decimal(d.precision, d.scale))
+            createDecimal(bigDecimal, d.precision, d.scale)
+          }
+      case (d: DecimalType, BYTES) =>
+        (item: AnyRef) =>
+          if (item == null) {
+            null
+          } else {
+            val decimalConversion = new DecimalConversion
+            val bigDecimal = decimalConversion.fromBytes(item.asInstanceOf[ByteBuffer], avroSchema,
+              LogicalTypes.decimal(d.precision, d.scale))
+            createDecimal(bigDecimal, d.precision, d.scale)
+          }
+      case (DateType, INT) =>
+        (item: AnyRef) =>
+          if (item == null) {
+            null
+          } else {
+            new Date(item.asInstanceOf[Long])
+          }
+      case (TimestampType, LONG) =>
+        (item: AnyRef) =>
+          if (item == null) {
+            null
+          } else {
+            avroSchema.getLogicalType match {
+              case _: TimestampMillis =>
+                new Timestamp(item.asInstanceOf[Long])
+              case _: TimestampMicros =>
+                new Timestamp(item.asInstanceOf[Long] / 1000)
+              case null =>
+                new Timestamp(item.asInstanceOf[Long])
+              case other =>
+                throw new IncompatibleSchemaException(
+                  s"Cannot convert Avro logical type ${other} to Catalyst Timestamp type.")
+            }
+          }
       case (struct: StructType, RECORD) =>
         val length = struct.fields.length
         val converters = new Array[AnyRef => AnyRef](length)
@@ -216,7 +269,8 @@ object AvroConversionHelper {
     createConverter(sourceAvroSchema, targetSqlType, List.empty[String])
   }

-  def createConverterToAvro(dataType: DataType,
+  def createConverterToAvro(avroSchema: Schema,
+                            dataType: DataType,
                             structName: String,
                             recordNamespace: String): Any => Any = {
     dataType match {
@@ -231,13 +285,22 @@
         if (item == null) null else item.asInstanceOf[Byte].intValue
       case ShortType => (item: Any) =>
         if (item == null) null else item.asInstanceOf[Short].intValue
-      case _: DecimalType => (item: Any) => if (item == null) null else item.toString
+      case dec: DecimalType => (item: Any) =>
+        Option(item).map { i =>
+          val bigDecimalValue = item.asInstanceOf[java.math.BigDecimal]
+          val decimalConversions = new DecimalConversion()
+          decimalConversions.toFixed(bigDecimalValue, avroSchema.getField(structName).schema().getTypes.get(0),
+            LogicalTypes.decimal(dec.precision, dec.scale))
+        }.orNull
       case TimestampType => (item: Any) =>
-        if (item == null) null else item.asInstanceOf[Timestamp].getTime
+        // Convert time to microseconds since spark-avro by default converts TimestampType to
+        // Avro Logical TimestampMicros
+        Option(item).map(_.asInstanceOf[Timestamp].getTime * 1000).orNull
       case DateType => (item: Any) =>
-        if (item == null) null else item.asInstanceOf[Date].getTime
+        Option(item).map(_.asInstanceOf[Date].toLocalDate.toEpochDay.toInt).orNull
       case ArrayType(elementType, _) =>
         val elementConverter = createConverterToAvro(
+          avroSchema,
           elementType,
           structName,
           getNewRecordNamespace(elementType, recordNamespace, structName))
@@ -258,6 +321,7 @@ object AvroConversionHelper {
       }
       case MapType(StringType, valueType, _) =>
         val valueConverter = createConverterToAvro(
+          avroSchema,
           valueType,
           structName,
           getNewRecordNamespace(valueType, recordNamespace, structName))
@@ -273,11 +337,10 @@ object AvroConversionHelper {
         }
       }
       case structType: StructType =>
-        val builder = SchemaBuilder.record(structName).namespace(recordNamespace)
-        val schema: Schema = SchemaConverters.convertStructToAvro(
-          structType, builder, recordNamespace)
+        val schema: Schema = SchemaConverters.toAvroType(structType, nullable = false, structName, recordNamespace)
         val fieldConverters = structType.fields.map(field =>
           createConverterToAvro(
+            avroSchema,
             field.dataType,
             field.name,
             getNewRecordNamespace(field.dataType, recordNamespace, field.name)))
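
A note on the TimestampType branches above: spark-avro writes Catalyst timestamps as Avro timestamp-micros, so the write path multiplies java.sql.Timestamp millis by 1000 and the TimestampMicros read branch divides by 1000. A minimal round-trip sketch (names are illustrative; millisecond precision only):

import java.sql.Timestamp

object TimestampRoundTripSketch extends App {
  val ts = Timestamp.valueOf("2020-01-12 15:03:11.123")
  val micros = ts.getTime * 1000L              // write path: millis -> timestamp-micros
  val readBack = new Timestamp(micros / 1000L) // read path: TimestampMicros branch
  assert(readBack == ts)                       // sub-millisecond precision would not survive
  println(s"$ts -> $micros micros -> $readBack")
}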

AvroConversionUtils.scala

@@ -17,11 +17,11 @@
 package org.apache.hudi

-import com.databricks.spark.avro.SchemaConverters
 import org.apache.avro.generic.GenericRecord
-import org.apache.avro.{Schema, SchemaBuilder}
 import org.apache.hudi.common.model.HoodieKey
+import org.apache.avro.Schema
 import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.avro.SchemaConverters
 import org.apache.spark.sql.catalyst.encoders.RowEncoder
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
@@ -30,13 +30,20 @@ import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

 object AvroConversionUtils {

   def createRdd(df: DataFrame, structName: String, recordNamespace: String): RDD[GenericRecord] = {
+    val avroSchema = convertStructTypeToAvroSchema(df.schema, structName, recordNamespace)
+    createRdd(df, avroSchema.toString, structName, recordNamespace)
+  }
+
+  def createRdd(df: DataFrame, avroSchemaAsJsonString: String, structName: String, recordNamespace: String)
+  : RDD[GenericRecord] = {
     val dataType = df.schema
     val encoder = RowEncoder.apply(dataType).resolveAndBind()
     df.queryExecution.toRdd.map(encoder.fromRow)
       .mapPartitions { records =>
         if (records.isEmpty) Iterator.empty
         else {
-          val convertor = AvroConversionHelper.createConverterToAvro(dataType, structName, recordNamespace)
+          val avroSchema = new Schema.Parser().parse(avroSchemaAsJsonString)
+          val convertor = AvroConversionHelper.createConverterToAvro(avroSchema, dataType, structName, recordNamespace)
           records.map { x => convertor(x).asInstanceOf[GenericRecord] }
         }
       }
@@ -75,11 +82,10 @@ object AvroConversionUtils {
   def convertStructTypeToAvroSchema(structType: StructType,
                                     structName: String,
                                     recordNamespace: String): Schema = {
-    val builder = SchemaBuilder.record(structName).namespace(recordNamespace)
-    SchemaConverters.convertStructToAvro(structType, builder, recordNamespace)
+    SchemaConverters.toAvroType(structType, nullable = false, structName, recordNamespace)
   }

   def convertAvroSchemaToStructType(avroSchema: Schema): StructType = {
-    SchemaConverters.toSqlType(avroSchema).dataType.asInstanceOf[StructType];
+    SchemaConverters.toSqlType(avroSchema).dataType.asInstanceOf[StructType]
   }
 }
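
A minimal usage sketch of the createRdd overloads above (struct name, namespace, and column names are illustrative; assumes hudi-spark and spark-avro on the classpath): the 3-arg overload derives the Avro schema from df.schema once on the driver, and the 4-arg overload re-parses it from JSON inside each partition.

import org.apache.hudi.AvroConversionUtils
import org.apache.spark.sql.SparkSession

object CreateRddSketch extends App {
  val spark = SparkSession.builder().appName("createRdd-sketch").master("local[1]").getOrCreate()
  import spark.implicits._

  val df = Seq((1, "row1"), (2, "row2")).toDF("id", "name")
  // Converts each Catalyst row to an Avro GenericRecord using the derived schema.
  val records = AvroConversionUtils.createRdd(df, "sketch_record", "org.example.sketch")
  records.collect().foreach(println)
  spark.stop()
}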

HoodieJavaApp.java

@@ -104,6 +104,7 @@ public class HoodieJavaApp {
     SparkSession spark = SparkSession.builder().appName("Hoodie Spark APP")
         .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer").master("local[1]").getOrCreate();
     JavaSparkContext jssc = new JavaSparkContext(spark.sparkContext());
+    spark.sparkContext().setLogLevel("WARN");
     FileSystem fs = FileSystem.get(jssc.hadoopConfiguration());

     // Generator of some records to be loaded in.


@@ -161,7 +161,7 @@
     </dependency>
     <dependency>
-      <groupId>com.databricks</groupId>
+      <groupId>org.apache.spark</groupId>
       <artifactId>spark-avro_2.11</artifactId>
       <scope>provided</scope>
     </dependency>

hudi-hadoop-mr-bundle/pom.xml

@@ -89,7 +89,7 @@
                 </relocation>
                 <relocation>
                   <pattern>org.apache.avro.</pattern>
-                  <shadedPattern>${mr.bundle.avro.shade.prefix}org.apache.avro.</shadedPattern>
+                  <shadedPattern>org.apache.hudi.org.apache.avro.</shadedPattern>
                 </relocation>
               </relocations>
               <createDependencyReducedPom>false</createDependencyReducedPom>
@@ -143,17 +143,7 @@
     <dependency>
       <groupId>org.apache.avro</groupId>
       <artifactId>avro</artifactId>
-      <scope>${mr.bundle.avro.scope}</scope>
+      <scope>compile</scope>
     </dependency>
   </dependencies>
-
-  <profiles>
-    <profile>
-      <id>mr-bundle-shade-avro</id>
-      <properties>
-        <mr.bundle.avro.scope>compile</mr.bundle.avro.scope>
-        <mr.bundle.avro.shade.prefix>org.apache.hudi.</mr.bundle.avro.shade.prefix>
-      </properties>
-    </profile>
-  </profiles>
 </project>

hudi-spark-bundle/pom.xml

@@ -94,8 +94,6 @@
                   <include>org.apache.hive:hive-service-rpc</include>
                   <include>org.apache.hive:hive-metastore</include>
                   <include>org.apache.hive:hive-jdbc</include>
-                  <include>com.databricks:spark-avro_2.11</include>
                 </includes>
               </artifactSet>
               <relocations>
@@ -139,10 +137,6 @@
                 <relocation>
                   <pattern>org.apache.commons.codec.</pattern>
                   <shadedPattern>org.apache.hudi.org.apache.commons.codec.</shadedPattern>
                 </relocation>
-                <relocation>
-                  <pattern>com.databricks.</pattern>
-                  <shadedPattern>org.apache.hudi.com.databricks.</shadedPattern>
-                </relocation>
                 <!-- TODO: Revisit GH ISSUE #533 & PR#633-->
               </relocations>
               <filters>

pom.xml

@@ -76,7 +76,7 @@
     <java.version>1.8</java.version>
     <fasterxml.version>2.6.7</fasterxml.version>
     <glassfish.version>2.17</glassfish.version>
-    <parquet.version>1.8.1</parquet.version>
+    <parquet.version>1.10.1</parquet.version>
     <junit.version>4.11</junit.version>
     <junit-dep.version>4.10</junit-dep.version>
     <mockito.version>1.10.19</mockito.version>
@@ -88,8 +88,8 @@
     <hive.version>2.3.1</hive.version>
     <hive.exec.classifier>core</hive.exec.classifier>
     <metrics.version>4.1.1</metrics.version>
-    <spark.version>2.1.0</spark.version>
-    <avro.version>1.7.7</avro.version>
+    <spark.version>2.4.4</spark.version>
+    <avro.version>1.8.2</avro.version>
     <scala.version>2.11.8</scala.version>
     <scala.libversion>2.11</scala.libversion>
     <apache-rat-plugin.version>0.12</apache-rat-plugin.version>
@@ -105,8 +105,6 @@
     <skipUTs>${skipTests}</skipUTs>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     <main.basedir>${project.basedir}</main.basedir>
-    <mr.bundle.avro.scope>provided</mr.bundle.avro.scope>
-    <mr.bundle.avro.shade.prefix></mr.bundle.avro.shade.prefix>
     <spark.bundle.hive.scope>provided</spark.bundle.hive.scope>
     <spark.bundle.hive.shade.prefix></spark.bundle.hive.shade.prefix>
     <utilities.bundle.hive.scope>provided</utilities.bundle.hive.scope>
@@ -485,9 +483,10 @@
       <!-- Spark (Packages) -->
       <dependency>
-        <groupId>com.databricks</groupId>
+        <groupId>org.apache.spark</groupId>
         <artifactId>spark-avro_2.11</artifactId>
-        <version>4.0.0</version>
+        <version>${spark.version}</version>
+        <scope>provided</scope>
       </dependency>

       <!-- Dropwizard Metrics -->
@@ -934,13 +933,6 @@
         <surefire-log4j.file>file://${project.basedir}/src/test/resources/log4j-surefire-quiet.properties</surefire-log4j.file>
       </properties>
     </profile>
-    <profile>
-      <id>aws-emr-profile</id>
-      <properties>
-        <mr.bundle.avro.scope>compile</mr.bundle.avro.scope>
-        <mr.bundle.avro.shade.prefix>org.apache.hudi.</mr.bundle.avro.shade.prefix>
-      </properties>
-    </profile>
     <profile>
       <id>javadocs</id>
       <build>