[HUDI-1040] Make Hudi support Spark 3 (#2208)

* Fix flaky MOR unit test * Update Spark APIs to make it be compatible with both spark2 & spark3 * Refactor bulk insert v2 part to make Hudi be able to compile with Spark3 * Add spark3 profile to handle fasterxml & spark version * Create hudi-spark-common module & refactor hudi-spark related modules Co-authored-by: Wenning Ding <wenningd@amazon.com>
2020-12-09 15:52:23 -08:00
parent 3a91d26d62
commit fce1453fa6
79 changed files with 1040 additions and 172 deletions
--- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkRowDeserializer.java
+++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkRowDeserializer.java
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.client.utils;
+
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.catalyst.InternalRow;
+
+import java.io.Serializable;
+
+public interface SparkRowDeserializer extends Serializable {
+  Row deserializeRow(InternalRow internalRow);
+}
--- a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/AvroConversionUtils.scala
+++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/AvroConversionUtils.scala
@@ -21,41 +21,15 @@ package org.apache.hudi
 import org.apache.avro.Schema
 import org.apache.avro.generic.{GenericRecord, GenericRecordBuilder, IndexedRecord}
 import org.apache.hudi.avro.HoodieAvroUtils
-import org.apache.hudi.common.model.HoodieKey
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.avro.SchemaConverters
-import org.apache.spark.sql.catalyst.encoders.RowEncoder
 import org.apache.spark.sql.types.StructType
-import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
+import org.apache.spark.sql.{Dataset, Row, SparkSession}

 import scala.collection.JavaConverters._

 object AvroConversionUtils {

-  def createRdd(df: DataFrame, structName: String, recordNamespace: String): RDD[GenericRecord] = {
-    val avroSchema = convertStructTypeToAvroSchema(df.schema, structName, recordNamespace)
-    createRdd(df, avroSchema, structName, recordNamespace)
-  }
-
-  def createRdd(df: DataFrame, avroSchema: Schema, structName: String, recordNamespace: String)
-  : RDD[GenericRecord] = {
-    // Use the Avro schema to derive the StructType which has the correct nullability information
-    val dataType = SchemaConverters.toSqlType(avroSchema).dataType.asInstanceOf[StructType]
-    val encoder = RowEncoder.apply(dataType).resolveAndBind()
-    df.queryExecution.toRdd.map(encoder.fromRow)
-      .mapPartitions { records =>
-        if (records.isEmpty) Iterator.empty
-        else {
-          val convertor = AvroConversionHelper.createConverterToAvro(dataType, structName, recordNamespace)
-          records.map { x => convertor(x).asInstanceOf[GenericRecord] }
-        }
-      }
-  }
-
-  def createRddForDeletes(df: DataFrame, rowField: String, partitionField: String): RDD[HoodieKey] = {
-    df.rdd.map(row => new HoodieKey(row.getAs[String](rowField), row.getAs[String](partitionField)))
-  }
-
  def createDataFrame(rdd: RDD[GenericRecord], schemaStr: String, ss: SparkSession): Dataset[Row] = {
    if (rdd.isEmpty()) {
      ss.emptyDataFrame
--- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieRowCreateHandle.java
+++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieRowCreateHandle.java
@@ -72,7 +72,7 @@ public class TestHoodieRowCreateHandle extends HoodieClientTestHarness {
  }

  @Test
-  public void testRowCreateHandle() throws IOException {
+  public void testRowCreateHandle() throws Exception {
    // init config and table
    HoodieWriteConfig cfg = SparkDatasetTestUtils.getConfigBuilder(basePath).build();
    HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient);
@@ -113,7 +113,7 @@ public class TestHoodieRowCreateHandle extends HoodieClientTestHarness {
   * should be thrown.
   */
  @Test
-  public void testGlobalFailure() throws IOException {
+  public void testGlobalFailure() throws Exception {
    // init config and table
    HoodieWriteConfig cfg = SparkDatasetTestUtils.getConfigBuilder(basePath).build();
    HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient);
@@ -179,7 +179,8 @@ public class TestHoodieRowCreateHandle extends HoodieClientTestHarness {
    }
  }

-  private HoodieInternalWriteStatus writeAndGetWriteStatus(Dataset<Row> inputRows, HoodieRowCreateHandle handle) throws IOException {
+  private HoodieInternalWriteStatus writeAndGetWriteStatus(Dataset<Row> inputRows, HoodieRowCreateHandle handle)
+      throws Exception {
    List<InternalRow> internalRows = SparkDatasetTestUtils.toInternalRows(inputRows, SparkDatasetTestUtils.ENCODER);
    // issue writes
    for (InternalRow internalRow : internalRows) {
--- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/storage/TestHoodieInternalRowParquetWriter.java
+++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/storage/TestHoodieInternalRowParquetWriter.java
@@ -35,7 +35,6 @@ import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;

-import java.io.IOException;
 import java.util.List;
 import java.util.Random;
 import java.util.UUID;
@@ -64,7 +63,7 @@ public class TestHoodieInternalRowParquetWriter extends HoodieClientTestHarness
  }

  @Test
-  public void endToEndTest() throws IOException {
+  public void endToEndTest() throws Exception {
    HoodieWriteConfig cfg = SparkDatasetTestUtils.getConfigBuilder(basePath).build();
    for (int i = 0; i < 5; i++) {
      // init write support and parquet config
--- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieMergeOnReadTestUtils.java
+++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieMergeOnReadTestUtils.java
@@ -43,7 +43,6 @@ import org.apache.hadoop.mapred.RecordReader;

 import java.io.IOException;
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.List;
 import java.util.stream.Collectors;

@@ -84,36 +83,32 @@ public class HoodieMergeOnReadTestUtils {
        .map(f -> new Schema.Field(f.name(), f.schema(), f.doc(), f.defaultVal()))
        .collect(Collectors.toList()));

-    return inputPaths.stream().map(path -> {
-      setInputPath(jobConf, path);
-      List<GenericRecord> records = new ArrayList<>();
-      try {
-        List<InputSplit> splits = Arrays.asList(inputFormat.getSplits(jobConf, 1));
-        for (InputSplit split : splits) {
-          RecordReader recordReader = inputFormat.getRecordReader(split, jobConf, null);
-          Object key = recordReader.createKey();
-          ArrayWritable writable = (ArrayWritable) recordReader.createValue();
-          while (recordReader.next(key, writable)) {
-            GenericRecordBuilder newRecord = new GenericRecordBuilder(projectedSchema);
-            // writable returns an array with [field1, field2, _hoodie_commit_time,
-            // _hoodie_commit_seqno]
-            Writable[] values = writable.get();
-            schema.getFields().stream()
-                .filter(f -> !projectCols || projectedColumns.contains(f.name()))
-                .map(f -> Pair.of(projectedSchema.getFields().stream()
-                    .filter(p -> f.name().equals(p.name())).findFirst().get(), f))
-                .forEach(fieldsPair -> newRecord.set(fieldsPair.getKey(), values[fieldsPair.getValue().pos()]));
-            records.add(newRecord.build());
-          }
+    List<GenericRecord> records = new ArrayList<>();
+    try {
+      FileInputFormat.setInputPaths(jobConf, String.join(",", inputPaths));
+      InputSplit[] splits = inputFormat.getSplits(jobConf, inputPaths.size());
+
+      for (InputSplit split : splits) {
+        RecordReader recordReader = inputFormat.getRecordReader(split, jobConf, null);
+        Object key = recordReader.createKey();
+        ArrayWritable writable = (ArrayWritable) recordReader.createValue();
+        while (recordReader.next(key, writable)) {
+          GenericRecordBuilder newRecord = new GenericRecordBuilder(projectedSchema);
+          // writable returns an array with [field1, field2, _hoodie_commit_time,
+          // _hoodie_commit_seqno]
+          Writable[] values = writable.get();
+          schema.getFields().stream()
+              .filter(f -> !projectCols || projectedColumns.contains(f.name()))
+              .map(f -> Pair.of(projectedSchema.getFields().stream()
+                  .filter(p -> f.name().equals(p.name())).findFirst().get(), f))
+              .forEach(fieldsPair -> newRecord.set(fieldsPair.getKey(), values[fieldsPair.getValue().pos()]));
+          records.add(newRecord.build());
        }
-      } catch (IOException ie) {
-        ie.printStackTrace();
      }
-      return records;
-    }).reduce((a, b) -> {
-      a.addAll(b);
-      return a;
-    }).orElse(new ArrayList<>());
+    } catch (IOException ie) {
+      ie.printStackTrace();
+    }
+    return records;
  }

  private static void setPropsForInputFormat(FileInputFormat inputFormat, JobConf jobConf, Schema schema, String hiveColumnTypes, boolean projectCols, List<String> projectedCols) {
@@ -156,10 +151,4 @@ public class HoodieMergeOnReadTestUtils {
    configurable.setConf(conf);
    jobConf.addResource(conf);
  }
-
-  private static void setInputPath(JobConf jobConf, String inputPath) {
-    jobConf.set("mapreduce.input.fileinputformat.inputdir", inputPath);
-    jobConf.set("mapreduce.input.fileinputformat.inputdir", inputPath);
-    jobConf.set("map.input.dir", inputPath);
-  }
 }
--- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/SparkDatasetTestUtils.java
+++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/SparkDatasetTestUtils.java
@@ -26,6 +26,7 @@ import org.apache.hudi.config.HoodieStorageConfig;
 import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.hudi.index.HoodieIndex;

+import org.apache.spark.package$;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SQLContext;
@@ -41,6 +42,8 @@ import org.apache.spark.sql.types.Metadata;
 import org.apache.spark.sql.types.StructField;
 import org.apache.spark.sql.types.StructType;

+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.UUID;
@@ -139,11 +142,11 @@ public class SparkDatasetTestUtils {
   * @param rows Dataset<Row>s to be converted
   * @return the List of {@link InternalRow}s thus converted.
   */
-  public static List<InternalRow> toInternalRows(Dataset<Row> rows, ExpressionEncoder encoder) {
+  public static List<InternalRow> toInternalRows(Dataset<Row> rows, ExpressionEncoder encoder) throws Exception {
    List<InternalRow> toReturn = new ArrayList<>();
    List<Row> rowList = rows.collectAsList();
    for (Row row : rowList) {
-      toReturn.add(encoder.toRow(row).copy());
+      toReturn.add(serializeRow(encoder, row).copy());
    }
    return toReturn;
  }
@@ -173,4 +176,17 @@ public class SparkDatasetTestUtils {
        .withBulkInsertParallelism(2);
  }

+  private static InternalRow serializeRow(ExpressionEncoder encoder, Row row)
+      throws InvocationTargetException, IllegalAccessException, NoSuchMethodException, ClassNotFoundException {
+    // TODO remove reflection if Spark 2.x support is dropped
+    if (package$.MODULE$.SPARK_VERSION().startsWith("2.")) {
+      Method spark2method = encoder.getClass().getMethod("toRow", Object.class);
+      return (InternalRow) spark2method.invoke(encoder, row);
+    } else {
+      Class<?> serializerClass = Class.forName("org.apache.spark.sql.catalyst.encoders.ExpressionEncoder$Serializer");
+      Object serializer = encoder.getClass().getMethod("createSerializer").invoke(encoder);
+      Method aboveSpark2method = serializerClass.getMethod("apply", Object.class);
+      return (InternalRow) aboveSpark2method.invoke(serializer, row);
+    }
+  }
 }