[HUDI-2161] Adding support to disable meta columns with bulk insert operation (#3247)

2021-07-19 20:43:48 -04:00
parent 2099bf41db
commit d5026e9a24
53 changed files with 1063 additions and 269 deletions
--- a/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/HoodieDatasetBulkInsertHelper.java
+++ b/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/HoodieDatasetBulkInsertHelper.java
@@ -18,12 +18,6 @@

 package org.apache.hudi;

-import static org.apache.spark.sql.functions.callUDF;
-
-import java.util.Arrays;
-import java.util.List;
-import java.util.stream.Collectors;
-import java.util.stream.Stream;
 import org.apache.hudi.common.config.TypedProperties;
 import org.apache.hudi.common.model.HoodieRecord;
 import org.apache.hudi.common.util.ReflectionUtils;
@@ -41,8 +35,17 @@ import org.apache.spark.sql.api.java.UDF1;
 import org.apache.spark.sql.functions;
 import org.apache.spark.sql.types.DataTypes;
 import org.apache.spark.sql.types.StructType;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
 import scala.collection.JavaConverters;

+import static org.apache.spark.sql.functions.callUDF;
+
 /**
 * Helper class to assist in preparing {@link Dataset<Row>}s for bulk insert with datasource implementation.
 */
@@ -112,4 +115,40 @@ public class HoodieDatasetBulkInsertHelper {

    return bulkInsertPartitionerRows.repartitionRecords(colOrderedDataset, config.getBulkInsertShuffleParallelism());
  }
+
+  /**
+   * Prepares a dataset for bulk insert when meta fields are disabled: adds the five Hoodie meta
+   * columns as empty strings and moves them to the front, ahead of the data columns.
+   *
+   * <p>Meta columns are matched by exact name, so data columns that merely contain "_hoodie_"
+   * keep their place, and the meta columns are always emitted in the order listed below.
+   *
+   * @param rows input rows to be written.
+   * @return dataset with the empty meta columns first, followed by all remaining columns.
+   */
+  public static Dataset<Row> prepareHoodieDatasetForBulkInsertWithoutMetaFields(Dataset<Row> rows) {
+    List<String> metaFieldNames = Arrays.asList(
+        HoodieRecord.COMMIT_TIME_METADATA_FIELD,
+        HoodieRecord.COMMIT_SEQNO_METADATA_FIELD,
+        HoodieRecord.RECORD_KEY_METADATA_FIELD,
+        HoodieRecord.PARTITION_PATH_METADATA_FIELD,
+        HoodieRecord.FILENAME_METADATA_FIELD);
+
+    // add (or overwrite) every meta column with an empty string value.
+    Dataset<Row> rowsWithMetaCols = rows;
+    for (String metaFieldName : metaFieldNames) {
+      rowsWithMetaCols = rowsWithMetaCols.withColumn(metaFieldName, functions.lit("").cast(DataTypes.StringType));
+    }
+
+    // reorder such that all meta columns are at the beginning followed by original columns
+    List<Column> allCols = new ArrayList<>();
+    metaFieldNames.forEach(name -> allCols.add(new Column(name)));
+    Arrays.stream(rowsWithMetaCols.schema().fields())
+        .filter(field -> !metaFieldNames.contains(field.name()))
+        .forEach(field -> allCols.add(new Column(field.name())));
+
+    return rowsWithMetaCols.select(
+        JavaConverters.collectionAsScalaIterableConverter(allCols).asScala().toSeq());
+  }
+
 }
--- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala
+++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala
@@ -32,7 +32,7 @@ import org.apache.hudi.common.util.{CommitUtils, ReflectionUtils}
 import org.apache.hudi.config.HoodieBootstrapConfig.{BOOTSTRAP_BASE_PATH_PROP, BOOTSTRAP_INDEX_CLASS_PROP}
 import org.apache.hudi.config.{HoodieInternalConfig, HoodieWriteConfig}
 import org.apache.hudi.exception.HoodieException
-import org.apache.hudi.execution.bulkinsert.BulkInsertInternalPartitionerWithRowsFactory
+import org.apache.hudi.execution.bulkinsert.{BulkInsertInternalPartitionerWithRowsFactory, NonSortPartitionerWithRows}
 import org.apache.hudi.hive.{HiveSyncConfig, HiveSyncTool}
 import org.apache.hudi.index.SparkHoodieIndex
 import org.apache.hudi.internal.DataSourceInternalWriterHelper
@@ -128,6 +128,7 @@ object HoodieSparkSqlWriter {
          .setPayloadClassName(hoodieConfig.getString(PAYLOAD_CLASS_OPT_KEY))
          .setPreCombineField(hoodieConfig.getStringOrDefault(PRECOMBINE_FIELD_OPT_KEY, null))
          .setPartitionColumns(partitionColumns)
+          .setPopulateMetaFields(parameters.getOrElse(HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.key(), HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.defaultValue()).toBoolean)
          .initTable(sparkContext.hadoopConfiguration, path.get)
        tableConfig = tableMetaClient.getTableConfig
      }
@@ -139,7 +140,8 @@ object HoodieSparkSqlWriter {
      if (hoodieConfig.getBoolean(ENABLE_ROW_WRITER_OPT_KEY) &&
        operation == WriteOperationType.BULK_INSERT) {
        val (success, commitTime: common.util.Option[String]) = bulkInsertAsRow(sqlContext, parameters, df, tblName,
-                                                                                basePath, path, instantTime)
+          basePath, path, instantTime, parameters.getOrElse(HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.key(),
+            HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.defaultValue()).toBoolean)
        return (success, commitTime, common.util.Option.empty(), common.util.Option.empty(), hoodieWriteClient.orNull, tableConfig)
      }
      // scalastyle:on
@@ -330,7 +332,8 @@ object HoodieSparkSqlWriter {
                      tblName: String,
                      basePath: Path,
                      path: Option[String],
-                      instantTime: String): (Boolean, common.util.Option[String]) = {
+                      instantTime: String,
+                      populateMetaFields: Boolean): (Boolean, common.util.Option[String]) = {
    val sparkContext = sqlContext.sparkContext
    // register classes & schemas
    val (structName, nameSpace) = AvroConversionUtils.getAvroRecordNameAndNamespace(tblName)
@@ -345,22 +348,36 @@ object HoodieSparkSqlWriter {
    }
    val params = parameters.updated(HoodieWriteConfig.AVRO_SCHEMA.key, schema.toString)
    val writeConfig = DataSourceUtils.createHoodieConfig(schema.toString, path.get, tblName, mapAsJavaMap(params))
-    val userDefinedBulkInsertPartitionerOpt = DataSourceUtils.createUserDefinedBulkInsertPartitionerWithRows(writeConfig)
-    val bulkInsertPartitionerRows : BulkInsertPartitioner[Dataset[Row]] = if (userDefinedBulkInsertPartitionerOpt.isPresent)  {
-      userDefinedBulkInsertPartitionerOpt.get
-    }
-    else {
-      BulkInsertInternalPartitionerWithRowsFactory.get(writeConfig.getBulkInsertSortMode)
+    val bulkInsertPartitionerRows : BulkInsertPartitioner[Dataset[Row]] = if (populateMetaFields) {
+      val userDefinedBulkInsertPartitionerOpt = DataSourceUtils.createUserDefinedBulkInsertPartitionerWithRows(writeConfig)
+      if (userDefinedBulkInsertPartitionerOpt.isPresent)  {
+        userDefinedBulkInsertPartitionerOpt.get
+      }
+      else {
+        BulkInsertInternalPartitionerWithRowsFactory.get(writeConfig.getBulkInsertSortMode)
+      }
+    } else {
+      // Sort modes are not yet supported when meta fields are disabled
+      new NonSortPartitionerWithRows()
    }
    val arePartitionRecordsSorted = bulkInsertPartitionerRows.arePartitionRecordsSorted();
    parameters.updated(HoodieInternalConfig.BULKINSERT_ARE_PARTITIONER_RECORDS_SORTED, arePartitionRecordsSorted.toString)
-    val isGlobalIndex = SparkHoodieIndex.isGlobalIndex(writeConfig)
-    val hoodieDF = HoodieDatasetBulkInsertHelper.prepareHoodieDatasetForBulkInsert(sqlContext, writeConfig, df, structName, nameSpace,
-      bulkInsertPartitionerRows, isGlobalIndex)
+    val isGlobalIndex = if (populateMetaFields) {
+      SparkHoodieIndex.isGlobalIndex(writeConfig)
+    } else {
+      false
+    }
+    val hoodieDF = if (populateMetaFields) {
+      HoodieDatasetBulkInsertHelper.prepareHoodieDatasetForBulkInsert(sqlContext, writeConfig, df, structName, nameSpace,
+        bulkInsertPartitionerRows, isGlobalIndex)
+    } else {
+      HoodieDatasetBulkInsertHelper.prepareHoodieDatasetForBulkInsertWithoutMetaFields(df)
+    }
    if (SPARK_VERSION.startsWith("2.")) {
      hoodieDF.write.format("org.apache.hudi.internal")
        .option(DataSourceInternalWriterHelper.INSTANT_TIME_OPT_KEY, instantTime)
        .options(params)
+        .mode(SaveMode.Append)
        .save()
    } else if (SPARK_VERSION.startsWith("3.")) {
      hoodieDF.write.format("org.apache.hudi.spark3.internal")
--- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala
+++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala
@@ -1,206 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hudi
-
-import org.apache.avro.Schema
-import org.apache.avro.generic.GenericRecord
-import org.apache.hadoop.fs.{FileSystem, Path}
-import org.apache.hudi.common.model.HoodieRecord
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{DataFrame, SparkSession}
-import org.apache.spark.sql.avro.SchemaConverters
-import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, Literal}
-import org.apache.spark.sql.sources.{And, EqualNullSafe, EqualTo, Filter, GreaterThan, GreaterThanOrEqual, In, IsNotNull, IsNull, LessThan, LessThanOrEqual, Not, Or, StringContains, StringEndsWith, StringStartsWith}
-import org.apache.spark.sql.catalyst.encoders.RowEncoder
-import org.apache.spark.sql.execution.datasources.{FileStatusCache, InMemoryFileIndex}
-import org.apache.spark.sql.types.{StringType, StructField, StructType}
-
-import scala.collection.JavaConverters._
-
-
-object HoodieSparkUtils extends SparkAdapterSupport {
-
-  def getMetaSchema: StructType = {
-    StructType(HoodieRecord.HOODIE_META_COLUMNS.asScala.map(col => {
-      StructField(col, StringType, nullable = true)
-    }))
-  }
-
-  /**
-   * This method copied from [[org.apache.spark.deploy.SparkHadoopUtil]].
-   * [[org.apache.spark.deploy.SparkHadoopUtil]] becomes private since Spark 3.0.0 and hence we had to copy it locally.
-   */
-  def isGlobPath(pattern: Path): Boolean = {
-    pattern.toString.exists("{}[]*?\\".toSet.contains)
-  }
-
-  /**
-   * This method copied from [[org.apache.spark.deploy.SparkHadoopUtil]].
-   * [[org.apache.spark.deploy.SparkHadoopUtil]] becomes private since Spark 3.0.0 and hence we had to copy it locally.
-   */
-  def globPath(fs: FileSystem, pattern: Path): Seq[Path] = {
-    Option(fs.globStatus(pattern)).map { statuses =>
-      statuses.map(_.getPath.makeQualified(fs.getUri, fs.getWorkingDirectory)).toSeq
-    }.getOrElse(Seq.empty[Path])
-  }
-
-  /**
-   * This method copied from [[org.apache.spark.deploy.SparkHadoopUtil]].
-   * [[org.apache.spark.deploy.SparkHadoopUtil]] becomes private since Spark 3.0.0 and hence we had to copy it locally.
-   */
-  def globPathIfNecessary(fs: FileSystem, pattern: Path): Seq[Path] = {
-    if (isGlobPath(pattern)) globPath(fs, pattern) else Seq(pattern)
-  }
-
-  /**
-   * Checks to see whether input path contains a glob pattern and if yes, maps it to a list of absolute paths
-   * which match the glob pattern. Otherwise, returns original path
-   *
-   * @param paths List of absolute or globbed paths
-   * @param fs    File system
-   * @return list of absolute file paths
-   */
-  def checkAndGlobPathIfNecessary(paths: Seq[String], fs: FileSystem): Seq[Path] = {
-    paths.flatMap(path => {
-      val qualified = new Path(path).makeQualified(fs.getUri, fs.getWorkingDirectory)
-      val globPaths = globPathIfNecessary(fs, qualified)
-      globPaths
-    })
-  }
-
-  def createInMemoryFileIndex(sparkSession: SparkSession, globbedPaths: Seq[Path]): InMemoryFileIndex = {
-    val fileStatusCache = FileStatusCache.getOrCreate(sparkSession)
-    new InMemoryFileIndex(sparkSession, globbedPaths, Map(), Option.empty, fileStatusCache)
-  }
-
-  def createRdd(df: DataFrame, structName: String, recordNamespace: String): RDD[GenericRecord] = {
-    val avroSchema = AvroConversionUtils.convertStructTypeToAvroSchema(df.schema, structName, recordNamespace)
-    createRdd(df, avroSchema, structName, recordNamespace)
-  }
-
-  def createRdd(df: DataFrame, avroSchema: Schema, structName: String, recordNamespace: String)
-  : RDD[GenericRecord] = {
-    // Use the Avro schema to derive the StructType which has the correct nullability information
-    val dataType = SchemaConverters.toSqlType(avroSchema).dataType.asInstanceOf[StructType]
-    val encoder = RowEncoder.apply(dataType).resolveAndBind()
-    val deserializer = sparkAdapter.createSparkRowSerDe(encoder)
-    df.queryExecution.toRdd.map(row => deserializer.deserializeRow(row))
-      .mapPartitions { records =>
-        if (records.isEmpty) Iterator.empty
-        else {
-          val convertor = AvroConversionHelper.createConverterToAvro(dataType, structName, recordNamespace)
-          records.map { x => convertor(x).asInstanceOf[GenericRecord] }
-        }
-      }
-  }
-
-  /**
-   * Convert Filters to Catalyst Expressions and joined by And. If convert success return an
-   * Non-Empty Option[Expression],or else return None.
-   */
-  def convertToCatalystExpressions(filters: Array[Filter],
-                                   tableSchema: StructType): Option[Expression] = {
-    val expressions = filters.map(convertToCatalystExpression(_, tableSchema))
-    if (expressions.forall(p => p.isDefined)) {
-      if (expressions.isEmpty) {
-        None
-      } else if (expressions.length == 1) {
-        expressions(0)
-      } else {
-        Some(expressions.map(_.get).reduce(org.apache.spark.sql.catalyst.expressions.And))
-      }
-    } else {
-      None
-    }
-  }
-
-  /**
-   * Convert Filter to Catalyst Expression. If convert success return an Non-Empty
-   * Option[Expression],or else return None.
-   */
-  def convertToCatalystExpression(filter: Filter, tableSchema: StructType): Option[Expression] = {
-    Option(
-      filter match {
-        case EqualTo(attribute, value) =>
-          org.apache.spark.sql.catalyst.expressions.EqualTo(toAttribute(attribute, tableSchema), Literal.create(value))
-        case EqualNullSafe(attribute, value) =>
-          org.apache.spark.sql.catalyst.expressions.EqualNullSafe(toAttribute(attribute, tableSchema), Literal.create(value))
-        case GreaterThan(attribute, value) =>
-          org.apache.spark.sql.catalyst.expressions.GreaterThan(toAttribute(attribute, tableSchema), Literal.create(value))
-        case GreaterThanOrEqual(attribute, value) =>
-          org.apache.spark.sql.catalyst.expressions.GreaterThanOrEqual(toAttribute(attribute, tableSchema), Literal.create(value))
-        case LessThan(attribute, value) =>
-          org.apache.spark.sql.catalyst.expressions.LessThan(toAttribute(attribute, tableSchema), Literal.create(value))
-        case LessThanOrEqual(attribute, value) =>
-          org.apache.spark.sql.catalyst.expressions.LessThanOrEqual(toAttribute(attribute, tableSchema), Literal.create(value))
-        case In(attribute, values) =>
-          val attrExp = toAttribute(attribute, tableSchema)
-          val valuesExp = values.map(v => Literal.create(v))
-          org.apache.spark.sql.catalyst.expressions.In(attrExp, valuesExp)
-        case IsNull(attribute) =>
-          org.apache.spark.sql.catalyst.expressions.IsNull(toAttribute(attribute, tableSchema))
-        case IsNotNull(attribute) =>
-          org.apache.spark.sql.catalyst.expressions.IsNotNull(toAttribute(attribute, tableSchema))
-        case And(left, right) =>
-          val leftExp = convertToCatalystExpression(left, tableSchema)
-          val rightExp = convertToCatalystExpression(right, tableSchema)
-          if (leftExp.isEmpty || rightExp.isEmpty) {
-            null
-          } else {
-            org.apache.spark.sql.catalyst.expressions.And(leftExp.get, rightExp.get)
-          }
-        case Or(left, right) =>
-          val leftExp = convertToCatalystExpression(left, tableSchema)
-          val rightExp = convertToCatalystExpression(right, tableSchema)
-          if (leftExp.isEmpty || rightExp.isEmpty) {
-            null
-          } else {
-            org.apache.spark.sql.catalyst.expressions.Or(leftExp.get, rightExp.get)
-          }
-        case Not(child) =>
-          val childExp = convertToCatalystExpression(child, tableSchema)
-          if (childExp.isEmpty) {
-            null
-          } else {
-            org.apache.spark.sql.catalyst.expressions.Not(childExp.get)
-          }
-        case StringStartsWith(attribute, value) =>
-          val leftExp = toAttribute(attribute, tableSchema)
-          val rightExp = Literal.create(s"$value%")
-          sparkAdapter.createLike(leftExp, rightExp)
-        case StringEndsWith(attribute, value) =>
-          val leftExp = toAttribute(attribute, tableSchema)
-          val rightExp = Literal.create(s"%$value")
-          sparkAdapter.createLike(leftExp, rightExp)
-        case StringContains(attribute, value) =>
-          val leftExp = toAttribute(attribute, tableSchema)
-          val rightExp = Literal.create(s"%$value%")
-          sparkAdapter.createLike(leftExp, rightExp)
-        case _=> null
-      }
-    )
-  }
-
-  private def toAttribute(columnName: String, tableSchema: StructType): AttributeReference = {
-    val field = tableSchema.find(p => p.name == columnName)
-    assert(field.isDefined, s"Cannot find column: $columnName, Table Columns are: " +
-      s"${tableSchema.fieldNames.mkString(",")}")
-    AttributeReference(columnName, field.get.dataType, field.get.nullable)()
-  }
-}
--- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/SparkAdapterSupport.scala
+++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/SparkAdapterSupport.scala
@@ -1,37 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hudi
-
-import org.apache.spark.sql.hudi.{HoodieSqlUtils, SparkAdapter}
-
-/**
- * Use the SparkAdapterSupport trait to get the SparkAdapter when we
- * need to adapt the difference between spark2 and spark3.
- */
-trait SparkAdapterSupport {
-
-  lazy val sparkAdapter: SparkAdapter = {
-    val adapterClass = if (HoodieSqlUtils.isSpark3) {
-      "org.apache.spark.sql.adapter.Spark3Adapter"
-    } else {
-      "org.apache.spark.sql.adapter.Spark2Adapter"
-    }
-    getClass.getClassLoader.loadClass(adapterClass)
-      .newInstance().asInstanceOf[SparkAdapter]
-  }
-}