[HUDI-3549] Removing dependency on "spark-avro" (#4955)

Hudi will be taking on a promise for its bundles to stay compatible within Spark minor versions (for example 2.4, 3.1, 3.2): a single build of Hudi (for example "hudi-spark3.2-bundle") will be compatible with ALL patch versions in that minor branch (in this case 3.2.0, 3.2.1, etc.)

To achieve that we'll have to remove (and ban) "spark-avro" as a dependency, which on a few occasions was the root cause of incompatibility between consecutive Spark patch versions (most recently between 3.2.0 and 3.2.1, due to this PR).

Instead of bundling "spark-avro" as a dependency, we will be copying over some of the classes Hudi depends on and maintaining them within the Hudi code-base to make sure we're able to provide the aforementioned guarantee. To work around compatibility issues as they arise, we will be applying local patches to guarantee compatibility of Hudi bundles within the Spark minor version branches.
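A local compatibility patch of that sort typically gates behavior on the runtime Spark patch version. Below is a minimal sketch (class and method names are hypothetical, not part of this PR); in practice the version string would come from `org.apache.spark.package$.MODULE$.SPARK_VERSION()`:

```java
// Hypothetical sketch of version-gated behavior for a local compatibility
// patch. Assumes "major.minor.patch"-style version strings.
public class SparkVersionGate {
    // Parse "3.2.1" (or "3.2.1-SNAPSHOT") into {major, minor, patch}.
    static int[] parse(String version) {
        String[] parts = version.split("\\.", 3);
        int[] nums = new int[3];
        for (int i = 0; i < 3; i++) {
            // Keep only the leading digits, e.g. "1-SNAPSHOT" -> 1.
            nums[i] = Integer.parseInt(parts[i].replaceAll("[^0-9].*$", ""));
        }
        return nums;
    }

    // True when the runtime version is at least the given threshold, so a
    // shim can pick the code path matching the running patch version.
    public static boolean atLeast(String runtime, String threshold) {
        int[] a = parse(runtime), b = parse(threshold);
        for (int i = 0; i < 3; i++) {
            if (a[i] != b[i]) {
                return a[i] > b[i];
            }
        }
        return true;
    }
}
```

A shim could, for instance, call a constructor signature introduced in 3.2.1 only when `SparkVersionGate.atLeast(runtimeVersion, "3.2.1")` holds, and fall back to the 3.2.0 signature otherwise.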

The following mapping of Hudi modules to Spark minor branches is currently maintained:

"hudi-spark3" -> 3.2.x
"hudi-spark3.1.x" -> 3.1.x
"hudi-spark2" -> 2.4.x
The following class hierarchies (borrowed from "spark-avro") are maintained within these Spark-specific modules to guarantee compatibility with the respective minor version branches:

AvroSerializer
AvroDeserializer
AvroUtils
Each of these classes has been copied from Spark 3.2.1 (for the 3.2.x branch), 3.1.2 (for the 3.1.x branch), and 2.4.4 (for the 2.4.x branch) into its respective module.

The SchemaConverters class, in turn, is shared across all of those modules given its relative stability (there are only cosmetic changes from 2.4.4 to 3.2.1).
All of the aforementioned classes have their visibility limited to their respective packages (org.apache.spark.sql.avro, org.apache.spark.sql) to make sure the broader code-base does not become dependent on them and instead relies on facades abstracting them.
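The facade indirection can be sketched as follows (the interface and class names here are illustrative, not Hudi's actual adapter API):

```java
// Illustrative facade isolating the broader code-base from the copied
// spark-avro classes. All names here are hypothetical, not Hudi's real API.
public class FacadeDemo {
    // Stand-in facade for serializing a Catalyst row into an Avro record.
    interface AvroSerdeFacade {
        Object serialize(Object row);
    }

    // One such implementation per Spark-specific module would wrap the
    // AvroSerializer class copied from that Spark branch.
    static class Spark32AvroSerdeFacade implements AvroSerdeFacade {
        @Override
        public Object serialize(Object row) {
            return "serialized-by-3.2:" + row;
        }
    }

    // Callers depend only on the facade, never on the copied classes directly,
    // so swapping the Spark-specific module does not ripple through the code-base.
    public static Object writeRecord(AvroSerdeFacade facade, Object row) {
        return facade.serialize(row);
    }
}
```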

Additionally, given that Hudi plans on supporting all patch versions of Spark within the aforementioned minor version branches, additional build steps were added to validate that Hudi compiles properly against those versions. Testing, however, is performed against the most recent patch versions of Spark with the help of Azure CI.

Brief change log:
- Removing spark-avro bundling from Hudi by default
- Scaffolded Spark 3.2.x hierarchy
- Bootstrapped Spark 3.1.x Avro serializer/deserializer hierarchy
- Bootstrapped Spark 2.4.x Avro serializer/deserializer hierarchy
- Moved ExpressionCodeGen, ExpressionPayload into hudi-spark module
- Fixed AvroDeserializer to stay compatible w/ both Spark 3.2.1 and 3.2.0
- Modified bot.yml to build the full matrix of supported Spark versions
- Removed "spark-avro" dependency from all modules
- Fixed relocation of spark-avro classes in bundles to assist in running integ-tests.
Authored by Alexey Kudinkin on 2022-03-29 11:44:47 -07:00, committed by GitHub
parent 0802510ca9
commit e5a2baeed0
54 changed files with 2665 additions and 278 deletions


@@ -0,0 +1,192 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.hudi.command.payload
import org.apache.avro.generic.{GenericRecord, IndexedRecord}
import org.apache.hudi.sql.IExpressionEvaluator
import org.apache.spark.executor.InputMetrics
import org.apache.spark.internal.Logging
import org.apache.spark.sql.avro.AvroSerializer
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.Block.BlockHelper
import org.apache.spark.sql.catalyst.expressions.codegen._
import org.apache.spark.sql.catalyst.expressions.{BoundReference, Expression, GenericInternalRow, LeafExpression, UnsafeArrayData, UnsafeMapData, UnsafeRow}
import org.apache.spark.sql.catalyst.util.{ArrayData, MapData}
import org.apache.spark.sql.hudi.command.payload.ExpressionCodeGen.RECORD_NAME
import org.apache.spark.sql.types.{DataType, Decimal}
import org.apache.spark.unsafe.Platform
import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String}
import org.apache.spark.util.ParentClassLoader
import org.apache.spark.{TaskContext, TaskKilledException}
import org.codehaus.commons.compiler.CompileException
import org.codehaus.janino.{ClassBodyEvaluator, InternalCompilerException}
import java.util.UUID
/**
* Do CodeGen for expressions based on an IndexedRecord.
* The main difference from Spark's CodeGen for expressions is that
* the expression's input is an IndexedRecord rather than a Row.
*/
object ExpressionCodeGen extends Logging {
val RECORD_NAME = "record"
/**
* CodeGen for expressions.
* @param exprs The expression list to CodeGen.
* @return An IExpressionEvaluator generated by CodeGen which takes an IndexedRecord
* as input and returns an Array of results, one per expression.
*/
def doCodeGen(exprs: Seq[Expression], serializer: AvroSerializer): IExpressionEvaluator = {
val ctx = new CodegenContext()
// Set the input_row to null as we do not use row as the input object but Record.
ctx.INPUT_ROW = null
val replacedExprs = exprs.map(replaceBoundReference)
val resultVars = replacedExprs.map(_.genCode(ctx))
val className = s"ExpressionPayloadEvaluator_${UUID.randomUUID().toString.replace("-", "_")}"
val codeBody =
s"""
|private Object[] references;
|private String code;
|private AvroSerializer serializer;
|
|public $className(Object references, String code, AvroSerializer serializer) {
| this.references = (Object[])references;
| this.code = code;
| this.serializer = serializer;
|}
|
|public GenericRecord eval(IndexedRecord $RECORD_NAME) {
| ${resultVars.map(_.code).mkString("\n")}
| Object[] results = new Object[${resultVars.length}];
| ${
(for (i <- resultVars.indices) yield {
s"""
|if (${resultVars(i).isNull}) {
| results[$i] = null;
|} else {
| results[$i] = ${resultVars(i).value.code};
|}
""".stripMargin
}).mkString("\n")
}
| InternalRow row = new GenericInternalRow(results);
| return (GenericRecord) serializer.serialize(row);
| }
|
|public String getCode() {
| return code;
|}
""".stripMargin
val evaluator = new ClassBodyEvaluator()
val parentClassLoader = new ParentClassLoader(
Option(Thread.currentThread().getContextClassLoader).getOrElse(getClass.getClassLoader))
evaluator.setParentClassLoader(parentClassLoader)
// Cannot be under package codegen, or fail with java.lang.InstantiationException
evaluator.setClassName(s"org.apache.hudi.sql.payload.$className")
evaluator.setDefaultImports(
classOf[Platform].getName,
classOf[InternalRow].getName,
classOf[UnsafeRow].getName,
classOf[UTF8String].getName,
classOf[Decimal].getName,
classOf[CalendarInterval].getName,
classOf[ArrayData].getName,
classOf[UnsafeArrayData].getName,
classOf[MapData].getName,
classOf[UnsafeMapData].getName,
classOf[Expression].getName,
classOf[TaskContext].getName,
classOf[TaskKilledException].getName,
classOf[InputMetrics].getName,
classOf[IndexedRecord].getName,
classOf[AvroSerializer].getName,
classOf[GenericRecord].getName,
classOf[GenericInternalRow].getName
)
evaluator.setImplementedInterfaces(Array(classOf[IExpressionEvaluator]))
try {
evaluator.cook(codeBody)
} catch {
case e: InternalCompilerException =>
val msg = s"failed to compile: $e"
logError(msg, e)
throw new InternalCompilerException(msg, e)
case e: CompileException =>
val msg = s"failed to compile: $e"
logError(msg, e)
throw new CompileException(msg, e.getLocation)
}
val referenceArray = ctx.references.toArray.map(_.asInstanceOf[Object])
val expressionSql = exprs.map(_.sql).mkString(" ")
evaluator.getClazz.getConstructor(classOf[Object], classOf[String], classOf[AvroSerializer])
.newInstance(referenceArray, s"Expressions is: [$expressionSql]\nCodeBody is: {\n$codeBody\n}", serializer)
.asInstanceOf[IExpressionEvaluator]
}
/**
* Replace each BoundReference with a RecordBoundReference, which overrides
* the doGenCode method.
*/
private def replaceBoundReference(expression: Expression): Expression = {
expression transformDown {
case BoundReference(ordinal, dataType, nullable) =>
RecordBoundReference(ordinal, dataType, nullable)
case other =>
other
}
}
}
case class RecordBoundReference(ordinal: Int, dataType: DataType, nullable: Boolean)
extends LeafExpression {
/**
* Do the CodeGen for RecordBoundReference.
* Use an "IndexedRecord" as the input object rather than a "Row".
*/
override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
val javaType = JavaCode.javaType(dataType)
val boxType = JavaCode.boxedType(dataType)
val value = s"($boxType)$RECORD_NAME.get($ordinal)"
if (nullable) {
ev.copy(code =
code"""
| boolean ${ev.isNull} = $RECORD_NAME.get($ordinal) == null;
| $javaType ${ev.value} = ${ev.isNull} ?
| ${CodeGenerator.defaultValue(dataType)} : ($value);
"""
)
} else {
ev.copy(code = code"$javaType ${ev.value} = $value;", isNull = FalseLiteral)
}
}
override def eval(input: InternalRow): Any = {
throw new IllegalArgumentException(s"Should not call eval method for " +
s"${getClass.getCanonicalName}")
}
}


@@ -0,0 +1,322 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.hudi.command.payload
import com.google.common.cache.CacheBuilder
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericData, GenericRecord, IndexedRecord}
import org.apache.hudi.AvroConversionUtils
import org.apache.hudi.DataSourceWriteOptions._
import org.apache.hudi.avro.HoodieAvroUtils
import org.apache.hudi.avro.HoodieAvroUtils.bytesToAvro
import org.apache.hudi.common.model.{DefaultHoodieRecordPayload, HoodiePayloadProps, HoodieRecord}
import org.apache.hudi.common.util.{ValidationUtils, Option => HOption}
import org.apache.hudi.config.HoodieWriteConfig
import org.apache.hudi.io.HoodieWriteHandle
import org.apache.hudi.sql.IExpressionEvaluator
import org.apache.spark.sql.avro.{AvroSerializer, SchemaConverters}
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.hudi.SerDeUtils
import org.apache.spark.sql.hudi.command.payload.ExpressionPayload.getEvaluator
import org.apache.spark.sql.types.{StructField, StructType}
import java.util.concurrent.Callable
import java.util.{Base64, Properties}
import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer
/**
* A HoodieRecordPayload for MergeIntoHoodieTableCommand.
* It executes the condition and assignment expressions in the matched
* and not-matched actions and computes the final record to write.
*
* If no condition matches the record, ExpressionPayload will return
* HoodieWriteHandle.IGNORE_RECORD, and the write handles will ignore the record.
*/
class ExpressionPayload(record: GenericRecord,
orderingVal: Comparable[_])
extends DefaultHoodieRecordPayload(record, orderingVal) {
def this(recordOpt: HOption[GenericRecord]) {
this(recordOpt.orElse(null), 0)
}
/**
* The schema of this table.
*/
private var writeSchema: Schema = _
override def combineAndGetUpdateValue(currentValue: IndexedRecord,
schema: Schema): HOption[IndexedRecord] = {
throw new IllegalStateException(s"Should not call this method for ${getClass.getCanonicalName}")
}
override def getInsertValue(schema: Schema): HOption[IndexedRecord] = {
throw new IllegalStateException(s"Should not call this method for ${getClass.getCanonicalName}")
}
override def combineAndGetUpdateValue(targetRecord: IndexedRecord,
schema: Schema, properties: Properties): HOption[IndexedRecord] = {
val sourceRecord = bytesToAvro(recordBytes, schema)
val joinSqlRecord = new SqlTypedRecord(joinRecord(sourceRecord, targetRecord))
processMatchedRecord(joinSqlRecord, Some(targetRecord), properties)
}
/**
* Process the matched record. First, test whether the record matches any of the
* update conditions; if so, return the result of the update assignments. Second,
* test whether the record matches the delete condition; if so, return a delete
* record. Finally, if no condition matched, return a
* {@link HoodieWriteHandle.IGNORE_RECORD} which will be ignored by HoodieWriteHandle.
* @param inputRecord The input record to process.
* @param targetRecord The original existing record.
* @param properties The properties.
* @return The result of the record to update or delete.
*/
private def processMatchedRecord(inputRecord: SqlTypedRecord,
targetRecord: Option[IndexedRecord], properties: Properties): HOption[IndexedRecord] = {
// Process update
val updateConditionAndAssignmentsText =
properties.get(ExpressionPayload.PAYLOAD_UPDATE_CONDITION_AND_ASSIGNMENTS)
assert(updateConditionAndAssignmentsText != null,
s"${ExpressionPayload.PAYLOAD_UPDATE_CONDITION_AND_ASSIGNMENTS} has not been set")
var resultRecordOpt: HOption[IndexedRecord] = null
// Get the Evaluator for each condition and update assignments.
initWriteSchemaIfNeed(properties)
val updateConditionAndAssignments = getEvaluator(updateConditionAndAssignmentsText.toString, writeSchema)
for ((conditionEvaluator, assignmentEvaluator) <- updateConditionAndAssignments
if resultRecordOpt == null) {
val conditionVal = evaluate(conditionEvaluator, inputRecord).get(0).asInstanceOf[Boolean]
// If the update condition matched then execute assignment expression
// to compute final record to update. We will return the first matched record.
if (conditionVal) {
val resultRecord = evaluate(assignmentEvaluator, inputRecord)
if (targetRecord.isEmpty || needUpdatingPersistedRecord(targetRecord.get, resultRecord, properties)) {
resultRecordOpt = HOption.of(resultRecord)
} else {
// if the PreCombine field value of targetRecord is greater
// than the new incoming record, just keep the old record value.
resultRecordOpt = HOption.of(targetRecord.get)
}
}
}
if (resultRecordOpt == null) {
// Process delete
val deleteConditionText = properties.get(ExpressionPayload.PAYLOAD_DELETE_CONDITION)
if (deleteConditionText != null) {
val deleteCondition = getEvaluator(deleteConditionText.toString, writeSchema).head._1
val deleteConditionVal = evaluate(deleteCondition, inputRecord).get(0).asInstanceOf[Boolean]
if (deleteConditionVal) {
resultRecordOpt = HOption.empty()
}
}
}
if (resultRecordOpt == null) {
// If no condition matched, just filter out this record.
// Here we return an IGNORE_RECORD, which HoodieMergeHandle will not handle.
HOption.of(HoodieWriteHandle.IGNORE_RECORD)
} else {
resultRecordOpt
}
}
/**
* Process the not-matched record. Test whether the record matches any of the
* insert conditions; if so, return the result of the insert assignments. Otherwise
* return a {@link HoodieWriteHandle.IGNORE_RECORD} which will be ignored by HoodieWriteHandle.
*
* @param inputRecord The input record to process.
* @param properties The properties.
* @return The result of the record to insert.
*/
private def processNotMatchedRecord(inputRecord: SqlTypedRecord, properties: Properties): HOption[IndexedRecord] = {
val insertConditionAndAssignmentsText =
properties.get(ExpressionPayload.PAYLOAD_INSERT_CONDITION_AND_ASSIGNMENTS)
// Get the evaluator for each condition and insert assignment.
initWriteSchemaIfNeed(properties)
val insertConditionAndAssignments =
ExpressionPayload.getEvaluator(insertConditionAndAssignmentsText.toString, writeSchema)
var resultRecordOpt: HOption[IndexedRecord] = null
for ((conditionEvaluator, assignmentEvaluator) <- insertConditionAndAssignments
if resultRecordOpt == null) {
val conditionVal = evaluate(conditionEvaluator, inputRecord).get(0).asInstanceOf[Boolean]
// If matched the insert condition then execute the assignment expressions to compute the
// result record. We will return the first matched record.
if (conditionVal) {
val resultRecord = evaluate(assignmentEvaluator, inputRecord)
resultRecordOpt = HOption.of(resultRecord)
}
}
if (resultRecordOpt != null) {
resultRecordOpt
} else {
// If no condition matched, just filter out this record.
// Here we return an IGNORE_RECORD, which HoodieCreateHandle will not handle.
HOption.of(HoodieWriteHandle.IGNORE_RECORD)
}
}
override def getInsertValue(schema: Schema, properties: Properties): HOption[IndexedRecord] = {
val incomingRecord = bytesToAvro(recordBytes, schema)
if (isDeleteRecord(incomingRecord)) {
HOption.empty[IndexedRecord]()
} else {
val sqlTypedRecord = new SqlTypedRecord(incomingRecord)
if (isMORTable(properties)) {
// For a MOR table, both matched and not-matched records step into the getInsertValue() method.
// We call processMatchedRecord() if the current record is an update record; otherwise we call
// processNotMatchedRecord() to process the not-matched record.
val isUpdateRecord = properties.getProperty(HoodiePayloadProps.PAYLOAD_IS_UPDATE_RECORD_FOR_MOR, "false").toBoolean
if (isUpdateRecord) {
processMatchedRecord(sqlTypedRecord, Option.empty, properties)
} else {
processNotMatchedRecord(sqlTypedRecord, properties)
}
} else {
// For a COW table, only not-matched records step into the getInsertValue() method, so just
// call processNotMatchedRecord() here.
processNotMatchedRecord(sqlTypedRecord, properties)
}
}
}
private def isMORTable(properties: Properties): Boolean = {
properties.getProperty(TABLE_TYPE.key, null) == MOR_TABLE_TYPE_OPT_VAL
}
private def convertToRecord(values: Array[AnyRef], schema: Schema): IndexedRecord = {
assert(values.length == schema.getFields.size())
val writeRecord = new GenericData.Record(schema)
for (i <- values.indices) {
writeRecord.put(i, values(i))
}
writeRecord
}
/**
* Init the write schema if it has not been initialized yet.
*/
private def initWriteSchemaIfNeed(properties: Properties): Unit = {
if (writeSchema == null) {
ValidationUtils.checkArgument(properties.containsKey(HoodieWriteConfig.WRITE_SCHEMA.key),
s"Missing ${HoodieWriteConfig.WRITE_SCHEMA.key}")
writeSchema = new Schema.Parser().parse(properties.getProperty(HoodieWriteConfig.WRITE_SCHEMA.key))
}
}
/**
* Join the source record with the target record.
*
* @return The joined record.
*/
private def joinRecord(sourceRecord: IndexedRecord, targetRecord: IndexedRecord): IndexedRecord = {
val leftSchema = sourceRecord.getSchema
// the targetRecord is loaded from disk and contains the meta fields, so we remove them here
val rightSchema = HoodieAvroUtils.removeMetadataFields(targetRecord.getSchema)
val joinSchema = mergeSchema(leftSchema, rightSchema)
val values = new ArrayBuffer[AnyRef]()
for (i <- 0 until joinSchema.getFields.size()) {
val value = if (i < leftSchema.getFields.size()) {
sourceRecord.get(i)
} else { // skip meta field
targetRecord.get(i - leftSchema.getFields.size() + HoodieRecord.HOODIE_META_COLUMNS.size())
}
values += value
}
convertToRecord(values.toArray, joinSchema)
}
private def mergeSchema(a: Schema, b: Schema): Schema = {
val mergedFields =
a.getFields.asScala.map(field =>
new Schema.Field("a_" + field.name,
field.schema, field.doc, field.defaultVal, field.order)) ++
b.getFields.asScala.map(field =>
new Schema.Field("b_" + field.name,
field.schema, field.doc, field.defaultVal, field.order))
Schema.createRecord(a.getName, a.getDoc, a.getNamespace, a.isError, mergedFields.asJava)
}
private def evaluate(evaluator: IExpressionEvaluator, sqlTypedRecord: SqlTypedRecord): GenericRecord = {
try evaluator.eval(sqlTypedRecord) catch {
case e: Throwable =>
throw new RuntimeException(s"Error in execute expression: ${e.getMessage}.\n${evaluator.getCode}", e)
}
}
}
object ExpressionPayload {
/**
* Property for passing the merge-into delete clause's condition expression.
*/
val PAYLOAD_DELETE_CONDITION = "hoodie.payload.delete.condition"
/**
* Property for passing the merge-into update clauses' conditions and assignments.
*/
val PAYLOAD_UPDATE_CONDITION_AND_ASSIGNMENTS = "hoodie.payload.update.condition.assignments"
/**
* Property for passing the merge-into insert clauses' conditions and assignments.
*/
val PAYLOAD_INSERT_CONDITION_AND_ASSIGNMENTS = "hoodie.payload.insert.condition.assignments"
/**
* A cache from the serialized condition-assignments string to the compiled classes
* produced by CodeGen. The Map[IExpressionEvaluator, IExpressionEvaluator] maps each
* condition expression's evaluator to its assignments expressions' evaluator.
*/
private val cache = CacheBuilder.newBuilder()
.maximumSize(1024)
.build[String, Map[IExpressionEvaluator, IExpressionEvaluator]]()
/**
* Do the CodeGen for each condition and assignment expression. The result is cached
* to reduce the compile time for each method call.
*/
def getEvaluator(
serializedConditionAssignments: String, writeSchema: Schema): Map[IExpressionEvaluator, IExpressionEvaluator] = {
cache.get(serializedConditionAssignments,
new Callable[Map[IExpressionEvaluator, IExpressionEvaluator]] {
override def call(): Map[IExpressionEvaluator, IExpressionEvaluator] = {
val serializedBytes = Base64.getDecoder.decode(serializedConditionAssignments)
val conditionAssignments = SerDeUtils.toObject(serializedBytes)
.asInstanceOf[Map[Expression, Seq[Expression]]]
// Do the CodeGen for condition expression and assignment expression
conditionAssignments.map {
case (condition, assignments) =>
val conditionType = StructType(Seq(StructField("_col0", condition.dataType, nullable = true)))
val conditionSerializer = new AvroSerializer(conditionType,
SchemaConverters.toAvroType(conditionType), false)
val conditionEvaluator = ExpressionCodeGen.doCodeGen(Seq(condition), conditionSerializer)
val assignSqlType = AvroConversionUtils.convertAvroSchemaToStructType(writeSchema)
val assignSerializer = new AvroSerializer(assignSqlType, writeSchema, false)
val assignmentEvaluator = ExpressionCodeGen.doCodeGen(assignments, assignSerializer)
conditionEvaluator -> assignmentEvaluator
}
}
})
}
}


@@ -0,0 +1,67 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.index.bucket;
import org.apache.hudi.common.model.HoodieAvroRecord;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.keygen.KeyGenUtils;
import org.apache.hudi.testutils.KeyGeneratorTestUtilities;
import org.apache.avro.generic.GenericRecord;
import org.junit.jupiter.api.Test;
import java.util.Arrays;
import java.util.List;
public class TestBucketIdentifier {
@Test
public void testBucketFileId() {
for (int i = 0; i < 1000; i++) {
String bucketId = BucketIdentifier.bucketIdStr(i);
String fileId = BucketIdentifier.newBucketFileIdPrefix(bucketId);
assert BucketIdentifier.bucketIdFromFileId(fileId) == i;
}
}
@Test
public void testBucketIdWithSimpleRecordKey() {
String recordKeyField = "_row_key";
String indexKeyField = "_row_key";
GenericRecord record = KeyGeneratorTestUtilities.getRecord();
HoodieRecord hoodieRecord = new HoodieAvroRecord(
new HoodieKey(KeyGenUtils.getRecordKey(record, recordKeyField, false), ""), null);
int bucketId = BucketIdentifier.getBucketId(hoodieRecord, indexKeyField, 8);
assert bucketId == BucketIdentifier.getBucketId(
Arrays.asList(record.get(indexKeyField).toString()), 8);
}
@Test
public void testBucketIdWithComplexRecordKey() {
List<String> recordKeyField = Arrays.asList("_row_key","ts_ms");
String indexKeyField = "_row_key";
GenericRecord record = KeyGeneratorTestUtilities.getRecord();
HoodieRecord hoodieRecord = new HoodieAvroRecord(
new HoodieKey(KeyGenUtils.getRecordKey(record, recordKeyField, false), ""), null);
int bucketId = BucketIdentifier.getBucketId(hoodieRecord, indexKeyField, 8);
assert bucketId == BucketIdentifier.getBucketId(
Arrays.asList(record.get(indexKeyField).toString()), 8);
}
}


@@ -0,0 +1,129 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.testutils;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.AvroConversionUtils;
import org.apache.spark.package$;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.analysis.SimpleAnalyzer$;
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder;
import org.apache.spark.sql.catalyst.encoders.RowEncoder;
import org.apache.spark.sql.catalyst.expressions.Attribute;
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema;
import org.apache.spark.sql.types.StructType;
import scala.Function1;
import scala.collection.JavaConversions;
import scala.collection.JavaConverters;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.List;
import java.util.stream.Collectors;
public class KeyGeneratorTestUtilities {
public static final String NESTED_COL_SCHEMA = "{\"type\":\"record\", \"name\":\"nested_col\",\"fields\": ["
+ "{\"name\": \"prop1\",\"type\": \"string\"},{\"name\": \"prop2\", \"type\": \"long\"}]}";
public static final String EXAMPLE_SCHEMA = "{\"type\": \"record\",\"name\": \"testrec\",\"fields\": [ "
+ "{\"name\": \"timestamp\",\"type\": \"long\"},{\"name\": \"_row_key\", \"type\": \"string\"},"
+ "{\"name\": \"ts_ms\", \"type\": \"string\"},"
+ "{\"name\": \"pii_col\", \"type\": \"string\"},"
+ "{\"name\": \"nested_col\",\"type\": [\"null\", " + NESTED_COL_SCHEMA + "]}"
+ "]}";
public static final String TEST_STRUCTNAME = "test_struct_name";
public static final String TEST_RECORD_NAMESPACE = "test_record_namespace";
public static Schema schema = new Schema.Parser().parse(EXAMPLE_SCHEMA);
public static StructType structType = AvroConversionUtils.convertAvroSchemaToStructType(schema);
public static GenericRecord getRecord() {
return getRecord(getNestedColRecord("val1", 10L));
}
public static GenericRecord getNestedColRecord(String prop1Value, Long prop2Value) {
GenericRecord nestedColRecord = new GenericData.Record(new Schema.Parser().parse(NESTED_COL_SCHEMA));
nestedColRecord.put("prop1", prop1Value);
nestedColRecord.put("prop2", prop2Value);
return nestedColRecord;
}
public static GenericRecord getRecord(GenericRecord nestedColRecord) {
GenericRecord record = new GenericData.Record(new Schema.Parser().parse(EXAMPLE_SCHEMA));
record.put("timestamp", 4357686L);
record.put("_row_key", "key1");
record.put("ts_ms", "2020-03-21");
record.put("pii_col", "pi");
record.put("nested_col", nestedColRecord);
return record;
}
public static Row getRow(GenericRecord record) {
return getRow(record, schema, structType);
}
public static Row getRow(GenericRecord record, Schema schema, StructType structType) {
Function1<GenericRecord, Row> converterFn = AvroConversionUtils.createConverterToRow(schema, structType);
Row row = converterFn.apply(record);
int fieldCount = structType.fieldNames().length;
Object[] values = new Object[fieldCount];
for (int i = 0; i < fieldCount; i++) {
values[i] = row.get(i);
}
return new GenericRowWithSchema(values, structType);
}
public static InternalRow getInternalRow(Row row) {
try {
return getInternalRow(row, getEncoder(row.schema()));
} catch (Exception e) {
throw new IllegalStateException("Exception thrown while converting Row to InternalRow", e);
}
}
private static ExpressionEncoder getEncoder(StructType schema) {
List<Attribute> attributes = JavaConversions.asJavaCollection(schema.toAttributes()).stream()
.map(Attribute::toAttribute).collect(Collectors.toList());
return RowEncoder.apply(schema)
.resolveAndBind(JavaConverters.asScalaBufferConverter(attributes).asScala().toSeq(),
SimpleAnalyzer$.MODULE$);
}
public static InternalRow getInternalRow(Row row, ExpressionEncoder<Row> encoder) throws ClassNotFoundException, InvocationTargetException, IllegalAccessException, NoSuchMethodException {
return serializeRow(encoder, row);
}
private static InternalRow serializeRow(ExpressionEncoder encoder, Row row)
throws InvocationTargetException, IllegalAccessException, NoSuchMethodException, ClassNotFoundException {
// TODO remove reflection if Spark 2.x support is dropped
if (package$.MODULE$.SPARK_VERSION().startsWith("2.")) {
Method spark2method = encoder.getClass().getMethod("toRow", Object.class);
return (InternalRow) spark2method.invoke(encoder, row);
} else {
Class<?> serializerClass = Class.forName("org.apache.spark.sql.catalyst.encoders.ExpressionEncoder$Serializer");
Object serializer = encoder.getClass().getMethod("createSerializer").invoke(encoder);
Method aboveSpark2method = serializerClass.getMethod("apply", Object.class);
return (InternalRow) aboveSpark2method.invoke(serializer, row);
}
}
}