[HUDI-3396] Make sure BaseFileOnlyViewRelation only reads projected columns (#4818)

NOTE: This change is first part of the series to clean up Hudi's Spark DataSource related implementations, making sure there's minimal code duplication among them, implementations are consistent and performant This PR is making sure that BaseFileOnlyViewRelation only reads projected columns as well as avoiding unnecessary serde from Row to InternalRow Brief change log - Introduced HoodieBaseRDD as a base for all custom RDD impls - Extracted common fields/methods to HoodieBaseRelation - Cleaned up and streamlined HoodieBaseFileViewOnlyRelation - Fixed all of the Relations to avoid superfluous Row <> InternalRow conversions
2022-03-09 18:45:25 -08:00
parent ca0b8fccee
commit 034addaef5
25 changed files with 751 additions and 264 deletions
--- a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala
+++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala
@@ -177,14 +177,24 @@ object HoodieSparkUtils extends SparkAdapterSupport {
   * Convert Filters to Catalyst Expressions and joined by And. If convert success return an
   * Non-Empty Option[Expression],or else return None.
   */
-  def convertToCatalystExpressions(filters: Array[Filter],
-                                   tableSchema: StructType): Option[Expression] = {
-    val expressions = filters.map(convertToCatalystExpression(_, tableSchema))
+  def convertToCatalystExpressions(filters: Seq[Filter],
+                                   tableSchema: StructType): Seq[Option[Expression]] = {
+    filters.map(convertToCatalystExpression(_, tableSchema))
+  }
+
+
+  /**
+   * Convert Filters to Catalyst Expressions and joined by And. If convert success return an
+   * Non-Empty Option[Expression],or else return None.
+   */
+  def convertToCatalystExpression(filters: Array[Filter],
+                                  tableSchema: StructType): Option[Expression] = {
+    val expressions = convertToCatalystExpressions(filters, tableSchema)
    if (expressions.forall(p => p.isDefined)) {
      if (expressions.isEmpty) {
        None
      } else if (expressions.length == 1) {
-        expressions(0)
+        expressions.head
      } else {
        Some(expressions.map(_.get).reduce(org.apache.spark.sql.catalyst.expressions.And))
      }
--- a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/avro/HoodieAvroDeserializerTrait.scala
+++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/avro/HoodieAvroDeserializerTrait.scala
@@ -24,12 +24,6 @@ package org.apache.spark.sql.avro
 *       If you're looking to convert Avro into "deserialized" [[Row]] (comprised of Java native types),
 *       please check [[AvroConversionUtils]]
 */
-trait HoodieAvroDeserializerTrait {
-  final def deserialize(data: Any): Option[Any] =
-    doDeserialize(data) match {
-      case opt: Option[_] => opt    // As of Spark 3.1, this will return data wrapped with Option, so we fetch the data
-      case row => Some(row)         // For other Spark versions, return the data as is
-    }
-
-  protected def doDeserialize(data: Any): Any
+trait HoodieAvroDeserializer {
+  def deserialize(data: Any): Option[Any]
 }
--- a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/avro/HoodieAvroSerializerTrait.scala
+++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/avro/HoodieAvroSerializerTrait.scala
@@ -23,6 +23,6 @@ package org.apache.spark.sql.avro
 * NOTE: This is low-level component operating on Spark internal data-types (comprising [[InternalRow]]).
 *       If you're looking to convert "deserialized" [[Row]] into Avro, please check [[AvroConversionUtils]]
 */
-trait HoodieAvroSerializerTrait {
+trait HoodieAvroSerializer {
  def serialize(catalystData: Any): Any
 }
--- a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/SparkAdapter.scala
+++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/SparkAdapter.scala
@@ -20,7 +20,7 @@ package org.apache.spark.sql.hudi

 import org.apache.avro.Schema
 import org.apache.hudi.client.utils.SparkRowSerDe
-import org.apache.spark.sql.avro.{HoodieAvroDeserializerTrait, HoodieAvroSerializerTrait}
+import org.apache.spark.sql.avro.{HoodieAvroDeserializer, HoodieAvroSerializer}
 import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
 import org.apache.spark.sql.catalyst.catalog.CatalogTable
 import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
@@ -43,16 +43,16 @@ import java.util.Locale
 trait SparkAdapter extends Serializable {

  /**
-   * Creates instance of [[HoodieAvroSerializerTrait]] providing for ability to serialize
+   * Creates instance of [[HoodieAvroSerializer]] providing for ability to serialize
   * Spark's [[InternalRow]] into Avro payloads
   */
-  def createAvroSerializer(rootCatalystType: DataType, rootAvroType: Schema, nullable: Boolean): HoodieAvroSerializerTrait
+  def createAvroSerializer(rootCatalystType: DataType, rootAvroType: Schema, nullable: Boolean): HoodieAvroSerializer

  /**
-   * Creates instance of [[HoodieAvroDeserializerTrait]] providing for ability to deserialize
+   * Creates instance of [[HoodieAvroDeserializer]] providing for ability to deserialize
   * Avro payloads into Spark's [[InternalRow]]
   */
-  def createAvroDeserializer(rootAvroType: Schema, rootCatalystType: DataType): HoodieAvroDeserializerTrait
+  def createAvroDeserializer(rootAvroType: Schema, rootCatalystType: DataType): HoodieAvroDeserializer

  /**
   * Create the SparkRowSerDe.