[HUDI-3396] Make sure BaseFileOnlyViewRelation only reads projected columns (#4818)
NOTE: This change is the first part of a series to clean up Hudi's Spark DataSource-related implementations, making sure there's minimal code duplication among them and that the implementations are consistent and performant. This PR makes sure that BaseFileOnlyViewRelation only reads projected columns, and avoids unnecessary serde from Row to InternalRow. Brief change log: - Introduced HoodieBaseRDD as a base for all custom RDD impls - Extracted common fields/methods to HoodieBaseRelation - Cleaned up and streamlined HoodieBaseFileViewOnlyRelation - Fixed all of the Relations to avoid superfluous Row <> InternalRow conversions
This commit is contained in:
@@ -21,7 +21,7 @@ import org.apache.avro.Schema
|
||||
import org.apache.hudi.Spark3RowSerDe
|
||||
import org.apache.hudi.client.utils.SparkRowSerDe
|
||||
import org.apache.hudi.spark3.internal.ReflectUtil
|
||||
import org.apache.spark.sql.avro.{HoodieAvroDeserializerTrait, HoodieAvroSerializerTrait, Spark3HoodieAvroDeserializer, HoodieAvroSerializer}
|
||||
import org.apache.spark.sql.avro.{HoodieAvroDeserializer, HoodieAvroSerializer, HoodieSpark3AvroDeserializer, HoodieSparkAvroSerializer}
|
||||
import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
|
||||
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
|
||||
import org.apache.spark.sql.catalyst.expressions.{Expression, Like}
|
||||
@@ -43,11 +43,11 @@ import org.apache.spark.sql.{Row, SparkSession}
|
||||
*/
|
||||
class Spark3Adapter extends SparkAdapter {
|
||||
|
||||
def createAvroSerializer(rootCatalystType: DataType, rootAvroType: Schema, nullable: Boolean): HoodieAvroSerializerTrait =
|
||||
new HoodieAvroSerializer(rootCatalystType, rootAvroType, nullable)
|
||||
def createAvroSerializer(rootCatalystType: DataType, rootAvroType: Schema, nullable: Boolean): HoodieAvroSerializer =
|
||||
new HoodieSparkAvroSerializer(rootCatalystType, rootAvroType, nullable)
|
||||
|
||||
def createAvroDeserializer(rootAvroType: Schema, rootCatalystType: DataType): HoodieAvroDeserializerTrait =
|
||||
new Spark3HoodieAvroDeserializer(rootAvroType, rootCatalystType)
|
||||
def createAvroDeserializer(rootAvroType: Schema, rootCatalystType: DataType): HoodieAvroDeserializer =
|
||||
new HoodieSpark3AvroDeserializer(rootAvroType, rootCatalystType)
|
||||
|
||||
override def createSparkRowSerDe(encoder: ExpressionEncoder[Row]): SparkRowSerDe = {
|
||||
new Spark3RowSerDe(encoder)
|
||||
|
||||
@@ -21,8 +21,8 @@ import org.apache.avro.Schema
|
||||
import org.apache.hudi.HoodieSparkUtils
|
||||
import org.apache.spark.sql.types.DataType
|
||||
|
||||
class Spark3HoodieAvroDeserializer(rootAvroType: Schema, rootCatalystType: DataType)
|
||||
extends HoodieAvroDeserializerTrait {
|
||||
class HoodieSpark3AvroDeserializer(rootAvroType: Schema, rootCatalystType: DataType)
|
||||
extends HoodieAvroDeserializer {
|
||||
|
||||
// SPARK-34404: As of Spark 3.2, AvroDeserializer has no constructor that takes (Schema, DataType) arguments.
|
||||
// So use reflection to obtain an AvroDeserializer instance.
|
||||
@@ -34,5 +34,5 @@ class Spark3HoodieAvroDeserializer(rootAvroType: Schema, rootCatalystType: DataT
|
||||
constructor.newInstance(rootAvroType, rootCatalystType)
|
||||
}
|
||||
|
||||
def doDeserialize(data: Any): Any = avroDeserializer.deserialize(data)
|
||||
def deserialize(data: Any): Option[Any] = avroDeserializer.deserialize(data)
|
||||
}
|
||||
Reference in New Issue
Block a user