1
0

[HUDI-4250][HUDI-4202] Optimize performance of Column Stats Index reading in Data Skipping (#5746)

We provide an alternative way of fetching the Column Stats Index within the reading process, avoiding the penalty of the heavier-weight execution that is scheduled through the Spark engine.
This commit is contained in:
Alexey Kudinkin
2022-07-25 15:36:12 -07:00
committed by GitHub
parent 6e7ac45735
commit e7c8df7e8b
20 changed files with 730 additions and 387 deletions

View File

@@ -22,9 +22,13 @@ import org.apache.avro.Schema
import org.apache.hudi.Spark2RowSerDe
import org.apache.hudi.client.utils.SparkRowSerDe
import org.apache.spark.sql.avro._
import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.catalyst.expressions.{Expression, InterpretedPredicate}
import org.apache.spark.sql.catalyst.parser.ParserInterface
import org.apache.spark.sql.catalyst.plans.JoinType
import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, Join, LogicalPlan}
import org.apache.spark.sql.catalyst.{AliasIdentifier, TableIdentifier}
import org.apache.spark.sql.execution.datasources.parquet.{ParquetFileFormat, Spark24HoodieParquetFileFormat}
import org.apache.spark.sql.execution.datasources.{FilePartition, PartitionedFile, Spark2ParsePartitionUtil, SparkParsePartitionUtil}
import org.apache.spark.sql.hudi.SparkAdapter
@@ -32,6 +36,8 @@ import org.apache.spark.sql.hudi.parser.HoodieSpark2ExtendedSqlParser
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.DataType
import org.apache.spark.sql.{HoodieCatalystExpressionUtils, HoodieCatalystPlansUtils, HoodieSpark2CatalystExpressionUtils, HoodieSpark2CatalystPlanUtils, Row, SparkSession}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.storage.StorageLevel._
import scala.collection.mutable.ArrayBuffer
@@ -115,4 +121,20 @@ class Spark2Adapter extends SparkAdapter {
/**
 * Creates an [[InterpretedPredicate]] for the supplied Catalyst expression.
 *
 * Delegates directly to `InterpretedPredicate.create`.
 *
 * @param e expression to wrap into a predicate
 * @return the predicate produced by `InterpretedPredicate.create(e)`
 */
override def createInterpretedPredicate(e: Expression): InterpretedPredicate =
  InterpretedPredicate.create(e)
/**
 * Translates a [[StorageLevel]] into the name of the predefined constant it equals.
 *
 * The level is compared (via `==`) against each predefined constant imported from
 * `StorageLevel`, in the order listed below; the name paired with the first equal
 * constant is returned.
 *
 * @param level storage level to translate
 * @return name of the predefined constant equal to `level`
 * @throws IllegalArgumentException if `level` equals none of the listed constants
 */
override def convertStorageLevelToString(level: StorageLevel): String = {
  // Ordered (constant, name) pairs; the lookup below picks the first pair whose constant equals `level`
  val namedLevels: Seq[(StorageLevel, String)] = Seq(
    NONE -> "NONE",
    DISK_ONLY -> "DISK_ONLY",
    DISK_ONLY_2 -> "DISK_ONLY_2",
    MEMORY_ONLY -> "MEMORY_ONLY",
    MEMORY_ONLY_2 -> "MEMORY_ONLY_2",
    MEMORY_ONLY_SER -> "MEMORY_ONLY_SER",
    MEMORY_ONLY_SER_2 -> "MEMORY_ONLY_SER_2",
    MEMORY_AND_DISK -> "MEMORY_AND_DISK",
    MEMORY_AND_DISK_2 -> "MEMORY_AND_DISK_2",
    MEMORY_AND_DISK_SER -> "MEMORY_AND_DISK_SER",
    MEMORY_AND_DISK_SER_2 -> "MEMORY_AND_DISK_SER_2",
    OFF_HEAP -> "OFF_HEAP"
  )

  namedLevels
    .find { case (candidate, _) => candidate == level }
    .map { case (_, label) => label }
    .getOrElse(throw new IllegalArgumentException(s"Invalid StorageLevel: $level"))
}
}