1
0

[HUDI-4250][HUDI-4202] Optimize performance of Column Stats Index reading in Data Skipping (#5746)

We provide an alternative way of fetching the Column Stats Index within the reading process, to avoid the penalty of the more heavyweight execution scheduled through the Spark engine.
This commit is contained in:
Alexey Kudinkin
2022-07-25 15:36:12 -07:00
committed by GitHub
parent 6e7ac45735
commit e7c8df7e8b
20 changed files with 730 additions and 387 deletions

View File

@@ -33,6 +33,8 @@ import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
import org.apache.spark.sql.hudi.SparkAdapter
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.{HoodieCatalystPlansUtils, HoodieSpark3CatalystPlanUtils, Row, SparkSession}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.storage.StorageLevel.{DISK_ONLY, DISK_ONLY_2, DISK_ONLY_3, MEMORY_AND_DISK, MEMORY_AND_DISK_2, MEMORY_AND_DISK_SER, MEMORY_AND_DISK_SER_2, MEMORY_ONLY, MEMORY_ONLY_2, MEMORY_ONLY_SER, MEMORY_ONLY_SER_2, NONE, OFF_HEAP}
import scala.util.control.NonFatal
@@ -100,4 +102,24 @@ abstract class BaseSpark3Adapter extends SparkAdapter with Logging {
/**
 * Creates an [[InterpretedPredicate]] for the provided expression by delegating to
 * `Predicate.createInterpreted`.
 *
 * @param e expression the predicate is created from
 * @return the predicate produced by `Predicate.createInterpreted`
 */
override def createInterpretedPredicate(e: Expression): InterpretedPredicate =
  Predicate.createInterpreted(e)
/**
 * Resolves the name of the predefined [[StorageLevel]] constant that is equal to `level`.
 *
 * @param level storage level to render as a string
 * @return name of the matching predefined constant (for example `"MEMORY_ONLY"`)
 * @throws IllegalArgumentException if `level` equals none of the predefined constants listed below
 */
override def convertStorageLevelToString(level: StorageLevel): String = {
  // Predefined constants paired with their names; checked in order using `==`
  val namedLevels: Seq[(StorageLevel, String)] = Seq(
    NONE -> "NONE",
    DISK_ONLY -> "DISK_ONLY",
    DISK_ONLY_2 -> "DISK_ONLY_2",
    DISK_ONLY_3 -> "DISK_ONLY_3",
    MEMORY_ONLY -> "MEMORY_ONLY",
    MEMORY_ONLY_2 -> "MEMORY_ONLY_2",
    MEMORY_ONLY_SER -> "MEMORY_ONLY_SER",
    MEMORY_ONLY_SER_2 -> "MEMORY_ONLY_SER_2",
    MEMORY_AND_DISK -> "MEMORY_AND_DISK",
    MEMORY_AND_DISK_2 -> "MEMORY_AND_DISK_2",
    MEMORY_AND_DISK_SER -> "MEMORY_AND_DISK_SER",
    MEMORY_AND_DISK_SER_2 -> "MEMORY_AND_DISK_SER_2",
    OFF_HEAP -> "OFF_HEAP"
  )

  namedLevels
    .collectFirst { case (candidate, name) if candidate == level => name }
    .getOrElse(throw new IllegalArgumentException(s"Invalid StorageLevel: $level"))
}
}