[HUDI-3896] Porting Nested Schema Pruning optimization for Hudi's custom Relations (#5428)
Currently, all Hudi Relations carry a performance gap relative to Spark's HadoopFsRelation. The reason is the SchemaPruning optimization rule (which prunes nested schemas): it is unfortunately predicated on the use of HadoopFsRelation, meaning it is not applied whenever any other relation is used. This change ports the rule to Hudi's custom relations (MOR, Incremental, etc.) by leveraging the HoodieSparkSessionExtensions mechanism to inject a modified version of the original SchemaPruning rule, adapted to work with Hudi's custom relations.

- Added customOptimizerRules to HoodieAnalysis
- Added the NestedSchemaPruning Spark Optimizer rule
- Handled Spark Optimizer's pruned data schema (to effectively prune nested schemas)
- Enabled HoodieClientTestHarness to inject HoodieSparkSessionExtensions
- Injected Spark Session extensions for TestMORDataSource and TestCOWDataSource
- Disabled fallback to HadoopFsRelation
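For anyone verifying the behavior, here is a minimal sketch of how the injected rule becomes active (app name, table path, and column names are illustrative, not from this patch): the extensions must be registered through spark.sql.extensions, after which selecting a single nested field should prune the read schema down to that leaf.

    import org.apache.spark.sql.SparkSession

    // Register the Hudi session extensions so the injected NestedSchemaPruning
    // optimizer rule (and the other custom rules) take effect.
    val spark = SparkSession.builder()
      .appName("hudi-nested-schema-pruning") // hypothetical app name
      .config("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .getOrCreate()

    // With the rule applied, only the `rider.firstName` leaf should be read
    // from storage instead of materializing the whole `rider` struct.
    spark.read.format("hudi")
      .load("/tmp/hudi_trips") // hypothetical table path
      .select("rider.firstName")
      .show()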
@@ -28,21 +28,20 @@ import org.apache.spark.sql.parser.HoodieCommonSqlParser

 class HoodieSparkSessionExtension extends (SparkSessionExtensions => Unit)
   with SparkAdapterSupport {

   override def apply(extensions: SparkSessionExtensions): Unit = {

     extensions.injectParser { (session, parser) =>
       new HoodieCommonSqlParser(session, parser)
     }

-    HoodieAnalysis.customResolutionRules.foreach { ruleBuilder =>
-      extensions.injectResolutionRule { session =>
-        ruleBuilder(session)
-      }
+    HoodieAnalysis.customOptimizerRules.foreach { ruleBuilder =>
+      extensions.injectOptimizerRule(ruleBuilder(_))
     }

-    HoodieAnalysis.customPostHocResolutionRules.foreach { rule =>
-      extensions.injectPostHocResolutionRule { session =>
-        rule(session)
-      }
+    HoodieAnalysis.customResolutionRules.foreach { ruleBuilder =>
+      extensions.injectResolutionRule(ruleBuilder(_))
+    }
+
+    HoodieAnalysis.customPostHocResolutionRules.foreach { ruleBuilder =>
+      extensions.injectPostHocResolutionRule(ruleBuilder(_))
     }
   }
 }

@@ -33,7 +33,7 @@ object HoodieSqlUtils extends SparkAdapterSupport
       case SubqueryAlias(tableId, _) => tableId
       case plan => throw new IllegalArgumentException(s"Illegal plan $plan in target")
     }
-    sparkAdapter.toTableIdentifier(aliaId)
+    sparkAdapter.getCatalystPlanUtils.toTableIdentifier(aliaId)
   }

   /**

@@ -44,6 +44,16 @@ import scala.collection.mutable.ListBuffer
 object HoodieAnalysis {
   type RuleBuilder = SparkSession => Rule[LogicalPlan]

+  def customOptimizerRules: Seq[RuleBuilder] =
+    if (HoodieSparkUtils.gteqSpark3_1) {
+      val nestedSchemaPruningClass = "org.apache.spark.sql.execution.datasources.NestedSchemaPruning"
+      val nestedSchemaPruningRule = ReflectionUtils.loadClass(nestedSchemaPruningClass).asInstanceOf[Rule[LogicalPlan]]
+
+      Seq(_ => nestedSchemaPruningRule)
+    } else {
+      Seq.empty
+    }
+
   def customResolutionRules: Seq[RuleBuilder] = {
     val rules: ListBuffer[RuleBuilder] = ListBuffer(
       // Default rules

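Side note on the reflective loading above: the NestedSchemaPruning rule only exists on the Spark 3.1+ classpath, so the common module references it by its fully-qualified name instead of linking against it. The diff gates on HoodieSparkUtils.gteqSpark3_1; below is a self-contained sketch of an equivalent classpath-probing variant (helper name hypothetical, not part of this patch).

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
    import org.apache.spark.sql.catalyst.rules.Rule

    // Hypothetical helper: instantiate an optimizer rule reflectively so this
    // code compiles even when the rule's class is absent from older Spark
    // classpaths; absence simply means the rule is not registered.
    def loadRuleIfPresent(className: String): Seq[SparkSession => Rule[LogicalPlan]] =
      try {
        val rule = Class.forName(className)
          .getDeclaredConstructor()
          .newInstance()
          .asInstanceOf[Rule[LogicalPlan]]
        Seq(_ => rule)
      } catch {
        case _: ClassNotFoundException => Seq.empty // older Spark: skip the rule
      }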
@@ -130,8 +140,8 @@ case class HoodieAnalysis(sparkSession: SparkSession) extends Rule[LogicalPlan]
         DeleteHoodieTableCommand(d)

       // Convert to InsertIntoHoodieTableCommand
-      case l if sparkAdapter.isInsertInto(l) =>
-        val (table, partition, query, overwrite, _) = sparkAdapter.getInsertIntoChildren(l).get
+      case l if sparkAdapter.getCatalystPlanUtils.isInsertInto(l) =>
+        val (table, partition, query, overwrite, _) = sparkAdapter.getCatalystPlanUtils.getInsertIntoChildren(l).get
         table match {
           case relation: LogicalRelation if sparkAdapter.isHoodieTable(relation, sparkSession) =>
             new InsertIntoHoodieTableCommand(relation, query, partition, overwrite)

@@ -420,9 +430,9 @@ case class HoodieResolveReferences(sparkSession: SparkSession) extends Rule[LogicalPlan]

       // Append the meta field to the insert query to walk through the validate for the
       // number of insert fields with the number of the target table fields.
-      case l if sparkAdapter.isInsertInto(l) =>
+      case l if sparkAdapter.getCatalystPlanUtils.isInsertInto(l) =>
         val (table, partition, query, overwrite, ifPartitionNotExists) =
-          sparkAdapter.getInsertIntoChildren(l).get
+          sparkAdapter.getCatalystPlanUtils.getInsertIntoChildren(l).get

         if (sparkAdapter.isHoodieTable(table, sparkSession) && query.resolved &&
           !containUnResolvedStar(query) &&

@@ -439,21 +449,21 @@ case class HoodieResolveReferences(sparkSession: SparkSession) extends Rule[LogicalPlan]
           val withMetaFieldProjects = metaFields ++ query.output
           Project(withMetaFieldProjects, query)
         }
-        sparkAdapter.createInsertInto(table, partition, newQuery, overwrite, ifPartitionNotExists)
+        sparkAdapter.getCatalystPlanUtils.createInsertInto(table, partition, newQuery, overwrite, ifPartitionNotExists)
       } else {
         l
       }

-    case l if sparkAdapter.isRelationTimeTravel(l) =>
+    case l if sparkAdapter.getCatalystPlanUtils.isRelationTimeTravel(l) =>
       val (plan: UnresolvedRelation, timestamp, version) =
-        sparkAdapter.getRelationTimeTravel(l).get
+        sparkAdapter.getCatalystPlanUtils.getRelationTimeTravel(l).get

       if (timestamp.isEmpty && version.nonEmpty) {
         throw new AnalysisException(
           "version expression is not supported for time travel")
       }

-      val tableIdentifier = sparkAdapter.toTableIdentifier(plan)
+      val tableIdentifier = sparkAdapter.getCatalystPlanUtils.toTableIdentifier(plan)
       if (sparkAdapter.isHoodieTable(tableIdentifier, sparkSession)) {
         val hoodieCatalogTable = HoodieCatalogTable(sparkSession, tableIdentifier)
         val table = hoodieCatalogTable.table

@@ -525,7 +535,7 @@ case class HoodieResolveReferences(sparkSession: SparkSession) extends Rule[LogicalPlan]
       // Fake a project for the expression based on the source plan.
       val fakeProject = if (right.isDefined) {
         Project(Seq(Alias(expression, "_c0")()),
-          sparkAdapter.createJoin(left, right.get, Inner))
+          sparkAdapter.getCatalystPlanUtils.createJoin(left, right.get, Inner))
       } else {
         Project(Seq(Alias(expression, "_c0")()),
           left)

@@ -41,7 +41,7 @@ class RunClusteringProcedure extends BaseProcedure
   with Logging
   with SparkAdapterSupport {

-  private val exprUtils = sparkAdapter.createCatalystExpressionUtils()
+  private val exprUtils = sparkAdapter.getCatalystExpressionUtils()

   /**
    * OPTIMIZE table_name|table_path [WHERE predicate]