
[HUDI-3896] Porting Nested Schema Pruning optimization for Hudi's custom Relations (#5428)

Currently, all of Hudi's custom Relations suffer a performance gap relative to Spark's
HadoopFsRelation. The reason is the SchemaPruning optimization rule (which prunes
nested schemas): it is unfortunately predicated on the use of HadoopFsRelation,
and is therefore not applied when any other relation is used.
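
To make the effect concrete, here is an illustration (the schema is hypothetical,
for illustration only) of what pruning a nested schema buys for a query that
reads a single leaf of a wide struct:

import org.apache.spark.sql.types._

// Hypothetical table schema containing a wide nested struct.
val fullSchema = StructType(Seq(
  StructField("id", LongType),
  StructField("address", StructType(Seq(
    StructField("city", StringType),
    StructField("zip", StringType),
    StructField("geo", StructType(Seq(
      StructField("lat", DoubleType),
      StructField("lon", DoubleType)
    )))
  )))
))

// For a query like `SELECT address.city FROM t`, a pruned read schema keeps
// only the accessed leaf instead of materializing the whole `address` struct:
val prunedSchema = StructType(Seq(
  StructField("address", StructType(Seq(
    StructField("city", StringType)
  )))
))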

This change ports that rule to Hudi's relations (MOR, Incremental, etc.) by
leveraging the HoodieSparkSessionExtensions mechanism to inject a modified
version of the original SchemaPruning rule, adapted to work with Hudi's
custom relations.

- Added customOptimizerRules to HoodieAnalysis
- Added the NestedSchemaPruning rule to Spark's Optimizer
- Handled Spark's Optimizer-pruned data schema (to effectively prune nested schemas)
- Enabled HoodieClientTestHarness to inject HoodieSparkSessionExtensions
- Injected Spark Session extensions into TestMORDataSource and TestCOWDataSource
- Disabled fallback to HadoopFsRelation
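
Because the rule is wired in through the extensions mechanism rather than
hard-coded into Spark, it only takes effect when the Hudi extension is
registered on the session. A minimal usage sketch (the local master and
table path are hypothetical):

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("hudi-nested-schema-pruning")
  .master("local[*]")
  // Registering the extension injects HoodieAnalysis.customOptimizerRules,
  // which includes NestedSchemaPruning on Spark 3.1+.
  .config("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension")
  .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  .getOrCreate()

// With the rule active, this query only needs `address.city` (plus Hudi meta
// fields) from the underlying relation, not the full `address` struct:
spark.read.format("hudi")
  .load("/tmp/hudi_table")
  .select("address.city")
  .show()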
Author: Alexey Kudinkin
Committed: 2022-07-21 02:36:06 -07:00 (by GitHub)
Parent: 2394c62973 · Commit: de37774e12
42 changed files with 1220 additions and 500 deletions


@@ -28,21 +28,20 @@ import org.apache.spark.sql.parser.HoodieCommonSqlParser
 class HoodieSparkSessionExtension extends (SparkSessionExtensions => Unit)
   with SparkAdapterSupport {
   override def apply(extensions: SparkSessionExtensions): Unit = {
     extensions.injectParser { (session, parser) =>
       new HoodieCommonSqlParser(session, parser)
     }
-    HoodieAnalysis.customResolutionRules.foreach { ruleBuilder =>
-      extensions.injectResolutionRule { session =>
-        ruleBuilder(session)
-      }
-    }
+    HoodieAnalysis.customOptimizerRules.foreach { ruleBuilder =>
+      extensions.injectOptimizerRule(ruleBuilder(_))
+    }
-    HoodieAnalysis.customPostHocResolutionRules.foreach { rule =>
-      extensions.injectPostHocResolutionRule { session =>
-        rule(session)
-      }
-    }
+    HoodieAnalysis.customResolutionRules.foreach { ruleBuilder =>
+      extensions.injectResolutionRule(ruleBuilder(_))
+    }
+    HoodieAnalysis.customPostHocResolutionRules.foreach { ruleBuilder =>
+      extensions.injectPostHocResolutionRule(ruleBuilder(_))
+    }
   }
 }
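
For context, injectOptimizerRule accepts a builder of type
SparkSession => Rule[LogicalPlan] (the same shape as HoodieAnalysis.RuleBuilder
below). A standalone sketch of the injection pattern, using a hypothetical
no-op rule:

import org.apache.spark.sql.{SparkSession, SparkSessionExtensions}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.rules.Rule

// Hypothetical no-op optimizer rule, only to show the expected shape.
case class NoopRule(session: SparkSession) extends Rule[LogicalPlan] {
  override def apply(plan: LogicalPlan): LogicalPlan = plan
}

class MyExtensions extends (SparkSessionExtensions => Unit) {
  override def apply(extensions: SparkSessionExtensions): Unit = {
    // Same pattern as above: the builder receives the active SparkSession
    // and returns the rule instance to be added to the Optimizer.
    extensions.injectOptimizerRule(session => NoopRule(session))
  }
}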


@@ -33,7 +33,7 @@ object HoodieSqlUtils extends SparkAdapterSupport {
       case SubqueryAlias(tableId, _) => tableId
       case plan => throw new IllegalArgumentException(s"Illegal plan $plan in target")
     }
-    sparkAdapter.toTableIdentifier(aliaId)
+    sparkAdapter.getCatalystPlanUtils.toTableIdentifier(aliaId)
   }

   /**


@@ -44,6 +44,16 @@ import scala.collection.mutable.ListBuffer
 object HoodieAnalysis {
   type RuleBuilder = SparkSession => Rule[LogicalPlan]
+
+  def customOptimizerRules: Seq[RuleBuilder] =
+    if (HoodieSparkUtils.gteqSpark3_1) {
+      val nestedSchemaPruningClass = "org.apache.spark.sql.execution.datasources.NestedSchemaPruning"
+      val nestedSchemaPruningRule = ReflectionUtils.loadClass(nestedSchemaPruningClass).asInstanceOf[Rule[LogicalPlan]]
+      Seq(_ => nestedSchemaPruningRule)
+    } else {
+      Seq.empty
+    }
+
   def customResolutionRules: Seq[RuleBuilder] = {
     val rules: ListBuffer[RuleBuilder] = ListBuffer(
       // Default rules
@@ -130,8 +140,8 @@ case class HoodieAnalysis(sparkSession: SparkSession) extends Rule[LogicalPlan]
         DeleteHoodieTableCommand(d)

       // Convert to InsertIntoHoodieTableCommand
-      case l if sparkAdapter.isInsertInto(l) =>
-        val (table, partition, query, overwrite, _) = sparkAdapter.getInsertIntoChildren(l).get
+      case l if sparkAdapter.getCatalystPlanUtils.isInsertInto(l) =>
+        val (table, partition, query, overwrite, _) = sparkAdapter.getCatalystPlanUtils.getInsertIntoChildren(l).get
         table match {
           case relation: LogicalRelation if sparkAdapter.isHoodieTable(relation, sparkSession) =>
             new InsertIntoHoodieTableCommand(relation, query, partition, overwrite)
@@ -420,9 +430,9 @@ case class HoodieResolveReferences(sparkSession: SparkSession) extends Rule[LogicalPlan]
       // Append the meta field to the insert query to walk through the validate for the
       // number of insert fields with the number of the target table fields.
-      case l if sparkAdapter.isInsertInto(l) =>
+      case l if sparkAdapter.getCatalystPlanUtils.isInsertInto(l) =>
         val (table, partition, query, overwrite, ifPartitionNotExists) =
-          sparkAdapter.getInsertIntoChildren(l).get
+          sparkAdapter.getCatalystPlanUtils.getInsertIntoChildren(l).get

         if (sparkAdapter.isHoodieTable(table, sparkSession) && query.resolved &&
           !containUnResolvedStar(query) &&
@@ -439,21 +449,21 @@ case class HoodieResolveReferences(sparkSession: SparkSession) extends Rule[LogicalPlan]
             val withMetaFieldProjects = metaFields ++ query.output
             Project(withMetaFieldProjects, query)
           }
-          sparkAdapter.createInsertInto(table, partition, newQuery, overwrite, ifPartitionNotExists)
+          sparkAdapter.getCatalystPlanUtils.createInsertInto(table, partition, newQuery, overwrite, ifPartitionNotExists)
         } else {
           l
         }
-      case l if sparkAdapter.isRelationTimeTravel(l) =>
+      case l if sparkAdapter.getCatalystPlanUtils.isRelationTimeTravel(l) =>
         val (plan: UnresolvedRelation, timestamp, version) =
-          sparkAdapter.getRelationTimeTravel(l).get
+          sparkAdapter.getCatalystPlanUtils.getRelationTimeTravel(l).get
         if (timestamp.isEmpty && version.nonEmpty) {
           throw new AnalysisException(
             "version expression is not supported for time travel")
         }
-        val tableIdentifier = sparkAdapter.toTableIdentifier(plan)
+        val tableIdentifier = sparkAdapter.getCatalystPlanUtils.toTableIdentifier(plan)
         if (sparkAdapter.isHoodieTable(tableIdentifier, sparkSession)) {
           val hoodieCatalogTable = HoodieCatalogTable(sparkSession, tableIdentifier)
           val table = hoodieCatalogTable.table
@@ -525,7 +535,7 @@ case class HoodieResolveReferences(sparkSession: SparkSession) extends Rule[LogicalPlan]
         // Fake a project for the expression based on the source plan.
         val fakeProject = if (right.isDefined) {
           Project(Seq(Alias(expression, "_c0")()),
-            sparkAdapter.createJoin(left, right.get, Inner))
+            sparkAdapter.getCatalystPlanUtils.createJoin(left, right.get, Inner))
         } else {
           Project(Seq(Alias(expression, "_c0")()),
             left)
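
The recurring change across these hunks is mechanical: Catalyst plan helpers
(isInsertInto, createInsertInto, createJoin, toTableIdentifier, the time-travel
accessors) move off SparkAdapter onto a dedicated utils object reached via
getCatalystPlanUtils. A rough sketch of that indirection, with the interface
shape inferred from the call sites above rather than taken from the actual trait:

import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan

// Version-specific Catalyst plan helpers grouped behind a single accessor.
trait CatalystPlanUtils {
  def isInsertInto(plan: LogicalPlan): Boolean
  def toTableIdentifier(plan: LogicalPlan): TableIdentifier
  // ... createInsertInto, createJoin, isRelationTimeTravel, etc.
}

trait SparkAdapter {
  // One accessor replaces the individual plan-related methods, so supporting
  // a new Spark version means providing one new implementation of the trait.
  def getCatalystPlanUtils: CatalystPlanUtils
}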


@@ -41,7 +41,7 @@ class RunClusteringProcedure extends BaseProcedure
   with Logging
   with SparkAdapterSupport {

-  private val exprUtils = sparkAdapter.createCatalystExpressionUtils()
+  private val exprUtils = sparkAdapter.getCatalystExpressionUtils()

   /**
    * OPTIMIZE table_name|table_path [WHERE predicate]