[HUDI-4178] Addressing performance regressions in Spark DataSourceV2 Integration (#5737)
There are multiple issues with our current DataSource V2 integration: because we advertise Hudi tables as V2, Spark expects them to implement certain APIs which are not implemented at the moment; instead, a custom resolution rule (in HoodieSpark3Analysis) manually falls back to the V1 APIs. This commit fixes the issue by reverting the DSv2 APIs and making Spark use V1, except for the schema evolution logic.
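For context, these rules reach Spark through the standard SparkSessionExtensions hooks. Below is a minimal sketch of that registration pattern, not the actual Hudi implementation: ExampleRule and ExampleExtension are hypothetical stand-ins, while injectResolutionRule and Rule[LogicalPlan] are the real Spark APIs the extension relies on.

import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.{SparkSession, SparkSessionExtensions}

// Hypothetical no-op rule standing in for Hudi's resolution rules.
case class ExampleRule(spark: SparkSession) extends Rule[LogicalPlan] {
  override def apply(plan: LogicalPlan): LogicalPlan = plan // a real rule would rewrite the plan here
}

// An extension is a function over SparkSessionExtensions; each rule builder is just a
// SparkSession => Rule[LogicalPlan] function that Spark invokes while assembling the analyzer.
class ExampleExtension extends (SparkSessionExtensions => Unit) {
  override def apply(extensions: SparkSessionExtensions): Unit = {
    val ruleBuilders: Seq[SparkSession => Rule[LogicalPlan]] = Seq(session => ExampleRule(session))
    ruleBuilders.foreach(builder => extensions.injectResolutionRule(builder))
  }
}

Such an extension is activated by listing its class name under the spark.sql.extensions configuration when the session is built.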
@@ -33,17 +33,13 @@ class HoodieSparkSessionExtension extends (SparkSessionExtensions => Unit)
       new HoodieCommonSqlParser(session, parser)
     }
 
-    HoodieAnalysis.customResolutionRules().foreach { rule =>
+    HoodieAnalysis.customResolutionRules.foreach { ruleBuilder =>
       extensions.injectResolutionRule { session =>
-        rule(session)
+        ruleBuilder(session)
       }
     }
 
-    extensions.injectResolutionRule { session =>
-      sparkAdapter.createResolveHudiAlterTableCommand(session)
-    }
-
-    HoodieAnalysis.customPostHocResolutionRules().foreach { rule =>
+    HoodieAnalysis.customPostHocResolutionRules.foreach { rule =>
       extensions.injectPostHocResolutionRule { session =>
         rule(session)
       }
@@ -39,45 +39,69 @@ import org.apache.spark.sql.{AnalysisException, SparkSession}
 
 import java.util
 import scala.collection.JavaConverters._
+import scala.collection.mutable.ListBuffer
 
 object HoodieAnalysis {
-  def customResolutionRules(): Seq[SparkSession => Rule[LogicalPlan]] =
-    Seq(
+  type RuleBuilder = SparkSession => Rule[LogicalPlan]
+
+  def customResolutionRules: Seq[RuleBuilder] = {
+    val rules: ListBuffer[RuleBuilder] = ListBuffer(
+      // Default rules
       session => HoodieResolveReferences(session),
       session => HoodieAnalysis(session)
-    ) ++ extraResolutionRules()
+    )
 
-  def customPostHocResolutionRules(): Seq[SparkSession => Rule[LogicalPlan]] =
-    Seq(
-      session => HoodiePostAnalysisRule(session)
-    ) ++ extraPostHocResolutionRules()
-
-  def extraResolutionRules(): Seq[SparkSession => Rule[LogicalPlan]] = {
     if (HoodieSparkUtils.gteqSpark3_2) {
+      val dataSourceV2ToV1FallbackClass = "org.apache.spark.sql.hudi.analysis.HoodieDataSourceV2ToV1Fallback"
+      val dataSourceV2ToV1Fallback: RuleBuilder =
+        session => ReflectionUtils.loadClass(dataSourceV2ToV1FallbackClass, session).asInstanceOf[Rule[LogicalPlan]]
+
       val spark3AnalysisClass = "org.apache.spark.sql.hudi.analysis.HoodieSpark3Analysis"
-      val spark3Analysis: SparkSession => Rule[LogicalPlan] =
+      val spark3Analysis: RuleBuilder =
         session => ReflectionUtils.loadClass(spark3AnalysisClass, session).asInstanceOf[Rule[LogicalPlan]]
 
-      val spark3ResolveReferences = "org.apache.spark.sql.hudi.analysis.HoodieSpark3ResolveReferences"
-      val spark3References: SparkSession => Rule[LogicalPlan] =
-        session => ReflectionUtils.loadClass(spark3ResolveReferences, session).asInstanceOf[Rule[LogicalPlan]]
+      val spark3ResolveReferencesClass = "org.apache.spark.sql.hudi.analysis.HoodieSpark3ResolveReferences"
+      val spark3ResolveReferences: RuleBuilder =
+        session => ReflectionUtils.loadClass(spark3ResolveReferencesClass, session).asInstanceOf[Rule[LogicalPlan]]
 
-      Seq(spark3Analysis, spark3References)
-    } else {
-      Seq.empty
+      val spark32ResolveAlterTableCommandsClass = "org.apache.spark.sql.hudi.ResolveHudiAlterTableCommandSpark32"
+      val spark32ResolveAlterTableCommands: RuleBuilder =
+        session => ReflectionUtils.loadClass(spark32ResolveAlterTableCommandsClass, session).asInstanceOf[Rule[LogicalPlan]]
+
+      // NOTE: PLEASE READ CAREFULLY
+      //
+      // It's critical for this rules to follow in this order, so that DataSource V2 to V1 fallback
+      // is performed prior to other rules being evaluated
+      rules ++= Seq(dataSourceV2ToV1Fallback, spark3Analysis, spark3ResolveReferences, spark32ResolveAlterTableCommands)
+
+    } else if (HoodieSparkUtils.gteqSpark3_1) {
+      val spark31ResolveAlterTableCommandsClass = "org.apache.spark.sql.hudi.ResolveHudiAlterTableCommand312"
+      val spark31ResolveAlterTableCommands: RuleBuilder =
+        session => ReflectionUtils.loadClass(spark31ResolveAlterTableCommandsClass, session).asInstanceOf[Rule[LogicalPlan]]
+
+      rules ++= Seq(spark31ResolveAlterTableCommands)
     }
+
+    rules
   }
 
-  def extraPostHocResolutionRules(): Seq[SparkSession => Rule[LogicalPlan]] =
+  def customPostHocResolutionRules: Seq[RuleBuilder] = {
+    val rules: ListBuffer[RuleBuilder] = ListBuffer(
+      // Default rules
+      session => HoodiePostAnalysisRule(session)
+    )
+
     if (HoodieSparkUtils.gteqSpark3_2) {
       val spark3PostHocResolutionClass = "org.apache.spark.sql.hudi.analysis.HoodieSpark3PostAnalysisRule"
-      val spark3PostHocResolution: SparkSession => Rule[LogicalPlan] =
+      val spark3PostHocResolution: RuleBuilder =
         session => ReflectionUtils.loadClass(spark3PostHocResolutionClass, session).asInstanceOf[Rule[LogicalPlan]]
 
-      Seq(spark3PostHocResolution)
-    } else {
-      Seq.empty
+      rules += spark3PostHocResolution
     }
+
+    rules
+  }
 
 }
 
 /**
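The ordering note in the hunk above is the crux of the Spark 3.2 branch: the DataSource V2 to V1 fallback rule must be appended ahead of the other Spark 3.2 rules so the plan is already rewritten to V1 relations by the time they run. A simplified sketch of that version-gated assembly follows; assemble, defaults, fallback and otherSpark32Rules are illustrative names, not Hudi APIs.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.rules.Rule

import scala.collection.mutable.ListBuffer

object RuleAssemblySketch {
  type RuleBuilder = SparkSession => Rule[LogicalPlan]

  // Start from the always-on default rules and append version-specific ones only when the
  // matching Spark version is detected; the fallback builder is appended first so it runs first.
  def assemble(defaults: Seq[RuleBuilder],
               isSpark32: Boolean,
               fallback: RuleBuilder,
               otherSpark32Rules: Seq[RuleBuilder]): Seq[RuleBuilder] = {
    val rules = ListBuffer(defaults: _*)
    if (isSpark32) {
      rules += fallback
      rules ++= otherSpark32Rules
    }
    rules.toSeq
  }
}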
@@ -453,7 +453,7 @@ case class MergeIntoHoodieTableCommand(mergeInto: MergeIntoTable) extends Hoodie
     val hiveSyncConfig = buildHiveSyncConfig(hoodieProps, hoodieCatalogTable)
 
     // Enable the hive sync by default if spark have enable the hive metastore.
-    val enableHive = isEnableHive(sparkSession)
+    val enableHive = isUsingHiveCatalog(sparkSession)
     withSparkConf(sparkSession, hoodieCatalogTable.catalogProperties) {
       Map(
         "path" -> path,
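The rename from isEnableHive to isUsingHiveCatalog makes the intent explicit: Hive sync is enabled by default only when the Spark session is actually backed by a Hive metastore. A hedged sketch of what such a check can boil down to (usingHiveCatalog is a hypothetical helper, not Hudi's implementation):

import org.apache.spark.sql.SparkSession

object CatalogCheckSketch {
  // Spark reports "hive" as its catalog implementation when Hive metastore support is enabled.
  def usingHiveCatalog(spark: SparkSession): Boolean =
    spark.sparkContext.getConf.get("spark.sql.catalogImplementation", "in-memory") == "hive"
}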
@@ -18,20 +18,24 @@
 package org.apache.hudi.functional
 
 import org.apache.hadoop.fs.FileSystem
+import org.apache.hudi.HoodieConversionUtils.toJavaOption
 import org.apache.hudi.common.config.HoodieMetadataConfig
 import org.apache.hudi.common.model.HoodieRecord
 import org.apache.hudi.common.table.timeline.HoodieInstant
 import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient, TableSchemaResolver}
 import org.apache.hudi.common.testutils.HoodieTestDataGenerator
 import org.apache.hudi.common.testutils.RawTripTestPayload.{deleteRecordsToStrings, recordsToStrings}
+import org.apache.hudi.common.util
 import org.apache.hudi.config.HoodieWriteConfig
 import org.apache.hudi.exception.{HoodieException, HoodieUpsertException}
 import org.apache.hudi.keygen._
 import org.apache.hudi.keygen.constant.KeyGeneratorOptions.Config
 import org.apache.hudi.testutils.HoodieClientTestBase
+import org.apache.hudi.util.JFunction
 import org.apache.hudi.{AvroConversionUtils, DataSourceReadOptions, DataSourceWriteOptions, HoodieDataSourceHelpers}
 import org.apache.spark.sql._
 import org.apache.spark.sql.functions.{col, concat, lit, udf}
+import org.apache.spark.sql.hudi.HoodieSparkSessionExtension
 import org.apache.spark.sql.types._
 import org.joda.time.DateTime
 import org.joda.time.format.DateTimeFormat
@@ -42,6 +46,7 @@ import org.junit.jupiter.params.ParameterizedTest
 import org.junit.jupiter.params.provider.{CsvSource, ValueSource}
 
 import java.sql.{Date, Timestamp}
+import java.util.function.Consumer
 import scala.collection.JavaConversions._
 import scala.collection.JavaConverters._
 
@@ -67,6 +72,12 @@ class TestCOWDataSource extends HoodieClientTestBase {
   val verificationCol: String = "driver"
   val updatedVerificationVal: String = "driver_update"
 
+  override def getSparkSessionExtensionsInjector: util.Option[Consumer[SparkSessionExtensions]] =
+    toJavaOption(
+      Some(
+        JFunction.toJava((receiver: SparkSessionExtensions) => new HoodieSparkSessionExtension().apply(receiver)))
+    )
+
   @BeforeEach override def setUp() {
     initPath()
     initSparkContexts()
@@ -25,6 +25,7 @@ import org.apache.spark.SparkConf
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.{Row, SparkSession}
 import org.apache.spark.util.Utils
+import org.joda.time.DateTimeZone
 import org.scalactic.source
 import org.scalatest.{BeforeAndAfterAll, FunSuite, Tag}
 
@@ -40,7 +41,10 @@ class HoodieSparkSqlTestBase extends FunSuite with BeforeAndAfterAll {
     dir
   }
 
-  TimeZone.setDefault(DateTimeUtils.getTimeZone("CTT"))
+  // NOTE: We have to fix the timezone to make sure all date-/timestamp-bound utilities output
+  //       is consistent with the fixtures
+  DateTimeZone.setDefault(DateTimeZone.UTC)
+  TimeZone.setDefault(DateTimeUtils.getTimeZone("UTC"))
   protected lazy val spark: SparkSession = SparkSession.builder()
     .master("local[1]")
     .appName("hoodie sql test")
@@ -50,7 +54,7 @@ class HoodieSparkSqlTestBase extends FunSuite with BeforeAndAfterAll {
     .config("hoodie.upsert.shuffle.parallelism", "4")
     .config("hoodie.delete.shuffle.parallelism", "4")
     .config("spark.sql.warehouse.dir", sparkWareHouse.getCanonicalPath)
-    .config("spark.sql.session.timeZone", "CTT")
+    .config("spark.sql.session.timeZone", "UTC")
     .config(sparkConf())
     .getOrCreate()
 