[HUDI-3254] Introduce HoodieCatalog to manage tables for Spark Datasource V2 (#4611)
This commit is contained in:
@@ -19,10 +19,8 @@ package org.apache.spark.sql.hudi
|
||||
|
||||
import org.apache.hudi.SparkAdapterSupport
|
||||
import org.apache.spark.sql.catalyst.TableIdentifier
|
||||
import org.apache.spark.sql.catalyst.expressions.{And, Cast, Expression, Literal}
|
||||
import org.apache.spark.sql.catalyst.expressions.{And, Expression}
|
||||
import org.apache.spark.sql.catalyst.plans.logical.{MergeIntoTable, SubqueryAlias}
|
||||
import org.apache.spark.sql.internal.SQLConf
|
||||
import org.apache.spark.sql.types.{DataType, NullType}
|
||||
|
||||
object HoodieSqlUtils extends SparkAdapterSupport {
|
||||
|
||||
@@ -50,12 +48,4 @@ object HoodieSqlUtils extends SparkAdapterSupport {
|
||||
case exp => Seq(exp)
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Coerces `child` to `dataType`, adding a cast only when one is required.
 *
 *  - A literal of `NullType` is rebuilt as a literal of the target type
 *    carrying the same value, so no `Cast` node is introduced for it.
 *  - An expression whose data type already equals `dataType` is returned as is.
 *  - Anything else is wrapped in a `Cast` that uses the session-local time zone
 *    taken from `conf`.
 *
 * @param child    the expression to coerce
 * @param dataType the target data type
 * @param conf     SQL configuration supplying the session-local time zone
 * @return an expression whose data type is `dataType`
 */
def castIfNeeded(child: Expression, dataType: DataType, conf: SQLConf): Expression = child match {
  case Literal(value, NullType) => Literal(value, dataType)
  case sameTyped if sameTyped.dataType == dataType => sameTyped
  case other => Cast(other, dataType, Option(conf.sessionLocalTimeZone))
}
|
||||
}
|
||||
|
||||
@@ -17,10 +17,10 @@
|
||||
|
||||
package org.apache.spark.sql.hudi.analysis
|
||||
|
||||
import org.apache.hudi.{HoodieSparkUtils, SparkAdapterSupport}
|
||||
import org.apache.hudi.DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL
|
||||
import org.apache.hudi.common.model.HoodieRecord
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient
|
||||
import org.apache.hudi.common.util.ReflectionUtils
|
||||
import org.apache.hudi.{HoodieSparkUtils, SparkAdapterSupport}
|
||||
import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedStar}
|
||||
import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeReference, Expression, Literal, NamedExpression}
|
||||
import org.apache.spark.sql.catalyst.plans.Inner
|
||||
@@ -28,10 +28,10 @@ import org.apache.spark.sql.catalyst.plans.logical._
|
||||
import org.apache.spark.sql.catalyst.rules.Rule
|
||||
import org.apache.spark.sql.execution.command._
|
||||
import org.apache.spark.sql.execution.datasources.{CreateTable, LogicalRelation}
|
||||
import org.apache.spark.sql.hudi.HoodieSqlCommonUtils.{getTableIdentifier, getTableLocation, isHoodieTable, removeMetaFields, tableExistsInPath}
|
||||
import org.apache.spark.sql.hudi.HoodieSqlCommonUtils.{getTableIdentifier, removeMetaFields}
|
||||
import org.apache.spark.sql.hudi.HoodieSqlUtils._
|
||||
import org.apache.spark.sql.hudi.command._
|
||||
import org.apache.spark.sql.hudi.{HoodieOptionConfig, HoodieSqlCommonUtils, HoodieSqlUtils}
|
||||
import org.apache.spark.sql.hudi.{HoodieOptionConfig, HoodieSqlCommonUtils}
|
||||
import org.apache.spark.sql.types.StringType
|
||||
import org.apache.spark.sql.{AnalysisException, SparkSession}
|
||||
|
||||
@@ -42,12 +42,39 @@ object HoodieAnalysis {
|
||||
Seq(
|
||||
session => HoodieResolveReferences(session),
|
||||
session => HoodieAnalysis(session)
|
||||
)
|
||||
) ++ extraResolutionRules()
|
||||
|
||||
def customPostHocResolutionRules(): Seq[SparkSession => Rule[LogicalPlan]] =
|
||||
Seq(
|
||||
session => HoodiePostAnalysisRule(session)
|
||||
)
|
||||
) ++ extraPostHocResolutionRules()
|
||||
|
||||
/**
 * Additional resolution rules that are only registered when
 * `HoodieSparkUtils.beforeSpark3_2()` is false.
 *
 * Each returned factory instantiates its rule through
 * `ReflectionUtils.loadClass`, passing the session as the constructor
 * argument. The class is looked up by name when the factory is invoked,
 * not when this method is called.
 * NOTE(review): presumably reflection is used because these classes are not on
 * the classpath of older Spark builds - confirm.
 *
 * @return factories for `HoodieSpark3Analysis` and `HoodieSpark3ResolveReferences`
 *         (in that order), or an empty sequence before Spark 3.2
 */
def extraResolutionRules(): Seq[SparkSession => Rule[LogicalPlan]] = {
  if (HoodieSparkUtils.beforeSpark3_2()) {
    Seq.empty
  } else {
    // Order matters to callers: analysis rule first, then reference resolution.
    val ruleClassNames = Seq(
      "org.apache.spark.sql.hudi.analysis.HoodieSpark3Analysis",
      "org.apache.spark.sql.hudi.analysis.HoodieSpark3ResolveReferences")

    ruleClassNames.map { ruleClassName =>
      (session: SparkSession) =>
        ReflectionUtils.loadClass(ruleClassName, session).asInstanceOf[Rule[LogicalPlan]]
    }
  }
}
|
||||
|
||||
/**
 * Additional post-hoc resolution rules that are only registered when
 * `HoodieSparkUtils.beforeSpark3_2()` is false.
 *
 * The returned factory instantiates `HoodieSpark3PostAnalysisRule` through
 * `ReflectionUtils.loadClass`, passing the session as the constructor
 * argument; the lookup happens when the factory is invoked.
 *
 * @return a single-element sequence with the Spark 3 post-analysis rule
 *         factory, or an empty sequence before Spark 3.2
 */
def extraPostHocResolutionRules(): Seq[SparkSession => Rule[LogicalPlan]] =
  if (HoodieSparkUtils.beforeSpark3_2()) {
    Seq.empty
  } else {
    val postAnalysisRuleClass = "org.apache.spark.sql.hudi.analysis.HoodieSpark3PostAnalysisRule"
    Seq(
      (session: SparkSession) =>
        ReflectionUtils.loadClass(postAnalysisRuleClass, session).asInstanceOf[Rule[LogicalPlan]]
    )
  }
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -61,36 +88,36 @@ case class HoodieAnalysis(sparkSession: SparkSession) extends Rule[LogicalPlan]
|
||||
plan match {
|
||||
// Convert to MergeIntoHoodieTableCommand
|
||||
case m @ MergeIntoTable(target, _, _, _, _)
|
||||
if m.resolved && isHoodieTable(target, sparkSession) =>
|
||||
if m.resolved && sparkAdapter.isHoodieTable(target, sparkSession) =>
|
||||
MergeIntoHoodieTableCommand(m)
|
||||
|
||||
// Convert to UpdateHoodieTableCommand
|
||||
case u @ UpdateTable(table, _, _)
|
||||
if u.resolved && isHoodieTable(table, sparkSession) =>
|
||||
if u.resolved && sparkAdapter.isHoodieTable(table, sparkSession) =>
|
||||
UpdateHoodieTableCommand(u)
|
||||
|
||||
// Convert to DeleteHoodieTableCommand
|
||||
case d @ DeleteFromTable(table, _)
|
||||
if d.resolved && isHoodieTable(table, sparkSession) =>
|
||||
if d.resolved && sparkAdapter.isHoodieTable(table, sparkSession) =>
|
||||
DeleteHoodieTableCommand(d)
|
||||
|
||||
// Convert to InsertIntoHoodieTableCommand
|
||||
case l if sparkAdapter.isInsertInto(l) =>
|
||||
val (table, partition, query, overwrite, _) = sparkAdapter.getInsertIntoChildren(l).get
|
||||
table match {
|
||||
case relation: LogicalRelation if isHoodieTable(relation, sparkSession) =>
|
||||
case relation: LogicalRelation if sparkAdapter.isHoodieTable(relation, sparkSession) =>
|
||||
new InsertIntoHoodieTableCommand(relation, query, partition, overwrite)
|
||||
case _ =>
|
||||
l
|
||||
}
|
||||
// Convert to CreateHoodieTableAsSelectCommand
|
||||
case CreateTable(table, mode, Some(query))
|
||||
if query.resolved && isHoodieTable(table) =>
|
||||
if query.resolved && sparkAdapter.isHoodieTable(table) =>
|
||||
CreateHoodieTableAsSelectCommand(table, mode, query)
|
||||
|
||||
// Convert to CompactionHoodieTableCommand
|
||||
case CompactionTable(table, operation, options)
|
||||
if table.resolved && isHoodieTable(table, sparkSession) =>
|
||||
if table.resolved && sparkAdapter.isHoodieTable(table, sparkSession) =>
|
||||
val tableId = getTableIdentifier(table)
|
||||
val catalogTable = sparkSession.sessionState.catalog.getTableMetadata(tableId)
|
||||
CompactionHoodieTableCommand(catalogTable, operation, options)
|
||||
@@ -99,7 +126,7 @@ case class HoodieAnalysis(sparkSession: SparkSession) extends Rule[LogicalPlan]
|
||||
CompactionHoodiePathCommand(path, operation, options)
|
||||
// Convert to CompactionShowOnTable
|
||||
case CompactionShowOnTable(table, limit)
|
||||
if isHoodieTable(table, sparkSession) =>
|
||||
if sparkAdapter.isHoodieTable(table, sparkSession) =>
|
||||
val tableId = getTableIdentifier(table)
|
||||
val catalogTable = sparkSession.sessionState.catalog.getTableMetadata(tableId)
|
||||
CompactionShowHoodieTableCommand(catalogTable, limit)
|
||||
@@ -122,7 +149,7 @@ case class HoodieResolveReferences(sparkSession: SparkSession) extends Rule[Logi
|
||||
def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperatorsUp {
|
||||
// Resolve merge into
|
||||
case mergeInto @ MergeIntoTable(target, source, mergeCondition, matchedActions, notMatchedActions)
|
||||
if isHoodieTable(target, sparkSession) && target.resolved =>
|
||||
if sparkAdapter.isHoodieTable(target, sparkSession) && target.resolved =>
|
||||
|
||||
val resolver = sparkSession.sessionState.conf.resolver
|
||||
val resolvedSource = analyzer.execute(source)
|
||||
@@ -277,7 +304,7 @@ case class HoodieResolveReferences(sparkSession: SparkSession) extends Rule[Logi
|
||||
|
||||
// Resolve update table
|
||||
case UpdateTable(table, assignments, condition)
|
||||
if isHoodieTable(table, sparkSession) && table.resolved =>
|
||||
if sparkAdapter.isHoodieTable(table, sparkSession) && table.resolved =>
|
||||
// Resolve condition
|
||||
val resolvedCondition = condition.map(resolveExpressionFrom(table)(_))
|
||||
// Resolve assignments
|
||||
@@ -291,7 +318,7 @@ case class HoodieResolveReferences(sparkSession: SparkSession) extends Rule[Logi
|
||||
|
||||
// Resolve Delete Table
|
||||
case DeleteFromTable(table, condition)
|
||||
if isHoodieTable(table, sparkSession) && table.resolved =>
|
||||
if sparkAdapter.isHoodieTable(table, sparkSession) && table.resolved =>
|
||||
// Resolve condition
|
||||
val resolvedCondition = condition.map(resolveExpressionFrom(table)(_))
|
||||
// Return the resolved DeleteTable
|
||||
@@ -303,7 +330,7 @@ case class HoodieResolveReferences(sparkSession: SparkSession) extends Rule[Logi
|
||||
val (table, partition, query, overwrite, ifPartitionNotExists) =
|
||||
sparkAdapter.getInsertIntoChildren(l).get
|
||||
|
||||
if (isHoodieTable(table, sparkSession) && query.resolved &&
|
||||
if (sparkAdapter.isHoodieTable(table, sparkSession) && query.resolved &&
|
||||
!containUnResolvedStar(query) &&
|
||||
!checkAlreadyAppendMetaField(query)) {
|
||||
val metaFields = HoodieRecord.HOODIE_META_COLUMNS.asScala.map(
|
||||
@@ -401,37 +428,37 @@ case class HoodiePostAnalysisRule(sparkSession: SparkSession) extends Rule[Logic
|
||||
plan match {
|
||||
// Rewrite the CreateDataSourceTableCommand to CreateHoodieTableCommand
|
||||
case CreateDataSourceTableCommand(table, ignoreIfExists)
|
||||
if isHoodieTable(table) =>
|
||||
if sparkAdapter.isHoodieTable(table) =>
|
||||
CreateHoodieTableCommand(table, ignoreIfExists)
|
||||
// Rewrite the DropTableCommand to DropHoodieTableCommand
|
||||
case DropTableCommand(tableName, ifExists, isView, purge)
|
||||
if isHoodieTable(tableName, sparkSession) =>
|
||||
if sparkAdapter.isHoodieTable(tableName, sparkSession) =>
|
||||
DropHoodieTableCommand(tableName, ifExists, isView, purge)
|
||||
// Rewrite the AlterTableDropPartitionCommand to AlterHoodieTableDropPartitionCommand
|
||||
case AlterTableDropPartitionCommand(tableName, specs, ifExists, purge, retainData)
|
||||
if isHoodieTable(tableName, sparkSession) =>
|
||||
if sparkAdapter.isHoodieTable(tableName, sparkSession) =>
|
||||
AlterHoodieTableDropPartitionCommand(tableName, specs, ifExists, purge, retainData)
|
||||
// Rewrite the AlterTableRenameCommand to AlterHoodieTableRenameCommand
|
||||
// Rewrite the AlterTableAddColumnsCommand to AlterHoodieTableAddColumnsCommand
|
||||
case AlterTableAddColumnsCommand(tableId, colsToAdd)
|
||||
if isHoodieTable(tableId, sparkSession) =>
|
||||
if sparkAdapter.isHoodieTable(tableId, sparkSession) =>
|
||||
AlterHoodieTableAddColumnsCommand(tableId, colsToAdd)
|
||||
// Rewrite the AlterTableRenameCommand to AlterHoodieTableRenameCommand
|
||||
case AlterTableRenameCommand(oldName, newName, isView)
|
||||
if !isView && isHoodieTable(oldName, sparkSession) =>
|
||||
if !isView && sparkAdapter.isHoodieTable(oldName, sparkSession) =>
|
||||
new AlterHoodieTableRenameCommand(oldName, newName, isView)
|
||||
// Rewrite the AlterTableChangeColumnCommand to AlterHoodieTableChangeColumnCommand
|
||||
case AlterTableChangeColumnCommand(tableName, columnName, newColumn)
|
||||
if isHoodieTable(tableName, sparkSession) =>
|
||||
if sparkAdapter.isHoodieTable(tableName, sparkSession) =>
|
||||
AlterHoodieTableChangeColumnCommand(tableName, columnName, newColumn)
|
||||
// SPARK-34238: the definition of ShowPartitionsCommand has been changed in Spark3.2.
|
||||
// Match on the class type instead of calling the `unapply` method.
|
||||
case s: ShowPartitionsCommand
|
||||
if isHoodieTable(s.tableName, sparkSession) =>
|
||||
if sparkAdapter.isHoodieTable(s.tableName, sparkSession) =>
|
||||
ShowHoodieTablePartitionsCommand(s.tableName, s.spec)
|
||||
// Rewrite TruncateTableCommand to TruncateHoodieTableCommand
|
||||
case TruncateTableCommand(tableName, partitionSpec)
|
||||
if isHoodieTable(tableName, sparkSession) =>
|
||||
if sparkAdapter.isHoodieTable(tableName, sparkSession) =>
|
||||
new TruncateHoodieTableCommand(tableName, partitionSpec)
|
||||
case _ => plan
|
||||
}
|
||||
|
||||
@@ -32,7 +32,6 @@ import org.apache.spark.sql.catalyst.expressions.{Alias, Literal}
|
||||
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project}
|
||||
import org.apache.spark.sql.execution.datasources.LogicalRelation
|
||||
import org.apache.spark.sql.hudi.HoodieSqlCommonUtils._
|
||||
import org.apache.spark.sql.hudi.HoodieSqlUtils.castIfNeeded
|
||||
import org.apache.spark.sql.internal.SQLConf
|
||||
import org.apache.spark.sql.{Dataset, Row, SaveMode, SparkSession}
|
||||
|
||||
|
||||
@@ -32,7 +32,7 @@ import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable
|
||||
import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeReference, BoundReference, Cast, EqualTo, Expression, Literal}
|
||||
import org.apache.spark.sql.catalyst.plans.logical._
|
||||
import org.apache.spark.sql.hudi.HoodieSqlCommonUtils._
|
||||
import org.apache.spark.sql.hudi.HoodieSqlUtils.{castIfNeeded, getMergeIntoTargetTableId}
|
||||
import org.apache.spark.sql.hudi.HoodieSqlUtils.getMergeIntoTargetTableId
|
||||
import org.apache.spark.sql.hudi.SerDeUtils
|
||||
import org.apache.spark.sql.hudi.command.payload.ExpressionPayload
|
||||
import org.apache.spark.sql.hudi.command.payload.ExpressionPayload._
|
||||
|
||||
@@ -29,7 +29,6 @@ import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable
|
||||
import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, Expression}
|
||||
import org.apache.spark.sql.catalyst.plans.logical.{Assignment, UpdateTable}
|
||||
import org.apache.spark.sql.hudi.HoodieSqlCommonUtils._
|
||||
import org.apache.spark.sql.hudi.HoodieSqlUtils.castIfNeeded
|
||||
import org.apache.spark.sql.internal.SQLConf
|
||||
import org.apache.spark.sql.types.StructField
|
||||
|
||||
|
||||
@@ -32,7 +32,7 @@ import org.apache.hudi.functional.TestBootstrap
|
||||
import org.apache.hudi.hive.HiveSyncConfig
|
||||
import org.apache.hudi.keygen.{ComplexKeyGenerator, NonpartitionedKeyGenerator, SimpleKeyGenerator}
|
||||
import org.apache.hudi.testutils.DataSourceTestUtils
|
||||
import org.apache.spark.SparkContext
|
||||
import org.apache.spark.{SparkConf, SparkContext}
|
||||
import org.apache.spark.api.java.JavaSparkContext
|
||||
import org.apache.spark.sql._
|
||||
import org.apache.spark.sql.functions.{expr, lit}
|
||||
@@ -94,11 +94,17 @@ class TestHoodieSparkSqlWriter {
|
||||
* Utility method for initializing the spark context.
|
||||
*/
|
||||
def initSparkContext(): Unit = {
|
||||
val sparkConf = new SparkConf()
|
||||
if (!HoodieSparkUtils.beforeSpark3_2()) {
|
||||
sparkConf.set("spark.sql.catalog.spark_catalog",
|
||||
"org.apache.spark.sql.hudi.catalog.HoodieCatalog")
|
||||
}
|
||||
spark = SparkSession.builder()
|
||||
.appName(hoodieFooTableName)
|
||||
.master("local[2]")
|
||||
.withExtensions(new HoodieSparkSessionExtension)
|
||||
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
|
||||
.config(sparkConf)
|
||||
.getOrCreate()
|
||||
sc = spark.sparkContext
|
||||
sc.setLogLevel("ERROR")
|
||||
|
||||
@@ -18,8 +18,10 @@
|
||||
package org.apache.spark.sql.hudi
|
||||
|
||||
import org.apache.hadoop.fs.Path
|
||||
import org.apache.hudi.HoodieSparkUtils
|
||||
import org.apache.hudi.common.fs.FSUtils
|
||||
import org.apache.log4j.Level
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.catalyst.util.DateTimeUtils
|
||||
import org.apache.spark.sql.{Row, SparkSession}
|
||||
import org.apache.spark.util.Utils
|
||||
@@ -49,10 +51,20 @@ class TestHoodieSqlBase extends FunSuite with BeforeAndAfterAll {
|
||||
.config("hoodie.delete.shuffle.parallelism", "4")
|
||||
.config("spark.sql.warehouse.dir", sparkWareHouse.getCanonicalPath)
|
||||
.config("spark.sql.session.timeZone", "CTT")
|
||||
.config(sparkConf())
|
||||
.getOrCreate()
|
||||
|
||||
private var tableId = 0
|
||||
|
||||
/**
 * Builds the extra Spark configuration used by the test session.
 *
 * When `HoodieSparkUtils.beforeSpark3_2()` is false, the session catalog
 * (`spark.sql.catalog.spark_catalog`) is set to `HoodieCatalog`; otherwise
 * the configuration is returned without that setting.
 *
 * @return a fresh `SparkConf`, carrying the catalog setting where applicable
 */
def sparkConf(): SparkConf = {
  val conf = new SparkConf()
  if (HoodieSparkUtils.beforeSpark3_2()) {
    conf
  } else {
    // SparkConf.set returns the same instance, so this yields the updated conf.
    conf.set(
      "spark.sql.catalog.spark_catalog",
      "org.apache.spark.sql.hudi.catalog.HoodieCatalog")
  }
}
|
||||
|
||||
protected def withTempDir(f: File => Unit): Unit = {
|
||||
val tempDir = Utils.createTempDir()
|
||||
try f(tempDir) finally {
|
||||
|
||||
@@ -87,7 +87,7 @@ class TestMergeIntoTable extends TestHoodieSqlBase {
|
||||
| on s0.id = $tableName.id
|
||||
| when matched then update set
|
||||
| id = s0.id, name = s0.name, price = s0.price + $tableName.price, ts = s0.ts
|
||||
| when not matched and id % 2 = 0 then insert *
|
||||
| when not matched and s0.id % 2 = 0 then insert *
|
||||
""".stripMargin)
|
||||
checkAnswer(s"select id, name, price, ts from $tableName")(
|
||||
Seq(1, "a1", 30.0, 1002),
|
||||
@@ -102,9 +102,9 @@ class TestMergeIntoTable extends TestHoodieSqlBase {
|
||||
| select 1 as id, 'a1' as name, 12 as price, 1003 as ts
|
||||
| ) s0
|
||||
| on s0.id = $tableName.id
|
||||
| when matched and id != 1 then update set
|
||||
| when matched and s0.id != 1 then update set
|
||||
| id = s0.id, name = s0.name, price = s0.price, ts = s0.ts
|
||||
| when matched and id = 1 then delete
|
||||
| when matched and s0.id = 1 then delete
|
||||
| when not matched then insert *
|
||||
""".stripMargin)
|
||||
val cnt = spark.sql(s"select * from $tableName where id = 1").count()
|
||||
@@ -178,7 +178,7 @@ class TestMergeIntoTable extends TestHoodieSqlBase {
|
||||
| )
|
||||
| ) s0
|
||||
| on s0.s_id = t0.id
|
||||
| when matched and ts = 1001 then update set id = s0.s_id, name = t0.name, price =
|
||||
| when matched and s0.ts = 1001 then update set id = s0.s_id, name = t0.name, price =
|
||||
| s0.price, ts = s0.ts
|
||||
""".stripMargin
|
||||
)
|
||||
@@ -233,7 +233,7 @@ class TestMergeIntoTable extends TestHoodieSqlBase {
|
||||
| select 1 as id, 'a1' as name, 12 as price, 1001 as ts, '2021-03-21' as dt
|
||||
| ) as s0
|
||||
| on t0.id = s0.id
|
||||
| when matched and id % 2 = 0 then update set *
|
||||
| when matched and s0.id % 2 = 0 then update set *
|
||||
""".stripMargin
|
||||
)
|
||||
checkAnswer(s"select id,name,price,dt from $tableName")(
|
||||
@@ -488,7 +488,7 @@ class TestMergeIntoTable extends TestHoodieSqlBase {
|
||||
|merge into $targetTable t0
|
||||
|using $sourceTable s0
|
||||
|on t0.id = s0.id
|
||||
|when matched and cast(_ts as string) > '1000' then update set *
|
||||
|when matched and cast(s0._ts as string) > '1000' then update set *
|
||||
""".stripMargin)
|
||||
checkAnswer(s"select id, name, price, _ts from $targetTable")(
|
||||
Seq(1, "a1", 12, 1001)
|
||||
@@ -512,7 +512,7 @@ class TestMergeIntoTable extends TestHoodieSqlBase {
|
||||
|using $sourceTable s0
|
||||
|on t0.id = s0.id
|
||||
|when matched then update set *
|
||||
|when not matched and name = 'a2' then insert *
|
||||
|when not matched and s0.name = 'a2' then insert *
|
||||
""".stripMargin)
|
||||
checkAnswer(s"select id, name, price, _ts from $targetTable order by id")(
|
||||
Seq(1, "a1", 12, 1001),
|
||||
|
||||
Reference in New Issue
Block a user