[HUDI-4178] Addressing performance regressions in Spark DataSourceV2 Integration (#5737)
There are multiple issues with our current DataSource V2 integration: because we advertise Hudi tables as V2, Spark expects them to implement certain APIs that are not implemented at the moment; instead, we rely on a custom resolution rule (in HoodieSpark3Analysis) to manually fall back to the V1 APIs. This commit fixes the issue by reverting the DSv2 APIs and making Spark use V1, except for the schema evolution logic.
This commit is contained in:
@@ -17,19 +17,19 @@
|
||||
|
||||
package org.apache.hudi
|
||||
|
||||
import org.apache.hudi.exception.HoodieException
|
||||
import org.apache.spark.sql.SparkSession
|
||||
import org.apache.spark.sql.connector.catalog.{Table, TableProvider}
|
||||
import org.apache.spark.sql.connector.expressions.Transform
|
||||
import org.apache.spark.sql.hudi.catalog.HoodieInternalV2Table
|
||||
import org.apache.spark.sql.sources.DataSourceRegister
|
||||
import org.apache.spark.sql.types.StructType
|
||||
import org.apache.spark.sql.util.CaseInsensitiveStringMap
|
||||
|
||||
class Spark3DefaultSource extends DefaultSource with DataSourceRegister with TableProvider {
|
||||
/**
|
||||
* NOTE: PLEASE READ CAREFULLY
|
||||
* All of Spark DataSourceV2 APIs are deliberately disabled to make sure
|
||||
* there are no regressions in performance
|
||||
* Please check out HUDI-4178 for more details
|
||||
*/
|
||||
class Spark3DefaultSource extends DefaultSource with DataSourceRegister /* with TableProvider */ {
|
||||
|
||||
override def shortName(): String = "hudi"
|
||||
|
||||
/*
|
||||
def inferSchema: StructType = new StructType()
|
||||
|
||||
override def inferSchema(options: CaseInsensitiveStringMap): StructType = inferSchema
|
||||
@@ -43,4 +43,5 @@ class Spark3DefaultSource extends DefaultSource with DataSourceRegister with Tab
|
||||
|
||||
HoodieInternalV2Table(SparkSession.active, path)
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
@@ -67,19 +67,6 @@ class Spark3_2Adapter extends BaseSpark3Adapter {
|
||||
)
|
||||
}
|
||||
|
||||
override def createResolveHudiAlterTableCommand(sparkSession: SparkSession): Rule[LogicalPlan] = {
|
||||
if (SPARK_VERSION.startsWith("3.2")) {
|
||||
val loadClassName = "org.apache.spark.sql.hudi.ResolveHudiAlterTableCommandSpark32"
|
||||
val clazz = Class.forName(loadClassName, true, Thread.currentThread().getContextClassLoader)
|
||||
val ctor = clazz.getConstructors.head
|
||||
ctor.newInstance(sparkSession).asInstanceOf[Rule[LogicalPlan]]
|
||||
} else {
|
||||
new Rule[LogicalPlan] {
|
||||
override def apply(plan: LogicalPlan): LogicalPlan = plan
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
override def createHoodieParquetFileFormat(appendPartitionValues: Boolean): Option[ParquetFileFormat] = {
|
||||
Some(new Spark32HoodieParquetFileFormat(appendPartitionValues))
|
||||
}
|
||||
|
||||
@@ -17,12 +17,12 @@
|
||||
|
||||
package org.apache.spark.sql.hudi
|
||||
|
||||
import org.apache.hudi.common.config.HoodieCommonConfig
|
||||
import org.apache.hudi.config.HoodieWriteConfig
|
||||
import org.apache.hudi.internal.schema.action.TableChange.ColumnChangeID
|
||||
import org.apache.spark.sql.catalyst.analysis.ResolvedTable
|
||||
import org.apache.spark.sql.catalyst.catalog.CatalogTable
|
||||
import org.apache.spark.sql.SparkSession
|
||||
import org.apache.spark.sql.catalyst.plans.logical.{AddColumns, AlterColumn, DropColumns, LogicalPlan, RenameColumn, ReplaceColumns, SetTableProperties, UnsetTableProperties}
|
||||
import org.apache.spark.sql.catalyst.analysis.ResolvedTable
|
||||
import org.apache.spark.sql.catalyst.plans.logical._
|
||||
import org.apache.spark.sql.catalyst.rules.Rule
|
||||
import org.apache.spark.sql.hudi.catalog.HoodieInternalV2Table
|
||||
import org.apache.spark.sql.hudi.command.{AlterTableCommand => HudiAlterTableCommand}
|
||||
@@ -33,33 +33,38 @@ import org.apache.spark.sql.hudi.command.{AlterTableCommand => HudiAlterTableCom
|
||||
*/
|
||||
class ResolveHudiAlterTableCommandSpark32(sparkSession: SparkSession) extends Rule[LogicalPlan] {
|
||||
|
||||
def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsUp {
|
||||
case set @ SetTableProperties(asTable(table), _) if schemaEvolutionEnabled && set.resolved =>
|
||||
HudiAlterTableCommand(table, set.changes, ColumnChangeID.PROPERTY_CHANGE)
|
||||
case unSet @ UnsetTableProperties(asTable(table), _, _) if schemaEvolutionEnabled && unSet.resolved =>
|
||||
HudiAlterTableCommand(table, unSet.changes, ColumnChangeID.PROPERTY_CHANGE)
|
||||
case drop @ DropColumns(asTable(table), _) if schemaEvolutionEnabled && drop.resolved =>
|
||||
HudiAlterTableCommand(table, drop.changes, ColumnChangeID.DELETE)
|
||||
case add @ AddColumns(asTable(table), _) if schemaEvolutionEnabled && add.resolved =>
|
||||
HudiAlterTableCommand(table, add.changes, ColumnChangeID.ADD)
|
||||
case renameColumn @ RenameColumn(asTable(table), _, _) if schemaEvolutionEnabled && renameColumn.resolved=>
|
||||
HudiAlterTableCommand(table, renameColumn.changes, ColumnChangeID.UPDATE)
|
||||
case alter @ AlterColumn(asTable(table), _, _, _, _, _) if schemaEvolutionEnabled && alter.resolved =>
|
||||
HudiAlterTableCommand(table, alter.changes, ColumnChangeID.UPDATE)
|
||||
case replace @ ReplaceColumns(asTable(table), _) if schemaEvolutionEnabled && replace.resolved =>
|
||||
HudiAlterTableCommand(table, replace.changes, ColumnChangeID.REPLACE)
|
||||
def apply(plan: LogicalPlan): LogicalPlan = {
|
||||
if (schemaEvolutionEnabled) {
|
||||
plan.resolveOperatorsUp {
|
||||
case set@SetTableProperties(ResolvedHoodieV2TablePlan(t), _) if set.resolved =>
|
||||
HudiAlterTableCommand(t.v1Table, set.changes, ColumnChangeID.PROPERTY_CHANGE)
|
||||
case unSet@UnsetTableProperties(ResolvedHoodieV2TablePlan(t), _, _) if unSet.resolved =>
|
||||
HudiAlterTableCommand(t.v1Table, unSet.changes, ColumnChangeID.PROPERTY_CHANGE)
|
||||
case drop@DropColumns(ResolvedHoodieV2TablePlan(t), _) if drop.resolved =>
|
||||
HudiAlterTableCommand(t.v1Table, drop.changes, ColumnChangeID.DELETE)
|
||||
case add@AddColumns(ResolvedHoodieV2TablePlan(t), _) if add.resolved =>
|
||||
HudiAlterTableCommand(t.v1Table, add.changes, ColumnChangeID.ADD)
|
||||
case renameColumn@RenameColumn(ResolvedHoodieV2TablePlan(t), _, _) if renameColumn.resolved =>
|
||||
HudiAlterTableCommand(t.v1Table, renameColumn.changes, ColumnChangeID.UPDATE)
|
||||
case alter@AlterColumn(ResolvedHoodieV2TablePlan(t), _, _, _, _, _) if alter.resolved =>
|
||||
HudiAlterTableCommand(t.v1Table, alter.changes, ColumnChangeID.UPDATE)
|
||||
case replace@ReplaceColumns(ResolvedHoodieV2TablePlan(t), _) if replace.resolved =>
|
||||
HudiAlterTableCommand(t.v1Table, replace.changes, ColumnChangeID.REPLACE)
|
||||
}
|
||||
} else {
|
||||
plan
|
||||
}
|
||||
}
|
||||
|
||||
private def schemaEvolutionEnabled(): Boolean = sparkSession
|
||||
.sessionState.conf.getConfString(HoodieWriteConfig.SCHEMA_EVOLUTION_ENABLE.key(), "false").toBoolean
|
||||
private def schemaEvolutionEnabled: Boolean =
|
||||
sparkSession.sessionState.conf.getConfString(HoodieCommonConfig.SCHEMA_EVOLUTION_ENABLE.key,
|
||||
HoodieCommonConfig.SCHEMA_EVOLUTION_ENABLE.defaultValue.toString).toBoolean
|
||||
|
||||
object asTable {
|
||||
def unapply(a: LogicalPlan): Option[CatalogTable] = {
|
||||
a match {
|
||||
case ResolvedTable(_, _, table: HoodieInternalV2Table, _) =>
|
||||
table.catalogTable
|
||||
case _ =>
|
||||
None
|
||||
object ResolvedHoodieV2TablePlan {
|
||||
def unapply(plan: LogicalPlan): Option[HoodieInternalV2Table] = {
|
||||
plan match {
|
||||
case ResolvedTable(_, _, v2Table: HoodieInternalV2Table, _) => Some(v2Table)
|
||||
case _ => None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,72 +17,77 @@
|
||||
|
||||
package org.apache.spark.sql.hudi.analysis
|
||||
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient
|
||||
import org.apache.hudi.{DefaultSource, SparkAdapterSupport}
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient
|
||||
import org.apache.spark.sql.catalyst.TableIdentifier
|
||||
import org.apache.spark.sql.catalyst.analysis.{ResolvedTable, UnresolvedPartitionSpec}
|
||||
import org.apache.spark.sql.catalyst.catalog.{CatalogTable, HoodieCatalogTable}
|
||||
import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute}
|
||||
import org.apache.spark.sql.catalyst.plans.logical._
|
||||
import org.apache.spark.sql.catalyst.rules.Rule
|
||||
import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.IdentifierHelper
|
||||
import org.apache.spark.sql.connector.catalog.{Table, V1Table}
|
||||
import org.apache.spark.sql.execution.datasources.LogicalRelation
|
||||
import org.apache.spark.sql.execution.datasources.PreWriteCheck.failAnalysis
|
||||
import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, V2SessionCatalog}
|
||||
import org.apache.spark.sql.hudi.{HoodieSqlCommonUtils, ProvidesHoodieConfig}
|
||||
import org.apache.spark.sql.hudi.HoodieSqlCommonUtils.{castIfNeeded, getTableLocation, removeMetaFields, tableExistsInPath}
|
||||
import org.apache.spark.sql.hudi.catalog.{HoodieCatalog, HoodieInternalV2Table}
|
||||
import org.apache.spark.sql.hudi.command.{AlterHoodieTableDropPartitionCommand, ShowHoodieTablePartitionsCommand, TruncateHoodieTableCommand}
|
||||
import org.apache.spark.sql.hudi.{HoodieSqlCommonUtils, ProvidesHoodieConfig}
|
||||
import org.apache.spark.sql.types.StructType
|
||||
import org.apache.spark.sql.{AnalysisException, SQLContext, SparkSession}
|
||||
|
||||
import scala.collection.JavaConverters.mapAsJavaMapConverter
|
||||
|
||||
/**
|
||||
* Rule for convert the logical plan to command.
|
||||
* @param sparkSession
|
||||
* NOTE: PLEASE READ CAREFULLY
|
||||
*
|
||||
* Since Hudi relations don't currently implement the DS V2 Read API, we have to fall back to V1 here.
|
||||
* Such fallback will have considerable performance impact, therefore it's only performed in cases
|
||||
* where the V2 API has to be used. Currently, the only such use case is the Schema Evolution feature
|
||||
*
|
||||
* Check out HUDI-4178 for more details
|
||||
*/
|
||||
case class HoodieSpark3Analysis(sparkSession: SparkSession) extends Rule[LogicalPlan]
|
||||
with SparkAdapterSupport with ProvidesHoodieConfig {
|
||||
class HoodieDataSourceV2ToV1Fallback(sparkSession: SparkSession) extends Rule[LogicalPlan]
|
||||
with ProvidesHoodieConfig {
|
||||
|
||||
override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsDown {
|
||||
case dsv2 @ DataSourceV2Relation(d: HoodieInternalV2Table, _, _, _, _) =>
|
||||
val output = dsv2.output
|
||||
val catalogTable = if (d.catalogTable.isDefined) {
|
||||
Some(d.v1Table)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
case v2r @ DataSourceV2Relation(v2Table: HoodieInternalV2Table, _, _, _, _) =>
|
||||
val output = v2r.output
|
||||
val catalogTable = v2Table.catalogTable.map(_ => v2Table.v1Table)
|
||||
val relation = new DefaultSource().createRelation(new SQLContext(sparkSession),
|
||||
buildHoodieConfig(d.hoodieCatalogTable))
|
||||
buildHoodieConfig(v2Table.hoodieCatalogTable), v2Table.hoodieCatalogTable.tableSchema)
|
||||
|
||||
LogicalRelation(relation, output, catalogTable, isStreaming = false)
|
||||
case a @ InsertIntoStatement(r: DataSourceV2Relation, partitionSpec, _, _, _, _) if a.query.resolved &&
|
||||
r.table.isInstanceOf[HoodieInternalV2Table] &&
|
||||
needsSchemaAdjustment(a.query, r.table.asInstanceOf[HoodieInternalV2Table], partitionSpec, r.schema) =>
|
||||
val projection = resolveQueryColumnsByOrdinal(a.query, r.output)
|
||||
if (projection != a.query) {
|
||||
a.copy(query = projection)
|
||||
} else {
|
||||
a
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
class HoodieSpark3Analysis(sparkSession: SparkSession) extends Rule[LogicalPlan] {
|
||||
override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsDown {
|
||||
case s @ InsertIntoStatement(r @ DataSourceV2Relation(v2Table: HoodieInternalV2Table, _, _, _, _), partitionSpec, _, _, _, _)
|
||||
if s.query.resolved && needsSchemaAdjustment(s.query, v2Table.hoodieCatalogTable.table, partitionSpec, r.schema) =>
|
||||
val projection = resolveQueryColumnsByOrdinal(s.query, r.output)
|
||||
if (projection != s.query) {
|
||||
s.copy(query = projection)
|
||||
} else {
|
||||
s
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Need to adjust schema based on the query and relation schema, for example,
|
||||
* if using `insert into xx select 1, 2`, the query output here needs to be mapped to column names
|
||||
* @param query
|
||||
* @param hoodieTable
|
||||
* @param partitionSpec
|
||||
* @param schema
|
||||
* @return
|
||||
*/
|
||||
private def needsSchemaAdjustment(query: LogicalPlan,
|
||||
hoodieTable: HoodieInternalV2Table,
|
||||
table: CatalogTable,
|
||||
partitionSpec: Map[String, Option[String]],
|
||||
schema: StructType): Boolean = {
|
||||
val output = query.output
|
||||
val queryOutputWithoutMetaFields = removeMetaFields(output)
|
||||
val partitionFields = hoodieTable.hoodieCatalogTable.partitionFields
|
||||
val partitionSchema = hoodieTable.hoodieCatalogTable.partitionSchema
|
||||
val hoodieCatalogTable = HoodieCatalogTable(sparkSession, table)
|
||||
|
||||
val partitionFields = hoodieCatalogTable.partitionFields
|
||||
val partitionSchema = hoodieCatalogTable.partitionSchema
|
||||
val staticPartitionValues = partitionSpec.filter(p => p._2.isDefined).mapValues(_.get)
|
||||
|
||||
assert(staticPartitionValues.isEmpty ||
|
||||
@@ -91,8 +96,8 @@ case class HoodieSpark3Analysis(sparkSession: SparkSession) extends Rule[Logical
|
||||
s"is: ${staticPartitionValues.mkString("," + "")}")
|
||||
|
||||
assert(staticPartitionValues.size + queryOutputWithoutMetaFields.size
|
||||
== hoodieTable.hoodieCatalogTable.tableSchemaWithoutMetaFields.size,
|
||||
s"Required select columns count: ${hoodieTable.hoodieCatalogTable.tableSchemaWithoutMetaFields.size}, " +
|
||||
== hoodieCatalogTable.tableSchemaWithoutMetaFields.size,
|
||||
s"Required select columns count: ${hoodieCatalogTable.tableSchemaWithoutMetaFields.size}, " +
|
||||
s"Current select columns(including static partition column) count: " +
|
||||
s"${staticPartitionValues.size + queryOutputWithoutMetaFields.size},columns: " +
|
||||
s"(${(queryOutputWithoutMetaFields.map(_.name) ++ staticPartitionValues.keys).mkString(",")})")
|
||||
@@ -126,7 +131,6 @@ case class HoodieSpark3Analysis(sparkSession: SparkSession) extends Rule[Logical
|
||||
|
||||
/**
|
||||
* Rule for resolving Hudi's extended syntax or rewriting some logical plans.
|
||||
* @param sparkSession
|
||||
*/
|
||||
case class HoodieSpark3ResolveReferences(sparkSession: SparkSession) extends Rule[LogicalPlan]
|
||||
with SparkAdapterSupport with ProvidesHoodieConfig {
|
||||
@@ -173,28 +177,26 @@ case class HoodieSpark3ResolveReferences(sparkSession: SparkSession) extends Rul
|
||||
}
|
||||
|
||||
/**
|
||||
* Rule for rewrite some spark commands to hudi's implementation.
|
||||
* @param sparkSession
|
||||
* Rule replacing resolved Spark's commands (not working for Hudi tables out-of-the-box) with
|
||||
* corresponding Hudi implementations
|
||||
*/
|
||||
case class HoodieSpark3PostAnalysisRule(sparkSession: SparkSession) extends Rule[LogicalPlan] {
|
||||
override def apply(plan: LogicalPlan): LogicalPlan = {
|
||||
plan match {
|
||||
case ShowPartitions(ResolvedTable(_, idt, _: HoodieInternalV2Table, _), specOpt, _) =>
|
||||
case ShowPartitions(ResolvedTable(_, id, HoodieV1OrV2Table(_), _), specOpt, _) =>
|
||||
ShowHoodieTablePartitionsCommand(
|
||||
idt.asTableIdentifier, specOpt.map(s => s.asInstanceOf[UnresolvedPartitionSpec].spec))
|
||||
id.asTableIdentifier, specOpt.map(s => s.asInstanceOf[UnresolvedPartitionSpec].spec))
|
||||
|
||||
// Rewrite TruncateTableCommand to TruncateHoodieTableCommand
|
||||
case TruncateTable(ResolvedTable(_, idt, _: HoodieInternalV2Table, _)) =>
|
||||
TruncateHoodieTableCommand(idt.asTableIdentifier, None)
|
||||
case TruncateTable(ResolvedTable(_, id, HoodieV1OrV2Table(_), _)) =>
|
||||
TruncateHoodieTableCommand(id.asTableIdentifier, None)
|
||||
|
||||
case TruncatePartition(
|
||||
ResolvedTable(_, idt, _: HoodieInternalV2Table, _),
|
||||
partitionSpec: UnresolvedPartitionSpec) =>
|
||||
TruncateHoodieTableCommand(idt.asTableIdentifier, Some(partitionSpec.spec))
|
||||
case TruncatePartition(ResolvedTable(_, id, HoodieV1OrV2Table(_), _), partitionSpec: UnresolvedPartitionSpec) =>
|
||||
TruncateHoodieTableCommand(id.asTableIdentifier, Some(partitionSpec.spec))
|
||||
|
||||
case DropPartitions(ResolvedTable(_, idt, _: HoodieInternalV2Table, _), specs, ifExists, purge) =>
|
||||
case DropPartitions(ResolvedTable(_, id, HoodieV1OrV2Table(_), _), specs, ifExists, purge) =>
|
||||
AlterHoodieTableDropPartitionCommand(
|
||||
idt.asTableIdentifier,
|
||||
id.asTableIdentifier,
|
||||
specs.seq.map(f => f.asInstanceOf[UnresolvedPartitionSpec]).map(s => s.spec),
|
||||
ifExists,
|
||||
purge,
|
||||
@@ -205,3 +207,12 @@ case class HoodieSpark3PostAnalysisRule(sparkSession: SparkSession) extends Rule
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private[sql] object HoodieV1OrV2Table extends SparkAdapterSupport {
|
||||
def unapply(table: Table): Option[CatalogTable] = table match {
|
||||
case V1Table(catalogTable) if sparkAdapter.isHoodieTable(catalogTable) => Some(catalogTable)
|
||||
case v2: HoodieInternalV2Table => v2.catalogTable
|
||||
case _ => None
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -22,7 +22,7 @@ import org.apache.hadoop.fs.Path
|
||||
import org.apache.hudi.exception.HoodieException
|
||||
import org.apache.hudi.sql.InsertMode
|
||||
import org.apache.hudi.sync.common.util.ConfigUtils
|
||||
import org.apache.hudi.{DataSourceWriteOptions, SparkAdapterSupport}
|
||||
import org.apache.hudi.{DataSourceReadOptions, DataSourceWriteOptions, SparkAdapterSupport}
|
||||
import org.apache.spark.sql.HoodieSpark3SqlUtils.convertTransforms
|
||||
import org.apache.spark.sql.catalyst.TableIdentifier
|
||||
import org.apache.spark.sql.catalyst.analysis.{NoSuchTableException, TableAlreadyExistsException, UnresolvedAttribute}
|
||||
@@ -33,6 +33,7 @@ import org.apache.spark.sql.connector.catalog.TableChange.{AddColumn, ColumnChan
|
||||
import org.apache.spark.sql.connector.catalog._
|
||||
import org.apache.spark.sql.connector.expressions.Transform
|
||||
import org.apache.spark.sql.execution.datasources.DataSource
|
||||
import org.apache.spark.sql.hudi.analysis.HoodieV1OrV2Table
|
||||
import org.apache.spark.sql.hudi.command._
|
||||
import org.apache.spark.sql.hudi.{HoodieSqlCommonUtils, ProvidesHoodieConfig}
|
||||
import org.apache.spark.sql.types.{StructField, StructType}
|
||||
@@ -105,12 +106,30 @@ class HoodieCatalog extends DelegatingCatalogExtension
|
||||
case _ =>
|
||||
catalogTable0
|
||||
}
|
||||
HoodieInternalV2Table(
|
||||
|
||||
val v2Table = HoodieInternalV2Table(
|
||||
spark = spark,
|
||||
path = catalogTable.location.toString,
|
||||
catalogTable = Some(catalogTable),
|
||||
tableIdentifier = Some(ident.toString))
|
||||
case o => o
|
||||
|
||||
val schemaEvolutionEnabled: Boolean = spark.sessionState.conf.getConfString(DataSourceReadOptions.SCHEMA_EVOLUTION_ENABLED.key,
|
||||
DataSourceReadOptions.SCHEMA_EVOLUTION_ENABLED.defaultValue.toString).toBoolean
|
||||
|
||||
// NOTE: PLEASE READ CAREFULLY
|
||||
//
|
||||
// Since Hudi relations don't currently implement the DS V2 Read API, we fall back to V1 here by default.
|
||||
// Such fallback will have considerable performance impact, therefore it's only performed in cases
|
||||
// where the V2 API has to be used. Currently, the only such use case is the Schema Evolution feature
|
||||
//
|
||||
// Check out HUDI-4178 for more details
|
||||
if (schemaEvolutionEnabled) {
|
||||
v2Table
|
||||
} else {
|
||||
v2Table.v1TableWrapper
|
||||
}
|
||||
|
||||
case t => t
|
||||
}
|
||||
}
|
||||
|
||||
@@ -132,7 +151,7 @@ class HoodieCatalog extends DelegatingCatalogExtension
|
||||
override def dropTable(ident: Identifier): Boolean = {
|
||||
val table = loadTable(ident)
|
||||
table match {
|
||||
case _: HoodieInternalV2Table =>
|
||||
case HoodieV1OrV2Table(_) =>
|
||||
DropHoodieTableCommand(ident.asTableIdentifier, ifExists = true, isView = false, purge = false).run(spark)
|
||||
true
|
||||
case _ => super.dropTable(ident)
|
||||
@@ -142,7 +161,7 @@ class HoodieCatalog extends DelegatingCatalogExtension
|
||||
override def purgeTable(ident: Identifier): Boolean = {
|
||||
val table = loadTable(ident)
|
||||
table match {
|
||||
case _: HoodieInternalV2Table =>
|
||||
case HoodieV1OrV2Table(_) =>
|
||||
DropHoodieTableCommand(ident.asTableIdentifier, ifExists = true, isView = false, purge = true).run(spark)
|
||||
true
|
||||
case _ => super.purgeTable(ident)
|
||||
@@ -153,56 +172,53 @@ class HoodieCatalog extends DelegatingCatalogExtension
|
||||
@throws[TableAlreadyExistsException]
|
||||
override def renameTable(oldIdent: Identifier, newIdent: Identifier): Unit = {
|
||||
loadTable(oldIdent) match {
|
||||
case _: HoodieInternalV2Table =>
|
||||
case HoodieV1OrV2Table(_) =>
|
||||
AlterHoodieTableRenameCommand(oldIdent.asTableIdentifier, newIdent.asTableIdentifier, false).run(spark)
|
||||
case _ => super.renameTable(oldIdent, newIdent)
|
||||
}
|
||||
}
|
||||
|
||||
override def alterTable(ident: Identifier, changes: TableChange*): Table = {
|
||||
val tableIdent = TableIdentifier(ident.name(), ident.namespace().lastOption)
|
||||
// scalastyle:off
|
||||
val table = loadTable(ident) match {
|
||||
case hoodieTable: HoodieInternalV2Table => hoodieTable
|
||||
case _ => return super.alterTable(ident, changes: _*)
|
||||
}
|
||||
// scalastyle:on
|
||||
loadTable(ident) match {
|
||||
case HoodieV1OrV2Table(table) => {
|
||||
val tableIdent = TableIdentifier(ident.name(), ident.namespace().lastOption)
|
||||
changes.groupBy(c => c.getClass).foreach {
|
||||
case (t, newColumns) if t == classOf[AddColumn] =>
|
||||
AlterHoodieTableAddColumnsCommand(
|
||||
tableIdent,
|
||||
newColumns.asInstanceOf[Seq[AddColumn]].map { col =>
|
||||
StructField(
|
||||
col.fieldNames()(0),
|
||||
col.dataType(),
|
||||
col.isNullable)
|
||||
}).run(spark)
|
||||
|
||||
val grouped = changes.groupBy(c => c.getClass)
|
||||
|
||||
grouped.foreach {
|
||||
case (t, newColumns) if t == classOf[AddColumn] =>
|
||||
AlterHoodieTableAddColumnsCommand(
|
||||
tableIdent,
|
||||
newColumns.asInstanceOf[Seq[AddColumn]].map { col =>
|
||||
StructField(
|
||||
col.fieldNames()(0),
|
||||
col.dataType(),
|
||||
col.isNullable)
|
||||
}).run(spark)
|
||||
case (t, columnChanges) if classOf[ColumnChange].isAssignableFrom(t) =>
|
||||
columnChanges.foreach {
|
||||
case dataType: UpdateColumnType =>
|
||||
val colName = UnresolvedAttribute(dataType.fieldNames()).name
|
||||
val newDataType = dataType.newDataType()
|
||||
val structField = StructField(colName, newDataType)
|
||||
AlterHoodieTableChangeColumnCommand(tableIdent, colName, structField).run(spark)
|
||||
case dataType: UpdateColumnComment =>
|
||||
val newComment = dataType.newComment()
|
||||
val colName = UnresolvedAttribute(dataType.fieldNames()).name
|
||||
val fieldOpt = table.schema().findNestedField(dataType.fieldNames(), includeCollections = true,
|
||||
spark.sessionState.conf.resolver).map(_._2)
|
||||
val field = fieldOpt.getOrElse {
|
||||
throw new AnalysisException(
|
||||
s"Couldn't find column $colName in:\n${table.schema().treeString}")
|
||||
case (t, columnChanges) if classOf[ColumnChange].isAssignableFrom(t) =>
|
||||
columnChanges.foreach {
|
||||
case dataType: UpdateColumnType =>
|
||||
val colName = UnresolvedAttribute(dataType.fieldNames()).name
|
||||
val newDataType = dataType.newDataType()
|
||||
val structField = StructField(colName, newDataType)
|
||||
AlterHoodieTableChangeColumnCommand(tableIdent, colName, structField).run(spark)
|
||||
case dataType: UpdateColumnComment =>
|
||||
val newComment = dataType.newComment()
|
||||
val colName = UnresolvedAttribute(dataType.fieldNames()).name
|
||||
val fieldOpt = table.schema.findNestedField(dataType.fieldNames(), includeCollections = true,
|
||||
spark.sessionState.conf.resolver).map(_._2)
|
||||
val field = fieldOpt.getOrElse {
|
||||
throw new AnalysisException(
|
||||
s"Couldn't find column $colName in:\n${table.schema.treeString}")
|
||||
}
|
||||
AlterHoodieTableChangeColumnCommand(tableIdent, colName, field.withComment(newComment)).run(spark)
|
||||
}
|
||||
AlterHoodieTableChangeColumnCommand(tableIdent, colName, field.withComment(newComment)).run(spark)
|
||||
case (t, _) =>
|
||||
throw new UnsupportedOperationException(s"not supported table change: ${t.getClass}")
|
||||
}
|
||||
case (t, _) =>
|
||||
throw new UnsupportedOperationException(s"not supported table change: ${t.getClass}")
|
||||
}
|
||||
|
||||
loadTable(ident)
|
||||
loadTable(ident)
|
||||
}
|
||||
case _ => super.alterTable(ident, changes: _*)
|
||||
}
|
||||
}
|
||||
|
||||
private def deduceTableLocationURIAndTableType(
|
||||
|
||||
@@ -21,7 +21,7 @@ import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient}
|
||||
import org.apache.spark.sql.catalyst.TableIdentifier
|
||||
import org.apache.spark.sql.catalyst.catalog.{CatalogTable, HoodieCatalogTable}
|
||||
import org.apache.spark.sql.connector.catalog.TableCapability._
|
||||
import org.apache.spark.sql.connector.catalog.{SupportsWrite, Table, TableCapability, V2TableWithV1Fallback}
|
||||
import org.apache.spark.sql.connector.catalog.{SupportsWrite, Table, TableCapability, V1Table, V2TableWithV1Fallback}
|
||||
import org.apache.spark.sql.connector.expressions.{FieldReference, IdentityTransform, Transform}
|
||||
import org.apache.spark.sql.connector.write._
|
||||
import org.apache.spark.sql.hudi.ProvidesHoodieConfig
|
||||
@@ -74,6 +74,8 @@ case class HoodieInternalV2Table(spark: SparkSession,
|
||||
|
||||
override def v1Table: CatalogTable = hoodieCatalogTable.table
|
||||
|
||||
def v1TableWrapper: V1Table = V1Table(v1Table)
|
||||
|
||||
override def partitioning(): Array[Transform] = {
|
||||
hoodieCatalogTable.partitionFields.map { col =>
|
||||
new IdentityTransform(new FieldReference(Seq(col)))
|
||||
|
||||
Reference in New Issue
Block a user