[HUDI-2811] Support Spark 3.2 (#4270)
This commit is contained in:
4
.github/workflows/bot.yml
vendored
4
.github/workflows/bot.yml
vendored
@@ -24,6 +24,10 @@ jobs:
|
|||||||
spark: "spark3,spark3.0.x"
|
spark: "spark3,spark3.0.x"
|
||||||
- scala: "scala-2.12"
|
- scala: "scala-2.12"
|
||||||
spark: "spark3,spark3.0.x,spark-shade-unbundle-avro"
|
spark: "spark3,spark3.0.x,spark-shade-unbundle-avro"
|
||||||
|
- scala: "scala-2.12"
|
||||||
|
spark: "spark3,spark3.1.x"
|
||||||
|
- scala: "scala-2.12"
|
||||||
|
spark: "spark3,spark3.1.x,spark-shade-unbundle-avro"
|
||||||
- scala: "scala-2.12"
|
- scala: "scala-2.12"
|
||||||
spark: "spark3"
|
spark: "spark3"
|
||||||
- scala: "scala-2.12"
|
- scala: "scala-2.12"
|
||||||
|
|||||||
@@ -47,8 +47,14 @@ import scala.collection.JavaConverters.asScalaBufferConverter
|
|||||||
|
|
||||||
object HoodieSparkUtils extends SparkAdapterSupport {
|
object HoodieSparkUtils extends SparkAdapterSupport {
|
||||||
|
|
||||||
|
def isSpark2: Boolean = SPARK_VERSION.startsWith("2.")
|
||||||
|
|
||||||
def isSpark3: Boolean = SPARK_VERSION.startsWith("3.")
|
def isSpark3: Boolean = SPARK_VERSION.startsWith("3.")
|
||||||
|
|
||||||
|
def isSpark3_0: Boolean = SPARK_VERSION.startsWith("3.0")
|
||||||
|
|
||||||
|
def isSpark3_2: Boolean = SPARK_VERSION.startsWith("3.2")
|
||||||
|
|
||||||
def getMetaSchema: StructType = {
|
def getMetaSchema: StructType = {
|
||||||
StructType(HoodieRecord.HOODIE_META_COLUMNS.asScala.map(col => {
|
StructType(HoodieRecord.HOODIE_META_COLUMNS.asScala.map(col => {
|
||||||
StructField(col, StringType, nullable = true)
|
StructField(col, StringType, nullable = true)
|
||||||
|
|||||||
@@ -20,7 +20,8 @@ package org.apache.spark.sql.execution.datasources
|
|||||||
import java.util.TimeZone
|
import java.util.TimeZone
|
||||||
|
|
||||||
import org.apache.hadoop.fs.Path
|
import org.apache.hadoop.fs.Path
|
||||||
import org.apache.spark.sql.execution.datasources.PartitioningUtils.PartitionValues
|
|
||||||
|
import org.apache.spark.sql.catalyst.InternalRow
|
||||||
import org.apache.spark.sql.types.DataType
|
import org.apache.spark.sql.types.DataType
|
||||||
|
|
||||||
trait SparkParsePartitionUtil extends Serializable {
|
trait SparkParsePartitionUtil extends Serializable {
|
||||||
@@ -30,5 +31,5 @@ trait SparkParsePartitionUtil extends Serializable {
|
|||||||
typeInference: Boolean,
|
typeInference: Boolean,
|
||||||
basePaths: Set[Path],
|
basePaths: Set[Path],
|
||||||
userSpecifiedDataTypes: Map[String, DataType],
|
userSpecifiedDataTypes: Map[String, DataType],
|
||||||
timeZone: TimeZone): Option[PartitionValues]
|
timeZone: TimeZone): InternalRow
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -578,14 +578,10 @@ case class HoodieFileIndex(
|
|||||||
}.mkString("/")
|
}.mkString("/")
|
||||||
val pathWithPartitionName = new Path(basePath, partitionWithName)
|
val pathWithPartitionName = new Path(basePath, partitionWithName)
|
||||||
val partitionDataTypes = partitionSchema.fields.map(f => f.name -> f.dataType).toMap
|
val partitionDataTypes = partitionSchema.fields.map(f => f.name -> f.dataType).toMap
|
||||||
val partitionValues = sparkParsePartitionUtil.parsePartition(pathWithPartitionName,
|
|
||||||
|
sparkParsePartitionUtil.parsePartition(pathWithPartitionName,
|
||||||
typeInference = false, Set(new Path(basePath)), partitionDataTypes,
|
typeInference = false, Set(new Path(basePath)), partitionDataTypes,
|
||||||
DateTimeUtils.getTimeZone(timeZoneId))
|
DateTimeUtils.getTimeZone(timeZoneId))
|
||||||
|
|
||||||
// Convert partitionValues to InternalRow
|
|
||||||
partitionValues.map(_.literals.map(_.value))
|
|
||||||
.map(InternalRow.fromSeq)
|
|
||||||
.getOrElse(InternalRow.empty)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
PartitionRowPath(partitionRow, partitionPath)
|
PartitionRowPath(partitionRow, partitionPath)
|
||||||
|
|||||||
@@ -48,7 +48,8 @@ import org.apache.spark.rdd.RDD
|
|||||||
import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf}
|
import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf}
|
||||||
import org.apache.spark.sql.types.StructType
|
import org.apache.spark.sql.types.StructType
|
||||||
import org.apache.spark.sql.{DataFrame, Dataset, Row, SQLContext, SaveMode, SparkSession}
|
import org.apache.spark.sql.{DataFrame, Dataset, Row, SQLContext, SaveMode, SparkSession}
|
||||||
import org.apache.spark.{SPARK_VERSION, SparkContext}
|
import org.apache.spark.SparkContext
|
||||||
|
|
||||||
import java.util.Properties
|
import java.util.Properties
|
||||||
|
|
||||||
import scala.collection.JavaConversions._
|
import scala.collection.JavaConversions._
|
||||||
@@ -463,13 +464,13 @@ object HoodieSparkSqlWriter {
|
|||||||
} else {
|
} else {
|
||||||
HoodieDatasetBulkInsertHelper.prepareHoodieDatasetForBulkInsertWithoutMetaFields(df)
|
HoodieDatasetBulkInsertHelper.prepareHoodieDatasetForBulkInsertWithoutMetaFields(df)
|
||||||
}
|
}
|
||||||
if (SPARK_VERSION.startsWith("2.")) {
|
if (HoodieSparkUtils.isSpark2) {
|
||||||
hoodieDF.write.format("org.apache.hudi.internal")
|
hoodieDF.write.format("org.apache.hudi.internal")
|
||||||
.option(DataSourceInternalWriterHelper.INSTANT_TIME_OPT_KEY, instantTime)
|
.option(DataSourceInternalWriterHelper.INSTANT_TIME_OPT_KEY, instantTime)
|
||||||
.options(params)
|
.options(params)
|
||||||
.mode(SaveMode.Append)
|
.mode(SaveMode.Append)
|
||||||
.save()
|
.save()
|
||||||
} else if (SPARK_VERSION.startsWith("3.")) {
|
} else if(HoodieSparkUtils.isSpark3) {
|
||||||
hoodieDF.write.format("org.apache.hudi.spark3.internal")
|
hoodieDF.write.format("org.apache.hudi.spark3.internal")
|
||||||
.option(DataSourceInternalWriterHelper.INSTANT_TIME_OPT_KEY, instantTime)
|
.option(DataSourceInternalWriterHelper.INSTANT_TIME_OPT_KEY, instantTime)
|
||||||
.option(HoodieInternalConfig.BULKINSERT_INPUT_DATA_SCHEMA_DDL.key, hoodieDF.schema.toDDL)
|
.option(HoodieInternalConfig.BULKINSERT_INPUT_DATA_SCHEMA_DDL.key, hoodieDF.schema.toDDL)
|
||||||
|
|||||||
@@ -18,18 +18,30 @@
|
|||||||
package org.apache.spark.sql.avro
|
package org.apache.spark.sql.avro
|
||||||
|
|
||||||
import org.apache.avro.Schema
|
import org.apache.avro.Schema
|
||||||
|
|
||||||
|
import org.apache.hudi.HoodieSparkUtils
|
||||||
|
|
||||||
import org.apache.spark.sql.types.DataType
|
import org.apache.spark.sql.types.DataType
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This is to be compatible with the type returned by Spark 3.1
|
* This is to be compatible with the type returned by Spark 3.1
|
||||||
* and other spark versions for AvroDeserializer
|
* and other spark versions for AvroDeserializer
|
||||||
*/
|
*/
|
||||||
case class HoodieAvroDeserializer(rootAvroType: Schema, rootCatalystType: DataType)
|
case class HoodieAvroDeserializer(rootAvroType: Schema, rootCatalystType: DataType) {
|
||||||
extends AvroDeserializer(rootAvroType, rootCatalystType) {
|
|
||||||
|
private val avroDeserializer = if (HoodieSparkUtils.isSpark3_2) {
|
||||||
|
// SPARK-34404: As of Spark3.2, there is no AvroDeserializer's constructor with Schema and DataType arguments.
|
||||||
|
// So use the reflection to get AvroDeserializer instance.
|
||||||
|
val constructor = classOf[AvroDeserializer].getConstructor(classOf[Schema], classOf[DataType], classOf[String])
|
||||||
|
constructor.newInstance(rootAvroType, rootCatalystType, "EXCEPTION")
|
||||||
|
} else {
|
||||||
|
val constructor = classOf[AvroDeserializer].getConstructor(classOf[Schema], classOf[DataType])
|
||||||
|
constructor.newInstance(rootAvroType, rootCatalystType)
|
||||||
|
}
|
||||||
|
|
||||||
def deserializeData(data: Any): Any = {
|
def deserializeData(data: Any): Any = {
|
||||||
super.deserialize(data) match {
|
avroDeserializer.deserialize(data) match {
|
||||||
case Some(r) => r // spark 3.1 return type is Option, we fetch the data.
|
case Some(r) => r // As of spark 3.1, this will return data wrapped with Option, so we fetch the data.
|
||||||
case o => o // for other spark version, return the data directly.
|
case o => o // for other spark version, return the data directly.
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -22,17 +22,37 @@ import org.apache.spark.sql.catalyst.plans.logical.CompactionOperation.Compactio
|
|||||||
case class CompactionTable(table: LogicalPlan, operation: CompactionOperation, instantTimestamp: Option[Long])
|
case class CompactionTable(table: LogicalPlan, operation: CompactionOperation, instantTimestamp: Option[Long])
|
||||||
extends Command {
|
extends Command {
|
||||||
override def children: Seq[LogicalPlan] = Seq(table)
|
override def children: Seq[LogicalPlan] = Seq(table)
|
||||||
|
|
||||||
|
def withNewChildrenInternal(newChildren: IndexedSeq[LogicalPlan]): CompactionTable = {
|
||||||
|
copy(table = newChildren.head)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
case class CompactionPath(path: String, operation: CompactionOperation, instantTimestamp: Option[Long])
|
case class CompactionPath(path: String, operation: CompactionOperation, instantTimestamp: Option[Long])
|
||||||
extends Command
|
extends Command {
|
||||||
|
override def children: Seq[LogicalPlan] = Seq.empty
|
||||||
|
|
||||||
|
def withNewChildrenInternal(newChildren: IndexedSeq[LogicalPlan]): CompactionPath = {
|
||||||
|
this
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
case class CompactionShowOnTable(table: LogicalPlan, limit: Int = 20)
|
case class CompactionShowOnTable(table: LogicalPlan, limit: Int = 20)
|
||||||
extends Command {
|
extends Command {
|
||||||
override def children: Seq[LogicalPlan] = Seq(table)
|
override def children: Seq[LogicalPlan] = Seq(table)
|
||||||
|
|
||||||
|
def withNewChildrenInternal(newChildren: IndexedSeq[LogicalPlan]): CompactionShowOnTable = {
|
||||||
|
copy(table = newChildren.head)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
case class CompactionShowOnPath(path: String, limit: Int = 20) extends Command
|
case class CompactionShowOnPath(path: String, limit: Int = 20) extends Command {
|
||||||
|
override def children: Seq[LogicalPlan] = Seq.empty
|
||||||
|
|
||||||
|
def withNewChildrenInternal(newChildren: IndexedSeq[LogicalPlan]): CompactionShowOnPath = {
|
||||||
|
this
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
object CompactionOperation extends Enumeration {
|
object CompactionOperation extends Enumeration {
|
||||||
type CompactionOperation = Value
|
type CompactionOperation = Value
|
||||||
|
|||||||
@@ -0,0 +1,30 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.spark.sql.catalyst.trees
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Similar to `LeafLike` in Spark3.2.
|
||||||
|
*/
|
||||||
|
trait HoodieLeafLike[T <: TreeNode[T]] { self: TreeNode[T] =>
|
||||||
|
|
||||||
|
override final def children: Seq[T] = Nil
|
||||||
|
|
||||||
|
override final def mapChildren(f: T => T): T = this.asInstanceOf[T]
|
||||||
|
|
||||||
|
final def withNewChildrenInternal(newChildren: IndexedSeq[T]): T = this.asInstanceOf[T]
|
||||||
|
}
|
||||||
@@ -31,7 +31,7 @@ import org.apache.hudi.common.fs.FSUtils
|
|||||||
import org.apache.hudi.common.model.HoodieRecord
|
import org.apache.hudi.common.model.HoodieRecord
|
||||||
import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver}
|
import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver}
|
||||||
import org.apache.hudi.common.table.timeline.{HoodieActiveTimeline, HoodieInstantTimeGenerator}
|
import org.apache.hudi.common.table.timeline.{HoodieActiveTimeline, HoodieInstantTimeGenerator}
|
||||||
import org.apache.spark.SPARK_VERSION
|
|
||||||
import org.apache.spark.sql.{Column, DataFrame, SparkSession}
|
import org.apache.spark.sql.{Column, DataFrame, SparkSession}
|
||||||
import org.apache.spark.sql.catalyst.TableIdentifier
|
import org.apache.spark.sql.catalyst.TableIdentifier
|
||||||
import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
|
import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
|
||||||
@@ -282,8 +282,6 @@ object HoodieSqlUtils extends SparkAdapterSupport {
|
|||||||
.filterKeys(_.startsWith("hoodie."))
|
.filterKeys(_.startsWith("hoodie."))
|
||||||
}
|
}
|
||||||
|
|
||||||
def isSpark3: Boolean = SPARK_VERSION.startsWith("3.")
|
|
||||||
|
|
||||||
def isEnableHive(sparkSession: SparkSession): Boolean =
|
def isEnableHive(sparkSession: SparkSession): Boolean =
|
||||||
"hive" == sparkSession.sessionState.conf.getConf(StaticSQLConf.CATALOG_IMPLEMENTATION)
|
"hive" == sparkSession.sessionState.conf.getConf(StaticSQLConf.CATALOG_IMPLEMENTATION)
|
||||||
|
|
||||||
|
|||||||
@@ -17,12 +17,13 @@
|
|||||||
|
|
||||||
package org.apache.spark.sql.hudi.analysis
|
package org.apache.spark.sql.hudi.analysis
|
||||||
|
|
||||||
|
import org.apache.hudi.{HoodieSparkUtils, SparkAdapterSupport}
|
||||||
import org.apache.hudi.DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL
|
import org.apache.hudi.DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL
|
||||||
import org.apache.hudi.SparkAdapterSupport
|
|
||||||
import org.apache.hudi.common.model.HoodieRecord
|
import org.apache.hudi.common.model.HoodieRecord
|
||||||
import org.apache.hudi.common.table.HoodieTableMetaClient
|
import org.apache.hudi.common.table.HoodieTableMetaClient
|
||||||
|
|
||||||
import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedStar}
|
import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedStar}
|
||||||
import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, Expression, Literal, NamedExpression}
|
import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeReference, Expression, Literal, NamedExpression}
|
||||||
import org.apache.spark.sql.catalyst.plans.Inner
|
import org.apache.spark.sql.catalyst.plans.Inner
|
||||||
import org.apache.spark.sql.catalyst.plans.logical._
|
import org.apache.spark.sql.catalyst.plans.logical._
|
||||||
import org.apache.spark.sql.catalyst.rules.Rule
|
import org.apache.spark.sql.catalyst.rules.Rule
|
||||||
@@ -137,7 +138,7 @@ case class HoodieResolveReferences(sparkSession: SparkSession) extends Rule[Logi
|
|||||||
// We can do this because under the normal case, we should not allow to update or set
|
// We can do this because under the normal case, we should not allow to update or set
|
||||||
// the hoodie's meta field in sql statement, it is a system field, cannot set the value
|
// the hoodie's meta field in sql statement, it is a system field, cannot set the value
|
||||||
// by user.
|
// by user.
|
||||||
if (HoodieSqlUtils.isSpark3) {
|
if (HoodieSparkUtils.isSpark3) {
|
||||||
val assignmentFieldNames = assignments.map(_.key).map {
|
val assignmentFieldNames = assignments.map(_.key).map {
|
||||||
case attr: AttributeReference =>
|
case attr: AttributeReference =>
|
||||||
attr.name
|
attr.name
|
||||||
@@ -178,11 +179,19 @@ case class HoodieResolveReferences(sparkSession: SparkSession) extends Rule[Logi
|
|||||||
.map { case (targetAttr, sourceAttr) => Assignment(targetAttr, sourceAttr) }
|
.map { case (targetAttr, sourceAttr) => Assignment(targetAttr, sourceAttr) }
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
assignments.map(assignment => {
|
// For Spark3.2, InsertStarAction/UpdateStarAction's assignments will contain the meta fields.
|
||||||
|
val withoutMetaAttrs = assignments.filterNot{ assignment =>
|
||||||
|
if (assignment.key.isInstanceOf[Attribute]) {
|
||||||
|
HoodieSqlUtils.isMetaField(assignment.key.asInstanceOf[Attribute].name)
|
||||||
|
} else {
|
||||||
|
false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
withoutMetaAttrs.map { assignment =>
|
||||||
val resolvedKey = resolveExpressionFrom(target)(assignment.key)
|
val resolvedKey = resolveExpressionFrom(target)(assignment.key)
|
||||||
val resolvedValue = resolveExpressionFrom(resolvedSource, Some(target))(assignment.value)
|
val resolvedValue = resolveExpressionFrom(resolvedSource, Some(target))(assignment.value)
|
||||||
Assignment(resolvedKey, resolvedValue)
|
Assignment(resolvedKey, resolvedValue)
|
||||||
})
|
}
|
||||||
}
|
}
|
||||||
(resolvedCondition, resolvedAssignments)
|
(resolvedCondition, resolvedAssignments)
|
||||||
}
|
}
|
||||||
@@ -242,6 +251,10 @@ case class HoodieResolveReferences(sparkSession: SparkSession) extends Rule[Logi
|
|||||||
case DeleteAction(condition) =>
|
case DeleteAction(condition) =>
|
||||||
val resolvedCondition = condition.map(resolveExpressionFrom(resolvedSource)(_))
|
val resolvedCondition = condition.map(resolveExpressionFrom(resolvedSource)(_))
|
||||||
DeleteAction(resolvedCondition)
|
DeleteAction(resolvedCondition)
|
||||||
|
case action: MergeAction =>
|
||||||
|
// SPARK-34962: use UpdateStarAction as the explicit representation of * in UpdateAction.
|
||||||
|
// So match and covert this in Spark3.2 env.
|
||||||
|
UpdateAction(action.condition, Seq.empty)
|
||||||
}
|
}
|
||||||
// Resolve the notMatchedActions
|
// Resolve the notMatchedActions
|
||||||
val resolvedNotMatchedActions = notMatchedActions.map {
|
val resolvedNotMatchedActions = notMatchedActions.map {
|
||||||
@@ -249,6 +262,10 @@ case class HoodieResolveReferences(sparkSession: SparkSession) extends Rule[Logi
|
|||||||
val (resolvedCondition, resolvedAssignments) =
|
val (resolvedCondition, resolvedAssignments) =
|
||||||
resolveConditionAssignments(condition, assignments)
|
resolveConditionAssignments(condition, assignments)
|
||||||
InsertAction(resolvedCondition, resolvedAssignments)
|
InsertAction(resolvedCondition, resolvedAssignments)
|
||||||
|
case action: MergeAction =>
|
||||||
|
// SPARK-34962: use InsertStarAction as the explicit representation of * in InsertAction.
|
||||||
|
// So match and covert this in Spark3.2 env.
|
||||||
|
InsertAction(action.condition, Seq.empty)
|
||||||
}
|
}
|
||||||
// Return the resolved MergeIntoTable
|
// Return the resolved MergeIntoTable
|
||||||
MergeIntoTable(target, resolvedSource, resolvedMergeCondition,
|
MergeIntoTable(target, resolvedSource, resolvedMergeCondition,
|
||||||
@@ -426,9 +443,11 @@ case class HoodiePostAnalysisRule(sparkSession: SparkSession) extends Rule[Logic
|
|||||||
case AlterTableChangeColumnCommand(tableName, columnName, newColumn)
|
case AlterTableChangeColumnCommand(tableName, columnName, newColumn)
|
||||||
if isHoodieTable(tableName, sparkSession) =>
|
if isHoodieTable(tableName, sparkSession) =>
|
||||||
AlterHoodieTableChangeColumnCommand(tableName, columnName, newColumn)
|
AlterHoodieTableChangeColumnCommand(tableName, columnName, newColumn)
|
||||||
case ShowPartitionsCommand(tableName, specOpt)
|
// SPARK-34238: the definition of ShowPartitionsCommand has been changed in Spark3.2.
|
||||||
if isHoodieTable(tableName, sparkSession) =>
|
// Match the class type instead of call the `unapply` method.
|
||||||
ShowHoodieTablePartitionsCommand(tableName, specOpt)
|
case s: ShowPartitionsCommand
|
||||||
|
if isHoodieTable(s.tableName, sparkSession) =>
|
||||||
|
ShowHoodieTablePartitionsCommand(s.tableName, s.spec)
|
||||||
// Rewrite TruncateTableCommand to TruncateHoodieTableCommand
|
// Rewrite TruncateTableCommand to TruncateHoodieTableCommand
|
||||||
case TruncateTableCommand(tableName, partitionSpec)
|
case TruncateTableCommand(tableName, partitionSpec)
|
||||||
if isHoodieTable(tableName, sparkSession) =>
|
if isHoodieTable(tableName, sparkSession) =>
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ import org.apache.spark.api.java.JavaSparkContext
|
|||||||
import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
|
import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
|
||||||
import org.apache.spark.sql.catalyst.TableIdentifier
|
import org.apache.spark.sql.catalyst.TableIdentifier
|
||||||
import org.apache.spark.sql.catalyst.catalog.{CatalogTable, HoodieCatalogTable}
|
import org.apache.spark.sql.catalyst.catalog.{CatalogTable, HoodieCatalogTable}
|
||||||
import org.apache.spark.sql.execution.command.{DDLUtils, RunnableCommand}
|
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
|
||||||
import org.apache.spark.sql.types.{StructField, StructType}
|
import org.apache.spark.sql.types.{StructField, StructType}
|
||||||
import org.apache.spark.sql.util.SchemaUtils
|
import org.apache.spark.sql.util.SchemaUtils
|
||||||
|
|
||||||
@@ -44,7 +44,7 @@ import scala.util.control.NonFatal
|
|||||||
case class AlterHoodieTableAddColumnsCommand(
|
case class AlterHoodieTableAddColumnsCommand(
|
||||||
tableId: TableIdentifier,
|
tableId: TableIdentifier,
|
||||||
colsToAdd: Seq[StructField])
|
colsToAdd: Seq[StructField])
|
||||||
extends RunnableCommand {
|
extends HoodieLeafRunnableCommand {
|
||||||
|
|
||||||
override def run(sparkSession: SparkSession): Seq[Row] = {
|
override def run(sparkSession: SparkSession): Seq[Row] = {
|
||||||
if (colsToAdd.nonEmpty) {
|
if (colsToAdd.nonEmpty) {
|
||||||
@@ -74,7 +74,7 @@ case class AlterHoodieTableAddColumnsCommand(
|
|||||||
}
|
}
|
||||||
|
|
||||||
private def refreshSchemaInMeta(sparkSession: SparkSession, table: CatalogTable,
|
private def refreshSchemaInMeta(sparkSession: SparkSession, table: CatalogTable,
|
||||||
newSqlSchema: StructType): Unit = {
|
newSqlDataSchema: StructType): Unit = {
|
||||||
try {
|
try {
|
||||||
sparkSession.catalog.uncacheTable(tableId.quotedString)
|
sparkSession.catalog.uncacheTable(tableId.quotedString)
|
||||||
} catch {
|
} catch {
|
||||||
@@ -84,12 +84,11 @@ case class AlterHoodieTableAddColumnsCommand(
|
|||||||
sparkSession.catalog.refreshTable(table.identifier.unquotedString)
|
sparkSession.catalog.refreshTable(table.identifier.unquotedString)
|
||||||
|
|
||||||
SchemaUtils.checkColumnNameDuplication(
|
SchemaUtils.checkColumnNameDuplication(
|
||||||
newSqlSchema.map(_.name),
|
newSqlDataSchema.map(_.name),
|
||||||
"in the table definition of " + table.identifier,
|
"in the table definition of " + table.identifier,
|
||||||
conf.caseSensitiveAnalysis)
|
conf.caseSensitiveAnalysis)
|
||||||
DDLUtils.checkDataColNames(table, colsToAdd.map(_.name))
|
|
||||||
|
|
||||||
sparkSession.sessionState.catalog.alterTableDataSchema(tableId, newSqlSchema)
|
sparkSession.sessionState.catalog.alterTableDataSchema(tableId, newSqlDataSchema)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -27,7 +27,7 @@ import org.apache.hudi.exception.HoodieException
|
|||||||
import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
|
import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
|
||||||
import org.apache.spark.sql.catalyst.TableIdentifier
|
import org.apache.spark.sql.catalyst.TableIdentifier
|
||||||
import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable
|
import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable
|
||||||
import org.apache.spark.sql.execution.command.RunnableCommand
|
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
|
||||||
import org.apache.spark.sql.types.{StructField, StructType}
|
import org.apache.spark.sql.types.{StructField, StructType}
|
||||||
|
|
||||||
import scala.util.control.NonFatal
|
import scala.util.control.NonFatal
|
||||||
@@ -39,7 +39,7 @@ case class AlterHoodieTableChangeColumnCommand(
|
|||||||
tableIdentifier: TableIdentifier,
|
tableIdentifier: TableIdentifier,
|
||||||
columnName: String,
|
columnName: String,
|
||||||
newColumn: StructField)
|
newColumn: StructField)
|
||||||
extends RunnableCommand {
|
extends HoodieLeafRunnableCommand {
|
||||||
|
|
||||||
override def run(sparkSession: SparkSession): Seq[Row] = {
|
override def run(sparkSession: SparkSession): Seq[Row] = {
|
||||||
val hoodieCatalogTable = HoodieCatalogTable(sparkSession, tableIdentifier)
|
val hoodieCatalogTable = HoodieCatalogTable(sparkSession, tableIdentifier)
|
||||||
|
|||||||
@@ -25,11 +25,12 @@ import org.apache.hudi.config.HoodieWriteConfig.TBL_NAME
|
|||||||
import org.apache.hudi.hive.MultiPartKeysValueExtractor
|
import org.apache.hudi.hive.MultiPartKeysValueExtractor
|
||||||
import org.apache.hudi.hive.ddl.HiveSyncMode
|
import org.apache.hudi.hive.ddl.HiveSyncMode
|
||||||
import org.apache.hudi.{DataSourceWriteOptions, HoodieSparkSqlWriter}
|
import org.apache.hudi.{DataSourceWriteOptions, HoodieSparkSqlWriter}
|
||||||
|
|
||||||
import org.apache.spark.sql.catalyst.TableIdentifier
|
import org.apache.spark.sql.catalyst.TableIdentifier
|
||||||
import org.apache.spark.sql.catalyst.analysis.Resolver
|
import org.apache.spark.sql.catalyst.analysis.Resolver
|
||||||
import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
|
import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
|
||||||
import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable
|
import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable
|
||||||
import org.apache.spark.sql.execution.command.{DDLUtils, RunnableCommand}
|
import org.apache.spark.sql.execution.command.DDLUtils
|
||||||
import org.apache.spark.sql.hudi.HoodieSqlUtils._
|
import org.apache.spark.sql.hudi.HoodieSqlUtils._
|
||||||
import org.apache.spark.sql.{AnalysisException, Row, SaveMode, SparkSession}
|
import org.apache.spark.sql.{AnalysisException, Row, SaveMode, SparkSession}
|
||||||
|
|
||||||
@@ -39,7 +40,7 @@ case class AlterHoodieTableDropPartitionCommand(
|
|||||||
ifExists : Boolean,
|
ifExists : Boolean,
|
||||||
purge : Boolean,
|
purge : Boolean,
|
||||||
retainData : Boolean)
|
retainData : Boolean)
|
||||||
extends RunnableCommand {
|
extends HoodieLeafRunnableCommand {
|
||||||
|
|
||||||
override def run(sparkSession: SparkSession): Seq[Row] = {
|
override def run(sparkSession: SparkSession): Seq[Row] = {
|
||||||
val fullTableName = s"${tableIdentifier.database}.${tableIdentifier.table}"
|
val fullTableName = s"${tableIdentifier.database}.${tableIdentifier.table}"
|
||||||
|
|||||||
@@ -24,12 +24,12 @@ import org.apache.hudi.common.table.timeline.{HoodieActiveTimeline, HoodieTimeli
|
|||||||
import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver}
|
import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver}
|
||||||
import org.apache.hudi.common.util.{HoodieTimer, Option => HOption}
|
import org.apache.hudi.common.util.{HoodieTimer, Option => HOption}
|
||||||
import org.apache.hudi.exception.HoodieException
|
import org.apache.hudi.exception.HoodieException
|
||||||
|
|
||||||
import org.apache.spark.api.java.{JavaRDD, JavaSparkContext}
|
import org.apache.spark.api.java.{JavaRDD, JavaSparkContext}
|
||||||
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
|
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
|
||||||
import org.apache.spark.sql.catalyst.plans.logical.CompactionOperation
|
import org.apache.spark.sql.catalyst.plans.logical.{CompactionOperation, LogicalPlan}
|
||||||
import org.apache.spark.sql.{Row, SparkSession}
|
import org.apache.spark.sql.{Row, SparkSession}
|
||||||
import org.apache.spark.sql.catalyst.plans.logical.CompactionOperation.{CompactionOperation, RUN, SCHEDULE}
|
import org.apache.spark.sql.catalyst.plans.logical.CompactionOperation.{CompactionOperation, RUN, SCHEDULE}
|
||||||
import org.apache.spark.sql.execution.command.RunnableCommand
|
|
||||||
import org.apache.spark.sql.hudi.HoodieSqlUtils
|
import org.apache.spark.sql.hudi.HoodieSqlUtils
|
||||||
import org.apache.spark.sql.types.StringType
|
import org.apache.spark.sql.types.StringType
|
||||||
|
|
||||||
@@ -38,7 +38,7 @@ import scala.collection.JavaConverters._
|
|||||||
|
|
||||||
case class CompactionHoodiePathCommand(path: String,
|
case class CompactionHoodiePathCommand(path: String,
|
||||||
operation: CompactionOperation, instantTimestamp: Option[Long] = None)
|
operation: CompactionOperation, instantTimestamp: Option[Long] = None)
|
||||||
extends RunnableCommand {
|
extends HoodieLeafRunnableCommand {
|
||||||
|
|
||||||
override def run(sparkSession: SparkSession): Seq[Row] = {
|
override def run(sparkSession: SparkSession): Seq[Row] = {
|
||||||
val metaClient = HoodieTableMetaClient.builder().setBasePath(path)
|
val metaClient = HoodieTableMetaClient.builder().setBasePath(path)
|
||||||
|
|||||||
@@ -21,13 +21,13 @@ import org.apache.spark.sql.{Row, SparkSession}
|
|||||||
import org.apache.spark.sql.catalyst.catalog.CatalogTable
|
import org.apache.spark.sql.catalyst.catalog.CatalogTable
|
||||||
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
|
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
|
||||||
import org.apache.spark.sql.catalyst.plans.logical.CompactionOperation.{CompactionOperation, RUN, SCHEDULE}
|
import org.apache.spark.sql.catalyst.plans.logical.CompactionOperation.{CompactionOperation, RUN, SCHEDULE}
|
||||||
import org.apache.spark.sql.execution.command.RunnableCommand
|
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
|
||||||
import org.apache.spark.sql.hudi.HoodieSqlUtils.getTableLocation
|
import org.apache.spark.sql.hudi.HoodieSqlUtils.getTableLocation
|
||||||
import org.apache.spark.sql.types.StringType
|
import org.apache.spark.sql.types.StringType
|
||||||
|
|
||||||
case class CompactionHoodieTableCommand(table: CatalogTable,
|
case class CompactionHoodieTableCommand(table: CatalogTable,
|
||||||
operation: CompactionOperation, instantTimestamp: Option[Long])
|
operation: CompactionOperation, instantTimestamp: Option[Long])
|
||||||
extends RunnableCommand {
|
extends HoodieLeafRunnableCommand {
|
||||||
|
|
||||||
override def run(sparkSession: SparkSession): Seq[Row] = {
|
override def run(sparkSession: SparkSession): Seq[Row] = {
|
||||||
val basePath = getTableLocation(table, sparkSession)
|
val basePath = getTableLocation(table, sparkSession)
|
||||||
|
|||||||
@@ -22,14 +22,14 @@ import org.apache.hudi.common.table.HoodieTableMetaClient
|
|||||||
import org.apache.hudi.common.table.timeline.HoodieTimeline
|
import org.apache.hudi.common.table.timeline.HoodieTimeline
|
||||||
import org.apache.hudi.common.util.CompactionUtils
|
import org.apache.hudi.common.util.CompactionUtils
|
||||||
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
|
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
|
||||||
|
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
|
||||||
import org.apache.spark.sql.{Row, SparkSession}
|
import org.apache.spark.sql.{Row, SparkSession}
|
||||||
import org.apache.spark.sql.execution.command.RunnableCommand
|
|
||||||
import org.apache.spark.sql.types.{IntegerType, StringType}
|
import org.apache.spark.sql.types.{IntegerType, StringType}
|
||||||
|
|
||||||
import scala.collection.JavaConverters.asScalaIteratorConverter
|
import scala.collection.JavaConverters.asScalaIteratorConverter
|
||||||
|
|
||||||
case class CompactionShowHoodiePathCommand(path: String, limit: Int)
|
case class CompactionShowHoodiePathCommand(path: String, limit: Int)
|
||||||
extends RunnableCommand {
|
extends HoodieLeafRunnableCommand {
|
||||||
|
|
||||||
override def run(sparkSession: SparkSession): Seq[Row] = {
|
override def run(sparkSession: SparkSession): Seq[Row] = {
|
||||||
val metaClient = HoodieTableMetaClient.builder().setBasePath(path.toString)
|
val metaClient = HoodieTableMetaClient.builder().setBasePath(path.toString)
|
||||||
|
|||||||
@@ -20,12 +20,12 @@ package org.apache.spark.sql.hudi.command
|
|||||||
import org.apache.spark.sql.{Row, SparkSession}
|
import org.apache.spark.sql.{Row, SparkSession}
|
||||||
import org.apache.spark.sql.catalyst.catalog.CatalogTable
|
import org.apache.spark.sql.catalyst.catalog.CatalogTable
|
||||||
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
|
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
|
||||||
import org.apache.spark.sql.execution.command.RunnableCommand
|
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
|
||||||
import org.apache.spark.sql.hudi.HoodieSqlUtils.getTableLocation
|
import org.apache.spark.sql.hudi.HoodieSqlUtils.getTableLocation
|
||||||
import org.apache.spark.sql.types.{IntegerType, StringType}
|
import org.apache.spark.sql.types.{IntegerType, StringType}
|
||||||
|
|
||||||
case class CompactionShowHoodieTableCommand(table: CatalogTable, limit: Int)
|
case class CompactionShowHoodieTableCommand(table: CatalogTable, limit: Int)
|
||||||
extends RunnableCommand {
|
extends HoodieLeafRunnableCommand {
|
||||||
|
|
||||||
override def run(sparkSession: SparkSession): Seq[Row] = {
|
override def run(sparkSession: SparkSession): Seq[Row] = {
|
||||||
val basePath = getTableLocation(table, sparkSession)
|
val basePath = getTableLocation(table, sparkSession)
|
||||||
|
|||||||
@@ -41,6 +41,10 @@ case class CreateHoodieTableAsSelectCommand(
|
|||||||
mode: SaveMode,
|
mode: SaveMode,
|
||||||
query: LogicalPlan) extends DataWritingCommand {
|
query: LogicalPlan) extends DataWritingCommand {
|
||||||
|
|
||||||
|
def withNewChildInternal(newChild: LogicalPlan): CreateHoodieTableAsSelectCommand = {
|
||||||
|
this
|
||||||
|
}
|
||||||
|
|
||||||
override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = {
|
override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = {
|
||||||
assert(table.tableType != CatalogTableType.VIEW)
|
assert(table.tableType != CatalogTableType.VIEW)
|
||||||
assert(table.provider.isDefined)
|
assert(table.provider.isDefined)
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils
|
|||||||
|
|
||||||
import org.apache.spark.sql.catalyst.analysis.{NoSuchDatabaseException, TableAlreadyExistsException}
|
import org.apache.spark.sql.catalyst.analysis.{NoSuchDatabaseException, TableAlreadyExistsException}
|
||||||
import org.apache.spark.sql.catalyst.catalog._
|
import org.apache.spark.sql.catalyst.catalog._
|
||||||
import org.apache.spark.sql.execution.command.RunnableCommand
|
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
|
||||||
import org.apache.spark.sql.hive.HiveClientUtils
|
import org.apache.spark.sql.hive.HiveClientUtils
|
||||||
import org.apache.spark.sql.hive.HiveExternalCatalog._
|
import org.apache.spark.sql.hive.HiveExternalCatalog._
|
||||||
import org.apache.spark.sql.hudi.{HoodieOptionConfig, HoodieSqlUtils}
|
import org.apache.spark.sql.hudi.{HoodieOptionConfig, HoodieSqlUtils}
|
||||||
@@ -46,7 +46,7 @@ import scala.util.control.NonFatal
|
|||||||
* Command for create hoodie table.
|
* Command for create hoodie table.
|
||||||
*/
|
*/
|
||||||
case class CreateHoodieTableCommand(table: CatalogTable, ignoreIfExists: Boolean)
|
case class CreateHoodieTableCommand(table: CatalogTable, ignoreIfExists: Boolean)
|
||||||
extends RunnableCommand with SparkAdapterSupport {
|
extends HoodieLeafRunnableCommand with SparkAdapterSupport {
|
||||||
|
|
||||||
override def run(sparkSession: SparkSession): Seq[Row] = {
|
override def run(sparkSession: SparkSession): Seq[Row] = {
|
||||||
val tableIsExists = sparkSession.sessionState.catalog.tableExists(table.identifier)
|
val tableIsExists = sparkSession.sessionState.catalog.tableExists(table.identifier)
|
||||||
@@ -198,7 +198,7 @@ object CreateHoodieTableCommand {
|
|||||||
val schemaJsonString = schema.json
|
val schemaJsonString = schema.json
|
||||||
// Split the JSON string.
|
// Split the JSON string.
|
||||||
val parts = schemaJsonString.grouped(threshold).toSeq
|
val parts = schemaJsonString.grouped(threshold).toSeq
|
||||||
properties.put(DATASOURCE_SCHEMA_NUMPARTS, parts.size.toString)
|
properties.put(DATASOURCE_SCHEMA_PREFIX + "numParts", parts.size.toString)
|
||||||
parts.zipWithIndex.foreach { case (part, index) =>
|
parts.zipWithIndex.foreach { case (part, index) =>
|
||||||
properties.put(s"$DATASOURCE_SCHEMA_PART_PREFIX$index", part)
|
properties.put(s"$DATASOURCE_SCHEMA_PART_PREFIX$index", part)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -25,12 +25,11 @@ import org.apache.hudi.{DataSourceWriteOptions, SparkAdapterSupport}
|
|||||||
|
|
||||||
import org.apache.spark.sql._
|
import org.apache.spark.sql._
|
||||||
import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable
|
import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable
|
||||||
import org.apache.spark.sql.catalyst.plans.logical.DeleteFromTable
|
import org.apache.spark.sql.catalyst.plans.logical.{DeleteFromTable, LogicalPlan}
|
||||||
import org.apache.spark.sql.execution.command.RunnableCommand
|
|
||||||
import org.apache.spark.sql.hudi.HoodieSqlUtils._
|
import org.apache.spark.sql.hudi.HoodieSqlUtils._
|
||||||
import org.apache.spark.sql.types.StructType
|
import org.apache.spark.sql.types.StructType
|
||||||
|
|
||||||
case class DeleteHoodieTableCommand(deleteTable: DeleteFromTable) extends RunnableCommand
|
case class DeleteHoodieTableCommand(deleteTable: DeleteFromTable) extends HoodieLeafRunnableCommand
|
||||||
with SparkAdapterSupport {
|
with SparkAdapterSupport {
|
||||||
|
|
||||||
private val table = deleteTable.table
|
private val table = deleteTable.table
|
||||||
|
|||||||
@@ -18,14 +18,14 @@
|
|||||||
package org.apache.spark.sql.hudi.command
|
package org.apache.spark.sql.hudi.command
|
||||||
|
|
||||||
import org.apache.hadoop.fs.Path
|
import org.apache.hadoop.fs.Path
|
||||||
import org.apache.hudi.SparkAdapterSupport
|
|
||||||
import org.apache.hudi.client.common.HoodieSparkEngineContext
|
import org.apache.hudi.client.common.HoodieSparkEngineContext
|
||||||
import org.apache.hudi.common.fs.FSUtils
|
import org.apache.hudi.common.fs.FSUtils
|
||||||
|
|
||||||
import org.apache.spark.sql._
|
import org.apache.spark.sql._
|
||||||
import org.apache.spark.sql.catalyst.TableIdentifier
|
import org.apache.spark.sql.catalyst.TableIdentifier
|
||||||
import org.apache.spark.sql.catalyst.analysis.{NoSuchDatabaseException, NoSuchTableException}
|
import org.apache.spark.sql.catalyst.analysis.{NoSuchDatabaseException, NoSuchTableException}
|
||||||
import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType, HoodieCatalogTable}
|
import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType, HoodieCatalogTable}
|
||||||
import org.apache.spark.sql.execution.command.RunnableCommand
|
|
||||||
import org.apache.spark.sql.hive.HiveClientUtils
|
import org.apache.spark.sql.hive.HiveClientUtils
|
||||||
import org.apache.spark.sql.hudi.HoodieSqlUtils.isEnableHive
|
import org.apache.spark.sql.hudi.HoodieSqlUtils.isEnableHive
|
||||||
|
|
||||||
@@ -35,8 +35,8 @@ case class DropHoodieTableCommand(
|
|||||||
tableIdentifier: TableIdentifier,
|
tableIdentifier: TableIdentifier,
|
||||||
ifExists: Boolean,
|
ifExists: Boolean,
|
||||||
isView: Boolean,
|
isView: Boolean,
|
||||||
purge: Boolean) extends RunnableCommand
|
purge: Boolean)
|
||||||
with SparkAdapterSupport {
|
extends HoodieLeafRunnableCommand {
|
||||||
|
|
||||||
override def run(sparkSession: SparkSession): Seq[Row] = {
|
override def run(sparkSession: SparkSession): Seq[Row] = {
|
||||||
val fullTableName = s"${tableIdentifier.database}.${tableIdentifier.table}"
|
val fullTableName = s"${tableIdentifier.database}.${tableIdentifier.table}"
|
||||||
|
|||||||
@@ -0,0 +1,29 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.spark.sql.hudi.command
|
||||||
|
|
||||||
|
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
|
||||||
|
import org.apache.spark.sql.catalyst.trees.HoodieLeafLike
|
||||||
|
import org.apache.spark.sql.execution.command.RunnableCommand
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Similar to `LeafRunnableCommand` in Spark3.2, `HoodieLeafRunnableCommand` mixed in
|
||||||
|
* `HoodieLeafLike` can avoid subclasses of `RunnableCommand` to override
|
||||||
|
* the `withNewChildrenInternal` method repeatedly.
|
||||||
|
*/
|
||||||
|
trait HoodieLeafRunnableCommand extends RunnableCommand with HoodieLeafLike[LogicalPlan]
|
||||||
@@ -36,7 +36,6 @@ import org.apache.spark.internal.Logging
|
|||||||
import org.apache.spark.sql.catalyst.catalog.{CatalogTable, HoodieCatalogTable}
|
import org.apache.spark.sql.catalyst.catalog.{CatalogTable, HoodieCatalogTable}
|
||||||
import org.apache.spark.sql.catalyst.expressions.{Alias, Literal}
|
import org.apache.spark.sql.catalyst.expressions.{Alias, Literal}
|
||||||
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project}
|
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project}
|
||||||
import org.apache.spark.sql.execution.command.RunnableCommand
|
|
||||||
import org.apache.spark.sql.execution.datasources.LogicalRelation
|
import org.apache.spark.sql.execution.datasources.LogicalRelation
|
||||||
import org.apache.spark.sql.hudi.HoodieSqlUtils._
|
import org.apache.spark.sql.hudi.HoodieSqlUtils._
|
||||||
import org.apache.spark.sql.internal.SQLConf
|
import org.apache.spark.sql.internal.SQLConf
|
||||||
@@ -54,7 +53,7 @@ case class InsertIntoHoodieTableCommand(
|
|||||||
query: LogicalPlan,
|
query: LogicalPlan,
|
||||||
partition: Map[String, Option[String]],
|
partition: Map[String, Option[String]],
|
||||||
overwrite: Boolean)
|
overwrite: Boolean)
|
||||||
extends RunnableCommand {
|
extends HoodieLeafRunnableCommand {
|
||||||
|
|
||||||
override def run(sparkSession: SparkSession): Seq[Row] = {
|
override def run(sparkSession: SparkSession): Seq[Row] = {
|
||||||
assert(logicalRelation.catalogTable.isDefined, "Missing catalog table")
|
assert(logicalRelation.catalogTable.isDefined, "Missing catalog table")
|
||||||
|
|||||||
@@ -32,7 +32,6 @@ import org.apache.spark.sql.catalyst.analysis.Resolver
|
|||||||
import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable
|
import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable
|
||||||
import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeReference, BoundReference, Cast, EqualTo, Expression, Literal}
|
import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeReference, BoundReference, Cast, EqualTo, Expression, Literal}
|
||||||
import org.apache.spark.sql.catalyst.plans.logical._
|
import org.apache.spark.sql.catalyst.plans.logical._
|
||||||
import org.apache.spark.sql.execution.command.RunnableCommand
|
|
||||||
import org.apache.spark.sql.hudi.HoodieSqlUtils._
|
import org.apache.spark.sql.hudi.HoodieSqlUtils._
|
||||||
import org.apache.spark.sql.hudi.command.payload.ExpressionPayload
|
import org.apache.spark.sql.hudi.command.payload.ExpressionPayload
|
||||||
import org.apache.spark.sql.hudi.command.payload.ExpressionPayload._
|
import org.apache.spark.sql.hudi.command.payload.ExpressionPayload._
|
||||||
@@ -60,7 +59,7 @@ import java.util.Base64
|
|||||||
* ExpressionPayload#getInsertValue.
|
* ExpressionPayload#getInsertValue.
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
case class MergeIntoHoodieTableCommand(mergeInto: MergeIntoTable) extends RunnableCommand
|
case class MergeIntoHoodieTableCommand(mergeInto: MergeIntoTable) extends HoodieLeafRunnableCommand
|
||||||
with SparkAdapterSupport {
|
with SparkAdapterSupport {
|
||||||
|
|
||||||
private var sparkSession: SparkSession = _
|
private var sparkSession: SparkSession = _
|
||||||
@@ -203,7 +202,13 @@ case class MergeIntoHoodieTableCommand(mergeInto: MergeIntoTable) extends Runnab
|
|||||||
|
|
||||||
sourceExpression match {
|
sourceExpression match {
|
||||||
case attr: AttributeReference if sourceColumnName.find(resolver(_, attr.name)).get.equals(targetColumnName) => true
|
case attr: AttributeReference if sourceColumnName.find(resolver(_, attr.name)).get.equals(targetColumnName) => true
|
||||||
case Cast(attr: AttributeReference, _, _) if sourceColumnName.find(resolver(_, attr.name)).get.equals(targetColumnName) => true
|
// SPARK-35857: the definition of Cast has been changed in Spark3.2.
|
||||||
|
// Match the class type instead of call the `unapply` method.
|
||||||
|
case cast: Cast =>
|
||||||
|
cast.child match {
|
||||||
|
case attr: AttributeReference if sourceColumnName.find(resolver(_, attr.name)).get.equals(targetColumnName) => true
|
||||||
|
case _ => false
|
||||||
|
}
|
||||||
case _=> false
|
case _=> false
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ import org.apache.spark.sql.catalyst.TableIdentifier
|
|||||||
import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
|
import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
|
||||||
import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable
|
import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable
|
||||||
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
|
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
|
||||||
import org.apache.spark.sql.execution.command.RunnableCommand
|
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
|
||||||
import org.apache.spark.sql.execution.datasources.PartitioningUtils
|
import org.apache.spark.sql.execution.datasources.PartitioningUtils
|
||||||
import org.apache.spark.sql.types.StringType
|
import org.apache.spark.sql.types.StringType
|
||||||
|
|
||||||
@@ -34,7 +34,7 @@ import org.apache.spark.sql.types.StringType
|
|||||||
case class ShowHoodieTablePartitionsCommand(
|
case class ShowHoodieTablePartitionsCommand(
|
||||||
tableIdentifier: TableIdentifier,
|
tableIdentifier: TableIdentifier,
|
||||||
specOpt: Option[TablePartitionSpec])
|
specOpt: Option[TablePartitionSpec])
|
||||||
extends RunnableCommand {
|
extends HoodieLeafRunnableCommand {
|
||||||
|
|
||||||
override val output: Seq[Attribute] = {
|
override val output: Seq[Attribute] = {
|
||||||
AttributeReference("partition", StringType, nullable = false)() :: Nil
|
AttributeReference("partition", StringType, nullable = false)() :: Nil
|
||||||
|
|||||||
@@ -28,15 +28,14 @@ import org.apache.hudi.hive.ddl.HiveSyncMode
|
|||||||
import org.apache.spark.sql._
|
import org.apache.spark.sql._
|
||||||
import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable
|
import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable
|
||||||
import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, Expression}
|
import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, Expression}
|
||||||
import org.apache.spark.sql.catalyst.plans.logical.{Assignment, UpdateTable}
|
import org.apache.spark.sql.catalyst.plans.logical.{Assignment, LogicalPlan, UpdateTable}
|
||||||
import org.apache.spark.sql.execution.command.RunnableCommand
|
|
||||||
import org.apache.spark.sql.hudi.HoodieSqlUtils._
|
import org.apache.spark.sql.hudi.HoodieSqlUtils._
|
||||||
import org.apache.spark.sql.internal.SQLConf
|
import org.apache.spark.sql.internal.SQLConf
|
||||||
import org.apache.spark.sql.types.{StructField, StructType}
|
import org.apache.spark.sql.types.{StructField, StructType}
|
||||||
|
|
||||||
import scala.collection.JavaConverters._
|
import scala.collection.JavaConverters._
|
||||||
|
|
||||||
case class UpdateHoodieTableCommand(updateTable: UpdateTable) extends RunnableCommand
|
case class UpdateHoodieTableCommand(updateTable: UpdateTable) extends HoodieLeafRunnableCommand
|
||||||
with SparkAdapterSupport {
|
with SparkAdapterSupport {
|
||||||
|
|
||||||
private val table = updateTable.table
|
private val table = updateTable.table
|
||||||
|
|||||||
@@ -19,6 +19,7 @@ package org.apache.hudi
|
|||||||
|
|
||||||
import org.apache.hudi.HoodieSparkUtils.convertToCatalystExpressions
|
import org.apache.hudi.HoodieSparkUtils.convertToCatalystExpressions
|
||||||
import org.apache.hudi.HoodieSparkUtils.convertToCatalystExpression
|
import org.apache.hudi.HoodieSparkUtils.convertToCatalystExpression
|
||||||
|
|
||||||
import org.apache.spark.sql.sources.{And, EqualNullSafe, EqualTo, Filter, GreaterThan, GreaterThanOrEqual, In, IsNotNull, IsNull, LessThan, LessThanOrEqual, Not, Or, StringContains, StringEndsWith, StringStartsWith}
|
import org.apache.spark.sql.sources.{And, EqualNullSafe, EqualTo, Filter, GreaterThan, GreaterThanOrEqual, In, IsNotNull, IsNull, LessThan, LessThanOrEqual, Not, Or, StringContains, StringEndsWith, StringStartsWith}
|
||||||
import org.apache.spark.sql.types.{DoubleType, IntegerType, LongType, StringType, StructField, StructType}
|
import org.apache.spark.sql.types.{DoubleType, IntegerType, LongType, StringType, StructField, StructType}
|
||||||
import org.junit.jupiter.api.Assertions.assertEquals
|
import org.junit.jupiter.api.Assertions.assertEquals
|
||||||
@@ -68,22 +69,36 @@ class TestConvertFilterToCatalystExpression {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private def checkConvertFilter(filter: Filter, expectExpression: String): Unit = {
|
private def checkConvertFilter(filter: Filter, expectExpression: String): Unit = {
|
||||||
|
// [SPARK-25769][SPARK-34636][SPARK-34626][SQL] sql method in UnresolvedAttribute,
|
||||||
|
// AttributeReference and Alias don't quote qualified names properly
|
||||||
|
val removeQuotesIfNeed = if (expectExpression != null && HoodieSparkUtils.isSpark3_2) {
|
||||||
|
expectExpression.replace("`", "")
|
||||||
|
} else {
|
||||||
|
expectExpression
|
||||||
|
}
|
||||||
val exp = convertToCatalystExpression(filter, tableSchema)
|
val exp = convertToCatalystExpression(filter, tableSchema)
|
||||||
if (expectExpression == null) {
|
if (removeQuotesIfNeed == null) {
|
||||||
assertEquals(exp.isEmpty, true)
|
assertEquals(exp.isEmpty, true)
|
||||||
} else {
|
} else {
|
||||||
assertEquals(exp.isDefined, true)
|
assertEquals(exp.isDefined, true)
|
||||||
assertEquals(expectExpression, exp.get.sql)
|
assertEquals(removeQuotesIfNeed, exp.get.sql)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private def checkConvertFilters(filters: Array[Filter], expectExpression: String): Unit = {
|
private def checkConvertFilters(filters: Array[Filter], expectExpression: String): Unit = {
|
||||||
|
// [SPARK-25769][SPARK-34636][SPARK-34626][SQL] sql method in UnresolvedAttribute,
|
||||||
|
// AttributeReference and Alias don't quote qualified names properly
|
||||||
|
val removeQuotesIfNeed = if (expectExpression != null && HoodieSparkUtils.isSpark3_2) {
|
||||||
|
expectExpression.replace("`", "")
|
||||||
|
} else {
|
||||||
|
expectExpression
|
||||||
|
}
|
||||||
val exp = convertToCatalystExpressions(filters, tableSchema)
|
val exp = convertToCatalystExpressions(filters, tableSchema)
|
||||||
if (expectExpression == null) {
|
if (removeQuotesIfNeed == null) {
|
||||||
assertEquals(exp.isEmpty, true)
|
assertEquals(exp.isEmpty, true)
|
||||||
} else {
|
} else {
|
||||||
assertEquals(exp.isDefined, true)
|
assertEquals(exp.isDefined, true)
|
||||||
assertEquals(expectExpression, exp.get.sql)
|
assertEquals(removeQuotesIfNeed, exp.get.sql)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -293,7 +293,6 @@ class TestHoodieSparkSqlWriter {
|
|||||||
*/
|
*/
|
||||||
@Test
|
@Test
|
||||||
def testDisableAndEnableMetaFields(): Unit = {
|
def testDisableAndEnableMetaFields(): Unit = {
|
||||||
try {
|
|
||||||
testBulkInsertWithSortMode(BulkInsertSortMode.NONE, populateMetaFields = false)
|
testBulkInsertWithSortMode(BulkInsertSortMode.NONE, populateMetaFields = false)
|
||||||
//create a new table
|
//create a new table
|
||||||
val fooTableModifier = commonTableModifier.updated("hoodie.bulkinsert.shuffle.parallelism", "4")
|
val fooTableModifier = commonTableModifier.updated("hoodie.bulkinsert.shuffle.parallelism", "4")
|
||||||
@@ -316,7 +315,6 @@ class TestHoodieSparkSqlWriter {
|
|||||||
case e: Exception => fail(e);
|
case e: Exception => fail(e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test case for drop duplicates row writing for bulk_insert.
|
* Test case for drop duplicates row writing for bulk_insert.
|
||||||
@@ -711,7 +709,6 @@ class TestHoodieSparkSqlWriter {
|
|||||||
DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "",
|
DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "",
|
||||||
DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.key -> "org.apache.hudi.keygen.NonpartitionedKeyGenerator",
|
DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.key -> "org.apache.hudi.keygen.NonpartitionedKeyGenerator",
|
||||||
HoodieWriteConfig.TBL_NAME.key -> "hoodie_test")
|
HoodieWriteConfig.TBL_NAME.key -> "hoodie_test")
|
||||||
try {
|
|
||||||
val df = spark.range(0, 1000).toDF("keyid")
|
val df = spark.range(0, 1000).toDF("keyid")
|
||||||
.withColumn("col3", expr("keyid"))
|
.withColumn("col3", expr("keyid"))
|
||||||
.withColumn("age", lit(1))
|
.withColumn("age", lit(1))
|
||||||
@@ -757,7 +754,6 @@ class TestHoodieSparkSqlWriter {
|
|||||||
assert(incrementalKeyIdNumBootstrap == 1000)
|
assert(incrementalKeyIdNumBootstrap == 1000)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test case for deletion of partitions.
|
* Test case for deletion of partitions.
|
||||||
|
|||||||
@@ -61,14 +61,18 @@ class TestHoodieSqlBase extends FunSuite with BeforeAndAfterAll {
|
|||||||
}
|
}
|
||||||
|
|
||||||
override protected def test(testName: String, testTags: Tag*)(testFun: => Any /* Assertion */)(implicit pos: source.Position): Unit = {
|
override protected def test(testName: String, testTags: Tag*)(testFun: => Any /* Assertion */)(implicit pos: source.Position): Unit = {
|
||||||
try super.test(testName, testTags: _*)(try testFun finally {
|
super.test(testName, testTags: _*)(
|
||||||
|
try {
|
||||||
|
testFun
|
||||||
|
} finally {
|
||||||
val catalog = spark.sessionState.catalog
|
val catalog = spark.sessionState.catalog
|
||||||
catalog.listDatabases().foreach{db =>
|
catalog.listDatabases().foreach{db =>
|
||||||
catalog.listTables(db).foreach {table =>
|
catalog.listTables(db).foreach {table =>
|
||||||
catalog.dropTable(table, true, true)
|
catalog.dropTable(table, true, true)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
})
|
}
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
protected def generateTableName: String = {
|
protected def generateTableName: String = {
|
||||||
|
|||||||
@@ -17,6 +17,7 @@
|
|||||||
|
|
||||||
package org.apache.spark.sql.hudi
|
package org.apache.spark.sql.hudi
|
||||||
|
|
||||||
|
import org.apache.hudi.HoodieSparkUtils
|
||||||
import org.apache.hudi.common.table.HoodieTableMetaClient
|
import org.apache.hudi.common.table.HoodieTableMetaClient
|
||||||
import org.apache.spark.sql.Row
|
import org.apache.spark.sql.Row
|
||||||
|
|
||||||
@@ -352,7 +353,7 @@ class TestMergeIntoTable2 extends TestHoodieSqlBase {
|
|||||||
| when not matched and flag = '1' then insert *
|
| when not matched and flag = '1' then insert *
|
||||||
|""".stripMargin
|
|""".stripMargin
|
||||||
|
|
||||||
if (HoodieSqlUtils.isSpark3) {
|
if (HoodieSparkUtils.isSpark3) {
|
||||||
checkExceptionContain(mergeSql)("Columns aliases are not allowed in MERGE")
|
checkExceptionContain(mergeSql)("Columns aliases are not allowed in MERGE")
|
||||||
} else {
|
} else {
|
||||||
spark.sql(mergeSql)
|
spark.sql(mergeSql)
|
||||||
|
|||||||
@@ -16,18 +16,26 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
package org.apache.spark.sql.execution.datasources
|
package org.apache.spark.sql.execution.datasources
|
||||||
|
|
||||||
import java.util.TimeZone
|
import java.util.TimeZone
|
||||||
|
|
||||||
import org.apache.hadoop.fs.Path
|
import org.apache.hadoop.fs.Path
|
||||||
import org.apache.spark.sql.execution.datasources.PartitioningUtils.PartitionValues
|
|
||||||
import org.apache.spark.sql.types.DataType
|
import org.apache.spark.sql.types._
|
||||||
|
import org.apache.spark.sql.catalyst.InternalRow
|
||||||
|
|
||||||
class Spark2ParsePartitionUtil extends SparkParsePartitionUtil {
|
class Spark2ParsePartitionUtil extends SparkParsePartitionUtil {
|
||||||
override def parsePartition(path: Path, typeInference: Boolean,
|
|
||||||
|
override def parsePartition(
|
||||||
|
path: Path,
|
||||||
|
typeInference: Boolean,
|
||||||
basePaths: Set[Path],
|
basePaths: Set[Path],
|
||||||
userSpecifiedDataTypes: Map[String, DataType],
|
userSpecifiedDataTypes: Map[String, DataType],
|
||||||
timeZone: TimeZone): Option[PartitionValues] = {
|
timeZone: TimeZone): InternalRow = {
|
||||||
PartitioningUtils.parsePartition(path, typeInference,
|
val (partitionValues, _) = PartitioningUtils.parsePartition(path, typeInference,
|
||||||
basePaths, userSpecifiedDataTypes, timeZone)._1
|
basePaths, userSpecifiedDataTypes, timeZone)
|
||||||
|
|
||||||
|
partitionValues.map(_.literals.map(_.value)).map(InternalRow.fromSeq)
|
||||||
|
.getOrElse(InternalRow.empty)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -17,20 +17,25 @@
|
|||||||
|
|
||||||
package org.apache.hudi.spark3.internal;
|
package org.apache.hudi.spark3.internal;
|
||||||
|
|
||||||
|
import org.apache.hudi.HoodieSparkUtils;
|
||||||
import org.apache.spark.sql.catalyst.plans.logical.InsertIntoStatement;
|
import org.apache.spark.sql.catalyst.plans.logical.InsertIntoStatement;
|
||||||
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan;
|
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan;
|
||||||
|
import org.apache.spark.sql.catalyst.util.DateFormatter;
|
||||||
|
|
||||||
import scala.Option;
|
import scala.Option;
|
||||||
import scala.collection.Seq;
|
import scala.collection.Seq;
|
||||||
import scala.collection.immutable.Map;
|
import scala.collection.immutable.Map;
|
||||||
|
|
||||||
import java.lang.reflect.Constructor;
|
import java.lang.reflect.Constructor;
|
||||||
|
import java.lang.reflect.Method;
|
||||||
|
import java.time.ZoneId;
|
||||||
|
|
||||||
public class ReflectUtil {
|
public class ReflectUtil {
|
||||||
|
|
||||||
public static InsertIntoStatement createInsertInto(boolean isSpark30, LogicalPlan table, Map<String, Option<String>> partition, Seq<String> userSpecifiedCols,
|
public static InsertIntoStatement createInsertInto(LogicalPlan table, Map<String, Option<String>> partition, Seq<String> userSpecifiedCols,
|
||||||
LogicalPlan query, boolean overwrite, boolean ifPartitionNotExists) {
|
LogicalPlan query, boolean overwrite, boolean ifPartitionNotExists) {
|
||||||
try {
|
try {
|
||||||
if (isSpark30) {
|
if (HoodieSparkUtils.isSpark3_0()) {
|
||||||
Constructor<InsertIntoStatement> constructor = InsertIntoStatement.class.getConstructor(
|
Constructor<InsertIntoStatement> constructor = InsertIntoStatement.class.getConstructor(
|
||||||
LogicalPlan.class, Map.class, LogicalPlan.class, boolean.class, boolean.class);
|
LogicalPlan.class, Map.class, LogicalPlan.class, boolean.class, boolean.class);
|
||||||
return constructor.newInstance(table, partition, query, overwrite, ifPartitionNotExists);
|
return constructor.newInstance(table, partition, query, overwrite, ifPartitionNotExists);
|
||||||
@@ -43,4 +48,23 @@ public class ReflectUtil {
|
|||||||
throw new RuntimeException("Error in create InsertIntoStatement", e);
|
throw new RuntimeException("Error in create InsertIntoStatement", e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static DateFormatter getDateFormatter(ZoneId zoneId) {
|
||||||
|
try {
|
||||||
|
ClassLoader loader = Thread.currentThread().getContextClassLoader();
|
||||||
|
if (HoodieSparkUtils.isSpark3_2()) {
|
||||||
|
Class clazz = loader.loadClass(DateFormatter.class.getName());
|
||||||
|
Method applyMethod = clazz.getDeclaredMethod("apply");
|
||||||
|
applyMethod.setAccessible(true);
|
||||||
|
return (DateFormatter)applyMethod.invoke(null);
|
||||||
|
} else {
|
||||||
|
Class clazz = loader.loadClass(DateFormatter.class.getName());
|
||||||
|
Method applyMethod = clazz.getDeclaredMethod("apply", ZoneId.class);
|
||||||
|
applyMethod.setAccessible(true);
|
||||||
|
return (DateFormatter)applyMethod.invoke(null, zoneId);
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
throw new RuntimeException("Error in apply DateFormatter", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ package org.apache.spark.sql.adapter
|
|||||||
import org.apache.hudi.Spark3RowSerDe
|
import org.apache.hudi.Spark3RowSerDe
|
||||||
import org.apache.hudi.client.utils.SparkRowSerDe
|
import org.apache.hudi.client.utils.SparkRowSerDe
|
||||||
import org.apache.hudi.spark3.internal.ReflectUtil
|
import org.apache.hudi.spark3.internal.ReflectUtil
|
||||||
import org.apache.spark.SPARK_VERSION
|
|
||||||
import org.apache.spark.sql.Row
|
import org.apache.spark.sql.Row
|
||||||
import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
|
import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
|
||||||
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
|
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
|
||||||
@@ -79,7 +79,7 @@ class Spark3Adapter extends SparkAdapter {
|
|||||||
|
|
||||||
override def createInsertInto(table: LogicalPlan, partition: Map[String, Option[String]],
|
override def createInsertInto(table: LogicalPlan, partition: Map[String, Option[String]],
|
||||||
query: LogicalPlan, overwrite: Boolean, ifPartitionNotExists: Boolean): LogicalPlan = {
|
query: LogicalPlan, overwrite: Boolean, ifPartitionNotExists: Boolean): LogicalPlan = {
|
||||||
ReflectUtil.createInsertInto(SPARK_VERSION.startsWith("3.0"), table, partition, Seq.empty[String], query, overwrite, ifPartitionNotExists)
|
ReflectUtil.createInsertInto(table, partition, Seq.empty[String], query, overwrite, ifPartitionNotExists)
|
||||||
}
|
}
|
||||||
|
|
||||||
override def createSparkParsePartitionUtil(conf: SQLConf): SparkParsePartitionUtil = {
|
override def createSparkParsePartitionUtil(conf: SQLConf): SparkParsePartitionUtil = {
|
||||||
|
|||||||
@@ -16,24 +16,259 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
package org.apache.spark.sql.execution.datasources
|
package org.apache.spark.sql.execution.datasources
|
||||||
import java.util.TimeZone
|
|
||||||
|
import java.lang.{Double => JDouble, Long => JLong}
|
||||||
|
import java.math.{BigDecimal => JBigDecimal}
|
||||||
|
import java.time.ZoneId
|
||||||
|
import java.util.{Locale, TimeZone}
|
||||||
|
|
||||||
import org.apache.hadoop.fs.Path
|
import org.apache.hadoop.fs.Path
|
||||||
|
|
||||||
|
import org.apache.hudi.common.util.PartitionPathEncodeUtils.DEFAULT_PARTITION_PATH
|
||||||
|
import org.apache.hudi.spark3.internal.ReflectUtil
|
||||||
|
|
||||||
|
import org.apache.spark.sql.catalyst.InternalRow
|
||||||
|
import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils.unescapePathName
|
||||||
|
import org.apache.spark.sql.catalyst.expressions.{Cast, Literal}
|
||||||
import org.apache.spark.sql.catalyst.util.{DateFormatter, TimestampFormatter}
|
import org.apache.spark.sql.catalyst.util.{DateFormatter, TimestampFormatter}
|
||||||
import org.apache.spark.sql.execution.datasources.PartitioningUtils.{PartitionValues, timestampPartitionPattern}
|
import org.apache.spark.sql.execution.datasources.PartitioningUtils.timestampPartitionPattern
|
||||||
import org.apache.spark.sql.internal.SQLConf
|
import org.apache.spark.sql.internal.SQLConf
|
||||||
import org.apache.spark.sql.types.DataType
|
import org.apache.spark.sql.types._
|
||||||
|
import org.apache.spark.unsafe.types.UTF8String
|
||||||
|
|
||||||
|
import scala.collection.mutable.ArrayBuffer
|
||||||
|
import scala.util.Try
|
||||||
|
import scala.util.control.NonFatal
|
||||||
|
|
||||||
class Spark3ParsePartitionUtil(conf: SQLConf) extends SparkParsePartitionUtil {
|
class Spark3ParsePartitionUtil(conf: SQLConf) extends SparkParsePartitionUtil {
|
||||||
|
|
||||||
override def parsePartition(path: Path, typeInference: Boolean,
|
/**
|
||||||
basePaths: Set[Path], userSpecifiedDataTypes: Map[String, DataType],
|
* The definition of PartitionValues has been changed by SPARK-34314 in Spark3.2.
|
||||||
timeZone: TimeZone): Option[PartitionValues] = {
|
* To solve the compatibility between 3.1 and 3.2, copy some codes from PartitioningUtils in Spark3.2 here.
|
||||||
val dateFormatter = DateFormatter(timeZone.toZoneId)
|
* And this method will generate and return `InternalRow` directly instead of `PartitionValues`.
|
||||||
|
*/
|
||||||
|
override def parsePartition(
|
||||||
|
path: Path,
|
||||||
|
typeInference: Boolean,
|
||||||
|
basePaths: Set[Path],
|
||||||
|
userSpecifiedDataTypes: Map[String, DataType],
|
||||||
|
timeZone: TimeZone): InternalRow = {
|
||||||
|
val dateFormatter = ReflectUtil.getDateFormatter(timeZone.toZoneId)
|
||||||
val timestampFormatter = TimestampFormatter(timestampPartitionPattern,
|
val timestampFormatter = TimestampFormatter(timestampPartitionPattern,
|
||||||
timeZone.toZoneId, isParsing = true)
|
timeZone.toZoneId, isParsing = true)
|
||||||
|
|
||||||
PartitioningUtils.parsePartition(path, typeInference, basePaths, userSpecifiedDataTypes,
|
val (partitionValues, _) = parsePartition(path, typeInference, basePaths, userSpecifiedDataTypes,
|
||||||
conf.validatePartitionColumns, timeZone.toZoneId, dateFormatter, timestampFormatter)._1
|
conf.validatePartitionColumns, timeZone.toZoneId, dateFormatter, timestampFormatter)
|
||||||
|
|
||||||
|
partitionValues.map {
|
||||||
|
case PartitionValues(columnNames: Seq[String], typedValues: Seq[TypedPartValue]) =>
|
||||||
|
val rowValues = columnNames.zip(typedValues).map { case (columnName, typedValue) =>
|
||||||
|
try {
|
||||||
|
castPartValueToDesiredType(typedValue.dataType, typedValue.value, timeZone.toZoneId)
|
||||||
|
} catch {
|
||||||
|
case NonFatal(_) =>
|
||||||
|
if (conf.validatePartitionColumns) {
|
||||||
|
throw new RuntimeException(s"Failed to cast value `${typedValue.value}` to " +
|
||||||
|
s"`${typedValue.dataType}` for partition column `$columnName`")
|
||||||
|
} else null
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
InternalRow.fromSeq(rowValues)
|
||||||
|
}.getOrElse(InternalRow.empty)
|
||||||
|
}
|
||||||
|
|
||||||
|
case class TypedPartValue(value: String, dataType: DataType)
|
||||||
|
|
||||||
|
case class PartitionValues(columnNames: Seq[String], typedValues: Seq[TypedPartValue])
|
||||||
|
{
|
||||||
|
require(columnNames.size == typedValues.size)
|
||||||
|
}
|
||||||
|
|
||||||
|
private def parsePartition(
|
||||||
|
path: Path,
|
||||||
|
typeInference: Boolean,
|
||||||
|
basePaths: Set[Path],
|
||||||
|
userSpecifiedDataTypes: Map[String, DataType],
|
||||||
|
validatePartitionColumns: Boolean,
|
||||||
|
zoneId: ZoneId,
|
||||||
|
dateFormatter: DateFormatter,
|
||||||
|
timestampFormatter: TimestampFormatter): (Option[PartitionValues], Option[Path]) = {
|
||||||
|
|
||||||
|
val columns = ArrayBuffer.empty[(String, TypedPartValue)]
|
||||||
|
// Old Hadoop versions don't have `Path.isRoot`
|
||||||
|
var finished = path.getParent == null
|
||||||
|
// currentPath is the current path that we will use to parse partition column value.
|
||||||
|
var currentPath: Path = path
|
||||||
|
|
||||||
|
while (!finished) {
|
||||||
|
// Sometimes (e.g., when speculative task is enabled), temporary directories may be left
|
||||||
|
// uncleaned. Here we simply ignore them.
|
||||||
|
if (currentPath.getName.toLowerCase(Locale.ROOT) == "_temporary") {
|
||||||
|
// scalastyle:off return
|
||||||
|
return (None, None)
|
||||||
|
// scalastyle:on return
|
||||||
|
}
|
||||||
|
|
||||||
|
if (basePaths.contains(currentPath)) {
|
||||||
|
// If the currentPath is one of base paths. We should stop.
|
||||||
|
finished = true
|
||||||
|
} else {
|
||||||
|
// Let's say currentPath is a path of "/table/a=1/", currentPath.getName will give us a=1.
|
||||||
|
// Once we get the string, we try to parse it and find the partition column and value.
|
||||||
|
val maybeColumn =
|
||||||
|
parsePartitionColumn(currentPath.getName, typeInference, userSpecifiedDataTypes,
|
||||||
|
validatePartitionColumns, zoneId, dateFormatter, timestampFormatter)
|
||||||
|
maybeColumn.foreach(columns += _)
|
||||||
|
|
||||||
|
// Now, we determine if we should stop.
|
||||||
|
// When we hit any of the following cases, we will stop:
|
||||||
|
// - In this iteration, we could not parse the value of partition column and value,
|
||||||
|
// i.e. maybeColumn is None, and columns is not empty. At here we check if columns is
|
||||||
|
// empty to handle cases like /table/a=1/_temporary/something (we need to find a=1 in
|
||||||
|
// this case).
|
||||||
|
// - After we get the new currentPath, this new currentPath represent the top level dir
|
||||||
|
// i.e. currentPath.getParent == null. For the example of "/table/a=1/",
|
||||||
|
// the top level dir is "/table".
|
||||||
|
finished =
|
||||||
|
(maybeColumn.isEmpty && !columns.isEmpty) || currentPath.getParent == null
|
||||||
|
|
||||||
|
if (!finished) {
|
||||||
|
// For the above example, currentPath will be "/table/".
|
||||||
|
currentPath = currentPath.getParent
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (columns.isEmpty) {
|
||||||
|
(None, Some(path))
|
||||||
|
} else {
|
||||||
|
val (columnNames, values) = columns.reverse.unzip
|
||||||
|
(Some(PartitionValues(columnNames.toSeq, values.toSeq)), Some(currentPath))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private def parsePartitionColumn(
|
||||||
|
columnSpec: String,
|
||||||
|
typeInference: Boolean,
|
||||||
|
userSpecifiedDataTypes: Map[String, DataType],
|
||||||
|
validatePartitionColumns: Boolean,
|
||||||
|
zoneId: ZoneId,
|
||||||
|
dateFormatter: DateFormatter,
|
||||||
|
timestampFormatter: TimestampFormatter): Option[(String, TypedPartValue)] = {
|
||||||
|
val equalSignIndex = columnSpec.indexOf('=')
|
||||||
|
if (equalSignIndex == -1) {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
val columnName = unescapePathName(columnSpec.take(equalSignIndex))
|
||||||
|
assert(columnName.nonEmpty, s"Empty partition column name in '$columnSpec'")
|
||||||
|
|
||||||
|
val rawColumnValue = columnSpec.drop(equalSignIndex + 1)
|
||||||
|
assert(rawColumnValue.nonEmpty, s"Empty partition column value in '$columnSpec'")
|
||||||
|
|
||||||
|
val dataType = if (userSpecifiedDataTypes.contains(columnName)) {
|
||||||
|
// SPARK-26188: if user provides corresponding column schema, get the column value without
|
||||||
|
// inference, and then cast it as user specified data type.
|
||||||
|
userSpecifiedDataTypes(columnName)
|
||||||
|
} else {
|
||||||
|
inferPartitionColumnValue(
|
||||||
|
rawColumnValue,
|
||||||
|
typeInference,
|
||||||
|
zoneId,
|
||||||
|
dateFormatter,
|
||||||
|
timestampFormatter)
|
||||||
|
}
|
||||||
|
Some(columnName -> TypedPartValue(rawColumnValue, dataType))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private def inferPartitionColumnValue(
|
||||||
|
raw: String,
|
||||||
|
typeInference: Boolean,
|
||||||
|
zoneId: ZoneId,
|
||||||
|
dateFormatter: DateFormatter,
|
||||||
|
timestampFormatter: TimestampFormatter): DataType = {
|
||||||
|
val decimalTry = Try {
|
||||||
|
// `BigDecimal` conversion can fail when the `field` is not a form of number.
|
||||||
|
val bigDecimal = new JBigDecimal(raw)
|
||||||
|
// It reduces the cases for decimals by disallowing values having scale (e.g. `1.1`).
|
||||||
|
require(bigDecimal.scale <= 0)
|
||||||
|
// `DecimalType` conversion can fail when
|
||||||
|
// 1. The precision is bigger than 38.
|
||||||
|
// 2. scale is bigger than precision.
|
||||||
|
fromDecimal(Decimal(bigDecimal))
|
||||||
|
}
|
||||||
|
|
||||||
|
val dateTry = Try {
|
||||||
|
// try and parse the date, if no exception occurs this is a candidate to be resolved as
|
||||||
|
// DateType
|
||||||
|
dateFormatter.parse(raw)
|
||||||
|
// SPARK-23436: Casting the string to date may still return null if a bad Date is provided.
|
||||||
|
// This can happen since DateFormat.parse may not use the entire text of the given string:
|
||||||
|
// so if there are extra-characters after the date, it returns correctly.
|
||||||
|
// We need to check that we can cast the raw string since we later can use Cast to get
|
||||||
|
// the partition values with the right DataType (see
|
||||||
|
// org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex.inferPartitioning)
|
||||||
|
val dateValue = Cast(Literal(raw), DateType, Some(zoneId.getId)).eval()
|
||||||
|
// Disallow DateType if the cast returned null
|
||||||
|
require(dateValue != null)
|
||||||
|
DateType
|
||||||
|
}
|
||||||
|
|
||||||
|
val timestampTry = Try {
|
||||||
|
val unescapedRaw = unescapePathName(raw)
|
||||||
|
// the inferred data type is consistent with the default timestamp type
|
||||||
|
val timestampType = TimestampType
|
||||||
|
// try and parse the date, if no exception occurs this is a candidate to be resolved as TimestampType
|
||||||
|
timestampFormatter.parse(unescapedRaw)
|
||||||
|
|
||||||
|
// SPARK-23436: see comment for date
|
||||||
|
val timestampValue = Cast(Literal(unescapedRaw), timestampType, Some(zoneId.getId)).eval()
|
||||||
|
// Disallow TimestampType if the cast returned null
|
||||||
|
require(timestampValue != null)
|
||||||
|
timestampType
|
||||||
|
}
|
||||||
|
|
||||||
|
if (typeInference) {
|
||||||
|
// First tries integral types
|
||||||
|
Try({ Integer.parseInt(raw); IntegerType })
|
||||||
|
.orElse(Try { JLong.parseLong(raw); LongType })
|
||||||
|
.orElse(decimalTry)
|
||||||
|
// Then falls back to fractional types
|
||||||
|
.orElse(Try { JDouble.parseDouble(raw); DoubleType })
|
||||||
|
// Then falls back to date/timestamp types
|
||||||
|
.orElse(timestampTry)
|
||||||
|
.orElse(dateTry)
|
||||||
|
// Then falls back to string
|
||||||
|
.getOrElse {
|
||||||
|
if (raw == DEFAULT_PARTITION_PATH) NullType else StringType
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (raw == DEFAULT_PARTITION_PATH) NullType else StringType
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
def castPartValueToDesiredType(
|
||||||
|
desiredType: DataType,
|
||||||
|
value: String,
|
||||||
|
zoneId: ZoneId): Any = desiredType match {
|
||||||
|
case _ if value == DEFAULT_PARTITION_PATH => null
|
||||||
|
case NullType => null
|
||||||
|
case StringType => UTF8String.fromString(unescapePathName(value))
|
||||||
|
case IntegerType => Integer.parseInt(value)
|
||||||
|
case LongType => JLong.parseLong(value)
|
||||||
|
case DoubleType => JDouble.parseDouble(value)
|
||||||
|
case _: DecimalType => Literal(new JBigDecimal(value)).value
|
||||||
|
case DateType =>
|
||||||
|
Cast(Literal(value), DateType, Some(zoneId.getId)).eval()
|
||||||
|
// Timestamp types
|
||||||
|
case dt: TimestampType =>
|
||||||
|
Try {
|
||||||
|
Cast(Literal(unescapePathName(value)), dt, Some(zoneId.getId)).eval()
|
||||||
|
}.getOrElse {
|
||||||
|
Cast(Cast(Literal(value), DateType, Some(zoneId.getId)), dt).eval()
|
||||||
|
}
|
||||||
|
case dt => throw new IllegalArgumentException(s"Unexpected type $dt")
|
||||||
|
}
|
||||||
|
|
||||||
|
private def fromDecimal(d: Decimal): DecimalType = DecimalType(d.precision, d.scale)
|
||||||
|
}
|
||||||
|
|||||||
@@ -40,7 +40,6 @@ public class TestReflectUtil extends HoodieClientTestBase {
|
|||||||
InsertIntoStatement statement = (InsertIntoStatement) spark.sessionState().sqlParser().parsePlan(insertIntoSql);
|
InsertIntoStatement statement = (InsertIntoStatement) spark.sessionState().sqlParser().parsePlan(insertIntoSql);
|
||||||
|
|
||||||
InsertIntoStatement newStatment = ReflectUtil.createInsertInto(
|
InsertIntoStatement newStatment = ReflectUtil.createInsertInto(
|
||||||
spark.version().startsWith("3.0"),
|
|
||||||
statement.table(),
|
statement.table(),
|
||||||
statement.partitionSpec(),
|
statement.partitionSpec(),
|
||||||
scala.collection.immutable.List.empty(),
|
scala.collection.immutable.List.empty(),
|
||||||
|
|||||||
29
pom.xml
29
pom.xml
@@ -118,7 +118,7 @@
|
|||||||
<sparkbundle.version>${spark2bundle.version}</sparkbundle.version>
|
<sparkbundle.version>${spark2bundle.version}</sparkbundle.version>
|
||||||
<flink.version>1.13.1</flink.version>
|
<flink.version>1.13.1</flink.version>
|
||||||
<spark2.version>2.4.4</spark2.version>
|
<spark2.version>2.4.4</spark2.version>
|
||||||
<spark3.version>3.1.2</spark3.version>
|
<spark3.version>3.2.0</spark3.version>
|
||||||
<spark2bundle.version></spark2bundle.version>
|
<spark2bundle.version></spark2bundle.version>
|
||||||
<spark3bundle.version>3</spark3bundle.version>
|
<spark3bundle.version>3</spark3bundle.version>
|
||||||
<hudi.spark.module>hudi-spark2</hudi.spark.module>
|
<hudi.spark.module>hudi-spark2</hudi.spark.module>
|
||||||
@@ -1515,11 +1515,36 @@
|
|||||||
</activation>
|
</activation>
|
||||||
</profile>
|
</profile>
|
||||||
|
|
||||||
|
<profile>
|
||||||
|
<id>spark3.1.x</id>
|
||||||
|
<properties>
|
||||||
|
<spark3.version>3.1.2</spark3.version>
|
||||||
|
<spark.version>${spark3.version}</spark.version>
|
||||||
|
<sparkbundle.version>${spark3bundle.version}</sparkbundle.version>
|
||||||
|
<scala.version>${scala12.version}</scala.version>
|
||||||
|
<scala.binary.version>2.12</scala.binary.version>
|
||||||
|
<hudi.spark.module>hudi-spark3</hudi.spark.module>
|
||||||
|
<scalatest.version>3.1.0</scalatest.version>
|
||||||
|
<kafka.version>2.4.1</kafka.version>
|
||||||
|
<fasterxml.version>${fasterxml.spark3.version}</fasterxml.version>
|
||||||
|
<fasterxml.jackson.databind.version>${fasterxml.spark3.version}</fasterxml.jackson.databind.version>
|
||||||
|
<fasterxml.jackson.module.scala.version>${fasterxml.spark3.version}</fasterxml.jackson.module.scala.version>
|
||||||
|
<fasterxml.jackson.dataformat.yaml.version>${fasterxml.spark3.version}</fasterxml.jackson.dataformat.yaml.version>
|
||||||
|
<skip.hudi-spark2.unit.tests>true</skip.hudi-spark2.unit.tests>
|
||||||
|
<skipITs>true</skipITs>
|
||||||
|
</properties>
|
||||||
|
<activation>
|
||||||
|
<property>
|
||||||
|
<name>spark3</name>
|
||||||
|
</property>
|
||||||
|
</activation>
|
||||||
|
</profile>
|
||||||
|
|
||||||
<profile>
|
<profile>
|
||||||
<id>spark3.0.x</id>
|
<id>spark3.0.x</id>
|
||||||
<!-- for spark 3.0.x we need override the follow propeprties to package and run test-->
|
<!-- for spark 3.0.x we need override the follow propeprties to package and run test-->
|
||||||
<properties>
|
<properties>
|
||||||
<spark3.version>3.0.0</spark3.version>
|
<spark3.version>3.0.3</spark3.version>
|
||||||
<spark.version>${spark3.version}</spark.version>
|
<spark.version>${spark3.version}</spark.version>
|
||||||
<scalatest.version>3.0.1</scalatest.version>
|
<scalatest.version>3.0.1</scalatest.version>
|
||||||
</properties>
|
</properties>
|
||||||
|
|||||||
Reference in New Issue
Block a user