1
0

[HUDI-3254] Introduce HoodieCatalog to manage tables for Spark Datasource V2 (#4611)

This commit is contained in:
leesf
2022-02-14 22:26:58 +08:00
committed by GitHub
parent 5ca4480a38
commit 0db1e978c6
26 changed files with 1288 additions and 81 deletions

View File

@@ -53,8 +53,18 @@ object HoodieSparkUtils extends SparkAdapterSupport {
def isSpark3_0: Boolean = SPARK_VERSION.startsWith("3.0")
def isSpark3_1: Boolean = SPARK_VERSION.startsWith("3.1")
def isSpark3_2: Boolean = SPARK_VERSION.startsWith("3.2")
def beforeSpark3_2(): Boolean = {
if (isSpark2 || isSpark3_0 || isSpark3_1) {
true
} else {
false
}
}
def getMetaSchema: StructType = {
StructType(HoodieRecord.HOODIE_META_COLUMNS.asScala.map(col => {
StructField(col, StringType, nullable = true)

View File

@@ -18,19 +18,22 @@
package org.apache.spark.sql.hudi
import org.apache.hudi.HoodieSparkUtils.sparkAdapter
import org.apache.hudi.client.utils.SparkRowSerDe
import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.parser.ParserInterface
import org.apache.spark.sql.catalyst.plans.JoinType
import org.apache.spark.sql.catalyst.plans.logical.{Join, LogicalPlan}
import org.apache.spark.sql.catalyst.plans.logical.{Join, LogicalPlan, SubqueryAlias}
import org.apache.spark.sql.catalyst.{AliasIdentifier, TableIdentifier}
import org.apache.spark.sql.execution.datasources.{FilePartition, PartitionedFile, SparkParsePartitionUtil}
import org.apache.spark.sql.execution.datasources.{FilePartition, LogicalRelation, PartitionedFile, SparkParsePartitionUtil}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.{Row, SparkSession}
import java.util.Locale
/**
* An interface to adapter the difference between spark2 and spark3
* in some spark related class.
@@ -99,4 +102,35 @@ trait SparkAdapter extends Serializable {
*/
def getFilePartitions(sparkSession: SparkSession, partitionedFiles: Seq[PartitionedFile],
maxSplitBytes: Long): Seq[FilePartition]
def isHoodieTable(table: LogicalPlan, spark: SparkSession): Boolean = {
tripAlias(table) match {
case LogicalRelation(_, _, Some(tbl), _) => isHoodieTable(tbl)
case relation: UnresolvedRelation =>
isHoodieTable(toTableIdentifier(relation), spark)
case _=> false
}
}
def isHoodieTable(map: java.util.Map[String, String]): Boolean = {
map.getOrDefault("provider", "").equals("hudi")
}
def isHoodieTable(table: CatalogTable): Boolean = {
table.provider.map(_.toLowerCase(Locale.ROOT)).orNull == "hudi"
}
def isHoodieTable(tableId: TableIdentifier, spark: SparkSession): Boolean = {
val table = spark.sessionState.catalog.getTableMetadata(tableId)
isHoodieTable(table)
}
def tripAlias(plan: LogicalPlan): LogicalPlan = {
plan match {
case SubqueryAlias(_, relation: LogicalPlan) =>
tripAlias(relation)
case other =>
other
}
}
}