[HUDI-1842] Spark Sql Support For pre-existing Hoodie Table (#3393)
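For context, a minimal usage sketch of what this change enables (adapted from the tests added at the end of this diff; the table name and location below are illustrative placeholders): a Hoodie table written with the DataFrame API can be registered through Spark SQL `create table ... using hudi ... location ...`, with the schema and missing table configs filled in from hoodie.properties.

    // Hypothetical example; the table name and path are placeholders.
    spark.sql(
      s"""
         |create table existing_hudi_tbl using hudi
         | options (
         |  primaryKey = 'id',
         |  preCombineField = 'ts'
         |)
         |partitioned by (dt)
         |location '/path/to/existing/hoodie/table'
         |""".stripMargin)
    // After registration, insert into / merge into / update / delete statements
    // work against the pre-existing table, as exercised by the new tests below.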
@@ -26,7 +26,6 @@ import org.apache.hudi.common.model.HoodieTableType.{COPY_ON_WRITE, MERGE_ON_REA
 import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver}
 import org.apache.hudi.exception.HoodieException
 import org.apache.hudi.hadoop.HoodieROTablePathFilter
-import org.apache.hudi.hive.util.ConfigUtils
 import org.apache.log4j.LogManager
 import org.apache.spark.sql.avro.SchemaConverters
 import org.apache.spark.sql.execution.datasources.{DataSource, FileStatusCache, HadoopFsRelation}
@@ -126,12 +126,9 @@ object HoodieOptionConfig {

   /**
    * Mapping the table config (loaded from the hoodie.properties) to the sql options.
-   * @param options
-   * @return
    */
   def mappingTableConfigToSqlOption(options: Map[String, String]): Map[String, String] = {
-    options.filterKeys(k => tableConfigKeyToSqlKey.contains(k))
-      .map(kv => tableConfigKeyToSqlKey(kv._1) -> reverseValueMapping.getOrElse(kv._2, kv._2))
+    options.map(kv => tableConfigKeyToSqlKey.getOrElse(kv._1, kv._1) -> reverseValueMapping.getOrElse(kv._2, kv._2))
   }

   private lazy val defaultTableConfig: Map[String, String] = {
@@ -24,7 +24,9 @@ import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.hudi.SparkAdapterSupport
 import org.apache.hudi.common.model.HoodieRecord
+import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver}
 import org.apache.spark.SPARK_VERSION
+import org.apache.spark.sql.avro.SchemaConverters
 import org.apache.spark.sql.{Column, DataFrame, SparkSession}
 import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
@@ -64,6 +66,16 @@ object HoodieSqlUtils extends SparkAdapterSupport {
     }
   }

+  def getTableSqlSchema(metaClient: HoodieTableMetaClient): Option[StructType] = {
+    val schemaResolver = new TableSchemaResolver(metaClient)
+    val avroSchema = try Some(schemaResolver.getTableAvroSchema(false))
+    catch {
+      case _: Throwable => None
+    }
+    avroSchema.map(SchemaConverters.toSqlType(_).dataType
+      .asInstanceOf[StructType])
+  }
+
   private def tripAlias(plan: LogicalPlan): LogicalPlan = {
     plan match {
       case SubqueryAlias(_, relation: LogicalPlan) =>
@@ -122,12 +134,12 @@ object HoodieSqlUtils extends SparkAdapterSupport {
    * @param spark
    * @return
    */
-  def getTableLocation(tableId: TableIdentifier, spark: SparkSession): Option[String] = {
+  def getTableLocation(tableId: TableIdentifier, spark: SparkSession): String = {
     val table = spark.sessionState.catalog.getTableMetadata(tableId)
     getTableLocation(table, spark)
   }

-  def getTableLocation(table: CatalogTable, sparkSession: SparkSession): Option[String] = {
+  def getTableLocation(table: CatalogTable, sparkSession: SparkSession): String = {
     val uri = if (table.tableType == CatalogTableType.MANAGED && isHoodieTable(table)) {
       Some(sparkSession.sessionState.catalog.defaultTablePath(table.identifier))
     } else {
@@ -136,6 +148,7 @@ object HoodieSqlUtils extends SparkAdapterSupport {
     val conf = sparkSession.sessionState.newHadoopConf()
     uri.map(makePathQualified(_, conf))
       .map(removePlaceHolder)
+      .getOrElse(throw new IllegalArgumentException(s"Missing location for ${table.identifier}"))
   }

   private def removePlaceHolder(path: String): String = {
@@ -154,6 +167,16 @@ object HoodieSqlUtils extends SparkAdapterSupport {
     fs.makeQualified(hadoopPath).toUri.toString
   }

+  /**
+   * Check if the hoodie.properties exists in the table path.
+   */
+  def tableExistsInPath(tablePath: String, conf: Configuration): Boolean = {
+    val basePath = new Path(tablePath)
+    val fs = basePath.getFileSystem(conf)
+    val metaPath = new Path(basePath, HoodieTableMetaClient.METAFOLDER_NAME)
+    fs.exists(metaPath)
+  }
+
   def castIfNeeded(child: Expression, dataType: DataType, conf: SQLConf): Expression = {
     child match {
       case Literal(nul, NullType) => Literal(nul, dataType)
@@ -21,7 +21,8 @@ import org.apache.hudi.DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL
 import org.apache.hudi.SparkAdapterSupport

 import scala.collection.JavaConverters._
-import org.apache.hudi.common.model.{HoodieRecord, HoodieTableType}
+import org.apache.hudi.common.model.HoodieRecord
+import org.apache.hudi.common.table.HoodieTableMetaClient
 import org.apache.spark.sql.{AnalysisException, SparkSession}
 import org.apache.spark.sql.catalyst.analysis.UnresolvedStar
 import org.apache.spark.sql.catalyst.expressions.AttributeReference
@@ -286,7 +287,28 @@ case class HoodieResolveReferences(sparkSession: SparkSession) extends Rule[Logi
       } else {
         l
       }
+    // Fill schema for Create Table without specify schema info
+    case c @ CreateTable(tableDesc, _, _)
+      if isHoodieTable(tableDesc) =>
+      val tablePath = getTableLocation(c.tableDesc, sparkSession)
+      if (tableExistsInPath(tablePath, sparkSession.sessionState.newHadoopConf())) {
+        val metaClient = HoodieTableMetaClient.builder()
+          .setBasePath(tablePath)
+          .setConf(sparkSession.sessionState.newHadoopConf())
+          .build()
+        val tableSchema = HoodieSqlUtils.getTableSqlSchema(metaClient).map(HoodieSqlUtils.addMetaFields)
+        if (tableSchema.isDefined && tableDesc.schema.isEmpty) {
+          // Fill the schema with the schema from the table
+          c.copy(tableDesc.copy(schema = tableSchema.get))
+        } else if (tableSchema.isDefined && tableDesc.schema != tableSchema.get) {
+          throw new AnalysisException(s"Specified schema in create table statement is not equal to the table schema." +
+            s"You should not specify the schema for an exist table: ${tableDesc.identifier} ")
+        } else {
+          c
+        }
+      } else {
+        c
+      }
     case p => p
   }

@@ -103,7 +103,6 @@ object AlterHoodieTableAddColumnsCommand {
    */
   def commitWithSchema(schema: Schema, table: CatalogTable, sparkSession: SparkSession): Unit = {
     val path = getTableLocation(table, sparkSession)
-      .getOrElse(s"missing location for ${table.identifier}")

     val jsc = new JavaSparkContext(sparkSession.sparkContext)
     val client = DataSourceUtils.createHoodieClient(jsc, schema.toString,
@@ -68,7 +68,6 @@ case class AlterHoodieTableChangeColumnCommand(
     val newSchema = AvroConversionUtils.convertStructTypeToAvroSchema(newSqlSchema, structName, nameSpace)

     val path = getTableLocation(table, sparkSession)
-      .getOrElse(s"missing location for ${table.identifier}")
     val hadoopConf = sparkSession.sessionState.newHadoopConf()
     val metaClient = HoodieTableMetaClient.builder().setBasePath(path)
       .setConf(hadoopConf).build()
@@ -37,7 +37,6 @@ class AlterHoodieTableRenameCommand(
     val catalog = sparkSession.sessionState.catalog
     val table = catalog.getTableMetadata(oldName)
     val path = getTableLocation(table, sparkSession)
-      .getOrElse(s"missing location for ${table.identifier}")

     val hadoopConf = sparkSession.sessionState.newHadoopConf()
     val metaClient = HoodieTableMetaClient.builder().setBasePath(path)
@@ -31,7 +31,6 @@ case class CompactionHoodieTableCommand(table: CatalogTable,

   override def run(sparkSession: SparkSession): Seq[Row] = {
     val basePath = getTableLocation(table, sparkSession)
-      .getOrElse(s"missing location for ${table.identifier}")
     CompactionHoodiePathCommand(basePath, operation, instantTimestamp).run(sparkSession)
   }

@@ -29,7 +29,6 @@ case class CompactionShowHoodieTableCommand(table: CatalogTable, limit: Int)

   override def run(sparkSession: SparkSession): Seq[Row] = {
     val basePath = getTableLocation(table, sparkSession)
-      .getOrElse(s"missing location for ${table.identifier}")
     CompactionShowHoodiePathCommand(basePath, limit).run(sparkSession)
   }

@@ -62,7 +62,6 @@ case class CreateHoodieTableAsSelectCommand(
       }
     }
     val tablePath = getTableLocation(table, sparkSession)
-      .getOrElse(s"Missing path for table ${table.identifier}")
     val conf = sparkSession.sessionState.newHadoopConf()
     assert(CreateHoodieTableCommand.isEmptyPath(tablePath, conf),
       s"Path '$tablePath' should be empty for CTAS")
@@ -19,16 +19,19 @@ package org.apache.spark.sql.hudi.command

 import scala.collection.JavaConverters._
 import java.util.{Locale, Properties}

 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
+import org.apache.hudi.client.common.HoodieSparkEngineContext
+import org.apache.hudi.common.config.HoodieMetadataConfig
+import org.apache.hudi.common.fs.FSUtils
 import org.apache.hudi.{DataSourceWriteOptions, SparkAdapterSupport}
 import org.apache.hudi.common.model.HoodieFileFormat
-import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver}
+import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient}
 import org.apache.hudi.common.util.ValidationUtils
 import org.apache.hudi.hadoop.HoodieParquetInputFormat
 import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat
 import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils
+import org.apache.spark.api.java.JavaSparkContext
 import org.apache.spark.{SPARK_VERSION, SparkConf}
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.avro.SchemaConverters
@@ -40,8 +43,7 @@ import org.apache.spark.sql.hive.HiveClientUtils
 import org.apache.spark.sql.hive.HiveExternalCatalog._
 import org.apache.spark.sql.hudi.HoodieSqlUtils._
 import org.apache.spark.sql.hudi.HoodieOptionConfig
-import org.apache.spark.sql.hudi.command.CreateHoodieTableCommand.{initTableIfNeed, tableExistsInPath, isEmptyPath}
-import org.apache.spark.sql.internal.StaticSQLConf
+import org.apache.spark.sql.hudi.command.CreateHoodieTableCommand.{initTableIfNeed, isEmptyPath}
 import org.apache.spark.sql.internal.StaticSQLConf.SCHEMA_STRING_LENGTH_THRESHOLD
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
@@ -50,8 +52,6 @@ import scala.collection.mutable

 /**
  * Command for create hoodie table.
- * @param table
- * @param ignoreIfExists
  */
 case class CreateHoodieTableCommand(table: CatalogTable, ignoreIfExists: Boolean)
   extends RunnableCommand with SparkAdapterSupport {
@@ -83,7 +83,6 @@ case class CreateHoodieTableCommand(table: CatalogTable, ignoreIfExists: Boolean
     val sessionState = sparkSession.sessionState
     val tableName = table.identifier.unquotedString
     val path = getTableLocation(table, sparkSession)
-      .getOrElse(s"Missing path for table ${table.identifier}")
     val conf = sparkSession.sessionState.newHadoopConf()
     val isTableExists = tableExistsInPath(path, conf)
     // Get the schema & table options
@@ -95,26 +94,44 @@ case class CreateHoodieTableCommand(table: CatalogTable, ignoreIfExists: Boolean
         .setBasePath(path)
         .setConf(conf)
         .build()
-      val schemaResolver = new TableSchemaResolver(metaClient)
-      val avroSchema = try Some(schemaResolver.getTableAvroSchema(false))
-      catch {
-        case _: Throwable => None
-      }
-      val tableSchema = avroSchema.map(SchemaConverters.toSqlType(_).dataType
-        .asInstanceOf[StructType])
+      val tableSchema = getTableSqlSchema(metaClient)

       // Get options from the external table and append with the options in ddl.
-      val options = HoodieOptionConfig.mappingTableConfigToSqlOption(
-        metaClient.getTableConfig.getProps.asScala.toMap) ++ table.storage.properties
+      val originTableConfig = HoodieOptionConfig.mappingTableConfigToSqlOption(
+        metaClient.getTableConfig.getProps.asScala.toMap)

-      val userSpecifiedSchema = table.schema
-      if (userSpecifiedSchema.isEmpty && tableSchema.isDefined) {
+      val allPartitionPaths = getAllPartitionPaths(sparkSession, table)
+      var upgrateConfig = Map.empty[String, String]
+      // If this is a non-hive-styled partition table, disable the hive style config.
+      // (By default this config is enable for spark sql)
+      upgrateConfig = if (isNotHiveStyledPartitionTable(allPartitionPaths, table)) {
+        upgrateConfig + (DataSourceWriteOptions.HIVE_STYLE_PARTITIONING.key -> "false")
+      } else {
+        upgrateConfig
+      }
+      upgrateConfig = if (isUrlEncodeDisable(allPartitionPaths, table)) {
+        upgrateConfig + (DataSourceWriteOptions.URL_ENCODE_PARTITIONING.key -> "false")
+      } else {
+        upgrateConfig
+      }
+
+      // Use the origin keygen to generate record key to keep the rowkey consistent with the old table for spark sql.
+      // See SqlKeyGenerator#getRecordKey for detail.
+      upgrateConfig = if (originTableConfig.contains(HoodieTableConfig.HOODIE_TABLE_KEY_GENERATOR_CLASS.key)) {
+        upgrateConfig + (SqlKeyGenerator.ORIGIN_KEYGEN_CLASS -> originTableConfig(HoodieTableConfig.HOODIE_TABLE_KEY_GENERATOR_CLASS.key))
+      } else {
+        upgrateConfig
+      }
+      val options = originTableConfig ++ upgrateConfig ++ table.storage.properties
+
+      val userSpecifiedSchema = table.schema
+      if (userSpecifiedSchema.isEmpty && tableSchema.isDefined) {
         (addMetaFields(tableSchema.get), options)
       } else if (userSpecifiedSchema.nonEmpty) {
         (addMetaFields(userSpecifiedSchema), options)
       } else {
         throw new IllegalArgumentException(s"Missing schema for Create Table: $tableName")
       }
     } else {
       assert(table.schema.nonEmpty, s"Missing schema for Create Table: $tableName")
       // SPARK-19724: the default location of a managed table should be non-existent or empty.
@@ -301,47 +318,102 @@ case class CreateHoodieTableCommand(table: CatalogTable, ignoreIfExists: Boolean
         s"'${HoodieOptionConfig.SQL_VALUE_TABLE_TYPE_MOR}'")
     }
   }

+  private def getAllPartitionPaths(spark: SparkSession, table: CatalogTable): Seq[String] = {
+    val sparkEngine = new HoodieSparkEngineContext(new JavaSparkContext(spark.sparkContext))
+    val metadataConfig = {
+      val properties = new Properties()
+      properties.putAll((spark.sessionState.conf.getAllConfs ++ table.storage.properties).asJava)
+      HoodieMetadataConfig.newBuilder.fromProperties(properties).build()
+    }
+    FSUtils.getAllPartitionPaths(sparkEngine, metadataConfig, getTableLocation(table, spark)).asScala
+  }
+
+  /**
+   * This method is used to compatible with the old non-hive-styled partition table.
+   * By default we enable the "hoodie.datasource.write.hive_style_partitioning"
+   * when writing data to hudi table by spark sql by default.
+   * If the exist table is a non-hive-styled partitioned table, we should
+   * disable the "hoodie.datasource.write.hive_style_partitioning" when
+   * merge or update the table. Or else, we will get an incorrect merge result
+   * as the partition path mismatch.
+   */
+  private def isNotHiveStyledPartitionTable(partitionPaths: Seq[String], table: CatalogTable): Boolean = {
+    if (table.partitionColumnNames.nonEmpty) {
+      val isHiveStylePartitionPath = (path: String) => {
+        val fragments = path.split("/")
+        if (fragments.size != table.partitionColumnNames.size) {
+          false
+        } else {
+          fragments.zip(table.partitionColumnNames).forall {
+            case (pathFragment, partitionColumn) => pathFragment.startsWith(s"$partitionColumn=")
+          }
+        }
+      }
+      !partitionPaths.forall(isHiveStylePartitionPath)
+    } else {
+      false
+    }
+  }
+
+  /**
+   * If this table has disable the url encode, spark sql should also disable it when writing to the table.
+   */
+  private def isUrlEncodeDisable(partitionPaths: Seq[String], table: CatalogTable): Boolean = {
+    if (table.partitionColumnNames.nonEmpty) {
+      !partitionPaths.forall(partitionPath => partitionPath.split("/").length == table.partitionColumnNames.size)
+    } else {
+      false
+    }
+  }
+
 }

 object CreateHoodieTableCommand extends Logging {

   /**
-   * Init the table if it is not exists.
-   * @param sparkSession
-   * @param table
-   * @return
+   * Init the hoodie.properties.
    */
   def initTableIfNeed(sparkSession: SparkSession, table: CatalogTable): Unit = {
-    val location = getTableLocation(table, sparkSession).getOrElse(
-      throw new IllegalArgumentException(s"Missing location for ${table.identifier}"))
+    val location = getTableLocation(table, sparkSession)

     val conf = sparkSession.sessionState.newHadoopConf()
     // Init the hoodie table
-    if (!tableExistsInPath(location, conf)) {
-      val tableName = table.identifier.table
-      logInfo(s"Table $tableName is not exists, start to create the hudi table")
+    val originTableConfig = if (tableExistsInPath(location, conf)) {
+      val metaClient = HoodieTableMetaClient.builder()
+        .setBasePath(location)
+        .setConf(conf)
+        .build()
+      metaClient.getTableConfig.getProps.asScala.toMap
+    } else {
+      Map.empty[String, String]
+    }

-      // Save all the table config to the hoodie.properties.
-      val parameters = HoodieOptionConfig.mappingSqlOptionToTableConfig(table.storage.properties)
-      val properties = new Properties()
+    val tableName = table.identifier.table
+    logInfo(s"Init hoodie.properties for $tableName")
+    val tableOptions = HoodieOptionConfig.mappingSqlOptionToTableConfig(table.storage.properties)
+    checkTableConfigEqual(originTableConfig, tableOptions, HoodieTableConfig.HOODIE_TABLE_PRECOMBINE_FIELD_PROP.key)
+    checkTableConfigEqual(originTableConfig, tableOptions, HoodieTableConfig.HOODIE_TABLE_PARTITION_FIELDS_PROP.key)
+    checkTableConfigEqual(originTableConfig, tableOptions, HoodieTableConfig.HOODIE_TABLE_RECORDKEY_FIELDS.key)
+    // Save all the table config to the hoodie.properties.
+    val parameters = originTableConfig ++ tableOptions
+    val properties = new Properties()
     properties.putAll(parameters.asJava)
     HoodieTableMetaClient.withPropertyBuilder()
       .fromProperties(properties)
       .setTableName(tableName)
       .setTableCreateSchema(SchemaConverters.toAvroType(table.schema).toString())
       .setPartitionFields(table.partitionColumnNames.mkString(","))
       .initTable(conf, location)
-    }
   }

-  /**
-   * Check if the hoodie.properties exists in the table path.
-   */
-  def tableExistsInPath(tablePath: String, conf: Configuration): Boolean = {
-    val basePath = new Path(tablePath)
-    val fs = basePath.getFileSystem(conf)
-    val metaPath = new Path(basePath, HoodieTableMetaClient.METAFOLDER_NAME)
-    fs.exists(metaPath)
-  }
+  def checkTableConfigEqual(originTableConfig: Map[String, String],
+    newTableConfig: Map[String, String], configKey: String): Unit = {
+    if (originTableConfig.contains(configKey) && newTableConfig.contains(configKey)) {
+      assert(originTableConfig(configKey) == newTableConfig(configKey),
+        s"Table config: $configKey in the create table is: ${newTableConfig(configKey)}, is not the same with the value in " +
+          s"hoodie.properties, which is: ${originTableConfig(configKey)}. Please keep the same.")
+    }
+  }

   /**
@@ -59,7 +59,6 @@ case class DeleteHoodieTableCommand(deleteTable: DeleteFromTable) extends Runnab
     val targetTable = sparkSession.sessionState.catalog
       .getTableMetadata(tableId)
     val path = getTableLocation(targetTable, sparkSession)
-      .getOrElse(s"missing location for $tableId")

     val primaryColumns = HoodieOptionConfig.getPrimaryColumns(targetTable.storage.properties)

@@ -193,7 +193,6 @@ object InsertIntoHoodieTableCommand {

     val partitionFields = table.partitionColumnNames.mkString(",")
     val path = getTableLocation(table, sparkSession)
-      .getOrElse(s"Missing location for table ${table.identifier}")

     val tableSchema = table.schema
     val options = table.storage.properties
@@ -416,7 +416,6 @@ case class MergeIntoHoodieTableCommand(mergeInto: MergeIntoTable) extends Runnab
     val targetTableDb = targetTableIdentify.database.getOrElse("default")
     val targetTableName = targetTableIdentify.identifier
     val path = getTableLocation(targetTable, sparkSession)
-      .getOrElse(s"missing location for $targetTableIdentify")

     val options = targetTable.storage.properties
     val definedPk = HoodieOptionConfig.getPrimaryColumns(options)
@@ -18,10 +18,10 @@
 package org.apache.spark.sql.hudi.command

 import java.util.concurrent.TimeUnit.{MICROSECONDS, MILLISECONDS}

 import org.apache.avro.generic.GenericRecord
 import org.apache.hudi.common.config.TypedProperties
 import org.apache.hudi.common.util.PartitionPathEncodeUtils
+import org.apache.hudi.config.HoodieWriteConfig
 import org.apache.hudi.keygen.{ComplexKeyGenerator, KeyGenUtils}
 import org.apache.spark.sql.types.{StructType, TimestampType}
 import org.joda.time.format.DateTimeFormat
|
|||||||
None
|
None
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// The origin key generator class for this table.
|
||||||
|
private lazy val originKeyGen = {
|
||||||
|
val beforeKeyGenClassName = props.getString(SqlKeyGenerator.ORIGIN_KEYGEN_CLASS, null)
|
||||||
|
if (beforeKeyGenClassName != null) {
|
||||||
|
val keyGenProps = new TypedProperties()
|
||||||
|
keyGenProps.putAll(props)
|
||||||
|
keyGenProps.remove(SqlKeyGenerator.ORIGIN_KEYGEN_CLASS)
|
||||||
|
keyGenProps.put(HoodieWriteConfig.KEYGENERATOR_CLASS_PROP.key, beforeKeyGenClassName)
|
||||||
|
Some(KeyGenUtils.createKeyGeneratorByClassName(keyGenProps))
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
override def getPartitionPath(record: GenericRecord): String = {
|
override def getRecordKey(record: GenericRecord): String = {
|
||||||
|
if (originKeyGen.isDefined) {
|
||||||
|
originKeyGen.get.getKey(record).getRecordKey
|
||||||
|
} else {
|
||||||
|
super.getRecordKey(record)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
override def getPartitionPath(record: GenericRecord) = {
|
||||||
val partitionPath = super.getPartitionPath(record)
|
val partitionPath = super.getPartitionPath(record)
|
||||||
if (partitionSchema.isDefined) {
|
if (partitionSchema.isDefined) {
|
||||||
// we can split the partitionPath here because we enable the URL_ENCODE_PARTITIONING_OPT
|
// we can split the partitionPath here because we enable the URL_ENCODE_PARTITIONING_OPT
|
||||||
// by default for sql.
|
// by default for sql.
|
||||||
val partitionFragments = partitionPath.split(KeyGenUtils.DEFAULT_PARTITION_PATH_SEPARATOR)
|
val partitionFragments = partitionPath.split(KeyGenUtils.DEFAULT_PARTITION_PATH_SEPARATOR)
|
||||||
assert(partitionFragments.size == partitionSchema.get.size)
|
// If it is a table which is not write by spark sql before and the url encode has disabled,
|
||||||
|
// the partition path level may not equal to the partition schema size. Just return the partitionPath
|
||||||
|
// in this case.
|
||||||
|
if (partitionFragments.size != partitionSchema.get.size) {
|
||||||
|
partitionPath
|
||||||
|
} else {
|
||||||
|
partitionFragments.zip(partitionSchema.get.fields).map {
|
||||||
|
case (partitionValue, partitionField) =>
|
||||||
|
val hiveStylePrefix = s"${partitionField.name}="
|
||||||
|
val isHiveStyle = partitionValue.startsWith(hiveStylePrefix)
|
||||||
|
val _partitionValue = if (isHiveStyle) partitionValue.substring(hiveStylePrefix.length) else partitionValue
|
||||||
|
|
||||||
partitionFragments.zip(partitionSchema.get.fields).map {
|
partitionField.dataType match {
|
||||||
case (partitionValue, partitionField) =>
|
case TimestampType =>
|
||||||
val hiveStylePrefix = s"${partitionField.name}="
|
val timeMs = MILLISECONDS.convert(_partitionValue.toLong, MICROSECONDS)
|
||||||
val isHiveStyle = partitionValue.startsWith(hiveStylePrefix)
|
val timestampFormat = PartitionPathEncodeUtils.escapePathName(
|
||||||
val _partitionValue = if (isHiveStyle) {
|
SqlKeyGenerator.timestampTimeFormat.print(timeMs))
|
||||||
partitionValue.substring(hiveStylePrefix.length)
|
if (isHiveStyle) s"$hiveStylePrefix$timestampFormat" else timestampFormat
|
||||||
} else {
|
case _ => partitionValue
|
||||||
partitionValue
|
}
|
||||||
}
|
}.mkString(KeyGenUtils.DEFAULT_PARTITION_PATH_SEPARATOR)
|
||||||
|
}
|
||||||
partitionField.dataType match {
|
} else partitionPath
|
||||||
case TimestampType =>
|
|
||||||
val timeMs = MILLISECONDS.convert(_partitionValue.toLong, MICROSECONDS)
|
|
||||||
val timestampFormat = PartitionPathEncodeUtils.escapePathName(
|
|
||||||
SqlKeyGenerator.timestampTimeFormat.print(timeMs))
|
|
||||||
if (isHiveStyle) {
|
|
||||||
s"$hiveStylePrefix$timestampFormat"
|
|
||||||
} else {
|
|
||||||
timestampFormat
|
|
||||||
}
|
|
||||||
case _=> partitionValue
|
|
||||||
}
|
|
||||||
}.mkString(KeyGenUtils.DEFAULT_PARTITION_PATH_SEPARATOR)
|
|
||||||
} else {
|
|
||||||
partitionPath
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
object SqlKeyGenerator {
|
object SqlKeyGenerator {
|
||||||
val PARTITION_SCHEMA = "hoodie.sql.partition.schema"
|
val PARTITION_SCHEMA = "hoodie.sql.partition.schema"
|
||||||
|
val ORIGIN_KEYGEN_CLASS = "hoodie.sql.origin.keygen.class"
|
||||||
private val timestampTimeFormat = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss")
|
private val timestampTimeFormat = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss")
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -35,7 +35,6 @@ class TruncateHoodieTableCommand(
   override def run(sparkSession: SparkSession): Seq[Row] = {
     val table = sparkSession.sessionState.catalog.getTableMetadata(tableName)
     val path = getTableLocation(table, sparkSession)
-      .getOrElse(s"missing location for ${table.identifier}")
     val hadoopConf = sparkSession.sessionState.newHadoopConf()
     // If we have not specified the partition, truncate will delete all the
     // data in the table path include the hoodi.properties. In this case we
@@ -85,7 +85,6 @@ case class UpdateHoodieTableCommand(updateTable: UpdateTable) extends RunnableCo
     val targetTable = sparkSession.sessionState.catalog
       .getTableMetadata(tableId)
     val path = getTableLocation(targetTable, sparkSession)
-      .getOrElse(s"missing location for $tableId")

     val primaryColumns = HoodieOptionConfig.getPrimaryColumns(targetTable.storage.properties)

@@ -17,9 +17,15 @@

 package org.apache.spark.sql.hudi

+import org.apache.hudi.DataSourceWriteOptions._
+
 import scala.collection.JavaConverters._
 import org.apache.hudi.common.model.HoodieRecord
+import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient}
+import org.apache.hudi.config.HoodieWriteConfig
 import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat
+import org.apache.hudi.keygen.{ComplexKeyGenerator, NonpartitionedKeyGenerator, SimpleKeyGenerator}
+import org.apache.spark.sql.SaveMode
 import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.catalog.CatalogTableType
 import org.apache.spark.sql.types.{DoubleType, IntegerType, LongType, StringType, StructField}
@@ -272,4 +278,231 @@ class TestCreateTable extends TestHoodieSqlBase {
       )
     }
   }
+
+  test("Test Create Table From Exist Hoodie Table") {
+    withTempDir { tmp =>
+      Seq("2021-08-02", "2021/08/02").foreach { partitionValue =>
+        val tableName = generateTableName
+        val tablePath = s"${tmp.getCanonicalPath}/$tableName"
+        import spark.implicits._
+        val df = Seq((1, "a1", 10, 1000, partitionValue)).toDF("id", "name", "value", "ts", "dt")
+        // Write a table by spark dataframe.
+        df.write.format("hudi")
+          .option(HoodieWriteConfig.TABLE_NAME.key, tableName)
+          .option(TABLE_TYPE.key, COW_TABLE_TYPE_OPT_VAL)
+          .option(RECORDKEY_FIELD.key, "id")
+          .option(PRECOMBINE_FIELD.key, "ts")
+          .option(PARTITIONPATH_FIELD.key, "dt")
+          .option(KEYGENERATOR_CLASS.key, classOf[SimpleKeyGenerator].getName)
+          .option(HoodieWriteConfig.INSERT_PARALLELISM.key, "1")
+          .option(HoodieWriteConfig.UPSERT_PARALLELISM.key, "1")
+          .mode(SaveMode.Overwrite)
+          .save(tablePath)
+
+        // Create a table over the exist old table.
+        spark.sql(
+          s"""
+             |create table $tableName using hudi
+             | options (
+             |  primaryKey = 'id',
+             |  preCombineField = 'ts'
+             |)
+             |partitioned by (dt)
+             |location '$tablePath'
+             |""".stripMargin)
+        checkAnswer(s"select id, name, value, ts, dt from $tableName")(
+          Seq(1, "a1", 10, 1000, partitionValue)
+        )
+        // Check the missing properties for spark sql
+        val metaClient = HoodieTableMetaClient.builder()
+          .setBasePath(tablePath)
+          .setConf(spark.sessionState.newHadoopConf())
+          .build()
+        val properties = metaClient.getTableConfig.getProps.asScala.toMap
+        assertResult(true)(properties.contains(HoodieTableConfig.HOODIE_TABLE_CREATE_SCHEMA.key))
+        assertResult("dt")(properties(HoodieTableConfig.HOODIE_TABLE_PARTITION_FIELDS_PROP.key))
+        assertResult("ts")(properties(HoodieTableConfig.HOODIE_TABLE_PRECOMBINE_FIELD_PROP.key))
+
+        // Test insert into
+        spark.sql(s"insert into $tableName values(2, 'a2', 10, 1000, '$partitionValue')")
+        checkAnswer(s"select _hoodie_record_key, _hoodie_partition_path, id, name, value, ts, dt from $tableName order by id")(
+          Seq("1", partitionValue, 1, "a1", 10, 1000, partitionValue),
+          Seq("2", partitionValue, 2, "a2", 10, 1000, partitionValue)
+        )
+        // Test merge into
+        spark.sql(
+          s"""
+             |merge into $tableName h0
+             |using (select 1 as id, 'a1' as name, 11 as value, 1001 as ts, '$partitionValue' as dt) s0
+             |on h0.id = s0.id
+             |when matched then update set *
+             |""".stripMargin)
+        checkAnswer(s"select _hoodie_record_key, _hoodie_partition_path, id, name, value, ts, dt from $tableName order by id")(
+          Seq("1", partitionValue, 1, "a1", 11, 1001, partitionValue),
+          Seq("2", partitionValue, 2, "a2", 10, 1000, partitionValue)
+        )
+        // Test update
+        spark.sql(s"update $tableName set value = value + 1 where id = 2")
+        checkAnswer(s"select _hoodie_record_key, _hoodie_partition_path, id, name, value, ts, dt from $tableName order by id")(
+          Seq("1", partitionValue, 1, "a1", 11, 1001, partitionValue),
+          Seq("2", partitionValue, 2, "a2", 11, 1000, partitionValue)
+        )
+        // Test delete
+        spark.sql(s"delete from $tableName where id = 1")
+        checkAnswer(s"select _hoodie_record_key, _hoodie_partition_path, id, name, value, ts, dt from $tableName order by id")(
+          Seq("2", partitionValue, 2, "a2", 11, 1000, partitionValue)
+        )
+      }
+    }
+  }
+
+  test("Test Create Table From Exist Hoodie Table For Multi-Level Partitioned Table") {
+    withTempDir { tmp =>
+      Seq("2021-08-02", "2021/08/02").foreach { day =>
+        val tableName = generateTableName
+        val tablePath = s"${tmp.getCanonicalPath}/$tableName"
+        import spark.implicits._
+        val df = Seq((1, "a1", 10, 1000, day, 12)).toDF("id", "name", "value", "ts", "day", "hh")
+        // Write a table by spark dataframe.
+        df.write.format("hudi")
+          .option(HoodieWriteConfig.TABLE_NAME.key, tableName)
+          .option(TABLE_TYPE.key, MOR_TABLE_TYPE_OPT_VAL)
+          .option(RECORDKEY_FIELD.key, "id")
+          .option(PRECOMBINE_FIELD.key, "ts")
+          .option(PARTITIONPATH_FIELD.key, "day,hh")
+          .option(KEYGENERATOR_CLASS.key, classOf[ComplexKeyGenerator].getName)
+          .option(HoodieWriteConfig.INSERT_PARALLELISM.key, "1")
+          .option(HoodieWriteConfig.UPSERT_PARALLELISM.key, "1")
+          .mode(SaveMode.Overwrite)
+          .save(tablePath)
+
+        // Create a table over the exist old table.
+        spark.sql(
+          s"""
+             |create table $tableName using hudi
+             | options (
+             |  primaryKey = 'id',
+             |  preCombineField = 'ts'
+             |)
+             |partitioned by (day, hh)
+             |location '$tablePath'
+             |""".stripMargin)
+        checkAnswer(s"select id, name, value, ts, day, hh from $tableName")(
+          Seq(1, "a1", 10, 1000, day, 12)
+        )
+        // Check the missing properties for spark sql
+        val metaClient = HoodieTableMetaClient.builder()
+          .setBasePath(tablePath)
+          .setConf(spark.sessionState.newHadoopConf())
+          .build()
+        val properties = metaClient.getTableConfig.getProps.asScala.toMap
+        assertResult(true)(properties.contains(HoodieTableConfig.HOODIE_TABLE_CREATE_SCHEMA.key))
+        assertResult("day,hh")(properties(HoodieTableConfig.HOODIE_TABLE_PARTITION_FIELDS_PROP.key))
+        assertResult("ts")(properties(HoodieTableConfig.HOODIE_TABLE_PRECOMBINE_FIELD_PROP.key))
+
+        // Test insert into
+        spark.sql(s"insert into $tableName values(2, 'a2', 10, 1000, '$day', 12)")
+        checkAnswer(s"select _hoodie_record_key, _hoodie_partition_path, id, name, value, ts, day, hh from $tableName order by id")(
+          Seq("id:1", s"$day/12", 1, "a1", 10, 1000, day, 12),
+          Seq("id:2", s"$day/12", 2, "a2", 10, 1000, day, 12)
+        )
+        // Test merge into
+        spark.sql(
+          s"""
+             |merge into $tableName h0
+             |using (select 1 as id, 'a1' as name, 11 as value, 1001 as ts, '$day' as day, 12 as hh) s0
+             |on h0.id = s0.id
+             |when matched then update set *
+             |""".stripMargin)
+        checkAnswer(s"select _hoodie_record_key, _hoodie_partition_path, id, name, value, ts, day, hh from $tableName order by id")(
+          Seq("id:1", s"$day/12", 1, "a1", 11, 1001, day, 12),
+          Seq("id:2", s"$day/12", 2, "a2", 10, 1000, day, 12)
+        )
+        // Test update
+        spark.sql(s"update $tableName set value = value + 1 where id = 2")
+        checkAnswer(s"select _hoodie_record_key, _hoodie_partition_path, id, name, value, ts, day, hh from $tableName order by id")(
+          Seq("id:1", s"$day/12", 1, "a1", 11, 1001, day, 12),
+          Seq("id:2", s"$day/12", 2, "a2", 11, 1000, day, 12)
+        )
+        // Test delete
+        spark.sql(s"delete from $tableName where id = 1")
+        checkAnswer(s"select _hoodie_record_key, _hoodie_partition_path, id, name, value, ts, day, hh from $tableName order by id")(
+          Seq("id:2", s"$day/12", 2, "a2", 11, 1000, day, 12)
+        )
+      }
+    }
+  }
+
+  test("Test Create Table From Exist Hoodie Table For None Partitioned Table") {
+    withTempDir{tmp =>
+      // Write a table by spark dataframe.
+      val tableName = generateTableName
+      import spark.implicits._
+      val df = Seq((1, "a1", 10, 1000)).toDF("id", "name", "value", "ts")
+      df.write.format("hudi")
+        .option(HoodieWriteConfig.TABLE_NAME.key, tableName)
+        .option(TABLE_TYPE.key, COW_TABLE_TYPE_OPT_VAL)
+        .option(RECORDKEY_FIELD.key, "id")
+        .option(PRECOMBINE_FIELD.key, "ts")
+        .option(PARTITIONPATH_FIELD.key, "")
+        .option(KEYGENERATOR_CLASS.key, classOf[NonpartitionedKeyGenerator].getName)
+        .option(HoodieWriteConfig.INSERT_PARALLELISM.key, "1")
+        .option(HoodieWriteConfig.UPSERT_PARALLELISM.key, "1")
+        .mode(SaveMode.Overwrite)
+        .save(tmp.getCanonicalPath)
+
+      // Create a table over the exist old table.
+      spark.sql(
+        s"""
+           |create table $tableName using hudi
+           | options (
+           |  primaryKey = 'id',
+           |  preCombineField = 'ts'
+           |)
+           |location '${tmp.getCanonicalPath}'
+           |""".stripMargin)
+      checkAnswer(s"select id, name, value, ts from $tableName")(
+        Seq(1, "a1", 10, 1000)
+      )
+      // Check the missing properties for spark sql
+      val metaClient = HoodieTableMetaClient.builder()
+        .setBasePath(tmp.getCanonicalPath)
+        .setConf(spark.sessionState.newHadoopConf())
+        .build()
+      val properties = metaClient.getTableConfig.getProps.asScala.toMap
+      assertResult(true)(properties.contains(HoodieTableConfig.HOODIE_TABLE_CREATE_SCHEMA.key))
+      assertResult("ts")(properties(HoodieTableConfig.HOODIE_TABLE_PRECOMBINE_FIELD_PROP.key))
+
+      // Test insert into
+      spark.sql(s"insert into $tableName values(2, 'a2', 10, 1000)")
+      checkAnswer(s"select _hoodie_record_key, _hoodie_partition_path, id, name, value, ts from $tableName order by id")(
+        Seq("1", "", 1, "a1", 10, 1000),
+        Seq("2", "", 2, "a2", 10, 1000)
+      )
+      // Test merge into
+      spark.sql(
+        s"""
+           |merge into $tableName h0
+           |using (select 1 as id, 'a1' as name, 11 as value, 1001 as ts) s0
+           |on h0.id = s0.id
+           |when matched then update set *
+           |""".stripMargin)
+      checkAnswer(s"select id, name, value, ts from $tableName order by id")(
+        Seq(1, "a1", 11, 1001),
+        Seq(2, "a2", 10, 1000)
+      )
+      // Test update
+      spark.sql(s"update $tableName set value = value + 1 where id = 2")
+      checkAnswer(s"select id, name, value, ts from $tableName order by id")(
+        Seq(1, "a1", 11, 1001),
+        Seq(2, "a2", 11, 1000)
+      )
+      // Test delete
+      spark.sql(s"delete from $tableName where id = 1")
+      checkAnswer(s"select id, name, value, ts from $tableName order by id")(
+        Seq(2, "a2", 11, 1000)
+      )
+    }
+  }
+
 }
@@ -126,9 +126,9 @@ class TestPartialUpdateForMergeInto extends TestHoodieSqlBase {
     checkException(
       s"""
          |merge into $tableName2 t0
-         |using ( select 1 as id, 'a1' as name, 12 as price) s0
+         |using ( select 1 as id, 'a1' as name, 12 as price, 1000 as ts) s0
          |on t0.id = s0.id
-         |when matched then update set price = s0.price
+         |when matched then update set price = s0.price, _ts = s0.ts
        """.stripMargin)(
       "Missing specify the value for target field: 'id' in merge into update action for MOR table. " +
         "Currently we cannot support partial update for MOR, please complete all the target fields " +