[HUDI-1842] Spark Sql Support For pre-existing Hoodie Table (#3393)
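For context, a minimal usage sketch of what this change enables (adapted from the tests added at the end of this diff; the table name and location below are illustrative placeholders): a Hoodie table written with the DataFrame API can be registered through Spark SQL `create table ... using hudi ... location ...`, with the schema and missing table configs filled in from hoodie.properties.

    // Hypothetical example; the table name and path are placeholders.
    spark.sql(
      s"""
         |create table existing_hudi_tbl using hudi
         | options (
         |  primaryKey = 'id',
         |  preCombineField = 'ts'
         |)
         |partitioned by (dt)
         |location '/path/to/existing/hoodie/table'
         |""".stripMargin)
    // After registration, insert into / merge into / update / delete statements
    // work against the pre-existing table, as exercised by the new tests below.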
@@ -26,7 +26,6 @@ import org.apache.hudi.common.model.HoodieTableType.{COPY_ON_WRITE, MERGE_ON_REA
 import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver}
 import org.apache.hudi.exception.HoodieException
 import org.apache.hudi.hadoop.HoodieROTablePathFilter
-import org.apache.hudi.hive.util.ConfigUtils
 import org.apache.log4j.LogManager
 import org.apache.spark.sql.avro.SchemaConverters
 import org.apache.spark.sql.execution.datasources.{DataSource, FileStatusCache, HadoopFsRelation}
@@ -126,12 +126,9 @@ object HoodieOptionConfig {

   /**
    * Mapping the table config (loaded from the hoodie.properties) to the sql options.
-   * @param options
-   * @return
    */
   def mappingTableConfigToSqlOption(options: Map[String, String]): Map[String, String] = {
-    options.filterKeys(k => tableConfigKeyToSqlKey.contains(k))
-      .map(kv => tableConfigKeyToSqlKey(kv._1) -> reverseValueMapping.getOrElse(kv._2, kv._2))
+    options.map(kv => tableConfigKeyToSqlKey.getOrElse(kv._1, kv._1) -> reverseValueMapping.getOrElse(kv._2, kv._2))
   }

   private lazy val defaultTableConfig: Map[String, String] = {
@@ -24,7 +24,9 @@ import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.hudi.SparkAdapterSupport
 import org.apache.hudi.common.model.HoodieRecord
+import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver}
 import org.apache.spark.SPARK_VERSION
+import org.apache.spark.sql.avro.SchemaConverters
 import org.apache.spark.sql.{Column, DataFrame, SparkSession}
 import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
@@ -64,6 +66,16 @@ object HoodieSqlUtils extends SparkAdapterSupport {
     }
   }

+  def getTableSqlSchema(metaClient: HoodieTableMetaClient): Option[StructType] = {
+    val schemaResolver = new TableSchemaResolver(metaClient)
+    val avroSchema = try Some(schemaResolver.getTableAvroSchema(false))
+    catch {
+      case _: Throwable => None
+    }
+    avroSchema.map(SchemaConverters.toSqlType(_).dataType
+      .asInstanceOf[StructType])
+  }
+
   private def tripAlias(plan: LogicalPlan): LogicalPlan = {
     plan match {
       case SubqueryAlias(_, relation: LogicalPlan) =>
@@ -122,12 +134,12 @@ object HoodieSqlUtils extends SparkAdapterSupport {
    * @param spark
    * @return
    */
-  def getTableLocation(tableId: TableIdentifier, spark: SparkSession): Option[String] = {
+  def getTableLocation(tableId: TableIdentifier, spark: SparkSession): String = {
     val table = spark.sessionState.catalog.getTableMetadata(tableId)
     getTableLocation(table, spark)
   }

-  def getTableLocation(table: CatalogTable, sparkSession: SparkSession): Option[String] = {
+  def getTableLocation(table: CatalogTable, sparkSession: SparkSession): String = {
     val uri = if (table.tableType == CatalogTableType.MANAGED && isHoodieTable(table)) {
       Some(sparkSession.sessionState.catalog.defaultTablePath(table.identifier))
     } else {
@@ -136,6 +148,7 @@ object HoodieSqlUtils extends SparkAdapterSupport {
     val conf = sparkSession.sessionState.newHadoopConf()
     uri.map(makePathQualified(_, conf))
       .map(removePlaceHolder)
+      .getOrElse(throw new IllegalArgumentException(s"Missing location for ${table.identifier}"))
   }

   private def removePlaceHolder(path: String): String = {
@@ -154,6 +167,16 @@ object HoodieSqlUtils extends SparkAdapterSupport {
     fs.makeQualified(hadoopPath).toUri.toString
   }

+  /**
+   * Check if the hoodie.properties exists in the table path.
+   */
+  def tableExistsInPath(tablePath: String, conf: Configuration): Boolean = {
+    val basePath = new Path(tablePath)
+    val fs = basePath.getFileSystem(conf)
+    val metaPath = new Path(basePath, HoodieTableMetaClient.METAFOLDER_NAME)
+    fs.exists(metaPath)
+  }
+
   def castIfNeeded(child: Expression, dataType: DataType, conf: SQLConf): Expression = {
     child match {
       case Literal(nul, NullType) => Literal(nul, dataType)
@@ -21,7 +21,8 @@ import org.apache.hudi.DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL
 import org.apache.hudi.SparkAdapterSupport

 import scala.collection.JavaConverters._
-import org.apache.hudi.common.model.{HoodieRecord, HoodieTableType}
+import org.apache.hudi.common.model.HoodieRecord
+import org.apache.hudi.common.table.HoodieTableMetaClient
 import org.apache.spark.sql.{AnalysisException, SparkSession}
 import org.apache.spark.sql.catalyst.analysis.UnresolvedStar
 import org.apache.spark.sql.catalyst.expressions.AttributeReference
@@ -286,7 +287,28 @@ case class HoodieResolveReferences(sparkSession: SparkSession) extends Rule[Logi
       } else {
         l
       }
+    // Fill schema for Create Table without specify schema info
+    case c @ CreateTable(tableDesc, _, _)
+      if isHoodieTable(tableDesc) =>
+      val tablePath = getTableLocation(c.tableDesc, sparkSession)
+      if (tableExistsInPath(tablePath, sparkSession.sessionState.newHadoopConf())) {
+        val metaClient = HoodieTableMetaClient.builder()
+          .setBasePath(tablePath)
+          .setConf(sparkSession.sessionState.newHadoopConf())
+          .build()
+        val tableSchema = HoodieSqlUtils.getTableSqlSchema(metaClient).map(HoodieSqlUtils.addMetaFields)
+        if (tableSchema.isDefined && tableDesc.schema.isEmpty) {
+          // Fill the schema with the schema from the table
+          c.copy(tableDesc.copy(schema = tableSchema.get))
+        } else if (tableSchema.isDefined && tableDesc.schema != tableSchema.get) {
+          throw new AnalysisException(s"Specified schema in create table statement is not equal to the table schema." +
+            s"You should not specify the schema for an exist table: ${tableDesc.identifier} ")
+        } else {
+          c
+        }
+      } else {
+        c
+      }
     case p => p
   }

@@ -103,7 +103,6 @@ object AlterHoodieTableAddColumnsCommand {
    */
   def commitWithSchema(schema: Schema, table: CatalogTable, sparkSession: SparkSession): Unit = {
     val path = getTableLocation(table, sparkSession)
-      .getOrElse(s"missing location for ${table.identifier}")

     val jsc = new JavaSparkContext(sparkSession.sparkContext)
     val client = DataSourceUtils.createHoodieClient(jsc, schema.toString,
@@ -68,7 +68,6 @@ case class AlterHoodieTableChangeColumnCommand(
     val newSchema = AvroConversionUtils.convertStructTypeToAvroSchema(newSqlSchema, structName, nameSpace)

     val path = getTableLocation(table, sparkSession)
-      .getOrElse(s"missing location for ${table.identifier}")
     val hadoopConf = sparkSession.sessionState.newHadoopConf()
     val metaClient = HoodieTableMetaClient.builder().setBasePath(path)
       .setConf(hadoopConf).build()
@@ -37,7 +37,6 @@ class AlterHoodieTableRenameCommand(
     val catalog = sparkSession.sessionState.catalog
     val table = catalog.getTableMetadata(oldName)
     val path = getTableLocation(table, sparkSession)
-      .getOrElse(s"missing location for ${table.identifier}")

     val hadoopConf = sparkSession.sessionState.newHadoopConf()
     val metaClient = HoodieTableMetaClient.builder().setBasePath(path)
@@ -31,7 +31,6 @@ case class CompactionHoodieTableCommand(table: CatalogTable,

   override def run(sparkSession: SparkSession): Seq[Row] = {
     val basePath = getTableLocation(table, sparkSession)
-      .getOrElse(s"missing location for ${table.identifier}")
     CompactionHoodiePathCommand(basePath, operation, instantTimestamp).run(sparkSession)
   }

@@ -29,7 +29,6 @@ case class CompactionShowHoodieTableCommand(table: CatalogTable, limit: Int)

   override def run(sparkSession: SparkSession): Seq[Row] = {
     val basePath = getTableLocation(table, sparkSession)
-      .getOrElse(s"missing location for ${table.identifier}")
     CompactionShowHoodiePathCommand(basePath, limit).run(sparkSession)
   }

@@ -62,7 +62,6 @@ case class CreateHoodieTableAsSelectCommand(
       }
     }
     val tablePath = getTableLocation(table, sparkSession)
-      .getOrElse(s"Missing path for table ${table.identifier}")
     val conf = sparkSession.sessionState.newHadoopConf()
     assert(CreateHoodieTableCommand.isEmptyPath(tablePath, conf),
       s"Path '$tablePath' should be empty for CTAS")
@@ -19,16 +19,19 @@ package org.apache.spark.sql.hudi.command

 import scala.collection.JavaConverters._
 import java.util.{Locale, Properties}

 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
+import org.apache.hudi.client.common.HoodieSparkEngineContext
+import org.apache.hudi.common.config.HoodieMetadataConfig
+import org.apache.hudi.common.fs.FSUtils
 import org.apache.hudi.{DataSourceWriteOptions, SparkAdapterSupport}
 import org.apache.hudi.common.model.HoodieFileFormat
-import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver}
+import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient}
 import org.apache.hudi.common.util.ValidationUtils
 import org.apache.hudi.hadoop.HoodieParquetInputFormat
 import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat
 import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils
+import org.apache.spark.api.java.JavaSparkContext
 import org.apache.spark.{SPARK_VERSION, SparkConf}
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.avro.SchemaConverters
@@ -40,8 +43,7 @@ import org.apache.spark.sql.hive.HiveClientUtils
 import org.apache.spark.sql.hive.HiveExternalCatalog._
 import org.apache.spark.sql.hudi.HoodieSqlUtils._
 import org.apache.spark.sql.hudi.HoodieOptionConfig
-import org.apache.spark.sql.hudi.command.CreateHoodieTableCommand.{initTableIfNeed, tableExistsInPath, isEmptyPath}
-import org.apache.spark.sql.internal.StaticSQLConf
+import org.apache.spark.sql.hudi.command.CreateHoodieTableCommand.{initTableIfNeed, isEmptyPath}
 import org.apache.spark.sql.internal.StaticSQLConf.SCHEMA_STRING_LENGTH_THRESHOLD
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
@@ -50,8 +52,6 @@ import scala.collection.mutable

 /**
  * Command for create hoodie table.
- * @param table
- * @param ignoreIfExists
  */
 case class CreateHoodieTableCommand(table: CatalogTable, ignoreIfExists: Boolean)
   extends RunnableCommand with SparkAdapterSupport {
@@ -83,7 +83,6 @@ case class CreateHoodieTableCommand(table: CatalogTable, ignoreIfExists: Boolean
     val sessionState = sparkSession.sessionState
     val tableName = table.identifier.unquotedString
     val path = getTableLocation(table, sparkSession)
-      .getOrElse(s"Missing path for table ${table.identifier}")
     val conf = sparkSession.sessionState.newHadoopConf()
     val isTableExists = tableExistsInPath(path, conf)
     // Get the schema & table options
@@ -95,26 +94,44 @@ case class CreateHoodieTableCommand(table: CatalogTable, ignoreIfExists: Boolean
         .setBasePath(path)
         .setConf(conf)
         .build()
-      val schemaResolver = new TableSchemaResolver(metaClient)
-      val avroSchema = try Some(schemaResolver.getTableAvroSchema(false))
-      catch {
-        case _: Throwable => None
-      }
-      val tableSchema = avroSchema.map(SchemaConverters.toSqlType(_).dataType
-        .asInstanceOf[StructType])
+      val tableSchema = getTableSqlSchema(metaClient)

       // Get options from the external table and append with the options in ddl.
-      val options = HoodieOptionConfig.mappingTableConfigToSqlOption(
-        metaClient.getTableConfig.getProps.asScala.toMap) ++ table.storage.properties
+      val originTableConfig = HoodieOptionConfig.mappingTableConfigToSqlOption(
+        metaClient.getTableConfig.getProps.asScala.toMap)

-      val userSpecifiedSchema = table.schema
-      if (userSpecifiedSchema.isEmpty && tableSchema.isDefined) {
+      val allPartitionPaths = getAllPartitionPaths(sparkSession, table)
+      var upgrateConfig = Map.empty[String, String]
+      // If this is a non-hive-styled partition table, disable the hive style config.
+      // (By default this config is enable for spark sql)
+      upgrateConfig = if (isNotHiveStyledPartitionTable(allPartitionPaths, table)) {
+        upgrateConfig + (DataSourceWriteOptions.HIVE_STYLE_PARTITIONING.key -> "false")
+      } else {
+        upgrateConfig
+      }
+      upgrateConfig = if (isUrlEncodeDisable(allPartitionPaths, table)) {
+        upgrateConfig + (DataSourceWriteOptions.URL_ENCODE_PARTITIONING.key -> "false")
+      } else {
+        upgrateConfig
+      }
+
+      // Use the origin keygen to generate record key to keep the rowkey consistent with the old table for spark sql.
+      // See SqlKeyGenerator#getRecordKey for detail.
+      upgrateConfig = if (originTableConfig.contains(HoodieTableConfig.HOODIE_TABLE_KEY_GENERATOR_CLASS.key)) {
+        upgrateConfig + (SqlKeyGenerator.ORIGIN_KEYGEN_CLASS -> originTableConfig(HoodieTableConfig.HOODIE_TABLE_KEY_GENERATOR_CLASS.key))
+      } else {
+        upgrateConfig
+      }
+      val options = originTableConfig ++ upgrateConfig ++ table.storage.properties
+
+      val userSpecifiedSchema = table.schema
+      if (userSpecifiedSchema.isEmpty && tableSchema.isDefined) {
         (addMetaFields(tableSchema.get), options)
       } else if (userSpecifiedSchema.nonEmpty) {
         (addMetaFields(userSpecifiedSchema), options)
       } else {
         throw new IllegalArgumentException(s"Missing schema for Create Table: $tableName")
       }
     } else {
       assert(table.schema.nonEmpty, s"Missing schema for Create Table: $tableName")
       // SPARK-19724: the default location of a managed table should be non-existent or empty.
@@ -301,47 +318,102 @@ case class CreateHoodieTableCommand(table: CatalogTable, ignoreIfExists: Boolean
         s"'${HoodieOptionConfig.SQL_VALUE_TABLE_TYPE_MOR}'")
     }
   }

+  private def getAllPartitionPaths(spark: SparkSession, table: CatalogTable): Seq[String] = {
+    val sparkEngine = new HoodieSparkEngineContext(new JavaSparkContext(spark.sparkContext))
+    val metadataConfig = {
+      val properties = new Properties()
+      properties.putAll((spark.sessionState.conf.getAllConfs ++ table.storage.properties).asJava)
+      HoodieMetadataConfig.newBuilder.fromProperties(properties).build()
+    }
+    FSUtils.getAllPartitionPaths(sparkEngine, metadataConfig, getTableLocation(table, spark)).asScala
+  }
+
+  /**
+   * This method is used to compatible with the old non-hive-styled partition table.
+   * By default we enable the "hoodie.datasource.write.hive_style_partitioning"
+   * when writing data to hudi table by spark sql by default.
+   * If the exist table is a non-hive-styled partitioned table, we should
+   * disable the "hoodie.datasource.write.hive_style_partitioning" when
+   * merge or update the table. Or else, we will get an incorrect merge result
+   * as the partition path mismatch.
+   */
+  private def isNotHiveStyledPartitionTable(partitionPaths: Seq[String], table: CatalogTable): Boolean = {
+    if (table.partitionColumnNames.nonEmpty) {
+      val isHiveStylePartitionPath = (path: String) => {
+        val fragments = path.split("/")
+        if (fragments.size != table.partitionColumnNames.size) {
+          false
+        } else {
+          fragments.zip(table.partitionColumnNames).forall {
+            case (pathFragment, partitionColumn) => pathFragment.startsWith(s"$partitionColumn=")
+          }
+        }
+      }
+      !partitionPaths.forall(isHiveStylePartitionPath)
+    } else {
+      false
+    }
+  }
+
+  /**
+   * If this table has disable the url encode, spark sql should also disable it when writing to the table.
+   */
+  private def isUrlEncodeDisable(partitionPaths: Seq[String], table: CatalogTable): Boolean = {
+    if (table.partitionColumnNames.nonEmpty) {
+      !partitionPaths.forall(partitionPath => partitionPath.split("/").length == table.partitionColumnNames.size)
+    } else {
+      false
+    }
+  }
+
 }

 object CreateHoodieTableCommand extends Logging {

   /**
-   * Init the table if it is not exists.
-   * @param sparkSession
-   * @param table
-   * @return
+   * Init the hoodie.properties.
    */
   def initTableIfNeed(sparkSession: SparkSession, table: CatalogTable): Unit = {
-    val location = getTableLocation(table, sparkSession).getOrElse(
-      throw new IllegalArgumentException(s"Missing location for ${table.identifier}"))
+    val location = getTableLocation(table, sparkSession)

     val conf = sparkSession.sessionState.newHadoopConf()
     // Init the hoodie table
-    if (!tableExistsInPath(location, conf)) {
-      val tableName = table.identifier.table
-      logInfo(s"Table $tableName is not exists, start to create the hudi table")
+    val originTableConfig = if (tableExistsInPath(location, conf)) {
+      val metaClient = HoodieTableMetaClient.builder()
+        .setBasePath(location)
+        .setConf(conf)
+        .build()
+      metaClient.getTableConfig.getProps.asScala.toMap
+    } else {
+      Map.empty[String, String]
+    }

-      // Save all the table config to the hoodie.properties.
-      val parameters = HoodieOptionConfig.mappingSqlOptionToTableConfig(table.storage.properties)
-      val properties = new Properties()
+    val tableName = table.identifier.table
+    logInfo(s"Init hoodie.properties for $tableName")
+    val tableOptions = HoodieOptionConfig.mappingSqlOptionToTableConfig(table.storage.properties)
+    checkTableConfigEqual(originTableConfig, tableOptions, HoodieTableConfig.HOODIE_TABLE_PRECOMBINE_FIELD_PROP.key)
+    checkTableConfigEqual(originTableConfig, tableOptions, HoodieTableConfig.HOODIE_TABLE_PARTITION_FIELDS_PROP.key)
+    checkTableConfigEqual(originTableConfig, tableOptions, HoodieTableConfig.HOODIE_TABLE_RECORDKEY_FIELDS.key)
+    // Save all the table config to the hoodie.properties.
+    val parameters = originTableConfig ++ tableOptions
+    val properties = new Properties()
     properties.putAll(parameters.asJava)
     HoodieTableMetaClient.withPropertyBuilder()
       .fromProperties(properties)
       .setTableName(tableName)
       .setTableCreateSchema(SchemaConverters.toAvroType(table.schema).toString())
       .setPartitionFields(table.partitionColumnNames.mkString(","))
       .initTable(conf, location)
-    }
   }

-  /**
-   * Check if the hoodie.properties exists in the table path.
-   */
-  def tableExistsInPath(tablePath: String, conf: Configuration): Boolean = {
-    val basePath = new Path(tablePath)
-    val fs = basePath.getFileSystem(conf)
-    val metaPath = new Path(basePath, HoodieTableMetaClient.METAFOLDER_NAME)
-    fs.exists(metaPath)
-  }
+  def checkTableConfigEqual(originTableConfig: Map[String, String],
+    newTableConfig: Map[String, String], configKey: String): Unit = {
+    if (originTableConfig.contains(configKey) && newTableConfig.contains(configKey)) {
+      assert(originTableConfig(configKey) == newTableConfig(configKey),
+        s"Table config: $configKey in the create table is: ${newTableConfig(configKey)}, is not the same with the value in " +
+          s"hoodie.properties, which is: ${originTableConfig(configKey)}. Please keep the same.")
+    }
+  }

   /**
@@ -59,7 +59,6 @@ case class DeleteHoodieTableCommand(deleteTable: DeleteFromTable) extends Runnab
     val targetTable = sparkSession.sessionState.catalog
       .getTableMetadata(tableId)
     val path = getTableLocation(targetTable, sparkSession)
-      .getOrElse(s"missing location for $tableId")

     val primaryColumns = HoodieOptionConfig.getPrimaryColumns(targetTable.storage.properties)

@@ -193,7 +193,6 @@ object InsertIntoHoodieTableCommand {

     val partitionFields = table.partitionColumnNames.mkString(",")
     val path = getTableLocation(table, sparkSession)
-      .getOrElse(s"Missing location for table ${table.identifier}")

     val tableSchema = table.schema
     val options = table.storage.properties
@@ -416,7 +416,6 @@ case class MergeIntoHoodieTableCommand(mergeInto: MergeIntoTable) extends Runnab
     val targetTableDb = targetTableIdentify.database.getOrElse("default")
     val targetTableName = targetTableIdentify.identifier
     val path = getTableLocation(targetTable, sparkSession)
-      .getOrElse(s"missing location for $targetTableIdentify")

     val options = targetTable.storage.properties
     val definedPk = HoodieOptionConfig.getPrimaryColumns(options)
@@ -18,10 +18,10 @@
 package org.apache.spark.sql.hudi.command

 import java.util.concurrent.TimeUnit.{MICROSECONDS, MILLISECONDS}

 import org.apache.avro.generic.GenericRecord
 import org.apache.hudi.common.config.TypedProperties
 import org.apache.hudi.common.util.PartitionPathEncodeUtils
+import org.apache.hudi.config.HoodieWriteConfig
 import org.apache.hudi.keygen.{ComplexKeyGenerator, KeyGenUtils}
 import org.apache.spark.sql.types.{StructType, TimestampType}
 import org.joda.time.format.DateTimeFormat
|
|||||||
None
|
None
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// The origin key generator class for this table.
|
||||||
|
private lazy val originKeyGen = {
|
||||||
|
val beforeKeyGenClassName = props.getString(SqlKeyGenerator.ORIGIN_KEYGEN_CLASS, null)
|
||||||
|
if (beforeKeyGenClassName != null) {
|
||||||
|
val keyGenProps = new TypedProperties()
|
||||||
|
keyGenProps.putAll(props)
|
||||||
|
keyGenProps.remove(SqlKeyGenerator.ORIGIN_KEYGEN_CLASS)
|
||||||
|
keyGenProps.put(HoodieWriteConfig.KEYGENERATOR_CLASS_PROP.key, beforeKeyGenClassName)
|
||||||
|
Some(KeyGenUtils.createKeyGeneratorByClassName(keyGenProps))
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
override def getPartitionPath(record: GenericRecord): String = {
|
override def getRecordKey(record: GenericRecord): String = {
|
||||||
|
if (originKeyGen.isDefined) {
|
||||||
|
originKeyGen.get.getKey(record).getRecordKey
|
||||||
|
} else {
|
||||||
|
super.getRecordKey(record)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
override def getPartitionPath(record: GenericRecord) = {
|
||||||
val partitionPath = super.getPartitionPath(record)
|
val partitionPath = super.getPartitionPath(record)
|
||||||
if (partitionSchema.isDefined) {
|
if (partitionSchema.isDefined) {
|
||||||
// we can split the partitionPath here because we enable the URL_ENCODE_PARTITIONING_OPT
|
// we can split the partitionPath here because we enable the URL_ENCODE_PARTITIONING_OPT
|
||||||
// by default for sql.
|
// by default for sql.
|
||||||
val partitionFragments = partitionPath.split(KeyGenUtils.DEFAULT_PARTITION_PATH_SEPARATOR)
|
val partitionFragments = partitionPath.split(KeyGenUtils.DEFAULT_PARTITION_PATH_SEPARATOR)
|
||||||
assert(partitionFragments.size == partitionSchema.get.size)
|
// If it is a table which is not write by spark sql before and the url encode has disabled,
|
||||||
|
// the partition path level may not equal to the partition schema size. Just return the partitionPath
|
||||||
|
// in this case.
|
||||||
|
if (partitionFragments.size != partitionSchema.get.size) {
|
||||||
|
partitionPath
|
||||||
|
} else {
|
||||||
|
partitionFragments.zip(partitionSchema.get.fields).map {
|
||||||
|
case (partitionValue, partitionField) =>
|
||||||
|
val hiveStylePrefix = s"${partitionField.name}="
|
||||||
|
val isHiveStyle = partitionValue.startsWith(hiveStylePrefix)
|
||||||
|
val _partitionValue = if (isHiveStyle) partitionValue.substring(hiveStylePrefix.length) else partitionValue
|
||||||
|
|
||||||
partitionFragments.zip(partitionSchema.get.fields).map {
|
partitionField.dataType match {
|
||||||
case (partitionValue, partitionField) =>
|
case TimestampType =>
|
||||||
val hiveStylePrefix = s"${partitionField.name}="
|
val timeMs = MILLISECONDS.convert(_partitionValue.toLong, MICROSECONDS)
|
||||||
val isHiveStyle = partitionValue.startsWith(hiveStylePrefix)
|
val timestampFormat = PartitionPathEncodeUtils.escapePathName(
|
||||||
val _partitionValue = if (isHiveStyle) {
|
SqlKeyGenerator.timestampTimeFormat.print(timeMs))
|
||||||
partitionValue.substring(hiveStylePrefix.length)
|
if (isHiveStyle) s"$hiveStylePrefix$timestampFormat" else timestampFormat
|
||||||
} else {
|
case _ => partitionValue
|
||||||
partitionValue
|
}
|
||||||
}
|
}.mkString(KeyGenUtils.DEFAULT_PARTITION_PATH_SEPARATOR)
|
||||||
|
}
|
||||||
partitionField.dataType match {
|
} else partitionPath
|
||||||
case TimestampType =>
|
|
||||||
val timeMs = MILLISECONDS.convert(_partitionValue.toLong, MICROSECONDS)
|
|
||||||
val timestampFormat = PartitionPathEncodeUtils.escapePathName(
|
|
||||||
SqlKeyGenerator.timestampTimeFormat.print(timeMs))
|
|
||||||
if (isHiveStyle) {
|
|
||||||
s"$hiveStylePrefix$timestampFormat"
|
|
||||||
} else {
|
|
||||||
timestampFormat
|
|
||||||
}
|
|
||||||
case _=> partitionValue
|
|
||||||
}
|
|
||||||
}.mkString(KeyGenUtils.DEFAULT_PARTITION_PATH_SEPARATOR)
|
|
||||||
} else {
|
|
||||||
partitionPath
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
object SqlKeyGenerator {
|
object SqlKeyGenerator {
|
||||||
val PARTITION_SCHEMA = "hoodie.sql.partition.schema"
|
val PARTITION_SCHEMA = "hoodie.sql.partition.schema"
|
||||||
|
val ORIGIN_KEYGEN_CLASS = "hoodie.sql.origin.keygen.class"
|
||||||
private val timestampTimeFormat = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss")
|
private val timestampTimeFormat = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss")
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -35,7 +35,6 @@ class TruncateHoodieTableCommand(
   override def run(sparkSession: SparkSession): Seq[Row] = {
     val table = sparkSession.sessionState.catalog.getTableMetadata(tableName)
     val path = getTableLocation(table, sparkSession)
-      .getOrElse(s"missing location for ${table.identifier}")
     val hadoopConf = sparkSession.sessionState.newHadoopConf()
     // If we have not specified the partition, truncate will delete all the
     // data in the table path include the hoodi.properties. In this case we
@@ -85,7 +85,6 @@ case class UpdateHoodieTableCommand(updateTable: UpdateTable) extends RunnableCo
     val targetTable = sparkSession.sessionState.catalog
       .getTableMetadata(tableId)
     val path = getTableLocation(targetTable, sparkSession)
-      .getOrElse(s"missing location for $tableId")

     val primaryColumns = HoodieOptionConfig.getPrimaryColumns(targetTable.storage.properties)

@@ -17,9 +17,15 @@

 package org.apache.spark.sql.hudi

+import org.apache.hudi.DataSourceWriteOptions._
+
 import scala.collection.JavaConverters._
 import org.apache.hudi.common.model.HoodieRecord
+import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient}
+import org.apache.hudi.config.HoodieWriteConfig
 import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat
+import org.apache.hudi.keygen.{ComplexKeyGenerator, NonpartitionedKeyGenerator, SimpleKeyGenerator}
+import org.apache.spark.sql.SaveMode
 import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.catalog.CatalogTableType
 import org.apache.spark.sql.types.{DoubleType, IntegerType, LongType, StringType, StructField}
@@ -272,4 +278,231 @@ class TestCreateTable extends TestHoodieSqlBase {
       )
     }
   }
+
+  test("Test Create Table From Exist Hoodie Table") {
+    withTempDir { tmp =>
+      Seq("2021-08-02", "2021/08/02").foreach { partitionValue =>
+        val tableName = generateTableName
+        val tablePath = s"${tmp.getCanonicalPath}/$tableName"
+        import spark.implicits._
+        val df = Seq((1, "a1", 10, 1000, partitionValue)).toDF("id", "name", "value", "ts", "dt")
+        // Write a table by spark dataframe.
+        df.write.format("hudi")
+          .option(HoodieWriteConfig.TABLE_NAME.key, tableName)
+          .option(TABLE_TYPE.key, COW_TABLE_TYPE_OPT_VAL)
+          .option(RECORDKEY_FIELD.key, "id")
+          .option(PRECOMBINE_FIELD.key, "ts")
+          .option(PARTITIONPATH_FIELD.key, "dt")
+          .option(KEYGENERATOR_CLASS.key, classOf[SimpleKeyGenerator].getName)
+          .option(HoodieWriteConfig.INSERT_PARALLELISM.key, "1")
+          .option(HoodieWriteConfig.UPSERT_PARALLELISM.key, "1")
+          .mode(SaveMode.Overwrite)
+          .save(tablePath)
+
+        // Create a table over the exist old table.
+        spark.sql(
+          s"""
+             |create table $tableName using hudi
+             | options (
+             |  primaryKey = 'id',
+             |  preCombineField = 'ts'
+             |)
+             |partitioned by (dt)
+             |location '$tablePath'
+             |""".stripMargin)
+        checkAnswer(s"select id, name, value, ts, dt from $tableName")(
+          Seq(1, "a1", 10, 1000, partitionValue)
+        )
+        // Check the missing properties for spark sql
+        val metaClient = HoodieTableMetaClient.builder()
+          .setBasePath(tablePath)
+          .setConf(spark.sessionState.newHadoopConf())
+          .build()
+        val properties = metaClient.getTableConfig.getProps.asScala.toMap
+        assertResult(true)(properties.contains(HoodieTableConfig.HOODIE_TABLE_CREATE_SCHEMA.key))
+        assertResult("dt")(properties(HoodieTableConfig.HOODIE_TABLE_PARTITION_FIELDS_PROP.key))
+        assertResult("ts")(properties(HoodieTableConfig.HOODIE_TABLE_PRECOMBINE_FIELD_PROP.key))
+
+        // Test insert into
+        spark.sql(s"insert into $tableName values(2, 'a2', 10, 1000, '$partitionValue')")
+        checkAnswer(s"select _hoodie_record_key, _hoodie_partition_path, id, name, value, ts, dt from $tableName order by id")(
+          Seq("1", partitionValue, 1, "a1", 10, 1000, partitionValue),
+          Seq("2", partitionValue, 2, "a2", 10, 1000, partitionValue)
+        )
+        // Test merge into
+        spark.sql(
+          s"""
+             |merge into $tableName h0
+             |using (select 1 as id, 'a1' as name, 11 as value, 1001 as ts, '$partitionValue' as dt) s0
+             |on h0.id = s0.id
+             |when matched then update set *
+             |""".stripMargin)
+        checkAnswer(s"select _hoodie_record_key, _hoodie_partition_path, id, name, value, ts, dt from $tableName order by id")(
+          Seq("1", partitionValue, 1, "a1", 11, 1001, partitionValue),
+          Seq("2", partitionValue, 2, "a2", 10, 1000, partitionValue)
+        )
+        // Test update
+        spark.sql(s"update $tableName set value = value + 1 where id = 2")
+        checkAnswer(s"select _hoodie_record_key, _hoodie_partition_path, id, name, value, ts, dt from $tableName order by id")(
+          Seq("1", partitionValue, 1, "a1", 11, 1001, partitionValue),
+          Seq("2", partitionValue, 2, "a2", 11, 1000, partitionValue)
+        )
+        // Test delete
+        spark.sql(s"delete from $tableName where id = 1")
+        checkAnswer(s"select _hoodie_record_key, _hoodie_partition_path, id, name, value, ts, dt from $tableName order by id")(
+          Seq("2", partitionValue, 2, "a2", 11, 1000, partitionValue)
+        )
+      }
+    }
+  }
+
+  test("Test Create Table From Exist Hoodie Table For Multi-Level Partitioned Table") {
+    withTempDir { tmp =>
+      Seq("2021-08-02", "2021/08/02").foreach { day =>
+        val tableName = generateTableName
+        val tablePath = s"${tmp.getCanonicalPath}/$tableName"
+        import spark.implicits._
+        val df = Seq((1, "a1", 10, 1000, day, 12)).toDF("id", "name", "value", "ts", "day", "hh")
+        // Write a table by spark dataframe.
+        df.write.format("hudi")
+          .option(HoodieWriteConfig.TABLE_NAME.key, tableName)
+          .option(TABLE_TYPE.key, MOR_TABLE_TYPE_OPT_VAL)
+          .option(RECORDKEY_FIELD.key, "id")
+          .option(PRECOMBINE_FIELD.key, "ts")
+          .option(PARTITIONPATH_FIELD.key, "day,hh")
+          .option(KEYGENERATOR_CLASS.key, classOf[ComplexKeyGenerator].getName)
+          .option(HoodieWriteConfig.INSERT_PARALLELISM.key, "1")
+          .option(HoodieWriteConfig.UPSERT_PARALLELISM.key, "1")
+          .mode(SaveMode.Overwrite)
+          .save(tablePath)
+
+        // Create a table over the exist old table.
+        spark.sql(
+          s"""
+             |create table $tableName using hudi
+             | options (
+             |  primaryKey = 'id',
+             |  preCombineField = 'ts'
+             |)
+             |partitioned by (day, hh)
+             |location '$tablePath'
+             |""".stripMargin)
+        checkAnswer(s"select id, name, value, ts, day, hh from $tableName")(
+          Seq(1, "a1", 10, 1000, day, 12)
+        )
+        // Check the missing properties for spark sql
+        val metaClient = HoodieTableMetaClient.builder()
+          .setBasePath(tablePath)
+          .setConf(spark.sessionState.newHadoopConf())
+          .build()
+        val properties = metaClient.getTableConfig.getProps.asScala.toMap
+        assertResult(true)(properties.contains(HoodieTableConfig.HOODIE_TABLE_CREATE_SCHEMA.key))
+        assertResult("day,hh")(properties(HoodieTableConfig.HOODIE_TABLE_PARTITION_FIELDS_PROP.key))
+        assertResult("ts")(properties(HoodieTableConfig.HOODIE_TABLE_PRECOMBINE_FIELD_PROP.key))
+
+        // Test insert into
+        spark.sql(s"insert into $tableName values(2, 'a2', 10, 1000, '$day', 12)")
+        checkAnswer(s"select _hoodie_record_key, _hoodie_partition_path, id, name, value, ts, day, hh from $tableName order by id")(
+          Seq("id:1", s"$day/12", 1, "a1", 10, 1000, day, 12),
+          Seq("id:2", s"$day/12", 2, "a2", 10, 1000, day, 12)
+        )
+        // Test merge into
+        spark.sql(
+          s"""
+             |merge into $tableName h0
+             |using (select 1 as id, 'a1' as name, 11 as value, 1001 as ts, '$day' as day, 12 as hh) s0
+             |on h0.id = s0.id
+             |when matched then update set *
+             |""".stripMargin)
+        checkAnswer(s"select _hoodie_record_key, _hoodie_partition_path, id, name, value, ts, day, hh from $tableName order by id")(
+          Seq("id:1", s"$day/12", 1, "a1", 11, 1001, day, 12),
+          Seq("id:2", s"$day/12", 2, "a2", 10, 1000, day, 12)
+        )
+        // Test update
+        spark.sql(s"update $tableName set value = value + 1 where id = 2")
+        checkAnswer(s"select _hoodie_record_key, _hoodie_partition_path, id, name, value, ts, day, hh from $tableName order by id")(
+          Seq("id:1", s"$day/12", 1, "a1", 11, 1001, day, 12),
+          Seq("id:2", s"$day/12", 2, "a2", 11, 1000, day, 12)
+        )
+        // Test delete
+        spark.sql(s"delete from $tableName where id = 1")
+        checkAnswer(s"select _hoodie_record_key, _hoodie_partition_path, id, name, value, ts, day, hh from $tableName order by id")(
+          Seq("id:2", s"$day/12", 2, "a2", 11, 1000, day, 12)
+        )
+      }
+    }
+  }
+
+  test("Test Create Table From Exist Hoodie Table For None Partitioned Table") {
+    withTempDir{tmp =>
+      // Write a table by spark dataframe.
+      val tableName = generateTableName
+      import spark.implicits._
+      val df = Seq((1, "a1", 10, 1000)).toDF("id", "name", "value", "ts")
+      df.write.format("hudi")
+        .option(HoodieWriteConfig.TABLE_NAME.key, tableName)
+        .option(TABLE_TYPE.key, COW_TABLE_TYPE_OPT_VAL)
+        .option(RECORDKEY_FIELD.key, "id")
+        .option(PRECOMBINE_FIELD.key, "ts")
+        .option(PARTITIONPATH_FIELD.key, "")
+        .option(KEYGENERATOR_CLASS.key, classOf[NonpartitionedKeyGenerator].getName)
+        .option(HoodieWriteConfig.INSERT_PARALLELISM.key, "1")
+        .option(HoodieWriteConfig.UPSERT_PARALLELISM.key, "1")
+        .mode(SaveMode.Overwrite)
+        .save(tmp.getCanonicalPath)
+
+      // Create a table over the exist old table.
+      spark.sql(
+        s"""
+           |create table $tableName using hudi
+           | options (
+           |  primaryKey = 'id',
+           |  preCombineField = 'ts'
+           |)
+           |location '${tmp.getCanonicalPath}'
+           |""".stripMargin)
+      checkAnswer(s"select id, name, value, ts from $tableName")(
+        Seq(1, "a1", 10, 1000)
+      )
+      // Check the missing properties for spark sql
+      val metaClient = HoodieTableMetaClient.builder()
+        .setBasePath(tmp.getCanonicalPath)
+        .setConf(spark.sessionState.newHadoopConf())
+        .build()
+      val properties = metaClient.getTableConfig.getProps.asScala.toMap
+      assertResult(true)(properties.contains(HoodieTableConfig.HOODIE_TABLE_CREATE_SCHEMA.key))
+      assertResult("ts")(properties(HoodieTableConfig.HOODIE_TABLE_PRECOMBINE_FIELD_PROP.key))
+
+      // Test insert into
+      spark.sql(s"insert into $tableName values(2, 'a2', 10, 1000)")
+      checkAnswer(s"select _hoodie_record_key, _hoodie_partition_path, id, name, value, ts from $tableName order by id")(
+        Seq("1", "", 1, "a1", 10, 1000),
+        Seq("2", "", 2, "a2", 10, 1000)
+      )
+      // Test merge into
+      spark.sql(
+        s"""
+           |merge into $tableName h0
+           |using (select 1 as id, 'a1' as name, 11 as value, 1001 as ts) s0
+           |on h0.id = s0.id
+           |when matched then update set *
+           |""".stripMargin)
+      checkAnswer(s"select id, name, value, ts from $tableName order by id")(
+        Seq(1, "a1", 11, 1001),
+        Seq(2, "a2", 10, 1000)
+      )
+      // Test update
+      spark.sql(s"update $tableName set value = value + 1 where id = 2")
+      checkAnswer(s"select id, name, value, ts from $tableName order by id")(
+        Seq(1, "a1", 11, 1001),
+        Seq(2, "a2", 11, 1000)
+      )
+      // Test delete
+      spark.sql(s"delete from $tableName where id = 1")
+      checkAnswer(s"select id, name, value, ts from $tableName order by id")(
+        Seq(2, "a2", 11, 1000)
+      )
+    }
+  }
+
 }
@@ -126,9 +126,9 @@ class TestPartialUpdateForMergeInto extends TestHoodieSqlBase {
     checkException(
       s"""
          |merge into $tableName2 t0
-         |using ( select 1 as id, 'a1' as name, 12 as price) s0
+         |using ( select 1 as id, 'a1' as name, 12 as price, 1000 as ts) s0
          |on t0.id = s0.id
-         |when matched then update set price = s0.price
+         |when matched then update set price = s0.price, _ts = s0.ts
        """.stripMargin)(
       "Missing specify the value for target field: 'id' in merge into update action for MOR table. " +
         "Currently we cannot support partial update for MOR, please complete all the target fields " +