[HUDI-2208] Support Bulk Insert For Spark Sql (#3328)
This commit is contained in:
@@ -256,6 +256,22 @@ object DataSourceWriteOptions {
|
|||||||
.withDocumentation("When set to true, will perform write operations directly using the spark native " +
|
.withDocumentation("When set to true, will perform write operations directly using the spark native " +
|
||||||
"`Row` representation, avoiding any additional conversion costs.")
|
"`Row` representation, avoiding any additional conversion costs.")
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Enable the bulk insert for sql insert statement.
|
||||||
|
*/
|
||||||
|
val SQL_ENABLE_BULK_INSERT:ConfigProperty[String] = ConfigProperty
|
||||||
|
.key("hoodie.sql.bulk.insert.enable")
|
||||||
|
.defaultValue("false")
|
||||||
|
.withDocumentation("When set to true, the sql insert statement will use bulk insert.")
|
||||||
|
|
||||||
|
val SQL_INSERT_MODE: ConfigProperty[String] = ConfigProperty
|
||||||
|
.key("hoodie.sql.insert.mode")
|
||||||
|
.defaultValue("upsert")
|
||||||
|
.withDocumentation("Insert mode when insert data to pk-table. The optional modes are: upsert, strict and non-strict." +
|
||||||
|
"For upsert mode, insert statement do the upsert operation for the pk-table which will update the duplicate record." +
|
||||||
|
"For strict mode, insert statement will keep the primary key uniqueness constraint which do not allow duplicate record." +
|
||||||
|
"While for non-strict mode, hudi just do the insert operation for the pk-table.")
|
||||||
|
|
||||||
val COMMIT_METADATA_KEYPREFIX: ConfigProperty[String] = ConfigProperty
|
val COMMIT_METADATA_KEYPREFIX: ConfigProperty[String] = ConfigProperty
|
||||||
.key("hoodie.datasource.write.commitmeta.key.prefix")
|
.key("hoodie.datasource.write.commitmeta.key.prefix")
|
||||||
.defaultValue("_")
|
.defaultValue("_")
|
||||||
|
|||||||
@@ -0,0 +1,66 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hudi.sql;
|
||||||
|
|
||||||
|
import java.util.Locale;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Insert mode for insert into pk-table.
|
||||||
|
*/
|
||||||
|
public enum InsertMode {
|
||||||
|
/**
|
||||||
|
* In upsert mode for insert into, duplicate record on primary key
|
||||||
|
* will be updated.This is the default insert mode for pk-table.
|
||||||
|
*/
|
||||||
|
UPSERT("upsert"),
|
||||||
|
/**
|
||||||
|
* In strict mode for insert into, we do the pk uniqueness guarantee
|
||||||
|
* for COW pk-table.
|
||||||
|
* For MOR pk-table, it has the same behavior with "upsert" mode.
|
||||||
|
*/
|
||||||
|
STRICT("strict"),
|
||||||
|
/**
|
||||||
|
* In non-strict mode for insert into, we use insert operation
|
||||||
|
* to write data which allow writing the duplicate record.
|
||||||
|
*/
|
||||||
|
NON_STRICT("non-strict")
|
||||||
|
;
|
||||||
|
|
||||||
|
private String value;
|
||||||
|
|
||||||
|
InsertMode(String value) {
|
||||||
|
this.value = value;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String value() {
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static InsertMode of(String value) {
|
||||||
|
switch (value.toLowerCase(Locale.ROOT)) {
|
||||||
|
case "upsert":
|
||||||
|
return UPSERT;
|
||||||
|
case "strict":
|
||||||
|
return STRICT;
|
||||||
|
case "non-strict":
|
||||||
|
return NON_STRICT;
|
||||||
|
default:
|
||||||
|
throw new AssertionError("UnSupport Insert Mode: " + value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -164,7 +164,11 @@ object HoodieSparkSqlWriter {
|
|||||||
|
|
||||||
// Convert to RDD[HoodieRecord]
|
// Convert to RDD[HoodieRecord]
|
||||||
val genericRecords: RDD[GenericRecord] = HoodieSparkUtils.createRdd(df, schema, structName, nameSpace)
|
val genericRecords: RDD[GenericRecord] = HoodieSparkUtils.createRdd(df, schema, structName, nameSpace)
|
||||||
val shouldCombine = parameters(INSERT_DROP_DUPS.key()).toBoolean || operation.equals(WriteOperationType.UPSERT);
|
|
||||||
|
val shouldCombine = parameters(INSERT_DROP_DUPS.key()).toBoolean ||
|
||||||
|
operation.equals(WriteOperationType.UPSERT) ||
|
||||||
|
parameters.getOrElse(HoodieWriteConfig.COMBINE_BEFORE_INSERT_PROP.key(),
|
||||||
|
HoodieWriteConfig.COMBINE_BEFORE_INSERT_PROP.defaultValue()).toBoolean
|
||||||
val hoodieAllIncomingRecords = genericRecords.map(gr => {
|
val hoodieAllIncomingRecords = genericRecords.map(gr => {
|
||||||
val hoodieRecord = if (shouldCombine) {
|
val hoodieRecord = if (shouldCombine) {
|
||||||
val orderingVal = HoodieAvroUtils.getNestedFieldVal(gr, hoodieConfig.getString(PRECOMBINE_FIELD), false)
|
val orderingVal = HoodieAvroUtils.getNestedFieldVal(gr, hoodieConfig.getString(PRECOMBINE_FIELD), false)
|
||||||
|
|||||||
@@ -114,7 +114,7 @@ object HoodieOptionConfig {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Mapping the sql options to the hoodie table config which used to store to the hoodie
|
* Mapping the sql options to the hoodie table config which used to store to the hoodie
|
||||||
* .properites when create the table.
|
* .properties when create the table.
|
||||||
* @param options
|
* @param options
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
|
|||||||
@@ -218,7 +218,7 @@ object HoodieSqlUtils extends SparkAdapterSupport {
|
|||||||
* Append the spark config and table options to the baseConfig.
|
* Append the spark config and table options to the baseConfig.
|
||||||
*/
|
*/
|
||||||
def withSparkConf(spark: SparkSession, options: Map[String, String])
|
def withSparkConf(spark: SparkSession, options: Map[String, String])
|
||||||
(baseConfig: Map[String, String]): Map[String, String] = {
|
(baseConfig: Map[String, String] = Map.empty): Map[String, String] = {
|
||||||
baseConfig ++ // Table options has the highest priority
|
baseConfig ++ // Table options has the highest priority
|
||||||
(spark.sessionState.conf.getAllConfs ++ HoodieOptionConfig.mappingSqlOptionToHoodieParam(options))
|
(spark.sessionState.conf.getAllConfs ++ HoodieOptionConfig.mappingSqlOptionToHoodieParam(options))
|
||||||
.filterKeys(_.startsWith("hoodie."))
|
.filterKeys(_.startsWith("hoodie."))
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ import org.apache.hadoop.conf.Configuration
|
|||||||
import org.apache.hadoop.fs.Path
|
import org.apache.hadoop.fs.Path
|
||||||
import org.apache.hudi.DataSourceWriteOptions
|
import org.apache.hudi.DataSourceWriteOptions
|
||||||
import org.apache.hudi.hive.util.ConfigUtils
|
import org.apache.hudi.hive.util.ConfigUtils
|
||||||
|
import org.apache.hudi.sql.InsertMode
|
||||||
import org.apache.spark.sql.{Row, SaveMode, SparkSession}
|
import org.apache.spark.sql.{Row, SaveMode, SparkSession}
|
||||||
import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType}
|
import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType}
|
||||||
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project}
|
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project}
|
||||||
@@ -62,8 +63,8 @@ case class CreateHoodieTableAsSelectCommand(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
val tablePath = getTableLocation(table, sparkSession)
|
val tablePath = getTableLocation(table, sparkSession)
|
||||||
val conf = sparkSession.sessionState.newHadoopConf()
|
val hadoopConf = sparkSession.sessionState.newHadoopConf()
|
||||||
assert(CreateHoodieTableCommand.isEmptyPath(tablePath, conf),
|
assert(CreateHoodieTableCommand.isEmptyPath(tablePath, hadoopConf),
|
||||||
s"Path '$tablePath' should be empty for CTAS")
|
s"Path '$tablePath' should be empty for CTAS")
|
||||||
|
|
||||||
// ReOrder the query which move the partition columns to the last of the project list
|
// ReOrder the query which move the partition columns to the last of the project list
|
||||||
@@ -72,17 +73,15 @@ case class CreateHoodieTableAsSelectCommand(
|
|||||||
|
|
||||||
// Execute the insert query
|
// Execute the insert query
|
||||||
try {
|
try {
|
||||||
// Set if sync as a managed table.
|
val options = Map(
|
||||||
sparkSession.sessionState.conf.setConfString(DataSourceWriteOptions.HIVE_CREATE_MANAGED_TABLE.key,
|
DataSourceWriteOptions.HIVE_CREATE_MANAGED_TABLE.key -> (table.tableType == CatalogTableType.MANAGED).toString,
|
||||||
(table.tableType == CatalogTableType.MANAGED).toString)
|
DataSourceWriteOptions.HIVE_TABLE_SERDE_PROPERTIES.key -> ConfigUtils.configToString(table.storage.properties.asJava),
|
||||||
// Sync the options to hive serde properties
|
DataSourceWriteOptions.HIVE_TABLE_PROPERTIES.key -> ConfigUtils.configToString(table.properties.asJava),
|
||||||
sparkSession.sessionState.conf.setConfString(DataSourceWriteOptions.HIVE_TABLE_SERDE_PROPERTIES.key,
|
DataSourceWriteOptions.SQL_INSERT_MODE.key -> InsertMode.NON_STRICT.value(),
|
||||||
ConfigUtils.configToString(table.storage.properties.asJava))
|
DataSourceWriteOptions.SQL_ENABLE_BULK_INSERT.key -> "true"
|
||||||
// Sync the table properties to hive
|
)
|
||||||
sparkSession.sessionState.conf.setConfString(DataSourceWriteOptions.HIVE_TABLE_PROPERTIES.key,
|
|
||||||
ConfigUtils.configToString(table.properties.asJava))
|
|
||||||
val success = InsertIntoHoodieTableCommand.run(sparkSession, tableWithSchema, reOrderedQuery, Map.empty,
|
val success = InsertIntoHoodieTableCommand.run(sparkSession, tableWithSchema, reOrderedQuery, Map.empty,
|
||||||
mode == SaveMode.Overwrite, refreshTable = false)
|
mode == SaveMode.Overwrite, refreshTable = false, extraOptions = options)
|
||||||
if (success) {
|
if (success) {
|
||||||
// If write success, create the table in catalog if it has not synced to the
|
// If write success, create the table in catalog if it has not synced to the
|
||||||
// catalog by the meta sync.
|
// catalog by the meta sync.
|
||||||
@@ -92,11 +91,11 @@ case class CreateHoodieTableAsSelectCommand(
|
|||||||
createTableCommand.createTableInCatalog(sparkSession, checkPathForManagedTable = false)
|
createTableCommand.createTableInCatalog(sparkSession, checkPathForManagedTable = false)
|
||||||
}
|
}
|
||||||
} else { // failed to insert data, clear table path
|
} else { // failed to insert data, clear table path
|
||||||
clearTablePath(tablePath, conf)
|
clearTablePath(tablePath, hadoopConf)
|
||||||
}
|
}
|
||||||
} catch {
|
} catch {
|
||||||
case e: Throwable => // failed to insert data, clear table path
|
case e: Throwable => // failed to insert data, clear table path
|
||||||
clearTablePath(tablePath, conf)
|
clearTablePath(tablePath, hadoopConf)
|
||||||
throw e
|
throw e
|
||||||
}
|
}
|
||||||
Seq.empty[Row]
|
Seq.empty[Row]
|
||||||
|
|||||||
@@ -28,10 +28,12 @@ import org.apache.hudi.config.HoodieWriteConfig
|
|||||||
import org.apache.hudi.config.HoodieWriteConfig.TABLE_NAME
|
import org.apache.hudi.config.HoodieWriteConfig.TABLE_NAME
|
||||||
import org.apache.hudi.hive.MultiPartKeysValueExtractor
|
import org.apache.hudi.hive.MultiPartKeysValueExtractor
|
||||||
import org.apache.hudi.hive.ddl.HiveSyncMode
|
import org.apache.hudi.hive.ddl.HiveSyncMode
|
||||||
import org.apache.hudi.{HoodieSparkSqlWriter, HoodieWriterUtils}
|
import org.apache.hudi.sql.InsertMode
|
||||||
|
import org.apache.hudi.{DataSourceWriteOptions, HoodieSparkSqlWriter, HoodieWriterUtils}
|
||||||
|
import org.apache.spark.internal.Logging
|
||||||
import org.apache.spark.sql.catalyst.catalog.CatalogTable
|
import org.apache.spark.sql.catalyst.catalog.CatalogTable
|
||||||
import org.apache.spark.sql.catalyst.expressions.{Alias, Literal}
|
import org.apache.spark.sql.catalyst.expressions.{Alias, Literal}
|
||||||
import org.apache.spark.sql.{Column, DataFrame, Dataset, Row, SaveMode, SparkSession}
|
import org.apache.spark.sql.{Dataset, Row, SaveMode, SparkSession}
|
||||||
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project}
|
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project}
|
||||||
import org.apache.spark.sql.execution.command.RunnableCommand
|
import org.apache.spark.sql.execution.command.RunnableCommand
|
||||||
import org.apache.spark.sql.execution.datasources.LogicalRelation
|
import org.apache.spark.sql.execution.datasources.LogicalRelation
|
||||||
@@ -58,7 +60,7 @@ case class InsertIntoHoodieTableCommand(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
object InsertIntoHoodieTableCommand {
|
object InsertIntoHoodieTableCommand extends Logging {
|
||||||
/**
|
/**
|
||||||
* Run the insert query. We support both dynamic partition insert and static partition insert.
|
* Run the insert query. We support both dynamic partition insert and static partition insert.
|
||||||
* @param sparkSession The spark session.
|
* @param sparkSession The spark session.
|
||||||
@@ -71,12 +73,14 @@ object InsertIntoHoodieTableCommand {
|
|||||||
* , it is None in the map.
|
* , it is None in the map.
|
||||||
* @param overwrite Whether to overwrite the table.
|
* @param overwrite Whether to overwrite the table.
|
||||||
* @param refreshTable Whether to refresh the table after insert finished.
|
* @param refreshTable Whether to refresh the table after insert finished.
|
||||||
|
* @param extraOptions Extra options for insert.
|
||||||
*/
|
*/
|
||||||
def run(sparkSession: SparkSession, table: CatalogTable, query: LogicalPlan,
|
def run(sparkSession: SparkSession, table: CatalogTable, query: LogicalPlan,
|
||||||
insertPartitions: Map[String, Option[String]],
|
insertPartitions: Map[String, Option[String]],
|
||||||
overwrite: Boolean, refreshTable: Boolean = true): Boolean = {
|
overwrite: Boolean, refreshTable: Boolean = true,
|
||||||
|
extraOptions: Map[String, String] = Map.empty): Boolean = {
|
||||||
|
|
||||||
val config = buildHoodieInsertConfig(table, sparkSession, overwrite, insertPartitions)
|
val config = buildHoodieInsertConfig(table, sparkSession, overwrite, insertPartitions, extraOptions)
|
||||||
|
|
||||||
val mode = if (overwrite && table.partitionColumnNames.isEmpty) {
|
val mode = if (overwrite && table.partitionColumnNames.isEmpty) {
|
||||||
// insert overwrite non-partition table
|
// insert overwrite non-partition table
|
||||||
@@ -165,7 +169,7 @@ object InsertIntoHoodieTableCommand {
|
|||||||
Alias(castAttr, f.name)()
|
Alias(castAttr, f.name)()
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
// Remove the hoodie meta fileds from the projects as we do not need these to write
|
// Remove the hoodie meta fields from the projects as we do not need these to write
|
||||||
val withoutMetaFieldDataProjects = dataProjects.filter(c => !HoodieSqlUtils.isMetaField(c.name))
|
val withoutMetaFieldDataProjects = dataProjects.filter(c => !HoodieSqlUtils.isMetaField(c.name))
|
||||||
val alignedProjects = withoutMetaFieldDataProjects ++ partitionProjects
|
val alignedProjects = withoutMetaFieldDataProjects ++ partitionProjects
|
||||||
Project(alignedProjects, query)
|
Project(alignedProjects, query)
|
||||||
@@ -179,7 +183,8 @@ object InsertIntoHoodieTableCommand {
|
|||||||
table: CatalogTable,
|
table: CatalogTable,
|
||||||
sparkSession: SparkSession,
|
sparkSession: SparkSession,
|
||||||
isOverwrite: Boolean,
|
isOverwrite: Boolean,
|
||||||
insertPartitions: Map[String, Option[String]] = Map.empty): Map[String, String] = {
|
insertPartitions: Map[String, Option[String]] = Map.empty,
|
||||||
|
extraOptions: Map[String, String]): Map[String, String] = {
|
||||||
|
|
||||||
if (insertPartitions.nonEmpty &&
|
if (insertPartitions.nonEmpty &&
|
||||||
(insertPartitions.keys.toSet != table.partitionColumnNames.toSet)) {
|
(insertPartitions.keys.toSet != table.partitionColumnNames.toSet)) {
|
||||||
@@ -187,7 +192,7 @@ object InsertIntoHoodieTableCommand {
|
|||||||
s"[${insertPartitions.keys.mkString(" " )}]" +
|
s"[${insertPartitions.keys.mkString(" " )}]" +
|
||||||
s" not equal to the defined partition in table[${table.partitionColumnNames.mkString(",")}]")
|
s" not equal to the defined partition in table[${table.partitionColumnNames.mkString(",")}]")
|
||||||
}
|
}
|
||||||
val parameters = HoodieOptionConfig.mappingSqlOptionToHoodieParam(table.storage.properties)
|
val parameters = withSparkConf(sparkSession, table.storage.properties)() ++ extraOptions
|
||||||
|
|
||||||
val tableType = parameters.getOrElse(TABLE_TYPE.key, TABLE_TYPE.defaultValue)
|
val tableType = parameters.getOrElse(TABLE_TYPE.key, TABLE_TYPE.defaultValue)
|
||||||
|
|
||||||
@@ -209,28 +214,49 @@ object InsertIntoHoodieTableCommand {
|
|||||||
.getOrElse(INSERT_DROP_DUPS.defaultValue)
|
.getOrElse(INSERT_DROP_DUPS.defaultValue)
|
||||||
.toBoolean
|
.toBoolean
|
||||||
|
|
||||||
val operation = if (isOverwrite) {
|
val enableBulkInsert = parameters.getOrElse(DataSourceWriteOptions.SQL_ENABLE_BULK_INSERT.key,
|
||||||
if (table.partitionColumnNames.nonEmpty) {
|
DataSourceWriteOptions.SQL_ENABLE_BULK_INSERT.defaultValue()).toBoolean
|
||||||
INSERT_OVERWRITE_OPERATION_OPT_VAL // overwrite partition
|
val isPartitionedTable = table.partitionColumnNames.nonEmpty
|
||||||
} else {
|
val isPrimaryKeyTable = primaryColumns.nonEmpty
|
||||||
INSERT_OPERATION_OPT_VAL
|
val insertMode = InsertMode.of(parameters.getOrElse(DataSourceWriteOptions.SQL_INSERT_MODE.key,
|
||||||
}
|
DataSourceWriteOptions.SQL_INSERT_MODE.defaultValue()))
|
||||||
} else {
|
val isNonStrictMode = insertMode == InsertMode.NON_STRICT
|
||||||
if (primaryColumns.nonEmpty && !dropDuplicate) {
|
|
||||||
UPSERT_OPERATION_OPT_VAL
|
|
||||||
} else {
|
|
||||||
INSERT_OPERATION_OPT_VAL
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
val payloadClassName = if (primaryColumns.nonEmpty && !dropDuplicate &&
|
val operation =
|
||||||
tableType == COW_TABLE_TYPE_OPT_VAL) {
|
(isPrimaryKeyTable, enableBulkInsert, isOverwrite, dropDuplicate) match {
|
||||||
|
case (true, true, _, _) if !isNonStrictMode =>
|
||||||
|
throw new IllegalArgumentException(s"Table with primaryKey can not use bulk insert in ${insertMode.value()} mode.")
|
||||||
|
case (_, true, true, _) if isPartitionedTable =>
|
||||||
|
throw new IllegalArgumentException(s"Insert Overwrite Partition can not use bulk insert.")
|
||||||
|
case (_, true, _, true) =>
|
||||||
|
throw new IllegalArgumentException(s"Bulk insert cannot support drop duplication." +
|
||||||
|
s" Please disable $INSERT_DROP_DUPS and try again.")
|
||||||
|
// if enableBulkInsert is true, use bulk insert for the insert overwrite non-partitioned table.
|
||||||
|
case (_, true, true, _) if !isPartitionedTable => BULK_INSERT_OPERATION_OPT_VAL
|
||||||
|
// insert overwrite partition
|
||||||
|
case (_, _, true, _) if isPartitionedTable => INSERT_OVERWRITE_OPERATION_OPT_VAL
|
||||||
|
// insert overwrite table
|
||||||
|
case (_, _, true, _) if !isPartitionedTable => INSERT_OVERWRITE_TABLE_OPERATION_OPT_VAL
|
||||||
|
// if it is pk table and the dropDuplicate has disable, use the upsert operation for strict and upsert mode.
|
||||||
|
case (true, false, false, false) if !isNonStrictMode => UPSERT_OPERATION_OPT_VAL
|
||||||
|
// if enableBulkInsert is true and the table is non-primaryKeyed, use the bulk insert operation
|
||||||
|
case (false, true, _, _) => BULK_INSERT_OPERATION_OPT_VAL
|
||||||
|
// if table is pk table and has enableBulkInsert use bulk insert for non-strict mode.
|
||||||
|
case (true, true, _, _) if isNonStrictMode => BULK_INSERT_OPERATION_OPT_VAL
|
||||||
|
// for the rest case, use the insert operation
|
||||||
|
case (_, _, _, _) => INSERT_OPERATION_OPT_VAL
|
||||||
|
}
|
||||||
|
|
||||||
|
val payloadClassName = if (operation == UPSERT_OPERATION_OPT_VAL &&
|
||||||
|
tableType == COW_TABLE_TYPE_OPT_VAL && insertMode == InsertMode.STRICT) {
|
||||||
// Only validate duplicate key for COW, for MOR it will do the merge with the DefaultHoodieRecordPayload
|
// Only validate duplicate key for COW, for MOR it will do the merge with the DefaultHoodieRecordPayload
|
||||||
// on reading.
|
// on reading.
|
||||||
classOf[ValidateDuplicateKeyPayload].getCanonicalName
|
classOf[ValidateDuplicateKeyPayload].getCanonicalName
|
||||||
} else {
|
} else {
|
||||||
classOf[DefaultHoodieRecordPayload].getCanonicalName
|
classOf[DefaultHoodieRecordPayload].getCanonicalName
|
||||||
}
|
}
|
||||||
|
logInfo(s"insert statement use write operation type: $operation, payloadClass: $payloadClassName")
|
||||||
|
|
||||||
val enableHive = isEnableHive(sparkSession)
|
val enableHive = isEnableHive(sparkSession)
|
||||||
withSparkConf(sparkSession, options) {
|
withSparkConf(sparkSession, options) {
|
||||||
Map(
|
Map(
|
||||||
@@ -243,6 +269,8 @@ object InsertIntoHoodieTableCommand {
|
|||||||
RECORDKEY_FIELD.key -> primaryColumns.mkString(","),
|
RECORDKEY_FIELD.key -> primaryColumns.mkString(","),
|
||||||
PARTITIONPATH_FIELD.key -> partitionFields,
|
PARTITIONPATH_FIELD.key -> partitionFields,
|
||||||
PAYLOAD_CLASS.key -> payloadClassName,
|
PAYLOAD_CLASS.key -> payloadClassName,
|
||||||
|
ENABLE_ROW_WRITER.key -> enableBulkInsert.toString,
|
||||||
|
HoodieWriteConfig.COMBINE_BEFORE_INSERT_PROP.key -> isPrimaryKeyTable.toString,
|
||||||
META_SYNC_ENABLED.key -> enableHive.toString,
|
META_SYNC_ENABLED.key -> enableHive.toString,
|
||||||
HIVE_SYNC_MODE.key -> HiveSyncMode.HMS.name(),
|
HIVE_SYNC_MODE.key -> HiveSyncMode.HMS.name(),
|
||||||
HIVE_USE_JDBC.key -> "false",
|
HIVE_USE_JDBC.key -> "false",
|
||||||
|
|||||||
@@ -22,9 +22,10 @@ import org.apache.avro.generic.GenericRecord
|
|||||||
import org.apache.hudi.common.config.TypedProperties
|
import org.apache.hudi.common.config.TypedProperties
|
||||||
import org.apache.hudi.common.util.PartitionPathEncodeUtils
|
import org.apache.hudi.common.util.PartitionPathEncodeUtils
|
||||||
import org.apache.hudi.config.HoodieWriteConfig
|
import org.apache.hudi.config.HoodieWriteConfig
|
||||||
import org.apache.hudi.keygen.{ComplexKeyGenerator, KeyGenUtils}
|
import org.apache.hudi.keygen.{BaseKeyGenerator, ComplexKeyGenerator, KeyGenUtils, SparkKeyGeneratorInterface}
|
||||||
|
import org.apache.spark.sql.Row
|
||||||
import org.apache.spark.sql.types.{StructType, TimestampType}
|
import org.apache.spark.sql.types.{StructType, TimestampType}
|
||||||
import org.joda.time.format.DateTimeFormat
|
import org.joda.time.format.{DateTimeFormat, DateTimeFormatter}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A complex key generator for sql command which do some process for the
|
* A complex key generator for sql command which do some process for the
|
||||||
@@ -62,8 +63,15 @@ class SqlKeyGenerator(props: TypedProperties) extends ComplexKeyGenerator(props)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
override def getPartitionPath(record: GenericRecord) = {
|
override def getRecordKey(row: Row): String = {
|
||||||
val partitionPath = super.getPartitionPath(record)
|
if (originKeyGen.isDefined && originKeyGen.get.isInstanceOf[SparkKeyGeneratorInterface]) {
|
||||||
|
originKeyGen.get.asInstanceOf[SparkKeyGeneratorInterface].getRecordKey(row)
|
||||||
|
} else {
|
||||||
|
super.getRecordKey(row)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private def convertPartitionPathToSqlType(partitionPath: String, rowType: Boolean): String = {
|
||||||
if (partitionSchema.isDefined) {
|
if (partitionSchema.isDefined) {
|
||||||
// we can split the partitionPath here because we enable the URL_ENCODE_PARTITIONING_OPT
|
// we can split the partitionPath here because we enable the URL_ENCODE_PARTITIONING_OPT
|
||||||
// by default for sql.
|
// by default for sql.
|
||||||
@@ -82,9 +90,13 @@ class SqlKeyGenerator(props: TypedProperties) extends ComplexKeyGenerator(props)
|
|||||||
|
|
||||||
partitionField.dataType match {
|
partitionField.dataType match {
|
||||||
case TimestampType =>
|
case TimestampType =>
|
||||||
val timeMs = MILLISECONDS.convert(_partitionValue.toLong, MICROSECONDS)
|
val timeMs = if (rowType) { // In RowType, the partitionPathValue is the time format string, convert to millis
|
||||||
|
SqlKeyGenerator.sqlTimestampFormat.parseMillis(_partitionValue)
|
||||||
|
} else {
|
||||||
|
MILLISECONDS.convert(_partitionValue.toLong, MICROSECONDS)
|
||||||
|
}
|
||||||
val timestampFormat = PartitionPathEncodeUtils.escapePathName(
|
val timestampFormat = PartitionPathEncodeUtils.escapePathName(
|
||||||
SqlKeyGenerator.timestampTimeFormat.print(timeMs))
|
SqlKeyGenerator.timestampTimeFormat.print(timeMs))
|
||||||
if (isHiveStyle) s"$hiveStylePrefix$timestampFormat" else timestampFormat
|
if (isHiveStyle) s"$hiveStylePrefix$timestampFormat" else timestampFormat
|
||||||
case _ => partitionValue
|
case _ => partitionValue
|
||||||
}
|
}
|
||||||
@@ -92,10 +104,21 @@ class SqlKeyGenerator(props: TypedProperties) extends ComplexKeyGenerator(props)
|
|||||||
}
|
}
|
||||||
} else partitionPath
|
} else partitionPath
|
||||||
}
|
}
|
||||||
|
|
||||||
|
override def getPartitionPath(record: GenericRecord) = {
|
||||||
|
val partitionPath = super.getPartitionPath(record)
|
||||||
|
convertPartitionPathToSqlType(partitionPath, false)
|
||||||
|
}
|
||||||
|
|
||||||
|
override def getPartitionPath(row: Row): String = {
|
||||||
|
val partitionPath = super.getPartitionPath(row)
|
||||||
|
convertPartitionPathToSqlType(partitionPath, true)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
object SqlKeyGenerator {
|
object SqlKeyGenerator {
|
||||||
val PARTITION_SCHEMA = "hoodie.sql.partition.schema"
|
val PARTITION_SCHEMA = "hoodie.sql.partition.schema"
|
||||||
val ORIGIN_KEYGEN_CLASS = "hoodie.sql.origin.keygen.class"
|
val ORIGIN_KEYGEN_CLASS = "hoodie.sql.origin.keygen.class"
|
||||||
private val timestampTimeFormat = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss")
|
private val timestampTimeFormat = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss")
|
||||||
|
private val sqlTimestampFormat = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.S")
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -18,9 +18,9 @@
|
|||||||
package org.apache.spark.sql.hudi.command
|
package org.apache.spark.sql.hudi.command
|
||||||
|
|
||||||
import java.util.UUID
|
import java.util.UUID
|
||||||
|
|
||||||
import org.apache.avro.generic.GenericRecord
|
import org.apache.avro.generic.GenericRecord
|
||||||
import org.apache.hudi.common.config.TypedProperties
|
import org.apache.hudi.common.config.TypedProperties
|
||||||
|
import org.apache.spark.sql.Row
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A KeyGenerator which use the uuid as the record key.
|
* A KeyGenerator which use the uuid as the record key.
|
||||||
@@ -28,4 +28,6 @@ import org.apache.hudi.common.config.TypedProperties
|
|||||||
class UuidKeyGenerator(props: TypedProperties) extends SqlKeyGenerator(props) {
|
class UuidKeyGenerator(props: TypedProperties) extends SqlKeyGenerator(props) {
|
||||||
|
|
||||||
override def getRecordKey(record: GenericRecord): String = UUID.randomUUID.toString
|
override def getRecordKey(record: GenericRecord): String = UUID.randomUUID.toString
|
||||||
|
|
||||||
|
override def getRecordKey(row: Row): String = UUID.randomUUID.toString
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -79,12 +79,15 @@ class TestHoodieSqlBase extends FunSuite with BeforeAndAfterAll {
|
|||||||
}
|
}
|
||||||
|
|
||||||
protected def checkException(sql: String)(errorMsg: String): Unit = {
|
protected def checkException(sql: String)(errorMsg: String): Unit = {
|
||||||
|
var hasException = false
|
||||||
try {
|
try {
|
||||||
spark.sql(sql)
|
spark.sql(sql)
|
||||||
} catch {
|
} catch {
|
||||||
case e: Throwable =>
|
case e: Throwable =>
|
||||||
assertResult(errorMsg)(e.getMessage)
|
assertResult(errorMsg)(e.getMessage)
|
||||||
|
hasException = true
|
||||||
}
|
}
|
||||||
|
assertResult(true)(hasException)
|
||||||
}
|
}
|
||||||
|
|
||||||
protected def removeQuotes(value: Any): Any = {
|
protected def removeQuotes(value: Any): Any = {
|
||||||
|
|||||||
@@ -64,6 +64,7 @@ class TestInsertTable extends TestHoodieSqlBase {
|
|||||||
test("Test Insert Into None Partitioned Table") {
|
test("Test Insert Into None Partitioned Table") {
|
||||||
withTempDir { tmp =>
|
withTempDir { tmp =>
|
||||||
val tableName = generateTableName
|
val tableName = generateTableName
|
||||||
|
spark.sql(s"set hoodie.sql.insert.mode=strict")
|
||||||
// Create none partitioned cow table
|
// Create none partitioned cow table
|
||||||
spark.sql(
|
spark.sql(
|
||||||
s"""
|
s"""
|
||||||
@@ -80,7 +81,6 @@ class TestInsertTable extends TestHoodieSqlBase {
|
|||||||
| preCombineField = 'ts'
|
| preCombineField = 'ts'
|
||||||
| )
|
| )
|
||||||
""".stripMargin)
|
""".stripMargin)
|
||||||
|
|
||||||
spark.sql(s"insert into $tableName values(1, 'a1', 10, 1000)")
|
spark.sql(s"insert into $tableName values(1, 'a1', 10, 1000)")
|
||||||
checkAnswer(s"select id, name, price, ts from $tableName")(
|
checkAnswer(s"select id, name, price, ts from $tableName")(
|
||||||
Seq(1, "a1", 10.0, 1000)
|
Seq(1, "a1", 10.0, 1000)
|
||||||
@@ -127,6 +127,9 @@ class TestInsertTable extends TestHoodieSqlBase {
|
|||||||
checkAnswer(s"select id, name, price, ts from $tableName2")(
|
checkAnswer(s"select id, name, price, ts from $tableName2")(
|
||||||
Seq(1, "a1", 10.0, 1000)
|
Seq(1, "a1", 10.0, 1000)
|
||||||
)
|
)
|
||||||
|
// disable this config to avoid affect other test in this class.
|
||||||
|
spark.sql("set hoodie.datasource.write.insert.drop.duplicates = false")
|
||||||
|
spark.sql(s"set hoodie.sql.insert.mode=upsert")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -146,7 +149,6 @@ class TestInsertTable extends TestHoodieSqlBase {
|
|||||||
| partitioned by (dt)
|
| partitioned by (dt)
|
||||||
| location '${tmp.getCanonicalPath}/$tableName'
|
| location '${tmp.getCanonicalPath}/$tableName'
|
||||||
""".stripMargin)
|
""".stripMargin)
|
||||||
|
|
||||||
// Insert overwrite dynamic partition
|
// Insert overwrite dynamic partition
|
||||||
spark.sql(
|
spark.sql(
|
||||||
s"""
|
s"""
|
||||||
@@ -246,7 +248,6 @@ class TestInsertTable extends TestHoodieSqlBase {
|
|||||||
| partitioned by (dt)
|
| partitioned by (dt)
|
||||||
| location '${tmp.getCanonicalPath}/$tableName'
|
| location '${tmp.getCanonicalPath}/$tableName'
|
||||||
""".stripMargin)
|
""".stripMargin)
|
||||||
|
|
||||||
spark.sql(s"insert into $tableName partition(dt = $partitionValue) select 1, 'a1', 10")
|
spark.sql(s"insert into $tableName partition(dt = $partitionValue) select 1, 'a1', 10")
|
||||||
spark.sql(s"insert into $tableName select 2, 'a2', 10, $partitionValue")
|
spark.sql(s"insert into $tableName select 2, 'a2', 10, $partitionValue")
|
||||||
checkAnswer(s"select id, name, price, cast(dt as string) from $tableName order by id")(
|
checkAnswer(s"select id, name, price, cast(dt as string) from $tableName order by id")(
|
||||||
@@ -303,5 +304,220 @@ class TestInsertTable extends TestHoodieSqlBase {
|
|||||||
"assertion failed: Required select columns count: 4, Current select columns(including static partition column)" +
|
"assertion failed: Required select columns count: 4, Current select columns(including static partition column)" +
|
||||||
" count: 3,columns: (1,a1,10)"
|
" count: 3,columns: (1,a1,10)"
|
||||||
)
|
)
|
||||||
|
spark.sql("set hoodie.sql.bulk.insert.enable = true")
|
||||||
|
spark.sql("set hoodie.sql.insert.mode= strict")
|
||||||
|
|
||||||
|
val tableName2 = generateTableName
|
||||||
|
spark.sql(
|
||||||
|
s"""
|
||||||
|
|create table $tableName2 (
|
||||||
|
| id int,
|
||||||
|
| name string,
|
||||||
|
| price double,
|
||||||
|
| ts long
|
||||||
|
|) using hudi
|
||||||
|
| options (
|
||||||
|
| primaryKey = 'id',
|
||||||
|
| preCombineField = 'ts'
|
||||||
|
| )
|
||||||
|
""".stripMargin)
|
||||||
|
checkException(s"insert into $tableName2 values(1, 'a1', 10, 1000)")(
|
||||||
|
"Table with primaryKey can not use bulk insert in strict mode."
|
||||||
|
)
|
||||||
|
|
||||||
|
val tableName3 = generateTableName
|
||||||
|
spark.sql(
|
||||||
|
s"""
|
||||||
|
|create table $tableName3 (
|
||||||
|
| id int,
|
||||||
|
| name string,
|
||||||
|
| price double,
|
||||||
|
| dt string
|
||||||
|
|) using hudi
|
||||||
|
| partitioned by (dt)
|
||||||
|
""".stripMargin)
|
||||||
|
checkException(s"insert overwrite table $tableName3 values(1, 'a1', 10, '2021-07-18')")(
|
||||||
|
"Insert Overwrite Partition can not use bulk insert."
|
||||||
|
)
|
||||||
|
spark.sql("set hoodie.sql.bulk.insert.enable = false")
|
||||||
|
spark.sql("set hoodie.sql.insert.mode= upsert")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
test("Test bulk insert") {
|
||||||
|
withTempDir { tmp =>
|
||||||
|
Seq("cow", "mor").foreach {tableType =>
|
||||||
|
// Test bulk insert for single partition
|
||||||
|
val tableName = generateTableName
|
||||||
|
spark.sql(
|
||||||
|
s"""
|
||||||
|
|create table $tableName (
|
||||||
|
| id int,
|
||||||
|
| name string,
|
||||||
|
| price double,
|
||||||
|
| dt string
|
||||||
|
|) using hudi
|
||||||
|
| options (
|
||||||
|
| type = '$tableType'
|
||||||
|
| )
|
||||||
|
| partitioned by (dt)
|
||||||
|
| location '${tmp.getCanonicalPath}/$tableName'
|
||||||
|
""".stripMargin)
|
||||||
|
spark.sql("set hoodie.datasource.write.insert.drop.duplicates = false")
|
||||||
|
|
||||||
|
// Enable the bulk insert
|
||||||
|
spark.sql("set hoodie.sql.bulk.insert.enable = true")
|
||||||
|
spark.sql(s"insert into $tableName values(1, 'a1', 10, '2021-07-18')")
|
||||||
|
|
||||||
|
checkAnswer(s"select id, name, price, dt from $tableName")(
|
||||||
|
Seq(1, "a1", 10.0, "2021-07-18")
|
||||||
|
)
|
||||||
|
// Disable the bulk insert
|
||||||
|
spark.sql("set hoodie.sql.bulk.insert.enable = false")
|
||||||
|
spark.sql(s"insert into $tableName values(2, 'a2', 10, '2021-07-18')")
|
||||||
|
|
||||||
|
checkAnswer(s"select id, name, price, dt from $tableName order by id")(
|
||||||
|
Seq(1, "a1", 10.0, "2021-07-18"),
|
||||||
|
Seq(2, "a2", 10.0, "2021-07-18")
|
||||||
|
)
|
||||||
|
|
||||||
|
// Test bulk insert for multi-level partition
|
||||||
|
val tableMultiPartition = generateTableName
|
||||||
|
spark.sql(
|
||||||
|
s"""
|
||||||
|
|create table $tableMultiPartition (
|
||||||
|
| id int,
|
||||||
|
| name string,
|
||||||
|
| price double,
|
||||||
|
| dt string,
|
||||||
|
| hh string
|
||||||
|
|) using hudi
|
||||||
|
| options (
|
||||||
|
| type = '$tableType'
|
||||||
|
| )
|
||||||
|
| partitioned by (dt, hh)
|
||||||
|
| location '${tmp.getCanonicalPath}/$tableMultiPartition'
|
||||||
|
""".stripMargin)
|
||||||
|
|
||||||
|
// Enable the bulk insert
|
||||||
|
spark.sql("set hoodie.sql.bulk.insert.enable = true")
|
||||||
|
spark.sql(s"insert into $tableMultiPartition values(1, 'a1', 10, '2021-07-18', '12')")
|
||||||
|
|
||||||
|
checkAnswer(s"select id, name, price, dt, hh from $tableMultiPartition")(
|
||||||
|
Seq(1, "a1", 10.0, "2021-07-18", "12")
|
||||||
|
)
|
||||||
|
// Disable the bulk insert
|
||||||
|
spark.sql("set hoodie.sql.bulk.insert.enable = false")
|
||||||
|
spark.sql(s"insert into $tableMultiPartition " +
|
||||||
|
s"values(2, 'a2', 10, '2021-07-18','12')")
|
||||||
|
|
||||||
|
checkAnswer(s"select id, name, price, dt, hh from $tableMultiPartition order by id")(
|
||||||
|
Seq(1, "a1", 10.0, "2021-07-18", "12"),
|
||||||
|
Seq(2, "a2", 10.0, "2021-07-18", "12")
|
||||||
|
)
|
||||||
|
// Test bulk insert for non-partitioned table
|
||||||
|
val nonPartitionedTable = generateTableName
|
||||||
|
spark.sql(
|
||||||
|
s"""
|
||||||
|
|create table $nonPartitionedTable (
|
||||||
|
| id int,
|
||||||
|
| name string,
|
||||||
|
| price double
|
||||||
|
|) using hudi
|
||||||
|
| options (
|
||||||
|
| type = '$tableType'
|
||||||
|
| )
|
||||||
|
| location '${tmp.getCanonicalPath}/$nonPartitionedTable'
|
||||||
|
""".stripMargin)
|
||||||
|
spark.sql("set hoodie.sql.bulk.insert.enable = true")
|
||||||
|
spark.sql(s"insert into $nonPartitionedTable values(1, 'a1', 10)")
|
||||||
|
checkAnswer(s"select id, name, price from $nonPartitionedTable")(
|
||||||
|
Seq(1, "a1", 10.0)
|
||||||
|
)
|
||||||
|
spark.sql(s"insert overwrite table $nonPartitionedTable values(2, 'a2', 10)")
|
||||||
|
checkAnswer(s"select id, name, price from $nonPartitionedTable")(
|
||||||
|
Seq(2, "a2", 10.0)
|
||||||
|
)
|
||||||
|
spark.sql("set hoodie.sql.bulk.insert.enable = false")
|
||||||
|
|
||||||
|
// Test CTAS for bulk insert
|
||||||
|
val tableName2 = generateTableName
|
||||||
|
spark.sql(
|
||||||
|
s"""
|
||||||
|
|create table $tableName2
|
||||||
|
|using hudi
|
||||||
|
|options(
|
||||||
|
| type = '$tableType',
|
||||||
|
| primaryKey = 'id'
|
||||||
|
|)
|
||||||
|
| location '${tmp.getCanonicalPath}/$tableName2'
|
||||||
|
| as
|
||||||
|
| select * from $tableName
|
||||||
|
|""".stripMargin)
|
||||||
|
checkAnswer(s"select id, name, price, dt from $tableName2 order by id")(
|
||||||
|
Seq(1, "a1", 10.0, "2021-07-18"),
|
||||||
|
Seq(2, "a2", 10.0, "2021-07-18")
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
test("Test combine before insert") {
|
||||||
|
withTempDir{tmp =>
|
||||||
|
val tableName = generateTableName
|
||||||
|
spark.sql(
|
||||||
|
s"""
|
||||||
|
|create table $tableName (
|
||||||
|
| id int,
|
||||||
|
| name string,
|
||||||
|
| price double,
|
||||||
|
| ts long
|
||||||
|
|) using hudi
|
||||||
|
| location '${tmp.getCanonicalPath}/$tableName'
|
||||||
|
| options (
|
||||||
|
| primaryKey = 'id',
|
||||||
|
| preCombineField = 'ts'
|
||||||
|
| )
|
||||||
|
""".stripMargin)
|
||||||
|
spark.sql(
|
||||||
|
s"""
|
||||||
|
|insert overwrite table $tableName
|
||||||
|
|select * from (
|
||||||
|
| select 1 as id, 'a1' as name, 10 as price, 1000 as ts
|
||||||
|
| union all
|
||||||
|
| select 1 as id, 'a1' as name, 11 as price, 1001 as ts
|
||||||
|
| )
|
||||||
|
|""".stripMargin
|
||||||
|
)
|
||||||
|
checkAnswer(s"select id, name, price, ts from $tableName")(
|
||||||
|
Seq(1, "a1", 11.0, 1001)
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
test("Test insert pk-table") {
|
||||||
|
withTempDir{tmp =>
|
||||||
|
val tableName = generateTableName
|
||||||
|
spark.sql(
|
||||||
|
s"""
|
||||||
|
|create table $tableName (
|
||||||
|
| id int,
|
||||||
|
| name string,
|
||||||
|
| price double,
|
||||||
|
| ts long
|
||||||
|
|) using hudi
|
||||||
|
| location '${tmp.getCanonicalPath}/$tableName'
|
||||||
|
| options (
|
||||||
|
| primaryKey = 'id',
|
||||||
|
| preCombineField = 'ts'
|
||||||
|
| )
|
||||||
|
""".stripMargin)
|
||||||
|
spark.sql(s"insert into $tableName values(1, 'a1', 10, 1000)")
|
||||||
|
spark.sql(s"insert into $tableName values(1, 'a1', 11, 1000)")
|
||||||
|
checkAnswer(s"select id, name, price, ts from $tableName")(
|
||||||
|
Seq(1, "a1", 11.0, 1000)
|
||||||
|
)
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user