1
0

[HUDI-2837] Add support for using database name in incremental query (#4083)

This commit is contained in:
董可伦
2022-01-23 14:11:27 +08:00
committed by GitHub
parent 64b1426005
commit 56cd8ffae0
19 changed files with 330 additions and 63 deletions

View File

@@ -85,6 +85,7 @@ object HoodieSparkSqlWriter {
validateTableConfig(sqlContext.sparkSession, optParams, tableConfig)
val (parameters, hoodieConfig) = mergeParamsAndGetHoodieConfig(optParams, tableConfig)
val databaseName = hoodieConfig.getStringOrDefault(HoodieTableConfig.DATABASE_NAME, "")
val tblName = hoodieConfig.getStringOrThrow(HoodieWriteConfig.TBL_NAME,
s"'${HoodieWriteConfig.TBL_NAME.key}' must be set.").trim
assert(!StringUtils.isNullOrEmpty(hoodieConfig.getString(HoodieWriteConfig.TBL_NAME)),
@@ -131,6 +132,7 @@ object HoodieSparkSqlWriter {
val tableMetaClient = HoodieTableMetaClient.withPropertyBuilder()
.setTableType(tableType)
.setDatabaseName(databaseName)
.setTableName(tblName)
.setRecordKeyFields(recordKeyFields)
.setBaseFileFormat(baseFileFormat)

View File

@@ -21,8 +21,7 @@ import org.apache.hudi.AvroConversionUtils
import org.apache.hudi.HoodieWriterUtils._
import org.apache.hudi.common.config.DFSPropertiesConfiguration
import org.apache.hudi.common.model.HoodieTableType
import org.apache.hudi.common.table.HoodieTableConfig
import org.apache.hudi.common.table.HoodieTableMetaClient
import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient}
import org.apache.hudi.common.util.ValidationUtils
import org.apache.hudi.keygen.ComplexKeyGenerator
import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory
@@ -184,8 +183,10 @@ class HoodieCatalogTable(val spark: SparkSession, val table: CatalogTable) exten
} else {
val (recordName, namespace) = AvroConversionUtils.getAvroRecordNameAndNamespace(table.identifier.table)
val schema = SchemaConverters.toAvroType(finalSchema, false, recordName, namespace)
val hoodieDatabaseName = formatName(spark, table.identifier.database.getOrElse(spark.sessionState.catalog.getCurrentDatabase))
HoodieTableMetaClient.withPropertyBuilder()
.fromProperties(properties)
.setDatabaseName(hoodieDatabaseName)
.setTableName(table.identifier.table)
.setTableCreateSchema(schema.toString())
.setPartitionFields(table.partitionColumnNames.mkString(","))

View File

@@ -202,11 +202,11 @@ object HoodieOptionConfig {
s"Can't find primaryKey `$primaryKey` in ${schema.treeString}.")
}
// validate precombine key
val precombineKey = sqlOptions.get(SQL_KEY_PRECOMBINE_FIELD.sqlKeyName)
if (precombineKey.isDefined && precombineKey.get.nonEmpty) {
ValidationUtils.checkArgument(schema.exists(f => resolver(f.name, precombineKey.get)),
s"Can't find preCombineKey `${precombineKey.get}` in ${schema.treeString}.")
// validate preCombine key
val preCombineKey = sqlOptions.get(SQL_KEY_PRECOMBINE_FIELD.sqlKeyName)
if (preCombineKey.isDefined && preCombineKey.get.nonEmpty) {
ValidationUtils.checkArgument(schema.exists(f => resolver(f.name, preCombineKey.get)),
s"Can't find preCombineKey `${preCombineKey.get}` in ${schema.treeString}.")
}
// validate table type

View File

@@ -123,12 +123,12 @@ object CreateHoodieTableCommand {
table.storage.compressed,
storageProperties + ("path" -> path))
val tablName = HoodieSqlCommonUtils.formatName(sparkSession, table.identifier.table)
val tableName = HoodieSqlCommonUtils.formatName(sparkSession, table.identifier.table)
val newDatabaseName = HoodieSqlCommonUtils.formatName(sparkSession, table.identifier.database
.getOrElse(catalog.getCurrentDatabase))
val newTableIdentifier = table.identifier
.copy(table = tablName, database = Some(newDatabaseName))
.copy(table = tableName, database = Some(newDatabaseName))
val partitionColumnNames = hoodieCatalogTable.partitionSchema.map(_.name)
// append pk, preCombineKey, type to the properties of table

View File

@@ -50,7 +50,7 @@ class TruncateHoodieTableCommand(
}
// If we have not specified the partition, truncate will delete all the data in the table path
// include the hoodi.properties. In this case we should reInit the table.
// include the hoodie.properties. In this case we should reInit the table.
if (partitionSpec.isEmpty) {
val hadoopConf = sparkSession.sessionState.newHadoopConf()
// ReInit hoodie.properties

View File

@@ -115,7 +115,7 @@ class ExpressionPayload(record: GenericRecord,
if (targetRecord.isEmpty || needUpdatingPersistedRecord(targetRecord.get, resultRecord, properties)) {
resultRecordOpt = HOption.of(resultRecord)
} else {
// if the PreCombine field value of targetRecord is greate
// if the PreCombine field value of targetRecord is greater
// than the new incoming record, just keep the old record value.
resultRecordOpt = HOption.of(targetRecord.get)
}
@@ -270,7 +270,7 @@ class ExpressionPayload(record: GenericRecord,
object ExpressionPayload {
/**
* Property for pass the merge-into delete clause condition expresssion.
* Property for pass the merge-into delete clause condition expression.
*/
val PAYLOAD_DELETE_CONDITION = "hoodie.payload.delete.condition"

View File

@@ -146,7 +146,7 @@ object InsertIntoHoodieTableCommand extends Logging {
queryOutputWithoutMetaFields
}
// Align for the data fields of the query
val dataProjectsWithputMetaFields = queryDataFieldsWithoutMetaFields.zip(
val dataProjectsWithoutMetaFields = queryDataFieldsWithoutMetaFields.zip(
hoodieCatalogTable.dataSchemaWithoutMetaFields.fields).map { case (dataAttr, targetField) =>
val castAttr = castIfNeeded(dataAttr.withNullability(targetField.nullable),
targetField.dataType, conf)
@@ -171,7 +171,7 @@ object InsertIntoHoodieTableCommand extends Logging {
Alias(castAttr, f.name)()
})
}
val alignedProjects = dataProjectsWithputMetaFields ++ partitionProjects
val alignedProjects = dataProjectsWithoutMetaFields ++ partitionProjects
Project(alignedProjects, query)
}
@@ -217,7 +217,7 @@ object InsertIntoHoodieTableCommand extends Logging {
DataSourceWriteOptions.SQL_INSERT_MODE.defaultValue()))
val isNonStrictMode = insertMode == InsertMode.NON_STRICT
val isPartitionedTable = hoodieCatalogTable.partitionFields.nonEmpty
val hasPrecombineColumn = preCombineColumn.nonEmpty
val hasPreCombineColumn = preCombineColumn.nonEmpty
val operation =
(enableBulkInsert, isOverwrite, dropDuplicate, isNonStrictMode, isPartitionedTable) match {
case (true, _, _, false, _) =>
@@ -234,7 +234,7 @@ object InsertIntoHoodieTableCommand extends Logging {
// insert overwrite partition
case (_, true, _, _, true) => INSERT_OVERWRITE_OPERATION_OPT_VAL
// disable dropDuplicate, and provide preCombineKey, use the upsert operation for strict and upsert mode.
case (false, false, false, false, _) if hasPrecombineColumn => UPSERT_OPERATION_OPT_VAL
case (false, false, false, false, _) if hasPreCombineColumn => UPSERT_OPERATION_OPT_VAL
// if table is pk table and has enableBulkInsert use bulk insert for non-strict mode.
case (true, _, _, true, _) => BULK_INSERT_OPERATION_OPT_VAL
// for the rest case, use the insert operation
@@ -267,7 +267,7 @@ object InsertIntoHoodieTableCommand extends Logging {
PARTITIONPATH_FIELD.key -> partitionFields,
PAYLOAD_CLASS_NAME.key -> payloadClassName,
ENABLE_ROW_WRITER.key -> enableBulkInsert.toString,
HoodieWriteConfig.COMBINE_BEFORE_INSERT.key -> String.valueOf(hasPrecombineColumn),
HoodieWriteConfig.COMBINE_BEFORE_INSERT.key -> String.valueOf(hasPreCombineColumn),
META_SYNC_ENABLED.key -> enableHive.toString,
HIVE_SYNC_MODE.key -> HiveSyncMode.HMS.name(),
HIVE_USE_JDBC.key -> "false",

View File

@@ -450,7 +450,7 @@ case class MergeIntoHoodieTableCommand(mergeInto: MergeIntoTable) extends Hoodie
"path" -> path,
RECORDKEY_FIELD.key -> tableConfig.getRecordKeyFieldProp,
PRECOMBINE_FIELD.key -> hoodieCatalogTable.preCombineKey.getOrElse(""),
TBL_NAME.key -> targetTableName,
TBL_NAME.key -> hoodieCatalogTable.tableName,
PARTITIONPATH_FIELD.key -> tableConfig.getPartitionFieldProp,
PAYLOAD_CLASS_NAME.key -> classOf[ExpressionPayload].getCanonicalName,
HIVE_STYLE_PARTITIONING.key -> tableConfig.getHiveStylePartitioningEnable,

View File

@@ -33,6 +33,10 @@ import scala.collection.JavaConverters._
class TestCreateTable extends TestHoodieSqlBase {
test("Test Create Managed Hoodie Table") {
val databaseName = "hudi_database"
spark.sql(s"create database if not exists $databaseName")
spark.sql(s"use $databaseName")
val tableName = generateTableName
// Create a managed table
spark.sql(
@@ -60,6 +64,14 @@ class TestCreateTable extends TestHoodieSqlBase {
StructField("price", DoubleType),
StructField("ts", LongType))
)(table.schema.fields)
val tablePath = table.storage.properties("path")
val metaClient = HoodieTableMetaClient.builder()
.setBasePath(tablePath)
.setConf(spark.sessionState.newHadoopConf())
.build()
val tableConfig = metaClient.getTableConfig
assertResult(databaseName)(tableConfig.getDatabaseName)
}
test("Test Create Hoodie Table With Options") {
@@ -88,7 +100,7 @@ class TestCreateTable extends TestHoodieSqlBase {
assertResult(CatalogTableType.MANAGED)(table.tableType)
assertResult(
HoodieRecord.HOODIE_META_COLUMNS.asScala.map(StructField(_, StringType))
++ Seq(
++ Seq(
StructField("id", IntegerType),
StructField("name", StringType),
StructField("price", DoubleType),
@@ -192,7 +204,7 @@ class TestCreateTable extends TestHoodieSqlBase {
}
test("Test Table Column Validate") {
withTempDir {tmp =>
withTempDir { tmp =>
val tableName = generateTableName
assertThrows[IllegalArgumentException] {
spark.sql(
@@ -277,7 +289,7 @@ class TestCreateTable extends TestHoodieSqlBase {
| select 1 as id, 'a1' as name, 10 as price, '2021-04-01' as dt
""".stripMargin
)
checkAnswer(s"select id, name, price, dt from $tableName2") (
checkAnswer(s"select id, name, price, dt from $tableName2")(
Seq(1, "a1", 10, "2021-04-01")
)
@@ -360,6 +372,10 @@ class TestCreateTable extends TestHoodieSqlBase {
test("Test Create Table From Existing Hoodie Table") {
withTempDir { tmp =>
val databaseName = "hudi_database"
spark.sql(s"create database if not exists $databaseName")
spark.sql(s"use $databaseName")
Seq("2021-08-02", "2021/08/02").foreach { partitionValue =>
val tableName = generateTableName
val tablePath = s"${tmp.getCanonicalPath}/$tableName"
@@ -367,7 +383,7 @@ class TestCreateTable extends TestHoodieSqlBase {
val df = Seq((1, "a1", 10, 1000, partitionValue)).toDF("id", "name", "value", "ts", "dt")
// Write a table by spark dataframe.
df.write.format("hudi")
.option(HoodieWriteConfig.TBL_NAME.key, tableName)
.option(HoodieWriteConfig.TBL_NAME.key, s"original_$tableName")
.option(TABLE_TYPE.key, COW_TABLE_TYPE_OPT_VAL)
.option(RECORDKEY_FIELD.key, "id")
.option(PRECOMBINE_FIELD.key, "ts")
@@ -386,7 +402,7 @@ class TestCreateTable extends TestHoodieSqlBase {
|partitioned by (dt)
|location '$tablePath'
|""".stripMargin
) ("It is not allowed to specify partition columns when the table schema is not defined.")
)("It is not allowed to specify partition columns when the table schema is not defined.")
spark.sql(
s"""
@@ -405,6 +421,8 @@ class TestCreateTable extends TestHoodieSqlBase {
assertResult(true)(properties.contains(HoodieTableConfig.CREATE_SCHEMA.key))
assertResult("dt")(properties(HoodieTableConfig.PARTITION_FIELDS.key))
assertResult("ts")(properties(HoodieTableConfig.PRECOMBINE_FIELD.key))
assertResult("")(metaClient.getTableConfig.getDatabaseName)
assertResult(s"original_$tableName")(metaClient.getTableConfig.getTableName)
// Test insert into
spark.sql(s"insert into $tableName values(2, 'a2', 10, 1000, '$partitionValue')")
@@ -512,7 +530,7 @@ class TestCreateTable extends TestHoodieSqlBase {
}
test("Test Create Table From Existing Hoodie Table For None Partitioned Table") {
withTempDir{tmp =>
withTempDir { tmp =>
// Write a table by spark dataframe.
val tableName = generateTableName
import spark.implicits._