1
0

[HUDI-4404] Fix insert into dynamic partition write misalignment (#6124)

This commit is contained in:
ForwardXu
2022-07-22 09:40:52 +08:00
committed by GitHub
parent 8e0b47e360
commit 51b5783161
2 changed files with 177 additions and 41 deletions

View File

@@ -20,13 +20,14 @@ package org.apache.spark.sql.hudi.command
import org.apache.hudi.HoodieSparkSqlWriter import org.apache.hudi.HoodieSparkSqlWriter
import org.apache.spark.internal.Logging import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.catalog.{CatalogTable, HoodieCatalogTable} import org.apache.spark.sql.catalyst.catalog.{CatalogTable, HoodieCatalogTable}
import org.apache.spark.sql.catalyst.expressions.{Alias, Literal} import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, Literal}
import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.QueryPlan
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project}
import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.execution.datasources.LogicalRelation
import org.apache.spark.sql.hudi.HoodieSqlCommonUtils._ import org.apache.spark.sql.hudi.HoodieSqlCommonUtils._
import org.apache.spark.sql.hudi.ProvidesHoodieConfig import org.apache.spark.sql.hudi.ProvidesHoodieConfig
import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.{StructField, StructType}
import org.apache.spark.sql.{Dataset, Row, SaveMode, SparkSession} import org.apache.spark.sql.{Dataset, Row, SaveMode, SparkSession}
/** /**
@@ -120,50 +121,45 @@ object InsertIntoHoodieTableCommand extends Logging with ProvidesHoodieConfig {
val staticPartitionValues = insertPartitions.filter(p => p._2.isDefined).mapValues(_.get) val staticPartitionValues = insertPartitions.filter(p => p._2.isDefined).mapValues(_.get)
assert(staticPartitionValues.isEmpty || assert(staticPartitionValues.isEmpty ||
staticPartitionValues.size == targetPartitionSchema.size, insertPartitions.size == targetPartitionSchema.size,
s"Required partition columns is: ${targetPartitionSchema.json}, Current static partitions " + s"Required partition columns is: ${targetPartitionSchema.json}, Current input partitions " +
s"is: ${staticPartitionValues.mkString("," + "")}") s"is: ${staticPartitionValues.mkString("," + "")}")
val queryOutputWithoutMetaFields = removeMetaFields(query.output) val queryOutputWithoutMetaFields = removeMetaFields(query.output)
assert(staticPartitionValues.size + queryOutputWithoutMetaFields.size assert(staticPartitionValues.size + queryOutputWithoutMetaFields.size
== hoodieCatalogTable.tableSchemaWithoutMetaFields.size, == hoodieCatalogTable.tableSchemaWithoutMetaFields.size,
s"Required select columns count: ${hoodieCatalogTable.tableSchemaWithoutMetaFields.size}, " + s"Required select columns count: ${hoodieCatalogTable.tableSchemaWithoutMetaFields.size}, " +
s"Current select columns(including static partition column) count: " + s"Current select columns(including static partition column) count: " +
s"${staticPartitionValues.size + queryOutputWithoutMetaFields.size}columns: " + s"${staticPartitionValues.size + queryOutputWithoutMetaFields.size}columns: " +
s"(${(queryOutputWithoutMetaFields.map(_.name) ++ staticPartitionValues.keys).mkString(",")})") s"(${(queryOutputWithoutMetaFields.map(_.name) ++ staticPartitionValues.keys).mkString(",")})")
val queryDataFieldsWithoutMetaFields = if (staticPartitionValues.isEmpty) { // insert dynamic partition val dataAndDynamicPartitionSchemaWithoutMetaFields = StructType(
queryOutputWithoutMetaFields.dropRight(targetPartitionSchema.fields.length) hoodieCatalogTable.tableSchemaWithoutMetaFields.filterNot(f => staticPartitionValues.contains(f.name)))
} else { // insert static partition val dataProjectsWithoutMetaFields = getTableFieldsAlias(queryOutputWithoutMetaFields,
queryOutputWithoutMetaFields dataAndDynamicPartitionSchemaWithoutMetaFields.fields, conf)
}
// Align for the data fields of the query
val dataProjectsWithoutMetaFields = queryDataFieldsWithoutMetaFields.zip(
hoodieCatalogTable.dataSchemaWithoutMetaFields.fields).map { case (dataAttr, targetField) =>
val castAttr = castIfNeeded(dataAttr.withNullability(targetField.nullable),
targetField.dataType, conf)
Alias(castAttr, targetField.name)()
}
val partitionProjects = if (staticPartitionValues.isEmpty) { // insert dynamic partitions val partitionProjects = targetPartitionSchema.fields.filter(f => staticPartitionValues.contains(f.name))
// The partition attributes is followed the data attributes in the query .map(f => {
// So we init the partitionAttrPosition with the data schema size.
var partitionAttrPosition = hoodieCatalogTable.dataSchemaWithoutMetaFields.size
targetPartitionSchema.fields.map(f => {
val partitionAttr = queryOutputWithoutMetaFields(partitionAttrPosition)
partitionAttrPosition = partitionAttrPosition + 1
val castAttr = castIfNeeded(partitionAttr.withNullability(f.nullable), f.dataType, conf)
Alias(castAttr, f.name)()
})
} else { // insert static partitions
targetPartitionSchema.fields.map(f => {
val staticPartitionValue = staticPartitionValues.getOrElse(f.name, val staticPartitionValue = staticPartitionValues.getOrElse(f.name,
s"Missing static partition value for: ${f.name}") s"Missing static partition value for: ${f.name}")
val castAttr = castIfNeeded(Literal.create(staticPartitionValue), f.dataType, conf) val castAttr = castIfNeeded(Literal.create(staticPartitionValue), f.dataType, conf)
Alias(castAttr, f.name)() Alias(castAttr, f.name)()
}) })
Project(dataProjectsWithoutMetaFields ++ partitionProjects, query)
}
/**
 * Builds the projection that aligns the query output with the target table fields.
 *
 * Query attributes are paired with `schemaWithoutMetaFields` by position. An attribute whose
 * name equals the name of a field in the schema is bound to that field instead, so an aliased
 * select list may name the columns in any order. Attributes with a generated name of the form
 * `col1`, `col2`, ... (the names Spark gives to the columns of a VALUES list) and attributes
 * whose name matches no field keep the field they were paired with by position.
 *
 * @param queryOutputWithoutMetaFields output attributes of the query
 * @param schemaWithoutMetaFields      target fields the output has to be aligned with
 * @param conf                         passed through to `castIfNeeded`
 * @return one alias per zipped (attribute, field) pair, cast to the type of the chosen target
 *         field and named after it
 */
private def getTableFieldsAlias(
    queryOutputWithoutMetaFields: Seq[Attribute],
    schemaWithoutMetaFields: Seq[StructField],
    conf: SQLConf): Seq[Alias] = {
  // Only names of the exact form col<N> count as generated. A plain "col" prefix test would
  // also catch real columns such as "color" and wrongly bind them by position.
  def hasGeneratedName(attr: Attribute): Boolean = attr.name.matches("col\\d+")

  queryOutputWithoutMetaFields.zip(schemaWithoutMetaFields).map { case (dataAttr, positionalField) =>
    // NOTE(review): the name lookup is case-sensitive regardless of spark.sql.caseSensitive;
    // confirm that this is intended.
    val targetField =
      if (hasGeneratedName(dataAttr)) positionalField
      else schemaWithoutMetaFields.find(_.name == dataAttr.name).getOrElse(positionalField)
    val castAttr = castIfNeeded(dataAttr.withNullability(targetField.nullable),
      targetField.dataType, conf)
    Alias(castAttr, targetField.name)()
  }
}
val alignedProjects = dataProjectsWithoutMetaFields ++ partitionProjects
Project(alignedProjects, query)
} }
} }

View File

@@ -29,7 +29,7 @@ import java.io.File
class TestInsertTable extends HoodieSparkSqlTestBase { class TestInsertTable extends HoodieSparkSqlTestBase {
test("Test Insert Into") { test("Test Insert Into with values") {
withTempDir { tmp => withTempDir { tmp =>
val tableName = generateTableName val tableName = generateTableName
// Create a partitioned table // Create a partitioned table
@@ -37,33 +37,173 @@ class TestInsertTable extends HoodieSparkSqlTestBase {
s""" s"""
|create table $tableName ( |create table $tableName (
| id int, | id int,
| dt string,
| name string, | name string,
| price double, | price double,
| ts long, | ts long
| dt string
|) using hudi |) using hudi
| tblproperties (primaryKey = 'id') | tblproperties (primaryKey = 'id')
| partitioned by (dt) | partitioned by (dt)
| location '${tmp.getCanonicalPath}' | location '${tmp.getCanonicalPath}'
""".stripMargin) """.stripMargin)
// Insert into dynamic partition
// Note: Do not write the field alias, the partition field must be placed last.
spark.sql( spark.sql(
s""" s"""
| insert into $tableName | insert into $tableName values
| select 1 as id, 'a1' as name, 10 as price, 1000 as ts, '2021-01-05' as dt | (1, 'a1', 10, 1000, "2021-01-05"),
""".stripMargin) | (2, 'a2', 20, 2000, "2021-01-06"),
| (3, 'a3', 30, 3000, "2021-01-07")
""".stripMargin)
checkAnswer(s"select id, name, price, ts, dt from $tableName")( checkAnswer(s"select id, name, price, ts, dt from $tableName")(
Seq(1, "a1", 10.0, 1000, "2021-01-05") Seq(1, "a1", 10.0, 1000, "2021-01-05"),
Seq(2, "a2", 20.0, 2000, "2021-01-06"),
Seq(3, "a3", 30.0, 3000, "2021-01-07")
) )
}
}
test("Test Insert Into with static partition") {
withTempDir { tmp =>
val tableName = generateTableName
// Create a partitioned table
spark.sql(
s"""
|create table $tableName (
| id int,
| dt string,
| name string,
| price double,
| ts long
|) using hudi
| tblproperties (primaryKey = 'id')
| partitioned by (dt)
| location '${tmp.getCanonicalPath}'
""".stripMargin)
// Insert into static partition // Insert into static partition
spark.sql( spark.sql(
s""" s"""
| insert into $tableName partition(dt = '2021-01-05') | insert into $tableName partition(dt = '2021-01-05')
| select 2 as id, 'a2' as name, 10 as price, 1000 as ts | select 1 as id, 'a1' as name, 10 as price, 1000 as ts
""".stripMargin)
spark.sql(
s"""
| insert into $tableName partition(dt = '2021-01-06')
| select 20 as price, 2000 as ts, 2 as id, 'a2' as name
""".stripMargin)
// Note: without field aliases the columns are aligned by position, so the partition field must be placed last.
spark.sql(
s"""
| insert into $tableName
| select 3, 'a3', 30, 3000, '2021-01-07'
""".stripMargin) """.stripMargin)
checkAnswer(s"select id, name, price, ts, dt from $tableName")( checkAnswer(s"select id, name, price, ts, dt from $tableName")(
Seq(1, "a1", 10.0, 1000, "2021-01-05"), Seq(1, "a1", 10.0, 1000, "2021-01-05"),
Seq(2, "a2", 10.0, 1000, "2021-01-05") Seq(2, "a2", 20.0, 2000, "2021-01-06"),
Seq(3, "a3", 30.0, 3000, "2021-01-07")
)
}
}
// Checks that rows written with a dynamic partition value come back with every value under
// the intended column, for three select-list shapes: aliased with the partition column in the
// middle, aliased with the partition column last, and without any aliases.
test("Test Insert Into with dynamic partition") {
withTempDir { tmp =>
val tableName = generateTableName
// Create a partitioned table; dt is declared as the second column and is the partition column
spark.sql(
s"""
|create table $tableName (
| id int,
| dt string,
| name string,
| price double,
| ts long
|) using hudi
| tblproperties (primaryKey = 'id')
| partitioned by (dt)
| location '${tmp.getCanonicalPath}'
""".stripMargin)
// Insert into dynamic partition
// Case 1: explicit partition(dt) clause, aliased select list with dt in second position
spark.sql(
s"""
| insert into $tableName partition(dt)
| select 1 as id, '2021-01-05' as dt, 'a1' as name, 10 as price, 1000 as ts
""".stripMargin)
// Case 2: no partition clause, aliased select list with dt last
spark.sql(
s"""
| insert into $tableName
| select 2 as id, 'a2' as name, 20 as price, 2000 as ts, '2021-01-06' as dt
""".stripMargin)
// Case 3: no field aliases. Note: here the partition field must be placed last.
spark.sql(
s"""
| insert into $tableName
| select 3, 'a3', 30, 3000, '2021-01-07'
""".stripMargin)
// All three rows must read back with id, name, price, ts and dt holding the inserted values
checkAnswer(s"select id, name, price, ts, dt from $tableName")(
Seq(1, "a1", 10.0, 1000, "2021-01-05"),
Seq(2, "a2", 20.0, 2000, "2021-01-06"),
Seq(3, "a3", 30.0, 3000, "2021-01-07")
)
}
}
test("Test Insert Into with multi partition") {
withTempDir { tmp =>
val tableName = generateTableName
// Create a partitioned table
spark.sql(
s"""
|create table $tableName (
| id int,
| dt string,
| name string,
| price double,
| ht string,
| ts long
|) using hudi
| tblproperties (primaryKey = 'id')
| partitioned by (dt, ht)
| location '${tmp.getCanonicalPath}'
""".stripMargin)
spark.sql(
s"""
| insert into $tableName partition(dt, ht)
| select 1 as id, 'a1' as name, 10 as price,'20210101' as dt, 1000 as ts, '01' as ht
""".stripMargin)
// Insert into static partition and dynamic partition
spark.sql(
s"""
| insert into $tableName partition(dt = '20210102', ht)
| select 2 as id, 'a2' as name, 20 as price, 2000 as ts, '02' as ht
""".stripMargin)
spark.sql(
s"""
| insert into $tableName partition(dt, ht = '03')
| select 3 as id, 'a3' as name, 30 as price, 3000 as ts, '20210103' as dt
""".stripMargin)
// Note: without field aliases the columns are aligned by position, so the partition fields must be placed last.
spark.sql(
s"""
| insert into $tableName
| select 4, 'a4', 40, 4000, '20210104', '04'
""".stripMargin)
checkAnswer(s"select id, name, price, ts, dt, ht from $tableName")(
Seq(1, "a1", 10.0, 1000, "20210101", "01"),
Seq(2, "a2", 20.0, 2000, "20210102", "02"),
Seq(3, "a3", 30.0, 3000, "20210103", "03"),
Seq(4, "a4", 40.0, 4000, "20210104", "04")
) )
} }
} }