[HUDI-4404] Fix insert into dynamic partition write misalignment (#6124)
This commit is contained in:
@@ -20,13 +20,14 @@ package org.apache.spark.sql.hudi.command
|
|||||||
import org.apache.hudi.HoodieSparkSqlWriter
|
import org.apache.hudi.HoodieSparkSqlWriter
|
||||||
import org.apache.spark.internal.Logging
|
import org.apache.spark.internal.Logging
|
||||||
import org.apache.spark.sql.catalyst.catalog.{CatalogTable, HoodieCatalogTable}
|
import org.apache.spark.sql.catalyst.catalog.{CatalogTable, HoodieCatalogTable}
|
||||||
import org.apache.spark.sql.catalyst.expressions.{Alias, Literal}
|
import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, Literal}
|
||||||
import org.apache.spark.sql.catalyst.plans.QueryPlan
|
import org.apache.spark.sql.catalyst.plans.QueryPlan
|
||||||
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project}
|
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project}
|
||||||
import org.apache.spark.sql.execution.datasources.LogicalRelation
|
import org.apache.spark.sql.execution.datasources.LogicalRelation
|
||||||
import org.apache.spark.sql.hudi.HoodieSqlCommonUtils._
|
import org.apache.spark.sql.hudi.HoodieSqlCommonUtils._
|
||||||
import org.apache.spark.sql.hudi.ProvidesHoodieConfig
|
import org.apache.spark.sql.hudi.ProvidesHoodieConfig
|
||||||
import org.apache.spark.sql.internal.SQLConf
|
import org.apache.spark.sql.internal.SQLConf
|
||||||
|
import org.apache.spark.sql.types.{StructField, StructType}
|
||||||
import org.apache.spark.sql.{Dataset, Row, SaveMode, SparkSession}
|
import org.apache.spark.sql.{Dataset, Row, SaveMode, SparkSession}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -120,50 +121,45 @@ object InsertIntoHoodieTableCommand extends Logging with ProvidesHoodieConfig {
|
|||||||
|
|
||||||
val staticPartitionValues = insertPartitions.filter(p => p._2.isDefined).mapValues(_.get)
|
val staticPartitionValues = insertPartitions.filter(p => p._2.isDefined).mapValues(_.get)
|
||||||
assert(staticPartitionValues.isEmpty ||
|
assert(staticPartitionValues.isEmpty ||
|
||||||
staticPartitionValues.size == targetPartitionSchema.size,
|
insertPartitions.size == targetPartitionSchema.size,
|
||||||
s"Required partition columns is: ${targetPartitionSchema.json}, Current static partitions " +
|
s"Required partition columns is: ${targetPartitionSchema.json}, Current input partitions " +
|
||||||
s"is: ${staticPartitionValues.mkString("," + "")}")
|
s"is: ${staticPartitionValues.mkString("," + "")}")
|
||||||
|
|
||||||
val queryOutputWithoutMetaFields = removeMetaFields(query.output)
|
val queryOutputWithoutMetaFields = removeMetaFields(query.output)
|
||||||
assert(staticPartitionValues.size + queryOutputWithoutMetaFields.size
|
assert(staticPartitionValues.size + queryOutputWithoutMetaFields.size
|
||||||
== hoodieCatalogTable.tableSchemaWithoutMetaFields.size,
|
== hoodieCatalogTable.tableSchemaWithoutMetaFields.size,
|
||||||
s"Required select columns count: ${hoodieCatalogTable.tableSchemaWithoutMetaFields.size}, " +
|
s"Required select columns count: ${hoodieCatalogTable.tableSchemaWithoutMetaFields.size}, " +
|
||||||
s"Current select columns(including static partition column) count: " +
|
s"Current select columns(including static partition column) count: " +
|
||||||
s"${staticPartitionValues.size + queryOutputWithoutMetaFields.size},columns: " +
|
s"${staticPartitionValues.size + queryOutputWithoutMetaFields.size},columns: " +
|
||||||
s"(${(queryOutputWithoutMetaFields.map(_.name) ++ staticPartitionValues.keys).mkString(",")})")
|
s"(${(queryOutputWithoutMetaFields.map(_.name) ++ staticPartitionValues.keys).mkString(",")})")
|
||||||
|
|
||||||
val queryDataFieldsWithoutMetaFields = if (staticPartitionValues.isEmpty) { // insert dynamic partition
|
val dataAndDynamicPartitionSchemaWithoutMetaFields = StructType(
|
||||||
queryOutputWithoutMetaFields.dropRight(targetPartitionSchema.fields.length)
|
hoodieCatalogTable.tableSchemaWithoutMetaFields.filterNot(f => staticPartitionValues.contains(f.name)))
|
||||||
} else { // insert static partition
|
val dataProjectsWithoutMetaFields = getTableFieldsAlias(queryOutputWithoutMetaFields,
|
||||||
queryOutputWithoutMetaFields
|
dataAndDynamicPartitionSchemaWithoutMetaFields.fields, conf)
|
||||||
}
|
|
||||||
// Align for the data fields of the query
|
|
||||||
val dataProjectsWithoutMetaFields = queryDataFieldsWithoutMetaFields.zip(
|
|
||||||
hoodieCatalogTable.dataSchemaWithoutMetaFields.fields).map { case (dataAttr, targetField) =>
|
|
||||||
val castAttr = castIfNeeded(dataAttr.withNullability(targetField.nullable),
|
|
||||||
targetField.dataType, conf)
|
|
||||||
Alias(castAttr, targetField.name)()
|
|
||||||
}
|
|
||||||
|
|
||||||
val partitionProjects = if (staticPartitionValues.isEmpty) { // insert dynamic partitions
|
val partitionProjects = targetPartitionSchema.fields.filter(f => staticPartitionValues.contains(f.name))
|
||||||
// The partition attributes is followed the data attributes in the query
|
.map(f => {
|
||||||
// So we init the partitionAttrPosition with the data schema size.
|
|
||||||
var partitionAttrPosition = hoodieCatalogTable.dataSchemaWithoutMetaFields.size
|
|
||||||
targetPartitionSchema.fields.map(f => {
|
|
||||||
val partitionAttr = queryOutputWithoutMetaFields(partitionAttrPosition)
|
|
||||||
partitionAttrPosition = partitionAttrPosition + 1
|
|
||||||
val castAttr = castIfNeeded(partitionAttr.withNullability(f.nullable), f.dataType, conf)
|
|
||||||
Alias(castAttr, f.name)()
|
|
||||||
})
|
|
||||||
} else { // insert static partitions
|
|
||||||
targetPartitionSchema.fields.map(f => {
|
|
||||||
val staticPartitionValue = staticPartitionValues.getOrElse(f.name,
|
val staticPartitionValue = staticPartitionValues.getOrElse(f.name,
|
||||||
s"Missing static partition value for: ${f.name}")
|
s"Missing static partition value for: ${f.name}")
|
||||||
val castAttr = castIfNeeded(Literal.create(staticPartitionValue), f.dataType, conf)
|
val castAttr = castIfNeeded(Literal.create(staticPartitionValue), f.dataType, conf)
|
||||||
Alias(castAttr, f.name)()
|
Alias(castAttr, f.name)()
|
||||||
})
|
})
|
||||||
|
|
||||||
|
Project(dataProjectsWithoutMetaFields ++ partitionProjects, query)
|
||||||
|
}
|
||||||
|
|
||||||
|
private def getTableFieldsAlias(
|
||||||
|
queryOutputWithoutMetaFields: Seq[Attribute],
|
||||||
|
schemaWithoutMetaFields: Seq[StructField],
|
||||||
|
conf: SQLConf): Seq[Alias] = {
|
||||||
|
queryOutputWithoutMetaFields.zip(schemaWithoutMetaFields).map { case (dataAttr, dataField) =>
|
||||||
|
val targetFieldOption = if (dataAttr.name.startsWith("col")) None else
|
||||||
|
schemaWithoutMetaFields.find(_.name.equals(dataAttr.name))
|
||||||
|
val targetField = if (targetFieldOption.isDefined) targetFieldOption.get else dataField
|
||||||
|
val castAttr = castIfNeeded(dataAttr.withNullability(targetField.nullable),
|
||||||
|
targetField.dataType, conf)
|
||||||
|
Alias(castAttr, targetField.name)()
|
||||||
}
|
}
|
||||||
val alignedProjects = dataProjectsWithoutMetaFields ++ partitionProjects
|
|
||||||
Project(alignedProjects, query)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -29,7 +29,7 @@ import java.io.File
|
|||||||
|
|
||||||
class TestInsertTable extends HoodieSparkSqlTestBase {
|
class TestInsertTable extends HoodieSparkSqlTestBase {
|
||||||
|
|
||||||
test("Test Insert Into") {
|
test("Test Insert Into with values") {
|
||||||
withTempDir { tmp =>
|
withTempDir { tmp =>
|
||||||
val tableName = generateTableName
|
val tableName = generateTableName
|
||||||
// Create a partitioned table
|
// Create a partitioned table
|
||||||
@@ -37,33 +37,173 @@ class TestInsertTable extends HoodieSparkSqlTestBase {
|
|||||||
s"""
|
s"""
|
||||||
|create table $tableName (
|
|create table $tableName (
|
||||||
| id int,
|
| id int,
|
||||||
|
| dt string,
|
||||||
| name string,
|
| name string,
|
||||||
| price double,
|
| price double,
|
||||||
| ts long,
|
| ts long
|
||||||
| dt string
|
|
||||||
|) using hudi
|
|) using hudi
|
||||||
| tblproperties (primaryKey = 'id')
|
| tblproperties (primaryKey = 'id')
|
||||||
| partitioned by (dt)
|
| partitioned by (dt)
|
||||||
| location '${tmp.getCanonicalPath}'
|
| location '${tmp.getCanonicalPath}'
|
||||||
""".stripMargin)
|
""".stripMargin)
|
||||||
// Insert into dynamic partition
|
|
||||||
|
// Note: Do not write the field alias, the partition field must be placed last.
|
||||||
spark.sql(
|
spark.sql(
|
||||||
s"""
|
s"""
|
||||||
| insert into $tableName
|
| insert into $tableName values
|
||||||
| select 1 as id, 'a1' as name, 10 as price, 1000 as ts, '2021-01-05' as dt
|
| (1, 'a1', 10, 1000, "2021-01-05"),
|
||||||
""".stripMargin)
|
| (2, 'a2', 20, 2000, "2021-01-06"),
|
||||||
|
| (3, 'a3', 30, 3000, "2021-01-07")
|
||||||
|
""".stripMargin)
|
||||||
|
|
||||||
checkAnswer(s"select id, name, price, ts, dt from $tableName")(
|
checkAnswer(s"select id, name, price, ts, dt from $tableName")(
|
||||||
Seq(1, "a1", 10.0, 1000, "2021-01-05")
|
Seq(1, "a1", 10.0, 1000, "2021-01-05"),
|
||||||
|
Seq(2, "a2", 20.0, 2000, "2021-01-06"),
|
||||||
|
Seq(3, "a3", 30.0, 3000, "2021-01-07")
|
||||||
)
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
test("Test Insert Into with static partition") {
|
||||||
|
withTempDir { tmp =>
|
||||||
|
val tableName = generateTableName
|
||||||
|
// Create a partitioned table
|
||||||
|
spark.sql(
|
||||||
|
s"""
|
||||||
|
|create table $tableName (
|
||||||
|
| id int,
|
||||||
|
| dt string,
|
||||||
|
| name string,
|
||||||
|
| price double,
|
||||||
|
| ts long
|
||||||
|
|) using hudi
|
||||||
|
| tblproperties (primaryKey = 'id')
|
||||||
|
| partitioned by (dt)
|
||||||
|
| location '${tmp.getCanonicalPath}'
|
||||||
|
""".stripMargin)
|
||||||
// Insert into static partition
|
// Insert into static partition
|
||||||
spark.sql(
|
spark.sql(
|
||||||
s"""
|
s"""
|
||||||
| insert into $tableName partition(dt = '2021-01-05')
|
| insert into $tableName partition(dt = '2021-01-05')
|
||||||
| select 2 as id, 'a2' as name, 10 as price, 1000 as ts
|
| select 1 as id, 'a1' as name, 10 as price, 1000 as ts
|
||||||
|
""".stripMargin)
|
||||||
|
|
||||||
|
spark.sql(
|
||||||
|
s"""
|
||||||
|
| insert into $tableName partition(dt = '2021-01-06')
|
||||||
|
| select 20 as price, 2000 as ts, 2 as id, 'a2' as name
|
||||||
|
""".stripMargin)
|
||||||
|
|
||||||
|
// Note: Do not write the field alias, the partition field must be placed last.
|
||||||
|
spark.sql(
|
||||||
|
s"""
|
||||||
|
| insert into $tableName
|
||||||
|
| select 3, 'a3', 30, 3000, '2021-01-07'
|
||||||
""".stripMargin)
|
""".stripMargin)
|
||||||
|
|
||||||
checkAnswer(s"select id, name, price, ts, dt from $tableName")(
|
checkAnswer(s"select id, name, price, ts, dt from $tableName")(
|
||||||
Seq(1, "a1", 10.0, 1000, "2021-01-05"),
|
Seq(1, "a1", 10.0, 1000, "2021-01-05"),
|
||||||
Seq(2, "a2", 10.0, 1000, "2021-01-05")
|
Seq(2, "a2", 20.0, 2000, "2021-01-06"),
|
||||||
|
Seq(3, "a3", 30.0, 3000, "2021-01-07")
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
test("Test Insert Into with dynamic partition") {
|
||||||
|
withTempDir { tmp =>
|
||||||
|
val tableName = generateTableName
|
||||||
|
// Create a partitioned table
|
||||||
|
spark.sql(
|
||||||
|
s"""
|
||||||
|
|create table $tableName (
|
||||||
|
| id int,
|
||||||
|
| dt string,
|
||||||
|
| name string,
|
||||||
|
| price double,
|
||||||
|
| ts long
|
||||||
|
|) using hudi
|
||||||
|
| tblproperties (primaryKey = 'id')
|
||||||
|
| partitioned by (dt)
|
||||||
|
| location '${tmp.getCanonicalPath}'
|
||||||
|
""".stripMargin)
|
||||||
|
|
||||||
|
// Insert into dynamic partition
|
||||||
|
spark.sql(
|
||||||
|
s"""
|
||||||
|
| insert into $tableName partition(dt)
|
||||||
|
| select 1 as id, '2021-01-05' as dt, 'a1' as name, 10 as price, 1000 as ts
|
||||||
|
""".stripMargin)
|
||||||
|
|
||||||
|
spark.sql(
|
||||||
|
s"""
|
||||||
|
| insert into $tableName
|
||||||
|
| select 2 as id, 'a2' as name, 20 as price, 2000 as ts, '2021-01-06' as dt
|
||||||
|
""".stripMargin)
|
||||||
|
|
||||||
|
// Note: Do not write the field alias, the partition field must be placed last.
|
||||||
|
spark.sql(
|
||||||
|
s"""
|
||||||
|
| insert into $tableName
|
||||||
|
| select 3, 'a3', 30, 3000, '2021-01-07'
|
||||||
|
""".stripMargin)
|
||||||
|
|
||||||
|
checkAnswer(s"select id, name, price, ts, dt from $tableName")(
|
||||||
|
Seq(1, "a1", 10.0, 1000, "2021-01-05"),
|
||||||
|
Seq(2, "a2", 20.0, 2000, "2021-01-06"),
|
||||||
|
Seq(3, "a3", 30.0, 3000, "2021-01-07")
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
test("Test Insert Into with multi partition") {
|
||||||
|
withTempDir { tmp =>
|
||||||
|
val tableName = generateTableName
|
||||||
|
// Create a partitioned table
|
||||||
|
spark.sql(
|
||||||
|
s"""
|
||||||
|
|create table $tableName (
|
||||||
|
| id int,
|
||||||
|
| dt string,
|
||||||
|
| name string,
|
||||||
|
| price double,
|
||||||
|
| ht string,
|
||||||
|
| ts long
|
||||||
|
|) using hudi
|
||||||
|
| tblproperties (primaryKey = 'id')
|
||||||
|
| partitioned by (dt, ht)
|
||||||
|
| location '${tmp.getCanonicalPath}'
|
||||||
|
""".stripMargin)
|
||||||
|
spark.sql(
|
||||||
|
s"""
|
||||||
|
| insert into $tableName partition(dt, ht)
|
||||||
|
| select 1 as id, 'a1' as name, 10 as price,'20210101' as dt, 1000 as ts, '01' as ht
|
||||||
|
""".stripMargin)
|
||||||
|
|
||||||
|
// Insert into static partition and dynamic partition
|
||||||
|
spark.sql(
|
||||||
|
s"""
|
||||||
|
| insert into $tableName partition(dt = '20210102', ht)
|
||||||
|
| select 2 as id, 'a2' as name, 20 as price, 2000 as ts, '02' as ht
|
||||||
|
""".stripMargin)
|
||||||
|
|
||||||
|
spark.sql(
|
||||||
|
s"""
|
||||||
|
| insert into $tableName partition(dt, ht = '03')
|
||||||
|
| select 3 as id, 'a3' as name, 30 as price, 3000 as ts, '20210103' as dt
|
||||||
|
""".stripMargin)
|
||||||
|
|
||||||
|
// Note: Do not write the field alias, the partition field must be placed last.
|
||||||
|
spark.sql(
|
||||||
|
s"""
|
||||||
|
| insert into $tableName
|
||||||
|
| select 4, 'a4', 40, 4000, '20210104', '04'
|
||||||
|
""".stripMargin)
|
||||||
|
|
||||||
|
checkAnswer(s"select id, name, price, ts, dt, ht from $tableName")(
|
||||||
|
Seq(1, "a1", 10.0, 1000, "20210101", "01"),
|
||||||
|
Seq(2, "a2", 20.0, 2000, "20210102", "02"),
|
||||||
|
Seq(3, "a3", 30.0, 3000, "20210103", "03"),
|
||||||
|
Seq(4, "a4", 40.0, 4000, "20210104", "04")
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user