1
0

[HUDI-3204] Fixing partition-values being derived from partition-path instead of source columns (#5364)

- Scaffolded `Spark24HoodieParquetFileFormat` extending `ParquetFileFormat` and overriding the behavior of adding partition columns to every row
 - Amended `SparkAdapter`'s `createHoodieParquetFileFormat` API to be able to configure whether to append partition values or not
 - Fall back to appending partition values in cases when the source columns are not persisted in the data file
- Fixing `HoodieBaseRelation`'s incorrect handling of mandatory columns
This commit is contained in:
Alexey Kudinkin
2022-04-20 04:30:27 -07:00
committed by GitHub
parent 408663c42b
commit f7544e23ac
28 changed files with 1156 additions and 686 deletions

View File

@@ -18,12 +18,30 @@
package org.apache.spark.sql
import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedFunction}
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, SubqueryExpression}
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, SubqueryExpression, UnsafeProjection}
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LocalRelation, LogicalPlan}
import org.apache.spark.sql.types.StructType
trait HoodieCatalystExpressionUtils {
/**
 * Builds an [[UnsafeProjection]] that projects rows of the [[StructType]] `from` into
 * rows of the [[StructType]] `to`, matching fields by name.
 *
 * NOTE: No safety checks are executed to validate that this projection is actually feasible,
 * it's up to the caller to make sure that such projection is possible.
 *
 * NOTE: Projection of the row from [[StructType]] A to [[StructType]] B is only possible, if
 * B is a subset of A (every field name of `to` must resolve to an attribute of `from`)
 */
def generateUnsafeProjection(from: StructType, to: StructType): UnsafeProjection = {
  // Source schema rendered as Catalyst attributes, indexed by field name for lookup
  val sourceAttrs = from.toAttributes
  val attrsByName = sourceAttrs.map(a => a.name -> a).toMap
  // For every target field, pick the corresponding source attribute (throws if absent,
  // per the feasibility contract above), then codegen the projection
  val projectedExprs = to.fields.map(field => attrsByName(field.name))
  GenerateUnsafeProjection.generate(projectedExprs, sourceAttrs)
}
/**
* Parses and resolves expression against the attributes of the given table schema.
*

View File

@@ -177,7 +177,7 @@ trait SparkAdapter extends Serializable {
def createResolveHudiAlterTableCommand(sparkSession: SparkSession): Rule[LogicalPlan]
/**
* Create hoodie parquet file format.
* Create instance of [[ParquetFileFormat]]
*/
def createHoodieParquetFileFormat(): Option[ParquetFileFormat]
def createHoodieParquetFileFormat(appendPartitionValues: Boolean): Option[ParquetFileFormat]
}