1
0

[HUDI-3664] Fixing Column Stats Index composition (#5181)

Co-authored-by: Sagar Sumit <sagarsumit09@gmail.com>
This commit is contained in:
Alexey Kudinkin
2022-04-02 17:15:52 -07:00
committed by GitHub
parent 74eb09be9b
commit cc3737be50
52 changed files with 1776 additions and 749 deletions

View File

@@ -17,7 +17,7 @@
package org.apache.spark.sql
import org.apache.spark.HoodieSparkTypeUtils.isCastPreservingOrdering
import HoodieSparkTypeUtils.isCastPreservingOrdering
import org.apache.spark.sql.catalyst.expressions.{Add, AttributeReference, BitwiseOr, Cast, DateAdd, DateDiff, DateFormatClass, DateSub, Divide, Exp, Expm1, Expression, FromUTCTimestamp, FromUnixTime, Log, Log10, Log1p, Log2, Lower, Multiply, ParseToDate, ParseToTimestamp, ShiftLeft, ShiftRight, ToUTCTimestamp, ToUnixTimestamp, Upper}
object HoodieSpark2CatalystExpressionUtils extends HoodieCatalystExpressionUtils {

View File

@@ -178,6 +178,11 @@ class AvroSerializer(rootCatalystType: DataType, rootAvroType: Schema, nullable:
val numFields = st.length
(getter, ordinal) => structConverter(getter.getStruct(ordinal, numFields))
case (st: StructType, UNION) =>
val unionConverter = newUnionConverter(st, avroType)
val numFields = st.length
(getter, ordinal) => unionConverter(getter.getStruct(ordinal, numFields))
case (MapType(kt, vt, valueContainsNull), MAP) if kt == StringType =>
val valueConverter = newConverter(
vt, resolveNullableType(avroType.getValueType, valueContainsNull))
@@ -205,8 +210,7 @@ class AvroSerializer(rootCatalystType: DataType, rootAvroType: Schema, nullable:
}
}
private def newStructConverter(
catalystStruct: StructType, avroStruct: Schema): InternalRow => Record = {
private def newStructConverter(catalystStruct: StructType, avroStruct: Schema): InternalRow => Record = {
if (avroStruct.getType != RECORD || avroStruct.getFields.size() != catalystStruct.length) {
throw new IncompatibleSchemaException(s"Cannot convert Catalyst type $catalystStruct to " +
s"Avro type $avroStruct.")
@@ -229,14 +233,58 @@ class AvroSerializer(rootCatalystType: DataType, rootAvroType: Schema, nullable:
result
}
private def newUnionConverter(catalystStruct: StructType, avroUnion: Schema): InternalRow => Any = {
if (avroUnion.getType != UNION || !canMapUnion(catalystStruct, avroUnion)) {
throw new IncompatibleSchemaException(s"Cannot convert Catalyst type $catalystStruct to " +
s"Avro type $avroUnion.")
}
val nullable = avroUnion.getTypes.size() > 0 && avroUnion.getTypes.get(0).getType == Type.NULL
val avroInnerTypes = if (nullable) {
avroUnion.getTypes.asScala.tail
} else {
avroUnion.getTypes.asScala
}
val fieldConverters = catalystStruct.zip(avroInnerTypes).map {
case (f1, f2) => newConverter(f1.dataType, f2)
}
val numFields = catalystStruct.length
(row: InternalRow) =>
var i = 0
var result: Any = null
while (i < numFields) {
if (!row.isNullAt(i)) {
if (result != null) {
throw new IncompatibleSchemaException(s"Cannot convert Catalyst record $catalystStruct to " +
s"Avro union $avroUnion. Record has more than one optional values set")
}
result = fieldConverters(i).apply(row, i)
}
i += 1
}
if (!nullable && result == null) {
throw new IncompatibleSchemaException(s"Cannot convert Catalyst record $catalystStruct to " +
s"Avro union $avroUnion. Record has no values set, while should have exactly one")
}
result
}
private def canMapUnion(catalystStruct: StructType, avroStruct: Schema): Boolean = {
(avroStruct.getTypes.size() > 0 &&
avroStruct.getTypes.get(0).getType == Type.NULL &&
avroStruct.getTypes.size() - 1 == catalystStruct.length) || avroStruct.getTypes.size() == catalystStruct.length
}
private def resolveNullableType(avroType: Schema, nullable: Boolean): Schema = {
if (nullable && avroType.getType != NULL) {
// avro uses union to represent nullable type.
// Avro uses union to represent nullable type.
val fields = avroType.getTypes.asScala
assert(fields.length == 2)
val actualType = fields.filter(_.getType != Type.NULL)
assert(actualType.length == 1)
actualType.head
if (fields.length == 2 && actualType.length == 1) {
actualType.head
} else {
// This is just a normal union, not used to designate nullability
avroType
}
} else {
avroType
}