1
0

[HUDI-3664] Fixing Column Stats Index composition (#5181)

Co-authored-by: Sagar Sumit <sagarsumit09@gmail.com>
This commit is contained in:
Alexey Kudinkin
2022-04-02 17:15:52 -07:00
committed by GitHub
parent 74eb09be9b
commit cc3737be50
52 changed files with 1776 additions and 749 deletions

View File

@@ -122,17 +122,195 @@
"doc": "Minimum value in the range. Based on user data table schema, we can convert this to appropriate type",
"name": "minValue",
"type": [
// These types should be aligned with Parquet's `Statistics` implementation,
// making sure that we implement consistent semantics across file formats
//
// NOTE: Other logical types (decimal, date, timestamp, etc) will be converted
// into one of the following types, making sure that their corresponding
// ordering is preserved
"null",
"string"
]
{
"namespace": "org.apache.hudi.avro.model",
"type": "record",
"name": "BooleanWrapper",
"doc": "A record wrapping boolean type to be able to be used it w/in Avro's Union",
"fields": [
{
"type": "boolean",
"name": "value"
}
]
},
{
"namespace": "org.apache.hudi.avro.model",
"type": "record",
"name": "IntWrapper",
"doc": "A record wrapping int type to be able to be used it w/in Avro's Union",
"fields": [
{
"type": "int",
"name": "value"
}
]
},
{
"namespace": "org.apache.hudi.avro.model",
"type": "record",
"name": "LongWrapper",
"doc": "A record wrapping long type to be able to be used it w/in Avro's Union",
"fields": [
{
"type": "long",
"name": "value"
}
]
},
{
"namespace": "org.apache.hudi.avro.model",
"type": "record",
"name": "FloatWrapper",
"doc": "A record wrapping float type to be able to be used it w/in Avro's Union",
"fields": [
{
"type": "float",
"name": "value"
}
]
},
{
"namespace": "org.apache.hudi.avro.model",
"type": "record",
"name": "DoubleWrapper",
"doc": "A record wrapping double type to be able to be used it w/in Avro's Union",
"fields": [
{
"type": "double",
"name": "value"
}
]
},
{
"namespace": "org.apache.hudi.avro.model",
"type": "record",
"name": "BytesWrapper",
"doc": "A record wrapping bytes type to be able to be used it w/in Avro's Union",
"fields": [
{
"type": "bytes",
"name": "value"
}
]
},
{
"namespace": "org.apache.hudi.avro.model",
"type": "record",
"name": "StringWrapper",
"doc": "A record wrapping string type to be able to be used it w/in Avro's Union",
"fields": [
{
"type": "string",
"name": "value"
}
]
},
{
"namespace": "org.apache.hudi.avro.model",
"type": "record",
"name": "DateWrapper",
"doc": "A record wrapping Date logical type to be able to be used it w/in Avro's Union",
"fields": [
{
"type": {
"type": "int"
// NOTE: Due to breaking changes in code-gen between Avro 1.8.2 and 1.10, we can't
// rely on logical types to do proper encoding of the native Java types,
// and therefore have to encode statistics manually
//"logicalType": "date"
},
"name": "value"
}
]
},
{
"namespace": "org.apache.hudi.avro.model",
"type": "record",
"name": "DecimalWrapper",
"doc": "A record wrapping Decimal logical type to be able to be used it w/in Avro's Union",
"fields": [
{
"type": {
"type": "bytes",
"logicalType": "decimal",
// NOTE: This is equivalent to Spark's [[DoubleDecimal]] and should
// be enough for almost any possible use case
"precision": 30,
"scale": 15
},
"name": "value"
}
]
},
{
"namespace": "org.apache.hudi.avro.model",
"type": "record",
"name": "TimeMicrosWrapper",
"doc": "A record wrapping Time-micros logical type to be able to be used it w/in Avro's Union",
"fields": [
{
"type": {
"type": "long",
"logicalType": "time-micros"
},
"name": "value"
}
]
},
{
"namespace": "org.apache.hudi.avro.model",
"type": "record",
"name": "TimestampMicrosWrapper",
"doc": "A record wrapping Timestamp-micros logical type to be able to be used it w/in Avro's Union",
"fields": [
{
"type": {
"type": "long"
// NOTE: Due to breaking changes in code-gen between Avro 1.8.2 and 1.10, we can't
// rely on logical types to do proper encoding of the native Java types,
// and therefore have to encode statistics manually
//"logicalType": "timestamp-micros"
},
"name": "value"
}
]
}
],
"default": null
},
{
"doc": "Maximum value in the range. Based on user data table schema, we can convert it to appropriate type",
"name": "maxValue",
"type": [
// These types should be aligned with Parquet's `Statistics` implementation,
// making sure that we implement consistent semantics across file formats
//
// NOTE: Other logical types (decimal, date, timestamp, etc) will be converted
// into one of the following types, making sure that their corresponding
// ordering is preserved
"null",
"string"
]
"org.apache.hudi.avro.model.BooleanWrapper",
"org.apache.hudi.avro.model.IntWrapper",
"org.apache.hudi.avro.model.LongWrapper",
"org.apache.hudi.avro.model.FloatWrapper",
"org.apache.hudi.avro.model.DoubleWrapper",
"org.apache.hudi.avro.model.BytesWrapper",
"org.apache.hudi.avro.model.StringWrapper",
"org.apache.hudi.avro.model.DateWrapper",
"org.apache.hudi.avro.model.DecimalWrapper",
"org.apache.hudi.avro.model.TimeMicrosWrapper",
"org.apache.hudi.avro.model.TimestampMicrosWrapper"
],
"default": null
},
{
"doc": "Total count of values",