1
0
Files
hudi/hudi-common/src/main/avro/HoodieMetadata.avsc
2022-04-02 17:15:52 -07:00

359 lines
18 KiB
JSON

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
{
"namespace": "org.apache.hudi.avro.model",
"type": "record",
"name": "HoodieMetadataRecord",
"doc": "A record saved within the Metadata Table",
"fields": [
{
// Non-nullable string with no default, so every record has to provide a value.
"name": "key",
"type": "string"
},
{
// Stored as a plain int rather than an Avro enum; the meaning of each code is not
// declared anywhere in this schema.
"name": "type",
"doc": "Type of the metadata record",
"type": "int"
},
{
// Optional payload: either null or a map whose values are HoodieMetadataFileInfo records.
// NOTE(review): the map keys are presumably file names; confirm against the writer.
"doc": "Contains information about partitions and files within the dataset",
"name": "filesystemMetadata",
"type": [
"null",
{
"type": "map",
"values": {
"type": "record",
"name": "HoodieMetadataFileInfo",
"fields": [
{
"name": "size",
"type": "long",
"doc": "Size of the file"
},
{
"name": "isDeleted",
"type": "boolean",
"doc": "True if this file has been deleted"
}
]
}
}
],
// "null" is the first branch of the union, so null is a valid default. Declaring it
// keeps this field consistent with BloomFilterMetadata and ColumnStatsMetadata and lets
// a reader using this schema resolve data that was written without the field.
"default" : null
},
{
// Optional payload: a union of null and the HoodieMetadataBloomFilter record,
// defaulting to null.
"doc": "Metadata Index of bloom filters for all data files in the user table",
"name": "BloomFilterMetadata",
"type": [
"null",
{
// None of the fields below is nullable or has a default, so a non-null
// HoodieMetadataBloomFilter always carries all four values.
"doc": "Data file bloom filter details",
"name": "HoodieMetadataBloomFilter",
"type": "record",
"fields": [
{
"doc": "Bloom filter type code",
"name": "type",
"type": "string"
},
{
"doc": "Instant timestamp when this metadata was created/updated",
"name": "timestamp",
"type": "string"
},
{
"doc": "Bloom filter binary byte array",
"name": "bloomFilter",
"type": "bytes"
},
{
"doc": "Bloom filter entry valid/deleted flag",
"name": "isDeleted",
"type": "boolean"
}
]
}
],
"default" : null
},
{
"doc": "Metadata Index of column statistics for all data files in the user table",
"name": "ColumnStatsMetadata",
"type": [
"null",
{
"doc": "Data file column statistics",
"name": "HoodieMetadataColumnStats",
"type": "record",
"fields": [
{
"doc": "File name for which this column statistics applies",
"name": "fileName",
"type": [
"null",
"string"
],
// Nullable field: declare the null default explicitly, as columnName, minValue and
// maxValue in this record already do. "null" is the first union branch, so the
// default is valid.
"default" : null
},
{
// Nullable string that defaults to null.
"doc": "Column name for which this column statistics applies",
"name": "columnName",
"type": [
"null",
"string"
],
"default" : null
},
{
"doc": "Minimum value in the range. Based on user data table schema, we can convert this to appropriate type",
"name": "minValue",
"type": [
// These types should stay aligned with Parquet's `Statistics` implementation,
// so that the semantics are consistent across file formats
//
// NOTE: Other logical types (decimal, date, timestamp, etc) will be converted
// into one of the following types, making sure that their corresponding
// ordering is preserved
//
// NOTE: The wrapper records are defined inline in this union; the maxValue field
// refers to them by their full names, so the definitions have to stay here,
// ahead of that field
"null",
{
"namespace": "org.apache.hudi.avro.model",
"type": "record",
"name": "BooleanWrapper",
"doc": "A record wrapping boolean type to be able to be used it w/in Avro's Union",
"fields": [
{
"type": "boolean",
"name": "value"
}
]
},
{
"namespace": "org.apache.hudi.avro.model",
"type": "record",
"name": "IntWrapper",
"doc": "A record wrapping int type to be able to be used it w/in Avro's Union",
"fields": [
{
"type": "int",
"name": "value"
}
]
},
{
"namespace": "org.apache.hudi.avro.model",
"type": "record",
"name": "LongWrapper",
"doc": "A record wrapping long type to be able to be used it w/in Avro's Union",
"fields": [
{
"type": "long",
"name": "value"
}
]
},
{
"namespace": "org.apache.hudi.avro.model",
"type": "record",
"name": "FloatWrapper",
"doc": "A record wrapping float type to be able to be used it w/in Avro's Union",
"fields": [
{
"type": "float",
"name": "value"
}
]
},
{
"namespace": "org.apache.hudi.avro.model",
"type": "record",
"name": "DoubleWrapper",
"doc": "A record wrapping double type to be able to be used it w/in Avro's Union",
"fields": [
{
"type": "double",
"name": "value"
}
]
},
{
"namespace": "org.apache.hudi.avro.model",
"type": "record",
"name": "BytesWrapper",
"doc": "A record wrapping bytes type to be able to be used it w/in Avro's Union",
"fields": [
{
"type": "bytes",
"name": "value"
}
]
},
{
"namespace": "org.apache.hudi.avro.model",
"type": "record",
"name": "StringWrapper",
"doc": "A record wrapping string type to be able to be used it w/in Avro's Union",
"fields": [
{
"type": "string",
"name": "value"
}
]
},
{
"namespace": "org.apache.hudi.avro.model",
"type": "record",
"name": "DateWrapper",
"doc": "A record wrapping Date logical type to be able to be used it w/in Avro's Union",
"fields": [
{
"type": {
"type": "int"
// NOTE: Due to breaking changes in code-gen between Avro 1.8.2 and 1.10, we can't
// rely on logical types to properly encode the native Java types, and
// therefore have to encode the statistic manually. The value is declared
// as a plain int and the logical type below is intentionally disabled.
//"logicalType": "date"
},
"name": "value"
}
]
},
{
"namespace": "org.apache.hudi.avro.model",
"type": "record",
"name": "DecimalWrapper",
"doc": "A record wrapping Decimal logical type to be able to be used it w/in Avro's Union",
"fields": [
{
"type": {
"type": "bytes",
"logicalType": "decimal",
// NOTE: This is equivalent to Spark's [[DoubleDecimal]] and should
// be enough for almost any possible use case
"precision": 30,
"scale": 15
},
"name": "value"
}
]
},
{
"namespace": "org.apache.hudi.avro.model",
"type": "record",
"name": "TimeMicrosWrapper",
"doc": "A record wrapping Time-micros logical type to be able to be used it w/in Avro's Union",
"fields": [
{
"type": {
"type": "long",
"logicalType": "time-micros"
},
"name": "value"
}
]
},
{
"namespace": "org.apache.hudi.avro.model",
"type": "record",
"name": "TimestampMicrosWrapper",
"doc": "A record wrapping Timestamp-micros logical type to be able to be used it w/in Avro's Union",
"fields": [
{
"type": {
"type": "long"
// NOTE: Due to breaking changes in code-gen between Avro 1.8.2 and 1.10, we can't
// rely on logical types to properly encode the native Java types, and
// therefore have to encode the statistic manually. The value is declared
// as a plain long and the logical type below is intentionally disabled.
//"logicalType": "timestamp-micros"
},
"name": "value"
}
]
}
],
"default": null
},
{
"doc": "Maximum value in the range. Based on user data table schema, we can convert it to appropriate type",
"name": "maxValue",
"type": [
// These types should stay aligned with Parquet's `Statistics` implementation,
// so that the semantics are consistent across file formats
//
// NOTE: Other logical types (decimal, date, timestamp, etc) will be converted
// into one of the following types, making sure that their corresponding
// ordering is preserved
//
// NOTE: Every branch after "null" is a reference, by full name, to a wrapper record
// that is defined inline in the minValue field's union; the branches are listed
// in the same order as there
"null",
"org.apache.hudi.avro.model.BooleanWrapper",
"org.apache.hudi.avro.model.IntWrapper",
"org.apache.hudi.avro.model.LongWrapper",
"org.apache.hudi.avro.model.FloatWrapper",
"org.apache.hudi.avro.model.DoubleWrapper",
"org.apache.hudi.avro.model.BytesWrapper",
"org.apache.hudi.avro.model.StringWrapper",
"org.apache.hudi.avro.model.DateWrapper",
"org.apache.hudi.avro.model.DecimalWrapper",
"org.apache.hudi.avro.model.TimeMicrosWrapper",
"org.apache.hudi.avro.model.TimestampMicrosWrapper"
],
"default": null
},
{
"doc": "Total count of values",
"name": "valueCount",
"type": [
"null",
"long"
],
// Nullable field: declare the null default explicitly, as columnName, minValue and
// maxValue in this record already do. "null" is the first union branch, so the
// default is valid.
"default": null
},
{
"doc": "Total count of null values",
"name": "nullCount",
"type": [
"null",
"long"
],
// Nullable field: declare the null default explicitly, as columnName, minValue and
// maxValue in this record already do. "null" is the first union branch, so the
// default is valid.
"default": null
},
{
"doc": "Total storage size on disk",
"name": "totalSize",
"type": [
"null",
"long"
],
// Nullable field: declare the null default explicitly, as columnName, minValue and
// maxValue in this record already do. "null" is the first union branch, so the
// default is valid.
"default": null
},
{
"doc": "Total uncompressed storage size on disk",
"name": "totalUncompressedSize",
"type": [
"null",
"long"
],
// Nullable field: declare the null default explicitly, as columnName, minValue and
// maxValue in this record already do. "null" is the first union branch, so the
// default is valid.
"default": null
},
{
// Plain (non-nullable) boolean with no default: it has to be set on every
// HoodieMetadataColumnStats record.
"doc": "Column range entry valid/deleted flag",
"name": "isDeleted",
"type": "boolean"
}
]
}
],
"default" : null
}
]
}