[HUDI-3337] Fixing Parquet Column Range metadata extraction (#4705)
- Parquet Column Range metadata extraction utility was simplistically assuming that Decimal types are only represented by INT32, while they representation varies depending on precision. - More details could be found here: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#DECIMAL
This commit is contained in:
@@ -18,8 +18,6 @@
|
||||
|
||||
package org.apache.hudi.common.model;
|
||||
|
||||
import org.apache.parquet.schema.PrimitiveStringifier;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
/**
|
||||
@@ -31,15 +29,13 @@ public class HoodieColumnRangeMetadata<T> {
|
||||
private final T minValue;
|
||||
private final T maxValue;
|
||||
private final long numNulls;
|
||||
private final PrimitiveStringifier stringifier;
|
||||
|
||||
public HoodieColumnRangeMetadata(final String filePath, final String columnName, final T minValue, final T maxValue, final long numNulls, final PrimitiveStringifier stringifier) {
|
||||
public HoodieColumnRangeMetadata(final String filePath, final String columnName, final T minValue, final T maxValue, final long numNulls) {
|
||||
this.filePath = filePath;
|
||||
this.columnName = columnName;
|
||||
this.minValue = minValue;
|
||||
this.maxValue = maxValue;
|
||||
this.numNulls = numNulls;
|
||||
this.stringifier = stringifier;
|
||||
}
|
||||
|
||||
public String getFilePath() {
|
||||
@@ -58,10 +54,6 @@ public class HoodieColumnRangeMetadata<T> {
|
||||
return this.maxValue;
|
||||
}
|
||||
|
||||
public PrimitiveStringifier getStringifier() {
|
||||
return stringifier;
|
||||
}
|
||||
|
||||
public long getNumNulls() {
|
||||
return numNulls;
|
||||
}
|
||||
|
||||
@@ -37,6 +37,7 @@ import org.apache.parquet.hadoop.ParquetFileReader;
|
||||
import org.apache.parquet.hadoop.ParquetReader;
|
||||
import org.apache.parquet.hadoop.metadata.BlockMetaData;
|
||||
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
|
||||
import org.apache.parquet.io.api.Binary;
|
||||
import org.apache.parquet.schema.DecimalMetadata;
|
||||
import org.apache.parquet.schema.MessageType;
|
||||
import org.apache.parquet.schema.OriginalType;
|
||||
@@ -45,6 +46,7 @@ import org.apache.parquet.schema.PrimitiveType;
|
||||
import javax.annotation.Nonnull;
|
||||
import java.io.IOException;
|
||||
import java.math.BigDecimal;
|
||||
import java.math.BigInteger;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
@@ -308,9 +310,8 @@ public class ParquetUtils extends BaseFileUtils {
|
||||
convertToNativeJavaType(
|
||||
columnChunkMetaData.getPrimitiveType(),
|
||||
columnChunkMetaData.getStatistics().genericGetMax()),
|
||||
columnChunkMetaData.getStatistics().getNumNulls(),
|
||||
columnChunkMetaData.getPrimitiveType().stringifier()))
|
||||
).collect(Collectors.groupingBy(HoodieColumnRangeMetadata::getColumnName));
|
||||
columnChunkMetaData.getStatistics().getNumNulls())))
|
||||
.collect(Collectors.groupingBy(HoodieColumnRangeMetadata::getColumnName));
|
||||
|
||||
// Combine those into file-level statistics
|
||||
// NOTE: Inlining this var makes javac (1.8) upset (due to its inability to infer
|
||||
@@ -360,24 +361,56 @@ public class ParquetUtils extends BaseFileUtils {
|
||||
|
||||
return new HoodieColumnRangeMetadata<T>(
|
||||
one.getFilePath(),
|
||||
one.getColumnName(), minValue, maxValue, one.getNumNulls() + another.getNumNulls(), one.getStringifier());
|
||||
one.getColumnName(), minValue, maxValue, one.getNumNulls() + another.getNumNulls());
|
||||
}
|
||||
|
||||
private static Comparable<?> convertToNativeJavaType(PrimitiveType primitiveType, Comparable val) {
|
||||
if (primitiveType.getOriginalType() == OriginalType.DECIMAL) {
|
||||
DecimalMetadata decimalMetadata = primitiveType.getDecimalMetadata();
|
||||
return BigDecimal.valueOf((Integer) val, decimalMetadata.getScale());
|
||||
return extractDecimal(val, primitiveType.getDecimalMetadata());
|
||||
} else if (primitiveType.getOriginalType() == OriginalType.DATE) {
|
||||
// NOTE: This is a workaround to address race-condition in using
|
||||
// {@code SimpleDataFormat} concurrently (w/in {@code DateStringifier})
|
||||
// TODO cleanup after Parquet upgrade to 1.12
|
||||
synchronized (primitiveType.stringifier()) {
|
||||
// Date logical type is implemented as a signed INT32
|
||||
// REF: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md
|
||||
return java.sql.Date.valueOf(
|
||||
primitiveType.stringifier().stringify((Integer) val)
|
||||
);
|
||||
}
|
||||
} else if (primitiveType.getOriginalType() == OriginalType.UTF8) {
|
||||
// NOTE: UTF8 type designates a byte array that should be interpreted as a
|
||||
// UTF-8 encoded character string
|
||||
// REF: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md
|
||||
return ((Binary) val).toStringUsingUTF8();
|
||||
} else if (primitiveType.getPrimitiveTypeName() == PrimitiveType.PrimitiveTypeName.BINARY) {
|
||||
// NOTE: `getBytes` access makes a copy of the underlying byte buffer
|
||||
return ((Binary) val).toByteBuffer();
|
||||
}
|
||||
|
||||
return val;
|
||||
}
|
||||
|
||||
@Nonnull
|
||||
private static BigDecimal extractDecimal(Object val, DecimalMetadata decimalMetadata) {
|
||||
// In Parquet, Decimal could be represented as either of
|
||||
// 1. INT32 (for 1 <= precision <= 9)
|
||||
// 2. INT64 (for 1 <= precision <= 18)
|
||||
// 3. FIXED_LEN_BYTE_ARRAY (precision is limited by the array size. Length n can store <= floor(log_10(2^(8*n - 1) - 1)) base-10 digits)
|
||||
// 4. BINARY (precision is not limited)
|
||||
// REF: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#DECIMAL
|
||||
int scale = decimalMetadata.getScale();
|
||||
if (val == null) {
|
||||
return null;
|
||||
} else if (val instanceof Integer) {
|
||||
return BigDecimal.valueOf((Integer) val, scale);
|
||||
} else if (val instanceof Long) {
|
||||
return BigDecimal.valueOf((Long) val, scale);
|
||||
} else if (val instanceof Binary) {
|
||||
// NOTE: Unscaled number is stored in BE format (most significant byte is 0th)
|
||||
return new BigDecimal(new BigInteger(((Binary)val).getBytesUnsafe()), scale);
|
||||
} else {
|
||||
throw new UnsupportedOperationException(String.format("Unsupported value type (%s)", val.getClass().getName()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user