1
0

[HUDI-3337] Fixing Parquet Column Range metadata extraction (#4705)

- The Parquet Column Range metadata extraction utility was simplistically assuming that Decimal types are only represented by INT32, while their representation varies depending on precision.

- More details can be found here:
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#DECIMAL
This commit is contained in:
Alexey Kudinkin
2022-02-02 17:58:05 -08:00
committed by GitHub
parent 819e8018ff
commit d681824982
15 changed files with 323 additions and 111 deletions

View File

@@ -29,7 +29,6 @@ import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.exception.HoodieException;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.parquet.io.api.Binary;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
@@ -62,6 +61,7 @@ import scala.collection.JavaConversions;
import javax.annotation.Nonnull;
import java.io.IOException;
import java.math.BigDecimal;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
@@ -422,9 +422,8 @@ public class ColumnStatsIndexHelper {
);
} else if (colType instanceof StringType) {
return Pair.of(
new String(((Binary) colMetadata.getMinValue()).getBytes()),
new String(((Binary) colMetadata.getMaxValue()).getBytes())
);
colMetadata.getMinValue().toString(),
colMetadata.getMaxValue().toString());
} else if (colType instanceof DecimalType) {
return Pair.of(
new BigDecimal(colMetadata.getMinValue().toString()),
@@ -447,8 +446,8 @@ public class ColumnStatsIndexHelper {
new Float(colMetadata.getMaxValue().toString()));
} else if (colType instanceof BinaryType) {
return Pair.of(
((Binary) colMetadata.getMinValue()).getBytes(),
((Binary) colMetadata.getMaxValue()).getBytes());
((ByteBuffer) colMetadata.getMinValue()).array(),
((ByteBuffer) colMetadata.getMaxValue()).array());
} else if (colType instanceof BooleanType) {
return Pair.of(
Boolean.valueOf(colMetadata.getMinValue().toString()),