[HUDI-3664] Fixing Column Stats Index composition (#5181)
Co-authored-by: Sagar Sumit <sagarsumit09@gmail.com>
This commit is contained in:
@@ -51,6 +51,7 @@ import org.apache.hudi.exception.HoodieValidationException;
|
||||
import org.apache.hudi.io.storage.HoodieFileReader;
|
||||
import org.apache.hudi.io.storage.HoodieFileReaderFactory;
|
||||
import org.apache.hudi.metadata.HoodieTableMetadata;
|
||||
import org.apache.hudi.metadata.HoodieTableMetadataUtil;
|
||||
import org.apache.hudi.utilities.util.BloomFilterData;
|
||||
|
||||
import com.beust.jcommander.JCommander;
|
||||
@@ -660,6 +661,7 @@ public class HoodieMetadataTableValidator implements Serializable {
|
||||
}).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
@SuppressWarnings("rawtypes")
|
||||
private void validateAllColumnStats(
|
||||
HoodieMetadataValidationContext metadataTableBasedContext,
|
||||
HoodieMetadataValidationContext fsBasedContext,
|
||||
@@ -667,9 +669,9 @@ public class HoodieMetadataTableValidator implements Serializable {
|
||||
Set<String> baseDataFilesForCleaning) {
|
||||
|
||||
List<String> latestBaseFilenameList = getLatestBaseFileNames(fsBasedContext, partitionPath, baseDataFilesForCleaning);
|
||||
List<HoodieColumnRangeMetadata<String>> metadataBasedColStats = metadataTableBasedContext
|
||||
List<HoodieColumnRangeMetadata<Comparable>> metadataBasedColStats = metadataTableBasedContext
|
||||
.getSortedColumnStatsList(partitionPath, latestBaseFilenameList);
|
||||
List<HoodieColumnRangeMetadata<String>> fsBasedColStats = fsBasedContext
|
||||
List<HoodieColumnRangeMetadata<Comparable>> fsBasedColStats = fsBasedContext
|
||||
.getSortedColumnStatsList(partitionPath, latestBaseFilenameList);
|
||||
|
||||
validate(metadataBasedColStats, fsBasedColStats, partitionPath, "column stats");
|
||||
@@ -777,10 +779,10 @@ public class HoodieMetadataTableValidator implements Serializable {
|
||||
}
|
||||
|
||||
public static class HoodieColumnRangeMetadataComparator
|
||||
implements Comparator<HoodieColumnRangeMetadata<String>>, Serializable {
|
||||
implements Comparator<HoodieColumnRangeMetadata<Comparable>>, Serializable {
|
||||
|
||||
@Override
|
||||
public int compare(HoodieColumnRangeMetadata<String> o1, HoodieColumnRangeMetadata<String> o2) {
|
||||
public int compare(HoodieColumnRangeMetadata<Comparable> o1, HoodieColumnRangeMetadata<Comparable> o2) {
|
||||
return o1.toString().compareTo(o2.toString());
|
||||
}
|
||||
}
|
||||
@@ -837,7 +839,8 @@ public class HoodieMetadataTableValidator implements Serializable {
|
||||
.sorted(new HoodieFileGroupComparator()).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
public List<HoodieColumnRangeMetadata<String>> getSortedColumnStatsList(
|
||||
@SuppressWarnings({"rawtypes", "unchecked"})
|
||||
public List<HoodieColumnRangeMetadata<Comparable>> getSortedColumnStatsList(
|
||||
String partitionPath, List<String> baseFileNameList) {
|
||||
LOG.info("All column names for getting column stats: " + allColumnNameList);
|
||||
if (enableMetadataTable) {
|
||||
@@ -846,15 +849,7 @@ public class HoodieMetadataTableValidator implements Serializable {
|
||||
return allColumnNameList.stream()
|
||||
.flatMap(columnName ->
|
||||
tableMetadata.getColumnStats(partitionFileNameList, columnName).values().stream()
|
||||
.map(stats -> HoodieColumnRangeMetadata.create(
|
||||
stats.getFileName(),
|
||||
columnName,
|
||||
stats.getMinValue(),
|
||||
stats.getMaxValue(),
|
||||
stats.getNullCount(),
|
||||
stats.getValueCount(),
|
||||
stats.getTotalSize(),
|
||||
stats.getTotalUncompressedSize()))
|
||||
.map(HoodieTableMetadataUtil::convertColumnStatsRecordToColumnRangeMetadata)
|
||||
.collect(Collectors.toList())
|
||||
.stream())
|
||||
.sorted(new HoodieColumnRangeMetadataComparator())
|
||||
@@ -865,18 +860,6 @@ public class HoodieMetadataTableValidator implements Serializable {
|
||||
metaClient.getHadoopConf(),
|
||||
new Path(new Path(metaClient.getBasePath(), partitionPath), filename),
|
||||
allColumnNameList).stream())
|
||||
.map(rangeMetadata -> HoodieColumnRangeMetadata.create(
|
||||
rangeMetadata.getFilePath(),
|
||||
rangeMetadata.getColumnName(),
|
||||
// Note: here we ignore the type in the validation,
|
||||
// since column stats from metadata table store the min/max values as String
|
||||
rangeMetadata.getMinValue().toString(),
|
||||
rangeMetadata.getMaxValue().toString(),
|
||||
rangeMetadata.getNullCount(),
|
||||
rangeMetadata.getValueCount(),
|
||||
rangeMetadata.getTotalSize(),
|
||||
rangeMetadata.getTotalUncompressedSize()
|
||||
))
|
||||
.sorted(new HoodieColumnRangeMetadataComparator())
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user