1
0

[HUDI-3664] Fixing Column Stats Index composition (#5181)

Co-authored-by: Sagar Sumit <sagarsumit09@gmail.com>
This commit is contained in:
Alexey Kudinkin
2022-04-02 17:15:52 -07:00
committed by GitHub
parent 74eb09be9b
commit cc3737be50
52 changed files with 1776 additions and 749 deletions

View File

@@ -51,6 +51,7 @@ import org.apache.hudi.exception.HoodieValidationException;
import org.apache.hudi.io.storage.HoodieFileReader;
import org.apache.hudi.io.storage.HoodieFileReaderFactory;
import org.apache.hudi.metadata.HoodieTableMetadata;
import org.apache.hudi.metadata.HoodieTableMetadataUtil;
import org.apache.hudi.utilities.util.BloomFilterData;
import com.beust.jcommander.JCommander;
@@ -660,6 +661,7 @@ public class HoodieMetadataTableValidator implements Serializable {
}).collect(Collectors.toList());
}
@SuppressWarnings("rawtypes")
private void validateAllColumnStats(
HoodieMetadataValidationContext metadataTableBasedContext,
HoodieMetadataValidationContext fsBasedContext,
@@ -667,9 +669,9 @@ public class HoodieMetadataTableValidator implements Serializable {
Set<String> baseDataFilesForCleaning) {
List<String> latestBaseFilenameList = getLatestBaseFileNames(fsBasedContext, partitionPath, baseDataFilesForCleaning);
List<HoodieColumnRangeMetadata<String>> metadataBasedColStats = metadataTableBasedContext
List<HoodieColumnRangeMetadata<Comparable>> metadataBasedColStats = metadataTableBasedContext
.getSortedColumnStatsList(partitionPath, latestBaseFilenameList);
List<HoodieColumnRangeMetadata<String>> fsBasedColStats = fsBasedContext
List<HoodieColumnRangeMetadata<Comparable>> fsBasedColStats = fsBasedContext
.getSortedColumnStatsList(partitionPath, latestBaseFilenameList);
validate(metadataBasedColStats, fsBasedColStats, partitionPath, "column stats");
@@ -777,10 +779,10 @@ public class HoodieMetadataTableValidator implements Serializable {
}
/**
 * Comparator that orders {@link HoodieColumnRangeMetadata} entries by comparing
 * their {@code toString()} representations with {@link String#compareTo(String)}.
 *
 * <p>The resulting order therefore depends entirely on what {@code toString()}
 * produces for each entry, not on the min/max values' own natural ordering.
 */
public static class HoodieColumnRangeMetadataComparator
implements Comparator<HoodieColumnRangeMetadata<String>>, Serializable {
implements Comparator<HoodieColumnRangeMetadata<Comparable>>, Serializable {
@Override
public int compare(HoodieColumnRangeMetadata<String> o1, HoodieColumnRangeMetadata<String> o2) {
public int compare(HoodieColumnRangeMetadata<Comparable> o1, HoodieColumnRangeMetadata<Comparable> o2) {
// Lexicographic comparison of the two entries' string forms.
return o1.toString().compareTo(o2.toString());
}
}
@@ -837,7 +839,8 @@ public class HoodieMetadataTableValidator implements Serializable {
.sorted(new HoodieFileGroupComparator()).collect(Collectors.toList());
}
public List<HoodieColumnRangeMetadata<String>> getSortedColumnStatsList(
@SuppressWarnings({"rawtypes", "unchecked"})
public List<HoodieColumnRangeMetadata<Comparable>> getSortedColumnStatsList(
String partitionPath, List<String> baseFileNameList) {
LOG.info("All column names for getting column stats: " + allColumnNameList);
if (enableMetadataTable) {
@@ -846,15 +849,7 @@ public class HoodieMetadataTableValidator implements Serializable {
return allColumnNameList.stream()
.flatMap(columnName ->
tableMetadata.getColumnStats(partitionFileNameList, columnName).values().stream()
.map(stats -> HoodieColumnRangeMetadata.create(
stats.getFileName(),
columnName,
stats.getMinValue(),
stats.getMaxValue(),
stats.getNullCount(),
stats.getValueCount(),
stats.getTotalSize(),
stats.getTotalUncompressedSize()))
.map(HoodieTableMetadataUtil::convertColumnStatsRecordToColumnRangeMetadata)
.collect(Collectors.toList())
.stream())
.sorted(new HoodieColumnRangeMetadataComparator())
@@ -865,18 +860,6 @@ public class HoodieMetadataTableValidator implements Serializable {
metaClient.getHadoopConf(),
new Path(new Path(metaClient.getBasePath(), partitionPath), filename),
allColumnNameList).stream())
.map(rangeMetadata -> HoodieColumnRangeMetadata.create(
rangeMetadata.getFilePath(),
rangeMetadata.getColumnName(),
// Note: here we ignore the type in the validation,
// since column stats from metadata table store the min/max values as String
rangeMetadata.getMinValue().toString(),
rangeMetadata.getMaxValue().toString(),
rangeMetadata.getNullCount(),
rangeMetadata.getValueCount(),
rangeMetadata.getTotalSize(),
rangeMetadata.getTotalUncompressedSize()
))
.sorted(new HoodieColumnRangeMetadataComparator())
.collect(Collectors.toList());
}