1
0

[HUDI-3513] Make sure Column Stats indexing does not fail when the previous Index Table state cannot be loaded (#5015)

This commit is contained in:
Alexey Kudinkin
2022-03-11 14:39:22 -08:00
committed by GitHub
parent 56cb49485d
commit 5d59bf67ae

View File

@@ -24,6 +24,7 @@ import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieColumnRangeMetadata; import org.apache.hudi.common.model.HoodieColumnRangeMetadata;
import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.util.BaseFileUtils; import org.apache.hudi.common.util.BaseFileUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ParquetUtils; import org.apache.hudi.common.util.ParquetUtils;
import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieException;
@@ -303,6 +304,14 @@ public class ColumnStatsIndexHelper {
// with the most recent one that were successfully persisted previously // with the most recent one that were successfully persisted previously
if (validIndexTables.isEmpty()) { if (validIndexTables.isEmpty()) {
finalColStatsIndexDf = newColStatsIndexDf; finalColStatsIndexDf = newColStatsIndexDf;
} else {
Path latestIndexTablePath = new Path(indexFolderPath, validIndexTables.get(validIndexTables.size() - 1));
Option<Dataset<Row>> existingIndexTableOpt =
tryLoadExistingIndexTable(sparkSession, latestIndexTablePath);
if (!existingIndexTableOpt.isPresent()) {
finalColStatsIndexDf = newColStatsIndexDf;
} else { } else {
// NOTE: That Parquet schema might deviate from the original table schema (for ex, // NOTE: That Parquet schema might deviate from the original table schema (for ex,
// by upcasting "short" to "integer" types, etc), and hence we need to re-adjust it // by upcasting "short" to "integer" types, etc), and hence we need to re-adjust it
@@ -312,14 +321,13 @@ public class ColumnStatsIndexHelper {
sparkSession, sparkSession,
newColStatsIndexDf, newColStatsIndexDf,
// Load current most recent col-stats-index table // Load current most recent col-stats-index table
sparkSession.read().load( existingIndexTableOpt.get()
new Path(indexFolderPath, validIndexTables.get(validIndexTables.size() - 1)).toString()
)
); );
// Clean up all index tables (after creation of the new index) // Clean up all index tables (after creation of the new index)
tablesToCleanup.addAll(validIndexTables); tablesToCleanup.addAll(validIndexTables);
} }
}
// Persist new col-stats-index table // Persist new col-stats-index table
finalColStatsIndexDf finalColStatsIndexDf
@@ -349,6 +357,17 @@ public class ColumnStatsIndexHelper {
} }
} }
/**
 * Best-effort read of an already persisted Column Stats index table.
 *
 * <p>The table is loaded through {@code sparkSession.read().load(...)} using the URI string
 * of the supplied path. Any {@link Exception} raised while doing so is logged at error level
 * and converted into an empty {@link Option}, so the caller can proceed without the
 * previous index state instead of failing.
 *
 * @param sparkSession   session used to read the index table
 * @param indexTablePath location of the index table to load
 * @return the loaded dataset wrapped in an {@link Option}, or {@code Option.empty()} if loading failed
 */
@Nonnull
private static Option<Dataset<Row>> tryLoadExistingIndexTable(@Nonnull SparkSession sparkSession, @Nonnull Path indexTablePath) {
  Option<Dataset<Row>> loadedTableOpt;
  try {
    // Path conversion is kept inside the try so that any failure here is handled the same way as a failed read
    String tableLocation = indexTablePath.toUri().toString();
    loadedTableOpt = Option.of(sparkSession.read().load(tableLocation));
  } catch (Exception loadFailure) {
    LOG.error(String.format("Failed to load existing Column Stats index table from (%s)", indexTablePath), loadFailure);
    loadedTableOpt = Option.empty();
  }
  return loadedTableOpt;
}
@Nonnull @Nonnull
private static Dataset<Row> tryMergeMostRecentIndexTableInto( private static Dataset<Row> tryMergeMostRecentIndexTableInto(
@Nonnull SparkSession sparkSession, @Nonnull SparkSession sparkSession,