1
0

[HUDI-3514] Rebase Data Skipping flow to rely on MT Column Stats index (#4948)

This commit is contained in:
Alexey Kudinkin
2022-03-15 10:38:36 -07:00
committed by GitHub
parent 9bdda2a312
commit 5e8ff8d793
19 changed files with 359 additions and 224 deletions

View File

@@ -109,6 +109,14 @@
"string"
]
},
{
"doc": "Column name for which this column statistics applies",
"name": "columnName",
"type": [
"null",
"string"
]
},
{
"doc": "Minimum value in the range. Based on user data table schema, we can convert this to appropriate type",
"name": "minValue",

View File

@@ -83,6 +83,7 @@ public class HoodieTableMetaClient implements Serializable {
public static final String AUXILIARYFOLDER_NAME = METAFOLDER_NAME + Path.SEPARATOR + ".aux";
public static final String BOOTSTRAP_INDEX_ROOT_FOLDER_PATH = AUXILIARYFOLDER_NAME + Path.SEPARATOR + ".bootstrap";
public static final String HEARTBEAT_FOLDER_NAME = METAFOLDER_NAME + Path.SEPARATOR + ".heartbeat";
public static final String METADATA_TABLE_FOLDER_PATH = METAFOLDER_NAME + Path.SEPARATOR + "metadata";
public static final String COLUMN_STATISTICS_INDEX_NAME = ".colstatsindex";
public static final String BOOTSTRAP_INDEX_BY_PARTITION_FOLDER_PATH = BOOTSTRAP_INDEX_ROOT_FOLDER_PATH
+ Path.SEPARATOR + ".partitions";

View File

@@ -58,6 +58,8 @@ import java.io.IOException;
/**
* Helper class to read schema from data files and log files and to convert it between different formats.
*
* TODO(HUDI-3626) cleanup
*/
public class TableSchemaResolver {
@@ -143,7 +145,7 @@ public class TableSchemaResolver {
* @throws Exception
*/
public Schema getTableAvroSchema() throws Exception {
return getTableAvroSchema(true);
return getTableAvroSchema(metaClient.getTableConfig().populateMetaFields());
}
/**
@@ -197,7 +199,10 @@ public class TableSchemaResolver {
*
* @return Avro user data schema
* @throws Exception
*
* @deprecated use {@link #getTableAvroSchema(boolean)} instead
*/
@Deprecated
public Schema getTableAvroSchemaWithoutMetadataFields() throws Exception {
return getTableAvroSchema(false);
}
@@ -208,7 +213,9 @@ public class TableSchemaResolver {
* @param instant will get the instant data schema
* @return Avro user data schema
* @throws Exception
* @deprecated use {@link #getTableSchemaFromCommitMetadata} instead
*/
@Deprecated
public Schema getTableAvroSchemaWithoutMetadataFields(HoodieInstant instant) throws Exception {
Option<Schema> schemaFromCommitMetadata = getTableSchemaFromCommitMetadata(instant, false);
if (schemaFromCommitMetadata.isPresent()) {

View File

@@ -50,13 +50,13 @@ public class TablePathUtils {
FileStatus fileStatus = fs.getFileStatus(path);
Path directory = fileStatus.isFile() ? fileStatus.getPath().getParent() : fileStatus.getPath();
if (TablePathUtils.hasTableMetadataFolder(fs, directory)) {
if (hasTableMetadataFolder(fs, directory)) {
// Handle table folder itself
return Option.of(directory);
}
// Handle metadata folder or metadata sub folder path
Option<Path> tablePath = getTablePathFromTableMetadataPath(directory);
Option<Path> tablePath = getTablePathFromMetaFolderPath(directory);
if (tablePath.isPresent()) {
return tablePath;
}
@@ -65,20 +65,20 @@ public class TablePathUtils {
return getTablePathFromPartitionPath(fs, directory);
}
private static boolean isTableMetadataFolder(String path) {
return path != null && path.endsWith("/" + HoodieTableMetaClient.METAFOLDER_NAME);
private static boolean isInsideTableMetaFolder(String path) {
return path != null && path.contains("/" + HoodieTableMetaClient.METAFOLDER_NAME);
}
private static boolean isInsideTableMetadataFolder(String path) {
return path != null && path.contains("/" + HoodieTableMetaClient.METAFOLDER_NAME + "/");
private static boolean isInsideMetadataTableInMetaFolder(String path) {
return path != null && path.contains("/" + HoodieTableMetaClient.METADATA_TABLE_FOLDER_PATH);
}
private static Option<Path> getTablePathFromTableMetadataPath(Path path) {
private static Option<Path> getTablePathFromMetaFolderPath(Path path) {
String pathStr = path.toString();
if (isTableMetadataFolder(pathStr)) {
return Option.of(path.getParent());
} else if (isInsideTableMetadataFolder(pathStr)) {
// NOTE: Since the Metadata Table itself resides within the Meta-folder, we need to make sure
// that we don't misinterpret an attempt to read the MT table itself
if (isInsideTableMetaFolder(pathStr) && !isInsideMetadataTableInMetaFolder(pathStr)) {
int index = pathStr.indexOf("/" + HoodieTableMetaClient.METAFOLDER_NAME);
return Option.of(new Path(pathStr.substring(0, index)));
}
@@ -92,12 +92,21 @@ public class TablePathUtils {
HoodiePartitionMetadata metadata = new HoodiePartitionMetadata(fs, partitionPath);
metadata.readFromFS();
return Option.of(getNthParent(partitionPath, metadata.getPartitionDepth()));
} else {
// Simply traverse the directory structure upwards until the .hoodie folder is found
Path current = partitionPath;
while (current != null) {
if (hasTableMetadataFolder(fs, current)) {
return Option.of(current);
}
current = current.getParent();
}
return Option.empty();
}
} catch (IOException ioe) {
throw new HoodieException("Error reading partition metadata for " + partitionPath, ioe);
}
return Option.empty();
}
private static Path getNthParent(Path path, int n) {

View File

@@ -108,14 +108,15 @@ public class HoodieMetadataPayload implements HoodieRecordPayload<HoodieMetadata
private static final String BLOOM_FILTER_FIELD_IS_DELETED = FIELD_IS_DELETED;
// HoodieMetadata column stats payload field ids
private static final String COLUMN_STATS_FIELD_MIN_VALUE = "minValue";
private static final String COLUMN_STATS_FIELD_MAX_VALUE = "maxValue";
private static final String COLUMN_STATS_FIELD_NULL_COUNT = "nullCount";
private static final String COLUMN_STATS_FIELD_VALUE_COUNT = "valueCount";
private static final String COLUMN_STATS_FIELD_TOTAL_SIZE = "totalSize";
private static final String COLUMN_STATS_FIELD_FILE_NAME = "fileName";
private static final String COLUMN_STATS_FIELD_TOTAL_UNCOMPRESSED_SIZE = "totalUncompressedSize";
private static final String COLUMN_STATS_FIELD_IS_DELETED = FIELD_IS_DELETED;
public static final String COLUMN_STATS_FIELD_MIN_VALUE = "minValue";
public static final String COLUMN_STATS_FIELD_MAX_VALUE = "maxValue";
public static final String COLUMN_STATS_FIELD_NULL_COUNT = "nullCount";
public static final String COLUMN_STATS_FIELD_VALUE_COUNT = "valueCount";
public static final String COLUMN_STATS_FIELD_TOTAL_SIZE = "totalSize";
public static final String COLUMN_STATS_FIELD_FILE_NAME = "fileName";
public static final String COLUMN_STATS_FIELD_COLUMN_NAME = "columnName";
public static final String COLUMN_STATS_FIELD_TOTAL_UNCOMPRESSED_SIZE = "totalUncompressedSize";
public static final String COLUMN_STATS_FIELD_IS_DELETED = FIELD_IS_DELETED;
private String key = null;
private int type = 0;
@@ -177,6 +178,7 @@ public class HoodieMetadataPayload implements HoodieRecordPayload<HoodieMetadata
} else {
columnStatMetadata = HoodieMetadataColumnStats.newBuilder()
.setFileName((String) columnStatsRecord.get(COLUMN_STATS_FIELD_FILE_NAME))
.setColumnName((String) columnStatsRecord.get(COLUMN_STATS_FIELD_COLUMN_NAME))
.setMinValue((String) columnStatsRecord.get(COLUMN_STATS_FIELD_MIN_VALUE))
.setMaxValue((String) columnStatsRecord.get(COLUMN_STATS_FIELD_MAX_VALUE))
.setValueCount((Long) columnStatsRecord.get(COLUMN_STATS_FIELD_VALUE_COUNT))
@@ -508,6 +510,7 @@ public class HoodieMetadataPayload implements HoodieRecordPayload<HoodieMetadata
HoodieMetadataPayload payload = new HoodieMetadataPayload(key.getRecordKey(),
HoodieMetadataColumnStats.newBuilder()
.setFileName(new Path(columnRangeMetadata.getFilePath()).getName())
.setColumnName(columnRangeMetadata.getColumnName())
.setMinValue(columnRangeMetadata.getMinValue() == null ? null :
columnRangeMetadata.getMinValue().toString())
.setMaxValue(columnRangeMetadata.getMaxValue() == null ? null :

View File

@@ -56,14 +56,11 @@ public interface HoodieTableMetadata extends Serializable, AutoCloseable {
String NON_PARTITIONED_NAME = ".";
String EMPTY_PARTITION_NAME = "";
// Base path of the Metadata Table relative to the dataset (.hoodie/metadata)
static final String METADATA_TABLE_REL_PATH = HoodieTableMetaClient.METAFOLDER_NAME + Path.SEPARATOR + "metadata";
/**
* Return the base-path of the Metadata Table for the given Dataset identified by base-path
*/
static String getMetadataTableBasePath(String dataTableBasePath) {
return dataTableBasePath + Path.SEPARATOR + METADATA_TABLE_REL_PATH;
return dataTableBasePath + Path.SEPARATOR + HoodieTableMetaClient.METADATA_TABLE_FOLDER_PATH;
}
/**
@@ -72,7 +69,7 @@ public interface HoodieTableMetadata extends Serializable, AutoCloseable {
*/
static String getDataTableBasePathFromMetadataTable(String metadataTableBasePath) {
checkArgument(isMetadataTable(metadataTableBasePath));
return metadataTableBasePath.substring(0, metadataTableBasePath.lastIndexOf(METADATA_TABLE_REL_PATH) - 1);
return metadataTableBasePath.substring(0, metadataTableBasePath.lastIndexOf(HoodieTableMetaClient.METADATA_TABLE_FOLDER_PATH) - 1);
}
/**
@@ -84,7 +81,7 @@ public interface HoodieTableMetadata extends Serializable, AutoCloseable {
if (basePath.endsWith(Path.SEPARATOR)) {
basePath = basePath.substring(0, basePath.length() - 1);
}
return basePath.endsWith(METADATA_TABLE_REL_PATH);
return basePath.endsWith(HoodieTableMetaClient.METADATA_TABLE_FOLDER_PATH);
}
static HoodieTableMetadata create(HoodieEngineContext engineContext, HoodieMetadataConfig metadataConfig, String datasetBasePath,

View File

@@ -927,16 +927,13 @@ public class HoodieTableMetadataUtil {
final String fileName = filePathWithPartition.substring(offset);
if (filePathWithPartition.endsWith(HoodieFileFormat.PARQUET.getFileExtension())) {
List<HoodieColumnRangeMetadata<Comparable>> columnRangeMetadataList = new ArrayList<>();
final Path fullFilePath = new Path(datasetMetaClient.getBasePath(), filePathWithPartition);
List<HoodieColumnRangeMetadata<Comparable>> columnRangeMetadataList;
if (!isDeleted) {
try {
columnRangeMetadataList = new ParquetUtils().readRangeFromParquetMetadata(
datasetMetaClient.getHadoopConf(), fullFilePath, columnsToIndex);
} catch (Exception e) {
LOG.error("Failed to read column stats for " + fullFilePath, e);
}
columnRangeMetadataList = new ParquetUtils().readRangeFromParquetMetadata(
datasetMetaClient.getHadoopConf(), fullFilePath, columnsToIndex);
} else {
// TODO we should delete records instead of stubbing them
columnRangeMetadataList =
columnsToIndex.stream().map(entry -> new HoodieColumnRangeMetadata<Comparable>(fileName,
entry, null, null, 0, 0, 0, 0))