[HUDI-3514] Rebase Data Skipping flow to rely on MT Column Stats index (#4948)
This commit is contained in:
@@ -109,6 +109,14 @@
|
||||
"string"
|
||||
]
|
||||
},
|
||||
{
|
||||
"doc": "Column name for which this column statistics applies",
|
||||
"name": "columnName",
|
||||
"type": [
|
||||
"null",
|
||||
"string"
|
||||
]
|
||||
},
|
||||
{
|
||||
"doc": "Minimum value in the range. Based on user data table schema, we can convert this to appropriate type",
|
||||
"name": "minValue",
|
||||
|
||||
@@ -83,6 +83,7 @@ public class HoodieTableMetaClient implements Serializable {
|
||||
public static final String AUXILIARYFOLDER_NAME = METAFOLDER_NAME + Path.SEPARATOR + ".aux";
|
||||
public static final String BOOTSTRAP_INDEX_ROOT_FOLDER_PATH = AUXILIARYFOLDER_NAME + Path.SEPARATOR + ".bootstrap";
|
||||
public static final String HEARTBEAT_FOLDER_NAME = METAFOLDER_NAME + Path.SEPARATOR + ".heartbeat";
|
||||
public static final String METADATA_TABLE_FOLDER_PATH = METAFOLDER_NAME + Path.SEPARATOR + "metadata";
|
||||
public static final String COLUMN_STATISTICS_INDEX_NAME = ".colstatsindex";
|
||||
public static final String BOOTSTRAP_INDEX_BY_PARTITION_FOLDER_PATH = BOOTSTRAP_INDEX_ROOT_FOLDER_PATH
|
||||
+ Path.SEPARATOR + ".partitions";
|
||||
|
||||
@@ -58,6 +58,8 @@ import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Helper class to read schema from data files and log files and to convert it between different formats.
|
||||
*
|
||||
* TODO(HUDI-3626) cleanup
|
||||
*/
|
||||
public class TableSchemaResolver {
|
||||
|
||||
@@ -143,7 +145,7 @@ public class TableSchemaResolver {
|
||||
* @throws Exception
|
||||
*/
|
||||
public Schema getTableAvroSchema() throws Exception {
|
||||
return getTableAvroSchema(true);
|
||||
return getTableAvroSchema(metaClient.getTableConfig().populateMetaFields());
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -197,7 +199,10 @@ public class TableSchemaResolver {
|
||||
*
|
||||
* @return Avro user data schema
|
||||
* @throws Exception
|
||||
*
|
||||
* @deprecated use {@link #getTableAvroSchema(boolean)} instead
|
||||
*/
|
||||
@Deprecated
|
||||
public Schema getTableAvroSchemaWithoutMetadataFields() throws Exception {
|
||||
return getTableAvroSchema(false);
|
||||
}
|
||||
@@ -208,7 +213,9 @@ public class TableSchemaResolver {
|
||||
* @param instant will get the instant data schema
|
||||
* @return Avro user data schema
|
||||
* @throws Exception
|
||||
* @deprecated use {@link #getTableSchemaFromCommitMetadata} instead
|
||||
*/
|
||||
@Deprecated
|
||||
public Schema getTableAvroSchemaWithoutMetadataFields(HoodieInstant instant) throws Exception {
|
||||
Option<Schema> schemaFromCommitMetadata = getTableSchemaFromCommitMetadata(instant, false);
|
||||
if (schemaFromCommitMetadata.isPresent()) {
|
||||
|
||||
@@ -50,13 +50,13 @@ public class TablePathUtils {
|
||||
FileStatus fileStatus = fs.getFileStatus(path);
|
||||
Path directory = fileStatus.isFile() ? fileStatus.getPath().getParent() : fileStatus.getPath();
|
||||
|
||||
if (TablePathUtils.hasTableMetadataFolder(fs, directory)) {
|
||||
if (hasTableMetadataFolder(fs, directory)) {
|
||||
// Handle table folder itself
|
||||
return Option.of(directory);
|
||||
}
|
||||
|
||||
// Handle metadata folder or metadata sub folder path
|
||||
Option<Path> tablePath = getTablePathFromTableMetadataPath(directory);
|
||||
Option<Path> tablePath = getTablePathFromMetaFolderPath(directory);
|
||||
if (tablePath.isPresent()) {
|
||||
return tablePath;
|
||||
}
|
||||
@@ -65,20 +65,20 @@ public class TablePathUtils {
|
||||
return getTablePathFromPartitionPath(fs, directory);
|
||||
}
|
||||
|
||||
private static boolean isTableMetadataFolder(String path) {
|
||||
return path != null && path.endsWith("/" + HoodieTableMetaClient.METAFOLDER_NAME);
|
||||
private static boolean isInsideTableMetaFolder(String path) {
|
||||
return path != null && path.contains("/" + HoodieTableMetaClient.METAFOLDER_NAME);
|
||||
}
|
||||
|
||||
private static boolean isInsideTableMetadataFolder(String path) {
|
||||
return path != null && path.contains("/" + HoodieTableMetaClient.METAFOLDER_NAME + "/");
|
||||
private static boolean isInsideMetadataTableInMetaFolder(String path) {
|
||||
return path != null && path.contains("/" + HoodieTableMetaClient.METADATA_TABLE_FOLDER_PATH);
|
||||
}
|
||||
|
||||
private static Option<Path> getTablePathFromTableMetadataPath(Path path) {
|
||||
private static Option<Path> getTablePathFromMetaFolderPath(Path path) {
|
||||
String pathStr = path.toString();
|
||||
|
||||
if (isTableMetadataFolder(pathStr)) {
|
||||
return Option.of(path.getParent());
|
||||
} else if (isInsideTableMetadataFolder(pathStr)) {
|
||||
// NOTE: Since the Metadata Table itself resides within the Meta-folder, we need to make sure
|
||||
// that we don't misinterpret an attempt to read the Metadata Table itself
|
||||
if (isInsideTableMetaFolder(pathStr) && !isInsideMetadataTableInMetaFolder(pathStr)) {
|
||||
int index = pathStr.indexOf("/" + HoodieTableMetaClient.METAFOLDER_NAME);
|
||||
return Option.of(new Path(pathStr.substring(0, index)));
|
||||
}
|
||||
@@ -92,12 +92,21 @@ public class TablePathUtils {
|
||||
HoodiePartitionMetadata metadata = new HoodiePartitionMetadata(fs, partitionPath);
|
||||
metadata.readFromFS();
|
||||
return Option.of(getNthParent(partitionPath, metadata.getPartitionDepth()));
|
||||
} else {
|
||||
// Simply traverse the directory structure upwards until a .hoodie folder is found
|
||||
Path current = partitionPath;
|
||||
while (current != null) {
|
||||
if (hasTableMetadataFolder(fs, current)) {
|
||||
return Option.of(current);
|
||||
}
|
||||
current = current.getParent();
|
||||
}
|
||||
|
||||
return Option.empty();
|
||||
}
|
||||
} catch (IOException ioe) {
|
||||
throw new HoodieException("Error reading partition metadata for " + partitionPath, ioe);
|
||||
}
|
||||
|
||||
return Option.empty();
|
||||
}
|
||||
|
||||
private static Path getNthParent(Path path, int n) {
|
||||
|
||||
@@ -108,14 +108,15 @@ public class HoodieMetadataPayload implements HoodieRecordPayload<HoodieMetadata
|
||||
private static final String BLOOM_FILTER_FIELD_IS_DELETED = FIELD_IS_DELETED;
|
||||
|
||||
// HoodieMetadata column stats payload field ids
|
||||
private static final String COLUMN_STATS_FIELD_MIN_VALUE = "minValue";
|
||||
private static final String COLUMN_STATS_FIELD_MAX_VALUE = "maxValue";
|
||||
private static final String COLUMN_STATS_FIELD_NULL_COUNT = "nullCount";
|
||||
private static final String COLUMN_STATS_FIELD_VALUE_COUNT = "valueCount";
|
||||
private static final String COLUMN_STATS_FIELD_TOTAL_SIZE = "totalSize";
|
||||
private static final String COLUMN_STATS_FIELD_FILE_NAME = "fileName";
|
||||
private static final String COLUMN_STATS_FIELD_TOTAL_UNCOMPRESSED_SIZE = "totalUncompressedSize";
|
||||
private static final String COLUMN_STATS_FIELD_IS_DELETED = FIELD_IS_DELETED;
|
||||
public static final String COLUMN_STATS_FIELD_MIN_VALUE = "minValue";
|
||||
public static final String COLUMN_STATS_FIELD_MAX_VALUE = "maxValue";
|
||||
public static final String COLUMN_STATS_FIELD_NULL_COUNT = "nullCount";
|
||||
public static final String COLUMN_STATS_FIELD_VALUE_COUNT = "valueCount";
|
||||
public static final String COLUMN_STATS_FIELD_TOTAL_SIZE = "totalSize";
|
||||
public static final String COLUMN_STATS_FIELD_FILE_NAME = "fileName";
|
||||
public static final String COLUMN_STATS_FIELD_COLUMN_NAME = "columnName";
|
||||
public static final String COLUMN_STATS_FIELD_TOTAL_UNCOMPRESSED_SIZE = "totalUncompressedSize";
|
||||
public static final String COLUMN_STATS_FIELD_IS_DELETED = FIELD_IS_DELETED;
|
||||
|
||||
private String key = null;
|
||||
private int type = 0;
|
||||
@@ -177,6 +178,7 @@ public class HoodieMetadataPayload implements HoodieRecordPayload<HoodieMetadata
|
||||
} else {
|
||||
columnStatMetadata = HoodieMetadataColumnStats.newBuilder()
|
||||
.setFileName((String) columnStatsRecord.get(COLUMN_STATS_FIELD_FILE_NAME))
|
||||
.setColumnName((String) columnStatsRecord.get(COLUMN_STATS_FIELD_COLUMN_NAME))
|
||||
.setMinValue((String) columnStatsRecord.get(COLUMN_STATS_FIELD_MIN_VALUE))
|
||||
.setMaxValue((String) columnStatsRecord.get(COLUMN_STATS_FIELD_MAX_VALUE))
|
||||
.setValueCount((Long) columnStatsRecord.get(COLUMN_STATS_FIELD_VALUE_COUNT))
|
||||
@@ -508,6 +510,7 @@ public class HoodieMetadataPayload implements HoodieRecordPayload<HoodieMetadata
|
||||
HoodieMetadataPayload payload = new HoodieMetadataPayload(key.getRecordKey(),
|
||||
HoodieMetadataColumnStats.newBuilder()
|
||||
.setFileName(new Path(columnRangeMetadata.getFilePath()).getName())
|
||||
.setColumnName(columnRangeMetadata.getColumnName())
|
||||
.setMinValue(columnRangeMetadata.getMinValue() == null ? null :
|
||||
columnRangeMetadata.getMinValue().toString())
|
||||
.setMaxValue(columnRangeMetadata.getMaxValue() == null ? null :
|
||||
|
||||
@@ -56,14 +56,11 @@ public interface HoodieTableMetadata extends Serializable, AutoCloseable {
|
||||
String NON_PARTITIONED_NAME = ".";
|
||||
String EMPTY_PARTITION_NAME = "";
|
||||
|
||||
// Base path of the Metadata Table relative to the dataset (.hoodie/metadata)
|
||||
static final String METADATA_TABLE_REL_PATH = HoodieTableMetaClient.METAFOLDER_NAME + Path.SEPARATOR + "metadata";
|
||||
|
||||
/**
|
||||
* Return the base-path of the Metadata Table for the given Dataset identified by base-path
|
||||
*/
|
||||
static String getMetadataTableBasePath(String dataTableBasePath) {
|
||||
return dataTableBasePath + Path.SEPARATOR + METADATA_TABLE_REL_PATH;
|
||||
return dataTableBasePath + Path.SEPARATOR + HoodieTableMetaClient.METADATA_TABLE_FOLDER_PATH;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -72,7 +69,7 @@ public interface HoodieTableMetadata extends Serializable, AutoCloseable {
|
||||
*/
|
||||
static String getDataTableBasePathFromMetadataTable(String metadataTableBasePath) {
|
||||
checkArgument(isMetadataTable(metadataTableBasePath));
|
||||
return metadataTableBasePath.substring(0, metadataTableBasePath.lastIndexOf(METADATA_TABLE_REL_PATH) - 1);
|
||||
return metadataTableBasePath.substring(0, metadataTableBasePath.lastIndexOf(HoodieTableMetaClient.METADATA_TABLE_FOLDER_PATH) - 1);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -84,7 +81,7 @@ public interface HoodieTableMetadata extends Serializable, AutoCloseable {
|
||||
if (basePath.endsWith(Path.SEPARATOR)) {
|
||||
basePath = basePath.substring(0, basePath.length() - 1);
|
||||
}
|
||||
return basePath.endsWith(METADATA_TABLE_REL_PATH);
|
||||
return basePath.endsWith(HoodieTableMetaClient.METADATA_TABLE_FOLDER_PATH);
|
||||
}
|
||||
|
||||
static HoodieTableMetadata create(HoodieEngineContext engineContext, HoodieMetadataConfig metadataConfig, String datasetBasePath,
|
||||
|
||||
@@ -927,16 +927,13 @@ public class HoodieTableMetadataUtil {
|
||||
final String fileName = filePathWithPartition.substring(offset);
|
||||
|
||||
if (filePathWithPartition.endsWith(HoodieFileFormat.PARQUET.getFileExtension())) {
|
||||
List<HoodieColumnRangeMetadata<Comparable>> columnRangeMetadataList = new ArrayList<>();
|
||||
final Path fullFilePath = new Path(datasetMetaClient.getBasePath(), filePathWithPartition);
|
||||
List<HoodieColumnRangeMetadata<Comparable>> columnRangeMetadataList;
|
||||
if (!isDeleted) {
|
||||
try {
|
||||
columnRangeMetadataList = new ParquetUtils().readRangeFromParquetMetadata(
|
||||
datasetMetaClient.getHadoopConf(), fullFilePath, columnsToIndex);
|
||||
} catch (Exception e) {
|
||||
LOG.error("Failed to read column stats for " + fullFilePath, e);
|
||||
}
|
||||
columnRangeMetadataList = new ParquetUtils().readRangeFromParquetMetadata(
|
||||
datasetMetaClient.getHadoopConf(), fullFilePath, columnsToIndex);
|
||||
} else {
|
||||
// TODO we should delete records instead of stubbing them
|
||||
columnRangeMetadataList =
|
||||
columnsToIndex.stream().map(entry -> new HoodieColumnRangeMetadata<Comparable>(fileName,
|
||||
entry, null, null, 0, 0, 0, 0))
|
||||
|
||||
Reference in New Issue
Block a user