[HUDI-3514] Rebase Data Skipping flow to rely on MT Column Stats index (#4948)
This commit is contained in:
@@ -80,22 +80,21 @@ public class ColumnStatsIndexHelper {
|
||||
|
||||
private static final String SPARK_JOB_DESCRIPTION = "spark.job.description";
|
||||
|
||||
private static final String Z_INDEX_FILE_COLUMN_NAME = "file";
|
||||
|
||||
private static final String Z_INDEX_MIN_VALUE_STAT_NAME = "minValue";
|
||||
private static final String Z_INDEX_MAX_VALUE_STAT_NAME = "maxValue";
|
||||
private static final String Z_INDEX_NUM_NULLS_STAT_NAME = "num_nulls";
|
||||
private static final String COLUMN_STATS_INDEX_FILE_COLUMN_NAME = "file";
|
||||
private static final String COLUMN_STATS_INDEX_MIN_VALUE_STAT_NAME = "minValue";
|
||||
private static final String COLUMN_STATS_INDEX_MAX_VALUE_STAT_NAME = "maxValue";
|
||||
private static final String COLUMN_STATS_INDEX_NUM_NULLS_STAT_NAME = "num_nulls";
|
||||
|
||||
public static String getMinColumnNameFor(String colName) {
|
||||
return composeZIndexColName(colName, Z_INDEX_MIN_VALUE_STAT_NAME);
|
||||
return composeZIndexColName(colName, COLUMN_STATS_INDEX_MIN_VALUE_STAT_NAME);
|
||||
}
|
||||
|
||||
public static String getMaxColumnNameFor(String colName) {
|
||||
return composeZIndexColName(colName, Z_INDEX_MAX_VALUE_STAT_NAME);
|
||||
return composeZIndexColName(colName, COLUMN_STATS_INDEX_MAX_VALUE_STAT_NAME);
|
||||
}
|
||||
|
||||
public static String getNumNullsColumnNameFor(String colName) {
|
||||
return composeZIndexColName(colName, Z_INDEX_NUM_NULLS_STAT_NAME);
|
||||
return composeZIndexColName(colName, COLUMN_STATS_INDEX_NUM_NULLS_STAT_NAME);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -407,11 +406,11 @@ public class ColumnStatsIndexHelper {
|
||||
@Nonnull
|
||||
public static StructType composeIndexSchema(@Nonnull List<StructField> zorderedColumnsSchemas) {
|
||||
List<StructField> schema = new ArrayList<>();
|
||||
schema.add(new StructField(Z_INDEX_FILE_COLUMN_NAME, StringType$.MODULE$, true, Metadata.empty()));
|
||||
schema.add(new StructField(COLUMN_STATS_INDEX_FILE_COLUMN_NAME, StringType$.MODULE$, true, Metadata.empty()));
|
||||
zorderedColumnsSchemas.forEach(colSchema -> {
|
||||
schema.add(composeColumnStatStructType(colSchema.name(), Z_INDEX_MIN_VALUE_STAT_NAME, colSchema.dataType()));
|
||||
schema.add(composeColumnStatStructType(colSchema.name(), Z_INDEX_MAX_VALUE_STAT_NAME, colSchema.dataType()));
|
||||
schema.add(composeColumnStatStructType(colSchema.name(), Z_INDEX_NUM_NULLS_STAT_NAME, LongType$.MODULE$));
|
||||
schema.add(composeColumnStatStructType(colSchema.name(), COLUMN_STATS_INDEX_MIN_VALUE_STAT_NAME, colSchema.dataType()));
|
||||
schema.add(composeColumnStatStructType(colSchema.name(), COLUMN_STATS_INDEX_MAX_VALUE_STAT_NAME, colSchema.dataType()));
|
||||
schema.add(composeColumnStatStructType(colSchema.name(), COLUMN_STATS_INDEX_NUM_NULLS_STAT_NAME, LongType$.MODULE$));
|
||||
});
|
||||
return StructType$.MODULE$.apply(schema);
|
||||
}
|
||||
|
||||
@@ -62,10 +62,10 @@ public class HoodieMergeOnReadTestUtils {
|
||||
}
|
||||
|
||||
public static List<GenericRecord> getRecordsUsingInputFormat(Configuration conf, List<String> inputPaths,
|
||||
String basePath, JobConf jobConf, boolean realtime, boolean populateMetaFieldsConfigValue) {
|
||||
String basePath, JobConf jobConf, boolean realtime, boolean populateMetaFields) {
|
||||
Schema schema = new Schema.Parser().parse(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA);
|
||||
return getRecordsUsingInputFormat(conf, inputPaths, basePath, jobConf, realtime, schema,
|
||||
HoodieTestDataGenerator.TRIP_HIVE_COLUMN_TYPES, false, new ArrayList<>(), populateMetaFieldsConfigValue);
|
||||
HoodieTestDataGenerator.TRIP_HIVE_COLUMN_TYPES, false, new ArrayList<>(), populateMetaFields);
|
||||
}
|
||||
|
||||
public static List<GenericRecord> getRecordsUsingInputFormat(Configuration conf, List<String> inputPaths, String basePath, JobConf jobConf, boolean realtime, Schema rawSchema,
|
||||
@@ -74,14 +74,23 @@ public class HoodieMergeOnReadTestUtils {
|
||||
}
|
||||
|
||||
public static List<GenericRecord> getRecordsUsingInputFormat(Configuration conf, List<String> inputPaths, String basePath, JobConf jobConf, boolean realtime, Schema rawSchema,
|
||||
String rawHiveColumnTypes, boolean projectCols, List<String> projectedColumns, boolean populateMetaFieldsConfigValue) {
|
||||
String rawHiveColumnTypes, boolean projectCols, List<String> projectedColumns, boolean populateMetaFields) {
|
||||
|
||||
HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(conf).setBasePath(basePath).build();
|
||||
FileInputFormat inputFormat = HoodieInputFormatUtils.getInputFormat(metaClient.getTableConfig().getBaseFileFormat(), realtime, jobConf);
|
||||
|
||||
Schema schema = HoodieAvroUtils.addMetadataFields(rawSchema);
|
||||
String hiveColumnTypes = HoodieAvroUtils.addMetadataColumnTypes(rawHiveColumnTypes);
|
||||
setPropsForInputFormat(inputFormat, jobConf, schema, hiveColumnTypes, projectCols, projectedColumns, populateMetaFieldsConfigValue);
|
||||
Schema schema;
|
||||
String hiveColumnTypes;
|
||||
|
||||
if (populateMetaFields) {
|
||||
schema = HoodieAvroUtils.addMetadataFields(rawSchema);
|
||||
hiveColumnTypes = HoodieAvroUtils.addMetadataColumnTypes(rawHiveColumnTypes);
|
||||
} else {
|
||||
schema = rawSchema;
|
||||
hiveColumnTypes = rawHiveColumnTypes;
|
||||
}
|
||||
|
||||
setPropsForInputFormat(inputFormat, jobConf, schema, hiveColumnTypes, projectCols, projectedColumns, populateMetaFields);
|
||||
final List<Field> fields;
|
||||
if (projectCols) {
|
||||
fields = schema.getFields().stream().filter(f -> projectedColumns.contains(f.name()))
|
||||
|
||||
Reference in New Issue
Block a user