[HUDI-4353] Column stats data skipping for flink (#6026)
This commit is contained in:
@@ -74,6 +74,7 @@ import org.slf4j.LoggerFactory;
|
|||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.text.ParseException;
|
import java.text.ParseException;
|
||||||
|
import java.util.Collection;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@@ -159,10 +160,14 @@ public class HoodieFlinkWriteClient<T extends HoodieRecordPayload> extends
|
|||||||
initTable(WriteOperationType.UPSERT, Option.ofNullable(instantTime));
|
initTable(WriteOperationType.UPSERT, Option.ofNullable(instantTime));
|
||||||
table.validateUpsertSchema();
|
table.validateUpsertSchema();
|
||||||
preWrite(instantTime, WriteOperationType.UPSERT_PREPPED, table.getMetaClient());
|
preWrite(instantTime, WriteOperationType.UPSERT_PREPPED, table.getMetaClient());
|
||||||
final HoodieWriteHandle<?, ?, ?, ?> writeHandle = getOrCreateWriteHandle(preppedRecords.get(0), getConfig(),
|
Map<String, List<HoodieRecord<T>>> preppedRecordsByFileId = preppedRecords.stream().parallel()
|
||||||
instantTime, table, preppedRecords.listIterator());
|
.collect(Collectors.groupingBy(r -> r.getCurrentLocation().getFileId()));
|
||||||
HoodieWriteMetadata<List<WriteStatus>> result = ((HoodieFlinkTable<T>) table).upsertPrepped(context, writeHandle, instantTime, preppedRecords);
|
return preppedRecordsByFileId.values().stream().parallel().map(records -> {
|
||||||
return postWrite(result, instantTime, table);
|
final HoodieWriteHandle<?, ?, ?, ?> writeHandle = getOrCreateWriteHandle(records.get(0), getConfig(),
|
||||||
|
instantTime, table, records.listIterator());
|
||||||
|
HoodieWriteMetadata<List<WriteStatus>> result = ((HoodieFlinkTable<T>) table).upsertPrepped(context, writeHandle, instantTime, records);
|
||||||
|
return postWrite(result, instantTime, table);
|
||||||
|
}).flatMap(Collection::stream).collect(Collectors.toList());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|||||||
@@ -236,6 +236,13 @@ public class FlinkOptions extends HoodieConfig {
|
|||||||
.noDefaultValue()
|
.noDefaultValue()
|
||||||
.withDescription("End commit instant for reading, the commit time format should be 'yyyyMMddHHmmss'");
|
.withDescription("End commit instant for reading, the commit time format should be 'yyyyMMddHHmmss'");
|
||||||
|
|
||||||
|
public static final ConfigOption<Boolean> READ_DATA_SKIPPING_ENABLED = ConfigOptions
|
||||||
|
.key("read.data.skipping.enabled")
|
||||||
|
.booleanType()
|
||||||
|
.defaultValue(false)
|
||||||
|
.withDescription("Enables data-skipping allowing queries to leverage indexes to reduce the search space by"
|
||||||
|
+ "skipping over files");
|
||||||
|
|
||||||
// ------------------------------------------------------------------------
|
// ------------------------------------------------------------------------
|
||||||
// Write Options
|
// Write Options
|
||||||
// ------------------------------------------------------------------------
|
// ------------------------------------------------------------------------
|
||||||
|
|||||||
@@ -18,11 +18,11 @@
|
|||||||
|
|
||||||
package org.apache.hudi.configuration;
|
package org.apache.hudi.configuration;
|
||||||
|
|
||||||
|
import org.apache.hudi.util.FlinkClientUtil;
|
||||||
|
|
||||||
import org.apache.flink.configuration.Configuration;
|
import org.apache.flink.configuration.Configuration;
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
|
|
||||||
import org.apache.hudi.util.FlinkClientUtil;
|
|
||||||
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -18,7 +18,6 @@
|
|||||||
|
|
||||||
package org.apache.hudi.sink.meta;
|
package org.apache.hudi.sink.meta;
|
||||||
|
|
||||||
import org.apache.flink.configuration.Configuration;
|
|
||||||
import org.apache.hudi.common.fs.FSUtils;
|
import org.apache.hudi.common.fs.FSUtils;
|
||||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||||
import org.apache.hudi.common.util.ValidationUtils;
|
import org.apache.hudi.common.util.ValidationUtils;
|
||||||
@@ -26,6 +25,7 @@ import org.apache.hudi.configuration.FlinkOptions;
|
|||||||
import org.apache.hudi.configuration.HadoopConfigurations;
|
import org.apache.hudi.configuration.HadoopConfigurations;
|
||||||
import org.apache.hudi.exception.HoodieException;
|
import org.apache.hudi.exception.HoodieException;
|
||||||
|
|
||||||
|
import org.apache.flink.configuration.Configuration;
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
|
|||||||
@@ -18,7 +18,6 @@
|
|||||||
|
|
||||||
package org.apache.hudi.sink.utils;
|
package org.apache.hudi.sink.utils;
|
||||||
|
|
||||||
import org.apache.flink.annotation.VisibleForTesting;
|
|
||||||
import org.apache.hudi.aws.sync.AwsGlueCatalogSyncTool;
|
import org.apache.hudi.aws.sync.AwsGlueCatalogSyncTool;
|
||||||
import org.apache.hudi.common.config.SerializableConfiguration;
|
import org.apache.hudi.common.config.SerializableConfiguration;
|
||||||
import org.apache.hudi.common.fs.FSUtils;
|
import org.apache.hudi.common.fs.FSUtils;
|
||||||
@@ -29,6 +28,7 @@ import org.apache.hudi.hive.HiveSyncTool;
|
|||||||
import org.apache.hudi.hive.ddl.HiveSyncMode;
|
import org.apache.hudi.hive.ddl.HiveSyncMode;
|
||||||
import org.apache.hudi.table.format.FilePathUtils;
|
import org.apache.hudi.table.format.FilePathUtils;
|
||||||
|
|
||||||
|
import org.apache.flink.annotation.VisibleForTesting;
|
||||||
import org.apache.flink.configuration.Configuration;
|
import org.apache.flink.configuration.Configuration;
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
import org.apache.hadoop.hive.conf.HiveConf;
|
import org.apache.hadoop.hive.conf.HiveConf;
|
||||||
|
|||||||
@@ -155,6 +155,10 @@ public class NonThrownExecutor implements AutoCloseable {
|
|||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
// Inner Class
|
// Inner Class
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The exception hook.
|
||||||
|
*/
|
||||||
public interface ExceptionHook {
|
public interface ExceptionHook {
|
||||||
void apply(String errMsg, Throwable t);
|
void apply(String errMsg, Throwable t);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -21,14 +21,24 @@ package org.apache.hudi.source;
|
|||||||
import org.apache.hudi.client.common.HoodieFlinkEngineContext;
|
import org.apache.hudi.client.common.HoodieFlinkEngineContext;
|
||||||
import org.apache.hudi.common.config.HoodieMetadataConfig;
|
import org.apache.hudi.common.config.HoodieMetadataConfig;
|
||||||
import org.apache.hudi.common.fs.FSUtils;
|
import org.apache.hudi.common.fs.FSUtils;
|
||||||
|
import org.apache.hudi.common.util.collection.Pair;
|
||||||
import org.apache.hudi.configuration.FlinkOptions;
|
import org.apache.hudi.configuration.FlinkOptions;
|
||||||
import org.apache.hudi.configuration.HadoopConfigurations;
|
import org.apache.hudi.configuration.HadoopConfigurations;
|
||||||
|
import org.apache.hudi.source.stats.ColumnStatsIndices;
|
||||||
|
import org.apache.hudi.source.stats.ExpressionEvaluator;
|
||||||
|
import org.apache.hudi.util.DataTypeUtils;
|
||||||
|
import org.apache.hudi.util.ExpressionUtils;
|
||||||
import org.apache.hudi.util.StreamerUtil;
|
import org.apache.hudi.util.StreamerUtil;
|
||||||
|
|
||||||
import org.apache.flink.annotation.VisibleForTesting;
|
import org.apache.flink.annotation.VisibleForTesting;
|
||||||
import org.apache.flink.configuration.Configuration;
|
import org.apache.flink.configuration.Configuration;
|
||||||
|
import org.apache.flink.table.data.RowData;
|
||||||
|
import org.apache.flink.table.expressions.ResolvedExpression;
|
||||||
|
import org.apache.flink.table.types.logical.RowType;
|
||||||
import org.apache.hadoop.fs.FileStatus;
|
import org.apache.hadoop.fs.FileStatus;
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import javax.annotation.Nullable;
|
import javax.annotation.Nullable;
|
||||||
|
|
||||||
@@ -40,6 +50,7 @@ import java.util.List;
|
|||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Properties;
|
import java.util.Properties;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A file index which supports listing files efficiently through metadata table.
|
* A file index which supports listing files efficiently through metadata table.
|
||||||
@@ -47,19 +58,26 @@ import java.util.Set;
|
|||||||
* <p>It caches the partition paths to avoid redundant look up.
|
* <p>It caches the partition paths to avoid redundant look up.
|
||||||
*/
|
*/
|
||||||
public class FileIndex {
|
public class FileIndex {
|
||||||
|
private static final Logger LOG = LoggerFactory.getLogger(FileIndex.class);
|
||||||
|
|
||||||
private final Path path;
|
private final Path path;
|
||||||
|
private final RowType rowType;
|
||||||
private final HoodieMetadataConfig metadataConfig;
|
private final HoodieMetadataConfig metadataConfig;
|
||||||
private List<String> partitionPaths; // cache of partition paths
|
private final boolean dataSkippingEnabled;
|
||||||
|
private List<String> partitionPaths; // cache of partition paths
|
||||||
|
private List<ResolvedExpression> filters; // push down filters
|
||||||
private final boolean tableExists;
|
private final boolean tableExists;
|
||||||
|
|
||||||
private FileIndex(Path path, Configuration conf) {
|
private FileIndex(Path path, Configuration conf, RowType rowType) {
|
||||||
this.path = path;
|
this.path = path;
|
||||||
|
this.rowType = rowType;
|
||||||
this.metadataConfig = metadataConfig(conf);
|
this.metadataConfig = metadataConfig(conf);
|
||||||
|
this.dataSkippingEnabled = conf.getBoolean(FlinkOptions.READ_DATA_SKIPPING_ENABLED);
|
||||||
this.tableExists = StreamerUtil.tableExists(path.toString(), HadoopConfigurations.getHadoopConf(conf));
|
this.tableExists = StreamerUtil.tableExists(path.toString(), HadoopConfigurations.getHadoopConf(conf));
|
||||||
}
|
}
|
||||||
|
|
||||||
public static FileIndex instance(Path path, Configuration conf) {
|
public static FileIndex instance(Path path, Configuration conf, RowType rowType) {
|
||||||
return new FileIndex(path, conf);
|
return new FileIndex(path, conf, rowType);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -119,9 +137,17 @@ public class FileIndex {
|
|||||||
return new FileStatus[0];
|
return new FileStatus[0];
|
||||||
}
|
}
|
||||||
String[] partitions = getOrBuildPartitionPaths().stream().map(p -> fullPartitionPath(path, p)).toArray(String[]::new);
|
String[] partitions = getOrBuildPartitionPaths().stream().map(p -> fullPartitionPath(path, p)).toArray(String[]::new);
|
||||||
return FSUtils.getFilesInPartitions(HoodieFlinkEngineContext.DEFAULT, metadataConfig, path.toString(),
|
FileStatus[] allFileStatus = FSUtils.getFilesInPartitions(HoodieFlinkEngineContext.DEFAULT, metadataConfig, path.toString(),
|
||||||
partitions, "/tmp/")
|
partitions, "/tmp/")
|
||||||
.values().stream().flatMap(Arrays::stream).toArray(FileStatus[]::new);
|
.values().stream().flatMap(Arrays::stream).toArray(FileStatus[]::new);
|
||||||
|
Set<String> candidateFiles = candidateFilesInMetadataTable(allFileStatus);
|
||||||
|
if (candidateFiles == null) {
|
||||||
|
// no need to filter by col stats or error occurs.
|
||||||
|
return allFileStatus;
|
||||||
|
}
|
||||||
|
return Arrays.stream(allFileStatus).parallel()
|
||||||
|
.filter(fileStatus -> candidateFiles.contains(fileStatus.getPath().getName()))
|
||||||
|
.toArray(FileStatus[]::new);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -159,10 +185,96 @@ public class FileIndex {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets up pushed down filters.
|
||||||
|
*/
|
||||||
|
public void setFilters(List<ResolvedExpression> filters) {
|
||||||
|
if (filters.size() > 0) {
|
||||||
|
this.filters = new ArrayList<>(filters);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
// Utilities
|
// Utilities
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Computes pruned list of candidate base-files' names based on provided list of data filters.
|
||||||
|
* conditions, by leveraging Metadata Table's Column Statistics index (hereon referred as ColStats for brevity)
|
||||||
|
* bearing "min", "max", "num_nulls" statistics for all columns.
|
||||||
|
*
|
||||||
|
* <p>NOTE: This method has to return complete set of candidate files, since only provided candidates will
|
||||||
|
* ultimately be scanned as part of query execution. Hence, this method has to maintain the
|
||||||
|
* invariant of conservatively including every base-file's name, that is NOT referenced in its index.
|
||||||
|
*
|
||||||
|
* <p>The {@code filters} must all be simple.
|
||||||
|
*
|
||||||
|
* @return list of pruned (data-skipped) candidate base-files' names
|
||||||
|
*/
|
||||||
|
@Nullable
|
||||||
|
private Set<String> candidateFilesInMetadataTable(FileStatus[] allFileStatus) {
|
||||||
|
// NOTE: Data Skipping is only effective when it references columns that are indexed w/in
|
||||||
|
// the Column Stats Index (CSI). Following cases could not be effectively handled by Data Skipping:
|
||||||
|
// - Expressions on top-level column's fields (ie, for ex filters like "struct.field > 0", since
|
||||||
|
// CSI only contains stats for top-level columns, in this case for "struct")
|
||||||
|
// - Any expression not directly referencing top-level column (for ex, sub-queries, since there's
|
||||||
|
// nothing CSI in particular could be applied for)
|
||||||
|
if (!metadataConfig.enabled() || !dataSkippingEnabled) {
|
||||||
|
validateConfig();
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
if (this.filters == null || this.filters.size() == 0) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
String[] referencedCols = ExpressionUtils.referencedColumns(filters);
|
||||||
|
if (referencedCols.length == 0) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
final List<RowData> colStats = ColumnStatsIndices.readColumnStatsIndex(path.toString(), metadataConfig, referencedCols);
|
||||||
|
final Pair<List<RowData>, String[]> colStatsTable = ColumnStatsIndices.transposeColumnStatsIndex(colStats, referencedCols, rowType);
|
||||||
|
List<RowData> transposedColStats = colStatsTable.getLeft();
|
||||||
|
String[] queryCols = colStatsTable.getRight();
|
||||||
|
if (queryCols.length == 0) {
|
||||||
|
// the indexed columns have no intersection with the referenced columns, returns early
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
RowType.RowField[] queryFields = DataTypeUtils.projectRowFields(rowType, queryCols);
|
||||||
|
|
||||||
|
Set<String> allIndexedFileNames = transposedColStats.stream().parallel()
|
||||||
|
.map(row -> row.getString(0).toString())
|
||||||
|
.collect(Collectors.toSet());
|
||||||
|
Set<String> candidateFileNames = transposedColStats.stream().parallel()
|
||||||
|
.filter(row -> ExpressionEvaluator.filterExprs(filters, row, queryFields))
|
||||||
|
.map(row -> row.getString(0).toString())
|
||||||
|
.collect(Collectors.toSet());
|
||||||
|
|
||||||
|
// NOTE: Col-Stats Index isn't guaranteed to have complete set of statistics for every
|
||||||
|
// base-file: since it's bound to clustering, which could occur asynchronously
|
||||||
|
// at arbitrary point in time, and is not likely to be touching all the base files.
|
||||||
|
//
|
||||||
|
// To close that gap, we manually compute the difference b/w all indexed (by col-stats-index)
|
||||||
|
// files and all outstanding base-files, and make sure that all base files not
|
||||||
|
// represented w/in the index are included in the output of this method
|
||||||
|
Set<String> nonIndexedFileNames = Arrays.stream(allFileStatus)
|
||||||
|
.map(fileStatus -> fileStatus.getPath().getName()).collect(Collectors.toSet());
|
||||||
|
nonIndexedFileNames.removeAll(allIndexedFileNames);
|
||||||
|
|
||||||
|
candidateFileNames.addAll(nonIndexedFileNames);
|
||||||
|
return candidateFileNames;
|
||||||
|
} catch (Throwable throwable) {
|
||||||
|
LOG.warn("Read column stats for data skipping error", throwable);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void validateConfig() {
|
||||||
|
if (dataSkippingEnabled && !metadataConfig.enabled()) {
|
||||||
|
LOG.warn("Data skipping requires Metadata Table to be enabled! "
|
||||||
|
+ "isMetadataTableEnabled = {}", metadataConfig.enabled());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns all the relative partition paths.
|
* Returns all the relative partition paths.
|
||||||
*
|
*
|
||||||
|
|||||||
@@ -35,6 +35,7 @@ import org.apache.hudi.table.format.mor.MergeOnReadInputSplit;
|
|||||||
|
|
||||||
import org.apache.flink.configuration.Configuration;
|
import org.apache.flink.configuration.Configuration;
|
||||||
import org.apache.flink.core.fs.Path;
|
import org.apache.flink.core.fs.Path;
|
||||||
|
import org.apache.flink.table.types.logical.RowType;
|
||||||
import org.apache.hadoop.fs.FileStatus;
|
import org.apache.hadoop.fs.FileStatus;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
@@ -77,6 +78,7 @@ public class IncrementalInputSplits implements Serializable {
|
|||||||
private static final Logger LOG = LoggerFactory.getLogger(IncrementalInputSplits.class);
|
private static final Logger LOG = LoggerFactory.getLogger(IncrementalInputSplits.class);
|
||||||
private final Configuration conf;
|
private final Configuration conf;
|
||||||
private final Path path;
|
private final Path path;
|
||||||
|
private final RowType rowType;
|
||||||
private final long maxCompactionMemoryInBytes;
|
private final long maxCompactionMemoryInBytes;
|
||||||
// for partition pruning
|
// for partition pruning
|
||||||
private final Set<String> requiredPartitions;
|
private final Set<String> requiredPartitions;
|
||||||
@@ -86,11 +88,13 @@ public class IncrementalInputSplits implements Serializable {
|
|||||||
private IncrementalInputSplits(
|
private IncrementalInputSplits(
|
||||||
Configuration conf,
|
Configuration conf,
|
||||||
Path path,
|
Path path,
|
||||||
|
RowType rowType,
|
||||||
long maxCompactionMemoryInBytes,
|
long maxCompactionMemoryInBytes,
|
||||||
@Nullable Set<String> requiredPartitions,
|
@Nullable Set<String> requiredPartitions,
|
||||||
boolean skipCompaction) {
|
boolean skipCompaction) {
|
||||||
this.conf = conf;
|
this.conf = conf;
|
||||||
this.path = path;
|
this.path = path;
|
||||||
|
this.rowType = rowType;
|
||||||
this.maxCompactionMemoryInBytes = maxCompactionMemoryInBytes;
|
this.maxCompactionMemoryInBytes = maxCompactionMemoryInBytes;
|
||||||
this.requiredPartitions = requiredPartitions;
|
this.requiredPartitions = requiredPartitions;
|
||||||
this.skipCompaction = skipCompaction;
|
this.skipCompaction = skipCompaction;
|
||||||
@@ -167,7 +171,7 @@ public class IncrementalInputSplits implements Serializable {
|
|||||||
|
|
||||||
if (instantRange == null) {
|
if (instantRange == null) {
|
||||||
// reading from the earliest, scans the partitions and files directly.
|
// reading from the earliest, scans the partitions and files directly.
|
||||||
FileIndex fileIndex = FileIndex.instance(new org.apache.hadoop.fs.Path(path.toUri()), conf);
|
FileIndex fileIndex = FileIndex.instance(new org.apache.hadoop.fs.Path(path.toUri()), conf, rowType);
|
||||||
if (this.requiredPartitions != null) {
|
if (this.requiredPartitions != null) {
|
||||||
// apply partition push down
|
// apply partition push down
|
||||||
fileIndex.setPartitionPaths(this.requiredPartitions);
|
fileIndex.setPartitionPaths(this.requiredPartitions);
|
||||||
@@ -349,6 +353,7 @@ public class IncrementalInputSplits implements Serializable {
|
|||||||
public static class Builder {
|
public static class Builder {
|
||||||
private Configuration conf;
|
private Configuration conf;
|
||||||
private Path path;
|
private Path path;
|
||||||
|
private RowType rowType;
|
||||||
private long maxCompactionMemoryInBytes;
|
private long maxCompactionMemoryInBytes;
|
||||||
// for partition pruning
|
// for partition pruning
|
||||||
private Set<String> requiredPartitions;
|
private Set<String> requiredPartitions;
|
||||||
@@ -368,6 +373,11 @@ public class IncrementalInputSplits implements Serializable {
|
|||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Builder rowType(RowType rowType) {
|
||||||
|
this.rowType = rowType;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
public Builder maxCompactionMemoryInBytes(long maxCompactionMemoryInBytes) {
|
public Builder maxCompactionMemoryInBytes(long maxCompactionMemoryInBytes) {
|
||||||
this.maxCompactionMemoryInBytes = maxCompactionMemoryInBytes;
|
this.maxCompactionMemoryInBytes = maxCompactionMemoryInBytes;
|
||||||
return this;
|
return this;
|
||||||
@@ -384,7 +394,8 @@ public class IncrementalInputSplits implements Serializable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public IncrementalInputSplits build() {
|
public IncrementalInputSplits build() {
|
||||||
return new IncrementalInputSplits(Objects.requireNonNull(this.conf), Objects.requireNonNull(this.path),
|
return new IncrementalInputSplits(
|
||||||
|
Objects.requireNonNull(this.conf), Objects.requireNonNull(this.path), Objects.requireNonNull(this.rowType),
|
||||||
this.maxCompactionMemoryInBytes, this.requiredPartitions, this.skipCompaction);
|
this.maxCompactionMemoryInBytes, this.requiredPartitions, this.skipCompaction);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -36,6 +36,7 @@ import org.apache.flink.runtime.state.FunctionSnapshotContext;
|
|||||||
import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction;
|
import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction;
|
||||||
import org.apache.flink.streaming.api.functions.source.RichSourceFunction;
|
import org.apache.flink.streaming.api.functions.source.RichSourceFunction;
|
||||||
import org.apache.flink.streaming.api.functions.source.SourceFunction;
|
import org.apache.flink.streaming.api.functions.source.SourceFunction;
|
||||||
|
import org.apache.flink.table.types.logical.RowType;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
@@ -99,6 +100,7 @@ public class StreamReadMonitoringFunction
|
|||||||
public StreamReadMonitoringFunction(
|
public StreamReadMonitoringFunction(
|
||||||
Configuration conf,
|
Configuration conf,
|
||||||
Path path,
|
Path path,
|
||||||
|
RowType rowType,
|
||||||
long maxCompactionMemoryInBytes,
|
long maxCompactionMemoryInBytes,
|
||||||
@Nullable Set<String> requiredPartitionPaths) {
|
@Nullable Set<String> requiredPartitionPaths) {
|
||||||
this.conf = conf;
|
this.conf = conf;
|
||||||
@@ -107,6 +109,7 @@ public class StreamReadMonitoringFunction
|
|||||||
this.incrementalInputSplits = IncrementalInputSplits.builder()
|
this.incrementalInputSplits = IncrementalInputSplits.builder()
|
||||||
.conf(conf)
|
.conf(conf)
|
||||||
.path(path)
|
.path(path)
|
||||||
|
.rowType(rowType)
|
||||||
.maxCompactionMemoryInBytes(maxCompactionMemoryInBytes)
|
.maxCompactionMemoryInBytes(maxCompactionMemoryInBytes)
|
||||||
.requiredPartitions(requiredPartitionPaths)
|
.requiredPartitions(requiredPartitionPaths)
|
||||||
.skipCompaction(conf.getBoolean(FlinkOptions.READ_STREAMING_SKIP_COMPACT))
|
.skipCompaction(conf.getBoolean(FlinkOptions.READ_STREAMING_SKIP_COMPACT))
|
||||||
|
|||||||
@@ -0,0 +1,377 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hudi.source.stats;
|
||||||
|
|
||||||
|
import org.apache.hudi.avro.model.HoodieMetadataRecord;
|
||||||
|
import org.apache.hudi.client.common.HoodieFlinkEngineContext;
|
||||||
|
import org.apache.hudi.common.config.HoodieMetadataConfig;
|
||||||
|
import org.apache.hudi.common.data.HoodieData;
|
||||||
|
import org.apache.hudi.common.model.HoodieRecord;
|
||||||
|
import org.apache.hudi.common.table.view.FileSystemViewStorageConfig;
|
||||||
|
import org.apache.hudi.common.util.ValidationUtils;
|
||||||
|
import org.apache.hudi.common.util.collection.Pair;
|
||||||
|
import org.apache.hudi.common.util.hash.ColumnIndexID;
|
||||||
|
import org.apache.hudi.exception.HoodieException;
|
||||||
|
import org.apache.hudi.metadata.HoodieMetadataPayload;
|
||||||
|
import org.apache.hudi.metadata.HoodieTableMetadata;
|
||||||
|
import org.apache.hudi.metadata.HoodieTableMetadataUtil;
|
||||||
|
import org.apache.hudi.util.AvroSchemaConverter;
|
||||||
|
import org.apache.hudi.util.AvroToRowDataConverters;
|
||||||
|
import org.apache.hudi.util.RowDataProjection;
|
||||||
|
|
||||||
|
import org.apache.avro.generic.GenericRecord;
|
||||||
|
import org.apache.flink.table.data.GenericRowData;
|
||||||
|
import org.apache.flink.table.data.RowData;
|
||||||
|
import org.apache.flink.table.data.StringData;
|
||||||
|
import org.apache.flink.table.types.DataType;
|
||||||
|
import org.apache.flink.table.types.logical.LogicalType;
|
||||||
|
import org.apache.flink.table.types.logical.RowType;
|
||||||
|
import org.apache.flink.table.types.logical.TimeType;
|
||||||
|
import org.apache.flink.table.types.logical.TimestampType;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.SortedMap;
|
||||||
|
import java.util.TreeMap;
|
||||||
|
import java.util.TreeSet;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
|
import static org.apache.hudi.common.util.ValidationUtils.checkState;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Utilities for abstracting away heavy-lifting of interactions with Metadata Table's Column Stats Index,
|
||||||
|
* providing convenient interfaces to read it, transpose, etc.
|
||||||
|
*/
|
||||||
|
public class ColumnStatsIndices {
|
||||||
|
private static final DataType METADATA_DATA_TYPE = getMetadataDataType();
|
||||||
|
private static final DataType COL_STATS_DATA_TYPE = getColStatsDataType();
|
||||||
|
private static final int[] COL_STATS_TARGET_POS = getColStatsTargetPos();
|
||||||
|
|
||||||
|
// the column schema:
|
||||||
|
// |- file_name: string
|
||||||
|
// |- min_val: row
|
||||||
|
// |- max_val: row
|
||||||
|
// |- null_cnt: long
|
||||||
|
// |- val_cnt: long
|
||||||
|
// |- column_name: string
|
||||||
|
private static final int ORD_FILE_NAME = 0;
|
||||||
|
private static final int ORD_MIN_VAL = 1;
|
||||||
|
private static final int ORD_MAX_VAL = 2;
|
||||||
|
private static final int ORD_NULL_CNT = 3;
|
||||||
|
private static final int ORD_VAL_CNT = 4;
|
||||||
|
private static final int ORD_COL_NAME = 5;
|
||||||
|
|
||||||
|
private ColumnStatsIndices() {
|
||||||
|
}
|
||||||
|
|
||||||
|
public static List<RowData> readColumnStatsIndex(String basePath, HoodieMetadataConfig metadataConfig, String[] targetColumns) {
|
||||||
|
// NOTE: If specific columns have been provided, we can considerably trim down amount of data fetched
|
||||||
|
// by only fetching Column Stats Index records pertaining to the requested columns.
|
||||||
|
// Otherwise, we fall back to read whole Column Stats Index
|
||||||
|
ValidationUtils.checkArgument(targetColumns.length > 0,
|
||||||
|
"Column stats is only valid when push down filters have referenced columns");
|
||||||
|
final List<RowData> metadataRows = readColumnStatsIndexByColumns(basePath, targetColumns, metadataConfig);
|
||||||
|
return projectNestedColStatsColumns(metadataRows);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static List<RowData> projectNestedColStatsColumns(List<RowData> rows) {
|
||||||
|
int pos = HoodieMetadataRecord.SCHEMA$.getField(HoodieMetadataPayload.SCHEMA_FIELD_ID_COLUMN_STATS).pos();
|
||||||
|
RowDataProjection projection = RowDataProjection.instanceV2((RowType) COL_STATS_DATA_TYPE.getLogicalType(), COL_STATS_TARGET_POS);
|
||||||
|
return rows.stream().parallel()
|
||||||
|
.map(row -> {
|
||||||
|
RowData columnStatsField = row.getRow(pos, 9);
|
||||||
|
return projection.project(columnStatsField);
|
||||||
|
}).collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Transposes and converts the raw table format of the Column Stats Index representation,
|
||||||
|
* where each row/record corresponds to individual (column, file) pair, into the table format
|
||||||
|
* where each row corresponds to single file with statistic for individual columns collated
|
||||||
|
* w/in such row:
|
||||||
|
* <p>
|
||||||
|
* Metadata Table Column Stats Index format:
|
||||||
|
*
|
||||||
|
* <pre>
|
||||||
|
* +---------------------------+------------+------------+------------+-------------+
|
||||||
|
* | fileName | columnName | minValue | maxValue | num_nulls |
|
||||||
|
* +---------------------------+------------+------------+------------+-------------+
|
||||||
|
* | one_base_file.parquet | A | 1 | 10 | 0 |
|
||||||
|
* | another_base_file.parquet | A | -10 | 0 | 5 |
|
||||||
|
* +---------------------------+------------+------------+------------+-------------+
|
||||||
|
* </pre>
|
||||||
|
* <p>
|
||||||
|
* Returned table format
|
||||||
|
*
|
||||||
|
* <pre>
|
||||||
|
* +---------------------------+------------+------------+-------------+
|
||||||
|
* | file | A_minValue | A_maxValue | A_nullCount |
|
||||||
|
* +---------------------------+------------+------------+-------------+
|
||||||
|
* | one_base_file.parquet | 1 | 10 | 0 |
|
||||||
|
* | another_base_file.parquet | -10 | 0 | 5 |
|
||||||
|
* +---------------------------+------------+------------+-------------+
|
||||||
|
* </pre>
|
||||||
|
* <p>
|
||||||
|
* NOTE: Column Stats Index might potentially contain statistics for many columns (if not all), while
|
||||||
|
* query at hand might only be referencing a handful of those. As such, we collect all the
|
||||||
|
* column references from the filtering expressions, and only transpose records corresponding to the
|
||||||
|
* columns referenced in those
|
||||||
|
*
|
||||||
|
* @param colStats RowData list bearing raw Column Stats Index table
|
||||||
|
* @param queryColumns target columns to be included into the final table
|
||||||
|
* @param tableSchema schema of the source data table
|
||||||
|
* @return reshaped table according to the format outlined above
|
||||||
|
*/
|
||||||
|
public static Pair<List<RowData>, String[]> transposeColumnStatsIndex(List<RowData> colStats, String[] queryColumns, RowType tableSchema) {
|
||||||
|
|
||||||
|
Map<String, LogicalType> tableFieldTypeMap = tableSchema.getFields().stream()
|
||||||
|
.collect(Collectors.toMap(RowType.RowField::getName, RowType.RowField::getType));
|
||||||
|
|
||||||
|
// NOTE: We have to collect list of indexed columns to make sure we properly align the rows
|
||||||
|
// w/in the transposed dataset: since some files might not have all the columns indexed
|
||||||
|
// either due to the Column Stats Index config changes, schema evolution, etc. we have
|
||||||
|
// to make sure that all the rows w/in transposed data-frame are properly padded (with null
|
||||||
|
// values) for such file-column combinations
|
||||||
|
Set<String> indexedColumns = colStats.stream().map(row -> row.getString(ORD_COL_NAME)
|
||||||
|
.toString()).collect(Collectors.toSet());
|
||||||
|
|
||||||
|
// NOTE: We're sorting the columns to make sure final index schema matches layout
|
||||||
|
// of the transposed table
|
||||||
|
TreeSet<String> sortedTargetColumns = Arrays.stream(queryColumns).sorted()
|
||||||
|
.filter(indexedColumns::contains)
|
||||||
|
.collect(Collectors.toCollection(TreeSet::new));
|
||||||
|
|
||||||
|
Map<StringData, List<RowData>> fileNameToRows = colStats.stream().parallel()
|
||||||
|
.filter(row -> sortedTargetColumns.contains(row.getString(ORD_COL_NAME).toString()))
|
||||||
|
.map(row -> {
|
||||||
|
if (row.isNullAt(ORD_MIN_VAL) && row.isNullAt(ORD_MAX_VAL)) {
|
||||||
|
// Corresponding row could be null in either of the 2 cases
|
||||||
|
// - Column contains only null values (in that case both min/max have to be nulls)
|
||||||
|
// - This is a stubbed Column Stats record (used as a tombstone)
|
||||||
|
return row;
|
||||||
|
} else {
|
||||||
|
String colName = row.getString(ORD_COL_NAME).toString();
|
||||||
|
LogicalType colType = tableFieldTypeMap.get(colName);
|
||||||
|
return unpackMinMaxVal(row, colType);
|
||||||
|
}
|
||||||
|
}).collect(Collectors.groupingBy(rowData -> rowData.getString(ORD_FILE_NAME)));
|
||||||
|
|
||||||
|
return Pair.of(foldRowsByFiles(sortedTargetColumns, fileNameToRows), sortedTargetColumns.toArray(new String[0]));
|
||||||
|
}
|
||||||
|
|
||||||
|
private static List<RowData> foldRowsByFiles(
|
||||||
|
TreeSet<String> sortedTargetColumns,
|
||||||
|
Map<StringData, List<RowData>> fileNameToRows) {
|
||||||
|
return fileNameToRows.values().stream().parallel().map(rows -> {
|
||||||
|
// Rows seq is always non-empty (otherwise it won't be grouped into)
|
||||||
|
StringData fileName = rows.get(0).getString(ORD_FILE_NAME);
|
||||||
|
long valueCount = rows.get(0).getLong(ORD_VAL_CNT);
|
||||||
|
|
||||||
|
// To properly align individual rows (corresponding to a file) w/in the transposed projection, we need
|
||||||
|
// to align existing column-stats for individual file with the list of expected ones for the
|
||||||
|
// whole transposed projection (a superset of all files)
|
||||||
|
Map<String, RowData> columnRowsMap = rows.stream()
|
||||||
|
.collect(Collectors.toMap(row -> row.getString(ORD_COL_NAME).toString(), row -> row));
|
||||||
|
SortedMap<String, RowData> alignedColumnRowsMap = new TreeMap<>();
|
||||||
|
sortedTargetColumns.forEach(col -> alignedColumnRowsMap.put(col, columnRowsMap.get(col)));
|
||||||
|
|
||||||
|
List<Tuple3> columnStats = alignedColumnRowsMap.values().stream().map(row -> {
|
||||||
|
if (row == null) {
|
||||||
|
// NOTE: Since we're assuming missing column to essentially contain exclusively
|
||||||
|
// null values, we set null-count to be equal to value-count (this behavior is
|
||||||
|
// consistent with reading non-existent columns from Parquet)
|
||||||
|
return Tuple3.of(null, null, valueCount);
|
||||||
|
} else {
|
||||||
|
GenericRowData gr = (GenericRowData) row;
|
||||||
|
return Tuple3.of(gr.getField(ORD_MIN_VAL), gr.getField(ORD_MAX_VAL), gr.getField(ORD_NULL_CNT));
|
||||||
|
}
|
||||||
|
}).collect(Collectors.toList());
|
||||||
|
GenericRowData foldedRow = new GenericRowData(2 + 3 * columnStats.size());
|
||||||
|
foldedRow.setField(0, fileName);
|
||||||
|
foldedRow.setField(1, valueCount);
|
||||||
|
for (int i = 0; i < columnStats.size(); i++) {
|
||||||
|
Tuple3 stats = columnStats.get(i);
|
||||||
|
int startPos = 2 + 3 * i;
|
||||||
|
foldedRow.setField(startPos, stats.f0);
|
||||||
|
foldedRow.setField(startPos + 1, stats.f1);
|
||||||
|
foldedRow.setField(startPos + 2, stats.f2);
|
||||||
|
}
|
||||||
|
return foldedRow;
|
||||||
|
}).collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
private static RowData unpackMinMaxVal(
|
||||||
|
RowData row,
|
||||||
|
LogicalType colType) {
|
||||||
|
|
||||||
|
RowData minValueStruct = row.getRow(ORD_MIN_VAL, 1);
|
||||||
|
RowData maxValueStruct = row.getRow(ORD_MAX_VAL, 1);
|
||||||
|
|
||||||
|
checkState(minValueStruct != null && maxValueStruct != null,
|
||||||
|
"Invalid Column Stats record: either both min/max have to be null, or both have to be non-null");
|
||||||
|
|
||||||
|
Object minValue = tryUnpackNonNullVal(minValueStruct, colType);
|
||||||
|
Object maxValue = tryUnpackNonNullVal(maxValueStruct, colType);
|
||||||
|
|
||||||
|
// the column schema:
|
||||||
|
// |- file_name: string
|
||||||
|
// |- min_val: row
|
||||||
|
// |- max_val: row
|
||||||
|
// |- null_cnt: long
|
||||||
|
// |- val_cnt: long
|
||||||
|
// |- column_name: string
|
||||||
|
|
||||||
|
GenericRowData unpackedRow = new GenericRowData(row.getArity());
|
||||||
|
unpackedRow.setField(0, row.getString(0));
|
||||||
|
unpackedRow.setField(1, minValue);
|
||||||
|
unpackedRow.setField(2, maxValue);
|
||||||
|
unpackedRow.setField(3, row.getLong(3));
|
||||||
|
unpackedRow.setField(4, row.getLong(4));
|
||||||
|
unpackedRow.setField(5, row.getString(5));
|
||||||
|
|
||||||
|
return unpackedRow;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Object tryUnpackNonNullVal(RowData rowData, LogicalType colType) {
|
||||||
|
for (int i = 0; i < rowData.getArity(); i++) {
|
||||||
|
// row data converted from avro is definitely generic.
|
||||||
|
Object nested = ((GenericRowData) rowData).getField(i);
|
||||||
|
if (nested != null) {
|
||||||
|
return doUnpack(nested, colType);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Object doUnpack(Object rawVal, LogicalType logicalType) {
|
||||||
|
// fix time unit
|
||||||
|
switch (logicalType.getTypeRoot()) {
|
||||||
|
case TIME_WITHOUT_TIME_ZONE:
|
||||||
|
TimeType timeType = (TimeType) logicalType;
|
||||||
|
if (timeType.getPrecision() == 3) {
|
||||||
|
// the precision in HoodieMetadata is 6
|
||||||
|
rawVal = ((Long) rawVal) / 1000;
|
||||||
|
} else if (timeType.getPrecision() == 9) {
|
||||||
|
rawVal = ((Long) rawVal) * 1000;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case TIMESTAMP_WITHOUT_TIME_ZONE:
|
||||||
|
TimestampType timestampType = (TimestampType) logicalType;
|
||||||
|
if (timestampType.getPrecision() == 3) {
|
||||||
|
// the precision in HoodieMetadata is 6
|
||||||
|
rawVal = ((Long) rawVal) / 1000;
|
||||||
|
} else if (timestampType.getPrecision() == 9) {
|
||||||
|
rawVal = ((Long) rawVal) * 1000;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
// no operation
|
||||||
|
}
|
||||||
|
AvroToRowDataConverters.AvroToRowDataConverter converter = AvroToRowDataConverters.createConverter(logicalType);
|
||||||
|
return converter.convert(rawVal);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static List<RowData> readColumnStatsIndexByColumns(
|
||||||
|
String basePath,
|
||||||
|
String[] targetColumns,
|
||||||
|
HoodieMetadataConfig metadataConfig) {
|
||||||
|
|
||||||
|
// Read Metadata Table's Column Stats Index into Flink's RowData list by
|
||||||
|
// - Fetching the records from CSI by key-prefixes (encoded column names)
|
||||||
|
// - Deserializing fetched records into [[RowData]]s
|
||||||
|
HoodieTableMetadata metadataTable = HoodieTableMetadata.create(
|
||||||
|
HoodieFlinkEngineContext.DEFAULT,
|
||||||
|
metadataConfig, basePath,
|
||||||
|
FileSystemViewStorageConfig.SPILLABLE_DIR.defaultValue());
|
||||||
|
|
||||||
|
// TODO encoding should be done internally w/in HoodieBackedTableMetadata
|
||||||
|
List<String> encodedTargetColumnNames = Arrays.stream(targetColumns)
|
||||||
|
.map(colName -> new ColumnIndexID(colName).asBase64EncodedString()).collect(Collectors.toList());
|
||||||
|
|
||||||
|
HoodieData<HoodieRecord<HoodieMetadataPayload>> records =
|
||||||
|
metadataTable.getRecordsByKeyPrefixes(encodedTargetColumnNames, HoodieTableMetadataUtil.PARTITION_NAME_COLUMN_STATS);
|
||||||
|
|
||||||
|
org.apache.hudi.util.AvroToRowDataConverters.AvroToRowDataConverter converter =
|
||||||
|
AvroToRowDataConverters.createRowConverter((RowType) METADATA_DATA_TYPE.getLogicalType());
|
||||||
|
return records.collectAsList().stream().parallel().map(record -> {
|
||||||
|
// schema and props are ignored for generating metadata record from the payload
|
||||||
|
// instead, the underlying file system, or bloom filter, or columns stats metadata (part of payload) are directly used
|
||||||
|
GenericRecord genericRecord;
|
||||||
|
try {
|
||||||
|
genericRecord = (GenericRecord) record.getData().getInsertValue(null, null).orElse(null);
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new HoodieException("Exception while getting insert value from metadata payload");
|
||||||
|
}
|
||||||
|
return (RowData) converter.convert(genericRecord);
|
||||||
|
}
|
||||||
|
).collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
// -------------------------------------------------------------------------
|
||||||
|
// Utilities
|
||||||
|
// -------------------------------------------------------------------------
|
||||||
|
private static class Tuple3 {
|
||||||
|
public Object f0;
|
||||||
|
public Object f1;
|
||||||
|
public Object f2;
|
||||||
|
|
||||||
|
private Tuple3(Object f0, Object f1, Object f2) {
|
||||||
|
this.f0 = f0;
|
||||||
|
this.f1 = f1;
|
||||||
|
this.f2 = f2;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static Tuple3 of(Object f0, Object f1, Object f2) {
|
||||||
|
return new Tuple3(f0, f1, f2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static DataType getMetadataDataType() {
|
||||||
|
return AvroSchemaConverter.convertToDataType(HoodieMetadataRecord.SCHEMA$);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static DataType getColStatsDataType() {
|
||||||
|
int pos = HoodieMetadataRecord.SCHEMA$.getField(HoodieMetadataPayload.SCHEMA_FIELD_ID_COLUMN_STATS).pos();
|
||||||
|
return METADATA_DATA_TYPE.getChildren().get(pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
// the column schema:
|
||||||
|
// |- file_name: string
|
||||||
|
// |- min_val: row
|
||||||
|
// |- max_val: row
|
||||||
|
// |- null_cnt: long
|
||||||
|
// |- val_cnt: long
|
||||||
|
// |- column_name: string
|
||||||
|
private static int[] getColStatsTargetPos() {
|
||||||
|
RowType colStatsRowType = (RowType) COL_STATS_DATA_TYPE.getLogicalType();
|
||||||
|
return Stream.of(
|
||||||
|
HoodieMetadataPayload.COLUMN_STATS_FIELD_FILE_NAME,
|
||||||
|
HoodieMetadataPayload.COLUMN_STATS_FIELD_MIN_VALUE,
|
||||||
|
HoodieMetadataPayload.COLUMN_STATS_FIELD_MAX_VALUE,
|
||||||
|
HoodieMetadataPayload.COLUMN_STATS_FIELD_NULL_COUNT,
|
||||||
|
HoodieMetadataPayload.COLUMN_STATS_FIELD_VALUE_COUNT,
|
||||||
|
HoodieMetadataPayload.COLUMN_STATS_FIELD_COLUMN_NAME)
|
||||||
|
.mapToInt(colStatsRowType::getFieldIndex)
|
||||||
|
.toArray();
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,552 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hudi.source.stats;
|
||||||
|
|
||||||
|
import org.apache.hudi.common.util.ValidationUtils;
|
||||||
|
import org.apache.hudi.util.ExpressionUtils;
|
||||||
|
|
||||||
|
import org.apache.flink.table.data.RowData;
|
||||||
|
import org.apache.flink.table.expressions.CallExpression;
|
||||||
|
import org.apache.flink.table.expressions.Expression;
|
||||||
|
import org.apache.flink.table.expressions.FieldReferenceExpression;
|
||||||
|
import org.apache.flink.table.expressions.ResolvedExpression;
|
||||||
|
import org.apache.flink.table.expressions.ValueLiteralExpression;
|
||||||
|
import org.apache.flink.table.functions.BuiltInFunctionDefinitions;
|
||||||
|
import org.apache.flink.table.functions.FunctionDefinition;
|
||||||
|
import org.apache.flink.table.types.logical.DecimalType;
|
||||||
|
import org.apache.flink.table.types.logical.LogicalType;
|
||||||
|
import org.apache.flink.table.types.logical.RowType;
|
||||||
|
|
||||||
|
import javax.validation.constraints.NotNull;
|
||||||
|
|
||||||
|
import java.math.BigDecimal;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tool to evaluate the {@link org.apache.flink.table.expressions.ResolvedExpression}s.
|
||||||
|
*/
|
||||||
|
public class ExpressionEvaluator {
|
||||||
|
private static final int IN_PREDICATE_LIMIT = 200;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Filter the index row with specific data filters and query fields.
|
||||||
|
*
|
||||||
|
* @param filters The pushed down data filters
|
||||||
|
* @param indexRow The index row
|
||||||
|
* @param queryFields The query fields referenced by the filters
|
||||||
|
* @return true if the index row should be considered as a candidate
|
||||||
|
*/
|
||||||
|
public static boolean filterExprs(List<ResolvedExpression> filters, RowData indexRow, RowType.RowField[] queryFields) {
|
||||||
|
for (ResolvedExpression filter : filters) {
|
||||||
|
if (!Evaluator.bindCall((CallExpression) filter, indexRow, queryFields).eval()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Used for deciding whether the literal values match the column stats.
|
||||||
|
* The evaluator can be nested.
|
||||||
|
*/
|
||||||
|
public abstract static class Evaluator {
|
||||||
|
// the constant literal value
|
||||||
|
protected Object val;
|
||||||
|
|
||||||
|
// column stats
|
||||||
|
protected Object minVal;
|
||||||
|
protected Object maxVal;
|
||||||
|
protected long nullCnt = 0;
|
||||||
|
|
||||||
|
// referenced field type
|
||||||
|
protected LogicalType type;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Binds the evaluator with specific call expression.
|
||||||
|
*
|
||||||
|
* <p>Three steps to bind the call:
|
||||||
|
* 1. map the evaluator instance;
|
||||||
|
* 2. bind the field reference;
|
||||||
|
* 3. bind the column stats.
|
||||||
|
*
|
||||||
|
* <p>Normalize the expression to simplify the following decision logic:
|
||||||
|
* always put the literal expression in the right.
|
||||||
|
*/
|
||||||
|
public static Evaluator bindCall(CallExpression call, RowData indexRow, RowType.RowField[] queryFields) {
|
||||||
|
FunctionDefinition funDef = call.getFunctionDefinition();
|
||||||
|
List<Expression> childExprs = call.getChildren();
|
||||||
|
|
||||||
|
boolean normalized = childExprs.get(0) instanceof FieldReferenceExpression;
|
||||||
|
final Evaluator evaluator;
|
||||||
|
|
||||||
|
if (BuiltInFunctionDefinitions.NOT.equals(funDef)) {
|
||||||
|
evaluator = Not.getInstance();
|
||||||
|
Evaluator childEvaluator = bindCall((CallExpression) childExprs.get(0), indexRow, queryFields);
|
||||||
|
return ((Not) evaluator).bindEvaluator(childEvaluator);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (BuiltInFunctionDefinitions.AND.equals(funDef)) {
|
||||||
|
evaluator = And.getInstance();
|
||||||
|
Evaluator evaluator1 = bindCall((CallExpression) childExprs.get(0), indexRow, queryFields);
|
||||||
|
Evaluator evaluator2 = bindCall((CallExpression) childExprs.get(1), indexRow, queryFields);
|
||||||
|
return ((And) evaluator).bindEvaluator(evaluator1, evaluator2);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (BuiltInFunctionDefinitions.OR.equals(funDef)) {
|
||||||
|
evaluator = Or.getInstance();
|
||||||
|
Evaluator evaluator1 = bindCall((CallExpression) childExprs.get(0), indexRow, queryFields);
|
||||||
|
Evaluator evaluator2 = bindCall((CallExpression) childExprs.get(1), indexRow, queryFields);
|
||||||
|
return ((Or) evaluator).bindEvaluator(evaluator1, evaluator2);
|
||||||
|
}
|
||||||
|
|
||||||
|
// handle IN specifically
|
||||||
|
if (BuiltInFunctionDefinitions.IN.equals(funDef)) {
|
||||||
|
ValidationUtils.checkState(normalized, "The IN expression expects to be normalized");
|
||||||
|
evaluator = In.getInstance();
|
||||||
|
FieldReferenceExpression rExpr = (FieldReferenceExpression) childExprs.get(0);
|
||||||
|
evaluator.bindFieldReference(rExpr);
|
||||||
|
((In) evaluator).bindVals(getInLiteralVals(childExprs));
|
||||||
|
return evaluator.bindColStats(indexRow, queryFields, rExpr);
|
||||||
|
}
|
||||||
|
|
||||||
|
// handle unary operators
|
||||||
|
if (BuiltInFunctionDefinitions.IS_NULL.equals(funDef)) {
|
||||||
|
FieldReferenceExpression rExpr = (FieldReferenceExpression) childExprs.get(0);
|
||||||
|
return IsNull.getInstance()
|
||||||
|
.bindFieldReference(rExpr)
|
||||||
|
.bindColStats(indexRow, queryFields, rExpr);
|
||||||
|
} else if (BuiltInFunctionDefinitions.IS_NOT_NULL.equals(funDef)) {
|
||||||
|
FieldReferenceExpression rExpr = (FieldReferenceExpression) childExprs.get(0);
|
||||||
|
return IsNotNull.getInstance()
|
||||||
|
.bindFieldReference(rExpr)
|
||||||
|
.bindColStats(indexRow, queryFields, rExpr);
|
||||||
|
}
|
||||||
|
|
||||||
|
// handle binary operators
|
||||||
|
if (BuiltInFunctionDefinitions.EQUALS.equals(funDef)) {
|
||||||
|
evaluator = EqualTo.getInstance();
|
||||||
|
} else if (BuiltInFunctionDefinitions.NOT_EQUALS.equals(funDef)) {
|
||||||
|
evaluator = NotEqualTo.getInstance();
|
||||||
|
} else if (BuiltInFunctionDefinitions.LESS_THAN.equals(funDef)) {
|
||||||
|
evaluator = normalized ? LessThan.getInstance() : GreaterThan.getInstance();
|
||||||
|
} else if (BuiltInFunctionDefinitions.GREATER_THAN.equals(funDef)) {
|
||||||
|
evaluator = normalized ? GreaterThan.getInstance() : LessThan.getInstance();
|
||||||
|
} else if (BuiltInFunctionDefinitions.LESS_THAN_OR_EQUAL.equals(funDef)) {
|
||||||
|
evaluator = normalized ? LessThanOrEqual.getInstance() : GreaterThanOrEqual.getInstance();
|
||||||
|
} else if (BuiltInFunctionDefinitions.GREATER_THAN_OR_EQUAL.equals(funDef)) {
|
||||||
|
evaluator = normalized ? GreaterThanOrEqual.getInstance() : LessThanOrEqual.getInstance();
|
||||||
|
} else {
|
||||||
|
throw new AssertionError("Unexpected function definition " + funDef);
|
||||||
|
}
|
||||||
|
FieldReferenceExpression rExpr = normalized
|
||||||
|
? (FieldReferenceExpression) childExprs.get(0)
|
||||||
|
: (FieldReferenceExpression) childExprs.get(1);
|
||||||
|
ValueLiteralExpression vExpr = normalized
|
||||||
|
? (ValueLiteralExpression) childExprs.get(1)
|
||||||
|
: (ValueLiteralExpression) childExprs.get(0);
|
||||||
|
evaluator
|
||||||
|
.bindFieldReference(rExpr)
|
||||||
|
.bindVal(vExpr)
|
||||||
|
.bindColStats(indexRow, queryFields, rExpr);
|
||||||
|
return evaluator;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Evaluator bindColStats(
|
||||||
|
RowData indexRow,
|
||||||
|
RowType.RowField[] queryFields,
|
||||||
|
FieldReferenceExpression expr) {
|
||||||
|
int colPos = -1;
|
||||||
|
for (int i = 0; i < queryFields.length; i++) {
|
||||||
|
if (expr.getName().equals(queryFields[i].getName())) {
|
||||||
|
colPos = i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ValidationUtils.checkState(colPos != -1, "Can not find column " + expr.getName());
|
||||||
|
int startPos = 2 + colPos * 3;
|
||||||
|
LogicalType colType = queryFields[colPos].getType();
|
||||||
|
Object minVal = indexRow.isNullAt(startPos) ? null : getValAsJavaObj(indexRow, startPos, colType);
|
||||||
|
Object maxVal = indexRow.isNullAt(startPos + 1) ? null : getValAsJavaObj(indexRow, startPos + 1, colType);
|
||||||
|
long nullCnt = indexRow.getLong(startPos + 2);
|
||||||
|
|
||||||
|
this.minVal = minVal;
|
||||||
|
this.maxVal = maxVal;
|
||||||
|
this.nullCnt = nullCnt;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Evaluator bindVal(ValueLiteralExpression vExpr) {
|
||||||
|
this.val = ExpressionUtils.getValueFromLiteral(vExpr);
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Evaluator bindFieldReference(FieldReferenceExpression expr) {
|
||||||
|
this.type = expr.getOutputDataType().getLogicalType();
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
public abstract boolean eval();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* To evaluate = expr.
|
||||||
|
*/
|
||||||
|
public static class EqualTo extends Evaluator {
|
||||||
|
|
||||||
|
public static EqualTo getInstance() {
|
||||||
|
return new EqualTo();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean eval() {
|
||||||
|
if (this.minVal == null || this.maxVal == null || this.val == null) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (compare(this.minVal, this.val, this.type) > 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return compare(this.maxVal, this.val, this.type) >= 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* To evaluate <> expr.
|
||||||
|
*/
|
||||||
|
public static class NotEqualTo extends Evaluator {
|
||||||
|
public static NotEqualTo getInstance() {
|
||||||
|
return new NotEqualTo();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean eval() {
|
||||||
|
// because the bounds are not necessarily a min or max value, this cannot be answered using
|
||||||
|
// them. notEq(col, X) with (X, Y) doesn't guarantee that X is a value in col.
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* To evaluate IS NULL expr.
|
||||||
|
*/
|
||||||
|
public static class IsNull extends Evaluator {
|
||||||
|
public static IsNull getInstance() {
|
||||||
|
return new IsNull();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean eval() {
|
||||||
|
return this.nullCnt > 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* To evaluate IS NOT NULL expr.
|
||||||
|
*/
|
||||||
|
public static class IsNotNull extends Evaluator {
|
||||||
|
public static IsNotNull getInstance() {
|
||||||
|
return new IsNotNull();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean eval() {
|
||||||
|
// should consider FLOAT/DOUBLE & NAN
|
||||||
|
return this.minVal != null || this.nullCnt <= 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* To evaluate < expr.
|
||||||
|
*/
|
||||||
|
public static class LessThan extends Evaluator {
|
||||||
|
public static LessThan getInstance() {
|
||||||
|
return new LessThan();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean eval() {
|
||||||
|
if (this.minVal == null) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return compare(this.minVal, this.val, this.type) < 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* To evaluate > expr.
|
||||||
|
*/
|
||||||
|
public static class GreaterThan extends Evaluator {
|
||||||
|
public static GreaterThan getInstance() {
|
||||||
|
return new GreaterThan();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean eval() {
|
||||||
|
if (this.maxVal == null) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return compare(this.maxVal, this.val, this.type) > 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* To evaluate <= expr.
|
||||||
|
*/
|
||||||
|
public static class LessThanOrEqual extends Evaluator {
|
||||||
|
public static LessThanOrEqual getInstance() {
|
||||||
|
return new LessThanOrEqual();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean eval() {
|
||||||
|
if (this.minVal == null) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return compare(this.minVal, this.val, this.type) <= 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* To evaluate >= expr.
|
||||||
|
*/
|
||||||
|
public static class GreaterThanOrEqual extends Evaluator {
|
||||||
|
public static GreaterThanOrEqual getInstance() {
|
||||||
|
return new GreaterThanOrEqual();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean eval() {
|
||||||
|
if (this.maxVal == null) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return compare(this.maxVal, this.val, this.type) >= 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* To evaluate IN expr.
|
||||||
|
*/
|
||||||
|
public static class In extends Evaluator {
|
||||||
|
public static In getInstance() {
|
||||||
|
return new In();
|
||||||
|
}
|
||||||
|
|
||||||
|
private Object[] vals;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean eval() {
|
||||||
|
if (this.minVal == null) {
|
||||||
|
return false; // values are all null and literalSet cannot contain null.
|
||||||
|
}
|
||||||
|
|
||||||
|
if (vals.length > IN_PREDICATE_LIMIT) {
|
||||||
|
// skip evaluating the predicate if the number of values is too big
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
vals = Arrays.stream(vals).filter(v -> compare(this.minVal, v, this.type) <= 0).toArray();
|
||||||
|
if (vals.length == 0) { // if all values are less than lower bound, rows cannot match.
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
vals = Arrays.stream(vals).filter(v -> compare(this.maxVal, v, this.type) >= 0).toArray();
|
||||||
|
if (vals.length == 0) { // if all remaining values are greater than upper bound, rows cannot match.
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void bindVals(Object... vals) {
|
||||||
|
this.vals = vals;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// component predicate
|
||||||
|
/**
|
||||||
|
* To evaluate NOT expr.
|
||||||
|
*/
|
||||||
|
public static class Not extends Evaluator {
|
||||||
|
public static Not getInstance() {
|
||||||
|
return new Not();
|
||||||
|
}
|
||||||
|
|
||||||
|
private Evaluator evaluator;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean eval() {
|
||||||
|
return !this.evaluator.eval();
|
||||||
|
}
|
||||||
|
|
||||||
|
public Evaluator bindEvaluator(Evaluator evaluator) {
|
||||||
|
this.evaluator = evaluator;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* To evaluate AND expr.
|
||||||
|
*/
|
||||||
|
public static class And extends Evaluator {
|
||||||
|
public static And getInstance() {
|
||||||
|
return new And();
|
||||||
|
}
|
||||||
|
|
||||||
|
private Evaluator[] evaluators;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean eval() {
|
||||||
|
for (Evaluator evaluator : evaluators) {
|
||||||
|
if (!evaluator.eval()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Evaluator bindEvaluator(Evaluator... evaluators) {
|
||||||
|
this.evaluators = evaluators;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* To evaluate OR expr.
|
||||||
|
*/
|
||||||
|
public static class Or extends Evaluator {
|
||||||
|
public static Or getInstance() {
|
||||||
|
return new Or();
|
||||||
|
}
|
||||||
|
|
||||||
|
private Evaluator[] evaluators;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean eval() {
|
||||||
|
for (Evaluator evaluator : evaluators) {
|
||||||
|
if (evaluator.eval()) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Evaluator bindEvaluator(Evaluator... evaluators) {
|
||||||
|
this.evaluators = evaluators;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// -------------------------------------------------------------------------
|
||||||
|
// Utilities
|
||||||
|
// -------------------------------------------------------------------------
|
||||||
|
|
||||||
|
private static int compare(@NotNull Object val1, @NotNull Object val2, LogicalType logicalType) {
|
||||||
|
switch (logicalType.getTypeRoot()) {
|
||||||
|
case TIMESTAMP_WITHOUT_TIME_ZONE:
|
||||||
|
case TIME_WITHOUT_TIME_ZONE:
|
||||||
|
case DATE:
|
||||||
|
return ((Long) val1).compareTo((Long) val2);
|
||||||
|
case BOOLEAN:
|
||||||
|
return ((Boolean) val1).compareTo((Boolean) val2);
|
||||||
|
case TINYINT:
|
||||||
|
case SMALLINT:
|
||||||
|
case INTEGER:
|
||||||
|
return ((Integer) val1).compareTo((Integer) val2);
|
||||||
|
case FLOAT:
|
||||||
|
return ((Float) val1).compareTo((Float) val2);
|
||||||
|
case DOUBLE:
|
||||||
|
return ((Double) val1).compareTo((Double) val2);
|
||||||
|
case BINARY:
|
||||||
|
case VARBINARY:
|
||||||
|
return compareBytes((byte[]) val1, (byte[]) val2);
|
||||||
|
case CHAR:
|
||||||
|
case VARCHAR:
|
||||||
|
return ((String) val1).compareTo((String) val2);
|
||||||
|
case DECIMAL:
|
||||||
|
return ((BigDecimal) val1).compareTo((BigDecimal) val2);
|
||||||
|
default:
|
||||||
|
throw new UnsupportedOperationException("Unsupported type: " + logicalType);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static int compareBytes(byte[] v1, byte[] v2) {
|
||||||
|
int len1 = v1.length;
|
||||||
|
int len2 = v2.length;
|
||||||
|
int lim = Math.min(len1, len2);
|
||||||
|
|
||||||
|
int k = 0;
|
||||||
|
while (k < lim) {
|
||||||
|
byte c1 = v1[k];
|
||||||
|
byte c2 = v2[k];
|
||||||
|
if (c1 != c2) {
|
||||||
|
return c1 - c2;
|
||||||
|
}
|
||||||
|
k++;
|
||||||
|
}
|
||||||
|
return len1 - len2;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the IN expression literal values.
|
||||||
|
*/
|
||||||
|
private static Object[] getInLiteralVals(List<Expression> childExprs) {
|
||||||
|
List<Object> vals = new ArrayList<>();
|
||||||
|
for (int i = 1; i < childExprs.size(); i++) {
|
||||||
|
vals.add(ExpressionUtils.getValueFromLiteral((ValueLiteralExpression) childExprs.get(i)));
|
||||||
|
}
|
||||||
|
return vals.toArray();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the value as Java object at position {@code pos} of row {@code indexRow}.
|
||||||
|
*/
|
||||||
|
private static Object getValAsJavaObj(RowData indexRow, int pos, LogicalType colType) {
|
||||||
|
switch (colType.getTypeRoot()) {
|
||||||
|
// NOTE: Since we can't rely on Avro's "date", and "timestamp-micros" logical-types, we're
|
||||||
|
// manually encoding corresponding values as int and long w/in the Column Stats Index and
|
||||||
|
// here we have to decode those back into corresponding logical representation.
|
||||||
|
case TIMESTAMP_WITHOUT_TIME_ZONE:
|
||||||
|
case TIME_WITHOUT_TIME_ZONE:
|
||||||
|
case DATE:
|
||||||
|
return indexRow.getLong(pos);
|
||||||
|
// NOTE: All integral types of size less than Int are encoded as Ints in MT
|
||||||
|
case BOOLEAN:
|
||||||
|
return indexRow.getBoolean(pos);
|
||||||
|
case TINYINT:
|
||||||
|
case SMALLINT:
|
||||||
|
case INTEGER:
|
||||||
|
return indexRow.getInt(pos);
|
||||||
|
case FLOAT:
|
||||||
|
return indexRow.getFloat(pos);
|
||||||
|
case DOUBLE:
|
||||||
|
return indexRow.getDouble(pos);
|
||||||
|
case BINARY:
|
||||||
|
case VARBINARY:
|
||||||
|
return indexRow.getBinary(pos);
|
||||||
|
case CHAR:
|
||||||
|
case VARCHAR:
|
||||||
|
return indexRow.getString(pos).toString();
|
||||||
|
case DECIMAL:
|
||||||
|
DecimalType decimalType = (DecimalType) colType;
|
||||||
|
return indexRow.getDecimal(pos, decimalType.getPrecision(), decimalType.getScale()).toBigDecimal();
|
||||||
|
default:
|
||||||
|
throw new UnsupportedOperationException("Unsupported type: " + colType);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -67,7 +67,7 @@ public class HoodieTableSink implements DynamicTableSink, SupportsPartitioning,
|
|||||||
.getCheckpointConfig().getCheckpointTimeout();
|
.getCheckpointConfig().getCheckpointTimeout();
|
||||||
conf.setLong(FlinkOptions.WRITE_COMMIT_ACK_TIMEOUT, ckpTimeout);
|
conf.setLong(FlinkOptions.WRITE_COMMIT_ACK_TIMEOUT, ckpTimeout);
|
||||||
|
|
||||||
RowType rowType = (RowType) schema.toSourceRowDataType().notNull().getLogicalType();
|
RowType rowType = (RowType) schema.toSinkRowDataType().notNull().getLogicalType();
|
||||||
|
|
||||||
// bulk_insert mode
|
// bulk_insert mode
|
||||||
final String writeOperation = this.conf.get(FlinkOptions.OPERATION);
|
final String writeOperation = this.conf.get(FlinkOptions.OPERATION);
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ package org.apache.hudi.table;
|
|||||||
|
|
||||||
import org.apache.hudi.avro.HoodieAvroUtils;
|
import org.apache.hudi.avro.HoodieAvroUtils;
|
||||||
import org.apache.hudi.common.model.BaseFile;
|
import org.apache.hudi.common.model.BaseFile;
|
||||||
|
import org.apache.hudi.common.model.HoodieBaseFile;
|
||||||
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
||||||
import org.apache.hudi.common.model.HoodieLogFile;
|
import org.apache.hudi.common.model.HoodieLogFile;
|
||||||
import org.apache.hudi.common.model.HoodieTableType;
|
import org.apache.hudi.common.model.HoodieTableType;
|
||||||
@@ -34,7 +35,6 @@ import org.apache.hudi.configuration.FlinkOptions;
|
|||||||
import org.apache.hudi.configuration.HadoopConfigurations;
|
import org.apache.hudi.configuration.HadoopConfigurations;
|
||||||
import org.apache.hudi.configuration.OptionsResolver;
|
import org.apache.hudi.configuration.OptionsResolver;
|
||||||
import org.apache.hudi.exception.HoodieException;
|
import org.apache.hudi.exception.HoodieException;
|
||||||
import org.apache.hudi.hadoop.HoodieROTablePathFilter;
|
|
||||||
import org.apache.hudi.source.FileIndex;
|
import org.apache.hudi.source.FileIndex;
|
||||||
import org.apache.hudi.source.IncrementalInputSplits;
|
import org.apache.hudi.source.IncrementalInputSplits;
|
||||||
import org.apache.hudi.source.StreamReadMonitoringFunction;
|
import org.apache.hudi.source.StreamReadMonitoringFunction;
|
||||||
@@ -46,13 +46,12 @@ import org.apache.hudi.table.format.mor.MergeOnReadInputSplit;
|
|||||||
import org.apache.hudi.table.format.mor.MergeOnReadTableState;
|
import org.apache.hudi.table.format.mor.MergeOnReadTableState;
|
||||||
import org.apache.hudi.util.AvroSchemaConverter;
|
import org.apache.hudi.util.AvroSchemaConverter;
|
||||||
import org.apache.hudi.util.ChangelogModes;
|
import org.apache.hudi.util.ChangelogModes;
|
||||||
|
import org.apache.hudi.util.ExpressionUtils;
|
||||||
import org.apache.hudi.util.InputFormats;
|
import org.apache.hudi.util.InputFormats;
|
||||||
import org.apache.hudi.util.StreamerUtil;
|
import org.apache.hudi.util.StreamerUtil;
|
||||||
|
|
||||||
import org.apache.avro.Schema;
|
import org.apache.avro.Schema;
|
||||||
import org.apache.flink.annotation.VisibleForTesting;
|
import org.apache.flink.annotation.VisibleForTesting;
|
||||||
import org.apache.flink.api.common.io.FileInputFormat;
|
|
||||||
import org.apache.flink.api.common.io.FilePathFilter;
|
|
||||||
import org.apache.flink.api.common.io.InputFormat;
|
import org.apache.flink.api.common.io.InputFormat;
|
||||||
import org.apache.flink.api.common.typeinfo.TypeInformation;
|
import org.apache.flink.api.common.typeinfo.TypeInformation;
|
||||||
import org.apache.flink.configuration.Configuration;
|
import org.apache.flink.configuration.Configuration;
|
||||||
@@ -73,7 +72,6 @@ import org.apache.flink.table.connector.source.abilities.SupportsLimitPushDown;
|
|||||||
import org.apache.flink.table.connector.source.abilities.SupportsPartitionPushDown;
|
import org.apache.flink.table.connector.source.abilities.SupportsPartitionPushDown;
|
||||||
import org.apache.flink.table.connector.source.abilities.SupportsProjectionPushDown;
|
import org.apache.flink.table.connector.source.abilities.SupportsProjectionPushDown;
|
||||||
import org.apache.flink.table.data.RowData;
|
import org.apache.flink.table.data.RowData;
|
||||||
import org.apache.flink.table.expressions.Expression;
|
|
||||||
import org.apache.flink.table.expressions.ResolvedExpression;
|
import org.apache.flink.table.expressions.ResolvedExpression;
|
||||||
import org.apache.flink.table.runtime.types.TypeInfoDataTypeConverter;
|
import org.apache.flink.table.runtime.types.TypeInfoDataTypeConverter;
|
||||||
import org.apache.flink.table.types.DataType;
|
import org.apache.flink.table.types.DataType;
|
||||||
@@ -117,6 +115,7 @@ public class HoodieTableSource implements
|
|||||||
private final long maxCompactionMemoryInBytes;
|
private final long maxCompactionMemoryInBytes;
|
||||||
|
|
||||||
private final ResolvedSchema schema;
|
private final ResolvedSchema schema;
|
||||||
|
private final RowType tableRowType;
|
||||||
private final Path path;
|
private final Path path;
|
||||||
private final List<String> partitionKeys;
|
private final List<String> partitionKeys;
|
||||||
private final String defaultPartName;
|
private final String defaultPartName;
|
||||||
@@ -125,7 +124,7 @@ public class HoodieTableSource implements
|
|||||||
|
|
||||||
private int[] requiredPos;
|
private int[] requiredPos;
|
||||||
private long limit;
|
private long limit;
|
||||||
private List<Expression> filters;
|
private List<ResolvedExpression> filters;
|
||||||
|
|
||||||
private List<Map<String, String>> requiredPartitions;
|
private List<Map<String, String>> requiredPartitions;
|
||||||
|
|
||||||
@@ -147,21 +146,22 @@ public class HoodieTableSource implements
|
|||||||
@Nullable List<Map<String, String>> requiredPartitions,
|
@Nullable List<Map<String, String>> requiredPartitions,
|
||||||
@Nullable int[] requiredPos,
|
@Nullable int[] requiredPos,
|
||||||
@Nullable Long limit,
|
@Nullable Long limit,
|
||||||
@Nullable List<Expression> filters) {
|
@Nullable List<ResolvedExpression> filters) {
|
||||||
this.schema = schema;
|
this.schema = schema;
|
||||||
|
this.tableRowType = (RowType) schema.toPhysicalRowDataType().notNull().getLogicalType();
|
||||||
this.path = path;
|
this.path = path;
|
||||||
this.partitionKeys = partitionKeys;
|
this.partitionKeys = partitionKeys;
|
||||||
this.defaultPartName = defaultPartName;
|
this.defaultPartName = defaultPartName;
|
||||||
this.conf = conf;
|
this.conf = conf;
|
||||||
this.fileIndex = FileIndex.instance(this.path, this.conf);
|
|
||||||
this.requiredPartitions = requiredPartitions;
|
this.requiredPartitions = requiredPartitions;
|
||||||
this.requiredPos = requiredPos == null
|
this.requiredPos = requiredPos == null
|
||||||
? IntStream.range(0, schema.toPhysicalRowDataType().getChildren().size()).toArray()
|
? IntStream.range(0, this.tableRowType.getFieldCount()).toArray()
|
||||||
: requiredPos;
|
: requiredPos;
|
||||||
this.limit = limit == null ? NO_LIMIT_CONSTANT : limit;
|
this.limit = limit == null ? NO_LIMIT_CONSTANT : limit;
|
||||||
this.filters = filters == null ? Collections.emptyList() : filters;
|
this.filters = filters == null ? Collections.emptyList() : filters;
|
||||||
this.hadoopConf = HadoopConfigurations.getHadoopConf(conf);
|
this.hadoopConf = HadoopConfigurations.getHadoopConf(conf);
|
||||||
this.metaClient = StreamerUtil.metaClientForReader(conf, hadoopConf);
|
this.metaClient = StreamerUtil.metaClientForReader(conf, hadoopConf);
|
||||||
|
this.fileIndex = FileIndex.instance(this.path, this.conf, this.tableRowType);
|
||||||
this.maxCompactionMemoryInBytes = StreamerUtil.getMaxCompactionMemoryInBytes(conf);
|
this.maxCompactionMemoryInBytes = StreamerUtil.getMaxCompactionMemoryInBytes(conf);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -181,12 +181,12 @@ public class HoodieTableSource implements
|
|||||||
(TypeInformation<RowData>) TypeInfoDataTypeConverter.fromDataTypeToTypeInfo(getProducedDataType());
|
(TypeInformation<RowData>) TypeInfoDataTypeConverter.fromDataTypeToTypeInfo(getProducedDataType());
|
||||||
if (conf.getBoolean(FlinkOptions.READ_AS_STREAMING)) {
|
if (conf.getBoolean(FlinkOptions.READ_AS_STREAMING)) {
|
||||||
StreamReadMonitoringFunction monitoringFunction = new StreamReadMonitoringFunction(
|
StreamReadMonitoringFunction monitoringFunction = new StreamReadMonitoringFunction(
|
||||||
conf, FilePathUtils.toFlinkPath(path), maxCompactionMemoryInBytes, getRequiredPartitionPaths());
|
conf, FilePathUtils.toFlinkPath(path), tableRowType, maxCompactionMemoryInBytes, getRequiredPartitionPaths());
|
||||||
InputFormat<RowData, ?> inputFormat = getInputFormat(true);
|
InputFormat<RowData, ?> inputFormat = getInputFormat(true);
|
||||||
OneInputStreamOperatorFactory<MergeOnReadInputSplit, RowData> factory = StreamReadOperator.factory((MergeOnReadInputFormat) inputFormat);
|
OneInputStreamOperatorFactory<MergeOnReadInputSplit, RowData> factory = StreamReadOperator.factory((MergeOnReadInputFormat) inputFormat);
|
||||||
SingleOutputStreamOperator<RowData> source = execEnv.addSource(monitoringFunction, getSourceOperatorName("split_monitor"))
|
SingleOutputStreamOperator<RowData> source = execEnv.addSource(monitoringFunction, getSourceOperatorName("split_monitor"))
|
||||||
.setParallelism(1)
|
.setParallelism(1)
|
||||||
.keyBy(inputSplit -> inputSplit.getFileId())
|
.keyBy(MergeOnReadInputSplit::getFileId)
|
||||||
.transform("split_reader", typeInfo, factory)
|
.transform("split_reader", typeInfo, factory)
|
||||||
.setParallelism(conf.getInteger(FlinkOptions.READ_TASKS));
|
.setParallelism(conf.getInteger(FlinkOptions.READ_TASKS));
|
||||||
return new DataStreamSource<>(source);
|
return new DataStreamSource<>(source);
|
||||||
@@ -219,7 +219,8 @@ public class HoodieTableSource implements
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Result applyFilters(List<ResolvedExpression> filters) {
|
public Result applyFilters(List<ResolvedExpression> filters) {
|
||||||
this.filters = new ArrayList<>(filters);
|
this.filters = filters.stream().filter(ExpressionUtils::isSimpleCallExpression).collect(Collectors.toList());
|
||||||
|
this.fileIndex.setFilters(this.filters);
|
||||||
// refuse all the filters now
|
// refuse all the filters now
|
||||||
return SupportsFilterPushDown.Result.of(Collections.emptyList(), new ArrayList<>(filters));
|
return SupportsFilterPushDown.Result.of(Collections.emptyList(), new ArrayList<>(filters));
|
||||||
}
|
}
|
||||||
@@ -262,13 +263,6 @@ public class HoodieTableSource implements
|
|||||||
.bridgedTo(RowData.class);
|
.bridgedTo(RowData.class);
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<Map<String, String>> getOrFetchPartitions() {
|
|
||||||
if (requiredPartitions == null) {
|
|
||||||
requiredPartitions = listPartitions().orElse(Collections.emptyList());
|
|
||||||
}
|
|
||||||
return requiredPartitions;
|
|
||||||
}
|
|
||||||
|
|
||||||
private String getSourceOperatorName(String operatorName) {
|
private String getSourceOperatorName(String operatorName) {
|
||||||
String[] schemaFieldNames = this.schema.getColumnNames().toArray(new String[0]);
|
String[] schemaFieldNames = this.schema.getColumnNames().toArray(new String[0]);
|
||||||
List<String> fields = Arrays.stream(this.requiredPos)
|
List<String> fields = Arrays.stream(this.requiredPos)
|
||||||
@@ -366,7 +360,9 @@ public class HoodieTableSource implements
|
|||||||
return baseFileOnlyInputFormat();
|
return baseFileOnlyInputFormat();
|
||||||
case FlinkOptions.QUERY_TYPE_INCREMENTAL:
|
case FlinkOptions.QUERY_TYPE_INCREMENTAL:
|
||||||
IncrementalInputSplits incrementalInputSplits = IncrementalInputSplits.builder()
|
IncrementalInputSplits incrementalInputSplits = IncrementalInputSplits.builder()
|
||||||
.conf(conf).path(FilePathUtils.toFlinkPath(path))
|
.conf(conf)
|
||||||
|
.path(FilePathUtils.toFlinkPath(path))
|
||||||
|
.rowType(this.tableRowType)
|
||||||
.maxCompactionMemoryInBytes(maxCompactionMemoryInBytes)
|
.maxCompactionMemoryInBytes(maxCompactionMemoryInBytes)
|
||||||
.requiredPartitions(getRequiredPartitionPaths()).build();
|
.requiredPartitions(getRequiredPartitionPaths()).build();
|
||||||
final IncrementalInputSplits.Result result = incrementalInputSplits.inputSplits(metaClient, hadoopConf);
|
final IncrementalInputSplits.Result result = incrementalInputSplits.inputSplits(metaClient, hadoopConf);
|
||||||
@@ -439,11 +435,18 @@ public class HoodieTableSource implements
|
|||||||
}
|
}
|
||||||
|
|
||||||
private InputFormat<RowData, ?> baseFileOnlyInputFormat() {
|
private InputFormat<RowData, ?> baseFileOnlyInputFormat() {
|
||||||
final Path[] paths = getReadPaths();
|
final FileStatus[] fileStatuses = getReadFiles();
|
||||||
if (paths.length == 0) {
|
if (fileStatuses.length == 0) {
|
||||||
return InputFormats.EMPTY_INPUT_FORMAT;
|
return InputFormats.EMPTY_INPUT_FORMAT;
|
||||||
}
|
}
|
||||||
FileInputFormat<RowData> format = new CopyOnWriteInputFormat(
|
|
||||||
|
HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient,
|
||||||
|
metaClient.getCommitsAndCompactionTimeline().filterCompletedInstants(), fileStatuses);
|
||||||
|
Path[] paths = fsView.getLatestBaseFiles()
|
||||||
|
.map(HoodieBaseFile::getFileStatus)
|
||||||
|
.map(FileStatus::getPath).toArray(Path[]::new);
|
||||||
|
|
||||||
|
return new CopyOnWriteInputFormat(
|
||||||
FilePathUtils.toFlinkPaths(paths),
|
FilePathUtils.toFlinkPaths(paths),
|
||||||
this.schema.getColumnNames().toArray(new String[0]),
|
this.schema.getColumnNames().toArray(new String[0]),
|
||||||
this.schema.getColumnDataTypes().toArray(new DataType[0]),
|
this.schema.getColumnDataTypes().toArray(new DataType[0]),
|
||||||
@@ -453,12 +456,10 @@ public class HoodieTableSource implements
|
|||||||
getParquetConf(this.conf, this.hadoopConf),
|
getParquetConf(this.conf, this.hadoopConf),
|
||||||
this.conf.getBoolean(FlinkOptions.UTC_TIMEZONE)
|
this.conf.getBoolean(FlinkOptions.UTC_TIMEZONE)
|
||||||
);
|
);
|
||||||
format.setFilesFilter(new LatestFileFilter(this.hadoopConf));
|
|
||||||
return format;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private Schema inferSchemaFromDdl() {
|
private Schema inferSchemaFromDdl() {
|
||||||
Schema schema = AvroSchemaConverter.convertToSchema(this.schema.toPhysicalRowDataType().getLogicalType());
|
Schema schema = AvroSchemaConverter.convertToSchema(this.tableRowType);
|
||||||
return HoodieAvroUtils.addMetadataFields(schema, conf.getBoolean(FlinkOptions.CHANGELOG_ENABLED));
|
return HoodieAvroUtils.addMetadataFields(schema, conf.getBoolean(FlinkOptions.CHANGELOG_ENABLED));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -498,23 +499,13 @@ public class HoodieTableSource implements
|
|||||||
* Get the reader paths with partition path expanded.
|
* Get the reader paths with partition path expanded.
|
||||||
*/
|
*/
|
||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
public Path[] getReadPaths() {
|
public FileStatus[] getReadFiles() {
|
||||||
return partitionKeys.isEmpty()
|
Set<String> requiredPartitionPaths = getRequiredPartitionPaths();
|
||||||
? new Path[] {path}
|
fileIndex.setPartitionPaths(requiredPartitionPaths);
|
||||||
: FilePathUtils.partitionPath2ReadPath(path, partitionKeys, getOrFetchPartitions(),
|
List<String> relPartitionPaths = fileIndex.getOrBuildPartitionPaths();
|
||||||
conf.getBoolean(FlinkOptions.HIVE_STYLE_PARTITIONING));
|
if (relPartitionPaths.size() == 0) {
|
||||||
}
|
return new FileStatus[0];
|
||||||
|
|
||||||
private static class LatestFileFilter extends FilePathFilter {
|
|
||||||
private final HoodieROTablePathFilter hoodieFilter;
|
|
||||||
|
|
||||||
public LatestFileFilter(org.apache.hadoop.conf.Configuration hadoopConf) {
|
|
||||||
this.hoodieFilter = new HoodieROTablePathFilter(hadoopConf);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean filterPath(org.apache.flink.core.fs.Path filePath) {
|
|
||||||
return !this.hoodieFilter.accept(new Path(filePath.toUri()));
|
|
||||||
}
|
}
|
||||||
|
return fileIndex.getFilesInPartitions();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -323,6 +323,10 @@ public class FilePathUtils {
|
|||||||
public static LinkedHashMap<String, String> validateAndReorderPartitions(
|
public static LinkedHashMap<String, String> validateAndReorderPartitions(
|
||||||
Map<String, String> partitionKVs,
|
Map<String, String> partitionKVs,
|
||||||
List<String> partitionKeys) {
|
List<String> partitionKeys) {
|
||||||
|
if (partitionKeys.size() == 0) {
|
||||||
|
// in case the partition fields are not in schema
|
||||||
|
return new LinkedHashMap<>(partitionKVs);
|
||||||
|
}
|
||||||
LinkedHashMap<String, String> map = new LinkedHashMap<>();
|
LinkedHashMap<String, String> map = new LinkedHashMap<>();
|
||||||
for (String k : partitionKeys) {
|
for (String k : partitionKeys) {
|
||||||
if (!partitionKVs.containsKey(k)) {
|
if (!partitionKVs.containsKey(k)) {
|
||||||
|
|||||||
@@ -39,6 +39,7 @@ import org.apache.flink.table.types.logical.TypeInformationRawType;
|
|||||||
import org.apache.flink.table.types.logical.utils.LogicalTypeChecks;
|
import org.apache.flink.table.types.logical.utils.LogicalTypeChecks;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Converts an Avro schema into Flink's type information. It uses {@link org.apache.flink.api.java.typeutils.RowTypeInfo} for
|
* Converts an Avro schema into Flink's type information. It uses {@link org.apache.flink.api.java.typeutils.RowTypeInfo} for
|
||||||
@@ -96,9 +97,24 @@ public class AvroSchemaConverter {
|
|||||||
actualSchema = schema.getTypes().get(0);
|
actualSchema = schema.getTypes().get(0);
|
||||||
nullable = false;
|
nullable = false;
|
||||||
} else {
|
} else {
|
||||||
|
List<Schema> nonNullTypes = schema.getTypes().stream()
|
||||||
|
.filter(s -> s.getType() != Schema.Type.NULL)
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
nullable = schema.getTypes().size() > nonNullTypes.size();
|
||||||
|
|
||||||
// use Kryo for serialization
|
// use Kryo for serialization
|
||||||
return new AtomicDataType(
|
DataType rawDataType = new AtomicDataType(
|
||||||
new TypeInformationRawType<>(false, Types.GENERIC(Object.class)));
|
new TypeInformationRawType<>(false, Types.GENERIC(Object.class)))
|
||||||
|
.notNull();
|
||||||
|
|
||||||
|
if (recordTypesOfSameNumFields(nonNullTypes)) {
|
||||||
|
DataType converted = DataTypes.ROW(
|
||||||
|
DataTypes.FIELD("wrapper", rawDataType))
|
||||||
|
.notNull();
|
||||||
|
return nullable ? converted.nullable() : converted;
|
||||||
|
}
|
||||||
|
// use Kryo for serialization
|
||||||
|
return nullable ? rawDataType.nullable() : rawDataType;
|
||||||
}
|
}
|
||||||
DataType converted = convertToDataType(actualSchema);
|
DataType converted = convertToDataType(actualSchema);
|
||||||
return nullable ? converted.nullable() : converted;
|
return nullable ? converted.nullable() : converted;
|
||||||
@@ -155,6 +171,20 @@ public class AvroSchemaConverter {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns true if all the types are RECORD type with same number of fields.
|
||||||
|
*/
|
||||||
|
private static boolean recordTypesOfSameNumFields(List<Schema> types) {
|
||||||
|
if (types == null || types.size() == 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (types.stream().anyMatch(s -> s.getType() != Schema.Type.RECORD)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
int numFields = types.get(0).getFields().size();
|
||||||
|
return types.stream().allMatch(s -> s.getFields().size() == numFields);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Converts Flink SQL {@link LogicalType} (can be nested) into an Avro schema.
|
* Converts Flink SQL {@link LogicalType} (can be nested) into an Avro schema.
|
||||||
*
|
*
|
||||||
|
|||||||
@@ -106,7 +106,7 @@ public class AvroToRowDataConverters {
|
|||||||
/**
|
/**
|
||||||
* Creates a runtime converter which assuming input object is not null.
|
* Creates a runtime converter which assuming input object is not null.
|
||||||
*/
|
*/
|
||||||
private static AvroToRowDataConverter createConverter(LogicalType type) {
|
public static AvroToRowDataConverter createConverter(LogicalType type) {
|
||||||
switch (type.getTypeRoot()) {
|
switch (type.getTypeRoot()) {
|
||||||
case NULL:
|
case NULL:
|
||||||
return avroObject -> null;
|
return avroObject -> null;
|
||||||
@@ -121,6 +121,7 @@ public class AvroToRowDataConverters {
|
|||||||
case INTERVAL_DAY_TIME: // long
|
case INTERVAL_DAY_TIME: // long
|
||||||
case FLOAT: // float
|
case FLOAT: // float
|
||||||
case DOUBLE: // double
|
case DOUBLE: // double
|
||||||
|
case RAW:
|
||||||
return avroObject -> avroObject;
|
return avroObject -> avroObject;
|
||||||
case DATE:
|
case DATE:
|
||||||
return AvroToRowDataConverters::convertToDate;
|
return AvroToRowDataConverters::convertToDate;
|
||||||
@@ -143,7 +144,6 @@ public class AvroToRowDataConverters {
|
|||||||
case MAP:
|
case MAP:
|
||||||
case MULTISET:
|
case MULTISET:
|
||||||
return createMapConverter(type);
|
return createMapConverter(type);
|
||||||
case RAW:
|
|
||||||
default:
|
default:
|
||||||
throw new UnsupportedOperationException("Unsupported type: " + type);
|
throw new UnsupportedOperationException("Unsupported type: " + type);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -23,8 +23,11 @@ import org.apache.hudi.common.util.ValidationUtils;
|
|||||||
import org.apache.flink.table.types.DataType;
|
import org.apache.flink.table.types.DataType;
|
||||||
import org.apache.flink.table.types.logical.LogicalType;
|
import org.apache.flink.table.types.logical.LogicalType;
|
||||||
import org.apache.flink.table.types.logical.LogicalTypeRoot;
|
import org.apache.flink.table.types.logical.LogicalTypeRoot;
|
||||||
|
import org.apache.flink.table.types.logical.RowType;
|
||||||
import org.apache.flink.table.types.logical.TimestampType;
|
import org.apache.flink.table.types.logical.TimestampType;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Utilities for {@link org.apache.flink.table.types.DataType}.
|
* Utilities for {@link org.apache.flink.table.types.DataType}.
|
||||||
*/
|
*/
|
||||||
@@ -58,4 +61,12 @@ public class DataTypeUtils {
|
|||||||
public static boolean isDatetimeType(DataType type) {
|
public static boolean isDatetimeType(DataType type) {
|
||||||
return isTimestampType(type) || isDateType(type);
|
return isTimestampType(type) || isDateType(type);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Projects the row fields with given names.
|
||||||
|
*/
|
||||||
|
public static RowType.RowField[] projectRowFields(RowType rowType, String[] names) {
|
||||||
|
int [] fieldIndices = Arrays.stream(names).mapToInt(rowType::getFieldIndex).toArray();
|
||||||
|
return Arrays.stream(fieldIndices).mapToObj(i -> rowType.getFields().get(i)).toArray(RowType.RowField[]::new);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,180 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hudi.util;
|
||||||
|
|
||||||
|
import org.apache.flink.table.expressions.CallExpression;
|
||||||
|
import org.apache.flink.table.expressions.Expression;
|
||||||
|
import org.apache.flink.table.expressions.FieldReferenceExpression;
|
||||||
|
import org.apache.flink.table.expressions.ResolvedExpression;
|
||||||
|
import org.apache.flink.table.expressions.ValueLiteralExpression;
|
||||||
|
import org.apache.flink.table.functions.BuiltInFunctionDefinitions;
|
||||||
|
import org.apache.flink.table.functions.FunctionDefinition;
|
||||||
|
import org.apache.flink.table.types.logical.LogicalType;
|
||||||
|
|
||||||
|
import javax.annotation.Nullable;
|
||||||
|
|
||||||
|
import java.math.BigDecimal;
|
||||||
|
import java.time.LocalDate;
|
||||||
|
import java.time.LocalDateTime;
|
||||||
|
import java.time.LocalTime;
|
||||||
|
import java.time.ZoneOffset;
|
||||||
|
import java.time.temporal.ChronoField;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Utilities for expression resolving.
|
||||||
|
*/
|
||||||
|
public class ExpressionUtils {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Collect the referenced columns with given expressions,
|
||||||
|
* only simple call expression is supported.
|
||||||
|
*/
|
||||||
|
public static String[] referencedColumns(List<ResolvedExpression> exprs) {
|
||||||
|
return exprs.stream()
|
||||||
|
.map(ExpressionUtils::getReferencedColumns)
|
||||||
|
.filter(columns -> columns.length > 0)
|
||||||
|
.flatMap(Arrays::stream)
|
||||||
|
.distinct() // deduplication
|
||||||
|
.toArray(String[]::new);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns whether the given expression is simple call expression:
|
||||||
|
* a binary call with one operand as field reference and another operand
|
||||||
|
* as literal.
|
||||||
|
*/
|
||||||
|
public static boolean isSimpleCallExpression(Expression expr) {
|
||||||
|
if (!(expr instanceof CallExpression)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
CallExpression callExpression = (CallExpression) expr;
|
||||||
|
FunctionDefinition funcDef = callExpression.getFunctionDefinition();
|
||||||
|
// simple call list:
|
||||||
|
// NOT AND OR IN EQUALS NOT_EQUALS IS_NULL IS_NOT_NULL LESS_THAN GREATER_THAN
|
||||||
|
// LESS_THAN_OR_EQUAL GREATER_THAN_OR_EQUAL
|
||||||
|
|
||||||
|
if (funcDef == BuiltInFunctionDefinitions.NOT
|
||||||
|
|| funcDef == BuiltInFunctionDefinitions.AND
|
||||||
|
|| funcDef == BuiltInFunctionDefinitions.OR) {
|
||||||
|
return callExpression.getChildren().stream()
|
||||||
|
.allMatch(ExpressionUtils::isSimpleCallExpression);
|
||||||
|
}
|
||||||
|
if (!(funcDef == BuiltInFunctionDefinitions.IN
|
||||||
|
|| funcDef == BuiltInFunctionDefinitions.EQUALS
|
||||||
|
|| funcDef == BuiltInFunctionDefinitions.NOT_EQUALS
|
||||||
|
|| funcDef == BuiltInFunctionDefinitions.IS_NULL
|
||||||
|
|| funcDef == BuiltInFunctionDefinitions.IS_NOT_NULL
|
||||||
|
|| funcDef == BuiltInFunctionDefinitions.LESS_THAN
|
||||||
|
|| funcDef == BuiltInFunctionDefinitions.GREATER_THAN
|
||||||
|
|| funcDef == BuiltInFunctionDefinitions.LESS_THAN_OR_EQUAL
|
||||||
|
|| funcDef == BuiltInFunctionDefinitions.GREATER_THAN_OR_EQUAL)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
// handle IN
|
||||||
|
if (funcDef == BuiltInFunctionDefinitions.IN) {
|
||||||
|
// In expression RHS operands are always literals
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
// handle unary operator
|
||||||
|
if (funcDef == BuiltInFunctionDefinitions.IS_NULL
|
||||||
|
|| funcDef == BuiltInFunctionDefinitions.IS_NOT_NULL) {
|
||||||
|
return callExpression.getChildren().stream()
|
||||||
|
.allMatch(e -> e instanceof FieldReferenceExpression);
|
||||||
|
}
|
||||||
|
// handle binary operator
|
||||||
|
return isFieldReferenceAndLiteral(callExpression.getChildren());
|
||||||
|
}
|
||||||
|
|
||||||
|
private static boolean isFieldReferenceAndLiteral(List<Expression> exprs) {
|
||||||
|
if (exprs.size() != 2) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
final Expression expr0 = exprs.get(0);
|
||||||
|
final Expression expr1 = exprs.get(1);
|
||||||
|
return expr0 instanceof FieldReferenceExpression && expr1 instanceof ValueLiteralExpression
|
||||||
|
|| expr0 instanceof ValueLiteralExpression && expr1 instanceof FieldReferenceExpression;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String[] getReferencedColumns(ResolvedExpression expression) {
|
||||||
|
CallExpression callExpr = (CallExpression) expression;
|
||||||
|
FunctionDefinition funcDef = callExpr.getFunctionDefinition();
|
||||||
|
if (funcDef == BuiltInFunctionDefinitions.NOT
|
||||||
|
|| funcDef == BuiltInFunctionDefinitions.AND
|
||||||
|
|| funcDef == BuiltInFunctionDefinitions.OR) {
|
||||||
|
return callExpr.getChildren().stream()
|
||||||
|
.map(e -> getReferencedColumns((ResolvedExpression) e))
|
||||||
|
.flatMap(Arrays::stream)
|
||||||
|
.toArray(String[]::new);
|
||||||
|
}
|
||||||
|
|
||||||
|
return expression.getChildren().stream()
|
||||||
|
.filter(expr -> expr instanceof FieldReferenceExpression)
|
||||||
|
.map(expr -> ((FieldReferenceExpression) expr).getName())
|
||||||
|
.toArray(String[]::new);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the value with given value literal expression.
|
||||||
|
*
|
||||||
|
* <p>Returns null if the value can not parse as the output data type correctly,
|
||||||
|
* should call {@code ValueLiteralExpression.isNull} first to decide whether
|
||||||
|
* the literal is NULL.
|
||||||
|
*/
|
||||||
|
@Nullable
|
||||||
|
public static Object getValueFromLiteral(ValueLiteralExpression expr) {
|
||||||
|
LogicalType logicalType = expr.getOutputDataType().getLogicalType();
|
||||||
|
switch (logicalType.getTypeRoot()) {
|
||||||
|
case TIMESTAMP_WITHOUT_TIME_ZONE:
|
||||||
|
return expr.getValueAs(LocalDateTime.class)
|
||||||
|
.map(ldt -> ldt.toInstant(ZoneOffset.UTC).toEpochMilli())
|
||||||
|
.orElse(null);
|
||||||
|
case TIME_WITHOUT_TIME_ZONE:
|
||||||
|
return expr.getValueAs(LocalTime.class)
|
||||||
|
.map(lt -> lt.get(ChronoField.MILLI_OF_DAY))
|
||||||
|
.orElse(null);
|
||||||
|
case DATE:
|
||||||
|
return expr.getValueAs(LocalDate.class)
|
||||||
|
.map(LocalDate::toEpochDay)
|
||||||
|
.orElse(null);
|
||||||
|
// NOTE: All integral types of size less than Int are encoded as Ints in MT
|
||||||
|
case BOOLEAN:
|
||||||
|
return expr.getValueAs(Boolean.class).orElse(null);
|
||||||
|
case TINYINT:
|
||||||
|
case SMALLINT:
|
||||||
|
case INTEGER:
|
||||||
|
return expr.getValueAs(Integer.class).orElse(null);
|
||||||
|
case FLOAT:
|
||||||
|
return expr.getValueAs(Float.class).orElse(null);
|
||||||
|
case DOUBLE:
|
||||||
|
return expr.getValueAs(Double.class).orElse(null);
|
||||||
|
case BINARY:
|
||||||
|
case VARBINARY:
|
||||||
|
return expr.getValueAs(byte[].class).orElse(null);
|
||||||
|
case CHAR:
|
||||||
|
case VARCHAR:
|
||||||
|
return expr.getValueAs(String.class).orElse(null);
|
||||||
|
case DECIMAL:
|
||||||
|
return expr.getValueAs(BigDecimal.class).orElse(null);
|
||||||
|
default:
|
||||||
|
throw new UnsupportedOperationException("Unsupported type: " + logicalType);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -26,6 +26,8 @@ import org.apache.flink.table.types.logical.LogicalType;
|
|||||||
import org.apache.flink.table.types.logical.RowType;
|
import org.apache.flink.table.types.logical.RowType;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Utilities to project the row data with given positions.
|
* Utilities to project the row data with given positions.
|
||||||
@@ -51,6 +53,12 @@ public class RowDataProjection implements Serializable {
|
|||||||
return new RowDataProjection(types, positions);
|
return new RowDataProjection(types, positions);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static RowDataProjection instanceV2(RowType rowType, int[] positions) {
|
||||||
|
List<LogicalType> fieldTypes = rowType.getChildren();
|
||||||
|
final LogicalType[] types = Arrays.stream(positions).mapToObj(fieldTypes::get).toArray(LogicalType[]::new);
|
||||||
|
return new RowDataProjection(types, positions);
|
||||||
|
}
|
||||||
|
|
||||||
public static RowDataProjection instance(LogicalType[] types, int[] positions) {
|
public static RowDataProjection instance(LogicalType[] types, int[] positions) {
|
||||||
return new RowDataProjection(types, positions);
|
return new RowDataProjection(types, positions);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -18,12 +18,12 @@
|
|||||||
|
|
||||||
package org.apache.hudi.util;
|
package org.apache.hudi.util;
|
||||||
|
|
||||||
import org.apache.flink.configuration.Configuration;
|
|
||||||
import org.apache.hudi.common.fs.FSUtils;
|
import org.apache.hudi.common.fs.FSUtils;
|
||||||
import org.apache.hudi.common.table.view.FileSystemViewStorageConfig;
|
import org.apache.hudi.common.table.view.FileSystemViewStorageConfig;
|
||||||
import org.apache.hudi.configuration.HadoopConfigurations;
|
import org.apache.hudi.configuration.HadoopConfigurations;
|
||||||
import org.apache.hudi.exception.HoodieIOException;
|
import org.apache.hudi.exception.HoodieIOException;
|
||||||
|
|
||||||
|
import org.apache.flink.configuration.Configuration;
|
||||||
import org.apache.hadoop.fs.FSDataInputStream;
|
import org.apache.hadoop.fs.FSDataInputStream;
|
||||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
|
|||||||
@@ -18,11 +18,10 @@
|
|||||||
|
|
||||||
package org.apache.hudi.sink.utils;
|
package org.apache.hudi.sink.utils;
|
||||||
|
|
||||||
import org.apache.flink.configuration.Configuration;
|
|
||||||
|
|
||||||
import org.apache.hudi.configuration.FlinkOptions;
|
import org.apache.hudi.configuration.FlinkOptions;
|
||||||
import org.apache.hudi.hive.HiveSyncConfig;
|
import org.apache.hudi.hive.HiveSyncConfig;
|
||||||
|
|
||||||
|
import org.apache.flink.configuration.Configuration;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
import java.lang.reflect.Method;
|
import java.lang.reflect.Method;
|
||||||
|
|||||||
@@ -58,7 +58,7 @@ public class TestFileIndex {
|
|||||||
conf.setBoolean(FlinkOptions.METADATA_ENABLED, true);
|
conf.setBoolean(FlinkOptions.METADATA_ENABLED, true);
|
||||||
conf.setBoolean(FlinkOptions.HIVE_STYLE_PARTITIONING, hiveStylePartitioning);
|
conf.setBoolean(FlinkOptions.HIVE_STYLE_PARTITIONING, hiveStylePartitioning);
|
||||||
TestData.writeData(TestData.DATA_SET_INSERT, conf);
|
TestData.writeData(TestData.DATA_SET_INSERT, conf);
|
||||||
FileIndex fileIndex = FileIndex.instance(new Path(tempFile.getAbsolutePath()), conf);
|
FileIndex fileIndex = FileIndex.instance(new Path(tempFile.getAbsolutePath()), conf, TestConfigurations.ROW_TYPE);
|
||||||
List<String> partitionKeys = Collections.singletonList("partition");
|
List<String> partitionKeys = Collections.singletonList("partition");
|
||||||
List<Map<String, String>> partitions = fileIndex.getPartitions(partitionKeys, "default", hiveStylePartitioning);
|
List<Map<String, String>> partitions = fileIndex.getPartitions(partitionKeys, "default", hiveStylePartitioning);
|
||||||
assertTrue(partitions.stream().allMatch(m -> m.size() == 1));
|
assertTrue(partitions.stream().allMatch(m -> m.size() == 1));
|
||||||
@@ -79,7 +79,7 @@ public class TestFileIndex {
|
|||||||
conf.setString(FlinkOptions.KEYGEN_CLASS_NAME, NonpartitionedAvroKeyGenerator.class.getName());
|
conf.setString(FlinkOptions.KEYGEN_CLASS_NAME, NonpartitionedAvroKeyGenerator.class.getName());
|
||||||
conf.setBoolean(FlinkOptions.METADATA_ENABLED, true);
|
conf.setBoolean(FlinkOptions.METADATA_ENABLED, true);
|
||||||
TestData.writeData(TestData.DATA_SET_INSERT, conf);
|
TestData.writeData(TestData.DATA_SET_INSERT, conf);
|
||||||
FileIndex fileIndex = FileIndex.instance(new Path(tempFile.getAbsolutePath()), conf);
|
FileIndex fileIndex = FileIndex.instance(new Path(tempFile.getAbsolutePath()), conf, TestConfigurations.ROW_TYPE);
|
||||||
List<String> partitionKeys = Collections.singletonList("");
|
List<String> partitionKeys = Collections.singletonList("");
|
||||||
List<Map<String, String>> partitions = fileIndex.getPartitions(partitionKeys, "default", false);
|
List<Map<String, String>> partitions = fileIndex.getPartitions(partitionKeys, "default", false);
|
||||||
assertThat(partitions.size(), is(0));
|
assertThat(partitions.size(), is(0));
|
||||||
@@ -94,7 +94,7 @@ public class TestFileIndex {
|
|||||||
void testFileListingEmptyTable(boolean enableMetadata) {
|
void testFileListingEmptyTable(boolean enableMetadata) {
|
||||||
Configuration conf = TestConfigurations.getDefaultConf(tempFile.getAbsolutePath());
|
Configuration conf = TestConfigurations.getDefaultConf(tempFile.getAbsolutePath());
|
||||||
conf.setBoolean(FlinkOptions.METADATA_ENABLED, enableMetadata);
|
conf.setBoolean(FlinkOptions.METADATA_ENABLED, enableMetadata);
|
||||||
FileIndex fileIndex = FileIndex.instance(new Path(tempFile.getAbsolutePath()), conf);
|
FileIndex fileIndex = FileIndex.instance(new Path(tempFile.getAbsolutePath()), conf, TestConfigurations.ROW_TYPE);
|
||||||
List<String> partitionKeys = Collections.singletonList("partition");
|
List<String> partitionKeys = Collections.singletonList("partition");
|
||||||
List<Map<String, String>> partitions = fileIndex.getPartitions(partitionKeys, "default", false);
|
List<Map<String, String>> partitions = fileIndex.getPartitions(partitionKeys, "default", false);
|
||||||
assertThat(partitions.size(), is(0));
|
assertThat(partitions.size(), is(0));
|
||||||
|
|||||||
@@ -0,0 +1,98 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hudi.source.stats;
|
||||||
|
|
||||||
|
import org.apache.hudi.common.config.HoodieMetadataConfig;
|
||||||
|
import org.apache.hudi.common.util.collection.Pair;
|
||||||
|
import org.apache.hudi.configuration.FlinkOptions;
|
||||||
|
import org.apache.hudi.utils.TestConfigurations;
|
||||||
|
import org.apache.hudi.utils.TestData;
|
||||||
|
|
||||||
|
import org.apache.flink.configuration.Configuration;
|
||||||
|
import org.apache.flink.table.data.GenericRowData;
|
||||||
|
import org.apache.flink.table.data.RowData;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.junit.jupiter.api.io.TempDir;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import static org.hamcrest.CoreMatchers.is;
|
||||||
|
import static org.hamcrest.MatcherAssert.assertThat;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test cases for {@link ColumnStatsIndices}.
|
||||||
|
*/
|
||||||
|
public class TestColumnStatsIndices {
|
||||||
|
@TempDir
|
||||||
|
File tempFile;
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testTransposeColumnStatsIndex() throws Exception {
|
||||||
|
final String path = tempFile.getAbsolutePath();
|
||||||
|
Configuration conf = TestConfigurations.getDefaultConf(path);
|
||||||
|
conf.setBoolean(FlinkOptions.METADATA_ENABLED, true);
|
||||||
|
conf.setBoolean(FlinkOptions.READ_DATA_SKIPPING_ENABLED, true);
|
||||||
|
conf.setString("hoodie.metadata.index.column.stats.enable", "true");
|
||||||
|
|
||||||
|
HoodieMetadataConfig metadataConfig = HoodieMetadataConfig.newBuilder()
|
||||||
|
.enable(true)
|
||||||
|
.withMetadataIndexColumnStats(true)
|
||||||
|
.build();
|
||||||
|
TestData.writeData(TestData.DATA_SET_INSERT, conf);
|
||||||
|
|
||||||
|
// explicit query columns
|
||||||
|
String[] queryColumns1 = {"uuid", "age"};
|
||||||
|
List<RowData> indexRows1 = ColumnStatsIndices.readColumnStatsIndex(path, metadataConfig, queryColumns1);
|
||||||
|
Pair<List<RowData>, String[]> transposedIndexTable1 = ColumnStatsIndices
|
||||||
|
.transposeColumnStatsIndex(indexRows1, queryColumns1, TestConfigurations.ROW_TYPE);
|
||||||
|
assertThat("The schema columns should sort by natural order",
|
||||||
|
Arrays.toString(transposedIndexTable1.getRight()), is("[age, uuid]"));
|
||||||
|
List<RowData> transposed1 = filterOutFileNames(transposedIndexTable1.getLeft());
|
||||||
|
assertThat(transposed1.size(), is(4));
|
||||||
|
final String expected = "["
|
||||||
|
+ "+I(2,18,20,0,id5,id6,0), "
|
||||||
|
+ "+I(2,23,33,0,id1,id2,0), "
|
||||||
|
+ "+I(2,31,53,0,id3,id4,0), "
|
||||||
|
+ "+I(2,44,56,0,id7,id8,0)]";
|
||||||
|
assertThat(transposed1.toString(), is(expected));
|
||||||
|
|
||||||
|
// no query columns, only for tests
|
||||||
|
assertThrows(IllegalArgumentException.class,
|
||||||
|
() -> ColumnStatsIndices.readColumnStatsIndex(path, metadataConfig, new String[0]));
|
||||||
|
}
|
||||||
|
|
||||||
|
private static List<RowData> filterOutFileNames(List<RowData> indexRows) {
|
||||||
|
return indexRows.stream().map(row -> {
|
||||||
|
GenericRowData gr = (GenericRowData) row;
|
||||||
|
GenericRowData converted = new GenericRowData(gr.getArity() - 1);
|
||||||
|
for (int i = 1; i < gr.getArity(); i++) {
|
||||||
|
converted.setField(i - 1, gr.getField(i));
|
||||||
|
}
|
||||||
|
return converted;
|
||||||
|
})
|
||||||
|
// sort by age min values
|
||||||
|
.sorted(Comparator.comparingInt(r -> r.getInt(1)))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,374 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hudi.source.stats;
|
||||||
|
|
||||||
|
import org.apache.hudi.utils.TestData;
|
||||||
|
|
||||||
|
import org.apache.flink.table.api.DataTypes;
|
||||||
|
import org.apache.flink.table.data.RowData;
|
||||||
|
import org.apache.flink.table.data.StringData;
|
||||||
|
import org.apache.flink.table.data.TimestampData;
|
||||||
|
import org.apache.flink.table.expressions.FieldReferenceExpression;
|
||||||
|
import org.apache.flink.table.expressions.ValueLiteralExpression;
|
||||||
|
import org.apache.flink.table.types.DataType;
|
||||||
|
import org.apache.flink.table.types.logical.RowType;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test cases for {@link ExpressionEvaluator}.
|
||||||
|
*/
|
||||||
|
public class TestExpressionEvaluator {
|
||||||
|
private static final DataType ROW_DATA_TYPE = DataTypes.ROW(
|
||||||
|
DataTypes.FIELD("f_tinyint", DataTypes.TINYINT()),
|
||||||
|
DataTypes.FIELD("f_smallint", DataTypes.SMALLINT()),
|
||||||
|
DataTypes.FIELD("f_int", DataTypes.INT()),
|
||||||
|
DataTypes.FIELD("f_long", DataTypes.BIGINT()),
|
||||||
|
DataTypes.FIELD("f_float", DataTypes.FLOAT()),
|
||||||
|
DataTypes.FIELD("f_double", DataTypes.DOUBLE()),
|
||||||
|
DataTypes.FIELD("f_boolean", DataTypes.BOOLEAN()),
|
||||||
|
DataTypes.FIELD("f_decimal", DataTypes.DECIMAL(10, 2)),
|
||||||
|
DataTypes.FIELD("f_bytes", DataTypes.VARBINARY(10)),
|
||||||
|
DataTypes.FIELD("f_string", DataTypes.VARCHAR(10)),
|
||||||
|
DataTypes.FIELD("f_time", DataTypes.TIME(3)),
|
||||||
|
DataTypes.FIELD("f_date", DataTypes.DATE()),
|
||||||
|
DataTypes.FIELD("f_timestamp", DataTypes.TIMESTAMP(3))
|
||||||
|
).notNull();
|
||||||
|
private static final DataType INDEX_ROW_DATA_TYPE = DataTypes.ROW(
|
||||||
|
DataTypes.FIELD("file_name", DataTypes.STRING()),
|
||||||
|
DataTypes.FIELD("value_cnt", DataTypes.BIGINT()),
|
||||||
|
DataTypes.FIELD("f_int_min", DataTypes.INT()),
|
||||||
|
DataTypes.FIELD("f_int_max", DataTypes.INT()),
|
||||||
|
DataTypes.FIELD("f_int_null_cnt", DataTypes.BIGINT()),
|
||||||
|
DataTypes.FIELD("f_string_min", DataTypes.VARCHAR(10)),
|
||||||
|
DataTypes.FIELD("f_string_max", DataTypes.VARCHAR(10)),
|
||||||
|
DataTypes.FIELD("f_string_null_cnt", DataTypes.BIGINT()),
|
||||||
|
DataTypes.FIELD("f_timestamp_min", DataTypes.TIMESTAMP(3)),
|
||||||
|
DataTypes.FIELD("f_timestamp_max", DataTypes.TIMESTAMP(3)),
|
||||||
|
DataTypes.FIELD("f_timestamp_null_cnt", DataTypes.BIGINT())
|
||||||
|
).notNull();
|
||||||
|
|
||||||
|
private static final RowType INDEX_ROW_TYPE = (RowType) INDEX_ROW_DATA_TYPE.getLogicalType();
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testEqualTo() {
|
||||||
|
ExpressionEvaluator.EqualTo equalTo = ExpressionEvaluator.EqualTo.getInstance();
|
||||||
|
FieldReferenceExpression rExpr = new FieldReferenceExpression("f_int", DataTypes.INT(), 2, 2);
|
||||||
|
ValueLiteralExpression vExpr = new ValueLiteralExpression(12);
|
||||||
|
|
||||||
|
RowData indexRow1 = intIndexRow(11, 13);
|
||||||
|
equalTo.bindFieldReference(rExpr)
|
||||||
|
.bindVal(vExpr)
|
||||||
|
.bindColStats(indexRow1, queryFields(2), rExpr);
|
||||||
|
assertTrue(equalTo.eval(), "11 < 12 < 13");
|
||||||
|
|
||||||
|
RowData indexRow2 = intIndexRow(12, 13);
|
||||||
|
equalTo.bindColStats(indexRow2, queryFields(2), rExpr);
|
||||||
|
assertTrue(equalTo.eval(), "12 <= 12 < 13");
|
||||||
|
|
||||||
|
RowData indexRow3 = intIndexRow(11, 12);
|
||||||
|
equalTo.bindColStats(indexRow3, queryFields(2), rExpr);
|
||||||
|
assertTrue(equalTo.eval(), "11 < 12 <= 12");
|
||||||
|
|
||||||
|
RowData indexRow4 = intIndexRow(10, 11);
|
||||||
|
equalTo.bindColStats(indexRow4, queryFields(2), rExpr);
|
||||||
|
assertFalse(equalTo.eval(), "11 < 12");
|
||||||
|
|
||||||
|
RowData indexRow5 = intIndexRow(13, 14);
|
||||||
|
equalTo.bindColStats(indexRow5, queryFields(2), rExpr);
|
||||||
|
assertFalse(equalTo.eval(), "12 < 13");
|
||||||
|
|
||||||
|
RowData indexRow6 = intIndexRow(null, null);
|
||||||
|
equalTo.bindColStats(indexRow6, queryFields(2), rExpr);
|
||||||
|
assertFalse(equalTo.eval(), "12 <> null");
|
||||||
|
|
||||||
|
equalTo.bindVal(new ValueLiteralExpression(null, DataTypes.INT()));
|
||||||
|
assertFalse(equalTo.eval(), "null <> null");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testNotEqualTo() {
|
||||||
|
ExpressionEvaluator.NotEqualTo notEqualTo = ExpressionEvaluator.NotEqualTo.getInstance();
|
||||||
|
FieldReferenceExpression rExpr = new FieldReferenceExpression("f_int", DataTypes.INT(), 2, 2);
|
||||||
|
ValueLiteralExpression vExpr = new ValueLiteralExpression(12);
|
||||||
|
|
||||||
|
RowData indexRow1 = intIndexRow(11, 13);
|
||||||
|
notEqualTo.bindFieldReference(rExpr)
|
||||||
|
.bindVal(vExpr)
|
||||||
|
.bindColStats(indexRow1, queryFields(2), rExpr);
|
||||||
|
assertTrue(notEqualTo.eval(), "11 <> 12 && 12 <> 13");
|
||||||
|
|
||||||
|
RowData indexRow2 = intIndexRow(12, 13);
|
||||||
|
notEqualTo.bindColStats(indexRow2, queryFields(2), rExpr);
|
||||||
|
assertTrue(notEqualTo.eval(), "12 <> 13");
|
||||||
|
|
||||||
|
RowData indexRow3 = intIndexRow(11, 12);
|
||||||
|
notEqualTo.bindColStats(indexRow3, queryFields(2), rExpr);
|
||||||
|
assertTrue(notEqualTo.eval(), "11 <> 12");
|
||||||
|
|
||||||
|
RowData indexRow4 = intIndexRow(10, 11);
|
||||||
|
notEqualTo.bindColStats(indexRow4, queryFields(2), rExpr);
|
||||||
|
assertTrue(notEqualTo.eval(), "10 <> 12 and 11 < 12");
|
||||||
|
|
||||||
|
RowData indexRow5 = intIndexRow(13, 14);
|
||||||
|
notEqualTo.bindColStats(indexRow5, queryFields(2), rExpr);
|
||||||
|
assertTrue(notEqualTo.eval(), "12 <> 13 and 12 <> 14");
|
||||||
|
|
||||||
|
RowData indexRow6 = intIndexRow(null, null);
|
||||||
|
notEqualTo.bindColStats(indexRow6, queryFields(2), rExpr);
|
||||||
|
assertTrue(notEqualTo.eval(), "12 <> null");
|
||||||
|
|
||||||
|
notEqualTo.bindVal(new ValueLiteralExpression(null, DataTypes.INT()));
|
||||||
|
assertTrue(notEqualTo.eval(), "null <> null");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testIsNull() {
|
||||||
|
ExpressionEvaluator.IsNull isNull = ExpressionEvaluator.IsNull.getInstance();
|
||||||
|
FieldReferenceExpression rExpr = new FieldReferenceExpression("f_int", DataTypes.INT(), 2, 2);
|
||||||
|
|
||||||
|
RowData indexRow1 = intIndexRow(11, 13);
|
||||||
|
isNull.bindFieldReference(rExpr)
|
||||||
|
.bindColStats(indexRow1, queryFields(2), rExpr);
|
||||||
|
assertTrue(isNull.eval(), "2 nulls");
|
||||||
|
|
||||||
|
RowData indexRow2 = intIndexRow(12, 13, 0L);
|
||||||
|
isNull.bindColStats(indexRow2, queryFields(2), rExpr);
|
||||||
|
assertFalse(isNull.eval(), "0 nulls");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testIsNotNull() {
|
||||||
|
ExpressionEvaluator.IsNotNull isNotNull = ExpressionEvaluator.IsNotNull.getInstance();
|
||||||
|
FieldReferenceExpression rExpr = new FieldReferenceExpression("f_int", DataTypes.INT(), 2, 2);
|
||||||
|
|
||||||
|
RowData indexRow1 = intIndexRow(11, 13);
|
||||||
|
isNotNull.bindFieldReference(rExpr)
|
||||||
|
.bindColStats(indexRow1, queryFields(2), rExpr);
|
||||||
|
assertTrue(isNotNull.eval(), "min 11 is not null");
|
||||||
|
|
||||||
|
RowData indexRow2 = intIndexRow(null, null, 0L);
|
||||||
|
isNotNull.bindColStats(indexRow2, queryFields(2), rExpr);
|
||||||
|
assertTrue(isNotNull.eval(), "min is null and 0 nulls");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testLessThan() {
|
||||||
|
ExpressionEvaluator.LessThan lessThan = ExpressionEvaluator.LessThan.getInstance();
|
||||||
|
FieldReferenceExpression rExpr = new FieldReferenceExpression("f_int", DataTypes.INT(), 2, 2);
|
||||||
|
ValueLiteralExpression vExpr = new ValueLiteralExpression(12);
|
||||||
|
|
||||||
|
RowData indexRow1 = intIndexRow(11, 13);
|
||||||
|
lessThan.bindFieldReference(rExpr)
|
||||||
|
.bindVal(vExpr)
|
||||||
|
.bindColStats(indexRow1, queryFields(2), rExpr);
|
||||||
|
assertTrue(lessThan.eval(), "12 < 13");
|
||||||
|
|
||||||
|
RowData indexRow2 = intIndexRow(12, 13);
|
||||||
|
lessThan.bindColStats(indexRow2, queryFields(2), rExpr);
|
||||||
|
assertFalse(lessThan.eval(), "min 12 = 12");
|
||||||
|
|
||||||
|
RowData indexRow3 = intIndexRow(11, 12);
|
||||||
|
lessThan.bindColStats(indexRow3, queryFields(2), rExpr);
|
||||||
|
assertTrue(lessThan.eval(), "11 < 12");
|
||||||
|
|
||||||
|
RowData indexRow4 = intIndexRow(10, 11);
|
||||||
|
lessThan.bindColStats(indexRow4, queryFields(2), rExpr);
|
||||||
|
assertTrue(lessThan.eval(), "11 < 12");
|
||||||
|
|
||||||
|
RowData indexRow5 = intIndexRow(13, 14);
|
||||||
|
lessThan.bindColStats(indexRow5, queryFields(2), rExpr);
|
||||||
|
assertFalse(lessThan.eval(), "12 < min 13");
|
||||||
|
|
||||||
|
RowData indexRow6 = intIndexRow(null, null);
|
||||||
|
lessThan.bindColStats(indexRow6, queryFields(2), rExpr);
|
||||||
|
assertFalse(lessThan.eval(), "12 <> null");
|
||||||
|
|
||||||
|
lessThan.bindVal(new ValueLiteralExpression(null, DataTypes.INT()));
|
||||||
|
assertFalse(lessThan.eval(), "null <> null");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testGreaterThan() {
|
||||||
|
ExpressionEvaluator.GreaterThan greaterThan = ExpressionEvaluator.GreaterThan.getInstance();
|
||||||
|
FieldReferenceExpression rExpr = new FieldReferenceExpression("f_int", DataTypes.INT(), 2, 2);
|
||||||
|
ValueLiteralExpression vExpr = new ValueLiteralExpression(12);
|
||||||
|
|
||||||
|
RowData indexRow1 = intIndexRow(11, 13);
|
||||||
|
greaterThan.bindFieldReference(rExpr)
|
||||||
|
.bindVal(vExpr)
|
||||||
|
.bindColStats(indexRow1, queryFields(2), rExpr);
|
||||||
|
assertTrue(greaterThan.eval(), "12 < 13");
|
||||||
|
|
||||||
|
RowData indexRow2 = intIndexRow(12, 13);
|
||||||
|
greaterThan.bindColStats(indexRow2, queryFields(2), rExpr);
|
||||||
|
assertTrue(greaterThan.eval(), "12 < 13");
|
||||||
|
|
||||||
|
RowData indexRow3 = intIndexRow(11, 12);
|
||||||
|
greaterThan.bindColStats(indexRow3, queryFields(2), rExpr);
|
||||||
|
assertFalse(greaterThan.eval(), "max 12 = 12");
|
||||||
|
|
||||||
|
RowData indexRow4 = intIndexRow(10, 11);
|
||||||
|
greaterThan.bindColStats(indexRow4, queryFields(2), rExpr);
|
||||||
|
assertFalse(greaterThan.eval(), "max 11 < 12");
|
||||||
|
|
||||||
|
RowData indexRow5 = intIndexRow(13, 14);
|
||||||
|
greaterThan.bindColStats(indexRow5, queryFields(2), rExpr);
|
||||||
|
assertTrue(greaterThan.eval(), "12 < 13");
|
||||||
|
|
||||||
|
RowData indexRow6 = intIndexRow(null, null);
|
||||||
|
greaterThan.bindColStats(indexRow6, queryFields(2), rExpr);
|
||||||
|
assertFalse(greaterThan.eval(), "12 <> null");
|
||||||
|
|
||||||
|
greaterThan.bindVal(new ValueLiteralExpression(null, DataTypes.INT()));
|
||||||
|
assertFalse(greaterThan.eval(), "null <> null");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testLessThanOrEqual() {
|
||||||
|
ExpressionEvaluator.LessThanOrEqual lessThanOrEqual = ExpressionEvaluator.LessThanOrEqual.getInstance();
|
||||||
|
FieldReferenceExpression rExpr = new FieldReferenceExpression("f_int", DataTypes.INT(), 2, 2);
|
||||||
|
ValueLiteralExpression vExpr = new ValueLiteralExpression(12);
|
||||||
|
|
||||||
|
RowData indexRow1 = intIndexRow(11, 13);
|
||||||
|
lessThanOrEqual.bindFieldReference(rExpr)
|
||||||
|
.bindVal(vExpr)
|
||||||
|
.bindColStats(indexRow1, queryFields(2), rExpr);
|
||||||
|
assertTrue(lessThanOrEqual.eval(), "11 < 12");
|
||||||
|
|
||||||
|
RowData indexRow2 = intIndexRow(12, 13);
|
||||||
|
lessThanOrEqual.bindColStats(indexRow2, queryFields(2), rExpr);
|
||||||
|
assertTrue(lessThanOrEqual.eval(), "min 12 = 12");
|
||||||
|
|
||||||
|
RowData indexRow3 = intIndexRow(11, 12);
|
||||||
|
lessThanOrEqual.bindColStats(indexRow3, queryFields(2), rExpr);
|
||||||
|
assertTrue(lessThanOrEqual.eval(), "max 12 = 12");
|
||||||
|
|
||||||
|
RowData indexRow4 = intIndexRow(10, 11);
|
||||||
|
lessThanOrEqual.bindColStats(indexRow4, queryFields(2), rExpr);
|
||||||
|
assertTrue(lessThanOrEqual.eval(), "max 11 < 12");
|
||||||
|
|
||||||
|
RowData indexRow5 = intIndexRow(13, 14);
|
||||||
|
lessThanOrEqual.bindColStats(indexRow5, queryFields(2), rExpr);
|
||||||
|
assertFalse(lessThanOrEqual.eval(), "12 < 13");
|
||||||
|
|
||||||
|
RowData indexRow6 = intIndexRow(null, null);
|
||||||
|
lessThanOrEqual.bindColStats(indexRow6, queryFields(2), rExpr);
|
||||||
|
assertFalse(lessThanOrEqual.eval(), "12 <> null");
|
||||||
|
|
||||||
|
lessThanOrEqual.bindVal(new ValueLiteralExpression(null, DataTypes.INT()));
|
||||||
|
assertFalse(lessThanOrEqual.eval(), "null <> null");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testGreaterThanOrEqual() {
|
||||||
|
ExpressionEvaluator.GreaterThanOrEqual greaterThanOrEqual = ExpressionEvaluator.GreaterThanOrEqual.getInstance();
|
||||||
|
FieldReferenceExpression rExpr = new FieldReferenceExpression("f_int", DataTypes.INT(), 2, 2);
|
||||||
|
ValueLiteralExpression vExpr = new ValueLiteralExpression(12);
|
||||||
|
|
||||||
|
RowData indexRow1 = intIndexRow(11, 13);
|
||||||
|
greaterThanOrEqual.bindFieldReference(rExpr)
|
||||||
|
.bindVal(vExpr)
|
||||||
|
.bindColStats(indexRow1, queryFields(2), rExpr);
|
||||||
|
assertTrue(greaterThanOrEqual.eval(), "12 < 13");
|
||||||
|
|
||||||
|
RowData indexRow2 = intIndexRow(12, 13);
|
||||||
|
greaterThanOrEqual.bindColStats(indexRow2, queryFields(2), rExpr);
|
||||||
|
assertTrue(greaterThanOrEqual.eval(), "min 12 = 12");
|
||||||
|
|
||||||
|
RowData indexRow3 = intIndexRow(11, 12);
|
||||||
|
greaterThanOrEqual.bindColStats(indexRow3, queryFields(2), rExpr);
|
||||||
|
assertTrue(greaterThanOrEqual.eval(), "max 12 = 12");
|
||||||
|
|
||||||
|
RowData indexRow4 = intIndexRow(10, 11);
|
||||||
|
greaterThanOrEqual.bindColStats(indexRow4, queryFields(2), rExpr);
|
||||||
|
assertFalse(greaterThanOrEqual.eval(), "max 11 < 12");
|
||||||
|
|
||||||
|
RowData indexRow5 = intIndexRow(13, 14);
|
||||||
|
greaterThanOrEqual.bindColStats(indexRow5, queryFields(2), rExpr);
|
||||||
|
assertTrue(greaterThanOrEqual.eval(), "12 < 13");
|
||||||
|
|
||||||
|
RowData indexRow6 = intIndexRow(null, null);
|
||||||
|
greaterThanOrEqual.bindColStats(indexRow6, queryFields(2), rExpr);
|
||||||
|
assertFalse(greaterThanOrEqual.eval(), "12 <> null");
|
||||||
|
|
||||||
|
greaterThanOrEqual.bindVal(new ValueLiteralExpression(null, DataTypes.INT()));
|
||||||
|
assertFalse(greaterThanOrEqual.eval(), "null <> null");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testIn() {
|
||||||
|
ExpressionEvaluator.In in = ExpressionEvaluator.In.getInstance();
|
||||||
|
FieldReferenceExpression rExpr = new FieldReferenceExpression("f_int", DataTypes.INT(), 2, 2);
|
||||||
|
|
||||||
|
RowData indexRow1 = intIndexRow(11, 13);
|
||||||
|
in.bindFieldReference(rExpr)
|
||||||
|
.bindColStats(indexRow1, queryFields(2), rExpr);
|
||||||
|
in.bindVals(12);
|
||||||
|
assertTrue(in.eval(), "11 < 12 < 13");
|
||||||
|
|
||||||
|
RowData indexRow2 = intIndexRow(12, 13);
|
||||||
|
in.bindColStats(indexRow2, queryFields(2), rExpr);
|
||||||
|
assertTrue(in.eval(), "min 12 = 12");
|
||||||
|
|
||||||
|
RowData indexRow3 = intIndexRow(11, 12);
|
||||||
|
in.bindColStats(indexRow3, queryFields(2), rExpr);
|
||||||
|
assertTrue(in.eval(), "max 12 = 12");
|
||||||
|
|
||||||
|
RowData indexRow4 = intIndexRow(10, 11);
|
||||||
|
in.bindColStats(indexRow4, queryFields(2), rExpr);
|
||||||
|
assertFalse(in.eval(), "max 11 < 12");
|
||||||
|
|
||||||
|
RowData indexRow5 = intIndexRow(13, 14);
|
||||||
|
in.bindColStats(indexRow5, queryFields(2), rExpr);
|
||||||
|
assertFalse(in.eval(), "12 < 13");
|
||||||
|
|
||||||
|
RowData indexRow6 = intIndexRow(null, null);
|
||||||
|
in.bindColStats(indexRow6, queryFields(2), rExpr);
|
||||||
|
assertFalse(in.eval(), "12 <> null");
|
||||||
|
|
||||||
|
in.bindVals((Object) null);
|
||||||
|
assertFalse(in.eval(), "null <> null");
|
||||||
|
}
|
||||||
|
|
||||||
|
private static RowData intIndexRow(Integer minVal, Integer maxVal) {
|
||||||
|
return intIndexRow(minVal, maxVal, 2L);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static RowData intIndexRow(Integer minVal, Integer maxVal, Long nullCnt) {
|
||||||
|
return indexRow(StringData.fromString("f1"), 100L,
|
||||||
|
minVal, maxVal, nullCnt,
|
||||||
|
StringData.fromString("1"), StringData.fromString("100"), 5L,
|
||||||
|
TimestampData.fromEpochMillis(1), TimestampData.fromEpochMillis(100), 3L);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static RowData indexRow(Object... fields) {
|
||||||
|
return TestData.insertRow(INDEX_ROW_TYPE, fields);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static RowType.RowField[] queryFields(int... pos) {
|
||||||
|
List<RowType.RowField> fields = ((RowType) ROW_DATA_TYPE.getLogicalType()).getFields();
|
||||||
|
return Arrays.stream(pos).mapToObj(fields::get).toArray(RowType.RowField[]::new);
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1256,6 +1256,37 @@ public class ITTestHoodieDataSource extends AbstractTestBase {
|
|||||||
assertRowsEquals(partitionResult, "[+I[1, 2022-02-02]]");
|
assertRowsEquals(partitionResult, "[+I[1, 2022-02-02]]");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testWriteAndReadWithDataSkipping() {
|
||||||
|
TableEnvironment tableEnv = batchTableEnv;
|
||||||
|
String hoodieTableDDL = sql("t1")
|
||||||
|
.option(FlinkOptions.PATH, tempFile.getAbsolutePath())
|
||||||
|
.option(FlinkOptions.METADATA_ENABLED, true)
|
||||||
|
.option("hoodie.metadata.index.column.stats.enable", true)
|
||||||
|
.option(FlinkOptions.READ_DATA_SKIPPING_ENABLED, true)
|
||||||
|
.end();
|
||||||
|
tableEnv.executeSql(hoodieTableDDL);
|
||||||
|
|
||||||
|
execInsertSql(tableEnv, TestSQL.INSERT_T1);
|
||||||
|
|
||||||
|
List<Row> result1 = CollectionUtil.iterableToList(
|
||||||
|
() -> tableEnv.sqlQuery("select * from t1").execute().collect());
|
||||||
|
assertRowsEquals(result1, TestData.DATA_SET_SOURCE_INSERT);
|
||||||
|
// apply filters
|
||||||
|
List<Row> result2 = CollectionUtil.iterableToList(
|
||||||
|
() -> tableEnv.sqlQuery("select * from t1 where uuid > 'id5' and age > 20").execute().collect());
|
||||||
|
assertRowsEquals(result2, "["
|
||||||
|
+ "+I[id7, Bob, 44, 1970-01-01T00:00:07, par4], "
|
||||||
|
+ "+I[id8, Han, 56, 1970-01-01T00:00:08, par4]]");
|
||||||
|
// filter by timestamp
|
||||||
|
List<Row> result3 = CollectionUtil.iterableToList(
|
||||||
|
() -> tableEnv.sqlQuery("select * from t1 where ts > TIMESTAMP '1970-01-01 00:00:05'").execute().collect());
|
||||||
|
assertRowsEquals(result3, "["
|
||||||
|
+ "+I[id6, Emma, 20, 1970-01-01T00:00:06, par3], "
|
||||||
|
+ "+I[id7, Bob, 44, 1970-01-01T00:00:07, par4], "
|
||||||
|
+ "+I[id8, Han, 56, 1970-01-01T00:00:08, par4]]");
|
||||||
|
}
|
||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
// Utilities
|
// Utilities
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
|
|||||||
@@ -28,6 +28,7 @@ import org.apache.flink.api.common.io.FileInputFormat;
|
|||||||
import org.apache.flink.api.common.io.InputFormat;
|
import org.apache.flink.api.common.io.InputFormat;
|
||||||
import org.apache.flink.configuration.Configuration;
|
import org.apache.flink.configuration.Configuration;
|
||||||
import org.apache.flink.table.data.RowData;
|
import org.apache.flink.table.data.RowData;
|
||||||
|
import org.apache.hadoop.fs.FileStatus;
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
import org.junit.jupiter.api.function.ThrowingSupplier;
|
import org.junit.jupiter.api.function.ThrowingSupplier;
|
||||||
@@ -38,7 +39,6 @@ import org.slf4j.LoggerFactory;
|
|||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.Comparator;
|
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
@@ -76,22 +76,18 @@ public class TestHoodieTableSource {
|
|||||||
Arrays.asList(conf.getString(FlinkOptions.PARTITION_PATH_FIELD).split(",")),
|
Arrays.asList(conf.getString(FlinkOptions.PARTITION_PATH_FIELD).split(",")),
|
||||||
"default-par",
|
"default-par",
|
||||||
conf);
|
conf);
|
||||||
Path[] paths = tableSource.getReadPaths();
|
FileStatus[] fileStatuses = tableSource.getReadFiles();
|
||||||
assertNotNull(paths);
|
assertNotNull(fileStatuses);
|
||||||
String[] names = Arrays.stream(paths).map(Path::getName)
|
assertThat(fileStatuses.length, is(4));
|
||||||
.sorted(Comparator.naturalOrder()).toArray(String[]::new);
|
|
||||||
assertThat(Arrays.toString(names), is("[par1, par2, par3, par4]"));
|
|
||||||
// apply partition pruning
|
// apply partition pruning
|
||||||
Map<String, String> partitions = new HashMap<>();
|
Map<String, String> partitions = new HashMap<>();
|
||||||
partitions.put("partition", "par1");
|
partitions.put("partition", "par1");
|
||||||
|
|
||||||
tableSource.applyPartitions(Collections.singletonList(partitions));
|
tableSource.applyPartitions(Collections.singletonList(partitions));
|
||||||
|
|
||||||
Path[] paths2 = tableSource.getReadPaths();
|
FileStatus[] fileStatuses2 = tableSource.getReadFiles();
|
||||||
assertNotNull(paths2);
|
assertNotNull(fileStatuses2);
|
||||||
String[] names2 = Arrays.stream(paths2).map(Path::getName)
|
assertThat(fileStatuses2.length, is(1));
|
||||||
.sorted(Comparator.naturalOrder()).toArray(String[]::new);
|
|
||||||
assertThat(Arrays.toString(names2), is("[par1]"));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|||||||
@@ -0,0 +1,53 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hudi.utils;
|
||||||
|
|
||||||
|
import org.apache.hudi.avro.model.HoodieMetadataRecord;
|
||||||
|
import org.apache.hudi.metadata.HoodieMetadataPayload;
|
||||||
|
import org.apache.hudi.util.AvroSchemaConverter;
|
||||||
|
|
||||||
|
import org.apache.avro.Schema;
|
||||||
|
import org.apache.flink.table.types.DataType;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import static org.hamcrest.CoreMatchers.is;
|
||||||
|
import static org.hamcrest.MatcherAssert.assertThat;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test cases for {@link org.apache.hudi.util.AvroSchemaConverter}.
|
||||||
|
*/
|
||||||
|
public class TestAvroSchemaConverter {
|
||||||
|
@Test
|
||||||
|
void testUnionSchemaWithMultipleRecordTypes() {
|
||||||
|
Schema schema = HoodieMetadataRecord.SCHEMA$;
|
||||||
|
DataType dataType = AvroSchemaConverter.convertToDataType(schema);
|
||||||
|
int pos = HoodieMetadataRecord.SCHEMA$.getField(HoodieMetadataPayload.SCHEMA_FIELD_ID_COLUMN_STATS).pos();
|
||||||
|
final String expected = "ROW<"
|
||||||
|
+ "`fileName` STRING, "
|
||||||
|
+ "`columnName` STRING, "
|
||||||
|
+ "`minValue` ROW<`wrapper` RAW('java.lang.Object', ?) NOT NULL>, "
|
||||||
|
+ "`maxValue` ROW<`wrapper` RAW('java.lang.Object', ?) NOT NULL>, "
|
||||||
|
+ "`valueCount` BIGINT, "
|
||||||
|
+ "`nullCount` BIGINT, "
|
||||||
|
+ "`totalSize` BIGINT, "
|
||||||
|
+ "`totalUncompressedSize` BIGINT, "
|
||||||
|
+ "`isDeleted` BOOLEAN NOT NULL>";
|
||||||
|
assertThat(dataType.getChildren().get(pos).toString(), is(expected));
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -318,6 +318,9 @@ public class TestConfigurations {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tool to construct the catalog DDL.
|
||||||
|
*/
|
||||||
public static class Catalog {
|
public static class Catalog {
|
||||||
private final String catalogName;
|
private final String catalogName;
|
||||||
private String catalogPath = ".";
|
private String catalogPath = ".";
|
||||||
|
|||||||
@@ -97,6 +97,6 @@ public class TestUtils {
|
|||||||
|
|
||||||
public static StreamReadMonitoringFunction getMonitorFunc(Configuration conf) {
|
public static StreamReadMonitoringFunction getMonitorFunc(Configuration conf) {
|
||||||
final String basePath = conf.getString(FlinkOptions.PATH);
|
final String basePath = conf.getString(FlinkOptions.PATH);
|
||||||
return new StreamReadMonitoringFunction(conf, new Path(basePath), 1024 * 1024L, null);
|
return new StreamReadMonitoringFunction(conf, new Path(basePath), TestConfigurations.ROW_TYPE, 1024 * 1024L, null);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -18,11 +18,11 @@
|
|||||||
|
|
||||||
package org.apache.hudi.utils;
|
package org.apache.hudi.utils;
|
||||||
|
|
||||||
import org.apache.flink.configuration.Configuration;
|
|
||||||
import org.apache.hudi.common.table.view.FileSystemViewStorageConfig;
|
import org.apache.hudi.common.table.view.FileSystemViewStorageConfig;
|
||||||
import org.apache.hudi.common.table.view.FileSystemViewStorageType;
|
import org.apache.hudi.common.table.view.FileSystemViewStorageType;
|
||||||
import org.apache.hudi.util.ViewStorageProperties;
|
import org.apache.hudi.util.ViewStorageProperties;
|
||||||
|
|
||||||
|
import org.apache.flink.configuration.Configuration;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
import org.junit.jupiter.api.io.TempDir;
|
import org.junit.jupiter.api.io.TempDir;
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user