1
0

[HUDI-2788] Fixing issues w/ Z-order Layout Optimization (#4026)

* Simplyfying, tidying up

* Fixed packaging for `TestOptimizeTable`

* Cleaned up `HoodiFileIndex` file filtering seq;
Removed optimization manually reading Parquet table circumventing Spark

* Refactored `DataSkippingUtils`:
  - Fixed checks to validate all statistics cols are present
  - Fixed some predicates being constructed incorrectly
  - Rewrote comments for easier comprehension, added more notes
  - Tidying up

* Tidying up tests

* `lint`

* Fixing compilation

* `TestOptimizeTable` > `TestTableLayoutOptimization`;
Added assertions to test data skipping paths

* Fixed tests to properly hit data-skipping path

* Fixed pruned files candidates lookup seq to conservatively included all non-indexed files

* Added java-doc

* Fixed compilation
This commit is contained in:
Alexey Kudinkin
2021-11-24 10:10:28 -08:00
committed by GitHub
parent 973f78f5ca
commit 60b23b9797
6 changed files with 289 additions and 181 deletions

View File

@@ -39,7 +39,7 @@ import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
/**
* A partitioner that does spartial curve optimization sorting based on specified column values for each RDD partition.
* A partitioner that does spatial curve optimization sorting based on specified column values for each RDD partition.
* support z-curve optimization, hilbert will come soon.
* @param <T> HoodieRecordPayload type
*/

View File

@@ -189,11 +189,10 @@ public class ZCurveOptimizeHelper {
SerializableConfiguration serializableConfiguration = new SerializableConfiguration(sc.hadoopConfiguration());
int numParallelism = (scanFiles.size() / 3 + 1);
List<HoodieColumnRangeMetadata<Comparable>> colMinMaxInfos = new ArrayList<>();
List<HoodieColumnRangeMetadata<Comparable>> colMinMaxInfos;
String previousJobDescription = sc.getLocalProperty(SPARK_JOB_DESCRIPTION);
try {
String description = "Listing parquet column statistics";
jsc.setJobDescription(description);
jsc.setJobDescription("Listing parquet column statistics");
colMinMaxInfos = jsc.parallelize(scanFiles, numParallelism).mapPartitions(paths -> {
Configuration conf = serializableConfiguration.value();
ParquetUtils parquetUtils = (ParquetUtils) BaseFileUtils.getInstance(HoodieFileFormat.PARQUET);
@@ -209,7 +208,7 @@ public class ZCurveOptimizeHelper {
}
Map<String, List<HoodieColumnRangeMetadata<Comparable>>> fileToStatsListMap = colMinMaxInfos.stream().collect(Collectors.groupingBy(e -> e.getFilePath()));
JavaRDD<Row> allMetaDataRDD = jsc.parallelize(fileToStatsListMap.values().stream().collect(Collectors.toList()), 1).map(f -> {
JavaRDD<Row> allMetaDataRDD = jsc.parallelize(new ArrayList<>(fileToStatsListMap.values()), 1).map(f -> {
int colSize = f.size();
if (colSize == 0) {
return null;
@@ -299,50 +298,54 @@ public class ZCurveOptimizeHelper {
Dataset<Row> statisticsDF = ZCurveOptimizeHelper.getMinMaxValue(df, cols);
// try to find last validate index table from index path
try {
if (fs.exists(new Path(indexPath))) {
List<String> allIndexTables = Arrays
.stream(fs.listStatus(new Path(indexPath))).filter(f -> f.isDirectory()).map(f -> f.getPath().getName()).collect(Collectors.toList());
List<String> candidateIndexTables = allIndexTables.stream().filter(f -> validateCommits.contains(f)).sorted().collect(Collectors.toList());
List<String> residualTables = allIndexTables.stream().filter(f -> !validateCommits.contains(f)).collect(Collectors.toList());
Option<Dataset> latestIndexData = Option.empty();
if (!candidateIndexTables.isEmpty()) {
latestIndexData = Option.of(spark.read().load(new Path(indexPath, candidateIndexTables.get(candidateIndexTables.size() - 1)).toString()));
// clean old index table, keep at most 1 index table.
candidateIndexTables.remove(candidateIndexTables.size() - 1);
candidateIndexTables.forEach(f -> {
try {
fs.delete(new Path(indexPath, f));
} catch (IOException ie) {
throw new HoodieException(ie);
}
});
}
// If there's currently no index, create one
if (!fs.exists(new Path(indexPath))) {
statisticsDF.repartition(1).write().mode("overwrite").save(savePath.toString());
return;
}
// clean residualTables
// retried cluster operations at the same instant time is also considered,
// the residual files produced by retried are cleaned up before save statistics
// save statistics info to index table which named commitTime
residualTables.forEach(f -> {
// Otherwise, clean up all indexes but the most recent one
List<String> allIndexTables = Arrays
.stream(fs.listStatus(new Path(indexPath))).filter(f -> f.isDirectory()).map(f -> f.getPath().getName()).collect(Collectors.toList());
List<String> candidateIndexTables = allIndexTables.stream().filter(f -> validateCommits.contains(f)).sorted().collect(Collectors.toList());
List<String> residualTables = allIndexTables.stream().filter(f -> !validateCommits.contains(f)).collect(Collectors.toList());
Option<Dataset> latestIndexData = Option.empty();
if (!candidateIndexTables.isEmpty()) {
latestIndexData = Option.of(spark.read().load(new Path(indexPath, candidateIndexTables.get(candidateIndexTables.size() - 1)).toString()));
// clean old index table, keep at most 1 index table.
candidateIndexTables.remove(candidateIndexTables.size() - 1);
candidateIndexTables.forEach(f -> {
try {
fs.delete(new Path(indexPath, f));
} catch (IOException ie) {
throw new HoodieException(ie);
}
});
}
if (latestIndexData.isPresent() && latestIndexData.get().schema().equals(statisticsDF.schema())) {
// update the statistics info
String originalTable = "indexTable_" + java.util.UUID.randomUUID().toString().replace("-", "");
String updateTable = "updateTable_" + java.util.UUID.randomUUID().toString().replace("-", "");
latestIndexData.get().registerTempTable(originalTable);
statisticsDF.registerTempTable(updateTable);
// update table by full out join
List columns = Arrays.asList(statisticsDF.schema().fieldNames());
spark.sql(HoodieSparkUtils$
.MODULE$.createMergeSql(originalTable, updateTable, JavaConversions.asScalaBuffer(columns))).repartition(1).write().save(savePath.toString());
} else {
statisticsDF.repartition(1).write().mode("overwrite").save(savePath.toString());
// clean residualTables
// retried cluster operations at the same instant time is also considered,
// the residual files produced by retried are cleaned up before save statistics
// save statistics info to index table which named commitTime
residualTables.forEach(f -> {
try {
fs.delete(new Path(indexPath, f));
} catch (IOException ie) {
throw new HoodieException(ie);
}
});
if (latestIndexData.isPresent() && latestIndexData.get().schema().equals(statisticsDF.schema())) {
// update the statistics info
String originalTable = "indexTable_" + java.util.UUID.randomUUID().toString().replace("-", "");
String updateTable = "updateTable_" + java.util.UUID.randomUUID().toString().replace("-", "");
latestIndexData.get().registerTempTable(originalTable);
statisticsDF.registerTempTable(updateTable);
// update table by full out join
List columns = Arrays.asList(statisticsDF.schema().fieldNames());
spark.sql(HoodieSparkUtils$
.MODULE$.createMergeSql(originalTable, updateTable, JavaConversions.asScalaBuffer(columns))).repartition(1).write().save(savePath.toString());
} else {
statisticsDF.repartition(1).write().mode("overwrite").save(savePath.toString());
}

View File

@@ -298,8 +298,8 @@ object HoodieSparkUtils extends SparkAdapterSupport {
*/
def createMergeSql(leftTable: String, rightTable: String, cols: Seq[String]): String = {
var selectsql = ""
for (i <- (0 to cols.size-1)) {
selectsql = selectsql + s" if (${leftTable}.${cols(0)} is null, ${rightTable}.${cols(i)}, ${leftTable}.${cols(i)}) as ${cols(i)} ,"
for (i <- cols.indices) {
selectsql = selectsql + s" if (${leftTable}.${cols(i)} is null, ${rightTable}.${cols(i)}, ${leftTable}.${cols(i)}) as ${cols(i)} ,"
}
"select " + selectsql.dropRight(1) + s" from ${leftTable} full join ${rightTable} on ${leftTable}.${cols(0)} = ${rightTable}.${cols(0)}"
}

View File

@@ -160,41 +160,92 @@ case class HoodieFileIndex(
spark.sessionState.conf.getConfString(DataSourceReadOptions.ENABLE_DATA_SKIPPING.key(), "false")).toBoolean
}
private def filterFilesByDataSkippingIndex(dataFilters: Seq[Expression]): Set[String] = {
var allFiles: Set[String] = Set.empty
var candidateFiles: Set[String] = Set.empty
/**
* Computes pruned list of candidate base-files' names based on provided list of {@link dataFilters}
* conditions, by leveraging custom Z-order index (Z-index) bearing "min", "max", "num_nulls" statistic
* for all clustered columns
*
* NOTE: This method has to return complete set of candidate files, since only provided candidates will
* ultimately be scanned as part of query execution. Hence, this method has to maintain the
* invariant of conservatively including every base-file's name, that is NOT referenced in its index.
*
* @param dataFilters list of original data filters passed down from querying engine
* @return list of pruned (data-skipped) candidate base-files' names
*/
private def lookupCandidateFilesNamesInZIndex(dataFilters: Seq[Expression]): Option[Set[String]] = {
val indexPath = metaClient.getZindexPath
val fs = metaClient.getFs
if (fs.exists(new Path(indexPath)) && dataFilters.nonEmpty) {
// try to load latest index table from index path
val candidateIndexTables = fs.listStatus(new Path(indexPath)).filter(_.isDirectory)
.map(_.getPath.getName).filter(f => completedCommits.contains(f)).sortBy(x => x)
if (candidateIndexTables.nonEmpty) {
val dataFrameOpt = try {
Some(spark.read.load(new Path(indexPath, candidateIndexTables.last).toString))
} catch {
case _: Throwable =>
logError("missing index skip data-skipping")
None
}
if (dataFrameOpt.isDefined) {
val indexSchema = dataFrameOpt.get.schema
val indexFiles = DataSkippingUtils.getIndexFiles(spark.sparkContext.hadoopConfiguration, new Path(indexPath, candidateIndexTables.last).toString)
val indexFilter = dataFilters.map(DataSkippingUtils.createZindexFilter(_, indexSchema)).reduce(And)
logInfo(s"index filter condition: $indexFilter")
dataFrameOpt.get.persist()
if (indexFiles.size <= 4) {
allFiles = DataSkippingUtils.readParquetFile(spark, indexFiles)
} else {
allFiles = dataFrameOpt.get.select("file").collect().map(_.getString(0)).toSet
}
candidateFiles = dataFrameOpt.get.filter(new Column(indexFilter)).select("file").collect().map(_.getString(0)).toSet
dataFrameOpt.get.unpersist()
}
}
if (!enableDataSkipping() || !fs.exists(new Path(indexPath)) || dataFilters.isEmpty) {
// scalastyle:off return
return Option.empty
// scalastyle:on return
}
allFiles -- candidateFiles
// Collect all index tables present in `.zindex` folder
val candidateIndexTables =
fs.listStatus(new Path(indexPath))
.filter(_.isDirectory)
.map(_.getPath.getName)
.filter(f => completedCommits.contains(f))
.sortBy(x => x)
if (candidateIndexTables.isEmpty) {
// scalastyle:off return
return Option.empty
// scalastyle:on return
}
val dataFrameOpt = try {
Some(spark.read.load(new Path(indexPath, candidateIndexTables.last).toString))
} catch {
case t: Throwable =>
logError("Failed to read Z-index; skipping", t)
None
}
dataFrameOpt.map(df => {
val indexSchema = df.schema
val indexFilter =
dataFilters.map(DataSkippingUtils.createZIndexLookupFilter(_, indexSchema))
.reduce(And)
logInfo(s"Index filter condition: $indexFilter")
df.persist()
val allIndexedFileNames =
df.select("file")
.collect()
.map(_.getString(0))
.toSet
val prunedCandidateFileNames =
df.filter(new Column(indexFilter))
.select("file")
.collect()
.map(_.getString(0))
.toSet
df.unpersist()
// NOTE: Z-index isn't guaranteed to have complete set of statistics for every
// base-file: since it's bound to clustering, which could occur asynchronously
// at arbitrary point in time, and is not likely to touching all of the base files.
//
// To close that gap, we manually compute the difference b/w all indexed (Z-index)
// files and all outstanding base-files, and make sure that all base files not
// represented w/in Z-index are included in the output of this method
val notIndexedFileNames =
lookupFileNamesMissingFromIndex(allIndexedFileNames)
prunedCandidateFileNames ++ notIndexedFileNames
})
}
private def lookupFileNamesMissingFromIndex(allIndexedFileNames: Set[String]) = {
val allBaseFileNames = allFiles.map(f => f.getPath.getName).toSet
allBaseFileNames -- allIndexedFileNames
}
/**
@@ -206,18 +257,22 @@ case class HoodieFileIndex(
*/
override def listFiles(partitionFilters: Seq[Expression],
dataFilters: Seq[Expression]): Seq[PartitionDirectory] = {
// try to load filterFiles from index
val filterFiles: Set[String] = if (enableDataSkipping()) {
filterFilesByDataSkippingIndex(dataFilters)
} else {
Set.empty
}
// Look up candidate files names in the Z-index, if all of the following conditions are true
// - Data-skipping is enabled
// - Z-index is present
// - List of predicates (filters) is present
val candidateFilesNamesOpt: Option[Set[String]] = lookupCandidateFilesNamesInZIndex(dataFilters)
logDebug(s"Overlapping candidate files (from Z-index): ${candidateFilesNamesOpt.getOrElse(Set.empty)}")
if (queryAsNonePartitionedTable) { // Read as Non-Partitioned table.
val candidateFiles = if (!filterFiles.isEmpty) {
allFiles.filterNot(fileStatus => filterFiles.contains(fileStatus.getPath.getName))
} else {
allFiles
}
// Filter in candidate files based on the Z-index lookup
val candidateFiles =
allFiles.filter(fileStatus =>
// NOTE: This predicate is true when {@code Option} is empty
candidateFilesNamesOpt.forall(_.contains(fileStatus.getPath.getName))
)
logInfo(s"Total files : ${allFiles.size}," +
s" candidate files after data skipping: ${candidateFiles.size} " +
s" skipping percent ${if (allFiles.length != 0) (allFiles.size - candidateFiles.size) / allFiles.size.toDouble else 0}")
@@ -236,11 +291,13 @@ case class HoodieFileIndex(
null
}
}).filterNot(_ == null)
val candidateFiles = if (!filterFiles.isEmpty) {
baseFileStatuses.filterNot(fileStatus => filterFiles.contains(fileStatus.getPath.getName))
} else {
baseFileStatuses
}
// Filter in candidate files based on the Z-index lookup
val candidateFiles =
baseFileStatuses.filter(fileStatus =>
// NOTE: This predicate is true when {@code Option} is empty
candidateFilesNamesOpt.forall(_.contains(fileStatus.getPath.getName)))
totalFileSize += baseFileStatuses.size
candidateFileSize += candidateFiles.size
PartitionDirectory(partition.values, candidateFiles)

View File

@@ -36,120 +36,153 @@ import scala.collection.JavaConverters._
object DataSkippingUtils {
/**
* create z_index filter and push those filters to index table to filter all candidate scan files.
* @param condition origin filter from query.
* @param indexSchema schema from index table.
* @return filters for index table.
*/
def createZindexFilter(condition: Expression, indexSchema: StructType): Expression = {
def buildExpressionInternal(colName: Seq[String], statisticValue: String): Expression = {
val appendColName = UnresolvedAttribute(colName).name + statisticValue
col(appendColName).expr
}
* Translates provided {@link filterExpr} into corresponding filter-expression for Z-index index table
* to filter out candidate files that would hold records matching the original filter
*
* @param filterExpr original filter from query
* @param indexSchema index table schema
* @return filter for Z-index table
*/
def createZIndexLookupFilter(filterExpr: Expression, indexSchema: StructType): Expression = {
def reWriteCondition(colName: Seq[String], conditionExpress: Expression): Expression = {
val appendColName = UnresolvedAttribute(colName).name + "_minValue"
if (indexSchema.exists(p => p.name == appendColName)) {
def rewriteCondition(colName: Seq[String], conditionExpress: Expression): Expression = {
val stats = Set.apply(
UnresolvedAttribute(colName).name + "_minValue",
UnresolvedAttribute(colName).name + "_maxValue",
UnresolvedAttribute(colName).name + "_num_nulls"
)
if (stats.forall(stat => indexSchema.exists(_.name == stat))) {
conditionExpress
} else {
Literal.TrueLiteral
}
}
val minValue = (colName: Seq[String]) => buildExpressionInternal(colName, "_minValue")
val maxValue = (colName: Seq[String]) => buildExpressionInternal(colName, "_maxValue")
val num_nulls = (colName: Seq[String]) => buildExpressionInternal(colName, "_num_nulls")
def refColExpr(colName: Seq[String], statisticValue: String): Expression =
col(UnresolvedAttribute(colName).name + statisticValue).expr
condition match {
// query filter "colA = b" convert it to "colA_minValue <= b and colA_maxValue >= b" for index table
def minValue(colName: Seq[String]) = refColExpr(colName, "_minValue")
def maxValue(colName: Seq[String]) = refColExpr(colName, "_maxValue")
def numNulls(colName: Seq[String]) = refColExpr(colName, "_num_nulls")
def colContainsValuesEqualToLiteral(colName: Seq[String], value: Literal) =
And(LessThanOrEqual(minValue(colName), value), GreaterThanOrEqual(maxValue(colName), value))
def colContainsValuesEqualToLiterals(colName: Seq[String], list: Seq[Literal]) =
list.map { lit => colContainsValuesEqualToLiteral(colName, lit) }.reduce(Or)
filterExpr match {
// Filter "colA = b"
// Translates to "colA_minValue <= b AND colA_maxValue >= b" condition for index lookup
case EqualTo(attribute: AttributeReference, value: Literal) =>
val colName = getTargetColNameParts(attribute)
reWriteCondition(colName, And(LessThanOrEqual(minValue(colName), value), GreaterThanOrEqual(maxValue(colName), value)))
// query filter "b = colA" convert it to "colA_minValue <= b and colA_maxValue >= b" for index table
rewriteCondition(colName, colContainsValuesEqualToLiteral(colName, value))
// Filter "b = colA"
// Translates to "colA_minValue <= b AND colA_maxValue >= b" condition for index lookup
case EqualTo(value: Literal, attribute: AttributeReference) =>
val colName = getTargetColNameParts(attribute)
reWriteCondition(colName, And(LessThanOrEqual(minValue(colName), value), GreaterThanOrEqual(maxValue(colName), value)))
// query filter "colA = null" convert it to "colA_num_nulls = null" for index table
rewriteCondition(colName, colContainsValuesEqualToLiteral(colName, value))
// Filter "colA = null"
// Translates to "colA_num_nulls = null" for index lookup
case equalNullSafe @ EqualNullSafe(_: AttributeReference, _ @ Literal(null, _)) =>
val colName = getTargetColNameParts(equalNullSafe.left)
reWriteCondition(colName, EqualTo(num_nulls(colName), equalNullSafe.right))
// query filter "colA < b" convert it to "colA_minValue < b" for index table
rewriteCondition(colName, EqualTo(numNulls(colName), equalNullSafe.right))
// Filter "colA < b"
// Translates to "colA_minValue < b" for index lookup
case LessThan(attribute: AttributeReference, value: Literal) =>
val colName = getTargetColNameParts(attribute)
reWriteCondition(colName,LessThan(minValue(colName), value))
// query filter "b < colA" convert it to "colA_maxValue > b" for index table
rewriteCondition(colName, LessThan(minValue(colName), value))
// Filter "b < colA"
// Translates to "b < colA_maxValue" for index lookup
case LessThan(value: Literal, attribute: AttributeReference) =>
val colName = getTargetColNameParts(attribute)
reWriteCondition(colName, GreaterThan(maxValue(colName), value))
// query filter "colA > b" convert it to "colA_maxValue > b" for index table
rewriteCondition(colName, GreaterThan(maxValue(colName), value))
// Filter "colA > b"
// Translates to "colA_maxValue > b" for index lookup
case GreaterThan(attribute: AttributeReference, value: Literal) =>
val colName = getTargetColNameParts(attribute)
reWriteCondition(colName, GreaterThan(maxValue(colName), value))
// query filter "b > colA" convert it to "colA_minValue < b" for index table
rewriteCondition(colName, GreaterThan(maxValue(colName), value))
// Filter "b > colA"
// Translates to "b > colA_minValue" for index lookup
case GreaterThan(value: Literal, attribute: AttributeReference) =>
val colName = getTargetColNameParts(attribute)
reWriteCondition(colName, LessThan(minValue(colName), value))
// query filter "colA <= b" convert it to "colA_minValue <= b" for index table
rewriteCondition(colName, LessThan(minValue(colName), value))
// Filter "colA <= b"
// Translates to "colA_minValue <= b" for index lookup
case LessThanOrEqual(attribute: AttributeReference, value: Literal) =>
val colName = getTargetColNameParts(attribute)
reWriteCondition(colName, LessThanOrEqual(minValue(colName), value))
// query filter "b <= colA" convert it to "colA_maxValue >= b" for index table
rewriteCondition(colName, LessThanOrEqual(minValue(colName), value))
// Filter "b <= colA"
// Translates to "b <= colA_maxValue" for index lookup
case LessThanOrEqual(value: Literal, attribute: AttributeReference) =>
val colName = getTargetColNameParts(attribute)
reWriteCondition(colName, GreaterThanOrEqual(maxValue(colName), value))
// query filter "colA >= b" convert it to "colA_maxValue >= b" for index table
rewriteCondition(colName, GreaterThanOrEqual(maxValue(colName), value))
// Filter "colA >= b"
// Translates to "colA_maxValue >= b" for index lookup
case GreaterThanOrEqual(attribute: AttributeReference, right: Literal) =>
val colName = getTargetColNameParts(attribute)
reWriteCondition(colName, GreaterThanOrEqual(maxValue(colName), right))
// query filter "b >= colA" convert it to "colA_minValue <= b" for index table
rewriteCondition(colName, GreaterThanOrEqual(maxValue(colName), right))
// Filter "b >= colA"
// Translates to "b >= colA_minValue" for index lookup
case GreaterThanOrEqual(value: Literal, attribute: AttributeReference) =>
val colName = getTargetColNameParts(attribute)
reWriteCondition(colName, LessThanOrEqual(minValue(colName), value))
// query filter "colA is null" convert it to "colA_num_nulls > 0" for index table
rewriteCondition(colName, LessThanOrEqual(minValue(colName), value))
// Filter "colA is null"
// Translates to "colA_num_nulls > 0" for index lookup
case IsNull(attribute: AttributeReference) =>
val colName = getTargetColNameParts(attribute)
reWriteCondition(colName, GreaterThan(num_nulls(colName), Literal(0)))
// query filter "colA is not null" convert it to "colA_num_nulls = 0" for index table
rewriteCondition(colName, GreaterThan(numNulls(colName), Literal(0)))
// Filter "colA is not null"
// Translates to "colA_num_nulls = 0" for index lookup
case IsNotNull(attribute: AttributeReference) =>
val colName = getTargetColNameParts(attribute)
reWriteCondition(colName, EqualTo(num_nulls(colName), Literal(0)))
// query filter "colA in (a,b)" convert it to " (colA_minValue <= a and colA_maxValue >= a) or (colA_minValue <= b and colA_maxValue >= b) " for index table
rewriteCondition(colName, EqualTo(numNulls(colName), Literal(0)))
// Filter "colA in (a, b, ...)"
// Translates to "(colA_minValue <= a AND colA_maxValue >= a) OR (colA_minValue <= b AND colA_maxValue >= b)" for index lookup
case In(attribute: AttributeReference, list: Seq[Literal]) =>
val colName = getTargetColNameParts(attribute)
reWriteCondition(colName, list.map { lit =>
And(LessThanOrEqual(minValue(colName), lit), GreaterThanOrEqual(maxValue(colName), lit))
}.reduce(Or))
// query filter "colA like xxx" convert it to " (colA_minValue <= xxx and colA_maxValue >= xxx) or (colA_min start with xxx or colA_max start with xxx) " for index table
rewriteCondition(colName, colContainsValuesEqualToLiterals(colName, list))
// Filter "colA like xxx"
// Translates to "colA_minValue <= xxx AND colA_maxValue >= xxx" for index lookup
// NOTE: That this operator only matches string prefixes, and this is
// essentially equivalent to "colA = b" expression
case StartsWith(attribute, v @ Literal(_: UTF8String, _)) =>
val colName = getTargetColNameParts(attribute)
reWriteCondition(colName, Or(And(LessThanOrEqual(minValue(colName), v), GreaterThanOrEqual(maxValue(colName), v)) ,
Or(StartsWith(minValue(colName), v), StartsWith(maxValue(colName), v))))
// query filter "colA not in (a, b)" convert it to " (not( colA_minValue = a and colA_maxValue = a)) and (not( colA_minValue = b and colA_maxValue = b)) " for index table
rewriteCondition(colName, colContainsValuesEqualToLiteral(colName, v))
// Filter "colA not in (a, b, ...)"
// Translates to "(colA_minValue > a OR colA_maxValue < a) AND (colA_minValue > b OR colA_maxValue < b)" for index lookup
// NOTE: This is an inversion of `in (a, b, ...)` expr
case Not(In(attribute: AttributeReference, list: Seq[Literal])) =>
val colName = getTargetColNameParts(attribute)
reWriteCondition(colName, list.map { lit =>
Not(And(EqualTo(minValue(colName), lit), EqualTo(maxValue(colName), lit)))
}.reduce(And))
// query filter "colA != b" convert it to "not ( colA_minValue = b and colA_maxValue = b )" for index table
rewriteCondition(colName, Not(colContainsValuesEqualToLiterals(colName, list)))
// Filter "colA != b"
// Translates to "colA_minValue > b OR colA_maxValue < b" (which is an inversion of expr for "colA = b") for index lookup
// NOTE: This is an inversion of `colA = b` expr
case Not(EqualTo(attribute: AttributeReference, value: Literal)) =>
val colName = getTargetColNameParts(attribute)
reWriteCondition(colName, Not(And(EqualTo(minValue(colName), value), EqualTo(maxValue(colName), value))))
// query filter "b != colA" convert it to "not ( colA_minValue = b and colA_maxValue = b )" for index table
rewriteCondition(colName, Not(colContainsValuesEqualToLiteral(colName, value)))
// Filter "b != colA"
// Translates to "colA_minValue > b OR colA_maxValue < b" (which is an inversion of expr for "colA = b") for index lookup
// NOTE: This is an inversion of `colA != b` expr
case Not(EqualTo(value: Literal, attribute: AttributeReference)) =>
val colName = getTargetColNameParts(attribute)
reWriteCondition(colName, Not(And(EqualTo(minValue(colName), value), EqualTo(maxValue(colName), value))))
// query filter "colA not like xxxx" convert it to "not ( colA_minValue startWith xxx and colA_maxValue startWith xxx)" for index table
rewriteCondition(colName, Not(colContainsValuesEqualToLiteral(colName, value)))
// Filter "colA not like xxx"
// Translates to "!(colA_minValue <= xxx AND colA_maxValue >= xxx)" for index lookup
// NOTE: This is a inversion of "colA like xxx" assuming that colA is a string-based type
case Not(StartsWith(attribute, value @ Literal(_: UTF8String, _))) =>
val colName = getTargetColNameParts(attribute)
reWriteCondition(colName, Not(And(StartsWith(minValue(colName), value), StartsWith(maxValue(colName), value))))
rewriteCondition(colName, Not(colContainsValuesEqualToLiteral(colName, value)))
case or: Or =>
val resLeft = createZindexFilter(or.left, indexSchema)
val resRight = createZindexFilter(or.right, indexSchema)
val resLeft = createZIndexLookupFilter(or.left, indexSchema)
val resRight = createZIndexLookupFilter(or.right, indexSchema)
Or(resLeft, resRight)
case and: And =>
val resLeft = createZindexFilter(and.left, indexSchema)
val resRight = createZindexFilter(and.right, indexSchema)
val resLeft = createZIndexLookupFilter(and.left, indexSchema)
val resRight = createZIndexLookupFilter(and.right, indexSchema)
And(resLeft, resRight)
case expr: Expression =>

View File

@@ -18,28 +18,30 @@
package org.apache.hudi.functional
import java.sql.{Date, Timestamp}
import org.apache.hadoop.fs.Path
import org.apache.hudi.common.model.HoodieFileFormat
import org.apache.hudi.config.{HoodieClusteringConfig, HoodieWriteConfig}
import org.apache.hudi.{DataSourceReadOptions, DataSourceWriteOptions}
import org.apache.hudi.common.testutils.RawTripTestPayload.recordsToStrings
import org.apache.hudi.common.util.{BaseFileUtils, ParquetUtils}
import org.apache.hudi.config.{HoodieClusteringConfig, HoodieWriteConfig}
import org.apache.hudi.testutils.HoodieClientTestBase
import org.apache.hudi.{DataSourceReadOptions, DataSourceWriteOptions}
import org.apache.spark.ZCurveOptimizeHelper
import org.apache.spark.sql._
import org.apache.spark.sql.types._
import org.junit.jupiter.api.Assertions.assertEquals
import org.junit.jupiter.api.{AfterEach, BeforeEach, Test}
import org.junit.jupiter.api.{AfterEach, BeforeEach, Tag, Test}
import org.junit.jupiter.params.ParameterizedTest
import org.junit.jupiter.params.provider.ValueSource
import java.sql.{Date, Timestamp}
import scala.collection.JavaConversions._
import scala.util.Random
class TestOptimizeTable extends HoodieClientTestBase {
var spark: SparkSession = null
@Tag("functional")
class TestTableLayoutOptimization extends HoodieClientTestBase {
var spark: SparkSession = _
val commonOpts = Map(
"hoodie.insert.shuffle.parallelism" -> "4",
@@ -67,11 +69,13 @@ class TestOptimizeTable extends HoodieClientTestBase {
@ParameterizedTest
@ValueSource(strings = Array("COPY_ON_WRITE", "MERGE_ON_READ"))
def testOptimizewithClustering(tableType: String): Unit = {
def testOptimizeWithClustering(tableType: String): Unit = {
val targetRecordsCount = 10000
// Bulk Insert Operation
val records1 = recordsToStrings(dataGen.generateInserts("001", 1000)).toList
val inputDF1: Dataset[Row] = spark.read.json(spark.sparkContext.parallelize(records1, 2))
inputDF1.write.format("org.apache.hudi")
val records = recordsToStrings(dataGen.generateInserts("001", targetRecordsCount)).toList
val writeDf: Dataset[Row] = spark.read.json(spark.sparkContext.parallelize(records, 2))
writeDf.write.format("org.apache.hudi")
.options(commonOpts)
.option("hoodie.compact.inline", "false")
.option(DataSourceWriteOptions.OPERATION.key(), DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL)
@@ -83,30 +87,41 @@ class TestOptimizeTable extends HoodieClientTestBase {
.option("hoodie.clustering.plan.strategy.target.file.max.bytes", "1073741824")
.option("hoodie.clustering.plan.strategy.small.file.limit", "629145600")
.option("hoodie.clustering.plan.strategy.max.bytes.per.group", Long.MaxValue.toString)
.option("hoodie.clustering.plan.strategy.target.file.max.bytes", String.valueOf(64 *1024 * 1024L))
.option("hoodie.clustering.plan.strategy.target.file.max.bytes", String.valueOf(64 * 1024 * 1024L))
.option(HoodieClusteringConfig.LAYOUT_OPTIMIZE_ENABLE.key, "true")
.option(HoodieClusteringConfig.PLAN_STRATEGY_SORT_COLUMNS.key, "begin_lat, begin_lon")
.mode(SaveMode.Overwrite)
.save(basePath)
assertEquals(1000, spark.read.format("hudi").load(basePath).count())
// use unsorted col as filter.
assertEquals(spark.read
.format("hudi").load(basePath).where("end_lat >= 0 and rider != '1' and weight > 0.0").count(),
spark.read.option(DataSourceReadOptions.ENABLE_DATA_SKIPPING.key(), "true")
.format("hudi").load(basePath).where("end_lat >= 0 and rider != '1' and weight > 0.0").count())
// use sorted col as filter.
assertEquals(spark.read.format("hudi").load(basePath)
.where("begin_lat >= 0.49 and begin_lat < 0.51 and begin_lon >= 0.49 and begin_lon < 0.51").count(),
spark.read.option(DataSourceReadOptions.ENABLE_DATA_SKIPPING.key(), "true")
.format("hudi").load(basePath)
.where("begin_lat >= 0.49 and begin_lat < 0.51 and begin_lon >= 0.49 and begin_lon < 0.51").count())
// use sorted cols and unsorted cols as filter
assertEquals(spark.read.format("hudi").load(basePath)
.where("begin_lat >= 0.49 and begin_lat < 0.51 and end_lat > 0.56").count(),
spark.read.option(DataSourceReadOptions.ENABLE_DATA_SKIPPING.key(), "true")
.format("hudi").load(basePath)
.where("begin_lat >= 0.49 and begin_lat < 0.51 and end_lat > 0.56").count())
val readDf =
spark.read
.format("hudi")
.load(basePath)
val readDfSkip =
spark.read
.option(DataSourceReadOptions.ENABLE_DATA_SKIPPING.key(), "true")
.format("hudi")
.load(basePath)
assertEquals(targetRecordsCount, readDf.count())
assertEquals(targetRecordsCount, readDfSkip.count())
readDf.createOrReplaceTempView("hudi_snapshot_raw")
readDfSkip.createOrReplaceTempView("hudi_snapshot_skipping")
def select(tableName: String) =
spark.sql(s"SELECT * FROM $tableName WHERE begin_lat >= 0.49 AND begin_lat < 0.51 AND begin_lon >= 0.49 AND begin_lon < 0.51")
assertRowsMatch(
select("hudi_snapshot_raw"),
select("hudi_snapshot_skipping")
)
}
def assertRowsMatch(one: DataFrame, other: DataFrame) = {
val rows = one.count()
assert(rows == other.count() && one.intersect(other).count() == rows)
}
@Test