1
0

[HUDI-3594] Supporting Composite Expressions over Data Table Columns in Data Skipping flow (#4996)

This commit is contained in:
Alexey Kudinkin
2022-03-24 22:27:15 -07:00
committed by GitHub
parent 8896864d7b
commit 8b38ddedc2
18 changed files with 1079 additions and 302 deletions

View File

@@ -24,9 +24,9 @@ import org.apache.hudi.common.util.ValidationUtils.checkArgument
import org.apache.hudi.common.util.{ClusteringUtils, Option => HOption}
import org.apache.hudi.config.HoodieClusteringConfig
import org.apache.hudi.exception.HoodieClusteringException
import org.apache.hudi.{AvroConversionUtils, HoodieCLIUtils, HoodieFileIndex}
import org.apache.hudi.{AvroConversionUtils, HoodieCLIUtils, HoodieFileIndex, SparkAdapterSupport}
import org.apache.spark.internal.Logging
import org.apache.spark.sql.{HoodieCatalystExpressionUtils, Row}
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.PredicateHelper
import org.apache.spark.sql.execution.datasources.FileStatusCache
import org.apache.spark.sql.types._
@@ -34,7 +34,14 @@ import org.apache.spark.sql.types._
import java.util.function.Supplier
import scala.collection.JavaConverters._
class RunClusteringProcedure extends BaseProcedure with ProcedureBuilder with PredicateHelper with Logging {
class RunClusteringProcedure extends BaseProcedure
with ProcedureBuilder
with PredicateHelper
with Logging
with SparkAdapterSupport {
private val exprUtils = sparkAdapter.createCatalystExpressionUtils()
/**
* OPTIMIZE table_name|table_path [WHERE predicate]
* [ORDER BY (col_name1 [, ...] ) ]
@@ -120,9 +127,9 @@ class RunClusteringProcedure extends BaseProcedure with ProcedureBuilder with Pr
// Resolve partition predicates
val schemaResolver = new TableSchemaResolver(metaClient)
val tableSchema = AvroConversionUtils.convertAvroSchemaToStructType(schemaResolver.getTableAvroSchema)
val condition = HoodieCatalystExpressionUtils.resolveFilterExpr(sparkSession, predicate, tableSchema)
val condition = exprUtils.resolveExpr(sparkSession, predicate, tableSchema)
val partitionColumns = metaClient.getTableConfig.getPartitionFields.orElse(Array[String]())
val (partitionPredicates, dataPredicates) = HoodieCatalystExpressionUtils.splitPartitionAndDataPredicates(
val (partitionPredicates, dataPredicates) = exprUtils.splitPartitionAndDataPredicates(
sparkSession, splitConjunctivePredicates(condition).toArray, partitionColumns)
checkArgument(dataPredicates.isEmpty, "Only partition predicates are allowed")

View File

@@ -20,30 +20,44 @@ package org.apache.hudi
import org.apache.hudi.index.columnstats.ColumnStatsIndexHelper
import org.apache.hudi.testutils.HoodieClientTestBase
import org.apache.spark.sql.catalyst.expressions.{Expression, Not}
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.functions.{col, lower}
import org.apache.spark.sql.hudi.DataSkippingUtils
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType, VarcharType}
import org.apache.spark.sql.{Column, HoodieCatalystExpressionUtils, SparkSession}
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Column, HoodieCatalystExpressionUtils, Row, SparkSession}
import org.junit.jupiter.api.Assertions.assertEquals
import org.junit.jupiter.api.BeforeEach
import org.junit.jupiter.params.ParameterizedTest
import org.junit.jupiter.params.provider.Arguments.arguments
import org.junit.jupiter.params.provider.{Arguments, MethodSource}
import java.sql.Timestamp
import scala.collection.JavaConverters._
// NOTE: Only A, B, C columns are indexed
case class IndexRow(
file: String,
A_minValue: Long,
A_maxValue: Long,
A_num_nulls: Long,
// Corresponding A column is LongType
A_minValue: Long = -1,
A_maxValue: Long = -1,
A_num_nulls: Long = -1,
// Corresponding B column is StringType
B_minValue: String = null,
B_maxValue: String = null,
B_num_nulls: Long = -1
)
B_num_nulls: Long = -1,
class TestDataSkippingUtils extends HoodieClientTestBase {
// Corresponding C column is TimestampType
C_minValue: Timestamp = null,
C_maxValue: Timestamp = null,
C_num_nulls: Long = -1
) {
def toRow: Row = Row(productIterator.toSeq: _*)
}
class TestDataSkippingUtils extends HoodieClientTestBase with SparkAdapterSupport {
val exprUtils: HoodieCatalystExpressionUtils = sparkAdapter.createCatalystExpressionUtils()
var spark: SparkSession = _
@@ -53,17 +67,18 @@ class TestDataSkippingUtils extends HoodieClientTestBase {
spark = sqlContext.sparkSession
}
val indexedCols = Seq("A", "B")
val sourceTableSchema =
val indexedCols: Seq[String] = Seq("A", "B", "C")
val sourceTableSchema: StructType =
StructType(
Seq(
StructField("A", LongType),
StructField("B", StringType),
StructField("C", VarcharType(32))
StructField("C", TimestampType),
StructField("D", VarcharType(32))
)
)
val indexSchema =
val indexSchema: StructType =
ColumnStatsIndexHelper.composeIndexSchema(
sourceTableSchema.fields.toSeq
.filter(f => indexedCols.contains(f.name))
@@ -71,15 +86,17 @@ class TestDataSkippingUtils extends HoodieClientTestBase {
)
@ParameterizedTest
@MethodSource(Array("testBaseLookupFilterExpressionsSource", "testAdvancedLookupFilterExpressionsSource"))
@MethodSource(
Array(
"testBasicLookupFilterExpressionsSource",
"testAdvancedLookupFilterExpressionsSource",
"testCompositeFilterExpressionsSource"
))
def testLookupFilterExpressions(sourceExpr: String, input: Seq[IndexRow], output: Seq[String]): Unit = {
val resolvedExpr: Expression = HoodieCatalystExpressionUtils.resolveFilterExpr(spark, sourceExpr, sourceTableSchema)
val resolvedExpr: Expression = exprUtils.resolveExpr(spark, sourceExpr, sourceTableSchema)
val lookupFilter = DataSkippingUtils.translateIntoColumnStatsIndexFilterExpr(resolvedExpr, indexSchema)
val spark2 = spark
import spark2.implicits._
val indexDf = spark.createDataset(input)
val indexDf = spark.createDataFrame(input.map(_.toRow).asJava, indexSchema)
val rows = indexDf.where(new Column(lookupFilter))
.select("file")
@@ -93,7 +110,7 @@ class TestDataSkippingUtils extends HoodieClientTestBase {
@ParameterizedTest
@MethodSource(Array("testStringsLookupFilterExpressionsSource"))
def testStringsLookupFilterExpressions(sourceExpr: Expression, input: Seq[IndexRow], output: Seq[String]): Unit = {
val resolvedExpr = HoodieCatalystExpressionUtils.resolveFilterExpr(spark, sourceExpr, sourceTableSchema)
val resolvedExpr = exprUtils.resolveExpr(spark, sourceExpr, sourceTableSchema)
val lookupFilter = DataSkippingUtils.translateIntoColumnStatsIndexFilterExpr(resolvedExpr, indexSchema)
val spark2 = spark
@@ -130,11 +147,21 @@ object TestDataSkippingUtils {
IndexRow("file_3", 0, 0, 0, "aaa", "aba", 0),
IndexRow("file_4", 0, 0, 0, "abc123", "abc345", 0) // all strings start w/ "abc"
),
Seq("file_1", "file_2", "file_3")),
arguments(
// Composite expression
Not(lower(col("B")).startsWith("abc").expr),
Seq(
IndexRow("file_1", 0, 0, 0, "ABA", "ADF", 1), // may contain strings starting w/ "ABC" (i.e. "abc" after lower-casing)
IndexRow("file_2", 0, 0, 0, "ADF", "AZY", 0),
IndexRow("file_3", 0, 0, 0, "AAA", "ABA", 0),
IndexRow("file_4", 0, 0, 0, "ABC123", "ABC345", 0) // all strings start w/ "ABC" (i.e. "abc" after lower-casing)
),
Seq("file_1", "file_2", "file_3"))
)
}
def testBaseLookupFilterExpressionsSource(): java.util.stream.Stream[Arguments] = {
def testBasicLookupFilterExpressionsSource(): java.util.stream.Stream[Arguments] = {
java.util.stream.Stream.of(
// TODO cases
// A = null
@@ -263,6 +290,23 @@ object TestDataSkippingUtils {
IndexRow("file_4", 0, 0, 0), // only contains 0
IndexRow("file_5", 1, 1, 0) // only contains 1
),
Seq("file_1", "file_2", "file_3")),
arguments(
// Value expression containing expression, which isn't a literal
"A = int('0')",
Seq(
IndexRow("file_1", 1, 2, 0),
IndexRow("file_2", -1, 1, 0)
),
Seq("file_2")),
arguments(
// Value expression containing reference to the other attribute (column), fallback
"A = D",
Seq(
IndexRow("file_1", 1, 2, 0),
IndexRow("file_2", -1, 1, 0),
IndexRow("file_3", -2, -1, 0)
),
Seq("file_1", "file_2", "file_3"))
)
}
@@ -316,8 +360,8 @@ object TestDataSkippingUtils {
Seq("file_1", "file_2", "file_3")),
arguments(
// Queries contains expression involving non-indexed column C
"A = 0 AND B = 'abc' AND C = '...'",
// Queries contains expression involving non-indexed column D
"A = 0 AND B = 'abc' AND D IS NULL",
Seq(
IndexRow("file_1", 1, 2, 0),
IndexRow("file_2", -1, 1, 0),
@@ -327,8 +371,8 @@ object TestDataSkippingUtils {
Seq("file_4")),
arguments(
// Queries contains expression involving non-indexed column C
"A = 0 OR B = 'abc' OR C = '...'",
// Queries contains expression involving non-indexed column D
"A = 0 OR B = 'abc' OR D IS NULL",
Seq(
IndexRow("file_1", 1, 2, 0),
IndexRow("file_2", -1, 1, 0),
@@ -338,4 +382,206 @@ object TestDataSkippingUtils {
Seq("file_1", "file_2", "file_3", "file_4"))
)
}
def testCompositeFilterExpressionsSource(): java.util.stream.Stream[Arguments] = {
java.util.stream.Stream.of(
arguments(
"date_format(C, 'MM/dd/yyyy') = '03/06/2022'",
Seq(
IndexRow("file_1",
C_minValue = new Timestamp(1646711448000L), // 03/07/2022
C_maxValue = new Timestamp(1646797848000L), // 03/08/2022
C_num_nulls = 0),
IndexRow("file_2",
C_minValue = new Timestamp(1646625048000L), // 03/06/2022
C_maxValue = new Timestamp(1646711448000L), // 03/07/2022
C_num_nulls = 0)
),
Seq("file_2")),
arguments(
"'03/06/2022' = date_format(C, 'MM/dd/yyyy')",
Seq(
IndexRow("file_1",
C_minValue = new Timestamp(1646711448000L), // 03/07/2022
C_maxValue = new Timestamp(1646797848000L), // 03/08/2022
C_num_nulls = 0),
IndexRow("file_2",
C_minValue = new Timestamp(1646625048000L), // 03/06/2022
C_maxValue = new Timestamp(1646711448000L), // 03/07/2022
C_num_nulls = 0)
),
Seq("file_2")),
arguments(
"'03/06/2022' != date_format(C, 'MM/dd/yyyy')",
Seq(
IndexRow("file_1",
C_minValue = new Timestamp(1646711448000L), // 03/07/2022
C_maxValue = new Timestamp(1646797848000L), // 03/08/2022
C_num_nulls = 0),
IndexRow("file_2",
C_minValue = new Timestamp(1646625048000L), // 03/06/2022
C_maxValue = new Timestamp(1646625048000L), // 03/06/2022
C_num_nulls = 0)
),
Seq("file_1")),
arguments(
"date_format(C, 'MM/dd/yyyy') != '03/06/2022'",
Seq(
IndexRow("file_1",
C_minValue = new Timestamp(1646711448000L), // 03/07/2022
C_maxValue = new Timestamp(1646797848000L), // 03/08/2022
C_num_nulls = 0),
IndexRow("file_2",
C_minValue = new Timestamp(1646625048000L), // 03/06/2022
C_maxValue = new Timestamp(1646625048000L), // 03/06/2022
C_num_nulls = 0)
),
Seq("file_1")),
arguments(
"date_format(C, 'MM/dd/yyyy') < '03/07/2022'",
Seq(
IndexRow("file_1",
C_minValue = new Timestamp(1646711448000L), // 03/07/2022
C_maxValue = new Timestamp(1646797848000L), // 03/08/2022
C_num_nulls = 0),
IndexRow("file_2",
C_minValue = new Timestamp(1646625048000L), // 03/06/2022
C_maxValue = new Timestamp(1646711448000L), // 03/07/2022
C_num_nulls = 0)
),
Seq("file_2")),
arguments(
"'03/07/2022' > date_format(C, 'MM/dd/yyyy')",
Seq(
IndexRow("file_1",
C_minValue = new Timestamp(1646711448000L), // 03/07/2022
C_maxValue = new Timestamp(1646797848000L), // 03/08/2022
C_num_nulls = 0),
IndexRow("file_2",
C_minValue = new Timestamp(1646625048000L), // 03/06/2022
C_maxValue = new Timestamp(1646711448000L), // 03/07/2022
C_num_nulls = 0)
),
Seq("file_2")),
arguments(
"'03/07/2022' < date_format(C, 'MM/dd/yyyy')",
Seq(
IndexRow("file_1",
C_minValue = new Timestamp(1646711448000L), // 03/07/2022
C_maxValue = new Timestamp(1646797848000L), // 03/08/2022
C_num_nulls = 0),
IndexRow("file_2",
C_minValue = new Timestamp(1646625048000L), // 03/06/2022
C_maxValue = new Timestamp(1646711448000L), // 03/07/2022
C_num_nulls = 0)
),
Seq("file_1")),
arguments(
"date_format(C, 'MM/dd/yyyy') > '03/07/2022'",
Seq(
IndexRow("file_1",
C_minValue = new Timestamp(1646711448000L), // 03/07/2022
C_maxValue = new Timestamp(1646797848000L), // 03/08/2022
C_num_nulls = 0),
IndexRow("file_2",
C_minValue = new Timestamp(1646625048000L), // 03/06/2022
C_maxValue = new Timestamp(1646711448000L), // 03/07/2022
C_num_nulls = 0)
),
Seq("file_1")),
arguments(
"date_format(C, 'MM/dd/yyyy') <= '03/06/2022'",
Seq(
IndexRow("file_1",
C_minValue = new Timestamp(1646711448000L), // 03/07/2022
C_maxValue = new Timestamp(1646797848000L), // 03/08/2022
C_num_nulls = 0),
IndexRow("file_2",
C_minValue = new Timestamp(1646625048000L), // 03/06/2022
C_maxValue = new Timestamp(1646711448000L), // 03/07/2022
C_num_nulls = 0)
),
Seq("file_2")),
arguments(
"'03/06/2022' >= date_format(C, 'MM/dd/yyyy')",
Seq(
IndexRow("file_1",
C_minValue = new Timestamp(1646711448000L), // 03/07/2022
C_maxValue = new Timestamp(1646797848000L), // 03/08/2022
C_num_nulls = 0),
IndexRow("file_2",
C_minValue = new Timestamp(1646625048000L), // 03/06/2022
C_maxValue = new Timestamp(1646711448000L), // 03/07/2022
C_num_nulls = 0)
),
Seq("file_2")),
arguments(
"'03/08/2022' <= date_format(C, 'MM/dd/yyyy')",
Seq(
IndexRow("file_1",
C_minValue = new Timestamp(1646711448000L), // 03/07/2022
C_maxValue = new Timestamp(1646797848000L), // 03/08/2022
C_num_nulls = 0),
IndexRow("file_2",
C_minValue = new Timestamp(1646625048000L), // 03/06/2022
C_maxValue = new Timestamp(1646711448000L), // 03/07/2022
C_num_nulls = 0)
),
Seq("file_1")),
arguments(
"date_format(C, 'MM/dd/yyyy') >= '03/08/2022'",
Seq(
IndexRow("file_1",
C_minValue = new Timestamp(1646711448000L), // 03/07/2022
C_maxValue = new Timestamp(1646797848000L), // 03/08/2022
C_num_nulls = 0),
IndexRow("file_2",
C_minValue = new Timestamp(1646625048000L), // 03/06/2022
C_maxValue = new Timestamp(1646711448000L), // 03/07/2022
C_num_nulls = 0)
),
Seq("file_1")),
arguments(
"date_format(C, 'MM/dd/yyyy') IN ('03/08/2022')",
Seq(
IndexRow("file_1",
C_minValue = new Timestamp(1646711448000L), // 03/07/2022
C_maxValue = new Timestamp(1646797848000L), // 03/08/2022
C_num_nulls = 0),
IndexRow("file_2",
C_minValue = new Timestamp(1646625048000L), // 03/06/2022
C_maxValue = new Timestamp(1646711448000L), // 03/07/2022
C_num_nulls = 0)
),
Seq("file_1")),
arguments(
"date_format(C, 'MM/dd/yyyy') NOT IN ('03/06/2022')",
Seq(
IndexRow("file_1",
C_minValue = new Timestamp(1646711448000L), // 03/07/2022
C_maxValue = new Timestamp(1646797848000L), // 03/08/2022
C_num_nulls = 0),
IndexRow("file_2",
C_minValue = new Timestamp(1646625048000L), // 03/06/2022
C_maxValue = new Timestamp(1646625048000L), // 03/06/2022
C_num_nulls = 0)
),
Seq("file_1")),
arguments(
// Should be identical to the one above
"date_format(to_timestamp(B, 'yyyy-MM-dd'), 'MM/dd/yyyy') NOT IN ('03/06/2022')",
Seq(
IndexRow("file_1",
B_minValue = "2022-03-07", // 03/07/2022
B_maxValue = "2022-03-08", // 03/08/2022
B_num_nulls = 0),
IndexRow("file_2",
B_minValue = "2022-03-06", // 03/06/2022
B_maxValue = "2022-03-06", // 03/06/2022
B_num_nulls = 0)
),
Seq("file_1"))
)
}
}