1
0

[MINOR] Fix expected dates to match UTC in TestDataSkippingUtils (#5166)

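* Fix timezone in test: set the Spark session-local time zone to UTC and update the expected dates (filter literals and comments) to their UTC values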
This commit is contained in:
Sagar Sumit
2022-03-30 20:03:14 +05:30
committed by GitHub
parent b9fbada2f2
commit 04478a45d9

View File

@@ -22,6 +22,7 @@ import org.apache.hudi.testutils.HoodieClientTestBase
import org.apache.spark.sql.catalyst.expressions.{Expression, Not} import org.apache.spark.sql.catalyst.expressions.{Expression, Not}
import org.apache.spark.sql.functions.{col, lower} import org.apache.spark.sql.functions.{col, lower}
import org.apache.spark.sql.hudi.DataSkippingUtils import org.apache.spark.sql.hudi.DataSkippingUtils
import org.apache.spark.sql.internal.SQLConf.SESSION_LOCAL_TIMEZONE
import org.apache.spark.sql.types._ import org.apache.spark.sql.types._
import org.apache.spark.sql.{Column, HoodieCatalystExpressionUtils, Row, SparkSession} import org.apache.spark.sql.{Column, HoodieCatalystExpressionUtils, Row, SparkSession}
import org.junit.jupiter.api.Assertions.assertEquals import org.junit.jupiter.api.Assertions.assertEquals
@@ -93,6 +94,7 @@ class TestDataSkippingUtils extends HoodieClientTestBase with SparkAdapterSuppor
"testCompositeFilterExpressionsSource" "testCompositeFilterExpressionsSource"
)) ))
def testLookupFilterExpressions(sourceExpr: String, input: Seq[IndexRow], output: Seq[String]): Unit = { def testLookupFilterExpressions(sourceExpr: String, input: Seq[IndexRow], output: Seq[String]): Unit = {
spark.sqlContext.setConf(SESSION_LOCAL_TIMEZONE.key, "UTC")
val resolvedExpr: Expression = exprUtils.resolveExpr(spark, sourceExpr, sourceTableSchema) val resolvedExpr: Expression = exprUtils.resolveExpr(spark, sourceExpr, sourceTableSchema)
val lookupFilter = DataSkippingUtils.translateIntoColumnStatsIndexFilterExpr(resolvedExpr, indexSchema) val lookupFilter = DataSkippingUtils.translateIntoColumnStatsIndexFilterExpr(resolvedExpr, indexSchema)
@@ -384,186 +386,187 @@ object TestDataSkippingUtils {
} }
def testCompositeFilterExpressionsSource(): java.util.stream.Stream[Arguments] = { def testCompositeFilterExpressionsSource(): java.util.stream.Stream[Arguments] = {
// NOTE: all timestamps in UTC
java.util.stream.Stream.of( java.util.stream.Stream.of(
arguments( arguments(
"date_format(C, 'MM/dd/yyyy') = '03/06/2022'", "date_format(C, 'MM/dd/yyyy') = '03/07/2022'",
Seq( Seq(
IndexRow("file_1", IndexRow("file_1",
C_minValue = new Timestamp(1646711448000L), // 03/07/2022 C_minValue = new Timestamp(1646711448000L), // 03/08/2022
C_maxValue = new Timestamp(1646797848000L), // 03/08/2022 C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
C_num_nulls = 0), C_num_nulls = 0),
IndexRow("file_2", IndexRow("file_2",
C_minValue = new Timestamp(1646625048000L), // 03/06/2022 C_minValue = new Timestamp(1646625048000L), // 03/07/2022
C_maxValue = new Timestamp(1646711448000L), // 03/07/2022 C_maxValue = new Timestamp(1646711448000L), // 03/08/2022
C_num_nulls = 0) C_num_nulls = 0)
), ),
Seq("file_2")), Seq("file_2")),
arguments( arguments(
"'03/06/2022' = date_format(C, 'MM/dd/yyyy')", "'03/07/2022' = date_format(C, 'MM/dd/yyyy')",
Seq( Seq(
IndexRow("file_1", IndexRow("file_1",
C_minValue = new Timestamp(1646711448000L), // 03/07/2022 C_minValue = new Timestamp(1646711448000L), // 03/08/2022
C_maxValue = new Timestamp(1646797848000L), // 03/08/2022 C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
C_num_nulls = 0), C_num_nulls = 0),
IndexRow("file_2", IndexRow("file_2",
C_minValue = new Timestamp(1646625048000L), // 03/06/2022 C_minValue = new Timestamp(1646625048000L), // 03/07/2022
C_maxValue = new Timestamp(1646711448000L), // 03/07/2022 C_maxValue = new Timestamp(1646711448000L), // 03/08/2022
C_num_nulls = 0) C_num_nulls = 0)
), ),
Seq("file_2")), Seq("file_2")),
arguments( arguments(
"'03/06/2022' != date_format(C, 'MM/dd/yyyy')", "'03/07/2022' != date_format(C, 'MM/dd/yyyy')",
Seq( Seq(
IndexRow("file_1", IndexRow("file_1",
C_minValue = new Timestamp(1646711448000L), // 03/07/2022 C_minValue = new Timestamp(1646711448000L), // 03/08/2022
C_maxValue = new Timestamp(1646797848000L), // 03/08/2022 C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
C_num_nulls = 0), C_num_nulls = 0),
IndexRow("file_2", IndexRow("file_2",
C_minValue = new Timestamp(1646625048000L), // 03/06/2022 C_minValue = new Timestamp(1646625048000L), // 03/07/2022
C_maxValue = new Timestamp(1646625048000L), // 03/06/2022 C_maxValue = new Timestamp(1646625048000L), // 03/07/2022
C_num_nulls = 0) C_num_nulls = 0)
), ),
Seq("file_1")), Seq("file_1")),
arguments( arguments(
"date_format(C, 'MM/dd/yyyy') != '03/06/2022'", "date_format(C, 'MM/dd/yyyy') != '03/07/2022'",
Seq( Seq(
IndexRow("file_1", IndexRow("file_1",
C_minValue = new Timestamp(1646711448000L), // 03/07/2022 C_minValue = new Timestamp(1646711448000L), // 03/08/2022
C_maxValue = new Timestamp(1646797848000L), // 03/08/2022 C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
C_num_nulls = 0), C_num_nulls = 0),
IndexRow("file_2", IndexRow("file_2",
C_minValue = new Timestamp(1646625048000L), // 03/06/2022 C_minValue = new Timestamp(1646625048000L), // 03/07/2022
C_maxValue = new Timestamp(1646625048000L), // 03/06/2022 C_maxValue = new Timestamp(1646625048000L), // 03/07/2022
C_num_nulls = 0) C_num_nulls = 0)
), ),
Seq("file_1")), Seq("file_1")),
arguments( arguments(
"date_format(C, 'MM/dd/yyyy') < '03/07/2022'", "date_format(C, 'MM/dd/yyyy') < '03/08/2022'",
Seq( Seq(
IndexRow("file_1", IndexRow("file_1",
C_minValue = new Timestamp(1646711448000L), // 03/07/2022 C_minValue = new Timestamp(1646711448000L), // 03/08/2022
C_maxValue = new Timestamp(1646797848000L), // 03/08/2022 C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
C_num_nulls = 0), C_num_nulls = 0),
IndexRow("file_2", IndexRow("file_2",
C_minValue = new Timestamp(1646625048000L), // 03/06/2022 C_minValue = new Timestamp(1646625048000L), // 03/07/2022
C_maxValue = new Timestamp(1646711448000L), // 03/07/2022 C_maxValue = new Timestamp(1646711448000L), // 03/08/2022
C_num_nulls = 0) C_num_nulls = 0)
), ),
Seq("file_2")), Seq("file_2")),
arguments( arguments(
"'03/07/2022' > date_format(C, 'MM/dd/yyyy')", "'03/08/2022' > date_format(C, 'MM/dd/yyyy')",
Seq( Seq(
IndexRow("file_1", IndexRow("file_1",
C_minValue = new Timestamp(1646711448000L), // 03/07/2022 C_minValue = new Timestamp(1646711448000L), // 03/08/2022
C_maxValue = new Timestamp(1646797848000L), // 03/08/2022 C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
C_num_nulls = 0), C_num_nulls = 0),
IndexRow("file_2", IndexRow("file_2",
C_minValue = new Timestamp(1646625048000L), // 03/06/2022 C_minValue = new Timestamp(1646625048000L), // 03/07/2022
C_maxValue = new Timestamp(1646711448000L), // 03/07/2022 C_maxValue = new Timestamp(1646711448000L), // 03/08/2022
C_num_nulls = 0) C_num_nulls = 0)
), ),
Seq("file_2")), Seq("file_2")),
arguments( arguments(
"'03/07/2022' < date_format(C, 'MM/dd/yyyy')", "'03/08/2022' < date_format(C, 'MM/dd/yyyy')",
Seq( Seq(
IndexRow("file_1", IndexRow("file_1",
C_minValue = new Timestamp(1646711448000L), // 03/07/2022 C_minValue = new Timestamp(1646711448000L), // 03/08/2022
C_maxValue = new Timestamp(1646797848000L), // 03/08/2022 C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
C_num_nulls = 0), C_num_nulls = 0),
IndexRow("file_2", IndexRow("file_2",
C_minValue = new Timestamp(1646625048000L), // 03/06/2022 C_minValue = new Timestamp(1646625048000L), // 03/07/2022
C_maxValue = new Timestamp(1646711448000L), // 03/07/2022 C_maxValue = new Timestamp(1646711448000L), // 03/08/2022
C_num_nulls = 0) C_num_nulls = 0)
), ),
Seq("file_1")), Seq("file_1")),
arguments( arguments(
"date_format(C, 'MM/dd/yyyy') > '03/07/2022'", "date_format(C, 'MM/dd/yyyy') > '03/08/2022'",
Seq( Seq(
IndexRow("file_1", IndexRow("file_1",
C_minValue = new Timestamp(1646711448000L), // 03/07/2022 C_minValue = new Timestamp(1646711448000L), // 03/08/2022
C_maxValue = new Timestamp(1646797848000L), // 03/08/2022 C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
C_num_nulls = 0), C_num_nulls = 0),
IndexRow("file_2", IndexRow("file_2",
C_minValue = new Timestamp(1646625048000L), // 03/06/2022 C_minValue = new Timestamp(1646625048000L), // 03/07/2022
C_maxValue = new Timestamp(1646711448000L), // 03/07/2022 C_maxValue = new Timestamp(1646711448000L), // 03/08/2022
C_num_nulls = 0) C_num_nulls = 0)
), ),
Seq("file_1")), Seq("file_1")),
arguments( arguments(
"date_format(C, 'MM/dd/yyyy') <= '03/06/2022'", "date_format(C, 'MM/dd/yyyy') <= '03/07/2022'",
Seq( Seq(
IndexRow("file_1", IndexRow("file_1",
C_minValue = new Timestamp(1646711448000L), // 03/07/2022 C_minValue = new Timestamp(1646711448000L), // 03/08/2022
C_maxValue = new Timestamp(1646797848000L), // 03/08/2022 C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
C_num_nulls = 0), C_num_nulls = 0),
IndexRow("file_2", IndexRow("file_2",
C_minValue = new Timestamp(1646625048000L), // 03/06/2022 C_minValue = new Timestamp(1646625048000L), // 03/07/2022
C_maxValue = new Timestamp(1646711448000L), // 03/07/2022 C_maxValue = new Timestamp(1646711448000L), // 03/08/2022
C_num_nulls = 0) C_num_nulls = 0)
), ),
Seq("file_2")), Seq("file_2")),
arguments( arguments(
"'03/06/2022' >= date_format(C, 'MM/dd/yyyy')", "'03/07/2022' >= date_format(C, 'MM/dd/yyyy')",
Seq( Seq(
IndexRow("file_1", IndexRow("file_1",
C_minValue = new Timestamp(1646711448000L), // 03/07/2022 C_minValue = new Timestamp(1646711448000L), // 03/08/2022
C_maxValue = new Timestamp(1646797848000L), // 03/08/2022 C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
C_num_nulls = 0), C_num_nulls = 0),
IndexRow("file_2", IndexRow("file_2",
C_minValue = new Timestamp(1646625048000L), // 03/06/2022 C_minValue = new Timestamp(1646625048000L), // 03/07/2022
C_maxValue = new Timestamp(1646711448000L), // 03/07/2022 C_maxValue = new Timestamp(1646711448000L), // 03/08/2022
C_num_nulls = 0) C_num_nulls = 0)
), ),
Seq("file_2")), Seq("file_2")),
arguments( arguments(
"'03/08/2022' <= date_format(C, 'MM/dd/yyyy')", "'03/09/2022' <= date_format(C, 'MM/dd/yyyy')",
Seq( Seq(
IndexRow("file_1", IndexRow("file_1",
C_minValue = new Timestamp(1646711448000L), // 03/07/2022 C_minValue = new Timestamp(1646711448000L), // 03/08/2022
C_maxValue = new Timestamp(1646797848000L), // 03/08/2022 C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
C_num_nulls = 0), C_num_nulls = 0),
IndexRow("file_2", IndexRow("file_2",
C_minValue = new Timestamp(1646625048000L), // 03/06/2022 C_minValue = new Timestamp(1646625048000L), // 03/07/2022
C_maxValue = new Timestamp(1646711448000L), // 03/07/2022 C_maxValue = new Timestamp(1646711448000L), // 03/08/2022
C_num_nulls = 0) C_num_nulls = 0)
), ),
Seq("file_1")), Seq("file_1")),
arguments( arguments(
"date_format(C, 'MM/dd/yyyy') >= '03/08/2022'", "date_format(C, 'MM/dd/yyyy') >= '03/09/2022'",
Seq( Seq(
IndexRow("file_1", IndexRow("file_1",
C_minValue = new Timestamp(1646711448000L), // 03/07/2022 C_minValue = new Timestamp(1646711448000L), // 03/08/2022
C_maxValue = new Timestamp(1646797848000L), // 03/08/2022 C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
C_num_nulls = 0), C_num_nulls = 0),
IndexRow("file_2", IndexRow("file_2",
C_minValue = new Timestamp(1646625048000L), // 03/06/2022 C_minValue = new Timestamp(1646625048000L), // 03/07/2022
C_maxValue = new Timestamp(1646711448000L), // 03/07/2022 C_maxValue = new Timestamp(1646711448000L), // 03/08/2022
C_num_nulls = 0) C_num_nulls = 0)
), ),
Seq("file_1")), Seq("file_1")),
arguments( arguments(
"date_format(C, 'MM/dd/yyyy') IN ('03/08/2022')", "date_format(C, 'MM/dd/yyyy') IN ('03/09/2022')",
Seq( Seq(
IndexRow("file_1", IndexRow("file_1",
C_minValue = new Timestamp(1646711448000L), // 03/07/2022 C_minValue = new Timestamp(1646711448000L), // 03/08/2022
C_maxValue = new Timestamp(1646797848000L), // 03/08/2022 C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
C_num_nulls = 0), C_num_nulls = 0),
IndexRow("file_2", IndexRow("file_2",
C_minValue = new Timestamp(1646625048000L), // 03/06/2022 C_minValue = new Timestamp(1646625048000L), // 03/07/2022
C_maxValue = new Timestamp(1646711448000L), // 03/07/2022 C_maxValue = new Timestamp(1646711448000L), // 03/08/2022
C_num_nulls = 0) C_num_nulls = 0)
), ),
Seq("file_1")), Seq("file_1")),
arguments( arguments(
"date_format(C, 'MM/dd/yyyy') NOT IN ('03/06/2022')", "date_format(C, 'MM/dd/yyyy') NOT IN ('03/07/2022')",
Seq( Seq(
IndexRow("file_1", IndexRow("file_1",
C_minValue = new Timestamp(1646711448000L), // 03/07/2022 C_minValue = new Timestamp(1646711448000L), // 03/08/2022
C_maxValue = new Timestamp(1646797848000L), // 03/08/2022 C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
C_num_nulls = 0), C_num_nulls = 0),
IndexRow("file_2", IndexRow("file_2",
C_minValue = new Timestamp(1646625048000L), // 03/06/2022 C_minValue = new Timestamp(1646625048000L), // 03/07/2022
C_maxValue = new Timestamp(1646625048000L), // 03/06/2022 C_maxValue = new Timestamp(1646625048000L), // 03/07/2022
C_num_nulls = 0) C_num_nulls = 0)
), ),
Seq("file_1")), Seq("file_1")),