[HUDI-3739] Fix handling of the isNotNull predicate in Data Skipping (#5224)
- Fix handling of the isNotNull predicate in Data Skipping
This commit is contained in:
@@ -130,7 +130,7 @@ public class ColumnStatsIndexHelper {
|
||||
*
|
||||
* <pre>
|
||||
* +---------------------------+------------+------------+-------------+
|
||||
* | file | A_minValue | A_maxValue | A_num_nulls |
|
||||
* | file | A_minValue | A_maxValue | A_nullCount |
|
||||
* +---------------------------+------------+------------+-------------+
|
||||
* | one_base_file.parquet | 1 | 10 | 0 |
|
||||
* | another_base_file.parquet | -10 | 0 | 5 |
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
{"c1_maxValue":769,"c1_minValue":309,"c1_num_nulls":0,"c2_maxValue":" 769sdc","c2_minValue":" 309sdc","c2_num_nulls":0,"c3_maxValue":919.769,"c3_minValue":76.430,"c3_num_nulls":0,"c4_maxValue":"2021-11-19T20:40:55.543-08:00","c4_minValue":"2021-11-19T20:40:55.521-08:00","c4_num_nulls":0,"c5_maxValue":78,"c5_minValue":32,"c5_num_nulls":0,"c6_maxValue":"2020-11-14","c6_minValue":"2020-01-08","c6_num_nulls":0,"c7_maxValue":"uQ==","c7_minValue":"AQ==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0}
|
||||
{"c1_maxValue":932,"c1_minValue":0,"c1_num_nulls":0,"c2_maxValue":" 932sdc","c2_minValue":" 0sdc","c2_num_nulls":0,"c3_maxValue":994.355,"c3_minValue":19.000,"c3_num_nulls":0,"c4_maxValue":"2021-11-19T20:40:55.549-08:00","c4_minValue":"2021-11-19T20:40:55.339-08:00","c4_num_nulls":0,"c5_maxValue":94,"c5_minValue":1,"c5_num_nulls":0,"c6_maxValue":"2020-09-09","c6_minValue":"2020-01-01","c6_num_nulls":0,"c7_maxValue":"xw==","c7_minValue":"AA==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0}
|
||||
{"c1_maxValue":943,"c1_minValue":89,"c1_num_nulls":0,"c2_maxValue":" 943sdc","c2_minValue":" 200sdc","c2_num_nulls":0,"c3_maxValue":854.690,"c3_minValue":100.556,"c3_num_nulls":0,"c4_maxValue":"2021-11-19T20:40:55.549-08:00","c4_minValue":"2021-11-19T20:40:55.508-08:00","c4_num_nulls":0,"c5_maxValue":95,"c5_minValue":10,"c5_num_nulls":0,"c6_maxValue":"2020-10-10","c6_minValue":"2020-01-10","c6_num_nulls":0,"c7_maxValue":"yA==","c7_minValue":"LA==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0}
|
||||
{"c1_maxValue":959,"c1_minValue":74,"c1_num_nulls":0,"c2_maxValue":" 959sdc","c2_minValue":" 181sdc","c2_num_nulls":0,"c3_maxValue":980.213,"c3_minValue":38.740,"c3_num_nulls":0,"c4_maxValue":"2021-11-19T20:40:55.550-08:00","c4_minValue":"2021-11-19T20:40:55.507-08:00","c4_num_nulls":0,"c5_maxValue":97,"c5_minValue":9,"c5_num_nulls":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-23","c6_num_nulls":0,"c7_maxValue":"1Q==","c7_minValue":"Kw==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0}
|
||||
{"c1_maxValue":769,"c1_minValue":309,"c1_nullCount":0,"c2_maxValue":" 769sdc","c2_minValue":" 309sdc","c2_nullCount":0,"c3_maxValue":919.769,"c3_minValue":76.430,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.543-08:00","c4_minValue":"2021-11-19T20:40:55.521-08:00","c4_nullCount":0,"c5_maxValue":78,"c5_minValue":32,"c5_nullCount":0,"c6_maxValue":"2020-11-14","c6_minValue":"2020-01-08","c6_nullCount":0,"c7_maxValue":"uQ==","c7_minValue":"AQ==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":9}
|
||||
{"c1_maxValue":932,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 932sdc","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":994.355,"c3_minValue":19.000,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.549-08:00","c4_minValue":"2021-11-19T20:40:55.339-08:00","c4_nullCount":0,"c5_maxValue":94,"c5_minValue":1,"c5_nullCount":0,"c6_maxValue":"2020-09-09","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"xw==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":8}
|
||||
{"c1_maxValue":943,"c1_minValue":89,"c1_nullCount":0,"c2_maxValue":" 943sdc","c2_minValue":" 200sdc","c2_nullCount":0,"c3_maxValue":854.690,"c3_minValue":100.556,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.549-08:00","c4_minValue":"2021-11-19T20:40:55.508-08:00","c4_nullCount":0,"c5_maxValue":95,"c5_minValue":10,"c5_nullCount":0,"c6_maxValue":"2020-10-10","c6_minValue":"2020-01-10","c6_nullCount":0,"c7_maxValue":"yA==","c7_minValue":"LA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":10}
|
||||
{"c1_maxValue":959,"c1_minValue":74,"c1_nullCount":0,"c2_maxValue":" 959sdc","c2_minValue":" 181sdc","c2_nullCount":0,"c3_maxValue":980.213,"c3_minValue":38.740,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.550-08:00","c4_minValue":"2021-11-19T20:40:55.507-08:00","c4_nullCount":0,"c5_maxValue":97,"c5_minValue":9,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-23","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"Kw==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":13}
|
||||
@@ -1,8 +1,8 @@
|
||||
{"c1_maxValue":568,"c1_minValue":8,"c1_num_nulls":0,"c2_maxValue":" 8sdc","c2_minValue":" 111sdc","c2_num_nulls":0,"c3_maxValue":979.272,"c3_minValue":82.111,"c3_num_nulls":0,"c4_maxValue":"2021-11-18T23:34:44.193-08:00","c4_minValue":"2021-11-18T23:34:44.159-08:00","c4_num_nulls":0,"c5_maxValue":58,"c5_minValue":2,"c5_num_nulls":0,"c6_maxValue":"2020-11-08","c6_minValue":"2020-01-01","c6_num_nulls":0,"c7_maxValue":"9g==","c7_minValue":"Ag==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0}
|
||||
{"c1_maxValue":715,"c1_minValue":76,"c1_num_nulls":0,"c2_maxValue":" 76sdc","c2_minValue":" 224sdc","c2_num_nulls":0,"c3_maxValue":958.579,"c3_minValue":246.427,"c3_num_nulls":0,"c4_maxValue":"2021-11-18T23:34:44.199-08:00","c4_minValue":"2021-11-18T23:34:44.166-08:00","c4_num_nulls":0,"c5_maxValue":73,"c5_minValue":9,"c5_num_nulls":0,"c6_maxValue":"2020-11-21","c6_minValue":"2020-01-16","c6_num_nulls":0,"c7_maxValue":"+g==","c7_minValue":"LA==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0}
|
||||
{"c1_maxValue":768,"c1_minValue":59,"c1_num_nulls":0,"c2_maxValue":" 768sdc","c2_minValue":" 118sdc","c2_num_nulls":0,"c3_maxValue":959.131,"c3_minValue":64.768,"c3_num_nulls":0,"c4_maxValue":"2021-11-18T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.164-08:00","c4_num_nulls":0,"c5_maxValue":78,"c5_minValue":7,"c5_num_nulls":0,"c6_maxValue":"2020-11-20","c6_minValue":"2020-05-04","c6_num_nulls":0,"c7_maxValue":"zw==","c7_minValue":"AA==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0}
|
||||
{"c1_maxValue":769,"c1_minValue":309,"c1_num_nulls":0,"c2_maxValue":" 769sdc","c2_minValue":" 309sdc","c2_num_nulls":0,"c3_maxValue":919.769,"c3_minValue":76.430,"c3_num_nulls":0,"c4_maxValue":"2021-11-19T20:40:55.543-08:00","c4_minValue":"2021-11-19T20:40:55.521-08:00","c4_num_nulls":0,"c5_maxValue":78,"c5_minValue":32,"c5_num_nulls":0,"c6_maxValue":"2020-11-14","c6_minValue":"2020-01-08","c6_num_nulls":0,"c7_maxValue":"uQ==","c7_minValue":"AQ==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0}
|
||||
{"c1_maxValue":770,"c1_minValue":129,"c1_num_nulls":0,"c2_maxValue":" 770sdc","c2_minValue":" 129sdc","c2_num_nulls":0,"c3_maxValue":977.328,"c3_minValue":153.431,"c3_num_nulls":0,"c4_maxValue":"2021-11-18T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.169-08:00","c4_num_nulls":0,"c5_maxValue":78,"c5_minValue":14,"c5_num_nulls":0,"c6_maxValue":"2020-10-21","c6_minValue":"2020-01-15","c6_num_nulls":0,"c7_maxValue":"rw==","c7_minValue":"Ag==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0}
|
||||
{"c1_maxValue":932,"c1_minValue":0,"c1_num_nulls":0,"c2_maxValue":" 932sdc","c2_minValue":" 0sdc","c2_num_nulls":0,"c3_maxValue":994.355,"c3_minValue":19.000,"c3_num_nulls":0,"c4_maxValue":"2021-11-19T20:40:55.549-08:00","c4_minValue":"2021-11-19T20:40:55.339-08:00","c4_num_nulls":0,"c5_maxValue":94,"c5_minValue":1,"c5_num_nulls":0,"c6_maxValue":"2020-09-09","c6_minValue":"2020-01-01","c6_num_nulls":0,"c7_maxValue":"xw==","c7_minValue":"AA==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0}
|
||||
{"c1_maxValue":943,"c1_minValue":89,"c1_num_nulls":0,"c2_maxValue":" 943sdc","c2_minValue":" 200sdc","c2_num_nulls":0,"c3_maxValue":854.690,"c3_minValue":100.556,"c3_num_nulls":0,"c4_maxValue":"2021-11-19T20:40:55.549-08:00","c4_minValue":"2021-11-19T20:40:55.508-08:00","c4_num_nulls":0,"c5_maxValue":95,"c5_minValue":10,"c5_num_nulls":0,"c6_maxValue":"2020-10-10","c6_minValue":"2020-01-10","c6_num_nulls":0,"c7_maxValue":"yA==","c7_minValue":"LA==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0}
|
||||
{"c1_maxValue":959,"c1_minValue":74,"c1_num_nulls":0,"c2_maxValue":" 959sdc","c2_minValue":" 181sdc","c2_num_nulls":0,"c3_maxValue":980.213,"c3_minValue":38.740,"c3_num_nulls":0,"c4_maxValue":"2021-11-19T20:40:55.550-08:00","c4_minValue":"2021-11-19T20:40:55.507-08:00","c4_num_nulls":0,"c5_maxValue":97,"c5_minValue":9,"c5_num_nulls":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-23","c6_num_nulls":0,"c7_maxValue":"1Q==","c7_minValue":"Kw==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0}
|
||||
{"c1_maxValue":568,"c1_minValue":8,"c1_nullCount":0,"c2_maxValue":" 8sdc","c2_minValue":" 111sdc","c2_nullCount":0,"c3_maxValue":979.272,"c3_minValue":82.111,"c3_nullCount":0,"c4_maxValue":"2021-11-18T23:34:44.193-08:00","c4_minValue":"2021-11-18T23:34:44.159-08:00","c4_nullCount":0,"c5_maxValue":58,"c5_minValue":2,"c5_nullCount":0,"c6_maxValue":"2020-11-08","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"9g==","c7_minValue":"Ag==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":15}
|
||||
{"c1_maxValue":715,"c1_minValue":76,"c1_nullCount":0,"c2_maxValue":" 76sdc","c2_minValue":" 224sdc","c2_nullCount":0,"c3_maxValue":958.579,"c3_minValue":246.427,"c3_nullCount":0,"c4_maxValue":"2021-11-18T23:34:44.199-08:00","c4_minValue":"2021-11-18T23:34:44.166-08:00","c4_nullCount":0,"c5_maxValue":73,"c5_minValue":9,"c5_nullCount":0,"c6_maxValue":"2020-11-21","c6_minValue":"2020-01-16","c6_nullCount":0,"c7_maxValue":"+g==","c7_minValue":"LA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":12}
|
||||
{"c1_maxValue":768,"c1_minValue":59,"c1_nullCount":0,"c2_maxValue":" 768sdc","c2_minValue":" 118sdc","c2_nullCount":0,"c3_maxValue":959.131,"c3_minValue":64.768,"c3_nullCount":0,"c4_maxValue":"2021-11-18T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.164-08:00","c4_nullCount":0,"c5_maxValue":78,"c5_minValue":7,"c5_nullCount":0,"c6_maxValue":"2020-11-20","c6_minValue":"2020-05-04","c6_nullCount":0,"c7_maxValue":"zw==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":7}
|
||||
{"c1_maxValue":769,"c1_minValue":309,"c1_nullCount":0,"c2_maxValue":" 769sdc","c2_minValue":" 309sdc","c2_nullCount":0,"c3_maxValue":919.769,"c3_minValue":76.430,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.543-08:00","c4_minValue":"2021-11-19T20:40:55.521-08:00","c4_nullCount":0,"c5_maxValue":78,"c5_minValue":32,"c5_nullCount":0,"c6_maxValue":"2020-11-14","c6_minValue":"2020-01-08","c6_nullCount":0,"c7_maxValue":"uQ==","c7_minValue":"AQ==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":9}
|
||||
{"c1_maxValue":770,"c1_minValue":129,"c1_nullCount":0,"c2_maxValue":" 770sdc","c2_minValue":" 129sdc","c2_nullCount":0,"c3_maxValue":977.328,"c3_minValue":153.431,"c3_nullCount":0,"c4_maxValue":"2021-11-18T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.169-08:00","c4_nullCount":0,"c5_maxValue":78,"c5_minValue":14,"c5_nullCount":0,"c6_maxValue":"2020-10-21","c6_minValue":"2020-01-15","c6_nullCount":0,"c7_maxValue":"rw==","c7_minValue":"Ag==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":6}
|
||||
{"c1_maxValue":932,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 932sdc","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":994.355,"c3_minValue":19.000,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.549-08:00","c4_minValue":"2021-11-19T20:40:55.339-08:00","c4_nullCount":0,"c5_maxValue":94,"c5_minValue":1,"c5_nullCount":0,"c6_maxValue":"2020-09-09","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"xw==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":8}
|
||||
{"c1_maxValue":943,"c1_minValue":89,"c1_nullCount":0,"c2_maxValue":" 943sdc","c2_minValue":" 200sdc","c2_nullCount":0,"c3_maxValue":854.690,"c3_minValue":100.556,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.549-08:00","c4_minValue":"2021-11-19T20:40:55.508-08:00","c4_nullCount":0,"c5_maxValue":95,"c5_minValue":10,"c5_nullCount":0,"c6_maxValue":"2020-10-10","c6_minValue":"2020-01-10","c6_nullCount":0,"c7_maxValue":"yA==","c7_minValue":"LA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":10}
|
||||
{"c1_maxValue":959,"c1_minValue":74,"c1_nullCount":0,"c2_maxValue":" 959sdc","c2_minValue":" 181sdc","c2_nullCount":0,"c3_maxValue":980.213,"c3_minValue":38.740,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.550-08:00","c4_minValue":"2021-11-19T20:40:55.507-08:00","c4_nullCount":0,"c5_maxValue":97,"c5_minValue":9,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-23","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"Kw==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":13}
|
||||
@@ -36,21 +36,22 @@ import scala.collection.JavaConverters._
|
||||
|
||||
// NOTE: Only A, B and C columns are indexed (D is not)
|
||||
case class IndexRow(fileName: String,
|
||||
valueCount: Long = 1,
|
||||
|
||||
// Corresponding A column is LongType
|
||||
A_minValue: Long = -1,
|
||||
A_maxValue: Long = -1,
|
||||
A_num_nulls: Long = -1,
|
||||
A_nullCount: Long = -1,
|
||||
|
||||
// Corresponding B column is StringType
|
||||
B_minValue: String = null,
|
||||
B_maxValue: String = null,
|
||||
B_num_nulls: Long = -1,
|
||||
B_nullCount: Long = -1,
|
||||
|
||||
// Corresponding C column is TimestampType
|
||||
C_minValue: Timestamp = null,
|
||||
C_maxValue: Timestamp = null,
|
||||
C_num_nulls: Long = -1) {
|
||||
C_nullCount: Long = -1) {
|
||||
def toRow: Row = Row(productIterator.toSeq: _*)
|
||||
}
|
||||
|
||||
@@ -132,28 +133,28 @@ object TestDataSkippingUtils {
|
||||
arguments(
|
||||
col("B").startsWith("abc").expr,
|
||||
Seq(
|
||||
IndexRow("file_1", 0, 0, 0, "aba", "adf", 1), // may contain strings starting w/ "abc"
|
||||
IndexRow("file_2", 0, 0, 0, "adf", "azy", 0),
|
||||
IndexRow("file_3", 0, 0, 0, "aaa", "aba", 0)
|
||||
IndexRow("file_1", valueCount = 1, B_minValue = "aba", B_maxValue = "adf", B_nullCount = 1), // may contain strings starting w/ "abc"
|
||||
IndexRow("file_2", valueCount = 1, B_minValue = "adf", B_maxValue = "azy", B_nullCount = 0),
|
||||
IndexRow("file_3", valueCount = 1, B_minValue = "aaa", B_maxValue = "aba", B_nullCount = 0)
|
||||
),
|
||||
Seq("file_1")),
|
||||
arguments(
|
||||
Not(col("B").startsWith("abc").expr),
|
||||
Seq(
|
||||
IndexRow("file_1", 0, 0, 0, "aba", "adf", 1), // may contain strings starting w/ "abc"
|
||||
IndexRow("file_2", 0, 0, 0, "adf", "azy", 0),
|
||||
IndexRow("file_3", 0, 0, 0, "aaa", "aba", 0),
|
||||
IndexRow("file_4", 0, 0, 0, "abc123", "abc345", 0) // all strings start w/ "abc"
|
||||
IndexRow("file_1", valueCount = 1, B_minValue = "aba", B_maxValue = "adf", B_nullCount = 1), // may contain strings starting w/ "abc"
|
||||
IndexRow("file_2", valueCount = 1, B_minValue = "adf", B_maxValue = "azy", B_nullCount = 0),
|
||||
IndexRow("file_3", valueCount = 1, B_minValue = "aaa", B_maxValue = "aba", B_nullCount = 0),
|
||||
IndexRow("file_4", valueCount = 1, B_minValue = "abc123", B_maxValue = "abc345", B_nullCount = 0) // all strings start w/ "abc"
|
||||
),
|
||||
Seq("file_1", "file_2", "file_3")),
|
||||
arguments(
|
||||
// Composite expression
|
||||
Not(lower(col("B")).startsWith("abc").expr),
|
||||
Seq(
|
||||
IndexRow("file_1", 0, 0, 0, "ABA", "ADF", 1), // may contain strings starting w/ "ABC" (after upper)
|
||||
IndexRow("file_2", 0, 0, 0, "ADF", "AZY", 0),
|
||||
IndexRow("file_3", 0, 0, 0, "AAA", "ABA", 0),
|
||||
IndexRow("file_4", 0, 0, 0, "ABC123", "ABC345", 0) // all strings start w/ "ABC" (after upper)
|
||||
IndexRow("file_1", valueCount = 1, B_minValue = "ABA", B_maxValue = "ADF", B_nullCount = 1), // may contain strings starting w/ "ABC" (after upper)
|
||||
IndexRow("file_2", valueCount = 1, B_minValue = "ADF", B_maxValue = "AZY", B_nullCount = 0),
|
||||
IndexRow("file_3", valueCount = 1, B_minValue = "AAA", B_maxValue = "ABA", B_nullCount = 0),
|
||||
IndexRow("file_4", valueCount = 1, B_minValue = "ABC123", B_maxValue = "ABC345", B_nullCount = 0) // all strings start w/ "ABC" (after upper)
|
||||
),
|
||||
Seq("file_1", "file_2", "file_3"))
|
||||
)
|
||||
@@ -166,144 +167,151 @@ object TestDataSkippingUtils {
|
||||
arguments(
|
||||
"A = 0",
|
||||
Seq(
|
||||
IndexRow("file_1", 1, 2, 0),
|
||||
IndexRow("file_2", -1, 1, 0)
|
||||
IndexRow("file_1", valueCount = 1, 1, 2, 0),
|
||||
IndexRow("file_2", valueCount = 1, -1, 1, 0)
|
||||
),
|
||||
Seq("file_2")),
|
||||
arguments(
|
||||
"0 = A",
|
||||
Seq(
|
||||
IndexRow("file_1", 1, 2, 0),
|
||||
IndexRow("file_2", -1, 1, 0)
|
||||
IndexRow("file_1", valueCount = 1, 1, 2, 0),
|
||||
IndexRow("file_2", valueCount = 1, -1, 1, 0)
|
||||
),
|
||||
Seq("file_2")),
|
||||
arguments(
|
||||
"A != 0",
|
||||
Seq(
|
||||
IndexRow("file_1", 1, 2, 0),
|
||||
IndexRow("file_2", -1, 1, 0),
|
||||
IndexRow("file_3", 0, 0, 0) // Contains only 0s
|
||||
IndexRow("file_1", valueCount = 1, 1, 2, 0),
|
||||
IndexRow("file_2", valueCount = 1, -1, 1, 0),
|
||||
IndexRow("file_3", valueCount = 1, 0, 0, 0) // Contains only 0s
|
||||
),
|
||||
Seq("file_1", "file_2")),
|
||||
arguments(
|
||||
"0 != A",
|
||||
Seq(
|
||||
IndexRow("file_1", 1, 2, 0),
|
||||
IndexRow("file_2", -1, 1, 0),
|
||||
IndexRow("file_3", 0, 0, 0) // Contains only 0s
|
||||
IndexRow("file_1", valueCount = 1, 1, 2, 0),
|
||||
IndexRow("file_2", valueCount = 1, -1, 1, 0),
|
||||
IndexRow("file_3", valueCount = 1, 0, 0, 0) // Contains only 0s
|
||||
),
|
||||
Seq("file_1", "file_2")),
|
||||
arguments(
|
||||
"A < 0",
|
||||
Seq(
|
||||
IndexRow("file_1", 1, 2, 0),
|
||||
IndexRow("file_2", -1, 1, 0),
|
||||
IndexRow("file_3", -2, -1, 0)
|
||||
IndexRow("file_1", valueCount = 1, 1, 2, 0),
|
||||
IndexRow("file_2", valueCount = 1, -1, 1, 0),
|
||||
IndexRow("file_3", valueCount = 1, -2, -1, 0)
|
||||
),
|
||||
Seq("file_2", "file_3")),
|
||||
arguments(
|
||||
"0 > A",
|
||||
Seq(
|
||||
IndexRow("file_1", 1, 2, 0),
|
||||
IndexRow("file_2", -1, 1, 0),
|
||||
IndexRow("file_3", -2, -1, 0)
|
||||
IndexRow("file_1", valueCount = 1, 1, 2, 0),
|
||||
IndexRow("file_2", valueCount = 1, -1, 1, 0),
|
||||
IndexRow("file_3", valueCount = 1, -2, -1, 0)
|
||||
),
|
||||
Seq("file_2", "file_3")),
|
||||
arguments(
|
||||
"A > 0",
|
||||
Seq(
|
||||
IndexRow("file_1", 1, 2, 0),
|
||||
IndexRow("file_2", -1, 1, 0),
|
||||
IndexRow("file_3", -2, -1, 0)
|
||||
IndexRow("file_1", valueCount = 1, 1, 2, 0),
|
||||
IndexRow("file_2", valueCount = 1, -1, 1, 0),
|
||||
IndexRow("file_3", valueCount = 1, -2, -1, 0)
|
||||
),
|
||||
Seq("file_1", "file_2")),
|
||||
arguments(
|
||||
"0 < A",
|
||||
Seq(
|
||||
IndexRow("file_1", 1, 2, 0),
|
||||
IndexRow("file_2", -1, 1, 0),
|
||||
IndexRow("file_3", -2, -1, 0)
|
||||
IndexRow("file_1", valueCount = 1, 1, 2, 0),
|
||||
IndexRow("file_2", valueCount = 1, -1, 1, 0),
|
||||
IndexRow("file_3", valueCount = 1, -2, -1, 0)
|
||||
),
|
||||
Seq("file_1", "file_2")),
|
||||
arguments(
|
||||
"A <= -1",
|
||||
Seq(
|
||||
IndexRow("file_1", 1, 2, 0),
|
||||
IndexRow("file_2", -1, 1, 0),
|
||||
IndexRow("file_3", -2, -1, 0)
|
||||
IndexRow("file_1", valueCount = 1, 1, 2, 0),
|
||||
IndexRow("file_2", valueCount = 1, -1, 1, 0),
|
||||
IndexRow("file_3", valueCount = 1, -2, -1, 0)
|
||||
),
|
||||
Seq("file_2", "file_3")),
|
||||
arguments(
|
||||
"-1 >= A",
|
||||
Seq(
|
||||
IndexRow("file_1", 1, 2, 0),
|
||||
IndexRow("file_2", -1, 1, 0),
|
||||
IndexRow("file_3", -2, -1, 0)
|
||||
IndexRow("file_1", valueCount = 1, 1, 2, 0),
|
||||
IndexRow("file_2", valueCount = 1, -1, 1, 0),
|
||||
IndexRow("file_3", valueCount = 1, -2, -1, 0)
|
||||
),
|
||||
Seq("file_2", "file_3")),
|
||||
arguments(
|
||||
"A >= 1",
|
||||
Seq(
|
||||
IndexRow("file_1", 1, 2, 0),
|
||||
IndexRow("file_2", -1, 1, 0),
|
||||
IndexRow("file_3", -2, -1, 0)
|
||||
IndexRow("file_1", valueCount = 1, 1, 2, 0),
|
||||
IndexRow("file_2", valueCount = 1, -1, 1, 0),
|
||||
IndexRow("file_3", valueCount = 1, -2, -1, 0)
|
||||
),
|
||||
Seq("file_1", "file_2")),
|
||||
arguments(
|
||||
"1 <= A",
|
||||
Seq(
|
||||
IndexRow("file_1", 1, 2, 0),
|
||||
IndexRow("file_2", -1, 1, 0),
|
||||
IndexRow("file_3", -2, -1, 0)
|
||||
IndexRow("file_1", valueCount = 1, 1, 2, 0),
|
||||
IndexRow("file_2", valueCount = 1, -1, 1, 0),
|
||||
IndexRow("file_3", valueCount = 1, -2, -1, 0)
|
||||
),
|
||||
Seq("file_1", "file_2")),
|
||||
arguments(
|
||||
"A is null",
|
||||
Seq(
|
||||
IndexRow("file_1", 1, 2, 0),
|
||||
IndexRow("file_2", -1, 1, 1)
|
||||
IndexRow("file_1", valueCount = 1, 1, 2, 0),
|
||||
IndexRow("file_2", valueCount = 1, -1, 1, 1)
|
||||
),
|
||||
Seq("file_2")),
|
||||
arguments(
|
||||
"A is not null",
|
||||
Seq(
|
||||
IndexRow("file_1", 1, 2, 0),
|
||||
IndexRow("file_2", -1, 1, 1)
|
||||
IndexRow("file_1", valueCount = 1, 1, 2, 0),
|
||||
IndexRow("file_2", valueCount = 2, -1, 1, 1) // might still contain non-null values (if nullCount < valueCount)
|
||||
),
|
||||
Seq("file_1", "file_2")),
|
||||
arguments(
|
||||
"A is not null",
|
||||
Seq(
|
||||
IndexRow("file_1", valueCount = 1, 1, 2, 0),
|
||||
IndexRow("file_2", valueCount = 1, -1, 1, 1) // does NOT contain any non-null values (nullCount == valueCount)
|
||||
),
|
||||
Seq("file_1")),
|
||||
arguments(
|
||||
"A in (0, 1)",
|
||||
Seq(
|
||||
IndexRow("file_1", 1, 2, 0),
|
||||
IndexRow("file_2", -1, 1, 0),
|
||||
IndexRow("file_3", -2, -1, 0)
|
||||
IndexRow("file_1", valueCount = 1, 1, 2, 0),
|
||||
IndexRow("file_2", valueCount = 1, -1, 1, 0),
|
||||
IndexRow("file_3", valueCount = 1, -2, -1, 0)
|
||||
),
|
||||
Seq("file_1", "file_2")),
|
||||
arguments(
|
||||
"A not in (0, 1)",
|
||||
Seq(
|
||||
IndexRow("file_1", 1, 2, 0),
|
||||
IndexRow("file_2", -1, 1, 0),
|
||||
IndexRow("file_3", -2, -1, 0),
|
||||
IndexRow("file_4", 0, 0, 0), // only contains 0
|
||||
IndexRow("file_5", 1, 1, 0) // only contains 1
|
||||
IndexRow("file_1", valueCount = 1, 1, 2, 0),
|
||||
IndexRow("file_2", valueCount = 1, -1, 1, 0),
|
||||
IndexRow("file_3", valueCount = 1, -2, -1, 0),
|
||||
IndexRow("file_4", valueCount = 1, 0, 0, 0), // only contains 0
|
||||
IndexRow("file_5", valueCount = 1, 1, 1, 0) // only contains 1
|
||||
),
|
||||
Seq("file_1", "file_2", "file_3")),
|
||||
arguments(
|
||||
// Value expression containing expression, which isn't a literal
|
||||
"A = int('0')",
|
||||
Seq(
|
||||
IndexRow("file_1", 1, 2, 0),
|
||||
IndexRow("file_2", -1, 1, 0)
|
||||
IndexRow("file_1", valueCount = 1, 1, 2, 0),
|
||||
IndexRow("file_2", valueCount = 1, -1, 1, 0)
|
||||
),
|
||||
Seq("file_2")),
|
||||
arguments(
|
||||
// Value expression containing reference to the other attribute (column), fallback
|
||||
"A = D",
|
||||
Seq(
|
||||
IndexRow("file_1", 1, 2, 0),
|
||||
IndexRow("file_2", -1, 1, 0),
|
||||
IndexRow("file_3", -2, -1, 0)
|
||||
IndexRow("file_1", valueCount = 1, 1, 2, 0),
|
||||
IndexRow("file_2", valueCount = 1, -1, 1, 0),
|
||||
IndexRow("file_3", valueCount = 1, -2, -1, 0)
|
||||
),
|
||||
Seq("file_1", "file_2", "file_3"))
|
||||
)
|
||||
@@ -315,22 +323,22 @@ object TestDataSkippingUtils {
|
||||
// Filter out all rows that contain either A = 0 OR A = 1
|
||||
"A != 0 AND A != 1",
|
||||
Seq(
|
||||
IndexRow("file_1", 1, 2, 0),
|
||||
IndexRow("file_2", -1, 1, 0),
|
||||
IndexRow("file_3", -2, -1, 0),
|
||||
IndexRow("file_4", 0, 0, 0), // only contains 0
|
||||
IndexRow("file_5", 1, 1, 0) // only contains 1
|
||||
IndexRow("file_1", valueCount = 1, 1, 2, 0),
|
||||
IndexRow("file_2", valueCount = 1, -1, 1, 0),
|
||||
IndexRow("file_3", valueCount = 1, -2, -1, 0),
|
||||
IndexRow("file_4", valueCount = 1, 0, 0, 0), // only contains 0
|
||||
IndexRow("file_5", valueCount = 1, 1, 1, 0) // only contains 1
|
||||
),
|
||||
Seq("file_1", "file_2", "file_3")),
|
||||
arguments(
|
||||
// This is an equivalent to the above expression
|
||||
"NOT(A = 0 OR A = 1)",
|
||||
Seq(
|
||||
IndexRow("file_1", 1, 2, 0),
|
||||
IndexRow("file_2", -1, 1, 0),
|
||||
IndexRow("file_3", -2, -1, 0),
|
||||
IndexRow("file_4", 0, 0, 0), // only contains 0
|
||||
IndexRow("file_5", 1, 1, 0) // only contains 1
|
||||
IndexRow("file_1", valueCount = 1, 1, 2, 0),
|
||||
IndexRow("file_2", valueCount = 1, -1, 1, 0),
|
||||
IndexRow("file_3", valueCount = 1, -2, -1, 0),
|
||||
IndexRow("file_4", valueCount = 1, 0, 0, 0), // only contains 0
|
||||
IndexRow("file_5", valueCount = 1, 1, 1, 0) // only contains 1
|
||||
),
|
||||
Seq("file_1", "file_2", "file_3")),
|
||||
|
||||
@@ -338,22 +346,22 @@ object TestDataSkippingUtils {
|
||||
// Filter out all rows that contain A = 0 AND B = 'abc'
|
||||
"A != 0 OR B != 'abc'",
|
||||
Seq(
|
||||
IndexRow("file_1", 1, 2, 0),
|
||||
IndexRow("file_2", -1, 1, 0),
|
||||
IndexRow("file_3", -2, -1, 0),
|
||||
IndexRow("file_4", 0, 0, 0, "abc", "abc", 0), // only contains A = 0, B = 'abc'
|
||||
IndexRow("file_5", 0, 0, 0, "abc", "abc", 0) // only contains A = 0, B = 'abc'
|
||||
IndexRow("file_1", valueCount = 1, A_minValue = 1, A_maxValue = 2, A_nullCount = 0),
|
||||
IndexRow("file_2", valueCount = 1, A_minValue = -1, A_maxValue = 1, A_nullCount = 0),
|
||||
IndexRow("file_3", valueCount = 1, A_minValue = -2, A_maxValue = -1, A_nullCount = 0),
|
||||
IndexRow("file_4", valueCount = 1, A_minValue = 0, A_maxValue = 0, A_nullCount = 0, B_minValue = "abc", B_maxValue = "abc", B_nullCount = 0), // only contains A = 0, B = 'abc'
|
||||
IndexRow("file_5", valueCount = 1, A_minValue = 0, A_maxValue = 0, A_nullCount = 0, B_minValue = "abc", B_maxValue = "abc", B_nullCount = 0) // only contains A = 0, B = 'abc'
|
||||
),
|
||||
Seq("file_1", "file_2", "file_3")),
|
||||
arguments(
|
||||
// This is an equivalent to the above expression
|
||||
"NOT(A = 0 AND B = 'abc')",
|
||||
Seq(
|
||||
IndexRow("file_1", 1, 2, 0),
|
||||
IndexRow("file_2", -1, 1, 0),
|
||||
IndexRow("file_3", -2, -1, 0),
|
||||
IndexRow("file_4", 0, 0, 0, "abc", "abc", 0), // only contains A = 0, B = 'abc'
|
||||
IndexRow("file_5", 0, 0, 0, "abc", "abc", 0) // only contains A = 0, B = 'abc'
|
||||
IndexRow("file_1", valueCount = 1, A_minValue = 1, A_maxValue = 2, A_nullCount = 0),
|
||||
IndexRow("file_2", valueCount = 1, A_minValue = -1, A_maxValue = 1, A_nullCount = 0),
|
||||
IndexRow("file_3", valueCount = 1, A_minValue = -2, A_maxValue = -1, A_nullCount = 0),
|
||||
IndexRow("file_4", valueCount = 1, A_minValue = 0, A_maxValue = 0, A_nullCount = 0, B_minValue = "abc", B_maxValue = "abc", B_nullCount = 0), // only contains A = 0, B = 'abc'
|
||||
IndexRow("file_5", valueCount = 1, A_minValue = 0, A_maxValue = 0, A_nullCount = 0, B_minValue = "abc", B_maxValue = "abc", B_nullCount = 0) // only contains A = 0, B = 'abc'
|
||||
),
|
||||
Seq("file_1", "file_2", "file_3")),
|
||||
|
||||
@@ -361,10 +369,10 @@ object TestDataSkippingUtils {
|
||||
// Query contains an expression involving non-indexed column D
|
||||
"A = 0 AND B = 'abc' AND D IS NULL",
|
||||
Seq(
|
||||
IndexRow("file_1", 1, 2, 0),
|
||||
IndexRow("file_2", -1, 1, 0),
|
||||
IndexRow("file_3", -2, -1, 0),
|
||||
IndexRow("file_4", 0, 0, 0, "aaa", "xyz", 0) // might contain A = 0 AND B = 'abc'
|
||||
IndexRow("file_1", valueCount = 1, A_minValue = 1, A_maxValue = 2, A_nullCount = 0),
|
||||
IndexRow("file_2", valueCount = 1, A_minValue = -1, A_maxValue = 1, A_nullCount = 0),
|
||||
IndexRow("file_3", valueCount = 1, A_minValue = -2, A_maxValue = -1, A_nullCount = 0),
|
||||
IndexRow("file_4", valueCount = 1, A_minValue = 0, A_maxValue = 0, A_nullCount = 0, B_minValue = "aaa", B_maxValue = "xyz", B_nullCount = 0) // might contain A = 0 AND B = 'abc'
|
||||
),
|
||||
Seq("file_4")),
|
||||
|
||||
@@ -372,10 +380,10 @@ object TestDataSkippingUtils {
|
||||
// Query contains an expression involving non-indexed column D
|
||||
"A = 0 OR B = 'abc' OR D IS NULL",
|
||||
Seq(
|
||||
IndexRow("file_1", 1, 2, 0),
|
||||
IndexRow("file_2", -1, 1, 0),
|
||||
IndexRow("file_3", -2, -1, 0),
|
||||
IndexRow("file_4", 0, 0, 0, "aaa", "xyz", 0) // might contain B = 'abc'
|
||||
IndexRow("file_1", valueCount = 1, A_minValue = 1, A_maxValue = 2, A_nullCount = 0),
|
||||
IndexRow("file_2", valueCount = 1, A_minValue = -1, A_maxValue = 1, A_nullCount = 0),
|
||||
IndexRow("file_3", valueCount = 1, A_minValue = -2, A_maxValue = -1, A_nullCount = 0),
|
||||
IndexRow("file_4", valueCount = 1, B_minValue = "aaa", B_maxValue = "xyz", B_nullCount = 0) // might contain B = 'abc'
|
||||
),
|
||||
Seq("file_1", "file_2", "file_3", "file_4"))
|
||||
)
|
||||
@@ -387,197 +395,197 @@ object TestDataSkippingUtils {
|
||||
arguments(
|
||||
"date_format(C, 'MM/dd/yyyy') = '03/07/2022'",
|
||||
Seq(
|
||||
IndexRow("file_1",
|
||||
IndexRow("file_1", valueCount = 1,
|
||||
C_minValue = new Timestamp(1646711448000L), // 03/08/2022
|
||||
C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
|
||||
C_num_nulls = 0),
|
||||
IndexRow("file_2",
|
||||
C_nullCount = 0),
|
||||
IndexRow("file_2", valueCount = 1,
|
||||
C_minValue = new Timestamp(1646625048000L), // 03/07/2022
|
||||
C_maxValue = new Timestamp(1646711448000L), // 03/08/2022
|
||||
C_num_nulls = 0)
|
||||
C_nullCount = 0)
|
||||
),
|
||||
Seq("file_2")),
|
||||
arguments(
|
||||
"'03/07/2022' = date_format(C, 'MM/dd/yyyy')",
|
||||
Seq(
|
||||
IndexRow("file_1",
|
||||
IndexRow("file_1", valueCount = 1,
|
||||
C_minValue = new Timestamp(1646711448000L), // 03/08/2022
|
||||
C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
|
||||
C_num_nulls = 0),
|
||||
IndexRow("file_2",
|
||||
C_nullCount = 0),
|
||||
IndexRow("file_2", valueCount = 1,
|
||||
C_minValue = new Timestamp(1646625048000L), // 03/07/2022
|
||||
C_maxValue = new Timestamp(1646711448000L), // 03/08/2022
|
||||
C_num_nulls = 0)
|
||||
C_nullCount = 0)
|
||||
),
|
||||
Seq("file_2")),
|
||||
arguments(
|
||||
"'03/07/2022' != date_format(C, 'MM/dd/yyyy')",
|
||||
Seq(
|
||||
IndexRow("file_1",
|
||||
IndexRow("file_1", valueCount = 1,
|
||||
C_minValue = new Timestamp(1646711448000L), // 03/08/2022
|
||||
C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
|
||||
C_num_nulls = 0),
|
||||
IndexRow("file_2",
|
||||
C_nullCount = 0),
|
||||
IndexRow("file_2", valueCount = 1,
|
||||
C_minValue = new Timestamp(1646625048000L), // 03/07/2022
|
||||
C_maxValue = new Timestamp(1646625048000L), // 03/07/2022
|
||||
C_num_nulls = 0)
|
||||
C_nullCount = 0)
|
||||
),
|
||||
Seq("file_1")),
|
||||
arguments(
|
||||
"date_format(C, 'MM/dd/yyyy') != '03/07/2022'",
|
||||
Seq(
|
||||
IndexRow("file_1",
|
||||
IndexRow("file_1", valueCount = 1,
|
||||
C_minValue = new Timestamp(1646711448000L), // 03/08/2022
|
||||
C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
|
||||
C_num_nulls = 0),
|
||||
IndexRow("file_2",
|
||||
C_nullCount = 0),
|
||||
IndexRow("file_2", valueCount = 1,
|
||||
C_minValue = new Timestamp(1646625048000L), // 03/07/2022
|
||||
C_maxValue = new Timestamp(1646625048000L), // 03/07/2022
|
||||
C_num_nulls = 0)
|
||||
C_nullCount = 0)
|
||||
),
|
||||
Seq("file_1")),
|
||||
arguments(
|
||||
"date_format(C, 'MM/dd/yyyy') < '03/08/2022'",
|
||||
Seq(
|
||||
IndexRow("file_1",
|
||||
IndexRow("file_1", valueCount = 1,
|
||||
C_minValue = new Timestamp(1646711448000L), // 03/08/2022
|
||||
C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
|
||||
C_num_nulls = 0),
|
||||
IndexRow("file_2",
|
||||
C_nullCount = 0),
|
||||
IndexRow("file_2", valueCount = 1,
|
||||
C_minValue = new Timestamp(1646625048000L), // 03/07/2022
|
||||
C_maxValue = new Timestamp(1646711448000L), // 03/08/2022
|
||||
C_num_nulls = 0)
|
||||
C_nullCount = 0)
|
||||
),
|
||||
Seq("file_2")),
|
||||
arguments(
|
||||
"'03/08/2022' > date_format(C, 'MM/dd/yyyy')",
|
||||
Seq(
|
||||
IndexRow("file_1",
|
||||
IndexRow("file_1", valueCount = 1,
|
||||
C_minValue = new Timestamp(1646711448000L), // 03/08/2022
|
||||
C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
|
||||
C_num_nulls = 0),
|
||||
IndexRow("file_2",
|
||||
C_nullCount = 0),
|
||||
IndexRow("file_2", valueCount = 1,
|
||||
C_minValue = new Timestamp(1646625048000L), // 03/07/2022
|
||||
C_maxValue = new Timestamp(1646711448000L), // 03/08/2022
|
||||
C_num_nulls = 0)
|
||||
C_nullCount = 0)
|
||||
),
|
||||
Seq("file_2")),
|
||||
arguments(
|
||||
"'03/08/2022' < date_format(C, 'MM/dd/yyyy')",
|
||||
Seq(
|
||||
IndexRow("file_1",
|
||||
IndexRow("file_1", valueCount = 1,
|
||||
C_minValue = new Timestamp(1646711448000L), // 03/08/2022
|
||||
C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
|
||||
C_num_nulls = 0),
|
||||
IndexRow("file_2",
|
||||
C_nullCount = 0),
|
||||
IndexRow("file_2", valueCount = 1,
|
||||
C_minValue = new Timestamp(1646625048000L), // 03/07/2022
|
||||
C_maxValue = new Timestamp(1646711448000L), // 03/08/2022
|
||||
C_num_nulls = 0)
|
||||
C_nullCount = 0)
|
||||
),
|
||||
Seq("file_1")),
|
||||
arguments(
|
||||
"date_format(C, 'MM/dd/yyyy') > '03/08/2022'",
|
||||
Seq(
|
||||
IndexRow("file_1",
|
||||
IndexRow("file_1", valueCount = 1,
|
||||
C_minValue = new Timestamp(1646711448000L), // 03/08/2022
|
||||
C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
|
||||
C_num_nulls = 0),
|
||||
IndexRow("file_2",
|
||||
C_nullCount = 0),
|
||||
IndexRow("file_2", valueCount = 1,
|
||||
C_minValue = new Timestamp(1646625048000L), // 03/07/2022
|
||||
C_maxValue = new Timestamp(1646711448000L), // 03/08/2022
|
||||
C_num_nulls = 0)
|
||||
C_nullCount = 0)
|
||||
),
|
||||
Seq("file_1")),
|
||||
arguments(
|
||||
"date_format(C, 'MM/dd/yyyy') <= '03/07/2022'",
|
||||
Seq(
|
||||
IndexRow("file_1",
|
||||
IndexRow("file_1", valueCount = 1,
|
||||
C_minValue = new Timestamp(1646711448000L), // 03/08/2022
|
||||
C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
|
||||
C_num_nulls = 0),
|
||||
IndexRow("file_2",
|
||||
C_nullCount = 0),
|
||||
IndexRow("file_2", valueCount = 1,
|
||||
C_minValue = new Timestamp(1646625048000L), // 03/07/2022
|
||||
C_maxValue = new Timestamp(1646711448000L), // 03/08/2022
|
||||
C_num_nulls = 0)
|
||||
C_nullCount = 0)
|
||||
),
|
||||
Seq("file_2")),
|
||||
arguments(
|
||||
"'03/07/2022' >= date_format(C, 'MM/dd/yyyy')",
|
||||
Seq(
|
||||
IndexRow("file_1",
|
||||
IndexRow("file_1", valueCount = 1,
|
||||
C_minValue = new Timestamp(1646711448000L), // 03/08/2022
|
||||
C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
|
||||
C_num_nulls = 0),
|
||||
IndexRow("file_2",
|
||||
C_nullCount = 0),
|
||||
IndexRow("file_2", valueCount = 1,
|
||||
C_minValue = new Timestamp(1646625048000L), // 03/07/2022
|
||||
C_maxValue = new Timestamp(1646711448000L), // 03/08/2022
|
||||
C_num_nulls = 0)
|
||||
C_nullCount = 0)
|
||||
),
|
||||
Seq("file_2")),
|
||||
arguments(
|
||||
"'03/09/2022' <= date_format(C, 'MM/dd/yyyy')",
|
||||
Seq(
|
||||
IndexRow("file_1",
|
||||
IndexRow("file_1", valueCount = 1,
|
||||
C_minValue = new Timestamp(1646711448000L), // 03/08/2022
|
||||
C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
|
||||
C_num_nulls = 0),
|
||||
IndexRow("file_2",
|
||||
C_nullCount = 0),
|
||||
IndexRow("file_2", valueCount = 1,
|
||||
C_minValue = new Timestamp(1646625048000L), // 03/07/2022
|
||||
C_maxValue = new Timestamp(1646711448000L), // 03/08/2022
|
||||
C_num_nulls = 0)
|
||||
C_nullCount = 0)
|
||||
),
|
||||
Seq("file_1")),
|
||||
arguments(
|
||||
"date_format(C, 'MM/dd/yyyy') >= '03/09/2022'",
|
||||
Seq(
|
||||
IndexRow("file_1",
|
||||
IndexRow("file_1", valueCount = 1,
|
||||
C_minValue = new Timestamp(1646711448000L), // 03/08/2022
|
||||
C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
|
||||
C_num_nulls = 0),
|
||||
IndexRow("file_2",
|
||||
C_nullCount = 0),
|
||||
IndexRow("file_2", valueCount = 1,
|
||||
C_minValue = new Timestamp(1646625048000L), // 03/07/2022
|
||||
C_maxValue = new Timestamp(1646711448000L), // 03/08/2022
|
||||
C_num_nulls = 0)
|
||||
C_nullCount = 0)
|
||||
),
|
||||
Seq("file_1")),
|
||||
arguments(
|
||||
"date_format(C, 'MM/dd/yyyy') IN ('03/09/2022')",
|
||||
Seq(
|
||||
IndexRow("file_1",
|
||||
IndexRow("file_1", valueCount = 1,
|
||||
C_minValue = new Timestamp(1646711448000L), // 03/08/2022
|
||||
C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
|
||||
C_num_nulls = 0),
|
||||
IndexRow("file_2",
|
||||
C_nullCount = 0),
|
||||
IndexRow("file_2", valueCount = 1,
|
||||
C_minValue = new Timestamp(1646625048000L), // 03/07/2022
|
||||
C_maxValue = new Timestamp(1646711448000L), // 03/08/2022
|
||||
C_num_nulls = 0)
|
||||
C_nullCount = 0)
|
||||
),
|
||||
Seq("file_1")),
|
||||
arguments(
|
||||
"date_format(C, 'MM/dd/yyyy') NOT IN ('03/07/2022')",
|
||||
Seq(
|
||||
IndexRow("file_1",
|
||||
IndexRow("file_1", valueCount = 1,
|
||||
C_minValue = new Timestamp(1646711448000L), // 03/08/2022
|
||||
C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
|
||||
C_num_nulls = 0),
|
||||
IndexRow("file_2",
|
||||
C_nullCount = 0),
|
||||
IndexRow("file_2", valueCount = 1,
|
||||
C_minValue = new Timestamp(1646625048000L), // 03/07/2022
|
||||
C_maxValue = new Timestamp(1646625048000L), // 03/07/2022
|
||||
C_num_nulls = 0)
|
||||
C_nullCount = 0)
|
||||
),
|
||||
Seq("file_1")),
|
||||
arguments(
|
||||
// Should be identical to the one above
|
||||
"date_format(to_timestamp(B, 'yyyy-MM-dd'), 'MM/dd/yyyy') NOT IN ('03/06/2022')",
|
||||
Seq(
|
||||
IndexRow("file_1",
|
||||
IndexRow("file_1", valueCount = 1,
|
||||
B_minValue = "2022-03-07", // 03/07/2022
|
||||
B_maxValue = "2022-03-08", // 03/08/2022
|
||||
B_num_nulls = 0),
|
||||
IndexRow("file_2",
|
||||
B_nullCount = 0),
|
||||
IndexRow("file_2", valueCount = 1,
|
||||
B_minValue = "2022-03-06", // 03/06/2022
|
||||
B_maxValue = "2022-03-06", // 03/06/2022
|
||||
B_num_nulls = 0)
|
||||
B_nullCount = 0)
|
||||
),
|
||||
Seq("file_1"))
|
||||
|
||||
|
||||
@@ -209,7 +209,7 @@ class TestColumnStatsIndex extends HoodieClientTestBase with ColumnStatsIndexSup
|
||||
})
|
||||
}
|
||||
|
||||
private def buildColumnStatsTableManually(tablePath: String, zorderedCols: Seq[String], indexSchema: StructType) = {
|
||||
private def buildColumnStatsTableManually(tablePath: String, indexedCols: Seq[String], indexSchema: StructType) = {
|
||||
val files = {
|
||||
val it = fs.listFiles(new Path(tablePath), true)
|
||||
var seq = Seq[LocatedFileStatus]()
|
||||
@@ -224,15 +224,16 @@ class TestColumnStatsIndex extends HoodieClientTestBase with ColumnStatsIndexSup
|
||||
val df = spark.read.schema(sourceTableSchema).parquet(file.getPath.toString)
|
||||
val exprs: Seq[String] =
|
||||
s"'${typedLit(file.getPath.getName)}' AS file" +:
|
||||
s"sum(1) AS valueCount" +:
|
||||
df.columns
|
||||
.filter(col => zorderedCols.contains(col))
|
||||
.filter(col => indexedCols.contains(col))
|
||||
.flatMap(col => {
|
||||
val minColName = s"${col}_minValue"
|
||||
val maxColName = s"${col}_maxValue"
|
||||
Seq(
|
||||
s"min($col) AS $minColName",
|
||||
s"max($col) AS $maxColName",
|
||||
s"sum(cast(isnull($col) AS long)) AS ${col}_num_nulls"
|
||||
s"sum(cast(isnull($col) AS long)) AS ${col}_nullCount"
|
||||
)
|
||||
})
|
||||
|
||||
|
||||
Reference in New Issue
Block a user