From d43b4cd95e2d23aafcd3aef390b94bb255354e97 Mon Sep 17 00:00:00 2001 From: Alexey Kudinkin Date: Wed, 6 Apr 2022 12:17:36 -0700 Subject: [PATCH] [HUDI-3739] Fix handling of the `isNotNull` predicate in Data Skipping (#5224) - Fix handling of the isNotNull predicate in Data Skipping --- .../apache/hudi/ColumnStatsIndexSupport.scala | 38 ++- .../org/apache/hudi/HoodieFileIndex.scala | 7 +- .../spark/sql/hudi/DataSkippingUtils.scala | 30 +- .../apache/hudi/ColumnStatsIndexHelper.java | 2 +- .../zorder/column-stats-index-table.json | 8 +- .../updated-column-stats-index-table.json | 16 +- .../apache/hudi/TestDataSkippingUtils.scala | 314 +++++++++--------- .../functional/TestColumnStatsIndex.scala | 7 +- 8 files changed, 214 insertions(+), 208 deletions(-) diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/ColumnStatsIndexSupport.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/ColumnStatsIndexSupport.scala index d176a3755..743932341 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/ColumnStatsIndexSupport.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/ColumnStatsIndexSupport.scala @@ -54,7 +54,8 @@ trait ColumnStatsIndexSupport extends SparkAdapterSupport { HoodieMetadataPayload.COLUMN_STATS_FIELD_FILE_NAME, HoodieMetadataPayload.COLUMN_STATS_FIELD_MIN_VALUE, HoodieMetadataPayload.COLUMN_STATS_FIELD_MAX_VALUE, - HoodieMetadataPayload.COLUMN_STATS_FIELD_NULL_COUNT) + HoodieMetadataPayload.COLUMN_STATS_FIELD_NULL_COUNT, + HoodieMetadataPayload.COLUMN_STATS_FIELD_VALUE_COUNT) val requiredMetadataIndexColumns = (targetColStatsIndexColumns :+ HoodieMetadataPayload.COLUMN_STATS_FIELD_COLUMN_NAME).map(colName => @@ -98,7 +99,7 @@ trait ColumnStatsIndexSupport extends SparkAdapterSupport { * *
    *  +---------------------------+------------+------------+-------------+
-   *  |          file             | A_minValue | A_maxValue | A_num_nulls |
+   *  |          file             | A_minValue | A_maxValue | A_nullCount |
    *  +---------------------------+------------+------------+-------------+
    *  | one_base_file.parquet     |          1 |         10 |           0 |
    *  | another_base_file.parquet |        -10 |          0 |           5 |
@@ -133,6 +134,7 @@ trait ColumnStatsIndexSupport extends SparkAdapterSupport {
     val maxValueOrdinal = colStatsSchemaOrdinalsMap(HoodieMetadataPayload.COLUMN_STATS_FIELD_MAX_VALUE)
     val fileNameOrdinal = colStatsSchemaOrdinalsMap(HoodieMetadataPayload.COLUMN_STATS_FIELD_FILE_NAME)
     val nullCountOrdinal = colStatsSchemaOrdinalsMap(HoodieMetadataPayload.COLUMN_STATS_FIELD_NULL_COUNT)
+    val valueCountOrdinal = colStatsSchemaOrdinalsMap(HoodieMetadataPayload.COLUMN_STATS_FIELD_VALUE_COUNT)
 
     val transposedRDD = colStatsDF.rdd
       .filter(row => sortedColumns.contains(row.getString(colNameOrdinal)))
@@ -155,11 +157,13 @@ trait ColumnStatsIndexSupport extends SparkAdapterSupport {
         case (_, columnRows) =>
           // Rows seq is always non-empty (otherwise it won't be grouped into)
           val fileName = columnRows.head.get(fileNameOrdinal)
+          val valueCount = columnRows.head.get(valueCountOrdinal)
+
           val coalescedRowValuesSeq = columnRows.toSeq
             // NOTE: It's crucial to maintain appropriate ordering of the columns
             //       matching table layout
             .sortBy(_.getString(colNameOrdinal))
-            .foldLeft(Seq[Any](fileName)) {
+            .foldLeft(Seq[Any](fileName, valueCount)) {
               case (acc, columnRow) =>
                 acc ++ Seq(minValueOrdinal, maxValueOrdinal, nullCountOrdinal).map(ord => columnRow.get(ord))
             }
@@ -223,11 +227,6 @@ trait ColumnStatsIndexSupport extends SparkAdapterSupport {
 
 object ColumnStatsIndexSupport {
 
-  private val COLUMN_STATS_INDEX_FILE_COLUMN_NAME = "fileName"
-  private val COLUMN_STATS_INDEX_MIN_VALUE_STAT_NAME = "minValue"
-  private val COLUMN_STATS_INDEX_MAX_VALUE_STAT_NAME = "maxValue"
-  private val COLUMN_STATS_INDEX_NUM_NULLS_STAT_NAME = "num_nulls"
-
   private val metadataRecordSchemaString: String = HoodieMetadataRecord.SCHEMA$.toString
   private val metadataRecordStructType: StructType = AvroConversionUtils.convertAvroSchemaToStructType(HoodieMetadataRecord.SCHEMA$)
 
@@ -235,28 +234,33 @@ object ColumnStatsIndexSupport {
    * @VisibleForTesting
    */
   def composeIndexSchema(targetColumnNames: Seq[String], tableSchema: StructType): StructType = {
-    val fileNameField = StructField(COLUMN_STATS_INDEX_FILE_COLUMN_NAME, StringType, nullable = true, Metadata.empty)
+    val fileNameField = StructField(HoodieMetadataPayload.COLUMN_STATS_FIELD_FILE_NAME, StringType, nullable = true, Metadata.empty)
+    val valueCountField = StructField(HoodieMetadataPayload.COLUMN_STATS_FIELD_VALUE_COUNT, LongType, nullable = true, Metadata.empty)
+
     val targetFields = targetColumnNames.map(colName => tableSchema.fields.find(f => f.name == colName).get)
 
     StructType(
-      targetFields.foldLeft(Seq(fileNameField)) {
+      targetFields.foldLeft(Seq(fileNameField, valueCountField)) {
         case (acc, field) =>
           acc ++ Seq(
-            composeColumnStatStructType(field.name, COLUMN_STATS_INDEX_MIN_VALUE_STAT_NAME, field.dataType),
-            composeColumnStatStructType(field.name, COLUMN_STATS_INDEX_MAX_VALUE_STAT_NAME, field.dataType),
-            composeColumnStatStructType(field.name, COLUMN_STATS_INDEX_NUM_NULLS_STAT_NAME, LongType))
+            composeColumnStatStructType(field.name, HoodieMetadataPayload.COLUMN_STATS_FIELD_MIN_VALUE, field.dataType),
+            composeColumnStatStructType(field.name, HoodieMetadataPayload.COLUMN_STATS_FIELD_MAX_VALUE, field.dataType),
+            composeColumnStatStructType(field.name, HoodieMetadataPayload.COLUMN_STATS_FIELD_NULL_COUNT, LongType))
       }
     )
   }
 
   @inline def getMinColumnNameFor(colName: String): String =
-    formatColName(colName, COLUMN_STATS_INDEX_MIN_VALUE_STAT_NAME)
+    formatColName(colName, HoodieMetadataPayload.COLUMN_STATS_FIELD_MIN_VALUE)
 
   @inline def getMaxColumnNameFor(colName: String): String =
-    formatColName(colName, COLUMN_STATS_INDEX_MAX_VALUE_STAT_NAME)
+    formatColName(colName, HoodieMetadataPayload.COLUMN_STATS_FIELD_MAX_VALUE)
 
-  @inline def getNumNullsColumnNameFor(colName: String): String =
-    formatColName(colName, COLUMN_STATS_INDEX_NUM_NULLS_STAT_NAME)
+  @inline def getNullCountColumnNameFor(colName: String): String =
+    formatColName(colName, HoodieMetadataPayload.COLUMN_STATS_FIELD_NULL_COUNT)
+
+  @inline def getValueCountColumnNameFor: String =
+    HoodieMetadataPayload.COLUMN_STATS_FIELD_VALUE_COUNT
 
   @inline private def formatColName(col: String, statName: String) = { // TODO add escaping for
     String.format("%s_%s", col, statName)
diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala
index 2cea67d27..0ea4d1cef 100644
--- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala
+++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala
@@ -26,12 +26,13 @@ import org.apache.hudi.common.util.StringUtils
 import org.apache.hudi.exception.HoodieException
 import org.apache.hudi.keygen.constant.KeyGeneratorOptions
 import org.apache.hudi.keygen.{TimestampBasedAvroKeyGenerator, TimestampBasedKeyGenerator}
-import org.apache.hudi.metadata.{HoodieMetadataPayload, HoodieTableMetadata, HoodieTableMetadataUtil, MetadataPartitionType}
+import org.apache.hudi.metadata.{HoodieMetadataPayload, HoodieTableMetadataUtil}
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.{And, Expression, Literal}
 import org.apache.spark.sql.execution.datasources.{FileIndex, FileStatusCache, NoopCache, PartitionDirectory}
-import org.apache.spark.sql.hudi.{DataSkippingUtils, HoodieSqlCommonUtils}
+import org.apache.spark.sql.hudi.DataSkippingUtils.translateIntoColumnStatsIndexFilterExpr
+import org.apache.spark.sql.hudi.HoodieSqlCommonUtils
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.{Column, DataFrame, SparkSession}
@@ -211,7 +212,7 @@ case class HoodieFileIndex(spark: SparkSession,
         withPersistence(transposedColStatsDF) {
           val indexSchema = transposedColStatsDF.schema
           val indexFilter =
-            queryFilters.map(DataSkippingUtils.translateIntoColumnStatsIndexFilterExpr(_, indexSchema))
+            queryFilters.map(translateIntoColumnStatsIndexFilterExpr(_, indexSchema))
               .reduce(And)
 
           val allIndexedFileNames =
diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/DataSkippingUtils.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/DataSkippingUtils.scala
index bdaddd3f6..4db94e5b2 100644
--- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/DataSkippingUtils.scala
+++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/DataSkippingUtils.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.sql.hudi
 
-import org.apache.hudi.ColumnStatsIndexSupport.{getMaxColumnNameFor, getMinColumnNameFor, getNumNullsColumnNameFor}
+import org.apache.hudi.ColumnStatsIndexSupport.{getMaxColumnNameFor, getMinColumnNameFor, getNullCountColumnNameFor, getValueCountColumnNameFor}
 import org.apache.hudi.SparkAdapterSupport
 import org.apache.hudi.common.util.ValidationUtils.checkState
 import org.apache.spark.internal.Logging
@@ -135,7 +135,7 @@ object DataSkippingUtils extends Logging {
           }
 
       // Filter "colA = null"
-      // Translates to "colA_num_nulls = null" for index lookup
+      // Translates to "colA_nullCount = null" for index lookup
       case EqualNullSafe(attrRef: AttributeReference, litNull @ Literal(null, _)) =>
         getTargetIndexedColumnName(attrRef, indexSchema)
           .map(colName => EqualTo(genColNumNullsExpr(colName), litNull))
@@ -205,16 +205,16 @@ object DataSkippingUtils extends Logging {
           }
 
       // Filter "colA is null"
-      // Translates to "colA_num_nulls > 0" for index lookup
+      // Translates to "colA_nullCount > 0" for index lookup
       case IsNull(attribute: AttributeReference) =>
         getTargetIndexedColumnName(attribute, indexSchema)
           .map(colName => GreaterThan(genColNumNullsExpr(colName), Literal(0)))
 
       // Filter "colA is not null"
-      // Translates to "colA_num_nulls = 0" for index lookup
+      // Translates to "colA_nullCount < valueCount" for index lookup ("valueCount" is a per-file stat, not prefixed w/ the column name)
       case IsNotNull(attribute: AttributeReference) =>
         getTargetIndexedColumnName(attribute, indexSchema)
-          .map(colName => EqualTo(genColNumNullsExpr(colName), Literal(0)))
+          .map(colName => LessThan(genColNumNullsExpr(colName), genColValueCountExpr))
 
       // Filter "expr(colA) in (B1, B2, ...)"
       // Translates to "(colA_minValue <= B1 AND colA_maxValue >= B1) OR (colA_minValue <= B2 AND colA_maxValue >= B2) ... "
@@ -294,7 +294,7 @@ object DataSkippingUtils extends Logging {
     Set.apply(
       getMinColumnNameFor(colName),
       getMaxColumnNameFor(colName),
-      getNumNullsColumnNameFor(colName)
+      getNullCountColumnNameFor(colName)
     )
       .forall(stat => indexSchema.exists(_.name == stat))
   }
@@ -325,19 +325,14 @@ object DataSkippingUtils extends Logging {
 
 private object ColumnStatsExpressionUtils {
 
-  def genColMinValueExpr(colName: String): Expression =
-    col(getMinColumnNameFor(colName)).expr
-  def genColMaxValueExpr(colName: String): Expression =
-    col(getMaxColumnNameFor(colName)).expr
-  def genColNumNullsExpr(colName: String): Expression =
-    col(getNumNullsColumnNameFor(colName)).expr
+  @inline def genColMinValueExpr(colName: String): Expression = col(getMinColumnNameFor(colName)).expr
+  @inline def genColMaxValueExpr(colName: String): Expression = col(getMaxColumnNameFor(colName)).expr
+  @inline def genColNumNullsExpr(colName: String): Expression = col(getNullCountColumnNameFor(colName)).expr
+  @inline def genColValueCountExpr: Expression = col(getValueCountColumnNameFor).expr
 
-  def genColumnValuesEqualToExpression(colName: String,
+  @inline def genColumnValuesEqualToExpression(colName: String,
                                        value: Expression,
                                        targetExprBuilder: Function[Expression, Expression] = Predef.identity): Expression = {
-    // TODO clean up
-    checkState(isValueExpression(value))
-
     val minValueExpr = targetExprBuilder.apply(genColMinValueExpr(colName))
     val maxValueExpr = targetExprBuilder.apply(genColMaxValueExpr(colName))
     // Only case when column C contains value V is when min(C) <= V <= max(c)
@@ -347,9 +342,6 @@ private object ColumnStatsExpressionUtils {
   def genColumnOnlyValuesEqualToExpression(colName: String,
                                            value: Expression,
                                            targetExprBuilder: Function[Expression, Expression] = Predef.identity): Expression = {
-    // TODO clean up
-    checkState(isValueExpression(value))
-
     val minValueExpr = targetExprBuilder.apply(genColMinValueExpr(colName))
     val maxValueExpr = targetExprBuilder.apply(genColMaxValueExpr(colName))
     // Only case when column C contains _only_ value V is when min(C) = V AND max(c) = V
diff --git a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/ColumnStatsIndexHelper.java b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/ColumnStatsIndexHelper.java
index a60fac232..7c9649d44 100644
--- a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/ColumnStatsIndexHelper.java
+++ b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/ColumnStatsIndexHelper.java
@@ -130,7 +130,7 @@ public class ColumnStatsIndexHelper {
    *
    * 
    * +---------------------------+------------+------------+-------------+
-   * |          file             | A_minValue | A_maxValue | A_num_nulls |
+   * |          file             | A_minValue | A_maxValue | A_nullCount |
    * +---------------------------+------------+------------+-------------+
    * | one_base_file.parquet     |          1 |         10 |           0 |
    * | another_base_file.parquet |        -10 |          0 |           5 |
diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/column-stats-index-table.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/column-stats-index-table.json
index 1ed929c79..297e000de 100644
--- a/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/column-stats-index-table.json
+++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/column-stats-index-table.json
@@ -1,4 +1,4 @@
-{"c1_maxValue":769,"c1_minValue":309,"c1_num_nulls":0,"c2_maxValue":" 769sdc","c2_minValue":" 309sdc","c2_num_nulls":0,"c3_maxValue":919.769,"c3_minValue":76.430,"c3_num_nulls":0,"c4_maxValue":"2021-11-19T20:40:55.543-08:00","c4_minValue":"2021-11-19T20:40:55.521-08:00","c4_num_nulls":0,"c5_maxValue":78,"c5_minValue":32,"c5_num_nulls":0,"c6_maxValue":"2020-11-14","c6_minValue":"2020-01-08","c6_num_nulls":0,"c7_maxValue":"uQ==","c7_minValue":"AQ==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0}
-{"c1_maxValue":932,"c1_minValue":0,"c1_num_nulls":0,"c2_maxValue":" 932sdc","c2_minValue":" 0sdc","c2_num_nulls":0,"c3_maxValue":994.355,"c3_minValue":19.000,"c3_num_nulls":0,"c4_maxValue":"2021-11-19T20:40:55.549-08:00","c4_minValue":"2021-11-19T20:40:55.339-08:00","c4_num_nulls":0,"c5_maxValue":94,"c5_minValue":1,"c5_num_nulls":0,"c6_maxValue":"2020-09-09","c6_minValue":"2020-01-01","c6_num_nulls":0,"c7_maxValue":"xw==","c7_minValue":"AA==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0}
-{"c1_maxValue":943,"c1_minValue":89,"c1_num_nulls":0,"c2_maxValue":" 943sdc","c2_minValue":" 200sdc","c2_num_nulls":0,"c3_maxValue":854.690,"c3_minValue":100.556,"c3_num_nulls":0,"c4_maxValue":"2021-11-19T20:40:55.549-08:00","c4_minValue":"2021-11-19T20:40:55.508-08:00","c4_num_nulls":0,"c5_maxValue":95,"c5_minValue":10,"c5_num_nulls":0,"c6_maxValue":"2020-10-10","c6_minValue":"2020-01-10","c6_num_nulls":0,"c7_maxValue":"yA==","c7_minValue":"LA==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0}
-{"c1_maxValue":959,"c1_minValue":74,"c1_num_nulls":0,"c2_maxValue":" 959sdc","c2_minValue":" 181sdc","c2_num_nulls":0,"c3_maxValue":980.213,"c3_minValue":38.740,"c3_num_nulls":0,"c4_maxValue":"2021-11-19T20:40:55.550-08:00","c4_minValue":"2021-11-19T20:40:55.507-08:00","c4_num_nulls":0,"c5_maxValue":97,"c5_minValue":9,"c5_num_nulls":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-23","c6_num_nulls":0,"c7_maxValue":"1Q==","c7_minValue":"Kw==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0}
\ No newline at end of file
+{"c1_maxValue":769,"c1_minValue":309,"c1_nullCount":0,"c2_maxValue":" 769sdc","c2_minValue":" 309sdc","c2_nullCount":0,"c3_maxValue":919.769,"c3_minValue":76.430,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.543-08:00","c4_minValue":"2021-11-19T20:40:55.521-08:00","c4_nullCount":0,"c5_maxValue":78,"c5_minValue":32,"c5_nullCount":0,"c6_maxValue":"2020-11-14","c6_minValue":"2020-01-08","c6_nullCount":0,"c7_maxValue":"uQ==","c7_minValue":"AQ==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":9}
+{"c1_maxValue":932,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 932sdc","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":994.355,"c3_minValue":19.000,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.549-08:00","c4_minValue":"2021-11-19T20:40:55.339-08:00","c4_nullCount":0,"c5_maxValue":94,"c5_minValue":1,"c5_nullCount":0,"c6_maxValue":"2020-09-09","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"xw==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":8}
+{"c1_maxValue":943,"c1_minValue":89,"c1_nullCount":0,"c2_maxValue":" 943sdc","c2_minValue":" 200sdc","c2_nullCount":0,"c3_maxValue":854.690,"c3_minValue":100.556,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.549-08:00","c4_minValue":"2021-11-19T20:40:55.508-08:00","c4_nullCount":0,"c5_maxValue":95,"c5_minValue":10,"c5_nullCount":0,"c6_maxValue":"2020-10-10","c6_minValue":"2020-01-10","c6_nullCount":0,"c7_maxValue":"yA==","c7_minValue":"LA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":10}
+{"c1_maxValue":959,"c1_minValue":74,"c1_nullCount":0,"c2_maxValue":" 959sdc","c2_minValue":" 181sdc","c2_nullCount":0,"c3_maxValue":980.213,"c3_minValue":38.740,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.550-08:00","c4_minValue":"2021-11-19T20:40:55.507-08:00","c4_nullCount":0,"c5_maxValue":97,"c5_minValue":9,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-23","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"Kw==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":13}
\ No newline at end of file
diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/updated-column-stats-index-table.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/updated-column-stats-index-table.json
index b5486d169..bac789913 100644
--- a/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/updated-column-stats-index-table.json
+++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/updated-column-stats-index-table.json
@@ -1,8 +1,8 @@
-{"c1_maxValue":568,"c1_minValue":8,"c1_num_nulls":0,"c2_maxValue":" 8sdc","c2_minValue":" 111sdc","c2_num_nulls":0,"c3_maxValue":979.272,"c3_minValue":82.111,"c3_num_nulls":0,"c4_maxValue":"2021-11-18T23:34:44.193-08:00","c4_minValue":"2021-11-18T23:34:44.159-08:00","c4_num_nulls":0,"c5_maxValue":58,"c5_minValue":2,"c5_num_nulls":0,"c6_maxValue":"2020-11-08","c6_minValue":"2020-01-01","c6_num_nulls":0,"c7_maxValue":"9g==","c7_minValue":"Ag==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0}
-{"c1_maxValue":715,"c1_minValue":76,"c1_num_nulls":0,"c2_maxValue":" 76sdc","c2_minValue":" 224sdc","c2_num_nulls":0,"c3_maxValue":958.579,"c3_minValue":246.427,"c3_num_nulls":0,"c4_maxValue":"2021-11-18T23:34:44.199-08:00","c4_minValue":"2021-11-18T23:34:44.166-08:00","c4_num_nulls":0,"c5_maxValue":73,"c5_minValue":9,"c5_num_nulls":0,"c6_maxValue":"2020-11-21","c6_minValue":"2020-01-16","c6_num_nulls":0,"c7_maxValue":"+g==","c7_minValue":"LA==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0}
-{"c1_maxValue":768,"c1_minValue":59,"c1_num_nulls":0,"c2_maxValue":" 768sdc","c2_minValue":" 118sdc","c2_num_nulls":0,"c3_maxValue":959.131,"c3_minValue":64.768,"c3_num_nulls":0,"c4_maxValue":"2021-11-18T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.164-08:00","c4_num_nulls":0,"c5_maxValue":78,"c5_minValue":7,"c5_num_nulls":0,"c6_maxValue":"2020-11-20","c6_minValue":"2020-05-04","c6_num_nulls":0,"c7_maxValue":"zw==","c7_minValue":"AA==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0}
-{"c1_maxValue":769,"c1_minValue":309,"c1_num_nulls":0,"c2_maxValue":" 769sdc","c2_minValue":" 309sdc","c2_num_nulls":0,"c3_maxValue":919.769,"c3_minValue":76.430,"c3_num_nulls":0,"c4_maxValue":"2021-11-19T20:40:55.543-08:00","c4_minValue":"2021-11-19T20:40:55.521-08:00","c4_num_nulls":0,"c5_maxValue":78,"c5_minValue":32,"c5_num_nulls":0,"c6_maxValue":"2020-11-14","c6_minValue":"2020-01-08","c6_num_nulls":0,"c7_maxValue":"uQ==","c7_minValue":"AQ==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0}
-{"c1_maxValue":770,"c1_minValue":129,"c1_num_nulls":0,"c2_maxValue":" 770sdc","c2_minValue":" 129sdc","c2_num_nulls":0,"c3_maxValue":977.328,"c3_minValue":153.431,"c3_num_nulls":0,"c4_maxValue":"2021-11-18T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.169-08:00","c4_num_nulls":0,"c5_maxValue":78,"c5_minValue":14,"c5_num_nulls":0,"c6_maxValue":"2020-10-21","c6_minValue":"2020-01-15","c6_num_nulls":0,"c7_maxValue":"rw==","c7_minValue":"Ag==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0}
-{"c1_maxValue":932,"c1_minValue":0,"c1_num_nulls":0,"c2_maxValue":" 932sdc","c2_minValue":" 0sdc","c2_num_nulls":0,"c3_maxValue":994.355,"c3_minValue":19.000,"c3_num_nulls":0,"c4_maxValue":"2021-11-19T20:40:55.549-08:00","c4_minValue":"2021-11-19T20:40:55.339-08:00","c4_num_nulls":0,"c5_maxValue":94,"c5_minValue":1,"c5_num_nulls":0,"c6_maxValue":"2020-09-09","c6_minValue":"2020-01-01","c6_num_nulls":0,"c7_maxValue":"xw==","c7_minValue":"AA==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0}
-{"c1_maxValue":943,"c1_minValue":89,"c1_num_nulls":0,"c2_maxValue":" 943sdc","c2_minValue":" 200sdc","c2_num_nulls":0,"c3_maxValue":854.690,"c3_minValue":100.556,"c3_num_nulls":0,"c4_maxValue":"2021-11-19T20:40:55.549-08:00","c4_minValue":"2021-11-19T20:40:55.508-08:00","c4_num_nulls":0,"c5_maxValue":95,"c5_minValue":10,"c5_num_nulls":0,"c6_maxValue":"2020-10-10","c6_minValue":"2020-01-10","c6_num_nulls":0,"c7_maxValue":"yA==","c7_minValue":"LA==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0}
-{"c1_maxValue":959,"c1_minValue":74,"c1_num_nulls":0,"c2_maxValue":" 959sdc","c2_minValue":" 181sdc","c2_num_nulls":0,"c3_maxValue":980.213,"c3_minValue":38.740,"c3_num_nulls":0,"c4_maxValue":"2021-11-19T20:40:55.550-08:00","c4_minValue":"2021-11-19T20:40:55.507-08:00","c4_num_nulls":0,"c5_maxValue":97,"c5_minValue":9,"c5_num_nulls":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-23","c6_num_nulls":0,"c7_maxValue":"1Q==","c7_minValue":"Kw==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0}
\ No newline at end of file
+{"c1_maxValue":568,"c1_minValue":8,"c1_nullCount":0,"c2_maxValue":" 8sdc","c2_minValue":" 111sdc","c2_nullCount":0,"c3_maxValue":979.272,"c3_minValue":82.111,"c3_nullCount":0,"c4_maxValue":"2021-11-18T23:34:44.193-08:00","c4_minValue":"2021-11-18T23:34:44.159-08:00","c4_nullCount":0,"c5_maxValue":58,"c5_minValue":2,"c5_nullCount":0,"c6_maxValue":"2020-11-08","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"9g==","c7_minValue":"Ag==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":15}
+{"c1_maxValue":715,"c1_minValue":76,"c1_nullCount":0,"c2_maxValue":" 76sdc","c2_minValue":" 224sdc","c2_nullCount":0,"c3_maxValue":958.579,"c3_minValue":246.427,"c3_nullCount":0,"c4_maxValue":"2021-11-18T23:34:44.199-08:00","c4_minValue":"2021-11-18T23:34:44.166-08:00","c4_nullCount":0,"c5_maxValue":73,"c5_minValue":9,"c5_nullCount":0,"c6_maxValue":"2020-11-21","c6_minValue":"2020-01-16","c6_nullCount":0,"c7_maxValue":"+g==","c7_minValue":"LA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":12}
+{"c1_maxValue":768,"c1_minValue":59,"c1_nullCount":0,"c2_maxValue":" 768sdc","c2_minValue":" 118sdc","c2_nullCount":0,"c3_maxValue":959.131,"c3_minValue":64.768,"c3_nullCount":0,"c4_maxValue":"2021-11-18T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.164-08:00","c4_nullCount":0,"c5_maxValue":78,"c5_minValue":7,"c5_nullCount":0,"c6_maxValue":"2020-11-20","c6_minValue":"2020-05-04","c6_nullCount":0,"c7_maxValue":"zw==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":7}
+{"c1_maxValue":769,"c1_minValue":309,"c1_nullCount":0,"c2_maxValue":" 769sdc","c2_minValue":" 309sdc","c2_nullCount":0,"c3_maxValue":919.769,"c3_minValue":76.430,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.543-08:00","c4_minValue":"2021-11-19T20:40:55.521-08:00","c4_nullCount":0,"c5_maxValue":78,"c5_minValue":32,"c5_nullCount":0,"c6_maxValue":"2020-11-14","c6_minValue":"2020-01-08","c6_nullCount":0,"c7_maxValue":"uQ==","c7_minValue":"AQ==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":9}
+{"c1_maxValue":770,"c1_minValue":129,"c1_nullCount":0,"c2_maxValue":" 770sdc","c2_minValue":" 129sdc","c2_nullCount":0,"c3_maxValue":977.328,"c3_minValue":153.431,"c3_nullCount":0,"c4_maxValue":"2021-11-18T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.169-08:00","c4_nullCount":0,"c5_maxValue":78,"c5_minValue":14,"c5_nullCount":0,"c6_maxValue":"2020-10-21","c6_minValue":"2020-01-15","c6_nullCount":0,"c7_maxValue":"rw==","c7_minValue":"Ag==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":6}
+{"c1_maxValue":932,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 932sdc","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":994.355,"c3_minValue":19.000,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.549-08:00","c4_minValue":"2021-11-19T20:40:55.339-08:00","c4_nullCount":0,"c5_maxValue":94,"c5_minValue":1,"c5_nullCount":0,"c6_maxValue":"2020-09-09","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"xw==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":8}
+{"c1_maxValue":943,"c1_minValue":89,"c1_nullCount":0,"c2_maxValue":" 943sdc","c2_minValue":" 200sdc","c2_nullCount":0,"c3_maxValue":854.690,"c3_minValue":100.556,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.549-08:00","c4_minValue":"2021-11-19T20:40:55.508-08:00","c4_nullCount":0,"c5_maxValue":95,"c5_minValue":10,"c5_nullCount":0,"c6_maxValue":"2020-10-10","c6_minValue":"2020-01-10","c6_nullCount":0,"c7_maxValue":"yA==","c7_minValue":"LA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":10}
+{"c1_maxValue":959,"c1_minValue":74,"c1_nullCount":0,"c2_maxValue":" 959sdc","c2_minValue":" 181sdc","c2_nullCount":0,"c3_maxValue":980.213,"c3_minValue":38.740,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.550-08:00","c4_minValue":"2021-11-19T20:40:55.507-08:00","c4_nullCount":0,"c5_maxValue":97,"c5_minValue":9,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-23","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"Kw==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":13}
\ No newline at end of file
diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestDataSkippingUtils.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestDataSkippingUtils.scala
index 10b4faf0c..e0e5cb266 100644
--- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestDataSkippingUtils.scala
+++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestDataSkippingUtils.scala
@@ -36,21 +36,22 @@ import scala.collection.JavaConverters._
 
 // NOTE: Only A, B columns are indexed
 case class IndexRow(fileName: String,
+                    valueCount: Long = 1,
 
                     // Corresponding A column is LongType
                     A_minValue: Long = -1,
                     A_maxValue: Long = -1,
-                    A_num_nulls: Long = -1,
+                    A_nullCount: Long = -1,
 
                     // Corresponding B column is StringType
                     B_minValue: String = null,
                     B_maxValue: String = null,
-                    B_num_nulls: Long = -1,
+                    B_nullCount: Long = -1,
 
                     // Corresponding B column is TimestampType
                     C_minValue: Timestamp = null,
                     C_maxValue: Timestamp = null,
-                    C_num_nulls: Long = -1) {
+                    C_nullCount: Long = -1) {
   def toRow: Row = Row(productIterator.toSeq: _*)
 }
 
@@ -132,28 +133,28 @@ object TestDataSkippingUtils {
       arguments(
         col("B").startsWith("abc").expr,
         Seq(
-          IndexRow("file_1", 0, 0, 0, "aba", "adf", 1), // may contain strings starting w/ "abc"
-          IndexRow("file_2", 0, 0, 0, "adf", "azy", 0),
-          IndexRow("file_3", 0, 0, 0, "aaa", "aba", 0)
+          IndexRow("file_1", valueCount = 1, B_minValue = "aba", B_maxValue = "adf", B_nullCount = 1), // may contain strings starting w/ "abc"
+          IndexRow("file_2", valueCount = 1, B_minValue = "adf", B_maxValue = "azy", B_nullCount = 0),
+          IndexRow("file_3", valueCount = 1, B_minValue = "aaa", B_maxValue = "aba", B_nullCount = 0)
         ),
         Seq("file_1")),
       arguments(
         Not(col("B").startsWith("abc").expr),
         Seq(
-          IndexRow("file_1", 0, 0, 0, "aba", "adf", 1), // may contain strings starting w/ "abc"
-          IndexRow("file_2", 0, 0, 0, "adf", "azy", 0),
-          IndexRow("file_3", 0, 0, 0, "aaa", "aba", 0),
-          IndexRow("file_4", 0, 0, 0, "abc123", "abc345", 0) // all strings start w/ "abc"
+          IndexRow("file_1", valueCount = 1, B_minValue = "aba", B_maxValue = "adf", B_nullCount = 1), // may contain strings starting w/ "abc"
+          IndexRow("file_2", valueCount = 1, B_minValue = "adf", B_maxValue = "azy", B_nullCount = 0),
+          IndexRow("file_3", valueCount = 1, B_minValue = "aaa", B_maxValue = "aba", B_nullCount = 0),
+          IndexRow("file_4", valueCount = 1, B_minValue = "abc123", B_maxValue = "abc345", B_nullCount = 0) // all strings start w/ "abc"
         ),
         Seq("file_1", "file_2", "file_3")),
       arguments(
         // Composite expression
         Not(lower(col("B")).startsWith("abc").expr),
         Seq(
-          IndexRow("file_1", 0, 0, 0, "ABA", "ADF", 1), // may contain strings starting w/ "ABC" (after upper)
-          IndexRow("file_2", 0, 0, 0, "ADF", "AZY", 0),
-          IndexRow("file_3", 0, 0, 0, "AAA", "ABA", 0),
-          IndexRow("file_4", 0, 0, 0, "ABC123", "ABC345", 0) // all strings start w/ "ABC" (after upper)
+          IndexRow("file_1", valueCount = 1, B_minValue = "ABA", B_maxValue = "ADF", B_nullCount = 1), // may contain strings starting w/ "ABC" (after upper)
+          IndexRow("file_2", valueCount = 1, B_minValue = "ADF", B_maxValue = "AZY", B_nullCount = 0),
+          IndexRow("file_3", valueCount = 1, B_minValue = "AAA", B_maxValue = "ABA", B_nullCount = 0),
+          IndexRow("file_4", valueCount = 1, B_minValue = "ABC123", B_maxValue = "ABC345", B_nullCount = 0) // all strings start w/ "ABC" (after upper)
         ),
         Seq("file_1", "file_2", "file_3"))
     )
@@ -166,144 +167,151 @@ object TestDataSkippingUtils {
       arguments(
         "A = 0",
         Seq(
-          IndexRow("file_1", 1, 2, 0),
-          IndexRow("file_2", -1, 1, 0)
+          IndexRow("file_1", valueCount = 1, 1, 2, 0),
+          IndexRow("file_2", valueCount = 1, -1, 1, 0)
         ),
         Seq("file_2")),
       arguments(
         "0 = A",
         Seq(
-          IndexRow("file_1", 1, 2, 0),
-          IndexRow("file_2", -1, 1, 0)
+          IndexRow("file_1", valueCount = 1, 1, 2, 0),
+          IndexRow("file_2", valueCount = 1, -1, 1, 0)
         ),
         Seq("file_2")),
       arguments(
         "A != 0",
         Seq(
-          IndexRow("file_1", 1, 2, 0),
-          IndexRow("file_2", -1, 1, 0),
-          IndexRow("file_3", 0, 0, 0) // Contains only 0s
+          IndexRow("file_1", valueCount = 1, 1, 2, 0),
+          IndexRow("file_2", valueCount = 1, -1, 1, 0),
+          IndexRow("file_3", valueCount = 1, 0, 0, 0) // Contains only 0s
         ),
         Seq("file_1", "file_2")),
       arguments(
         "0 != A",
         Seq(
-          IndexRow("file_1", 1, 2, 0),
-          IndexRow("file_2", -1, 1, 0),
-          IndexRow("file_3", 0, 0, 0) // Contains only 0s
+          IndexRow("file_1", valueCount = 1, 1, 2, 0),
+          IndexRow("file_2", valueCount = 1, -1, 1, 0),
+          IndexRow("file_3", valueCount = 1, 0, 0, 0) // Contains only 0s
         ),
         Seq("file_1", "file_2")),
       arguments(
         "A < 0",
         Seq(
-          IndexRow("file_1", 1, 2, 0),
-          IndexRow("file_2", -1, 1, 0),
-          IndexRow("file_3", -2, -1, 0)
+          IndexRow("file_1", valueCount = 1, 1, 2, 0),
+          IndexRow("file_2", valueCount = 1, -1, 1, 0),
+          IndexRow("file_3", valueCount = 1, -2, -1, 0)
         ),
         Seq("file_2", "file_3")),
       arguments(
         "0 > A",
         Seq(
-          IndexRow("file_1", 1, 2, 0),
-          IndexRow("file_2", -1, 1, 0),
-          IndexRow("file_3", -2, -1, 0)
+          IndexRow("file_1", valueCount = 1, 1, 2, 0),
+          IndexRow("file_2", valueCount = 1, -1, 1, 0),
+          IndexRow("file_3", valueCount = 1, -2, -1, 0)
         ),
         Seq("file_2", "file_3")),
       arguments(
         "A > 0",
         Seq(
-          IndexRow("file_1", 1, 2, 0),
-          IndexRow("file_2", -1, 1, 0),
-          IndexRow("file_3", -2, -1, 0)
+          IndexRow("file_1", valueCount = 1, 1, 2, 0),
+          IndexRow("file_2", valueCount = 1, -1, 1, 0),
+          IndexRow("file_3", valueCount = 1, -2, -1, 0)
         ),
         Seq("file_1", "file_2")),
       arguments(
         "0 < A",
         Seq(
-          IndexRow("file_1", 1, 2, 0),
-          IndexRow("file_2", -1, 1, 0),
-          IndexRow("file_3", -2, -1, 0)
+          IndexRow("file_1", valueCount = 1, 1, 2, 0),
+          IndexRow("file_2", valueCount = 1, -1, 1, 0),
+          IndexRow("file_3", valueCount = 1, -2, -1, 0)
         ),
         Seq("file_1", "file_2")),
       arguments(
         "A <= -1",
         Seq(
-          IndexRow("file_1", 1, 2, 0),
-          IndexRow("file_2", -1, 1, 0),
-          IndexRow("file_3", -2, -1, 0)
+          IndexRow("file_1", valueCount = 1, 1, 2, 0),
+          IndexRow("file_2", valueCount = 1, -1, 1, 0),
+          IndexRow("file_3", valueCount = 1, -2, -1, 0)
         ),
         Seq("file_2", "file_3")),
       arguments(
         "-1 >= A",
         Seq(
-          IndexRow("file_1", 1, 2, 0),
-          IndexRow("file_2", -1, 1, 0),
-          IndexRow("file_3", -2, -1, 0)
+          IndexRow("file_1", valueCount = 1, 1, 2, 0),
+          IndexRow("file_2", valueCount = 1, -1, 1, 0),
+          IndexRow("file_3", valueCount = 1, -2, -1, 0)
         ),
         Seq("file_2", "file_3")),
       arguments(
         "A >= 1",
         Seq(
-          IndexRow("file_1", 1, 2, 0),
-          IndexRow("file_2", -1, 1, 0),
-          IndexRow("file_3", -2, -1, 0)
+          IndexRow("file_1", valueCount = 1, 1, 2, 0),
+          IndexRow("file_2", valueCount = 1, -1, 1, 0),
+          IndexRow("file_3", valueCount = 1, -2, -1, 0)
         ),
         Seq("file_1", "file_2")),
       arguments(
         "1 <= A",
         Seq(
-          IndexRow("file_1", 1, 2, 0),
-          IndexRow("file_2", -1, 1, 0),
-          IndexRow("file_3", -2, -1, 0)
+          IndexRow("file_1", valueCount = 1, 1, 2, 0),
+          IndexRow("file_2", valueCount = 1, -1, 1, 0),
+          IndexRow("file_3", valueCount = 1, -2, -1, 0)
         ),
         Seq("file_1", "file_2")),
       arguments(
         "A is null",
         Seq(
-          IndexRow("file_1", 1, 2, 0),
-          IndexRow("file_2", -1, 1, 1)
+          IndexRow("file_1", valueCount = 1, 1, 2, 0),
+          IndexRow("file_2", valueCount = 1, -1, 1, 1)
         ),
         Seq("file_2")),
       arguments(
         "A is not null",
         Seq(
-          IndexRow("file_1", 1, 2, 0),
-          IndexRow("file_2", -1, 1, 1)
+          IndexRow("file_1", valueCount = 1, 1, 2, 0),
+          IndexRow("file_2", valueCount = 2, -1, 1, 1) // might still contain non-null values (if nullCount < valueCount)
+        ),
+        Seq("file_1", "file_2")),
+      arguments(
+        "A is not null",
+        Seq(
+          IndexRow("file_1", valueCount = 1, 1, 2, 0),
+          IndexRow("file_2", valueCount = 1, -1, 1, 1) // contains NO non-null values (nullCount == valueCount)
         ),
         Seq("file_1")),
       arguments(
         "A in (0, 1)",
         Seq(
-          IndexRow("file_1", 1, 2, 0),
-          IndexRow("file_2", -1, 1, 0),
-          IndexRow("file_3", -2, -1, 0)
+          IndexRow("file_1", valueCount = 1, 1, 2, 0),
+          IndexRow("file_2", valueCount = 1, -1, 1, 0),
+          IndexRow("file_3", valueCount = 1, -2, -1, 0)
         ),
         Seq("file_1", "file_2")),
       arguments(
         "A not in (0, 1)",
         Seq(
-          IndexRow("file_1", 1, 2, 0),
-          IndexRow("file_2", -1, 1, 0),
-          IndexRow("file_3", -2, -1, 0),
-          IndexRow("file_4", 0, 0, 0), // only contains 0
-          IndexRow("file_5", 1, 1, 0) // only contains 1
+          IndexRow("file_1", valueCount = 1, 1, 2, 0),
+          IndexRow("file_2", valueCount = 1, -1, 1, 0),
+          IndexRow("file_3", valueCount = 1, -2, -1, 0),
+          IndexRow("file_4", valueCount = 1, 0, 0, 0), // only contains 0
+          IndexRow("file_5", valueCount = 1, 1, 1, 0) // only contains 1
         ),
         Seq("file_1", "file_2", "file_3")),
       arguments(
         // Value expression containing expression, which isn't a literal
         "A = int('0')",
         Seq(
-          IndexRow("file_1", 1, 2, 0),
-          IndexRow("file_2", -1, 1, 0)
+          IndexRow("file_1", valueCount = 1, 1, 2, 0),
+          IndexRow("file_2", valueCount = 1, -1, 1, 0)
         ),
         Seq("file_2")),
       arguments(
         // Value expression containing reference to the other attribute (column), fallback
         "A = D",
         Seq(
-          IndexRow("file_1", 1, 2, 0),
-          IndexRow("file_2", -1, 1, 0),
-          IndexRow("file_3", -2, -1, 0)
+          IndexRow("file_1", valueCount = 1, 1, 2, 0),
+          IndexRow("file_2", valueCount = 1, -1, 1, 0),
+          IndexRow("file_3", valueCount = 1, -2, -1, 0)
         ),
         Seq("file_1", "file_2", "file_3"))
     )
@@ -315,22 +323,22 @@ object TestDataSkippingUtils {
         // Filter out all rows that contain either A = 0 OR A = 1
         "A != 0 AND A != 1",
         Seq(
-          IndexRow("file_1", 1, 2, 0),
-          IndexRow("file_2", -1, 1, 0),
-          IndexRow("file_3", -2, -1, 0),
-          IndexRow("file_4", 0, 0, 0), // only contains 0
-          IndexRow("file_5", 1, 1, 0) // only contains 1
+          IndexRow("file_1", valueCount = 1, 1, 2, 0),
+          IndexRow("file_2", valueCount = 1, -1, 1, 0),
+          IndexRow("file_3", valueCount = 1, -2, -1, 0),
+          IndexRow("file_4", valueCount = 1, 0, 0, 0), // only contains 0
+          IndexRow("file_5", valueCount = 1, 1, 1, 0) // only contains 1
         ),
         Seq("file_1", "file_2", "file_3")),
       arguments(
         // This is an equivalent to the above expression
         "NOT(A = 0 OR A = 1)",
         Seq(
-          IndexRow("file_1", 1, 2, 0),
-          IndexRow("file_2", -1, 1, 0),
-          IndexRow("file_3", -2, -1, 0),
-          IndexRow("file_4", 0, 0, 0), // only contains 0
-          IndexRow("file_5", 1, 1, 0) // only contains 1
+          IndexRow("file_1", valueCount = 1, 1, 2, 0),
+          IndexRow("file_2", valueCount = 1, -1, 1, 0),
+          IndexRow("file_3", valueCount = 1, -2, -1, 0),
+          IndexRow("file_4", valueCount = 1, 0, 0, 0), // only contains 0
+          IndexRow("file_5", valueCount = 1, 1, 1, 0) // only contains 1
         ),
         Seq("file_1", "file_2", "file_3")),
 
@@ -338,22 +346,22 @@ object TestDataSkippingUtils {
         // Filter out all rows that contain A = 0 AND B = 'abc'
         "A != 0 OR B != 'abc'",
         Seq(
-          IndexRow("file_1", 1, 2, 0),
-          IndexRow("file_2", -1, 1, 0),
-          IndexRow("file_3", -2, -1, 0),
-          IndexRow("file_4", 0, 0, 0, "abc", "abc", 0), // only contains A = 0, B = 'abc'
-          IndexRow("file_5", 0, 0, 0, "abc", "abc", 0) // only contains A = 0, B = 'abc'
+          IndexRow("file_1", valueCount = 1, A_minValue = 1,  A_maxValue = 2,  A_nullCount = 0),
+          IndexRow("file_2", valueCount = 1, A_minValue = -1, A_maxValue = 1,  A_nullCount = 0),
+          IndexRow("file_3", valueCount = 1, A_minValue = -2, A_maxValue = -1, A_nullCount =  0),
+          IndexRow("file_4", valueCount = 1, A_minValue = 0, A_maxValue = 0, A_nullCount = 0, B_minValue = "abc", B_maxValue = "abc", B_nullCount = 0), // only contains A = 0, B = 'abc'
+          IndexRow("file_5", valueCount = 1, A_minValue = 0, A_maxValue = 0, A_nullCount = 0, B_minValue = "abc", B_maxValue = "abc", B_nullCount = 0) // only contains A = 0, B = 'abc'
         ),
         Seq("file_1", "file_2", "file_3")),
       arguments(
         // This is an equivalent to the above expression
         "NOT(A = 0 AND B = 'abc')",
         Seq(
-          IndexRow("file_1", 1, 2, 0),
-          IndexRow("file_2", -1, 1, 0),
-          IndexRow("file_3", -2, -1, 0),
-          IndexRow("file_4", 0, 0, 0, "abc", "abc", 0), // only contains A = 0, B = 'abc'
-          IndexRow("file_5", 0, 0, 0, "abc", "abc", 0) // only contains A = 0, B = 'abc'
+          IndexRow("file_1", valueCount = 1, A_minValue = 1, A_maxValue = 2, A_nullCount = 0),
+          IndexRow("file_2", valueCount = 1, A_minValue = -1, A_maxValue = 1, A_nullCount = 0),
+          IndexRow("file_3", valueCount = 1, A_minValue = -2, A_maxValue = -1, A_nullCount = 0),
+          IndexRow("file_4", valueCount = 1, A_minValue = 0, A_maxValue = 0, A_nullCount = 0, B_minValue = "abc", B_maxValue = "abc", B_nullCount = 0), // only contains A = 0, B = 'abc'
+          IndexRow("file_5", valueCount = 1, A_minValue = 0, A_maxValue = 0, A_nullCount = 0, B_minValue = "abc", B_maxValue = "abc", B_nullCount = 0) // only contains A = 0, B = 'abc'
         ),
         Seq("file_1", "file_2", "file_3")),
 
@@ -361,10 +369,10 @@ object TestDataSkippingUtils {
         // Queries contains expression involving non-indexed column D
         "A = 0 AND B = 'abc' AND D IS NULL",
         Seq(
-          IndexRow("file_1", 1, 2, 0),
-          IndexRow("file_2", -1, 1, 0),
-          IndexRow("file_3", -2, -1, 0),
-          IndexRow("file_4", 0, 0, 0, "aaa", "xyz", 0) // might contain A = 0 AND B = 'abc'
+          IndexRow("file_1", valueCount = 1, A_minValue = 1, A_maxValue = 2, A_nullCount = 0),
+          IndexRow("file_2", valueCount = 1, A_minValue = -1, A_maxValue = 1, A_nullCount = 0),
+          IndexRow("file_3", valueCount = 1, A_minValue = -2, A_maxValue = -1, A_nullCount = 0),
+          IndexRow("file_4", valueCount = 1, A_minValue = 0, A_maxValue = 0, A_nullCount = 0, B_minValue = "aaa", B_maxValue = "xyz", B_nullCount = 0) // might contain A = 0 AND B = 'abc'
         ),
         Seq("file_4")),
 
@@ -372,10 +380,10 @@ object TestDataSkippingUtils {
         // Queries contains expression involving non-indexed column D
         "A = 0 OR B = 'abc' OR D IS NULL",
         Seq(
-          IndexRow("file_1", 1, 2, 0),
-          IndexRow("file_2", -1, 1, 0),
-          IndexRow("file_3", -2, -1, 0),
-          IndexRow("file_4", 0, 0, 0, "aaa", "xyz", 0) // might contain B = 'abc'
+          IndexRow("file_1", valueCount = 1, A_minValue = 1, A_maxValue = 2, A_nullCount = 0),
+          IndexRow("file_2", valueCount = 1, A_minValue = -1, A_maxValue =  1, A_nullCount = 0),
+          IndexRow("file_3", valueCount = 1, A_minValue = -2, A_maxValue =  -1, A_nullCount = 0),
+          IndexRow("file_4", valueCount = 1, B_minValue = "aaa", B_maxValue = "xyz", B_nullCount = 0) // might contain B = 'abc'
         ),
         Seq("file_1", "file_2", "file_3", "file_4"))
     )
@@ -387,197 +395,197 @@ object TestDataSkippingUtils {
       arguments(
         "date_format(C, 'MM/dd/yyyy') = '03/07/2022'",
         Seq(
-          IndexRow("file_1",
+          IndexRow("file_1", valueCount = 1,
             C_minValue = new Timestamp(1646711448000L), // 03/08/2022
             C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
-            C_num_nulls = 0),
-          IndexRow("file_2",
+            C_nullCount = 0),
+          IndexRow("file_2", valueCount = 1,
             C_minValue = new Timestamp(1646625048000L), // 03/07/2022
             C_maxValue = new Timestamp(1646711448000L), // 03/08/2022
-            C_num_nulls = 0)
+            C_nullCount = 0)
         ),
         Seq("file_2")),
       arguments(
         "'03/07/2022' = date_format(C, 'MM/dd/yyyy')",
         Seq(
-          IndexRow("file_1",
+          IndexRow("file_1", valueCount = 1,
             C_minValue = new Timestamp(1646711448000L), // 03/08/2022
             C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
-            C_num_nulls = 0),
-          IndexRow("file_2",
+            C_nullCount = 0),
+          IndexRow("file_2", valueCount = 1,
             C_minValue = new Timestamp(1646625048000L), // 03/07/2022
             C_maxValue = new Timestamp(1646711448000L), // 03/08/2022
-            C_num_nulls = 0)
+            C_nullCount = 0)
         ),
         Seq("file_2")),
       arguments(
         "'03/07/2022' != date_format(C, 'MM/dd/yyyy')",
         Seq(
-          IndexRow("file_1",
+          IndexRow("file_1", valueCount = 1,
             C_minValue = new Timestamp(1646711448000L), // 03/08/2022
             C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
-            C_num_nulls = 0),
-          IndexRow("file_2",
+            C_nullCount = 0),
+          IndexRow("file_2", valueCount = 1,
             C_minValue = new Timestamp(1646625048000L), // 03/07/2022
             C_maxValue = new Timestamp(1646625048000L), // 03/07/2022
-            C_num_nulls = 0)
+            C_nullCount = 0)
         ),
         Seq("file_1")),
       arguments(
         "date_format(C, 'MM/dd/yyyy') != '03/07/2022'",
         Seq(
-          IndexRow("file_1",
+          IndexRow("file_1", valueCount = 1,
             C_minValue = new Timestamp(1646711448000L), // 03/08/2022
             C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
-            C_num_nulls = 0),
-          IndexRow("file_2",
+            C_nullCount = 0),
+          IndexRow("file_2", valueCount = 1,
             C_minValue = new Timestamp(1646625048000L), // 03/07/2022
             C_maxValue = new Timestamp(1646625048000L), // 03/07/2022
-            C_num_nulls = 0)
+            C_nullCount = 0)
         ),
         Seq("file_1")),
       arguments(
         "date_format(C, 'MM/dd/yyyy') < '03/08/2022'",
         Seq(
-          IndexRow("file_1",
+          IndexRow("file_1", valueCount = 1,
             C_minValue = new Timestamp(1646711448000L), // 03/08/2022
             C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
-            C_num_nulls = 0),
-          IndexRow("file_2",
+            C_nullCount = 0),
+          IndexRow("file_2", valueCount = 1,
             C_minValue = new Timestamp(1646625048000L), // 03/07/2022
             C_maxValue = new Timestamp(1646711448000L), // 03/08/2022
-            C_num_nulls = 0)
+            C_nullCount = 0)
         ),
         Seq("file_2")),
       arguments(
         "'03/08/2022' > date_format(C, 'MM/dd/yyyy')",
         Seq(
-          IndexRow("file_1",
+          IndexRow("file_1", valueCount = 1,
             C_minValue = new Timestamp(1646711448000L), // 03/08/2022
             C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
-            C_num_nulls = 0),
-          IndexRow("file_2",
+            C_nullCount = 0),
+          IndexRow("file_2", valueCount = 1,
             C_minValue = new Timestamp(1646625048000L), // 03/07/2022
             C_maxValue = new Timestamp(1646711448000L), // 03/08/2022
-            C_num_nulls = 0)
+            C_nullCount = 0)
         ),
         Seq("file_2")),
       arguments(
         "'03/08/2022' < date_format(C, 'MM/dd/yyyy')",
         Seq(
-          IndexRow("file_1",
+          IndexRow("file_1", valueCount = 1,
             C_minValue = new Timestamp(1646711448000L), // 03/08/2022
             C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
-            C_num_nulls = 0),
-          IndexRow("file_2",
+            C_nullCount = 0),
+          IndexRow("file_2", valueCount = 1,
             C_minValue = new Timestamp(1646625048000L), // 03/07/2022
             C_maxValue = new Timestamp(1646711448000L), // 03/08/2022
-            C_num_nulls = 0)
+            C_nullCount = 0)
         ),
         Seq("file_1")),
       arguments(
         "date_format(C, 'MM/dd/yyyy') > '03/08/2022'",
         Seq(
-          IndexRow("file_1",
+          IndexRow("file_1", valueCount = 1,
             C_minValue = new Timestamp(1646711448000L), // 03/08/2022
             C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
-            C_num_nulls = 0),
-          IndexRow("file_2",
+            C_nullCount = 0),
+          IndexRow("file_2", valueCount = 1,
             C_minValue = new Timestamp(1646625048000L), // 03/07/2022
             C_maxValue = new Timestamp(1646711448000L), // 03/08/2022
-            C_num_nulls = 0)
+            C_nullCount = 0)
         ),
         Seq("file_1")),
       arguments(
         "date_format(C, 'MM/dd/yyyy') <= '03/07/2022'",
         Seq(
-          IndexRow("file_1",
+          IndexRow("file_1", valueCount = 1,
             C_minValue = new Timestamp(1646711448000L), // 03/08/2022
             C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
-            C_num_nulls = 0),
-          IndexRow("file_2",
+            C_nullCount = 0),
+          IndexRow("file_2", valueCount = 1,
             C_minValue = new Timestamp(1646625048000L), // 03/07/2022
             C_maxValue = new Timestamp(1646711448000L), // 03/08/2022
-            C_num_nulls = 0)
+            C_nullCount = 0)
         ),
         Seq("file_2")),
       arguments(
         "'03/07/2022' >= date_format(C, 'MM/dd/yyyy')",
         Seq(
-          IndexRow("file_1",
+          IndexRow("file_1", valueCount = 1,
             C_minValue = new Timestamp(1646711448000L), // 03/08/2022
             C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
-            C_num_nulls = 0),
-          IndexRow("file_2",
+            C_nullCount = 0),
+          IndexRow("file_2", valueCount = 1,
             C_minValue = new Timestamp(1646625048000L), // 03/07/2022
             C_maxValue = new Timestamp(1646711448000L), // 03/08/2022
-            C_num_nulls = 0)
+            C_nullCount = 0)
         ),
         Seq("file_2")),
       arguments(
         "'03/09/2022' <= date_format(C, 'MM/dd/yyyy')",
         Seq(
-          IndexRow("file_1",
+          IndexRow("file_1", valueCount = 1,
             C_minValue = new Timestamp(1646711448000L), // 03/08/2022
             C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
-            C_num_nulls = 0),
-          IndexRow("file_2",
+            C_nullCount = 0),
+          IndexRow("file_2", valueCount = 1,
             C_minValue = new Timestamp(1646625048000L), // 03/07/2022
             C_maxValue = new Timestamp(1646711448000L), // 03/08/2022
-            C_num_nulls = 0)
+            C_nullCount = 0)
         ),
         Seq("file_1")),
       arguments(
         "date_format(C, 'MM/dd/yyyy') >= '03/09/2022'",
         Seq(
-          IndexRow("file_1",
+          IndexRow("file_1", valueCount = 1,
             C_minValue = new Timestamp(1646711448000L), // 03/08/2022
             C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
-            C_num_nulls = 0),
-          IndexRow("file_2",
+            C_nullCount = 0),
+          IndexRow("file_2", valueCount = 1,
             C_minValue = new Timestamp(1646625048000L), // 03/07/2022
             C_maxValue = new Timestamp(1646711448000L), // 03/08/2022
-            C_num_nulls = 0)
+            C_nullCount = 0)
         ),
         Seq("file_1")),
       arguments(
         "date_format(C, 'MM/dd/yyyy') IN ('03/09/2022')",
         Seq(
-          IndexRow("file_1",
+          IndexRow("file_1", valueCount = 1,
             C_minValue = new Timestamp(1646711448000L), // 03/08/2022
             C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
-            C_num_nulls = 0),
-          IndexRow("file_2",
+            C_nullCount = 0),
+          IndexRow("file_2", valueCount = 1,
             C_minValue = new Timestamp(1646625048000L), // 03/07/2022
             C_maxValue = new Timestamp(1646711448000L), // 03/08/2022
-            C_num_nulls = 0)
+            C_nullCount = 0)
         ),
         Seq("file_1")),
       arguments(
         "date_format(C, 'MM/dd/yyyy') NOT IN ('03/07/2022')",
         Seq(
-          IndexRow("file_1",
+          IndexRow("file_1", valueCount = 1,
             C_minValue = new Timestamp(1646711448000L), // 03/08/2022
             C_maxValue = new Timestamp(1646797848000L), // 03/09/2022
-            C_num_nulls = 0),
-          IndexRow("file_2",
+            C_nullCount = 0),
+          IndexRow("file_2", valueCount = 1,
             C_minValue = new Timestamp(1646625048000L), // 03/07/2022
             C_maxValue = new Timestamp(1646625048000L), // 03/07/2022
-            C_num_nulls = 0)
+            C_nullCount = 0)
         ),
         Seq("file_1")),
       arguments(
         // Should be identical to the one above
         "date_format(to_timestamp(B, 'yyyy-MM-dd'), 'MM/dd/yyyy') NOT IN ('03/06/2022')",
         Seq(
-          IndexRow("file_1",
+          IndexRow("file_1", valueCount = 1,
             B_minValue = "2022-03-07", // 03/07/2022
             B_maxValue = "2022-03-08", // 03/08/2022
-            B_num_nulls = 0),
-          IndexRow("file_2",
+            B_nullCount = 0),
+          IndexRow("file_2", valueCount = 1,
             B_minValue = "2022-03-06", // 03/06/2022
             B_maxValue = "2022-03-06", // 03/06/2022
-            B_num_nulls = 0)
+            B_nullCount = 0)
         ),
         Seq("file_1"))
 
diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala
index 8f827f13d..dcc34aa38 100644
--- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala
+++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala
@@ -209,7 +209,7 @@ class TestColumnStatsIndex extends HoodieClientTestBase with ColumnStatsIndexSup
     })
   }
 
-  private def buildColumnStatsTableManually(tablePath: String, zorderedCols: Seq[String], indexSchema: StructType) = {
+  private def buildColumnStatsTableManually(tablePath: String, indexedCols: Seq[String], indexSchema: StructType) = {
     val files = {
       val it = fs.listFiles(new Path(tablePath), true)
       var seq = Seq[LocatedFileStatus]()
@@ -224,15 +224,16 @@ class TestColumnStatsIndex extends HoodieClientTestBase with ColumnStatsIndexSup
         val df = spark.read.schema(sourceTableSchema).parquet(file.getPath.toString)
         val exprs: Seq[String] =
           s"'${typedLit(file.getPath.getName)}' AS file" +:
+          s"sum(1) AS valueCount" +:
             df.columns
-              .filter(col => zorderedCols.contains(col))
+              .filter(col => indexedCols.contains(col))
               .flatMap(col => {
                 val minColName = s"${col}_minValue"
                 val maxColName = s"${col}_maxValue"
                 Seq(
                   s"min($col) AS $minColName",
                   s"max($col) AS $maxColName",
-                  s"sum(cast(isnull($col) AS long)) AS ${col}_num_nulls"
+                  s"sum(cast(isnull($col) AS long)) AS ${col}_nullCount"
                 )
               })