1
0

[HUDI-2072] Add pre-commit validator framework (#3153)

* [HUDI-2072] Add pre-commit validator framework

* trigger Travis rebuild
This commit is contained in:
satishkotha
2021-08-03 12:07:45 -07:00
committed by GitHub
parent bec23bda50
commit 826a04d142
14 changed files with 1130 additions and 32 deletions

View File

@@ -0,0 +1,167 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.client.utils;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.client.validator.SparkPreCommitValidator;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.table.view.HoodieTablePreCommitFileSystemView;
import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieValidationException;
import org.apache.hudi.table.HoodieSparkTable;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.action.HoodieWriteMetadata;
import org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import scala.collection.JavaConverters;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
* Spark validator utils to verify and run any precommit validators configured.
*/
public class SparkValidatorUtils {
private static final Logger LOG = LogManager.getLogger(BaseSparkCommitActionExecutor.class);
/**
* Check configured pre-commit validators and run them. Note that this only works for COW tables
*
* Throw error if there are validation failures.
*/
public static void runValidators(HoodieWriteConfig config,
HoodieWriteMetadata<JavaRDD<WriteStatus>> writeMetadata,
HoodieEngineContext context,
HoodieTable table,
String instantTime) {
if (StringUtils.isNullOrEmpty(config.getPreCommitValidators())) {
LOG.info("no validators configured.");
} else {
if (!writeMetadata.getWriteStats().isPresent()) {
writeMetadata.setWriteStats(writeMetadata.getWriteStatuses().map(WriteStatus::getStat).collect());
}
Set<String> partitionsModified = new HashSet<>(writeMetadata.getWriteStats().get().stream().map(writeStats ->
writeStats.getPartitionPath()).collect(Collectors.toList()));
SQLContext sqlContext = new SQLContext(HoodieSparkEngineContext.getSparkContext(context));
// Refresh timeline to ensure validator sees the any other operations done on timeline (async operations such as other clustering/compaction/rollback)
table.getMetaClient().reloadActiveTimeline();
Dataset<Row> beforeState = getRecordsFromCommittedFiles(sqlContext, partitionsModified, table).cache();
Dataset<Row> afterState = getRecordsFromPendingCommits(sqlContext, partitionsModified, writeMetadata, table, instantTime).cache();
Stream<SparkPreCommitValidator> validators = Arrays.stream(config.getPreCommitValidators().split(","))
.map(validatorClass -> {
return ((SparkPreCommitValidator) ReflectionUtils.loadClass(validatorClass,
new Class<?>[] {HoodieSparkTable.class, HoodieEngineContext.class, HoodieWriteConfig.class},
table, context, config));
});
boolean allSuccess = validators.map(v -> runValidatorAsync(v, writeMetadata, beforeState, afterState, instantTime)).map(CompletableFuture::join)
.reduce(true, Boolean::logicalAnd);
if (allSuccess) {
LOG.info("All validations succeeded");
} else {
LOG.error("At least one pre-commit validation failed");
throw new HoodieValidationException("At least one pre-commit validation failed");
}
}
}
/**
* Run validators in a separate threadpool for parallelism. Each of validator can submit a distributed spark job if needed.
*/
private static CompletableFuture<Boolean> runValidatorAsync(SparkPreCommitValidator validator, HoodieWriteMetadata writeMetadata,
Dataset<Row> beforeState, Dataset<Row> afterState, String instantTime) {
return CompletableFuture.supplyAsync(() -> {
try {
validator.validate(instantTime, writeMetadata, beforeState, afterState);
LOG.info("validation complete for " + validator.getClass().getName());
return true;
} catch (HoodieValidationException e) {
LOG.error("validation failed for " + validator.getClass().getName());
return false;
}
});
}
/**
* Get records from partitions modified as a dataset.
* Note that this only works for COW tables.
*/
public static Dataset<Row> getRecordsFromCommittedFiles(SQLContext sqlContext,
Set<String> partitionsAffected, HoodieTable table) {
List<String> committedFiles = partitionsAffected.stream()
.flatMap(partition -> table.getBaseFileOnlyView().getLatestBaseFiles(partition).map(bf -> bf.getPath()))
.collect(Collectors.toList());
if (committedFiles.isEmpty()) {
return sqlContext.emptyDataFrame();
}
return readRecordsForBaseFiles(sqlContext, committedFiles);
}
/**
* Get records from specified list of data files.
*/
public static Dataset<Row> readRecordsForBaseFiles(SQLContext sqlContext, List<String> baseFilePaths) {
return sqlContext.read().parquet(JavaConverters.asScalaBufferConverter(baseFilePaths).asScala());
}
/**
* Get reads from paritions modified including any inflight commits.
* Note that this only works for COW tables
*/
public static Dataset<Row> getRecordsFromPendingCommits(SQLContext sqlContext,
Set<String> partitionsAffected,
HoodieWriteMetadata<JavaRDD<WriteStatus>> writeMetadata,
HoodieTable table,
String instantTime) {
// build file system view with pending commits
HoodieTablePreCommitFileSystemView fsView = new HoodieTablePreCommitFileSystemView(table.getMetaClient(),
table.getHoodieView(),
writeMetadata.getWriteStats().get(),
writeMetadata.getPartitionToReplaceFileIds(),
instantTime);
List<String> newFiles = partitionsAffected.stream()
.flatMap(partition -> fsView.getLatestBaseFiles(partition).map(bf -> bf.getPath()))
.collect(Collectors.toList());
if (newFiles.isEmpty()) {
return sqlContext.emptyDataFrame();
}
return readRecordsForBaseFiles(sqlContext, newFiles);
}
}

View File

@@ -0,0 +1,99 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.client.validator;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.model.HoodieWriteStat;
import org.apache.hudi.common.util.HoodieTimer;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieValidationException;
import org.apache.hudi.table.HoodieSparkTable;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.action.HoodieWriteMetadata;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import java.util.HashSet;
import java.util.Set;
import java.util.stream.Collectors;
/**
 * Base class for validators that can be configured to run before a commit completes.
 * Subclasses implement {@link #validateRecordsBeforeAndAfter} and throw
 * {@link HoodieValidationException} on failure.
 */
public abstract class SparkPreCommitValidator<T extends HoodieRecordPayload, I, K, O extends JavaRDD<WriteStatus>> {
  private static final Logger LOG = LogManager.getLogger(SparkPreCommitValidator.class);

  // Collaborators are set once at construction time and never reassigned; mark them final.
  private final HoodieSparkTable<T> table;
  private final HoodieEngineContext engineContext;
  private final HoodieWriteConfig writeConfig;

  protected SparkPreCommitValidator(HoodieSparkTable<T> table, HoodieEngineContext engineContext, HoodieWriteConfig writeConfig) {
    this.table = table;
    this.engineContext = engineContext;
    this.writeConfig = writeConfig;
  }

  /**
   * Collect the set of partitions touched by this write, preferring already-materialized
   * write stats and falling back to a distributed collect of the write statuses.
   */
  protected Set<String> getPartitionsModified(HoodieWriteMetadata<O> writeResult) {
    Set<String> partitionsModified;
    if (writeResult.getWriteStats().isPresent()) {
      partitionsModified = writeResult.getWriteStats().get().stream().map(HoodieWriteStat::getPartitionPath).collect(Collectors.toSet());
    } else {
      partitionsModified = new HashSet<>(writeResult.getWriteStatuses().map(WriteStatus::getPartitionPath).collect());
    }
    return partitionsModified;
  }

  /**
   * Verify the data written as part of specified instant.
   * Throw HoodieValidationException if any unexpected data is written
   * (Example: data files are not readable for some reason).
   */
  public void validate(String instantTime, HoodieWriteMetadata<O> writeResult, Dataset<Row> before, Dataset<Row> after) throws HoodieValidationException {
    HoodieTimer timer = new HoodieTimer().startTimer();
    try {
      validateRecordsBeforeAndAfter(before, after, getPartitionsModified(writeResult));
    } finally {
      // Use the class name; the raw Class#toString prefixes "class " to the log line.
      LOG.info(getClass().getName() + " validator took " + timer.endTimer() + " ms");
    }
  }

  /**
   * Takes input of RDD 1) before clustering and 2) after clustering. Perform required validation
   * and throw error if validation fails.
   */
  protected abstract void validateRecordsBeforeAndAfter(Dataset<Row> before,
                                                        Dataset<Row> after,
                                                        Set<String> partitionsAffected);

  public HoodieTable getHoodieTable() {
    return this.table;
  }

  public HoodieEngineContext getEngineContext() {
    return this.engineContext;
  }

  public HoodieWriteConfig getWriteConfig() {
    return this.writeConfig;
  }
}

View File

@@ -0,0 +1,76 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.client.validator;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.config.HoodiePreCommitValidatorConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieValidationException;
import org.apache.hudi.table.HoodieSparkTable;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
/**
 * Validator to run sql query and compare table state
 * 1) before new commit started.
 * 2) current inflight commit (if successful).
 *
 * Expects both queries to return the same result.
 */
public class SqlQueryEqualityPreCommitValidator<T extends HoodieRecordPayload, I, K, O extends JavaRDD<WriteStatus>> extends SqlQueryPreCommitValidator<T, I, K, O> {
  private static final Logger LOG = LogManager.getLogger(SqlQueryEqualityPreCommitValidator.class);

  public SqlQueryEqualityPreCommitValidator(HoodieSparkTable<T> table, HoodieEngineContext engineContext, HoodieWriteConfig config) {
    super(table, engineContext, config);
  }

  @Override
  protected String getQueryConfigName() {
    return HoodiePreCommitValidatorConfig.PRE_COMMIT_VALIDATORS_EQUALITY_SQL_QUERIES.key();
  }

  @Override
  protected void validateUsingQuery(String query, String prevTableSnapshot, String newTableSnapshot, SQLContext sqlContext) {
    String queryWithPrevSnapshot = query.replaceAll(HoodiePreCommitValidatorConfig.VALIDATOR_TABLE_VARIABLE, prevTableSnapshot);
    String queryWithNewSnapshot = query.replaceAll(HoodiePreCommitValidatorConfig.VALIDATOR_TABLE_VARIABLE, newTableSnapshot);
    LOG.info("Running query on previous state: " + queryWithPrevSnapshot);
    Dataset<Row> prevRows = sqlContext.sql(queryWithPrevSnapshot);
    LOG.info("Running query on new state: " + queryWithNewSnapshot);
    Dataset<Row> newRows = sqlContext.sql(queryWithNewSnapshot);
    printAllRowsIfDebugEnabled(prevRows);
    printAllRowsIfDebugEnabled(newRows);
    // Fix: intersect() is distinct-based, so "intersect count == prev count" alone accepts the
    // case where the new state is a strict superset of the previous state. Require equal row
    // counts as well. (Results that differ only in duplicate multiplicity may still be reported
    // unequal — same limitation as the intersect-based check; TODO compare as multisets.)
    long prevCount = prevRows.count();
    long newCount = newRows.count();
    boolean areDatasetsEqual = prevCount == newCount && prevRows.intersect(newRows).count() == prevCount;
    LOG.info("Completed Equality Validation, datasets equal? " + areDatasetsEqual);
    if (!areDatasetsEqual) {
      LOG.error("query validation failed. See stdout for sample query results. Query: " + query);
      System.out.println("Expected result (sample records only):");
      prevRows.show();
      System.out.println("Actual result (sample records only):");
      newRows.show();
      throw new HoodieValidationException("Query validation failed for '" + query + "'. See stdout for expected vs actual records");
    }
  }
}

View File

@@ -0,0 +1,74 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.client.validator;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.config.HoodiePreCommitValidatorConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieValidationException;
import org.apache.hudi.table.HoodieSparkTable;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
/**
 * Validator to run sql query and compare table state
 * 1) before new commit started.
 * 2) current inflight commit (if successful).
 *
 * The two query results are expected to differ; validation fails when they match.
 */
public class SqlQueryInequalityPreCommitValidator<T extends HoodieRecordPayload, I, K, O extends JavaRDD<WriteStatus>> extends SqlQueryPreCommitValidator<T, I, K, O> {
  private static final Logger LOG = LogManager.getLogger(SqlQueryInequalityPreCommitValidator.class);

  public SqlQueryInequalityPreCommitValidator(HoodieSparkTable<T> table, HoodieEngineContext engineContext, HoodieWriteConfig config) {
    super(table, engineContext, config);
  }

  @Override
  protected String getQueryConfigName() {
    return HoodiePreCommitValidatorConfig.PRE_COMMIT_VALIDATORS_INEQUALITY_SQL_QUERIES.key();
  }

  @Override
  protected void validateUsingQuery(String query, String prevTableSnapshot, String newTableSnapshot, SQLContext sqlContext) {
    // Bind the configured query to each snapshot's temp table name.
    String prevSnapshotQuery = query.replaceAll(HoodiePreCommitValidatorConfig.VALIDATOR_TABLE_VARIABLE, prevTableSnapshot);
    String newSnapshotQuery = query.replaceAll(HoodiePreCommitValidatorConfig.VALIDATOR_TABLE_VARIABLE, newTableSnapshot);
    LOG.info("Running query on previous state: " + prevSnapshotQuery);
    Dataset<Row> previousState = sqlContext.sql(prevSnapshotQuery);
    LOG.info("Running query on new state: " + newSnapshotQuery);
    Dataset<Row> currentState = sqlContext.sql(newSnapshotQuery);
    printAllRowsIfDebugEnabled(previousState);
    printAllRowsIfDebugEnabled(currentState);
    boolean resultsMatch = previousState.intersect(currentState).count() == previousState.count();
    LOG.info("Completed Inequality Validation, datasets equal? " + resultsMatch);
    if (!resultsMatch) {
      // Results differ, which is exactly what this validator requires.
      return;
    }
    LOG.error("query validation failed. See stdout for sample query results. Query: " + query);
    System.out.println("Expected query results to be inequal, but they are same. Result (sample records only):");
    previousState.show();
    throw new HoodieValidationException("Query validation failed for '" + query
        + "'. Expected " + previousState.count() + " rows, Found " + currentState.count());
  }
}

View File

@@ -0,0 +1,92 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.client.validator;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieValidationException;
import org.apache.hudi.table.HoodieSparkTable;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import java.util.Arrays;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * Validator framework to run sql queries against the table state before and after the inflight
 * commit; subclasses decide how the two results are compared and what counts as a failure.
 */
public abstract class SqlQueryPreCommitValidator<T extends HoodieRecordPayload, I, K, O extends JavaRDD<WriteStatus>> extends SparkPreCommitValidator<T, I, K, O> {
  private static final Logger LOG = LogManager.getLogger(SqlQueryPreCommitValidator.class);
  // Monotonic suffix so repeated/concurrent validations register distinct temp table names in the session.
  private static final AtomicInteger TABLE_COUNTER = new AtomicInteger(0);

  public SqlQueryPreCommitValidator(HoodieSparkTable<T> table, HoodieEngineContext engineContext, HoodieWriteConfig config) {
    super(table, engineContext, config);
  }

  /**
   * Takes input datasets 1) before commit started and 2) with inflight commit. Perform required validation
   * and throw error if validation fails.
   */
  @Override
  public void validateRecordsBeforeAndAfter(Dataset<Row> before, Dataset<Row> after, final Set<String> partitionsAffected) {
    String hoodieTableName = "staged_table_" + TABLE_COUNTER.incrementAndGet();
    String hoodieTableBeforeCurrentCommit = hoodieTableName + "_before";
    String hoodieTableWithInflightCommit = hoodieTableName + "_after";
    // Expose both snapshots as temp tables so configured queries can reference them by name
    // (the table-name placeholder is substituted by each subclass's validateUsingQuery).
    before.registerTempTable(hoodieTableBeforeCurrentCommit);
    after.registerTempTable(hoodieTableWithInflightCommit);
    JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(getEngineContext());
    SQLContext sqlContext = new SQLContext(jsc);
    String[] queries = getQueriesToRun();
    //TODO run this in a thread pool to improve parallelism
    Arrays.stream(queries).forEach(query ->
        validateUsingQuery(query, hoodieTableBeforeCurrentCommit, hoodieTableWithInflightCommit, sqlContext));
  }

  /**
   * Read the semicolon-separated query list from the write config.
   *
   * @throws HoodieValidationException if no queries are configured for this validator.
   */
  protected String[] getQueriesToRun() {
    String sqlQueriesConfigured = getWriteConfig().getProps().getProperty(getQueryConfigName());
    if (StringUtils.isNullOrEmpty(sqlQueriesConfigured)) {
      throw new HoodieValidationException("Sql validator configured incorrectly. expecting at least one query. Found 0 queries in "
          + sqlQueriesConfigured);
    }
    return sqlQueriesConfigured.trim().split(";");
  }

  /**
   * Print every row of the dataset when debug logging is on; no-op otherwise.
   * Caches first since the dataset is typically consumed again by the caller.
   */
  protected void printAllRowsIfDebugEnabled(Dataset<Row> dataset) {
    if (LOG.isDebugEnabled()) {
      dataset = dataset.cache();
      LOG.debug("Printing all rows from query validation:");
      dataset.show(Integer.MAX_VALUE,false);
    }
  }

  /** Config key holding this validator's semicolon-separated queries. */
  protected abstract String getQueryConfigName();

  /** Run a single configured query against both snapshots and throw HoodieValidationException on failure. */
  protected abstract void validateUsingQuery(String query, String prevTableSnapshot, String newTableSnapshot, SQLContext sqlContext);
}

View File

@@ -0,0 +1,78 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.client.validator;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.config.HoodiePreCommitValidatorConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieValidationException;
import org.apache.hudi.table.HoodieSparkTable;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import java.util.List;
/**
 * Validator to run sql queries on new table state and expects a single result. If the result
 * doesn't match the expected result, throw a validation error.
 *
 * Example configuration: "query1#expectedResult1;query2#expectedResult2;"
 */
public class SqlQuerySingleResultPreCommitValidator<T extends HoodieRecordPayload, I, K, O extends JavaRDD<WriteStatus>> extends SqlQueryPreCommitValidator<T, I, K, O> {
  // Fix: original logger was copy-pasted from SqlQueryInequalityPreCommitValidator.
  private static final Logger LOG = LogManager.getLogger(SqlQuerySingleResultPreCommitValidator.class);

  public SqlQuerySingleResultPreCommitValidator(HoodieSparkTable<T> table, HoodieEngineContext engineContext, HoodieWriteConfig config) {
    super(table, engineContext, config);
  }

  @Override
  protected String getQueryConfigName() {
    return HoodiePreCommitValidatorConfig.PRE_COMMIT_VALIDATORS_SINGLE_VALUE_SQL_QUERIES.key();
  }

  @Override
  protected void validateUsingQuery(String query, String prevTableSnapshot, String newTableSnapshot, SQLContext sqlContext) {
    // Each entry is "<sql>#<expected scalar>"; note an expected value containing '#' is rejected here.
    String[] queryWithExpectedResult = query.split("#");
    if (queryWithExpectedResult.length != 2) {
      throw new HoodieValidationException("Invalid query format " + query);
    }
    String queryToRun = queryWithExpectedResult[0];
    String expectedResult = queryWithExpectedResult[1];
    LOG.info("Running query on new state: " + queryToRun);
    String queryWithNewSnapshot = queryToRun.replaceAll(HoodiePreCommitValidatorConfig.VALIDATOR_TABLE_VARIABLE, newTableSnapshot);
    List<Row> newRows = sqlContext.sql(queryWithNewSnapshot).collectAsList();
    // Fix: the original '&&' both accepted multi-row/multi-column results and threw
    // IndexOutOfBoundsException on an empty result; require exactly one row with one column.
    if (newRows.size() != 1 || newRows.get(0).size() != 1) {
      throw new HoodieValidationException("Invalid query result. expect single value for '" + query + "'");
    }
    Object result = newRows.get(0).apply(0);
    if (result == null || !expectedResult.equals(result.toString())) {
      LOG.error("Mismatch query result. Expected: " + expectedResult + " got " + result + ". Query: " + query);
      // Message fix: the comparison is between scalar values, not row counts.
      throw new HoodieValidationException("Query validation failed for '" + query
          + "'. Expected " + expectedResult + ", Found " + result);
    } else {
      LOG.info("Query validation successful. Expected: " + expectedResult + " got " + result);
    }
  }
}

View File

@@ -31,6 +31,7 @@ import org.apache.hudi.client.bootstrap.selector.BootstrapModeSelector;
import org.apache.hudi.client.bootstrap.translator.BootstrapPartitionPathTranslator;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.client.utils.SparkMemoryUtils;
import org.apache.hudi.client.utils.SparkValidatorUtils;
import org.apache.hudi.common.bootstrap.FileStatusUtils;
import org.apache.hudi.common.bootstrap.index.BootstrapIndex;
import org.apache.hudi.common.config.TypedProperties;
@@ -422,4 +423,9 @@ public class SparkBootstrapCommitActionExecutor<T extends HoodieRecordPayload<T>
/**
 * Bootstrap only inserts records into the table; the update path is unsupported here,
 * so any call indicates a programming error in the caller.
 */
protected Iterator<List<WriteStatus>> handleUpdate(String partitionPath, String fileId, Iterator<HoodieRecord<T>> recordItr) {
  throw new UnsupportedOperationException("Should not called in bootstrap code path");
}
/**
 * Run any configured pre-commit validators against this bootstrap write before committing.
 * Delegates to the shared Spark validator framework; throws HoodieValidationException on failure.
 */
@Override
protected void runPrecommitValidators(HoodieWriteMetadata<JavaRDD<WriteStatus>> writeMetadata) {
  SparkValidatorUtils.runValidators(config, writeMetadata, context, table, instantTime);
}
}

View File

@@ -21,6 +21,7 @@ package org.apache.hudi.table.action.commit;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.client.utils.SparkMemoryUtils;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.client.utils.SparkValidatorUtils;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieKey;
@@ -387,4 +388,8 @@ public abstract class BaseSparkCommitActionExecutor<T extends HoodieRecordPayloa
return getUpsertPartitioner(profile);
}
/**
 * Run any configured pre-commit validators against this write before committing.
 * Delegates to the shared Spark validator framework; throws HoodieValidationException on failure.
 */
@Override
protected void runPrecommitValidators(HoodieWriteMetadata<JavaRDD<WriteStatus>> writeMetadata) {
  SparkValidatorUtils.runValidators(config, writeMetadata, context, table, instantTime);
}
}