1
0

[HUDI-2072] Add pre-commit validator framework (#3153)

* [HUDI-2072] Add pre-commit validator framework

* trigger Travis rebuild
This commit is contained in:
satishkotha
2021-08-03 12:07:45 -07:00
committed by GitHub
parent bec23bda50
commit 826a04d142
14 changed files with 1130 additions and 32 deletions

View File

@@ -0,0 +1,167 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.client.utils;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.client.validator.SparkPreCommitValidator;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.table.view.HoodieTablePreCommitFileSystemView;
import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieValidationException;
import org.apache.hudi.table.HoodieSparkTable;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.action.HoodieWriteMetadata;
import org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import scala.collection.JavaConverters;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
* Spark validator utils to verify and run any precommit validators configured.
*/
public class SparkValidatorUtils {
private static final Logger LOG = LogManager.getLogger(BaseSparkCommitActionExecutor.class);
/**
* Check configured pre-commit validators and run them. Note that this only works for COW tables
*
* Throw error if there are validation failures.
*/
public static void runValidators(HoodieWriteConfig config,
HoodieWriteMetadata<JavaRDD<WriteStatus>> writeMetadata,
HoodieEngineContext context,
HoodieTable table,
String instantTime) {
if (StringUtils.isNullOrEmpty(config.getPreCommitValidators())) {
LOG.info("no validators configured.");
} else {
if (!writeMetadata.getWriteStats().isPresent()) {
writeMetadata.setWriteStats(writeMetadata.getWriteStatuses().map(WriteStatus::getStat).collect());
}
Set<String> partitionsModified = new HashSet<>(writeMetadata.getWriteStats().get().stream().map(writeStats ->
writeStats.getPartitionPath()).collect(Collectors.toList()));
SQLContext sqlContext = new SQLContext(HoodieSparkEngineContext.getSparkContext(context));
// Refresh timeline to ensure validator sees the any other operations done on timeline (async operations such as other clustering/compaction/rollback)
table.getMetaClient().reloadActiveTimeline();
Dataset<Row> beforeState = getRecordsFromCommittedFiles(sqlContext, partitionsModified, table).cache();
Dataset<Row> afterState = getRecordsFromPendingCommits(sqlContext, partitionsModified, writeMetadata, table, instantTime).cache();
Stream<SparkPreCommitValidator> validators = Arrays.stream(config.getPreCommitValidators().split(","))
.map(validatorClass -> {
return ((SparkPreCommitValidator) ReflectionUtils.loadClass(validatorClass,
new Class<?>[] {HoodieSparkTable.class, HoodieEngineContext.class, HoodieWriteConfig.class},
table, context, config));
});
boolean allSuccess = validators.map(v -> runValidatorAsync(v, writeMetadata, beforeState, afterState, instantTime)).map(CompletableFuture::join)
.reduce(true, Boolean::logicalAnd);
if (allSuccess) {
LOG.info("All validations succeeded");
} else {
LOG.error("At least one pre-commit validation failed");
throw new HoodieValidationException("At least one pre-commit validation failed");
}
}
}
/**
* Run validators in a separate threadpool for parallelism. Each of validator can submit a distributed spark job if needed.
*/
private static CompletableFuture<Boolean> runValidatorAsync(SparkPreCommitValidator validator, HoodieWriteMetadata writeMetadata,
Dataset<Row> beforeState, Dataset<Row> afterState, String instantTime) {
return CompletableFuture.supplyAsync(() -> {
try {
validator.validate(instantTime, writeMetadata, beforeState, afterState);
LOG.info("validation complete for " + validator.getClass().getName());
return true;
} catch (HoodieValidationException e) {
LOG.error("validation failed for " + validator.getClass().getName());
return false;
}
});
}
/**
* Get records from partitions modified as a dataset.
* Note that this only works for COW tables.
*/
public static Dataset<Row> getRecordsFromCommittedFiles(SQLContext sqlContext,
Set<String> partitionsAffected, HoodieTable table) {
List<String> committedFiles = partitionsAffected.stream()
.flatMap(partition -> table.getBaseFileOnlyView().getLatestBaseFiles(partition).map(bf -> bf.getPath()))
.collect(Collectors.toList());
if (committedFiles.isEmpty()) {
return sqlContext.emptyDataFrame();
}
return readRecordsForBaseFiles(sqlContext, committedFiles);
}
/**
* Get records from specified list of data files.
*/
public static Dataset<Row> readRecordsForBaseFiles(SQLContext sqlContext, List<String> baseFilePaths) {
return sqlContext.read().parquet(JavaConverters.asScalaBufferConverter(baseFilePaths).asScala());
}
/**
* Get reads from paritions modified including any inflight commits.
* Note that this only works for COW tables
*/
public static Dataset<Row> getRecordsFromPendingCommits(SQLContext sqlContext,
Set<String> partitionsAffected,
HoodieWriteMetadata<JavaRDD<WriteStatus>> writeMetadata,
HoodieTable table,
String instantTime) {
// build file system view with pending commits
HoodieTablePreCommitFileSystemView fsView = new HoodieTablePreCommitFileSystemView(table.getMetaClient(),
table.getHoodieView(),
writeMetadata.getWriteStats().get(),
writeMetadata.getPartitionToReplaceFileIds(),
instantTime);
List<String> newFiles = partitionsAffected.stream()
.flatMap(partition -> fsView.getLatestBaseFiles(partition).map(bf -> bf.getPath()))
.collect(Collectors.toList());
if (newFiles.isEmpty()) {
return sqlContext.emptyDataFrame();
}
return readRecordsForBaseFiles(sqlContext, newFiles);
}
}

View File

@@ -0,0 +1,99 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.client.validator;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.model.HoodieWriteStat;
import org.apache.hudi.common.util.HoodieTimer;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieValidationException;
import org.apache.hudi.table.HoodieSparkTable;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.action.HoodieWriteMetadata;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import java.util.HashSet;
import java.util.Set;
import java.util.stream.Collectors;
/**
 * Base class for validators that can be configured to run before a commit completes.
 * Subclasses implement {@link #validateRecordsBeforeAndAfter} and throw
 * {@link HoodieValidationException} on failure.
 */
public abstract class SparkPreCommitValidator<T extends HoodieRecordPayload, I, K, O extends JavaRDD<WriteStatus>> {
  private static final Logger LOG = LogManager.getLogger(SparkPreCommitValidator.class);

  // Collaborators are set once at construction time and never reassigned; mark them final.
  private final HoodieSparkTable<T> table;
  private final HoodieEngineContext engineContext;
  private final HoodieWriteConfig writeConfig;

  protected SparkPreCommitValidator(HoodieSparkTable<T> table, HoodieEngineContext engineContext, HoodieWriteConfig writeConfig) {
    this.table = table;
    this.engineContext = engineContext;
    this.writeConfig = writeConfig;
  }

  /**
   * Collect the set of partitions touched by this write, preferring already-materialized
   * write stats and falling back to a distributed collect of the write statuses.
   */
  protected Set<String> getPartitionsModified(HoodieWriteMetadata<O> writeResult) {
    Set<String> partitionsModified;
    if (writeResult.getWriteStats().isPresent()) {
      partitionsModified = writeResult.getWriteStats().get().stream().map(HoodieWriteStat::getPartitionPath).collect(Collectors.toSet());
    } else {
      partitionsModified = new HashSet<>(writeResult.getWriteStatuses().map(WriteStatus::getPartitionPath).collect());
    }
    return partitionsModified;
  }

  /**
   * Verify the data written as part of specified instant.
   * Throw HoodieValidationException if any unexpected data is written
   * (Example: data files are not readable for some reason).
   */
  public void validate(String instantTime, HoodieWriteMetadata<O> writeResult, Dataset<Row> before, Dataset<Row> after) throws HoodieValidationException {
    HoodieTimer timer = new HoodieTimer().startTimer();
    try {
      validateRecordsBeforeAndAfter(before, after, getPartitionsModified(writeResult));
    } finally {
      // Use the class name; the raw Class#toString prefixes "class " to the log line.
      LOG.info(getClass().getName() + " validator took " + timer.endTimer() + " ms");
    }
  }

  /**
   * Takes input of RDD 1) before clustering and 2) after clustering. Perform required validation
   * and throw error if validation fails.
   */
  protected abstract void validateRecordsBeforeAndAfter(Dataset<Row> before,
                                                        Dataset<Row> after,
                                                        Set<String> partitionsAffected);

  public HoodieTable getHoodieTable() {
    return this.table;
  }

  public HoodieEngineContext getEngineContext() {
    return this.engineContext;
  }

  public HoodieWriteConfig getWriteConfig() {
    return this.writeConfig;
  }
}

View File

@@ -0,0 +1,76 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.client.validator;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.config.HoodiePreCommitValidatorConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieValidationException;
import org.apache.hudi.table.HoodieSparkTable;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
/**
 * Validator to run sql query and compare table state
 * 1) before new commit started.
 * 2) current inflight commit (if successful).
 *
 * Expects both queries to return the same result.
 */
public class SqlQueryEqualityPreCommitValidator<T extends HoodieRecordPayload, I, K, O extends JavaRDD<WriteStatus>> extends SqlQueryPreCommitValidator<T, I, K, O> {
  private static final Logger LOG = LogManager.getLogger(SqlQueryEqualityPreCommitValidator.class);

  public SqlQueryEqualityPreCommitValidator(HoodieSparkTable<T> table, HoodieEngineContext engineContext, HoodieWriteConfig config) {
    super(table, engineContext, config);
  }

  @Override
  protected String getQueryConfigName() {
    return HoodiePreCommitValidatorConfig.PRE_COMMIT_VALIDATORS_EQUALITY_SQL_QUERIES.key();
  }

  @Override
  protected void validateUsingQuery(String query, String prevTableSnapshot, String newTableSnapshot, SQLContext sqlContext) {
    String queryWithPrevSnapshot = query.replaceAll(HoodiePreCommitValidatorConfig.VALIDATOR_TABLE_VARIABLE, prevTableSnapshot);
    String queryWithNewSnapshot = query.replaceAll(HoodiePreCommitValidatorConfig.VALIDATOR_TABLE_VARIABLE, newTableSnapshot);
    LOG.info("Running query on previous state: " + queryWithPrevSnapshot);
    Dataset<Row> prevRows = sqlContext.sql(queryWithPrevSnapshot);
    LOG.info("Running query on new state: " + queryWithNewSnapshot);
    Dataset<Row> newRows = sqlContext.sql(queryWithNewSnapshot);
    printAllRowsIfDebugEnabled(prevRows);
    printAllRowsIfDebugEnabled(newRows);
    // Fix: intersect() is distinct-based, so "intersect count == prev count" alone accepts the
    // case where the new state is a strict superset of the previous state. Require equal row
    // counts as well. (Results that differ only in duplicate multiplicity may still be reported
    // unequal — same limitation as the intersect-based check; TODO compare as multisets.)
    long prevCount = prevRows.count();
    long newCount = newRows.count();
    boolean areDatasetsEqual = prevCount == newCount && prevRows.intersect(newRows).count() == prevCount;
    LOG.info("Completed Equality Validation, datasets equal? " + areDatasetsEqual);
    if (!areDatasetsEqual) {
      LOG.error("query validation failed. See stdout for sample query results. Query: " + query);
      System.out.println("Expected result (sample records only):");
      prevRows.show();
      System.out.println("Actual result (sample records only):");
      newRows.show();
      throw new HoodieValidationException("Query validation failed for '" + query + "'. See stdout for expected vs actual records");
    }
  }
}

View File

@@ -0,0 +1,74 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.client.validator;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.config.HoodiePreCommitValidatorConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieValidationException;
import org.apache.hudi.table.HoodieSparkTable;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
/**
 * Validator to run sql query and compare table state
 * 1) before new commit started.
 * 2) current inflight commit (if successful).
 *
 * The two query results are expected to differ; validation fails when they match.
 */
public class SqlQueryInequalityPreCommitValidator<T extends HoodieRecordPayload, I, K, O extends JavaRDD<WriteStatus>> extends SqlQueryPreCommitValidator<T, I, K, O> {
  private static final Logger LOG = LogManager.getLogger(SqlQueryInequalityPreCommitValidator.class);

  public SqlQueryInequalityPreCommitValidator(HoodieSparkTable<T> table, HoodieEngineContext engineContext, HoodieWriteConfig config) {
    super(table, engineContext, config);
  }

  @Override
  protected String getQueryConfigName() {
    return HoodiePreCommitValidatorConfig.PRE_COMMIT_VALIDATORS_INEQUALITY_SQL_QUERIES.key();
  }

  @Override
  protected void validateUsingQuery(String query, String prevTableSnapshot, String newTableSnapshot, SQLContext sqlContext) {
    // Bind the configured query to each snapshot's temp table name.
    String prevSnapshotQuery = query.replaceAll(HoodiePreCommitValidatorConfig.VALIDATOR_TABLE_VARIABLE, prevTableSnapshot);
    String newSnapshotQuery = query.replaceAll(HoodiePreCommitValidatorConfig.VALIDATOR_TABLE_VARIABLE, newTableSnapshot);
    LOG.info("Running query on previous state: " + prevSnapshotQuery);
    Dataset<Row> previousState = sqlContext.sql(prevSnapshotQuery);
    LOG.info("Running query on new state: " + newSnapshotQuery);
    Dataset<Row> currentState = sqlContext.sql(newSnapshotQuery);
    printAllRowsIfDebugEnabled(previousState);
    printAllRowsIfDebugEnabled(currentState);
    boolean resultsMatch = previousState.intersect(currentState).count() == previousState.count();
    LOG.info("Completed Inequality Validation, datasets equal? " + resultsMatch);
    if (!resultsMatch) {
      // Results differ, which is exactly what this validator requires.
      return;
    }
    LOG.error("query validation failed. See stdout for sample query results. Query: " + query);
    System.out.println("Expected query results to be inequal, but they are same. Result (sample records only):");
    previousState.show();
    throw new HoodieValidationException("Query validation failed for '" + query
        + "'. Expected " + previousState.count() + " rows, Found " + currentState.count());
  }
}

View File

@@ -0,0 +1,92 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.client.validator;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieValidationException;
import org.apache.hudi.table.HoodieSparkTable;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import java.util.Arrays;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * Validator framework to run sql queries against the table state before and after the inflight
 * commit; subclasses decide how the two results are compared and what counts as a failure.
 */
public abstract class SqlQueryPreCommitValidator<T extends HoodieRecordPayload, I, K, O extends JavaRDD<WriteStatus>> extends SparkPreCommitValidator<T, I, K, O> {
  private static final Logger LOG = LogManager.getLogger(SqlQueryPreCommitValidator.class);
  // Monotonic suffix so repeated/concurrent validations register distinct temp table names in the session.
  private static final AtomicInteger TABLE_COUNTER = new AtomicInteger(0);

  public SqlQueryPreCommitValidator(HoodieSparkTable<T> table, HoodieEngineContext engineContext, HoodieWriteConfig config) {
    super(table, engineContext, config);
  }

  /**
   * Takes input datasets 1) before commit started and 2) with inflight commit. Perform required validation
   * and throw error if validation fails.
   */
  @Override
  public void validateRecordsBeforeAndAfter(Dataset<Row> before, Dataset<Row> after, final Set<String> partitionsAffected) {
    String hoodieTableName = "staged_table_" + TABLE_COUNTER.incrementAndGet();
    String hoodieTableBeforeCurrentCommit = hoodieTableName + "_before";
    String hoodieTableWithInflightCommit = hoodieTableName + "_after";
    // Expose both snapshots as temp tables so configured queries can reference them by name
    // (the table-name placeholder is substituted by each subclass's validateUsingQuery).
    before.registerTempTable(hoodieTableBeforeCurrentCommit);
    after.registerTempTable(hoodieTableWithInflightCommit);
    JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(getEngineContext());
    SQLContext sqlContext = new SQLContext(jsc);
    String[] queries = getQueriesToRun();
    //TODO run this in a thread pool to improve parallelism
    Arrays.stream(queries).forEach(query ->
        validateUsingQuery(query, hoodieTableBeforeCurrentCommit, hoodieTableWithInflightCommit, sqlContext));
  }

  /**
   * Read the semicolon-separated query list from the write config.
   *
   * @throws HoodieValidationException if no queries are configured for this validator.
   */
  protected String[] getQueriesToRun() {
    String sqlQueriesConfigured = getWriteConfig().getProps().getProperty(getQueryConfigName());
    if (StringUtils.isNullOrEmpty(sqlQueriesConfigured)) {
      throw new HoodieValidationException("Sql validator configured incorrectly. expecting at least one query. Found 0 queries in "
          + sqlQueriesConfigured);
    }
    return sqlQueriesConfigured.trim().split(";");
  }

  /**
   * Print every row of the dataset when debug logging is on; no-op otherwise.
   * Caches first since the dataset is typically consumed again by the caller.
   */
  protected void printAllRowsIfDebugEnabled(Dataset<Row> dataset) {
    if (LOG.isDebugEnabled()) {
      dataset = dataset.cache();
      LOG.debug("Printing all rows from query validation:");
      dataset.show(Integer.MAX_VALUE,false);
    }
  }

  /** Config key holding this validator's semicolon-separated queries. */
  protected abstract String getQueryConfigName();

  /** Run a single configured query against both snapshots and throw HoodieValidationException on failure. */
  protected abstract void validateUsingQuery(String query, String prevTableSnapshot, String newTableSnapshot, SQLContext sqlContext);
}

View File

@@ -0,0 +1,78 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.client.validator;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.config.HoodiePreCommitValidatorConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieValidationException;
import org.apache.hudi.table.HoodieSparkTable;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import java.util.List;
/**
 * Validator to run sql queries on new table state and expects a single result. If the result
 * doesn't match the expected result, throw a validation error.
 *
 * Example configuration: "query1#expectedResult1;query2#expectedResult2;"
 */
public class SqlQuerySingleResultPreCommitValidator<T extends HoodieRecordPayload, I, K, O extends JavaRDD<WriteStatus>> extends SqlQueryPreCommitValidator<T, I, K, O> {
  // Fix: original logger was copy-pasted from SqlQueryInequalityPreCommitValidator.
  private static final Logger LOG = LogManager.getLogger(SqlQuerySingleResultPreCommitValidator.class);

  public SqlQuerySingleResultPreCommitValidator(HoodieSparkTable<T> table, HoodieEngineContext engineContext, HoodieWriteConfig config) {
    super(table, engineContext, config);
  }

  @Override
  protected String getQueryConfigName() {
    return HoodiePreCommitValidatorConfig.PRE_COMMIT_VALIDATORS_SINGLE_VALUE_SQL_QUERIES.key();
  }

  @Override
  protected void validateUsingQuery(String query, String prevTableSnapshot, String newTableSnapshot, SQLContext sqlContext) {
    // Each entry is "<sql>#<expected scalar>"; note an expected value containing '#' is rejected here.
    String[] queryWithExpectedResult = query.split("#");
    if (queryWithExpectedResult.length != 2) {
      throw new HoodieValidationException("Invalid query format " + query);
    }
    String queryToRun = queryWithExpectedResult[0];
    String expectedResult = queryWithExpectedResult[1];
    LOG.info("Running query on new state: " + queryToRun);
    String queryWithNewSnapshot = queryToRun.replaceAll(HoodiePreCommitValidatorConfig.VALIDATOR_TABLE_VARIABLE, newTableSnapshot);
    List<Row> newRows = sqlContext.sql(queryWithNewSnapshot).collectAsList();
    // Fix: the original '&&' both accepted multi-row/multi-column results and threw
    // IndexOutOfBoundsException on an empty result; require exactly one row with one column.
    if (newRows.size() != 1 || newRows.get(0).size() != 1) {
      throw new HoodieValidationException("Invalid query result. expect single value for '" + query + "'");
    }
    Object result = newRows.get(0).apply(0);
    if (result == null || !expectedResult.equals(result.toString())) {
      LOG.error("Mismatch query result. Expected: " + expectedResult + " got " + result + ". Query: " + query);
      // Message fix: the comparison is between scalar values, not row counts.
      throw new HoodieValidationException("Query validation failed for '" + query
          + "'. Expected " + expectedResult + ", Found " + result);
    } else {
      LOG.info("Query validation successful. Expected: " + expectedResult + " got " + result);
    }
  }
}

View File

@@ -31,6 +31,7 @@ import org.apache.hudi.client.bootstrap.selector.BootstrapModeSelector;
import org.apache.hudi.client.bootstrap.translator.BootstrapPartitionPathTranslator;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.client.utils.SparkMemoryUtils;
import org.apache.hudi.client.utils.SparkValidatorUtils;
import org.apache.hudi.common.bootstrap.FileStatusUtils;
import org.apache.hudi.common.bootstrap.index.BootstrapIndex;
import org.apache.hudi.common.config.TypedProperties;
@@ -422,4 +423,9 @@ public class SparkBootstrapCommitActionExecutor<T extends HoodieRecordPayload<T>
/**
 * Bootstrap only inserts records into the table; the update path is unsupported here,
 * so any call indicates a programming error in the caller.
 */
protected Iterator<List<WriteStatus>> handleUpdate(String partitionPath, String fileId, Iterator<HoodieRecord<T>> recordItr) {
  throw new UnsupportedOperationException("Should not called in bootstrap code path");
}
/**
 * Run any configured pre-commit validators against this bootstrap write before committing.
 * Delegates to the shared Spark validator framework; throws HoodieValidationException on failure.
 */
@Override
protected void runPrecommitValidators(HoodieWriteMetadata<JavaRDD<WriteStatus>> writeMetadata) {
  SparkValidatorUtils.runValidators(config, writeMetadata, context, table, instantTime);
}
}

View File

@@ -21,6 +21,7 @@ package org.apache.hudi.table.action.commit;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.client.utils.SparkMemoryUtils;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.client.utils.SparkValidatorUtils;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieKey;
@@ -387,4 +388,8 @@ public abstract class BaseSparkCommitActionExecutor<T extends HoodieRecordPayloa
return getUpsertPartitioner(profile);
}
/**
 * Run any configured pre-commit validators against this write before committing.
 * Delegates to the shared Spark validator framework; throws HoodieValidationException on failure.
 */
@Override
protected void runPrecommitValidators(HoodieWriteMetadata<JavaRDD<WriteStatus>> writeMetadata) {
  SparkValidatorUtils.runValidators(config, writeMetadata, context, table, instantTime);
}
}