[HUDI-2072] Add pre-commit validator framework (#3153)
* [HUDI-2072] Add pre-commit validator framework * trigger Travis rebuild
This commit is contained in:
@@ -0,0 +1,167 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.client.utils;
|
||||
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.client.validator.SparkPreCommitValidator;
|
||||
import org.apache.hudi.common.engine.HoodieEngineContext;
|
||||
import org.apache.hudi.common.table.view.HoodieTablePreCommitFileSystemView;
|
||||
import org.apache.hudi.common.util.ReflectionUtils;
|
||||
import org.apache.hudi.common.util.StringUtils;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieValidationException;
|
||||
import org.apache.hudi.table.HoodieSparkTable;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
import org.apache.hudi.table.action.HoodieWriteMetadata;
|
||||
import org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SQLContext;
|
||||
import scala.collection.JavaConverters;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
/**
|
||||
* Spark validator utils to verify and run any precommit validators configured.
|
||||
*/
|
||||
public class SparkValidatorUtils {
|
||||
private static final Logger LOG = LogManager.getLogger(BaseSparkCommitActionExecutor.class);
|
||||
|
||||
/**
|
||||
* Check configured pre-commit validators and run them. Note that this only works for COW tables
|
||||
*
|
||||
* Throw error if there are validation failures.
|
||||
*/
|
||||
public static void runValidators(HoodieWriteConfig config,
|
||||
HoodieWriteMetadata<JavaRDD<WriteStatus>> writeMetadata,
|
||||
HoodieEngineContext context,
|
||||
HoodieTable table,
|
||||
String instantTime) {
|
||||
if (StringUtils.isNullOrEmpty(config.getPreCommitValidators())) {
|
||||
LOG.info("no validators configured.");
|
||||
} else {
|
||||
if (!writeMetadata.getWriteStats().isPresent()) {
|
||||
writeMetadata.setWriteStats(writeMetadata.getWriteStatuses().map(WriteStatus::getStat).collect());
|
||||
}
|
||||
Set<String> partitionsModified = new HashSet<>(writeMetadata.getWriteStats().get().stream().map(writeStats ->
|
||||
writeStats.getPartitionPath()).collect(Collectors.toList()));
|
||||
SQLContext sqlContext = new SQLContext(HoodieSparkEngineContext.getSparkContext(context));
|
||||
// Refresh timeline to ensure validator sees the any other operations done on timeline (async operations such as other clustering/compaction/rollback)
|
||||
table.getMetaClient().reloadActiveTimeline();
|
||||
Dataset<Row> beforeState = getRecordsFromCommittedFiles(sqlContext, partitionsModified, table).cache();
|
||||
Dataset<Row> afterState = getRecordsFromPendingCommits(sqlContext, partitionsModified, writeMetadata, table, instantTime).cache();
|
||||
|
||||
Stream<SparkPreCommitValidator> validators = Arrays.stream(config.getPreCommitValidators().split(","))
|
||||
.map(validatorClass -> {
|
||||
return ((SparkPreCommitValidator) ReflectionUtils.loadClass(validatorClass,
|
||||
new Class<?>[] {HoodieSparkTable.class, HoodieEngineContext.class, HoodieWriteConfig.class},
|
||||
table, context, config));
|
||||
});
|
||||
|
||||
boolean allSuccess = validators.map(v -> runValidatorAsync(v, writeMetadata, beforeState, afterState, instantTime)).map(CompletableFuture::join)
|
||||
.reduce(true, Boolean::logicalAnd);
|
||||
|
||||
if (allSuccess) {
|
||||
LOG.info("All validations succeeded");
|
||||
} else {
|
||||
LOG.error("At least one pre-commit validation failed");
|
||||
throw new HoodieValidationException("At least one pre-commit validation failed");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Run validators in a separate threadpool for parallelism. Each of validator can submit a distributed spark job if needed.
|
||||
*/
|
||||
private static CompletableFuture<Boolean> runValidatorAsync(SparkPreCommitValidator validator, HoodieWriteMetadata writeMetadata,
|
||||
Dataset<Row> beforeState, Dataset<Row> afterState, String instantTime) {
|
||||
return CompletableFuture.supplyAsync(() -> {
|
||||
try {
|
||||
validator.validate(instantTime, writeMetadata, beforeState, afterState);
|
||||
LOG.info("validation complete for " + validator.getClass().getName());
|
||||
return true;
|
||||
} catch (HoodieValidationException e) {
|
||||
LOG.error("validation failed for " + validator.getClass().getName());
|
||||
return false;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Get records from partitions modified as a dataset.
|
||||
* Note that this only works for COW tables.
|
||||
*/
|
||||
public static Dataset<Row> getRecordsFromCommittedFiles(SQLContext sqlContext,
|
||||
Set<String> partitionsAffected, HoodieTable table) {
|
||||
|
||||
List<String> committedFiles = partitionsAffected.stream()
|
||||
.flatMap(partition -> table.getBaseFileOnlyView().getLatestBaseFiles(partition).map(bf -> bf.getPath()))
|
||||
.collect(Collectors.toList());
|
||||
|
||||
if (committedFiles.isEmpty()) {
|
||||
return sqlContext.emptyDataFrame();
|
||||
}
|
||||
return readRecordsForBaseFiles(sqlContext, committedFiles);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get records from specified list of data files.
|
||||
*/
|
||||
public static Dataset<Row> readRecordsForBaseFiles(SQLContext sqlContext, List<String> baseFilePaths) {
|
||||
return sqlContext.read().parquet(JavaConverters.asScalaBufferConverter(baseFilePaths).asScala());
|
||||
}
|
||||
|
||||
/**
|
||||
* Get reads from paritions modified including any inflight commits.
|
||||
* Note that this only works for COW tables
|
||||
*/
|
||||
public static Dataset<Row> getRecordsFromPendingCommits(SQLContext sqlContext,
|
||||
Set<String> partitionsAffected,
|
||||
HoodieWriteMetadata<JavaRDD<WriteStatus>> writeMetadata,
|
||||
HoodieTable table,
|
||||
String instantTime) {
|
||||
|
||||
// build file system view with pending commits
|
||||
HoodieTablePreCommitFileSystemView fsView = new HoodieTablePreCommitFileSystemView(table.getMetaClient(),
|
||||
table.getHoodieView(),
|
||||
writeMetadata.getWriteStats().get(),
|
||||
writeMetadata.getPartitionToReplaceFileIds(),
|
||||
instantTime);
|
||||
|
||||
List<String> newFiles = partitionsAffected.stream()
|
||||
.flatMap(partition -> fsView.getLatestBaseFiles(partition).map(bf -> bf.getPath()))
|
||||
.collect(Collectors.toList());
|
||||
|
||||
if (newFiles.isEmpty()) {
|
||||
return sqlContext.emptyDataFrame();
|
||||
}
|
||||
|
||||
return readRecordsForBaseFiles(sqlContext, newFiles);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,99 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.client.validator;
|
||||
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.common.engine.HoodieEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.model.HoodieWriteStat;
|
||||
import org.apache.hudi.common.util.HoodieTimer;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieValidationException;
|
||||
import org.apache.hudi.table.HoodieSparkTable;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
import org.apache.hudi.table.action.HoodieWriteMetadata;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Row;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* Validator can be configured pre-commit.
|
||||
*/
|
||||
public abstract class SparkPreCommitValidator<T extends HoodieRecordPayload, I, K, O extends JavaRDD<WriteStatus>> {
|
||||
private static final Logger LOG = LogManager.getLogger(SparkPreCommitValidator.class);
|
||||
|
||||
private HoodieSparkTable<T> table;
|
||||
private HoodieEngineContext engineContext;
|
||||
private HoodieWriteConfig writeConfig;
|
||||
|
||||
protected SparkPreCommitValidator(HoodieSparkTable<T> table, HoodieEngineContext engineContext, HoodieWriteConfig writeConfig) {
|
||||
this.table = table;
|
||||
this.engineContext = engineContext;
|
||||
this.writeConfig = writeConfig;
|
||||
}
|
||||
|
||||
protected Set<String> getPartitionsModified(HoodieWriteMetadata<O> writeResult) {
|
||||
Set<String> partitionsModified;
|
||||
if (writeResult.getWriteStats().isPresent()) {
|
||||
partitionsModified = writeResult.getWriteStats().get().stream().map(HoodieWriteStat::getPartitionPath).collect(Collectors.toSet());
|
||||
} else {
|
||||
partitionsModified = new HashSet<>(writeResult.getWriteStatuses().map(WriteStatus::getPartitionPath).collect());
|
||||
}
|
||||
return partitionsModified;
|
||||
}
|
||||
|
||||
/**
|
||||
* Verify the data written as part of specified instant.
|
||||
* Throw HoodieValidationException if any unexpected data is written (Example: data files are not readable for some reason).
|
||||
*/
|
||||
public void validate(String instantTime, HoodieWriteMetadata<O> writeResult, Dataset<Row> before, Dataset<Row> after) throws HoodieValidationException {
|
||||
HoodieTimer timer = new HoodieTimer().startTimer();
|
||||
try {
|
||||
validateRecordsBeforeAndAfter(before, after, getPartitionsModified(writeResult));
|
||||
} finally {
|
||||
LOG.info(getClass() + " validator took " + timer.endTimer() + " ms");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Takes input of RDD 1) before clustering and 2) after clustering. Perform required validation
|
||||
* and throw error if validation fails
|
||||
*/
|
||||
protected abstract void validateRecordsBeforeAndAfter(Dataset<Row> before,
|
||||
Dataset<Row> after,
|
||||
Set<String> partitionsAffected);
|
||||
|
||||
public HoodieTable getHoodieTable() {
|
||||
return this.table;
|
||||
}
|
||||
|
||||
public HoodieEngineContext getEngineContext() {
|
||||
return this.engineContext;
|
||||
}
|
||||
|
||||
public HoodieWriteConfig getWriteConfig() {
|
||||
return this.writeConfig;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,76 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.client.validator;
|
||||
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.common.engine.HoodieEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.config.HoodiePreCommitValidatorConfig;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieValidationException;
|
||||
import org.apache.hudi.table.HoodieSparkTable;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SQLContext;
|
||||
|
||||
/**
|
||||
* Validator to run sql query and compare table state
|
||||
* 1) before new commit started.
|
||||
* 2) current inflight commit (if successful).
|
||||
*
|
||||
* Expects both queries to return same result.
|
||||
*/
|
||||
public class SqlQueryEqualityPreCommitValidator<T extends HoodieRecordPayload, I, K, O extends JavaRDD<WriteStatus>> extends SqlQueryPreCommitValidator<T, I, K, O> {
|
||||
|
||||
private static final Logger LOG = LogManager.getLogger(SqlQueryEqualityPreCommitValidator.class);
|
||||
|
||||
public SqlQueryEqualityPreCommitValidator(HoodieSparkTable<T> table, HoodieEngineContext engineContext, HoodieWriteConfig config) {
|
||||
super(table, engineContext, config);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected String getQueryConfigName() {
|
||||
return HoodiePreCommitValidatorConfig.PRE_COMMIT_VALIDATORS_EQUALITY_SQL_QUERIES.key();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void validateUsingQuery(String query, String prevTableSnapshot, String newTableSnapshot, SQLContext sqlContext) {
|
||||
String queryWithPrevSnapshot = query.replaceAll(HoodiePreCommitValidatorConfig.VALIDATOR_TABLE_VARIABLE, prevTableSnapshot);
|
||||
String queryWithNewSnapshot = query.replaceAll(HoodiePreCommitValidatorConfig.VALIDATOR_TABLE_VARIABLE, newTableSnapshot);
|
||||
LOG.info("Running query on previous state: " + queryWithPrevSnapshot);
|
||||
Dataset<Row> prevRows = sqlContext.sql(queryWithPrevSnapshot);
|
||||
LOG.info("Running query on new state: " + queryWithNewSnapshot);
|
||||
Dataset<Row> newRows = sqlContext.sql(queryWithNewSnapshot);
|
||||
printAllRowsIfDebugEnabled(prevRows);
|
||||
printAllRowsIfDebugEnabled(newRows);
|
||||
boolean areDatasetsEqual = prevRows.intersect(newRows).count() == prevRows.count();
|
||||
LOG.info("Completed Equality Validation, datasets equal? " + areDatasetsEqual);
|
||||
if (!areDatasetsEqual) {
|
||||
LOG.error("query validation failed. See stdout for sample query results. Query: " + query);
|
||||
System.out.println("Expected result (sample records only):");
|
||||
prevRows.show();
|
||||
System.out.println("Actual result (sample records only):");
|
||||
newRows.show();
|
||||
throw new HoodieValidationException("Query validation failed for '" + query + "'. See stdout for expected vs actual records");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,74 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.client.validator;
|
||||
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.common.engine.HoodieEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.config.HoodiePreCommitValidatorConfig;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieValidationException;
|
||||
import org.apache.hudi.table.HoodieSparkTable;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SQLContext;
|
||||
|
||||
/**
|
||||
* Validator to run sql query and compare table state
|
||||
* 1) before new commit started.
|
||||
* 2) current inflight commit (if successful).
|
||||
*
|
||||
* Expects query results dont match.
|
||||
*/
|
||||
public class SqlQueryInequalityPreCommitValidator<T extends HoodieRecordPayload, I, K, O extends JavaRDD<WriteStatus>> extends SqlQueryPreCommitValidator<T, I, K, O> {
|
||||
private static final Logger LOG = LogManager.getLogger(SqlQueryInequalityPreCommitValidator.class);
|
||||
|
||||
public SqlQueryInequalityPreCommitValidator(HoodieSparkTable<T> table, HoodieEngineContext engineContext, HoodieWriteConfig config) {
|
||||
super(table, engineContext, config);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected String getQueryConfigName() {
|
||||
return HoodiePreCommitValidatorConfig.PRE_COMMIT_VALIDATORS_INEQUALITY_SQL_QUERIES.key();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void validateUsingQuery(String query, String prevTableSnapshot, String newTableSnapshot, SQLContext sqlContext) {
|
||||
String queryWithPrevSnapshot = query.replaceAll(HoodiePreCommitValidatorConfig.VALIDATOR_TABLE_VARIABLE, prevTableSnapshot);
|
||||
String queryWithNewSnapshot = query.replaceAll(HoodiePreCommitValidatorConfig.VALIDATOR_TABLE_VARIABLE, newTableSnapshot);
|
||||
LOG.info("Running query on previous state: " + queryWithPrevSnapshot);
|
||||
Dataset<Row> prevRows = sqlContext.sql(queryWithPrevSnapshot);
|
||||
LOG.info("Running query on new state: " + queryWithNewSnapshot);
|
||||
Dataset<Row> newRows = sqlContext.sql(queryWithNewSnapshot);
|
||||
printAllRowsIfDebugEnabled(prevRows);
|
||||
printAllRowsIfDebugEnabled(newRows);
|
||||
boolean areDatasetsEqual = prevRows.intersect(newRows).count() == prevRows.count();
|
||||
LOG.info("Completed Inequality Validation, datasets equal? " + areDatasetsEqual);
|
||||
if (areDatasetsEqual) {
|
||||
LOG.error("query validation failed. See stdout for sample query results. Query: " + query);
|
||||
System.out.println("Expected query results to be inequal, but they are same. Result (sample records only):");
|
||||
prevRows.show();
|
||||
throw new HoodieValidationException("Query validation failed for '" + query
|
||||
+ "'. Expected " + prevRows.count() + " rows, Found " + newRows.count());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,92 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.client.validator;
|
||||
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.common.engine.HoodieEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.util.StringUtils;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieValidationException;
|
||||
import org.apache.hudi.table.HoodieSparkTable;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SQLContext;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
/**
|
||||
* Validator framework to run sql queries and compare table state at different locations.
|
||||
*/
|
||||
public abstract class SqlQueryPreCommitValidator<T extends HoodieRecordPayload, I, K, O extends JavaRDD<WriteStatus>> extends SparkPreCommitValidator<T, I, K, O> {
|
||||
private static final Logger LOG = LogManager.getLogger(SqlQueryPreCommitValidator.class);
|
||||
private static final AtomicInteger TABLE_COUNTER = new AtomicInteger(0);
|
||||
|
||||
public SqlQueryPreCommitValidator(HoodieSparkTable<T> table, HoodieEngineContext engineContext, HoodieWriteConfig config) {
|
||||
super(table, engineContext, config);
|
||||
}
|
||||
|
||||
/**
|
||||
* Takes input datasets 1) before commit started and 2) with inflight commit. Perform required validation
|
||||
* and throw error if validation fails
|
||||
*/
|
||||
@Override
|
||||
public void validateRecordsBeforeAndAfter(Dataset<Row> before, Dataset<Row> after, final Set<String> partitionsAffected) {
|
||||
String hoodieTableName = "staged_table_" + TABLE_COUNTER.incrementAndGet();
|
||||
String hoodieTableBeforeCurrentCommit = hoodieTableName + "_before";
|
||||
String hoodieTableWithInflightCommit = hoodieTableName + "_after";
|
||||
before.registerTempTable(hoodieTableBeforeCurrentCommit);
|
||||
after.registerTempTable(hoodieTableWithInflightCommit);
|
||||
JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(getEngineContext());
|
||||
SQLContext sqlContext = new SQLContext(jsc);
|
||||
|
||||
String[] queries = getQueriesToRun();
|
||||
//TODO run this in a thread pool to improve parallelism
|
||||
Arrays.stream(queries).forEach(query ->
|
||||
validateUsingQuery(query, hoodieTableBeforeCurrentCommit, hoodieTableWithInflightCommit, sqlContext));
|
||||
}
|
||||
|
||||
protected String[] getQueriesToRun() {
|
||||
String sqlQueriesConfigured = getWriteConfig().getProps().getProperty(getQueryConfigName());
|
||||
if (StringUtils.isNullOrEmpty(sqlQueriesConfigured)) {
|
||||
throw new HoodieValidationException("Sql validator configured incorrectly. expecting at least one query. Found 0 queries in "
|
||||
+ sqlQueriesConfigured);
|
||||
}
|
||||
return sqlQueriesConfigured.trim().split(";");
|
||||
}
|
||||
|
||||
protected void printAllRowsIfDebugEnabled(Dataset<Row> dataset) {
|
||||
if (LOG.isDebugEnabled()) {
|
||||
dataset = dataset.cache();
|
||||
LOG.debug("Printing all rows from query validation:");
|
||||
dataset.show(Integer.MAX_VALUE,false);
|
||||
}
|
||||
}
|
||||
|
||||
protected abstract String getQueryConfigName();
|
||||
|
||||
protected abstract void validateUsingQuery(String query, String prevTableSnapshot, String newTableSnapshot, SQLContext sqlContext);
|
||||
}
|
||||
@@ -0,0 +1,78 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.client.validator;
|
||||
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.common.engine.HoodieEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.config.HoodiePreCommitValidatorConfig;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieValidationException;
|
||||
import org.apache.hudi.table.HoodieSparkTable;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SQLContext;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Validator to run sql queries on new table state and expects a single result. If the result doesnt match expected result,
|
||||
* throw validation error.
|
||||
*
|
||||
* Example configuration: "query1#expectedResult1;query2#expectedResult2;"
|
||||
*/
|
||||
public class SqlQuerySingleResultPreCommitValidator<T extends HoodieRecordPayload, I, K, O extends JavaRDD<WriteStatus>> extends SqlQueryPreCommitValidator<T, I, K, O> {
|
||||
private static final Logger LOG = LogManager.getLogger(SqlQueryInequalityPreCommitValidator.class);
|
||||
|
||||
public SqlQuerySingleResultPreCommitValidator(HoodieSparkTable<T> table, HoodieEngineContext engineContext, HoodieWriteConfig config) {
|
||||
super(table, engineContext, config);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected String getQueryConfigName() {
|
||||
return HoodiePreCommitValidatorConfig.PRE_COMMIT_VALIDATORS_SINGLE_VALUE_SQL_QUERIES.key();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void validateUsingQuery(String query, String prevTableSnapshot, String newTableSnapshot, SQLContext sqlContext) {
|
||||
String[] queryWithExpectedResult = query.split("#");
|
||||
if (queryWithExpectedResult.length != 2) {
|
||||
throw new HoodieValidationException("Invalid query format " + query);
|
||||
}
|
||||
|
||||
String queryToRun = queryWithExpectedResult[0];
|
||||
String expectedResult = queryWithExpectedResult[1];
|
||||
LOG.info("Running query on new state: " + queryToRun);
|
||||
String queryWithNewSnapshot = queryToRun.replaceAll(HoodiePreCommitValidatorConfig.VALIDATOR_TABLE_VARIABLE, newTableSnapshot);
|
||||
List<Row> newRows = sqlContext.sql(queryWithNewSnapshot).collectAsList();
|
||||
if (newRows.size() != 1 && newRows.get(0).size() != 1) {
|
||||
throw new HoodieValidationException("Invalid query result. expect single value for '" + query + "'");
|
||||
}
|
||||
Object result = newRows.get(0).apply(0);
|
||||
if (result == null || !expectedResult.equals(result.toString())) {
|
||||
LOG.error("Mismatch query result. Expected: " + expectedResult + " got " + result + "Query: " + query);
|
||||
throw new HoodieValidationException("Query validation failed for '" + query
|
||||
+ "'. Expected " + expectedResult + " rows, Found " + result);
|
||||
} else {
|
||||
LOG.info("Query validation successful. Expected: " + expectedResult + " got " + result);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -31,6 +31,7 @@ import org.apache.hudi.client.bootstrap.selector.BootstrapModeSelector;
|
||||
import org.apache.hudi.client.bootstrap.translator.BootstrapPartitionPathTranslator;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.client.utils.SparkMemoryUtils;
|
||||
import org.apache.hudi.client.utils.SparkValidatorUtils;
|
||||
import org.apache.hudi.common.bootstrap.FileStatusUtils;
|
||||
import org.apache.hudi.common.bootstrap.index.BootstrapIndex;
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
@@ -422,4 +423,9 @@ public class SparkBootstrapCommitActionExecutor<T extends HoodieRecordPayload<T>
|
||||
/**
 * Bootstrap writes only create new files, so row-level updates are never expected here.
 *
 * @throws UnsupportedOperationException always, to fail loudly if this path is ever reached
 */
protected Iterator<List<WriteStatus>> handleUpdate(String partitionPath, String fileId, Iterator<HoodieRecord<T>> recordItr) {
  throw new UnsupportedOperationException("Should not called in bootstrap code path");
}
|
||||
|
||||
/**
 * Runs any configured pre-commit validators against the inflight write before it is committed.
 */
@Override
protected void runPrecommitValidators(HoodieWriteMetadata<JavaRDD<WriteStatus>> writeMetadata) {
  SparkValidatorUtils.runValidators(config, writeMetadata, context, table, instantTime);
}
|
||||
}
|
||||
|
||||
@@ -21,6 +21,7 @@ package org.apache.hudi.table.action.commit;
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.utils.SparkMemoryUtils;
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.client.utils.SparkValidatorUtils;
|
||||
import org.apache.hudi.common.engine.HoodieEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
@@ -387,4 +388,8 @@ public abstract class BaseSparkCommitActionExecutor<T extends HoodieRecordPayloa
|
||||
return getUpsertPartitioner(profile);
|
||||
}
|
||||
|
||||
/**
 * Runs any configured pre-commit validators against the inflight write before it is committed.
 */
@Override
protected void runPrecommitValidators(HoodieWriteMetadata<JavaRDD<WriteStatus>> writeMetadata) {
  SparkValidatorUtils.runValidators(config, writeMetadata, context, table, instantTime);
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user