1
0

[HUDI-3784] Improve docs and logs of HoodieMetadataTableValidator (#5216)

This commit is contained in:
Y Ethan Guo
2022-04-02 13:16:17 -07:00
committed by GitHub
parent eef3f9c74a
commit c19f505b5a

View File

@@ -43,6 +43,7 @@ import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
import org.apache.hudi.common.util.CleanerUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ParquetUtils;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieIOException;
@@ -86,6 +87,9 @@ import java.util.stream.Collectors;
* - `--validate-all-file-groups`: validate all file groups, and all file slices within file groups.
* - `--validate-all-column-stats`: validate column stats for all columns in the schema
* - `--validate-bloom-filters`: validate bloom filters of base files
*
* If the Hudi table is on the local file system, the base path passed to `--base-path` must have
* "file:" prefix to avoid validation failure.
* <p>
* - Default : This validator will compare the results between metadata table and filesystem only once.
* <p>
@@ -139,8 +143,11 @@ public class HoodieMetadataTableValidator implements Serializable {
protected transient Option<AsyncMetadataTableValidateService> asyncMetadataTableValidateService;
private final String taskLabels;
public HoodieMetadataTableValidator(HoodieTableMetaClient metaClient) {
this.metaClient = metaClient;
this.taskLabels = StringUtils.EMPTY_STRING;
}
public HoodieMetadataTableValidator(JavaSparkContext jsc, Config cfg) {
@@ -157,6 +164,27 @@ public class HoodieMetadataTableValidator implements Serializable {
.build();
this.asyncMetadataTableValidateService = cfg.continuous ? Option.of(new AsyncMetadataTableValidateService()) : Option.empty();
this.taskLabels = generateValidationTaskLabels();
}
private String generateValidationTaskLabels() {
List<String> labelList = new ArrayList<>();
if (cfg.validateLatestBaseFiles) {
labelList.add("validate-latest-base-files");
}
if (cfg.validateLatestFileSlices) {
labelList.add("validate-latest-file-slices");
}
if (cfg.validateAllFileGroups) {
labelList.add("validate-all-file-groups");
}
if (cfg.validateAllColumnStats) {
labelList.add("validate-all-column-stats");
}
if (cfg.validateBloomFilters) {
labelList.add("validate-bloom-filters");
}
return String.join(",", labelList);
}
/**
@@ -397,10 +425,12 @@ public class HoodieMetadataTableValidator implements Serializable {
List<Boolean> result = engineContext.parallelize(allPartitions, allPartitions.size()).map(partitionPath -> {
try {
validateFilesInPartition(metadataTableBasedContext, fsBasedContext, partitionPath, finalBaseFilesForCleaning);
LOG.info("Metadata table validation succeeded for " + partitionPath);
LOG.info(String.format("Metadata table validation succeeded for partition %s (partition %s)", partitionPath, taskLabels));
return true;
} catch (HoodieValidationException e) {
LOG.error("Metadata table validation failed for " + partitionPath + " due to HoodieValidationException", e);
LOG.error(
String.format("Metadata table validation failed for partition %s due to HoodieValidationException (partition %s)",
partitionPath, taskLabels), e);
if (!cfg.ignoreFailed) {
throw e;
}
@@ -413,9 +443,9 @@ public class HoodieMetadataTableValidator implements Serializable {
}
if (finalResult) {
LOG.info("Metadata table validation succeeded.");
LOG.info(String.format("Metadata table validation succeeded (%s).", taskLabels));
} else {
LOG.warn("Metadata table validation failed.");
LOG.warn(String.format("Metadata table validation failed (%s).", taskLabels));
}
}