[HUDI-3784] Improve docs and logs of HoodieMetadataTableValidator (#5216)
This commit is contained in:
@@ -43,6 +43,7 @@ import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
|
|||||||
import org.apache.hudi.common.util.CleanerUtils;
|
import org.apache.hudi.common.util.CleanerUtils;
|
||||||
import org.apache.hudi.common.util.Option;
|
import org.apache.hudi.common.util.Option;
|
||||||
import org.apache.hudi.common.util.ParquetUtils;
|
import org.apache.hudi.common.util.ParquetUtils;
|
||||||
|
import org.apache.hudi.common.util.StringUtils;
|
||||||
import org.apache.hudi.common.util.collection.Pair;
|
import org.apache.hudi.common.util.collection.Pair;
|
||||||
import org.apache.hudi.exception.HoodieException;
|
import org.apache.hudi.exception.HoodieException;
|
||||||
import org.apache.hudi.exception.HoodieIOException;
|
import org.apache.hudi.exception.HoodieIOException;
|
||||||
@@ -86,6 +87,9 @@ import java.util.stream.Collectors;
|
|||||||
* - `--validate-all-file-groups`: validate all file groups, and all file slices within file groups.
|
* - `--validate-all-file-groups`: validate all file groups, and all file slices within file groups.
|
||||||
* - `--validate-all-column-stats`: validate column stats for all columns in the schema
|
* - `--validate-all-column-stats`: validate column stats for all columns in the schema
|
||||||
* - `--validate-bloom-filters`: validate bloom filters of base files
|
* - `--validate-bloom-filters`: validate bloom filters of base files
|
||||||
|
*
|
||||||
|
* If the Hudi table is on the local file system, the base path passed to `--base-path` must have the
|
||||||
|
* "file:" prefix to avoid validation failure.
|
||||||
* <p>
|
* <p>
|
||||||
* - Default : This validator will compare the results between metadata table and filesystem only once.
|
* - Default : This validator will compare the results between metadata table and filesystem only once.
|
||||||
* <p>
|
* <p>
|
||||||
@@ -139,8 +143,11 @@ public class HoodieMetadataTableValidator implements Serializable {
|
|||||||
|
|
||||||
protected transient Option<AsyncMetadataTableValidateService> asyncMetadataTableValidateService;
|
protected transient Option<AsyncMetadataTableValidateService> asyncMetadataTableValidateService;
|
||||||
|
|
||||||
|
private final String taskLabels;
|
||||||
|
|
||||||
public HoodieMetadataTableValidator(HoodieTableMetaClient metaClient) {
|
public HoodieMetadataTableValidator(HoodieTableMetaClient metaClient) {
|
||||||
this.metaClient = metaClient;
|
this.metaClient = metaClient;
|
||||||
|
this.taskLabels = StringUtils.EMPTY_STRING;
|
||||||
}
|
}
|
||||||
|
|
||||||
public HoodieMetadataTableValidator(JavaSparkContext jsc, Config cfg) {
|
public HoodieMetadataTableValidator(JavaSparkContext jsc, Config cfg) {
|
||||||
@@ -157,6 +164,27 @@ public class HoodieMetadataTableValidator implements Serializable {
|
|||||||
.build();
|
.build();
|
||||||
|
|
||||||
this.asyncMetadataTableValidateService = cfg.continuous ? Option.of(new AsyncMetadataTableValidateService()) : Option.empty();
|
this.asyncMetadataTableValidateService = cfg.continuous ? Option.of(new AsyncMetadataTableValidateService()) : Option.empty();
|
||||||
|
this.taskLabels = generateValidationTaskLabels();
|
||||||
|
}
|
||||||
|
|
||||||
|
private String generateValidationTaskLabels() {
|
||||||
|
List<String> labelList = new ArrayList<>();
|
||||||
|
if (cfg.validateLatestBaseFiles) {
|
||||||
|
labelList.add("validate-latest-base-files");
|
||||||
|
}
|
||||||
|
if (cfg.validateLatestFileSlices) {
|
||||||
|
labelList.add("validate-latest-file-slices");
|
||||||
|
}
|
||||||
|
if (cfg.validateAllFileGroups) {
|
||||||
|
labelList.add("validate-all-file-groups");
|
||||||
|
}
|
||||||
|
if (cfg.validateAllColumnStats) {
|
||||||
|
labelList.add("validate-all-column-stats");
|
||||||
|
}
|
||||||
|
if (cfg.validateBloomFilters) {
|
||||||
|
labelList.add("validate-bloom-filters");
|
||||||
|
}
|
||||||
|
return String.join(",", labelList);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -397,10 +425,12 @@ public class HoodieMetadataTableValidator implements Serializable {
|
|||||||
List<Boolean> result = engineContext.parallelize(allPartitions, allPartitions.size()).map(partitionPath -> {
|
List<Boolean> result = engineContext.parallelize(allPartitions, allPartitions.size()).map(partitionPath -> {
|
||||||
try {
|
try {
|
||||||
validateFilesInPartition(metadataTableBasedContext, fsBasedContext, partitionPath, finalBaseFilesForCleaning);
|
validateFilesInPartition(metadataTableBasedContext, fsBasedContext, partitionPath, finalBaseFilesForCleaning);
|
||||||
LOG.info("Metadata table validation succeeded for " + partitionPath);
|
LOG.info(String.format("Metadata table validation succeeded for partition %s (partition %s)", partitionPath, taskLabels));
|
||||||
return true;
|
return true;
|
||||||
} catch (HoodieValidationException e) {
|
} catch (HoodieValidationException e) {
|
||||||
LOG.error("Metadata table validation failed for " + partitionPath + " due to HoodieValidationException", e);
|
LOG.error(
|
||||||
|
String.format("Metadata table validation failed for partition %s due to HoodieValidationException (partition %s)",
|
||||||
|
partitionPath, taskLabels), e);
|
||||||
if (!cfg.ignoreFailed) {
|
if (!cfg.ignoreFailed) {
|
||||||
throw e;
|
throw e;
|
||||||
}
|
}
|
||||||
@@ -413,9 +443,9 @@ public class HoodieMetadataTableValidator implements Serializable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (finalResult) {
|
if (finalResult) {
|
||||||
LOG.info("Metadata table validation succeeded.");
|
LOG.info(String.format("Metadata table validation succeeded (%s).", taskLabels));
|
||||||
} else {
|
} else {
|
||||||
LOG.warn("Metadata table validation failed.");
|
LOG.warn(String.format("Metadata table validation failed (%s).", taskLabels));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user