
[HUDI-3656] Adding medium sized dataset for clustering and minor fixes to integ tests (#5063)

commit 2551c26183
parent 6fe4d6e2f6
Author: Sivabalan Narayanan
Date:   2022-03-18 09:44:56 -07:00
Committed by: GitHub
3 changed files with 152 additions and 15 deletions

hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ValidateAsyncOperations.java

@@ -18,8 +18,15 @@
 package org.apache.hudi.integ.testsuite.dag.nodes;
+import org.apache.hudi.avro.model.HoodieCleanMetadata;
 import org.apache.hudi.common.fs.FSUtils;
-import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.table.HoodieTableMetaClient;
+import org.apache.hudi.common.table.timeline.HoodieInstant;
+import org.apache.hudi.common.table.timeline.HoodieTimeline;
+import org.apache.hudi.common.util.CleanerUtils;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.ValidationUtils;
 import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
 import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
@@ -59,21 +66,19 @@ public class ValidateAsyncOperations extends DagNode<Option<String>> {
       int maxCommitsRetained = executionContext.getHoodieTestSuiteWriter().getWriteConfig().getCleanerCommitsRetained() + 1;
       FileSystem fs = FSUtils.getFs(basePath, executionContext.getHoodieTestSuiteWriter().getConfiguration());
-      Map<String, Integer> fileIdCount = new HashMap<>();
-      AtomicInteger maxVal = new AtomicInteger();
-      List<String> partitionPaths = FSUtils.getAllPartitionFoldersThreeLevelsDown(fs, basePath);
-      for (String partitionPath : partitionPaths) {
-        List<FileStatus> fileStatuses = Arrays.stream(FSUtils.getAllDataFilesInPartition(fs, new Path(basePath + "/" + partitionPath))).collect(Collectors.toList());
-        fileStatuses.forEach(entry -> {
-          String fileId = FSUtils.getFileId(entry.getPath().getName());
-          fileIdCount.computeIfAbsent(fileId, k -> 0);
-          fileIdCount.put(fileId, fileIdCount.get(fileId) + 1);
-          maxVal.set(Math.max(maxVal.get(), fileIdCount.get(fileId)));
-        });
-      }
-      if (maxVal.get() > maxCommitsRetained) {
-        throw new AssertionError("Total commits (" + maxVal + ") retained exceeds max value of " + maxCommitsRetained + ", total commits : ");
+      HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setBasePath(executionContext.getHoodieTestSuiteWriter().getCfg().targetBasePath)
+          .setConf(executionContext.getJsc().hadoopConfiguration()).build();
+      Option<HoodieInstant> latestCleanInstant = metaClient.getActiveTimeline().filter(instant -> instant.getAction().equals(HoodieTimeline.CLEAN_ACTION)).lastInstant();
+      if (latestCleanInstant.isPresent()) {
+        log.warn("Latest clean commit " + latestCleanInstant.get());
+        HoodieCleanMetadata cleanMetadata = CleanerUtils.getCleanerMetadata(metaClient, latestCleanInstant.get());
+        String earliestCommitToRetain = cleanMetadata.getEarliestCommitToRetain();
+        log.warn("Earliest commit to retain : " + earliestCommitToRetain);
+        long unCleanedInstants = metaClient.getActiveTimeline().filterCompletedInstants().filter(instant ->
+            HoodieTimeline.compareTimestamps(instant.getTimestamp(), HoodieTimeline.GREATER_THAN_OR_EQUALS, earliestCommitToRetain)).getInstants().count();
+        ValidationUtils.checkArgument(unCleanedInstants >= (maxCommitsRetained + 1), "Total uncleaned instants " + unCleanedInstants +
+            " mismatched with max commits retained " + (maxCommitsRetained + 1));
       }
       if (config.validateArchival() || config.validateClean()) {
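
Note: the new validation added above can be read as the following standalone sketch. This is illustrative only and not part of the commit: tablePath, the bare Hadoop Configuration, and the cleanerCommitsRetained value are assumptions standing in for the test-suite wiring (executionContext, getCfg().targetBasePath, getJsc()), and the class name is hypothetical.

// Minimal sketch of the clean validation above, against an existing Hudi table.
// tablePath, the Configuration, and cleanerCommitsRetained are assumed inputs.
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.avro.model.HoodieCleanMetadata;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.util.CleanerUtils;
import org.apache.hudi.common.util.Option;

public class CleanValidationSketch {
  public static void main(String[] args) throws Exception {
    String tablePath = args[0];           // base path of a Hudi table (hypothetical input)
    int cleanerCommitsRetained = 2;       // stands in for getCleanerCommitsRetained()
    int maxCommitsRetained = cleanerCommitsRetained + 1;

    HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
        .setBasePath(tablePath)
        .setConf(new Configuration())
        .build();

    // Most recent clean on the active timeline, if any clean has completed yet.
    Option<HoodieInstant> latestClean = metaClient.getActiveTimeline()
        .filter(instant -> instant.getAction().equals(HoodieTimeline.CLEAN_ACTION))
        .lastInstant();

    if (latestClean.isPresent()) {
      // The clean metadata records the earliest commit the cleaner chose to retain.
      HoodieCleanMetadata cleanMetadata = CleanerUtils.getCleanerMetadata(metaClient, latestClean.get());
      String earliestCommitToRetain = cleanMetadata.getEarliestCommitToRetain();

      // Count completed instants at or after that commit; the cleaner should have
      // left at least maxCommitsRetained + 1 of them uncleaned.
      long unCleanedInstants = metaClient.getActiveTimeline().filterCompletedInstants()
          .filter(instant -> HoodieTimeline.compareTimestamps(
              instant.getTimestamp(), HoodieTimeline.GREATER_THAN_OR_EQUALS, earliestCommitToRetain))
          .getInstants().count();
      if (unCleanedInstants < maxCommitsRetained + 1) {
        throw new AssertionError("Expected at least " + (maxCommitsRetained + 1)
            + " uncleaned instants, found " + unCleanedInstants);
      }
    }
  }
}

Compared to the removed per-file-group count, this checks the cleaner's own bookkeeping on the timeline rather than inferring retention from data files on storage.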