[HUDI-3656] Adding medium sized dataset for clustering and minor fixes to integ tests (#5063)

2022-03-18 09:44:56 -07:00
parent 6fe4d6e2f6
commit 2551c26183
3 changed files with 152 additions and 15 deletions
--- a/docker/demo/config/test-suite/deltastreamer-medium-clustering.yaml
+++ b/docker/demo/config/test-suite/deltastreamer-medium-clustering.yaml
@@ -0,0 +1,73 @@
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #      http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # to be used with test-aggressive-clean-archival.properties
 dag_name: deltastreamer-medium-clustering.yaml
 dag_rounds: 20
 dag_intermittent_delay_mins: 1
 dag_content:
  first_insert:
    config:
      record_size: 1000
      num_partitions_insert: 5
      repeat_count: 1
      num_records_insert: 1000
    type: InsertNode
    deps: none
  second_insert:
    config:
      record_size: 1000
      num_partitions_insert: 50
      repeat_count: 1
      num_records_insert: 10000
    deps: first_insert
    type: InsertNode
  third_insert:
    config:
      record_size: 1000
      num_partitions_insert: 2
      repeat_count: 1
      num_records_insert: 300
    deps: second_insert
    type: InsertNode
  first_upsert:
    config:
      record_size: 1000
      num_partitions_insert: 2
      num_records_insert: 300
      repeat_count: 1
      num_records_upsert: 100
      num_partitions_upsert: 1
    type: UpsertNode
    deps: third_insert
  first_delete:
    config:
      num_partitions_delete: 50
      num_records_delete: 8000
    type: DeleteNode
    deps: first_upsert
  second_validate:
    config:
      validate_hive: false
      delete_input_data: true
    type: ValidateDatasetNode
    deps: first_delete
  last_validate:
    config:
      execute_itr_count: 20
    type: ValidateAsyncOperations
    deps: second_validate
--- a/docker/demo/config/test-suite/spark-medium-clustering.yaml
+++ b/docker/demo/config/test-suite/spark-medium-clustering.yaml
@@ -0,0 +1,59 @@
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #      http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 dag_name: spark-medium-clustering.yaml
 dag_rounds: 20
 dag_intermittent_delay_mins: 0
 dag_content:
  first_insert:
    config:
      record_size: 200
      num_partitions_insert: 50
      repeat_count: 1
      num_records_insert: 10000
    type: SparkInsertNode
    deps: none
  first_validate:
    config:
      validate_hive: false
    type: ValidateDatasetNode
    deps: first_insert
  first_upsert:
    config:
      record_size: 200
      num_partitions_insert: 50
      num_records_insert: 300
      repeat_count: 1
      num_records_upsert: 3000
      num_partitions_upsert: 50
    type: SparkUpsertNode
    deps: first_validate
  first_delete:
    config:
      num_partitions_delete: 50
      num_records_delete: 8000
    type: SparkDeleteNode
    deps: first_upsert
  second_validate:
    config:
      validate_hive: false
      delete_input_data: true
    type: ValidateDatasetNode
    deps: first_delete
  last_validate:
    config:
      execute_itr_count: 20
    type: ValidateAsyncOperations
    deps: second_validate
--- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ValidateAsyncOperations.java
+++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ValidateAsyncOperations.java
@@ -18,8 +18,15 @@
 package org.apache.hudi.integ.testsuite.dag.nodes;
 import org.apache.hudi.avro.model.HoodieCleanMetadata;
 import org.apache.hudi.common.fs.FSUtils;
 import org.apache.hudi.common.util.Option;
 import org.apache.hudi.common.table.HoodieTableMetaClient;
 import org.apache.hudi.common.table.timeline.HoodieInstant;
 import org.apache.hudi.common.table.timeline.HoodieTimeline;
 import org.apache.hudi.common.util.CleanerUtils;
 import org.apache.hudi.common.util.Option;
 import org.apache.hudi.common.util.ValidationUtils;
 import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
 import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
@@ -59,21 +66,19 @@ public class ValidateAsyncOperations extends DagNode<Option<String>> {
        int maxCommitsRetained = executionContext.getHoodieTestSuiteWriter().getWriteConfig().getCleanerCommitsRetained() + 1;
        FileSystem fs = FSUtils.getFs(basePath, executionContext.getHoodieTestSuiteWriter().getConfiguration());
        Map<String, Integer> fileIdCount = new HashMap<>();
-        AtomicInteger maxVal = new AtomicInteger();
+        HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setBasePath(executionContext.getHoodieTestSuiteWriter().getCfg().targetBasePath)
-        List<String> partitionPaths = FSUtils.getAllPartitionFoldersThreeLevelsDown(fs, basePath);
+            .setConf(executionContext.getJsc().hadoopConfiguration()).build();
-        for (String partitionPath : partitionPaths) {
+        Option<HoodieInstant> latestCleanInstant = metaClient.getActiveTimeline().filter(instant -> instant.getAction().equals(HoodieTimeline.CLEAN_ACTION)).lastInstant();
-          List<FileStatus> fileStatuses = Arrays.stream(FSUtils.getAllDataFilesInPartition(fs, new Path(basePath + "/" + partitionPath))).collect(Collectors.toList());
+        if (latestCleanInstant.isPresent()) {
-          fileStatuses.forEach(entry -> {
+          log.warn("Latest clean commit " + latestCleanInstant.get());
-            String fileId = FSUtils.getFileId(entry.getPath().getName());
+          HoodieCleanMetadata cleanMetadata = CleanerUtils.getCleanerMetadata(metaClient, latestCleanInstant.get());
-            fileIdCount.computeIfAbsent(fileId, k -> 0);
+          String earliestCommitToRetain = cleanMetadata.getEarliestCommitToRetain();
-            fileIdCount.put(fileId, fileIdCount.get(fileId) + 1);
+          log.warn("Earliest commit to retain : " + earliestCommitToRetain);
-            maxVal.set(Math.max(maxVal.get(), fileIdCount.get(fileId)));
+          long unCleanedInstants = metaClient.getActiveTimeline().filterCompletedInstants().filter(instant ->
-          });
+              HoodieTimeline.compareTimestamps(instant.getTimestamp(), HoodieTimeline.GREATER_THAN_OR_EQUALS, earliestCommitToRetain)).getInstants().count();
-        }
+          ValidationUtils.checkArgument(unCleanedInstants >= (maxCommitsRetained + 1), "Total uncleaned instants " + unCleanedInstants +
-        if (maxVal.get() > maxCommitsRetained) {
+              " mismatched with max commits retained " + (maxCommitsRetained + 1));
          throw new AssertionError("Total commits (" + maxVal + ") retained exceeds max value of " + maxCommitsRetained + ", total commits : ");
        }
        if (config.validateArchival() || config.validateClean()) {