[HUDI-3429] Support clustering scheduleAndExecute for hudi-cli and add clustering-cli Tests (#4817)
Co-authored-by: yuezhang <yuezhang@freewheel.tv>
@@ -116,4 +116,40 @@ public class ClusteringCommand implements CommandMarker {
     }
     return "Succeeded to run clustering for " + clusteringInstantTime;
   }
+
+  /**
+   * Run clustering table service.
+   * <p>
+   * Example:
+   * > connect --path {path to hudi table}
+   * > clustering scheduleAndExecute --sparkMaster local --sparkMemory 2g
+   */
+  @CliCommand(value = "clustering scheduleAndExecute", help = "Run Clustering. Make a cluster plan first and execute that plan immediately")
+  public String runClustering(
+      @CliOption(key = "sparkMaster", unspecifiedDefaultValue = SparkUtil.DEFAULT_SPARK_MASTER, help = "Spark master") final String master,
+      @CliOption(key = "sparkMemory", help = "Spark executor memory", unspecifiedDefaultValue = "4g") final String sparkMemory,
+      @CliOption(key = "parallelism", help = "Parallelism for hoodie clustering", unspecifiedDefaultValue = "1") final String parallelism,
+      @CliOption(key = "retry", help = "Number of retries", unspecifiedDefaultValue = "1") final String retry,
+      @CliOption(key = "propsFilePath", help = "path to properties file on localfs or dfs with configurations for "
+          + "hoodie client for compacting", unspecifiedDefaultValue = "") final String propsFilePath,
+      @CliOption(key = "hoodieConfigs", help = "Any configuration that can be set in the properties file can be "
+          + "passed here in the form of an array", unspecifiedDefaultValue = "") final String[] configs) throws Exception {
+    HoodieTableMetaClient client = HoodieCLI.getTableMetaClient();
+    boolean initialized = HoodieCLI.initConf();
+    HoodieCLI.initFS(initialized);
+
+    String sparkPropertiesPath =
+        Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala());
+    SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
+    sparkLauncher.addAppArgs(SparkCommand.CLUSTERING_SCHEDULE_AND_EXECUTE.toString(), master, sparkMemory,
+        client.getBasePath(), client.getTableConfig().getTableName(), parallelism, retry, propsFilePath);
+    UtilHelpers.validateAndAddProperties(configs, sparkLauncher);
+    Process process = sparkLauncher.launch();
+    InputStreamConsumer.captureOutput(process);
+    int exitCode = process.waitFor();
+    if (exitCode != 0) {
+      return "Failed to run clustering for scheduleAndExecute.";
+    }
+    return "Succeeded to run clustering for scheduleAndExecute";
+  }
 }
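The launcher arguments above are positional and are consumed by index in SparkMain (see the hunks below). A minimal sketch of the mapping, inferred from the addAppArgs(...) call and the args[n] reads rather than stated anywhere in the commit:

// Hypothetical index map for CLUSTERING_SCHEDULE_AND_EXECUTE, inferred from
// sparkLauncher.addAppArgs(...) above and the args[n] reads in SparkMain below.
// args[0] = "CLUSTERING_SCHEDULE_AND_EXECUTE"   (selects the SparkCommand case)
// args[1] = master             args[2] = sparkMemory
// args[3] = basePath           args[4] = tableName
// args[5] = parallelism        args[6] = retry
// args[7] = propsFilePath      args[8..] = extra --hoodieConfigs entries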
@@ -76,7 +76,7 @@ public class SparkMain {
   enum SparkCommand {
     BOOTSTRAP, ROLLBACK, DEDUPLICATE, ROLLBACK_TO_SAVEPOINT, SAVEPOINT, IMPORT, UPSERT, COMPACT_SCHEDULE, COMPACT_RUN, COMPACT_SCHEDULE_AND_EXECUTE,
     COMPACT_UNSCHEDULE_PLAN, COMPACT_UNSCHEDULE_FILE, COMPACT_VALIDATE, COMPACT_REPAIR, CLUSTERING_SCHEDULE,
-    CLUSTERING_RUN, CLEAN, DELETE_SAVEPOINT, UPGRADE, DOWNGRADE
+    CLUSTERING_RUN, CLUSTERING_SCHEDULE_AND_EXECUTE, CLEAN, DELETE_SAVEPOINT, UPGRADE, DOWNGRADE
   }

   public static void main(String[] args) throws Exception {
@@ -190,7 +190,20 @@ public class SparkMain {
           configs.addAll(Arrays.asList(args).subList(9, args.length));
         }
         returnCode = cluster(jsc, args[3], args[4], args[5], Integer.parseInt(args[6]), args[2],
-            Integer.parseInt(args[7]), false, propsFilePath, configs);
+            Integer.parseInt(args[7]), HoodieClusteringJob.EXECUTE, propsFilePath, configs);
         break;
+      case CLUSTERING_SCHEDULE_AND_EXECUTE:
+        assert (args.length >= 8);
+        propsFilePath = null;
+        if (!StringUtils.isNullOrEmpty(args[7])) {
+          propsFilePath = args[7];
+        }
+        configs = new ArrayList<>();
+        if (args.length > 8) {
+          configs.addAll(Arrays.asList(args).subList(8, args.length));
+        }
+        returnCode = cluster(jsc, args[3], args[4], null, Integer.parseInt(args[5]), args[2],
+            Integer.parseInt(args[6]), HoodieClusteringJob.SCHEDULE_AND_EXECUTE, propsFilePath, configs);
+        break;
       case CLUSTERING_SCHEDULE:
         assert (args.length >= 7);
@@ -203,7 +216,7 @@ public class SparkMain {
           configs.addAll(Arrays.asList(args).subList(7, args.length));
         }
         returnCode = cluster(jsc, args[3], args[4], args[5], 1, args[2],
-            0, true, propsFilePath, configs);
+            0, HoodieClusteringJob.SCHEDULE, propsFilePath, configs);
         break;
       case CLEAN:
         assert (args.length >= 5);
@@ -351,13 +364,13 @@ public class SparkMain {
   }

   private static int cluster(JavaSparkContext jsc, String basePath, String tableName, String clusteringInstant,
-      int parallelism, String sparkMemory, int retry, boolean schedule, String propsFilePath, List<String> configs) {
+      int parallelism, String sparkMemory, int retry, String runningMode, String propsFilePath, List<String> configs) {
     HoodieClusteringJob.Config cfg = new HoodieClusteringJob.Config();
     cfg.basePath = basePath;
     cfg.tableName = tableName;
     cfg.clusteringInstantTime = clusteringInstant;
     cfg.parallelism = parallelism;
-    cfg.runSchedule = schedule;
+    cfg.runningMode = runningMode;
     cfg.propsFilePath = propsFilePath;
     cfg.configs = configs;
     jsc.getConf().set("spark.executor.memory", sparkMemory);
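The boolean schedule flag is replaced by a string runningMode, which lets the single cluster(...) entry point cover all three clustering modes. A hedged sketch of how HoodieClusteringJob presumably dispatches on it; only the three constant names appear in this diff, and the method names below are placeholders, not part of the commit:

// Hypothetical dispatch inside HoodieClusteringJob; doSchedule/doScheduleAndCluster/
// doCluster are placeholder names, not confirmed by this commit.
private int runClustering(JavaSparkContext jsc) {
  switch (cfg.runningMode) {
    case HoodieClusteringJob.SCHEDULE:             // make a clustering plan only
      return doSchedule(jsc);
    case HoodieClusteringJob.SCHEDULE_AND_EXECUTE: // make a plan, then execute it immediately
      return doScheduleAndCluster(jsc);
    case HoodieClusteringJob.EXECUTE:              // execute an already-scheduled plan
      return doCluster(jsc);
    default:
      throw new IllegalArgumentException("Unsupported running mode: " + cfg.runningMode);
  }
}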
@@ -0,0 +1,199 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.cli.integ;

import org.apache.hudi.cli.HoodieCLI;
import org.apache.hudi.cli.commands.TableCommand;
import org.apache.hudi.cli.testutils.AbstractShellIntegrationTest;
import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.model.HoodieAvroPayload;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion;
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieIndexConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.testutils.HoodieClientTestBase;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.springframework.shell.core.CommandResult;

import java.io.IOException;
import java.nio.file.Paths;
import java.util.List;
import java.util.stream.Collectors;

import static org.junit.jupiter.api.Assertions.assertAll;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

/**
 * Integration test class for {@link org.apache.hudi.cli.commands.ClusteringCommand}.
 * <p/>
 * A command that uses SparkLauncher needs to load the jars under lib, which are only
 * generated during mvn package, so these cases run as integration tests instead of unit tests.
 */
public class ITTestClusteringCommand extends AbstractShellIntegrationTest {

  private String tablePath;
  private String tableName;

  @BeforeEach
  public void init() throws IOException {
    tableName = "test_table_" + ITTestClusteringCommand.class.getName();
    tablePath = Paths.get(basePath, tableName).toString();

    HoodieCLI.conf = jsc.hadoopConfiguration();
    // Create table and connect
    new TableCommand().createTable(
        tablePath, tableName, HoodieTableType.COPY_ON_WRITE.name(),
        "", TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload");
    metaClient.setBasePath(tablePath);
    metaClient = HoodieTableMetaClient.reload(metaClient);
  }

  /**
   * Test case for command 'clustering schedule'.
   */
  @Test
  public void testScheduleClustering() throws IOException {
    // generate commits
    generateCommits();

    CommandResult cr = scheduleClustering();
    assertAll("Command run failed",
        () -> assertTrue(cr.isSuccess()),
        () -> assertTrue(
            cr.getResult().toString().startsWith("Succeeded to schedule clustering for")));

    // there is 1 requested clustering
    HoodieActiveTimeline timeline = HoodieCLI.getTableMetaClient().getActiveTimeline();
    assertEquals(1, timeline.filterPendingReplaceTimeline().countInstants());
  }

  /**
   * Test case for command 'clustering run'.
   */
  @Test
  public void testClustering() throws IOException {
    // generate commits
    generateCommits();

    CommandResult cr1 = scheduleClustering();
    assertTrue(cr1.isSuccess());

    // get the pending clustering instant
    HoodieActiveTimeline timeline = HoodieCLI.getTableMetaClient().getActiveTimeline();
    Option<String> instance =
        timeline.filterPendingReplaceTimeline().firstInstant().map(HoodieInstant::getTimestamp);
    assertTrue(instance.isPresent(), "Must have pending clustering.");

    CommandResult cr2 = getShell().executeCommand(
        String.format("clustering run --parallelism %s --clusteringInstant %s --sparkMaster %s",
            2, instance.get(), "local"));

    assertAll("Command run failed",
        () -> assertTrue(cr2.isSuccess()),
        () -> assertTrue(
            cr2.getResult().toString().startsWith("Succeeded to run clustering for ")));

    // assert clustering complete
    assertTrue(HoodieCLI.getTableMetaClient().getActiveTimeline().reload()
        .filterCompletedInstants().getInstants()
        .map(HoodieInstant::getTimestamp).collect(Collectors.toList()).contains(instance.get()),
        "Pending clustering must be completed");

    assertTrue(HoodieCLI.getTableMetaClient().getActiveTimeline().reload()
        .getCompletedReplaceTimeline().getInstants()
        .map(HoodieInstant::getTimestamp).collect(Collectors.toList()).contains(instance.get()),
        "Pending clustering must be completed");
  }

  /**
   * Test case for command 'clustering scheduleAndExecute'.
   */
  @Test
  public void testClusteringScheduleAndExecute() throws IOException {
    // generate commits
    generateCommits();

    CommandResult cr2 = getShell().executeCommand(
        String.format("clustering scheduleAndExecute --parallelism %s --sparkMaster %s", 2, "local"));

    assertAll("Command run failed",
        () -> assertTrue(cr2.isSuccess()),
        () -> assertTrue(
            cr2.getResult().toString().startsWith("Succeeded to run clustering for scheduleAndExecute")));

    // assert clustering complete
    assertTrue(HoodieCLI.getTableMetaClient().getActiveTimeline().reload()
        .getCompletedReplaceTimeline().getInstants()
        .map(HoodieInstant::getTimestamp).count() > 0,
        "Completed clustering count must not be 0");
  }

  private CommandResult scheduleClustering() {
    // generate a requested clustering instant
    return getShell().executeCommand(
        String.format("clustering schedule --hoodieConfigs hoodie.clustering.inline.max.commits=1 --sparkMaster %s", "local"));
  }

  private void generateCommits() throws IOException {
    HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();

    // Create the write client to write some records in
    HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(tablePath)
        .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
        .withDeleteParallelism(2).forTable(tableName)
        .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build();

    SparkRDDWriteClient<HoodieAvroPayload> client = new SparkRDDWriteClient<>(new HoodieSparkEngineContext(jsc), cfg);

    insert(jsc, client, dataGen, "001");
    insert(jsc, client, dataGen, "002");
  }

  private List<HoodieRecord> insert(JavaSparkContext jsc, SparkRDDWriteClient<HoodieAvroPayload> client,
      HoodieTestDataGenerator dataGen, String newCommitTime) throws IOException {
    // inserts
    client.startCommitWithTime(newCommitTime);

    List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 10);
    JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
    operateFunc(SparkRDDWriteClient::insert, client, writeRecords, newCommitTime);
    return records;
  }

  private JavaRDD<WriteStatus> operateFunc(
      HoodieClientTestBase.Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn,
      SparkRDDWriteClient<HoodieAvroPayload> client, JavaRDD<HoodieRecord> writeRecords, String commitTime)
      throws IOException {
    return writeFn.apply(client, writeRecords, commitTime);
  }
}
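End to end, the new command can be exercised from hudi-cli the same way this integration test does. A sketch of a session, in the style of the javadoc example above (the table path is a placeholder):

> connect --path /tmp/hudi/test_table
> clustering scheduleAndExecute --sparkMaster local --sparkMemory 2g --parallelism 2
Succeeded to run clustering for scheduleAndExecute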