[HUDI-3429] Support clustering scheduleAndExecute for hudi-cli and add clustering-cli Tests (#4817)

Co-authored-by: yuezhang <yuezhang@freewheel.tv>
2022-02-25 12:28:38 +08:00
parent aa1810d737
commit 3694485609
3 changed files with 253 additions and 5 deletions
--- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ClusteringCommand.java
+++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ClusteringCommand.java
@@ -116,4 +116,40 @@ public class ClusteringCommand implements CommandMarker {
    }
    return "Succeeded to run clustering for " + clusteringInstantTime;
  }
+
+  /**
+   * Run clustering table service.
+   * <p>
+   * Example:
+   * > connect --path {path to hudi table}
+   * > clustering scheduleAndExecute --sparkMaster local --sparkMemory 2g
+   */
+  @CliCommand(value = "clustering scheduleAndExecute", help = "Run Clustering. Make a cluster plan first and execute that plan immediately")
+  public String runClustering(
+      @CliOption(key = "sparkMaster", unspecifiedDefaultValue = SparkUtil.DEFAULT_SPARK_MASTER, help = "Spark master") final String master,
+      @CliOption(key = "sparkMemory", help = "Spark executor memory", unspecifiedDefaultValue = "4g") final String sparkMemory,
+      @CliOption(key = "parallelism", help = "Parallelism for hoodie clustering", unspecifiedDefaultValue = "1") final String parallelism,
+      @CliOption(key = "retry", help = "Number of retries", unspecifiedDefaultValue = "1") final String retry,
+      @CliOption(key = "propsFilePath", help = "path to properties file on localfs or dfs with configurations for "
+          + "hoodie client for compacting", unspecifiedDefaultValue = "") final String propsFilePath,
+      @CliOption(key = "hoodieConfigs", help = "Any configuration that can be set in the properties file can be "
+          + "passed here in the form of an array", unspecifiedDefaultValue = "") final String[] configs) throws Exception {
+    HoodieTableMetaClient client = HoodieCLI.getTableMetaClient();
+    boolean initialized = HoodieCLI.initConf();
+    HoodieCLI.initFS(initialized);
+
+    String sparkPropertiesPath =
+        Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala());
+    SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
+    sparkLauncher.addAppArgs(SparkCommand.CLUSTERING_SCHEDULE_AND_EXECUTE.toString(), master, sparkMemory,
+        client.getBasePath(), client.getTableConfig().getTableName(), parallelism, retry, propsFilePath);
+    UtilHelpers.validateAndAddProperties(configs, sparkLauncher);
+    Process process = sparkLauncher.launch();
+    InputStreamConsumer.captureOutput(process);
+    int exitCode = process.waitFor();
+    if (exitCode != 0) {
+      return "Failed to run clustering for scheduleAndExecute.";
+    }
+    return "Succeeded to run clustering for scheduleAndExecute";
+  }
 }
--- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/SparkMain.java
+++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/SparkMain.java
@@ -76,7 +76,7 @@ public class SparkMain {
  enum SparkCommand {
    BOOTSTRAP, ROLLBACK, DEDUPLICATE, ROLLBACK_TO_SAVEPOINT, SAVEPOINT, IMPORT, UPSERT, COMPACT_SCHEDULE, COMPACT_RUN, COMPACT_SCHEDULE_AND_EXECUTE,
    COMPACT_UNSCHEDULE_PLAN, COMPACT_UNSCHEDULE_FILE, COMPACT_VALIDATE, COMPACT_REPAIR, CLUSTERING_SCHEDULE,
-    CLUSTERING_RUN, CLEAN, DELETE_SAVEPOINT, UPGRADE, DOWNGRADE
+    CLUSTERING_RUN, CLUSTERING_SCHEDULE_AND_EXECUTE, CLEAN, DELETE_SAVEPOINT, UPGRADE, DOWNGRADE
  }

  public static void main(String[] args) throws Exception {
@@ -190,7 +190,20 @@ public class SparkMain {
            configs.addAll(Arrays.asList(args).subList(9, args.length));
          }
          returnCode = cluster(jsc, args[3], args[4], args[5], Integer.parseInt(args[6]), args[2],
-              Integer.parseInt(args[7]), false, propsFilePath, configs);
+              Integer.parseInt(args[7]), HoodieClusteringJob.EXECUTE, propsFilePath, configs);
+          break;
+        case CLUSTERING_SCHEDULE_AND_EXECUTE:
+          assert (args.length >= 8);
+          propsFilePath = null;
+          if (!StringUtils.isNullOrEmpty(args[7])) {
+            propsFilePath = args[7];
+          }
+          configs = new ArrayList<>();
+          if (args.length > 8) {
+            configs.addAll(Arrays.asList(args).subList(8, args.length));
+          }
+          returnCode = cluster(jsc, args[3], args[4], null, Integer.parseInt(args[5]), args[2],
+              Integer.parseInt(args[6]), HoodieClusteringJob.SCHEDULE_AND_EXECUTE, propsFilePath, configs);
          break;
        case CLUSTERING_SCHEDULE:
          assert (args.length >= 7);
@@ -203,7 +216,7 @@ public class SparkMain {
            configs.addAll(Arrays.asList(args).subList(7, args.length));
          }
          returnCode = cluster(jsc, args[3], args[4], args[5], 1, args[2],
-              0, true, propsFilePath, configs);
+              0, HoodieClusteringJob.SCHEDULE, propsFilePath, configs);
          break;
        case CLEAN:
          assert (args.length >= 5);
@@ -351,13 +364,13 @@ public class SparkMain {
  }

  private static int cluster(JavaSparkContext jsc, String basePath, String tableName, String clusteringInstant,
-      int parallelism, String sparkMemory, int retry, boolean schedule, String propsFilePath, List<String> configs) {
+      int parallelism, String sparkMemory, int retry, String runningMode, String propsFilePath, List<String> configs) {
    HoodieClusteringJob.Config cfg = new HoodieClusteringJob.Config();
    cfg.basePath = basePath;
    cfg.tableName = tableName;
    cfg.clusteringInstantTime = clusteringInstant;
    cfg.parallelism = parallelism;
-    cfg.runSchedule = schedule;
+    cfg.runningMode = runningMode;
    cfg.propsFilePath = propsFilePath;
    cfg.configs = configs;
    jsc.getConf().set("spark.executor.memory", sparkMemory);