1
0

[HUDI-3485] Adding scheduler pool configs for async clustering (#5043)

This commit is contained in:
Sivabalan Narayanan
2022-03-29 18:27:45 -07:00
committed by GitHub
parent 5c1b482a1b
commit 4fed8dd319
10 changed files with 72 additions and 19 deletions

View File

@@ -22,6 +22,7 @@ package org.apache.hudi.async;
import org.apache.hudi.client.BaseClusterer;
import org.apache.hudi.client.BaseHoodieWriteClient;
import org.apache.hudi.client.HoodieSparkClusteringClient;
import org.apache.hudi.common.engine.HoodieEngineContext;
/**
* Async clustering service for Spark structured streaming.
@@ -31,8 +32,8 @@ public class SparkStreamingAsyncClusteringService extends AsyncClusteringService
private static final long serialVersionUID = 1L;
public SparkStreamingAsyncClusteringService(BaseHoodieWriteClient writeClient) {
super(writeClient, true);
public SparkStreamingAsyncClusteringService(HoodieEngineContext context, BaseHoodieWriteClient writeClient) {
super(context, writeClient, true);
}
@Override

View File

@@ -205,7 +205,8 @@ class HoodieStreamingSink(sqlContext: SQLContext,
protected def triggerAsyncClustering(client: SparkRDDWriteClient[HoodieRecordPayload[Nothing]]): Unit = {
if (null == asyncClusteringService) {
log.info("Triggering async clustering!")
asyncClusteringService = new SparkStreamingAsyncClusteringService(client)
asyncClusteringService = new SparkStreamingAsyncClusteringService(new HoodieSparkEngineContext(new JavaSparkContext(sqlContext.sparkContext)),
client)
asyncClusteringService.start(new Function[java.lang.Boolean, java.lang.Boolean] {
override def apply(errored: lang.Boolean): lang.Boolean = {
log.info(s"Async clustering service shutdown. Errored ? $errored")

View File

@@ -27,9 +27,9 @@ object SparkConfigs {
/*
When async compaction is enabled (deltastreamer or streaming sink), users might be interested to set custom
scheduling configs for regular writes and async compaction. This is the property used to set custom scheduler config
file with spark. In Deltastreamer, the file is generated within hudi and set if necessary. Where as in case of streaming
sink, users have to set this property when they invoke spark shell.
scheduling configs for regular writes and async table services like compaction and clustering. This is the property
used to set a custom scheduler config file with Spark. In Deltastreamer, the file is generated within Hudi and set if
necessary. Whereas in the case of the streaming sink, users have to set this property when they invoke the Spark shell.
Sample format of the file contents.
<?xml version="1.0"?>
<allocations>
@@ -43,6 +43,11 @@ object SparkConfigs {
<weight>3</weight>
<minShare>1</minShare>
</pool>
<pool name="hoodiecluster">
<schedulingMode>FAIR</schedulingMode>
<weight>2</weight>
<minShare>1</minShare>
</pool>
</allocations>
*/
val SPARK_SCHEDULER_ALLOCATION_FILE_KEY = "spark.scheduler.allocation.file"