1
0

[HUDI-3485] Adding scheduler pool configs for async clustering (#5043)

This commit is contained in:
Sivabalan Narayanan
2022-03-29 18:27:45 -07:00
committed by GitHub
parent 5c1b482a1b
commit 4fed8dd319
10 changed files with 72 additions and 19 deletions

View File

@@ -22,6 +22,7 @@ package org.apache.hudi.async;
import org.apache.hudi.client.BaseClusterer;
import org.apache.hudi.client.BaseHoodieWriteClient;
import org.apache.hudi.client.HoodieSparkClusteringClient;
import org.apache.hudi.common.engine.HoodieEngineContext;
/**
* Async clustering service for Spark structured streaming.
@@ -31,8 +32,8 @@ public class SparkStreamingAsyncClusteringService extends AsyncClusteringService
private static final long serialVersionUID = 1L;
public SparkStreamingAsyncClusteringService(BaseHoodieWriteClient writeClient) {
super(writeClient, true);
public SparkStreamingAsyncClusteringService(HoodieEngineContext context, BaseHoodieWriteClient writeClient) {
super(context, writeClient, true);
}
@Override

View File

@@ -205,7 +205,8 @@ class HoodieStreamingSink(sqlContext: SQLContext,
protected def triggerAsyncClustering(client: SparkRDDWriteClient[HoodieRecordPayload[Nothing]]): Unit = {
if (null == asyncClusteringService) {
log.info("Triggering async clustering!")
asyncClusteringService = new SparkStreamingAsyncClusteringService(client)
asyncClusteringService = new SparkStreamingAsyncClusteringService(new HoodieSparkEngineContext(new JavaSparkContext(sqlContext.sparkContext)),
client)
asyncClusteringService.start(new Function[java.lang.Boolean, java.lang.Boolean] {
override def apply(errored: lang.Boolean): lang.Boolean = {
log.info(s"Async clustering service shutdown. Errored ? $errored")

View File

@@ -27,9 +27,9 @@ object SparkConfigs {
/*
When async compaction is enabled (deltastreamer or streaming sink), users might be interested to set custom
scheduling configs for regular writes and async compaction. This is the property used to set custom scheduler config
file with spark. In Deltastreamer, the file is generated within hudi and set if necessary. Where as in case of streaming
sink, users have to set this property when they invoke spark shell.
scheduling configs for regular writes and async table services like compaction and clustering. This is the property
used to set a custom scheduler config file with Spark. In Deltastreamer, the file is generated within Hudi and set if
necessary. Whereas in the case of the streaming sink, users have to set this property when they invoke the Spark shell.
Sample format of the file contents.
<?xml version="1.0"?>
<allocations>
@@ -43,6 +43,11 @@ object SparkConfigs {
<weight>3</weight>
<minShare>1</minShare>
</pool>
<pool name="hoodiecluster">
<schedulingMode>FAIR</schedulingMode>
<weight>2</weight>
<minShare>1</minShare>
</pool>
</allocations>
*/
val SPARK_SCHEDULER_ALLOCATION_FILE_KEY = "spark.scheduler.allocation.file"