[HUDI-2850] Fixing Clustering CLI - schedule and run command fixes to avoid NumberFormatException (#4101)
This commit is contained in:
committed by
GitHub
parent
e9efbdb63c
commit
3d75aca40d
@@ -40,14 +40,21 @@ public class ClusteringCommand implements CommandMarker {
|
|||||||
|
|
||||||
private static final Logger LOG = LogManager.getLogger(ClusteringCommand.class);
|
private static final Logger LOG = LogManager.getLogger(ClusteringCommand.class);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Schedule clustering table service.
|
||||||
|
* <p>
|
||||||
|
* Example:
|
||||||
|
* > connect --path {path to hudi table}
|
||||||
|
* > clustering schedule --sparkMaster local --sparkMemory 2g
|
||||||
|
*/
|
||||||
@CliCommand(value = "clustering schedule", help = "Schedule Clustering")
|
@CliCommand(value = "clustering schedule", help = "Schedule Clustering")
|
||||||
public String scheduleClustering(
|
public String scheduleClustering(
|
||||||
@CliOption(key = "sparkMemory", help = "Spark executor memory",
|
@CliOption(key = "sparkMaster", unspecifiedDefaultValue = SparkUtil.DEFAULT_SPARK_MASTER, help = "Spark master") final String master,
|
||||||
unspecifiedDefaultValue = "1G") final String sparkMemory,
|
@CliOption(key = "sparkMemory", unspecifiedDefaultValue = "1g", help = "Spark executor memory") final String sparkMemory,
|
||||||
@CliOption(key = "propsFilePath", help = "path to properties file on localfs or dfs with configurations for hoodie client for clustering",
|
@CliOption(key = "propsFilePath", help = "path to properties file on localfs or dfs with configurations "
|
||||||
unspecifiedDefaultValue = "") final String propsFilePath,
|
+ "for hoodie client for clustering", unspecifiedDefaultValue = "") final String propsFilePath,
|
||||||
@CliOption(key = "hoodieConfigs", help = "Any configuration that can be set in the properties file can be passed here in the form of an array",
|
@CliOption(key = "hoodieConfigs", help = "Any configuration that can be set in the properties file can "
|
||||||
unspecifiedDefaultValue = "") final String[] configs) throws Exception {
|
+ "be passed here in the form of an array", unspecifiedDefaultValue = "") final String[] configs) throws Exception {
|
||||||
HoodieTableMetaClient client = HoodieCLI.getTableMetaClient();
|
HoodieTableMetaClient client = HoodieCLI.getTableMetaClient();
|
||||||
boolean initialized = HoodieCLI.initConf();
|
boolean initialized = HoodieCLI.initConf();
|
||||||
HoodieCLI.initFS(initialized);
|
HoodieCLI.initFS(initialized);
|
||||||
@@ -59,8 +66,8 @@ public class ClusteringCommand implements CommandMarker {
|
|||||||
// First get a clustering instant time and pass it to spark launcher for scheduling clustering
|
// First get a clustering instant time and pass it to spark launcher for scheduling clustering
|
||||||
String clusteringInstantTime = HoodieActiveTimeline.createNewInstantTime();
|
String clusteringInstantTime = HoodieActiveTimeline.createNewInstantTime();
|
||||||
|
|
||||||
sparkLauncher.addAppArgs(SparkCommand.CLUSTERING_SCHEDULE.toString(), client.getBasePath(),
|
sparkLauncher.addAppArgs(SparkCommand.CLUSTERING_SCHEDULE.toString(), master, sparkMemory,
|
||||||
client.getTableConfig().getTableName(), clusteringInstantTime, sparkMemory, propsFilePath);
|
client.getBasePath(), client.getTableConfig().getTableName(), clusteringInstantTime, propsFilePath);
|
||||||
UtilHelpers.validateAndAddProperties(configs, sparkLauncher);
|
UtilHelpers.validateAndAddProperties(configs, sparkLauncher);
|
||||||
Process process = sparkLauncher.launch();
|
Process process = sparkLauncher.launch();
|
||||||
InputStreamConsumer.captureOutput(process);
|
InputStreamConsumer.captureOutput(process);
|
||||||
@@ -71,21 +78,25 @@ public class ClusteringCommand implements CommandMarker {
|
|||||||
return "Succeeded to schedule clustering for " + clusteringInstantTime;
|
return "Succeeded to schedule clustering for " + clusteringInstantTime;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Run clustering table service.
|
||||||
|
* <p>
|
||||||
|
* Example:
|
||||||
|
* > connect --path {path to hudi table}
|
||||||
|
* > clustering schedule --sparkMaster local --sparkMemory 2g
|
||||||
|
* > clustering run --sparkMaster local --sparkMemory 2g --clusteringInstant 20211124005208
|
||||||
|
*/
|
||||||
@CliCommand(value = "clustering run", help = "Run Clustering")
|
@CliCommand(value = "clustering run", help = "Run Clustering")
|
||||||
public String runClustering(
|
public String runClustering(
|
||||||
@CliOption(key = "parallelism", help = "Parallelism for hoodie clustering",
|
@CliOption(key = "sparkMaster", unspecifiedDefaultValue = SparkUtil.DEFAULT_SPARK_MASTER, help = "Spark master") final String master,
|
||||||
unspecifiedDefaultValue = "1") final String parallelism,
|
@CliOption(key = "sparkMemory", help = "Spark executor memory", unspecifiedDefaultValue = "4g") final String sparkMemory,
|
||||||
@CliOption(key = "sparkMemory", help = "Spark executor memory",
|
@CliOption(key = "parallelism", help = "Parallelism for hoodie clustering", unspecifiedDefaultValue = "1") final String parallelism,
|
||||||
unspecifiedDefaultValue = "4G") final String sparkMemory,
|
@CliOption(key = "retry", help = "Number of retries", unspecifiedDefaultValue = "1") final String retry,
|
||||||
@CliOption(key = "retry", help = "Number of retries",
|
@CliOption(key = "clusteringInstant", help = "Clustering instant time", mandatory = true) final String clusteringInstantTime,
|
||||||
unspecifiedDefaultValue = "1") final String retry,
|
@CliOption(key = "propsFilePath", help = "path to properties file on localfs or dfs with configurations for "
|
||||||
@CliOption(key = "clusteringInstant", help = "Clustering instant time",
|
+ "hoodie client for compacting", unspecifiedDefaultValue = "") final String propsFilePath,
|
||||||
mandatory = true) final String clusteringInstantTime,
|
@CliOption(key = "hoodieConfigs", help = "Any configuration that can be set in the properties file can be "
|
||||||
@CliOption(key = "propsFilePath", help = "path to properties file on localfs or dfs with configurations for hoodie client for compacting",
|
+ "passed here in the form of an array", unspecifiedDefaultValue = "") final String[] configs) throws Exception {
|
||||||
unspecifiedDefaultValue = "") final String propsFilePath,
|
|
||||||
@CliOption(key = "hoodieConfigs", help = "Any configuration that can be set in the properties file can be passed here in the form of an array",
|
|
||||||
unspecifiedDefaultValue = "") final String[] configs
|
|
||||||
) throws Exception {
|
|
||||||
HoodieTableMetaClient client = HoodieCLI.getTableMetaClient();
|
HoodieTableMetaClient client = HoodieCLI.getTableMetaClient();
|
||||||
boolean initialized = HoodieCLI.initConf();
|
boolean initialized = HoodieCLI.initConf();
|
||||||
HoodieCLI.initFS(initialized);
|
HoodieCLI.initFS(initialized);
|
||||||
@@ -93,8 +104,9 @@ public class ClusteringCommand implements CommandMarker {
|
|||||||
String sparkPropertiesPath =
|
String sparkPropertiesPath =
|
||||||
Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala());
|
Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala());
|
||||||
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
||||||
sparkLauncher.addAppArgs(SparkCommand.CLUSTERING_RUN.toString(), client.getBasePath(),
|
sparkLauncher.addAppArgs(SparkCommand.CLUSTERING_RUN.toString(), master, sparkMemory,
|
||||||
client.getTableConfig().getTableName(), clusteringInstantTime, parallelism, sparkMemory, retry, propsFilePath);
|
client.getBasePath(), client.getTableConfig().getTableName(), clusteringInstantTime,
|
||||||
|
parallelism, retry, propsFilePath);
|
||||||
UtilHelpers.validateAndAddProperties(configs, sparkLauncher);
|
UtilHelpers.validateAndAddProperties(configs, sparkLauncher);
|
||||||
Process process = sparkLauncher.launch();
|
Process process = sparkLauncher.launch();
|
||||||
InputStreamConsumer.captureOutput(process);
|
InputStreamConsumer.captureOutput(process);
|
||||||
|
|||||||
@@ -31,6 +31,7 @@ import org.apache.hudi.common.table.HoodieTableVersion;
|
|||||||
import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion;
|
import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion;
|
||||||
import org.apache.hudi.common.util.Option;
|
import org.apache.hudi.common.util.Option;
|
||||||
import org.apache.hudi.common.util.StringUtils;
|
import org.apache.hudi.common.util.StringUtils;
|
||||||
|
import org.apache.hudi.common.util.ValidationUtils;
|
||||||
import org.apache.hudi.config.HoodieBootstrapConfig;
|
import org.apache.hudi.config.HoodieBootstrapConfig;
|
||||||
import org.apache.hudi.config.HoodieIndexConfig;
|
import org.apache.hudi.config.HoodieIndexConfig;
|
||||||
import org.apache.hudi.config.HoodieWriteConfig;
|
import org.apache.hudi.config.HoodieWriteConfig;
|
||||||
@@ -79,12 +80,14 @@ public class SparkMain {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
public static void main(String[] args) throws Exception {
|
||||||
String command = args[0];
|
ValidationUtils.checkArgument(args.length >= 4);
|
||||||
LOG.info("Invoking SparkMain:" + command);
|
final String commandString = args[0];
|
||||||
|
LOG.info("Invoking SparkMain: " + commandString);
|
||||||
|
final SparkCommand cmd = SparkCommand.valueOf(commandString);
|
||||||
|
|
||||||
SparkCommand cmd = SparkCommand.valueOf(command);
|
JavaSparkContext jsc = SparkUtil.initJavaSparkConf("hoodie-cli-" + commandString,
|
||||||
|
Option.of(args[1]), Option.of(args[2]));
|
||||||
|
|
||||||
JavaSparkContext jsc = SparkUtil.initJavaSparkConf("hoodie-cli-" + command, Option.of(args[1]), Option.of(args[2]));
|
|
||||||
int returnCode = 0;
|
int returnCode = 0;
|
||||||
try {
|
try {
|
||||||
switch (cmd) {
|
switch (cmd) {
|
||||||
@@ -111,8 +114,8 @@ public class SparkMain {
|
|||||||
if (args.length > 13) {
|
if (args.length > 13) {
|
||||||
configs.addAll(Arrays.asList(args).subList(13, args.length));
|
configs.addAll(Arrays.asList(args).subList(13, args.length));
|
||||||
}
|
}
|
||||||
returnCode = dataLoad(jsc, command, args[3], args[4], args[5], args[6], args[7], args[8],
|
returnCode = dataLoad(jsc, commandString, args[3], args[4], args[5], args[6], args[7], args[8],
|
||||||
Integer.parseInt(args[9]), args[10], Integer.parseInt(args[11]), propsFilePath, configs);
|
Integer.parseInt(args[9]), args[10], Integer.parseInt(args[11]), propsFilePath, configs);
|
||||||
break;
|
break;
|
||||||
case COMPACT_RUN:
|
case COMPACT_RUN:
|
||||||
assert (args.length >= 10);
|
assert (args.length >= 10);
|
||||||
@@ -159,33 +162,34 @@ public class SparkMain {
|
|||||||
case COMPACT_UNSCHEDULE_PLAN:
|
case COMPACT_UNSCHEDULE_PLAN:
|
||||||
assert (args.length == 9);
|
assert (args.length == 9);
|
||||||
doCompactUnschedule(jsc, args[3], args[4], args[5], Integer.parseInt(args[6]),
|
doCompactUnschedule(jsc, args[3], args[4], args[5], Integer.parseInt(args[6]),
|
||||||
Boolean.parseBoolean(args[7]), Boolean.parseBoolean(args[8]));
|
Boolean.parseBoolean(args[7]), Boolean.parseBoolean(args[8]));
|
||||||
returnCode = 0;
|
returnCode = 0;
|
||||||
break;
|
break;
|
||||||
case CLUSTERING_RUN:
|
case CLUSTERING_RUN:
|
||||||
assert (args.length >= 8);
|
assert (args.length >= 9);
|
||||||
propsFilePath = null;
|
propsFilePath = null;
|
||||||
if (!StringUtils.isNullOrEmpty(args[7])) {
|
if (!StringUtils.isNullOrEmpty(args[8])) {
|
||||||
propsFilePath = args[7];
|
propsFilePath = args[8];
|
||||||
}
|
}
|
||||||
configs = new ArrayList<>();
|
configs = new ArrayList<>();
|
||||||
if (args.length > 8) {
|
if (args.length > 9) {
|
||||||
configs.addAll(Arrays.asList(args).subList(8, args.length));
|
configs.addAll(Arrays.asList(args).subList(9, args.length));
|
||||||
}
|
}
|
||||||
returnCode = cluster(jsc, args[1], args[2], args[3], Integer.parseInt(args[4]), args[5],
|
returnCode = cluster(jsc, args[3], args[4], args[5], Integer.parseInt(args[6]), args[2],
|
||||||
Integer.parseInt(args[6]), false, propsFilePath, configs);
|
Integer.parseInt(args[7]), false, propsFilePath, configs);
|
||||||
break;
|
break;
|
||||||
case CLUSTERING_SCHEDULE:
|
case CLUSTERING_SCHEDULE:
|
||||||
assert (args.length >= 6);
|
assert (args.length >= 7);
|
||||||
propsFilePath = null;
|
propsFilePath = null;
|
||||||
if (!StringUtils.isNullOrEmpty(args[5])) {
|
if (!StringUtils.isNullOrEmpty(args[6])) {
|
||||||
propsFilePath = args[5];
|
propsFilePath = args[6];
|
||||||
}
|
}
|
||||||
configs = new ArrayList<>();
|
configs = new ArrayList<>();
|
||||||
if (args.length > 6) {
|
if (args.length > 7) {
|
||||||
configs.addAll(Arrays.asList(args).subList(6, args.length));
|
configs.addAll(Arrays.asList(args).subList(7, args.length));
|
||||||
}
|
}
|
||||||
returnCode = cluster(jsc, args[1], args[2], args[3], 1, args[4], 0, true, propsFilePath, configs);
|
returnCode = cluster(jsc, args[3], args[4], args[5], 1, args[2],
|
||||||
|
0, true, propsFilePath, configs);
|
||||||
break;
|
break;
|
||||||
case CLEAN:
|
case CLEAN:
|
||||||
assert (args.length >= 5);
|
assert (args.length >= 5);
|
||||||
@@ -229,7 +233,7 @@ public class SparkMain {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
} catch (Throwable throwable) {
|
} catch (Throwable throwable) {
|
||||||
LOG.error("Fail to execute command", throwable);
|
LOG.error("Fail to execute commandString", throwable);
|
||||||
returnCode = -1;
|
returnCode = -1;
|
||||||
} finally {
|
} finally {
|
||||||
jsc.stop();
|
jsc.stop();
|
||||||
|
|||||||
@@ -40,7 +40,7 @@ import java.util.Objects;
|
|||||||
*/
|
*/
|
||||||
public class SparkUtil {
|
public class SparkUtil {
|
||||||
|
|
||||||
private static final String DEFAULT_SPARK_MASTER = "yarn";
|
public static final String DEFAULT_SPARK_MASTER = "yarn";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* TODO: Need to fix a bunch of hardcoded stuff here eg: history server, spark distro.
|
* TODO: Need to fix a bunch of hardcoded stuff here eg: history server, spark distro.
|
||||||
|
|||||||
@@ -66,7 +66,7 @@ public class HoodieClusteringJob {
|
|||||||
public HoodieClusteringJob(JavaSparkContext jsc, Config cfg) {
|
public HoodieClusteringJob(JavaSparkContext jsc, Config cfg) {
|
||||||
this.cfg = cfg;
|
this.cfg = cfg;
|
||||||
this.jsc = jsc;
|
this.jsc = jsc;
|
||||||
this.props = cfg.propsFilePath == null
|
this.props = StringUtils.isNullOrEmpty(cfg.propsFilePath)
|
||||||
? UtilHelpers.buildProperties(cfg.configs)
|
? UtilHelpers.buildProperties(cfg.configs)
|
||||||
: readConfigFromFileSystem(jsc, cfg);
|
: readConfigFromFileSystem(jsc, cfg);
|
||||||
this.metaClient = UtilHelpers.createMetaClient(jsc, cfg.basePath, true);
|
this.metaClient = UtilHelpers.createMetaClient(jsc, cfg.basePath, true);
|
||||||
|
|||||||
Reference in New Issue
Block a user