1
0

Simplify and fix CLI to schedule and run compactions

This commit is contained in:
Balaji Varadarajan
2018-09-06 12:02:09 -07:00
committed by vinoth chandar
parent fad4b513ea
commit e2dee68ccd
4 changed files with 27 additions and 40 deletions

View File

@@ -162,14 +162,8 @@ public class CompactionCommand implements CommandMarker {
@CliCommand(value = "compaction schedule", help = "Schedule Compaction")
public String scheduleCompact(
@CliOption(key = "tableName", mandatory = true, help = "Table name") final String tableName,
@CliOption(key = "rowKeyField", mandatory = true, help = "Row key field name") final String rowKeyField,
@CliOption(key = {
"parallelism"}, mandatory = true, help = "Parallelism for hoodie compaction") final String parallelism,
@CliOption(key = "schemaFilePath", mandatory = true, help = "Path for Avro schema file") final String
schemaFilePath,
@CliOption(key = "sparkMemory", mandatory = true, help = "Spark executor memory") final String sparkMemory,
@CliOption(key = "retry", mandatory = true, help = "Number of retries") final String retry) throws Exception {
@CliOption(key = "sparkMemory", unspecifiedDefaultValue = "1G", help = "Spark executor memory")
final String sparkMemory) throws Exception {
boolean initialized = HoodieCLI.initConf();
HoodieCLI.initFS(initialized);
@@ -181,7 +175,7 @@ public class CompactionCommand implements CommandMarker {
scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
sparkLauncher.addAppArgs(SparkCommand.COMPACT_SCHEDULE.toString(), HoodieCLI.tableMetadata.getBasePath(),
tableName, compactionInstantTime, rowKeyField, parallelism, schemaFilePath, sparkMemory, retry);
HoodieCLI.tableMetadata.getTableConfig().getTableName(), compactionInstantTime, sparkMemory);
Process process = sparkLauncher.launch();
InputStreamConsumer.captureOutput(process);
int exitCode = process.waitFor();
@@ -196,16 +190,15 @@ public class CompactionCommand implements CommandMarker {
@CliCommand(value = "compaction run", help = "Run Compaction for given instant time")
public String compact(
@CliOption(key = "tableName", mandatory = true, help = "Table name") final String tableName,
@CliOption(key = "rowKeyField", mandatory = true, help = "Row key field name") final String rowKeyField,
@CliOption(key = {
"parallelism"}, mandatory = true, help = "Parallelism for hoodie compaction") final String parallelism,
@CliOption(key = "schemaFilePath", mandatory = true, help = "Path for Avro schema file") final String
schemaFilePath,
@CliOption(key = "sparkMemory", mandatory = true, help = "Spark executor memory") final String sparkMemory,
@CliOption(key = "retry", mandatory = true, help = "Number of retries") final String retry,
@CliOption(key = "compactionInstant", mandatory = true, help = "Base path for the target hoodie dataset") final
String compactionInstantTime) throws Exception {
@CliOption(key = {"parallelism"}, mandatory = true, help = "Parallelism for hoodie compaction")
final String parallelism,
@CliOption(key = "schemaFilePath", mandatory = true, help = "Path for Avro schema file")
final String schemaFilePath,
@CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G", help = "Spark executor memory")
final String sparkMemory,
@CliOption(key = "retry", unspecifiedDefaultValue = "1", help = "Number of retries") final String retry,
@CliOption(key = "compactionInstant", mandatory = true, help = "Base path for the target hoodie dataset")
final String compactionInstantTime) throws Exception {
boolean initialized = HoodieCLI.initConf();
HoodieCLI.initFS(initialized);
@@ -214,7 +207,8 @@ public class CompactionCommand implements CommandMarker {
scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
sparkLauncher.addAppArgs(SparkCommand.COMPACT_RUN.toString(), HoodieCLI.tableMetadata.getBasePath(),
tableName, compactionInstantTime, rowKeyField, parallelism, schemaFilePath, sparkMemory, retry);
HoodieCLI.tableMetadata.getTableConfig().getTableName(), compactionInstantTime, parallelism, schemaFilePath,
sparkMemory, retry);
Process process = sparkLauncher.launch();
InputStreamConsumer.captureOutput(process);
int exitCode = process.waitFor();

View File

@@ -23,6 +23,7 @@ import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.config.HoodieIndexConfig;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.index.HoodieIndex;
import com.uber.hoodie.io.compact.strategy.UnBoundedCompactionStrategy;
import com.uber.hoodie.utilities.HDFSParquetImporter;
import com.uber.hoodie.utilities.HoodieCompactor;
import org.apache.log4j.Logger;
@@ -68,14 +69,14 @@ public class SparkMain {
Integer.parseInt(args[7]), args[8], SparkUtil.DEFUALT_SPARK_MASTER, args[9], Integer.parseInt(args[10]));
break;
case COMPACT_RUN:
assert (args.length == 9);
returnCode = compact(jsc, args[1], args[2], args[3], args[4], args[5], Integer.parseInt(args[6]),
args[7], args[8], Integer.parseInt(args[9]), false);
assert (args.length == 8);
returnCode = compact(jsc, args[1], args[2], args[3], Integer.parseInt(args[4]),
args[5], args[6], Integer.parseInt(args[7]), false);
break;
case COMPACT_SCHEDULE:
assert (args.length == 10);
returnCode = compact(jsc, args[1], args[2], args[3], args[4], args[5], Integer.parseInt(args[6]),
args[7], args[8], Integer.parseInt(args[9]), true);
assert (args.length == 5);
returnCode = compact(jsc, args[1], args[2], args[3], 1,
"", args[4], 0, true);
break;
default:
break;
@@ -103,14 +104,13 @@ public class SparkMain {
}
private static int compact(JavaSparkContext jsc, String basePath, String tableName, String compactionInstant,
String rowKey, String partitionKey, int parallelism, String schemaFile,
String sparkMemory, int retry, boolean schedule) throws Exception {
int parallelism, String schemaFile, String sparkMemory, int retry, boolean schedule) throws Exception {
HoodieCompactor.Config cfg = new HoodieCompactor.Config();
cfg.basePath = basePath;
cfg.tableName = tableName;
cfg.compactionInstantTime = compactionInstant;
cfg.rowKey = rowKey;
cfg.partitionKey = partitionKey;
// TODO: Make this configurable, along with strategy-specific config. For now, this is a generic enough strategy.
cfg.strategyClassName = UnBoundedCompactionStrategy.class.getCanonicalName();
cfg.parallelism = parallelism;
cfg.schemaFile = schemaFile;
cfg.runSchedule = schedule;