[HUDI-92] Provide reasonable names for Spark DAG stages in HUDI. (#1289)
This commit is contained in:
@@ -166,6 +166,7 @@ public class HDFSParquetImporter implements Serializable {
|
||||
AvroReadSupport.setAvroReadSchema(jsc.hadoopConfiguration(), (new Schema.Parser().parse(schemaStr)));
|
||||
ParquetInputFormat.setReadSupportClass(job, (AvroReadSupport.class));
|
||||
|
||||
jsc.setJobGroup(this.getClass().getSimpleName(), "Build records for import");
|
||||
return jsc.newAPIHadoopFile(cfg.srcPath, ParquetInputFormat.class, Void.class, GenericRecord.class,
|
||||
job.getConfiguration())
|
||||
// Reduce the otherwise large number of tasks.
|
||||
|
||||
@@ -97,6 +97,7 @@ public class HoodieSnapshotCopier implements Serializable {
|
||||
fs.delete(new Path(outputDir), true);
|
||||
}
|
||||
|
||||
jsc.setJobGroup(this.getClass().getSimpleName(), "Creating a snapshot");
|
||||
jsc.parallelize(partitions, partitions.size()).flatMap(partition -> {
|
||||
// Only take the latest version files that are <= latestCommit.
|
||||
FileSystem fs1 = FSUtils.getFs(baseDir, serConf.newCopy());
|
||||
|
||||
@@ -175,6 +175,7 @@ public class HoodieSnapshotExporter {
|
||||
? defaultPartitioner
|
||||
: ReflectionUtils.loadClass(cfg.outputPartitioner);
|
||||
|
||||
jsc.setJobGroup(this.getClass().getSimpleName(), "Exporting as non-HUDI dataset");
|
||||
final BaseFileOnlyView fsView = getBaseFileOnlyView(jsc, cfg);
|
||||
Iterator<String> exportingFilePaths = jsc
|
||||
.parallelize(partitions, partitions.size())
|
||||
@@ -193,6 +194,7 @@ public class HoodieSnapshotExporter {
|
||||
private void exportAsHudi(JavaSparkContext jsc, Config cfg, List<String> partitions, String latestCommitTimestamp) throws IOException {
|
||||
final BaseFileOnlyView fsView = getBaseFileOnlyView(jsc, cfg);
|
||||
final SerializableConfiguration serConf = new SerializableConfiguration(jsc.hadoopConfiguration());
|
||||
jsc.setJobGroup(this.getClass().getSimpleName(), "Exporting as HUDI dataset");
|
||||
jsc.parallelize(partitions, partitions.size()).flatMap(partition -> {
|
||||
// Only take the latest version files that are <= latestCommit.
|
||||
List<Tuple2<String, String>> filePaths = new ArrayList<>();
|
||||
|
||||
@@ -86,6 +86,7 @@ public class HoodieWithTimelineServer implements Serializable {
|
||||
System.out.println("Driver Hostname is :" + driverHost);
|
||||
List<String> messages = new ArrayList<>();
|
||||
IntStream.range(0, cfg.numPartitions).forEach(i -> messages.add("Hello World"));
|
||||
jsc.setJobGroup(this.getClass().getSimpleName(), "Sending requests to driver host");
|
||||
List<String> gotMessages = jsc.parallelize(messages).map(msg -> sendRequest(driverHost, cfg.serverPort)).collect();
|
||||
System.out.println("Got Messages :" + gotMessages);
|
||||
ValidationUtils.checkArgument(gotMessages.equals(messages), "Got expected reply from Server");
|
||||
|
||||
@@ -132,6 +132,7 @@ public class TimelineServerPerf implements Serializable {
|
||||
|
||||
public List<PerfStats> runLookups(JavaSparkContext jsc, List<String> partitionPaths, SyncableFileSystemView fsView,
|
||||
int numIterations, int concurrency) {
|
||||
jsc.setJobGroup(this.getClass().getSimpleName(), "Lookup all performance stats");
|
||||
return jsc.parallelize(partitionPaths, cfg.numExecutors).flatMap(p -> {
|
||||
ScheduledThreadPoolExecutor executor = new ScheduledThreadPoolExecutor(100);
|
||||
final List<PerfStats> result = new ArrayList<>();
|
||||
|
||||
@@ -56,6 +56,7 @@ public class AvroDFSSource extends AvroSource {
|
||||
}
|
||||
|
||||
private JavaRDD<GenericRecord> fromFiles(String pathStr) {
|
||||
sparkContext.setJobGroup(this.getClass().getSimpleName(), "Fetch Avro data from files");
|
||||
JavaPairRDD<AvroKey, NullWritable> avroRDD = sparkContext.newAPIHadoopFile(pathStr, AvroKeyInputFormat.class,
|
||||
AvroKey.class, NullWritable.class, sparkContext.hadoopConfiguration());
|
||||
return avroRDD.keys().map(r -> ((GenericRecord) r.datum()));
|
||||
|
||||
@@ -128,6 +128,7 @@ public class HiveIncrPullSource extends AvroSource {
|
||||
String pathStr = commitDeltaFiles.stream().map(f -> f.getPath().toString()).collect(Collectors.joining(","));
|
||||
JavaPairRDD<AvroKey, NullWritable> avroRDD = sparkContext.newAPIHadoopFile(pathStr, AvroKeyInputFormat.class,
|
||||
AvroKey.class, NullWritable.class, sparkContext.hadoopConfiguration());
|
||||
sparkContext.setJobGroup(this.getClass().getSimpleName(), "Fetch new data");
|
||||
return new InputBatch<>(Option.of(avroRDD.keys().map(r -> ((GenericRecord) r.datum()))),
|
||||
String.valueOf(commitToPull.get()));
|
||||
} catch (IOException ioe) {
|
||||
|
||||
Reference in New Issue
Block a user