1
0

[HUDI-92] Provide reasonable names for Spark DAG stages in HUDI. (#1289)

This commit is contained in:
Prashant Wason
2020-07-19 10:29:25 -07:00
committed by GitHub
parent 1aae437257
commit b71f25f210
25 changed files with 79 additions and 10 deletions

View File

@@ -166,6 +166,7 @@ public class HDFSParquetImporter implements Serializable {
AvroReadSupport.setAvroReadSchema(jsc.hadoopConfiguration(), (new Schema.Parser().parse(schemaStr)));
ParquetInputFormat.setReadSupportClass(job, (AvroReadSupport.class));
jsc.setJobGroup(this.getClass().getSimpleName(), "Build records for import");
return jsc.newAPIHadoopFile(cfg.srcPath, ParquetInputFormat.class, Void.class, GenericRecord.class,
job.getConfiguration())
// To reduce large number of tasks.

View File

@@ -97,6 +97,7 @@ public class HoodieSnapshotCopier implements Serializable {
fs.delete(new Path(outputDir), true);
}
jsc.setJobGroup(this.getClass().getSimpleName(), "Creating a snapshot");
jsc.parallelize(partitions, partitions.size()).flatMap(partition -> {
// Only take latest version files <= latestCommit.
FileSystem fs1 = FSUtils.getFs(baseDir, serConf.newCopy());

View File

@@ -175,6 +175,7 @@ public class HoodieSnapshotExporter {
? defaultPartitioner
: ReflectionUtils.loadClass(cfg.outputPartitioner);
jsc.setJobGroup(this.getClass().getSimpleName(), "Exporting as non-HUDI dataset");
final BaseFileOnlyView fsView = getBaseFileOnlyView(jsc, cfg);
Iterator<String> exportingFilePaths = jsc
.parallelize(partitions, partitions.size())
@@ -193,6 +194,7 @@ public class HoodieSnapshotExporter {
private void exportAsHudi(JavaSparkContext jsc, Config cfg, List<String> partitions, String latestCommitTimestamp) throws IOException {
final BaseFileOnlyView fsView = getBaseFileOnlyView(jsc, cfg);
final SerializableConfiguration serConf = new SerializableConfiguration(jsc.hadoopConfiguration());
jsc.setJobGroup(this.getClass().getSimpleName(), "Exporting as HUDI dataset");
jsc.parallelize(partitions, partitions.size()).flatMap(partition -> {
// Only take latest version files <= latestCommit.
List<Tuple2<String, String>> filePaths = new ArrayList<>();

View File

@@ -86,6 +86,7 @@ public class HoodieWithTimelineServer implements Serializable {
System.out.println("Driver Hostname is :" + driverHost);
List<String> messages = new ArrayList<>();
IntStream.range(0, cfg.numPartitions).forEach(i -> messages.add("Hello World"));
jsc.setJobGroup(this.getClass().getSimpleName(), "Sending requests to driver host");
List<String> gotMessages = jsc.parallelize(messages).map(msg -> sendRequest(driverHost, cfg.serverPort)).collect();
System.out.println("Got Messages :" + gotMessages);
ValidationUtils.checkArgument(gotMessages.equals(messages), "Got expected reply from Server");

View File

@@ -132,6 +132,7 @@ public class TimelineServerPerf implements Serializable {
public List<PerfStats> runLookups(JavaSparkContext jsc, List<String> partitionPaths, SyncableFileSystemView fsView,
int numIterations, int concurrency) {
jsc.setJobGroup(this.getClass().getSimpleName(), "Lookup all performance stats");
return jsc.parallelize(partitionPaths, cfg.numExecutors).flatMap(p -> {
ScheduledThreadPoolExecutor executor = new ScheduledThreadPoolExecutor(100);
final List<PerfStats> result = new ArrayList<>();

View File

@@ -56,6 +56,7 @@ public class AvroDFSSource extends AvroSource {
}
private JavaRDD<GenericRecord> fromFiles(String pathStr) {
sparkContext.setJobGroup(this.getClass().getSimpleName(), "Fetch Avro data from files");
JavaPairRDD<AvroKey, NullWritable> avroRDD = sparkContext.newAPIHadoopFile(pathStr, AvroKeyInputFormat.class,
AvroKey.class, NullWritable.class, sparkContext.hadoopConfiguration());
return avroRDD.keys().map(r -> ((GenericRecord) r.datum()));

View File

@@ -128,6 +128,7 @@ public class HiveIncrPullSource extends AvroSource {
String pathStr = commitDeltaFiles.stream().map(f -> f.getPath().toString()).collect(Collectors.joining(","));
JavaPairRDD<AvroKey, NullWritable> avroRDD = sparkContext.newAPIHadoopFile(pathStr, AvroKeyInputFormat.class,
AvroKey.class, NullWritable.class, sparkContext.hadoopConfiguration());
sparkContext.setJobGroup(this.getClass().getSimpleName(), "Fetch new data");
return new InputBatch<>(Option.of(avroRDD.keys().map(r -> ((GenericRecord) r.datum()))),
String.valueOf(commitToPull.get()));
} catch (IOException ioe) {