1
0

[HUDI-1690] use jsc union instead of rdd union (#2872)

This commit is contained in:
satishkotha
2021-04-26 23:35:01 -07:00
committed by GitHub
parent 63fa2b6186
commit 2999586509

View File

@@ -66,6 +66,7 @@ import java.util.List;
import java.util.Map;
import java.util.concurrent.CompletableFuture;
import java.util.stream.Collectors;
import java.util.stream.Stream;
public class SparkExecuteClusteringCommitActionExecutor<T extends HoodieRecordPayload<T>>
extends BaseSparkCommitActionExecutor<T> {
@@ -90,10 +91,12 @@ public class SparkExecuteClusteringCommitActionExecutor<T extends HoodieRecordPa
JavaSparkContext engineContext = HoodieSparkEngineContext.getSparkContext(context);
// execute clustering for each group async and collect WriteStatus
JavaRDD<WriteStatus> writeStatusRDD = clusteringPlan.getInputGroups().stream()
Stream<JavaRDD<WriteStatus>> writeStatusRDDStream = clusteringPlan.getInputGroups().stream()
.map(inputGroup -> runClusteringForGroupAsync(inputGroup, clusteringPlan.getStrategy().getStrategyParams()))
.map(CompletableFuture::join)
.reduce((rdd1, rdd2) -> rdd1.union(rdd2)).orElse(engineContext.emptyRDD());
.map(CompletableFuture::join);
JavaRDD<WriteStatus>[] writeStatuses = convertStreamToArray(writeStatusRDDStream);
JavaRDD<WriteStatus> writeStatusRDD = engineContext.union(writeStatuses);
HoodieWriteMetadata<JavaRDD<WriteStatus>> writeMetadata = buildWriteMetadata(writeStatusRDD);
JavaRDD<WriteStatus> statuses = updateIndex(writeStatusRDD, writeMetadata);
@@ -109,6 +112,19 @@ public class SparkExecuteClusteringCommitActionExecutor<T extends HoodieRecordPa
return writeMetadata;
}
/**
 * Collects a stream of write-status RDDs into an array.
 *
 * <p>Java does not allow creating an array of a parameterized type directly, so the array is
 * allocated with the raw {@code JavaRDD} component type through the stream's array generator
 * and then viewed as {@code JavaRDD<WriteStatus>[]}.
 *
 * @param writeStatusRDDStream stream of RDDs to collect; it is consumed by this call
 * @return an array holding the stream's elements in encounter order (zero-length if the stream is empty)
 */
private JavaRDD<WriteStatus>[] convertStreamToArray(Stream<JavaRDD<WriteStatus>> writeStatusRDDStream) {
  // The unchecked conversion is safe: every element originates from a Stream<JavaRDD<WriteStatus>>;
  // only the array's component type is raw. Suppression is kept on this single declaration.
  @SuppressWarnings({"unchecked", "rawtypes"})
  JavaRDD<WriteStatus>[] writeStatusRDDArray = writeStatusRDDStream.toArray(JavaRDD[]::new);
  return writeStatusRDDArray;
}
/**
* Validate actions taken by clustering. In the first implementation, we validate at least one new file is written.
* But we can extend this to add more validation. E.g. number of records read = number of records written etc.