
Moving to Spark 2.1.0

Prasanna Rajaperumal
2017-02-20 16:47:52 -08:00
parent be1dd9444f
commit 0e234ac0ef
6 changed files with 35 additions and 33 deletions

HoodieBloomIndex.java

@@ -115,10 +115,10 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
});
return rowKeyHoodieKeyPairRDD.leftOuterJoin(rowKeyFilenamePairRDD).mapToPair(
-new PairFunction<Tuple2<String, Tuple2<HoodieKey, Optional<String>>>, HoodieKey, Optional<String>>() {
+new PairFunction<Tuple2<String, Tuple2<HoodieKey, org.apache.spark.api.java.Optional<String>>>, HoodieKey, Optional<String>>() {
@Override
public Tuple2<HoodieKey, Optional<String>> call(
-Tuple2<String, Tuple2<HoodieKey, Optional<String>>> keyPathTuple)
+Tuple2<String, Tuple2<HoodieKey, org.apache.spark.api.java.Optional<String>>> keyPathTuple)
throws Exception {
Optional<String> recordLocationPath;
if (keyPathTuple._2._2.isPresent()) {
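
For context on this hunk: Spark 2.x dropped Guava's Optional from the Java API, so JavaPairRDD.leftOuterJoin now wraps the right-hand side in Spark's own org.apache.spark.api.java.Optional, which is why the updated function uses the fully qualified type. A minimal self-contained sketch of the Spark 2.x behavior (not part of this commit; class name and sample data are illustrative):

```java
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.Optional;
import scala.Tuple2;

import java.util.Arrays;

public class LeftOuterJoinOptionalDemo {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("demo").setMaster("local[1]");
    try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
      JavaPairRDD<String, String> keys = jsc.parallelizePairs(
          Arrays.asList(new Tuple2<>("k1", "recordA"), new Tuple2<>("k2", "recordB")));
      JavaPairRDD<String, String> files = jsc.parallelizePairs(
          Arrays.asList(new Tuple2<>("k1", "file-0001.parquet")));

      // In Spark 2.x the right side of the join is Spark's Optional,
      // not com.google.common.base.Optional as in Spark 1.x.
      JavaPairRDD<String, Tuple2<String, Optional<String>>> joined = keys.leftOuterJoin(files);
      joined.foreach(t -> {
        Optional<String> maybeFile = t._2()._2();
        // k1 resolves to a file name; k2 has no match and is absent.
        System.out.println(t._1() + " -> " + (maybeFile.isPresent() ? maybeFile.get() : "none"));
      });
    }
  }
}
```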
@@ -146,13 +146,13 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
private JavaPairRDD<String, String> lookupIndex(
JavaPairRDD<String, String> partitionRecordKeyPairRDD, final HoodieTableMetadata metadata) {
// Obtain records per partition, in the incoming records
-Map<String, Object> recordsPerPartition = partitionRecordKeyPairRDD.countByKey();
+Map<String, Long> recordsPerPartition = partitionRecordKeyPairRDD.countByKey();
List<String> affectedPartitionPathList = new ArrayList<>(recordsPerPartition.keySet());
// Step 2: Load all involved files as <Partition, filename> pairs
JavaPairRDD<String, String> partitionFilePairRDD =
loadInvolvedFiles(affectedPartitionPathList, metadata);
-Map<String, Object> filesPerPartition = partitionFilePairRDD.countByKey();
+Map<String, Long> filesPerPartition = partitionFilePairRDD.countByKey();
// Compute total subpartitions, to split partitions into.
Map<String, Long> subpartitionCountMap =
@@ -174,7 +174,7 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
* @param filesPerPartition
* @return
*/
-private Map<String, Long> computeSubPartitions(Map<String, Object> recordsPerPartition, Map<String, Object> filesPerPartition) {
+private Map<String, Long> computeSubPartitions(Map<String, Long> recordsPerPartition, Map<String, Long> filesPerPartition) {
Map<String, Long> subpartitionCountMap = new HashMap<>();
long totalRecords = 0;
long totalFiles = 0;
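
For context on the two hunks above: the Spark 1.x Java API declared countByKey() as returning Map<K, Object>, while Spark 2.x declares it as Map<K, Long>, so the Object-typed maps and the downstream computeSubPartitions signature can be properly typed. A minimal sketch of the Spark 2.x behavior (not from this commit; class name and sample partitions are illustrative):

```java
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

import java.util.Arrays;
import java.util.Map;

public class CountByKeyDemo {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("demo").setMaster("local[1]");
    try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
      JavaPairRDD<String, String> partitionRecordKeys = jsc.parallelizePairs(Arrays.asList(
          new Tuple2<>("2017/02/20", "key-1"),
          new Tuple2<>("2017/02/20", "key-2"),
          new Tuple2<>("2017/02/21", "key-3")));

      // Spark 2.x: values come back as Long, no cast from Object needed.
      Map<String, Long> recordsPerPartition = partitionRecordKeys.countByKey();
      recordsPerPartition.forEach((partition, count) ->
          System.out.println(partition + " has " + count + " records"));
    }
  }
}
```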
@@ -214,7 +214,7 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
return jsc.parallelize(partitions, Math.max(partitions.size(), 1))
.flatMapToPair(new PairFlatMapFunction<String, String, String>() {
@Override
-public Iterable<Tuple2<String, String>> call(String partitionPath) {
+public Iterator<Tuple2<String, String>> call(String partitionPath) {
FileSystem fs = FSUtils.getFs();
String latestCommitTime = metadata.getAllCommits().lastCommit();
FileStatus[] filteredStatus = metadata.getLatestVersionInPartition(fs, partitionPath, latestCommitTime);
@@ -222,7 +222,7 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
for (FileStatus fileStatus : filteredStatus) {
list.add(new Tuple2<>(partitionPath, fileStatus.getPath().getName()));
}
-return list;
+return list.iterator();
}
});
}
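
For context on this hunk and the similar ones below: Spark 2.0 changed the Java flat-map function family (FlatMapFunction, PairFlatMapFunction, and friends) to return an Iterator instead of an Iterable, so each call(...) now ends with ".iterator()". A minimal sketch of the Spark 2.x contract (not from this commit; class name and sample values are illustrative):

```java
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import scala.Tuple2;

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

public class FlatMapIteratorDemo {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("demo").setMaster("local[1]");
    try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
      JavaPairRDD<String, String> pairs = jsc
          .parallelize(Arrays.asList("2017/02/20", "2017/02/21"))
          .flatMapToPair(new PairFlatMapFunction<String, String, String>() {
            @Override
            public Iterator<Tuple2<String, String>> call(String partitionPath) {
              // Spark 2.x: build the list, then hand back its iterator.
              List<Tuple2<String, String>> out = Arrays.asList(
                  new Tuple2<>(partitionPath, "file-a"),
                  new Tuple2<>(partitionPath, "file-b"));
              return out.iterator();
            }
          });
      System.out.println(pairs.collect());
    }
  }
}
```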
@@ -261,8 +261,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
})
.flatMapToPair(new PairFlatMapFunction<List<Tuple2<String, String>>, String, String>() {
@Override
-public Iterable<Tuple2<String, String>> call(List<Tuple2<String, String>> exploded) throws Exception {
-return exploded;
+public Iterator<Tuple2<String, String>> call(List<Tuple2<String, String>> exploded) throws Exception {
+return exploded.iterator();
}
});
@@ -362,9 +362,9 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
.mapPartitionsWithIndex(new HoodieBloomIndexCheckFunction(config.getBasePath()), true)
.flatMap(new FlatMapFunction<List<IndexLookupResult>, IndexLookupResult>() {
@Override
-public Iterable<IndexLookupResult> call(List<IndexLookupResult> indexLookupResults)
+public Iterator<IndexLookupResult> call(List<IndexLookupResult> indexLookupResults)
throws Exception {
-return indexLookupResults;
+return indexLookupResults.iterator();
}
}).filter(new Function<IndexLookupResult, Boolean>() {
@Override
@@ -373,13 +373,13 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
}
}).flatMapToPair(new PairFlatMapFunction<IndexLookupResult, String, String>() {
@Override
-public Iterable<Tuple2<String, String>> call(IndexLookupResult lookupResult)
+public Iterator<Tuple2<String, String>> call(IndexLookupResult lookupResult)
throws Exception {
List<Tuple2<String, String>> vals = new ArrayList<>();
for (String recordKey : lookupResult.getMatchingRecordKeys()) {
vals.add(new Tuple2<>(recordKey, lookupResult.getFileName()));
}
-return vals;
+return vals.iterator();
}
});
}
@@ -399,9 +399,9 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
// Here as the recordRDD might have more data than rowKeyRDD (some rowKeys' fileId is null), so we do left outer join.
return rowKeyRecordPairRDD.leftOuterJoin(rowKeyFilenamePairRDD).values().map(
-new Function<Tuple2<HoodieRecord<T>, Optional<String>>, HoodieRecord<T>>() {
+new Function<Tuple2<HoodieRecord<T>, org.apache.spark.api.java.Optional<String>>, HoodieRecord<T>>() {
@Override
-public HoodieRecord<T> call(Tuple2<HoodieRecord<T>, Optional<String>> v1) throws Exception {
+public HoodieRecord<T> call(Tuple2<HoodieRecord<T>, org.apache.spark.api.java.Optional<String>> v1) throws Exception {
HoodieRecord<T> record = v1._1();
if (v1._2().isPresent()) {
String filename = v1._2().get();