Remove redundant string from file comp rdd
This commit is contained in:
committed by
vinoth chandar
parent
a7e6cf5197
commit
3fd2fd6e9d
@@ -19,12 +19,14 @@ package com.uber.hoodie.index.bloom;
|
|||||||
|
|
||||||
import com.google.common.annotations.VisibleForTesting;
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
import com.google.common.hash.Hashing;
|
import com.google.common.hash.Hashing;
|
||||||
|
import com.uber.hoodie.common.util.collection.Pair;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
|
||||||
import org.apache.log4j.LogManager;
|
import org.apache.log4j.LogManager;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.apache.spark.Partitioner;
|
import org.apache.spark.Partitioner;
|
||||||
@@ -139,11 +141,10 @@ public class BucketizedBloomCheckPartitioner extends Partitioner {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int getPartition(Object key) {
|
public int getPartition(Object key) {
|
||||||
String[] parts = ((String) key).split("#");
|
final Pair<String, String> parts = (Pair<String, String>) key;
|
||||||
String fileName = parts[0];
|
final long hashOfKey = Hashing.md5().hashString(parts.getRight(), StandardCharsets.UTF_8).asLong();
|
||||||
final long hashOfKey = Hashing.md5().hashString(parts[1], StandardCharsets.UTF_8).asLong();
|
final List<Integer> candidatePartitions = fileGroupToPartitions.get(parts.getLeft());
|
||||||
List<Integer> candidatePartitions = fileGroupToPartitions.get(fileName);
|
final int idx = (int) Math.floorMod(hashOfKey, candidatePartitions.size());
|
||||||
int idx = (int) Math.floorMod(hashOfKey, candidatePartitions.size());
|
|
||||||
assert idx >= 0;
|
assert idx >= 0;
|
||||||
return candidatePartitions.get(idx);
|
return candidatePartitions.get(idx);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -34,6 +34,7 @@ import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
|||||||
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
||||||
import com.uber.hoodie.common.util.FSUtils;
|
import com.uber.hoodie.common.util.FSUtils;
|
||||||
import com.uber.hoodie.common.util.ParquetUtils;
|
import com.uber.hoodie.common.util.ParquetUtils;
|
||||||
|
import com.uber.hoodie.common.util.collection.Pair;
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
import com.uber.hoodie.exception.MetadataNotFoundException;
|
import com.uber.hoodie.exception.MetadataNotFoundException;
|
||||||
import com.uber.hoodie.index.HoodieIndex;
|
import com.uber.hoodie.index.HoodieIndex;
|
||||||
@@ -42,9 +43,12 @@ import java.util.ArrayList;
|
|||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
import org.apache.log4j.LogManager;
|
import org.apache.log4j.LogManager;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
|
import org.apache.spark.Partitioner;
|
||||||
import org.apache.spark.api.java.JavaPairRDD;
|
import org.apache.spark.api.java.JavaPairRDD;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
@@ -171,7 +175,7 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
// we will just try exploding the input and then count to determine comparisons
|
// we will just try exploding the input and then count to determine comparisons
|
||||||
// FIX(vc): Only do sampling here and extrapolate?
|
// FIX(vc): Only do sampling here and extrapolate?
|
||||||
fileToComparisons = explodeRecordRDDWithFileComparisons(partitionToFileInfo,
|
fileToComparisons = explodeRecordRDDWithFileComparisons(partitionToFileInfo,
|
||||||
partitionRecordKeyPairRDD).mapToPair(t -> t._2()).countByKey();
|
partitionRecordKeyPairRDD).mapToPair(t -> t).countByKey();
|
||||||
} else {
|
} else {
|
||||||
fileToComparisons = new HashMap<>();
|
fileToComparisons = new HashMap<>();
|
||||||
partitionToFileInfo.entrySet().stream().forEach(e -> {
|
partitionToFileInfo.entrySet().stream().forEach(e -> {
|
||||||
@@ -290,8 +294,6 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* For each incoming record, produce N output records, 1 each for each file against which the record's key needs to be
|
* For each incoming record, produce N output records, 1 each for each file against which the record's key needs to be
|
||||||
* checked. For datasets, where the keys have a definite insert order (e.g: timestamp as prefix), the number of files
|
* checked. For datasets, where the keys have a definite insert order (e.g: timestamp as prefix), the number of files
|
||||||
@@ -301,24 +303,21 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
* recordKey ranges in the index info.
|
* recordKey ranges in the index info.
|
||||||
*/
|
*/
|
||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
JavaPairRDD<String, Tuple2<String, HoodieKey>> explodeRecordRDDWithFileComparisons(
|
JavaRDD<Tuple2<String, HoodieKey>> explodeRecordRDDWithFileComparisons(
|
||||||
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
|
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
|
||||||
JavaPairRDD<String, String> partitionRecordKeyPairRDD) {
|
JavaPairRDD<String, String> partitionRecordKeyPairRDD) {
|
||||||
IndexFileFilter indexFileFilter = config.useBloomIndexTreebasedFilter()
|
IndexFileFilter indexFileFilter = config.useBloomIndexTreebasedFilter()
|
||||||
? new IntervalTreeBasedIndexFileFilter(partitionToFileIndexInfo)
|
? new IntervalTreeBasedIndexFileFilter(partitionToFileIndexInfo)
|
||||||
: new ListBasedIndexFileFilter(partitionToFileIndexInfo);
|
: new ListBasedIndexFileFilter(partitionToFileIndexInfo);
|
||||||
|
|
||||||
return partitionRecordKeyPairRDD.map(partitionRecordKeyPair -> {
|
return partitionRecordKeyPairRDD.map(partitionRecordKeyPair -> {
|
||||||
String recordKey = partitionRecordKeyPair._2();
|
String recordKey = partitionRecordKeyPair._2();
|
||||||
String partitionPath = partitionRecordKeyPair._1();
|
String partitionPath = partitionRecordKeyPair._1();
|
||||||
List<Tuple2<String, Tuple2<String, HoodieKey>>> recordComparisons = new ArrayList<>();
|
|
||||||
indexFileFilter.getMatchingFiles(partitionPath, recordKey).forEach(matchingFile -> {
|
return indexFileFilter.getMatchingFiles(partitionPath, recordKey).stream()
|
||||||
recordComparisons.add(
|
.map(matchingFile -> new Tuple2<>(matchingFile, new HoodieKey(recordKey, partitionPath)))
|
||||||
new Tuple2<>(String.format("%s#%s", matchingFile, recordKey),
|
.collect(Collectors.toList());
|
||||||
new Tuple2<>(matchingFile,
|
}).flatMap(List::iterator);
|
||||||
new HoodieKey(recordKey, partitionPath))));
|
|
||||||
});
|
|
||||||
return recordComparisons;
|
|
||||||
}).flatMapToPair(List::iterator);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -332,28 +331,32 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
|
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
|
||||||
JavaPairRDD<String, String> partitionRecordKeyPairRDD, int shuffleParallelism, HoodieTableMetaClient metaClient,
|
JavaPairRDD<String, String> partitionRecordKeyPairRDD, int shuffleParallelism, HoodieTableMetaClient metaClient,
|
||||||
Map<String, Long> fileGroupToComparisons) {
|
Map<String, Long> fileGroupToComparisons) {
|
||||||
JavaPairRDD<String, Tuple2<String, HoodieKey>> fileSortedTripletRDD =
|
JavaRDD<Tuple2<String, HoodieKey>> fileComparisonsRDD =
|
||||||
explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD);
|
explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD);
|
||||||
|
|
||||||
if (config.useBloomIndexBucketizedChecking()) {
|
if (config.useBloomIndexBucketizedChecking()) {
|
||||||
BucketizedBloomCheckPartitioner partitioner = new BucketizedBloomCheckPartitioner(shuffleParallelism,
|
Partitioner partitioner = new BucketizedBloomCheckPartitioner(
|
||||||
fileGroupToComparisons, config.getBloomIndexKeysPerBucket());
|
shuffleParallelism,
|
||||||
fileSortedTripletRDD = fileSortedTripletRDD.repartitionAndSortWithinPartitions(partitioner);
|
fileGroupToComparisons,
|
||||||
|
config.getBloomIndexKeysPerBucket()
|
||||||
|
);
|
||||||
|
|
||||||
|
fileComparisonsRDD = fileComparisonsRDD
|
||||||
|
.mapToPair(t -> new Tuple2<>(Pair.of(t._1, t._2.getRecordKey()), t))
|
||||||
|
.repartitionAndSortWithinPartitions(partitioner)
|
||||||
|
.map(Tuple2::_2);
|
||||||
} else {
|
} else {
|
||||||
// sort further based on filename, such that all checking for the file can happen within
|
fileComparisonsRDD = fileComparisonsRDD.sortBy(Tuple2::_1, true, shuffleParallelism);
|
||||||
// a single partition, on-the-fly
|
|
||||||
fileSortedTripletRDD = fileSortedTripletRDD.sortByKey(true, shuffleParallelism);
|
|
||||||
}
|
}
|
||||||
return fileSortedTripletRDD.mapPartitionsWithIndex(
|
|
||||||
new HoodieBloomIndexCheckFunction(metaClient, config.getBasePath()), true)
|
return fileComparisonsRDD
|
||||||
|
.mapPartitionsWithIndex(new HoodieBloomIndexCheckFunction(metaClient, config.getBasePath()), true)
|
||||||
.flatMap(List::iterator)
|
.flatMap(List::iterator)
|
||||||
.filter(lookupResult -> lookupResult.getMatchingRecordKeys().size() > 0)
|
.filter(lr -> lr.getMatchingRecordKeys().size() > 0)
|
||||||
.flatMapToPair(lookupResult -> {
|
.flatMapToPair(lookupResult -> lookupResult.getMatchingRecordKeys().stream()
|
||||||
List<Tuple2<String, String>> vals = new ArrayList<>();
|
.map(recordKey -> new Tuple2<>(recordKey, lookupResult.getFileName()))
|
||||||
for (String recordKey : lookupResult.getMatchingRecordKeys()) {
|
.collect(Collectors.toList())
|
||||||
vals.add(new Tuple2<>(recordKey, lookupResult.getFileName()));
|
.iterator());
|
||||||
}
|
|
||||||
return vals.iterator();
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -43,7 +43,7 @@ import scala.Tuple2;
|
|||||||
* actual files
|
* actual files
|
||||||
*/
|
*/
|
||||||
public class HoodieBloomIndexCheckFunction implements
|
public class HoodieBloomIndexCheckFunction implements
|
||||||
Function2<Integer, Iterator<Tuple2<String, Tuple2<String, HoodieKey>>>,
|
Function2<Integer, Iterator<Tuple2<String, HoodieKey>>,
|
||||||
Iterator<List<KeyLookupResult>>> {
|
Iterator<List<KeyLookupResult>>> {
|
||||||
|
|
||||||
private static Logger logger = LogManager.getLogger(HoodieBloomIndexCheckFunction.class);
|
private static Logger logger = LogManager.getLogger(HoodieBloomIndexCheckFunction.class);
|
||||||
@@ -84,13 +84,13 @@ public class HoodieBloomIndexCheckFunction implements
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Iterator<List<KeyLookupResult>> call(Integer partition,
|
public Iterator<List<KeyLookupResult>> call(Integer partition,
|
||||||
Iterator<Tuple2<String, Tuple2<String, HoodieKey>>> fileParitionRecordKeyTripletItr)
|
Iterator<Tuple2<String, HoodieKey>> fileParitionRecordKeyTripletItr)
|
||||||
throws Exception {
|
throws Exception {
|
||||||
return new LazyKeyCheckIterator(fileParitionRecordKeyTripletItr);
|
return new LazyKeyCheckIterator(fileParitionRecordKeyTripletItr);
|
||||||
}
|
}
|
||||||
|
|
||||||
class LazyKeyCheckIterator extends
|
class LazyKeyCheckIterator extends
|
||||||
LazyIterableIterator<Tuple2<String, Tuple2<String, HoodieKey>>, List<KeyLookupResult>> {
|
LazyIterableIterator<Tuple2<String, HoodieKey>, List<KeyLookupResult>> {
|
||||||
|
|
||||||
private List<String> candidateRecordKeys;
|
private List<String> candidateRecordKeys;
|
||||||
|
|
||||||
@@ -103,7 +103,7 @@ public class HoodieBloomIndexCheckFunction implements
|
|||||||
private long totalKeysChecked;
|
private long totalKeysChecked;
|
||||||
|
|
||||||
LazyKeyCheckIterator(
|
LazyKeyCheckIterator(
|
||||||
Iterator<Tuple2<String, Tuple2<String, HoodieKey>>> filePartitionRecordKeyTripletItr) {
|
Iterator<Tuple2<String, HoodieKey>> filePartitionRecordKeyTripletItr) {
|
||||||
super(filePartitionRecordKeyTripletItr);
|
super(filePartitionRecordKeyTripletItr);
|
||||||
currentFile = null;
|
currentFile = null;
|
||||||
candidateRecordKeys = new ArrayList<>();
|
candidateRecordKeys = new ArrayList<>();
|
||||||
@@ -162,10 +162,10 @@ public class HoodieBloomIndexCheckFunction implements
|
|||||||
try {
|
try {
|
||||||
// process one file in each go.
|
// process one file in each go.
|
||||||
while (inputItr.hasNext()) {
|
while (inputItr.hasNext()) {
|
||||||
Tuple2<String, Tuple2<String, HoodieKey>> currentTuple = inputItr.next();
|
Tuple2<String, HoodieKey> currentTuple = inputItr.next();
|
||||||
String fileName = currentTuple._2._1;
|
String fileName = currentTuple._1;
|
||||||
String partitionPath = currentTuple._2._2.getPartitionPath();
|
String partitionPath = currentTuple._2.getPartitionPath();
|
||||||
String recordKey = currentTuple._2._2.getRecordKey();
|
String recordKey = currentTuple._2.getRecordKey();
|
||||||
|
|
||||||
// lazily init state
|
// lazily init state
|
||||||
if (currentFile == null) {
|
if (currentFile == null) {
|
||||||
|
|||||||
@@ -27,12 +27,14 @@ import com.uber.hoodie.config.HoodieWriteConfig;
|
|||||||
import com.uber.hoodie.exception.HoodieIOException;
|
import com.uber.hoodie.exception.HoodieIOException;
|
||||||
import com.uber.hoodie.table.HoodieTable;
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Map.Entry;
|
import java.util.Map.Entry;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.spark.api.java.JavaPairRDD;
|
import org.apache.spark.api.java.JavaPairRDD;
|
||||||
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
@@ -76,7 +78,7 @@ public class HoodieGlobalBloomIndex<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
JavaPairRDD<String, Tuple2<String, HoodieKey>> explodeRecordRDDWithFileComparisons(
|
JavaRDD<Tuple2<String, HoodieKey>> explodeRecordRDDWithFileComparisons(
|
||||||
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
|
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
|
||||||
JavaPairRDD<String, String> partitionRecordKeyPairRDD) {
|
JavaPairRDD<String, String> partitionRecordKeyPairRDD) {
|
||||||
Map<String, String> indexToPartitionMap = new HashMap<>();
|
Map<String, String> indexToPartitionMap = new HashMap<>();
|
||||||
@@ -87,17 +89,14 @@ public class HoodieGlobalBloomIndex<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
IndexFileFilter indexFileFilter = config.getBloomIndexPruneByRanges()
|
IndexFileFilter indexFileFilter = config.getBloomIndexPruneByRanges()
|
||||||
? new IntervalTreeBasedGlobalIndexFileFilter(partitionToFileIndexInfo)
|
? new IntervalTreeBasedGlobalIndexFileFilter(partitionToFileIndexInfo)
|
||||||
: new ListBasedGlobalIndexFileFilter(partitionToFileIndexInfo);
|
: new ListBasedGlobalIndexFileFilter(partitionToFileIndexInfo);
|
||||||
|
|
||||||
return partitionRecordKeyPairRDD.map(partitionRecordKeyPair -> {
|
return partitionRecordKeyPairRDD.map(partitionRecordKeyPair -> {
|
||||||
String recordKey = partitionRecordKeyPair._2();
|
String recordKey = partitionRecordKeyPair._2();
|
||||||
String partitionPath = partitionRecordKeyPair._1();
|
String partitionPath = partitionRecordKeyPair._1();
|
||||||
List<Tuple2<String, Tuple2<String, HoodieKey>>> recordComparisons = new ArrayList<>();
|
|
||||||
indexFileFilter.getMatchingFiles(partitionPath, recordKey).forEach(matchingFile -> {
|
return indexFileFilter.getMatchingFiles(partitionPath, recordKey).stream()
|
||||||
recordComparisons.add(
|
.map(file -> new Tuple2<>(file, new HoodieKey(recordKey, indexToPartitionMap.get(file))))
|
||||||
new Tuple2<>(String.format("%s#%s", matchingFile, recordKey),
|
.collect(Collectors.toList());
|
||||||
new Tuple2<>(matchingFile,
|
}).flatMap(List::iterator);
|
||||||
new HoodieKey(recordKey, indexToPartitionMap.get(matchingFile)))));
|
|
||||||
});
|
|
||||||
return recordComparisons;
|
|
||||||
}).flatMapToPair(List::iterator);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -259,12 +259,12 @@ public class TestHoodieBloomIndex {
|
|||||||
new Tuple2<>("2017/10/22", "003"), new Tuple2<>("2017/10/22", "002"), new Tuple2<>("2017/10/22", "005"),
|
new Tuple2<>("2017/10/22", "003"), new Tuple2<>("2017/10/22", "002"), new Tuple2<>("2017/10/22", "005"),
|
||||||
new Tuple2<>("2017/10/22", "004"))).mapToPair(t -> t);
|
new Tuple2<>("2017/10/22", "004"))).mapToPair(t -> t);
|
||||||
|
|
||||||
List<Tuple2<String, Tuple2<String, HoodieKey>>> comparisonKeyList = index.explodeRecordRDDWithFileComparisons(
|
List<Tuple2<String, HoodieKey>> comparisonKeyList = index.explodeRecordRDDWithFileComparisons(
|
||||||
partitionToFileIndexInfo, partitionRecordKeyPairRDD).collect();
|
partitionToFileIndexInfo, partitionRecordKeyPairRDD).collect();
|
||||||
|
|
||||||
assertEquals(10, comparisonKeyList.size());
|
assertEquals(10, comparisonKeyList.size());
|
||||||
Map<String, List<String>> recordKeyToFileComps = comparisonKeyList.stream().collect(Collectors.groupingBy(
|
Map<String, List<String>> recordKeyToFileComps = comparisonKeyList.stream().collect(Collectors.groupingBy(
|
||||||
t -> t._2()._2().getRecordKey(), Collectors.mapping(t -> t._2()._1().split("#")[0], Collectors.toList())));
|
t -> t._2.getRecordKey(), Collectors.mapping(t -> t._1, Collectors.toList())));
|
||||||
|
|
||||||
assertEquals(4, recordKeyToFileComps.size());
|
assertEquals(4, recordKeyToFileComps.size());
|
||||||
assertEquals(new HashSet<>(Arrays.asList("f1", "f3", "f4")), new HashSet<>(recordKeyToFileComps.get("002")));
|
assertEquals(new HashSet<>(Arrays.asList("f1", "f3", "f4")), new HashSet<>(recordKeyToFileComps.get("002")));
|
||||||
|
|||||||
@@ -190,25 +190,25 @@ public class TestHoodieGlobalBloomIndex {
|
|||||||
new Tuple2<>("2017/10/21", "003"), new Tuple2<>("2017/10/22", "002"), new Tuple2<>("2017/10/22", "005"),
|
new Tuple2<>("2017/10/21", "003"), new Tuple2<>("2017/10/22", "002"), new Tuple2<>("2017/10/22", "005"),
|
||||||
new Tuple2<>("2017/10/23", "004"))).mapToPair(t -> t);
|
new Tuple2<>("2017/10/23", "004"))).mapToPair(t -> t);
|
||||||
|
|
||||||
List<Tuple2<String, Tuple2<String, HoodieKey>>> comparisonKeyList = index.explodeRecordRDDWithFileComparisons(
|
List<Tuple2<String, HoodieKey>> comparisonKeyList =
|
||||||
partitionToFileIndexInfo, partitionRecordKeyPairRDD).collect();
|
index.explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD).collect();
|
||||||
|
|
||||||
/* epecting:
|
/* expecting:
|
||||||
f4#003, f4, HoodieKey { recordKey=003 partitionPath=2017/10/23}
|
f4, HoodieKey { recordKey=003 partitionPath=2017/10/23}
|
||||||
f1#003, f1, HoodieKey { recordKey=003 partitionPath=2017/10/22}
|
f1, HoodieKey { recordKey=003 partitionPath=2017/10/22}
|
||||||
f3#003, f3, HoodieKey { recordKey=003 partitionPath=2017/10/22}
|
f3, HoodieKey { recordKey=003 partitionPath=2017/10/22}
|
||||||
f4#002, f4, HoodieKey { recordKey=002 partitionPath=2017/10/23}
|
f4, HoodieKey { recordKey=002 partitionPath=2017/10/23}
|
||||||
f1#002, f1, HoodieKey { recordKey=002 partitionPath=2017/10/22}
|
f1, HoodieKey { recordKey=002 partitionPath=2017/10/22}
|
||||||
f3#002, f3, HoodieKey { recordKey=002 partitionPath=2017/10/22}
|
f3, HoodieKey { recordKey=002 partitionPath=2017/10/22}
|
||||||
f4#005, f4, HoodieKey { recordKey=005 partitionPath=2017/10/23}
|
f4, HoodieKey { recordKey=005 partitionPath=2017/10/23}
|
||||||
f1#005, f1, HoodieKey { recordKey=005 partitionPath=2017/10/22}
|
f1, HoodieKey { recordKey=005 partitionPath=2017/10/22}
|
||||||
f4#004, f4, HoodieKey { recordKey=004 partitionPath=2017/10/23}
|
f4, HoodieKey { recordKey=004 partitionPath=2017/10/23}
|
||||||
f1#004, f1, HoodieKey { recordKey=004 partitionPath=2017/10/22}
|
f1, HoodieKey { recordKey=004 partitionPath=2017/10/22}
|
||||||
*/
|
*/
|
||||||
assertEquals(10, comparisonKeyList.size());
|
assertEquals(10, comparisonKeyList.size());
|
||||||
|
|
||||||
Map<String, List<String>> recordKeyToFileComps = comparisonKeyList.stream().collect(Collectors.groupingBy(
|
Map<String, List<String>> recordKeyToFileComps = comparisonKeyList.stream()
|
||||||
t -> t._2()._2().getRecordKey(), Collectors.mapping(t -> t._2()._1().split("#")[0], Collectors.toList())));
|
.collect(Collectors.groupingBy(t -> t._2.getRecordKey(), Collectors.mapping(Tuple2::_1, Collectors.toList())));
|
||||||
|
|
||||||
assertEquals(4, recordKeyToFileComps.size());
|
assertEquals(4, recordKeyToFileComps.size());
|
||||||
assertEquals(new HashSet<>(Arrays.asList("f4", "f1", "f3")), new HashSet<>(recordKeyToFileComps.get("002")));
|
assertEquals(new HashSet<>(Arrays.asList("f4", "f1", "f3")), new HashSet<>(recordKeyToFileComps.get("002")));
|
||||||
|
|||||||
Reference in New Issue
Block a user