1
0

Improving Tag location using interval trees for index files

Adding interface for index look up

Adding index filtering implementations for global bloom index too
This commit is contained in:
Sivabalan Narayanan
2018-11-28 11:00:02 -08:00
committed by vinoth chandar
parent 461ce18bd1
commit 7129dc5bb7
14 changed files with 953 additions and 187 deletions

View File

@@ -40,10 +40,7 @@ import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.table.HoodieTable;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.avro.Schema;
import org.apache.commons.io.IOUtils;
@@ -174,7 +171,7 @@ public class TestHoodieBloomIndex {
HoodieClientTestUtils
.writeParquetFile(basePath, "2015/03/12", "4_0_20150312101010.parquet",
Arrays.asList(record2, record3, record4), schema, null,
false);
false);
List<String> partitions = Arrays.asList("2016/01/21", "2016/04/01", "2015/03/12");
HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
@@ -211,31 +208,34 @@ public class TestHoodieBloomIndex {
@Test
public void testRangePruning() {
for (Boolean rangePruning : new boolean[]{false, true}) {
Map<String, String> props = new HashMap<>();
props.put("hoodie.bloom.index.prune.by" + ".ranges", rangePruning.toString());
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withProps(props).build();
HoodieBloomIndex index = new HoodieBloomIndex(config);
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
HoodieBloomIndex index = new HoodieBloomIndex(config);
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo = new HashMap<>();
partitionToFileIndexInfo.put("2017/10/22", Arrays.asList(new BloomIndexFileInfo("f1"),
new BloomIndexFileInfo("f2", "000", "000"), new BloomIndexFileInfo("f3", "001", "003"),
new BloomIndexFileInfo("f4", "002", "007"), new BloomIndexFileInfo("f5", "009", "010")));
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo = new HashMap<>();
partitionToFileIndexInfo.put("2017/10/22", Arrays.asList(new BloomIndexFileInfo("f1"),
new BloomIndexFileInfo("f2", "000", "000"), new BloomIndexFileInfo("f3", "001", "003"),
new BloomIndexFileInfo("f4", "002", "007"), new BloomIndexFileInfo("f5", "009", "010")));
JavaPairRDD<String, String> partitionRecordKeyPairRDD = jsc.parallelize(Arrays.asList(
new Tuple2<>("2017/10/22", "003"), new Tuple2<>("2017/10/22", "002"), new Tuple2<>("2017/10/22", "005"),
new Tuple2<>("2017/10/22", "004"))).mapToPair(t -> t);
JavaPairRDD<String, String> partitionRecordKeyPairRDD = jsc.parallelize(Arrays.asList(
new Tuple2<>("2017/10/22", "003"), new Tuple2<>("2017/10/22", "002"), new Tuple2<>("2017/10/22", "005"),
new Tuple2<>("2017/10/22", "004"))).mapToPair(t -> t);
List<Tuple2<String, Tuple2<String, HoodieKey>>> comparisonKeyList = index.explodeRecordRDDWithFileComparisons(
partitionToFileIndexInfo, partitionRecordKeyPairRDD).collect();
List<Tuple2<String, Tuple2<String, HoodieKey>>> comparisonKeyList = index.explodeRecordRDDWithFileComparisons(
partitionToFileIndexInfo, partitionRecordKeyPairRDD).collect();
assertEquals(10, comparisonKeyList.size());
Map<String, List<String>> recordKeyToFileComps = comparisonKeyList.stream().collect(Collectors.groupingBy(
t -> t._2()._2().getRecordKey(), Collectors.mapping(t -> t._2()._1().split("#")[0], Collectors.toList())));
assertEquals(10, comparisonKeyList.size());
Map<String, List<String>> recordKeyToFileComps = comparisonKeyList.stream().collect(Collectors.groupingBy(
t -> t._2()._2().getRecordKey(), Collectors.mapping(t -> t._2()._1().split("#")[0], Collectors.toList())));
assertEquals(4, recordKeyToFileComps.size());
assertEquals(Arrays.asList("f1", "f3", "f4"), recordKeyToFileComps.get("002"));
assertEquals(Arrays.asList("f1", "f3", "f4"), recordKeyToFileComps.get("003"));
assertEquals(Arrays.asList("f1", "f4"), recordKeyToFileComps.get("004"));
assertEquals(Arrays.asList("f1", "f4"), recordKeyToFileComps.get("005"));
assertEquals(4, recordKeyToFileComps.size());
assertEquals(new HashSet<>(Arrays.asList("f1", "f3", "f4")), new HashSet<>(recordKeyToFileComps.get("002")));
assertEquals(new HashSet<>(Arrays.asList("f1", "f3", "f4")), new HashSet<>(recordKeyToFileComps.get("003")));
assertEquals(new HashSet<>(Arrays.asList("f1", "f4")), new HashSet<>(recordKeyToFileComps.get("004")));
assertEquals(new HashSet<>(Arrays.asList("f1", "f4")), new HashSet<>(recordKeyToFileComps.get("005")));
}
}
@Test
@@ -315,68 +315,76 @@ public class TestHoodieBloomIndex {
@Test
public void testTagLocation() throws Exception {
// We have some records to be tagged (two different partitions)
for (Boolean rangePruning : new boolean[]{false, true}) {
Map<String, String> props = new HashMap<>();
props.put("hoodie.bloom.index.prune.by" + ".ranges", rangePruning.toString());
// We have some records to be tagged (two different partitions)
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
rowChange1);
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()),
rowChange2);
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()),
rowChange3);
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()),
rowChange4);
JavaRDD<HoodieRecord> recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4));
String rowKey1 = UUID.randomUUID().toString();
String rowKey2 = UUID.randomUUID().toString();
String rowKey3 = UUID.randomUUID().toString();
String rowKey4 = UUID.randomUUID().toString();
String recordStr1 = "{\"_row_key\":\"" + rowKey1 + "\","
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
String recordStr2 = "{\"_row_key\":\"" + rowKey2 + "\","
+ "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
String recordStr3 = "{\"_row_key\":\"" + rowKey3 + "\","
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
String recordStr4 = "{\"_row_key\":\"" + rowKey4 + "\","
+ "\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
rowChange1);
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()),
rowChange2);
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()),
rowChange3);
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()),
rowChange4);
JavaRDD<HoodieRecord> recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4));
// Also create the metadata and config
HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
HoodieTable table = HoodieTable.getHoodieTable(metadata, config, jsc);
// Also create the metadata and config
HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withProps(props).build();
HoodieTable table = HoodieTable.getHoodieTable(metadata, config, jsc);
// Let's tag
HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config);
JavaRDD<HoodieRecord> taggedRecordRDD = bloomIndex.tagLocation(recordRDD, jsc, table);
// Let's tag
HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config);
JavaRDD<HoodieRecord> taggedRecordRDD = bloomIndex.tagLocation(recordRDD, jsc, table);
// Should not find any files
for (HoodieRecord record : taggedRecordRDD.collect()) {
assertTrue(!record.isCurrentLocationKnown());
}
// Should not find any files
for (HoodieRecord record : taggedRecordRDD.collect()) {
assertFalse(record.isCurrentLocationKnown());
}
// We create three parquet file, each having one record. (two different partitions)
String filename1 =
HoodieClientTestUtils.writeParquetFile(basePath, "2016/01/31", Arrays.asList(record1), schema, null, true);
String filename2 =
HoodieClientTestUtils.writeParquetFile(basePath, "2016/01/31", Arrays.asList(record2), schema, null, true);
String filename3 =
HoodieClientTestUtils.writeParquetFile(basePath, "2015/01/31", Arrays.asList(record4), schema, null, true);
// We create three parquet file, each having one record. (two different partitions)
String filename1 =
HoodieClientTestUtils.writeParquetFile(basePath, "2016/01/31", Arrays.asList(record1), schema, null, true);
String filename2 =
HoodieClientTestUtils.writeParquetFile(basePath, "2016/01/31", Arrays.asList(record2), schema, null, true);
String filename3 =
HoodieClientTestUtils.writeParquetFile(basePath, "2015/01/31", Arrays.asList(record4), schema, null, true);
// We do the tag again
metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
table = HoodieTable.getHoodieTable(metadata, config, jsc);
// We do the tag again
metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
table = HoodieTable.getHoodieTable(metadata, config, jsc);
taggedRecordRDD = bloomIndex.tagLocation(recordRDD, jsc, table);
taggedRecordRDD = bloomIndex.tagLocation(recordRDD, jsc, table);
// Check results
for (HoodieRecord record : taggedRecordRDD.collect()) {
if (record.getRecordKey().equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")) {
assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename1)));
} else if (record.getRecordKey().equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")) {
assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename2)));
} else if (record.getRecordKey().equals("3eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) {
assertTrue(!record.isCurrentLocationKnown());
} else if (record.getRecordKey().equals("4eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) {
assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename3)));
// Check results
for (HoodieRecord record : taggedRecordRDD.collect()) {
if (record.getRecordKey().equals(rowKey1)) {
assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename1)));
} else if (record.getRecordKey().equals(rowKey2)) {
assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename2)));
} else if (record.getRecordKey().equals(rowKey3)) {
assertTrue(!record.isCurrentLocationKnown());
} else if (record.getRecordKey().equals(rowKey4)) {
assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename3)));
}
}
}
}

View File

@@ -211,10 +211,10 @@ public class TestHoodieGlobalBloomIndex {
t -> t._2()._2().getRecordKey(), Collectors.mapping(t -> t._2()._1().split("#")[0], Collectors.toList())));
assertEquals(4, recordKeyToFileComps.size());
assertEquals(Arrays.asList("f4", "f1", "f3"), recordKeyToFileComps.get("002"));
assertEquals(Arrays.asList("f4", "f1", "f3"), recordKeyToFileComps.get("003"));
assertEquals(Arrays.asList("f4", "f1"), recordKeyToFileComps.get("004"));
assertEquals(Arrays.asList("f4", "f1"), recordKeyToFileComps.get("005"));
assertEquals(new HashSet<>(Arrays.asList("f4", "f1", "f3")), new HashSet<>(recordKeyToFileComps.get("002")));
assertEquals(new HashSet<>(Arrays.asList("f4", "f1", "f3")), new HashSet<>(recordKeyToFileComps.get("003")));
assertEquals(new HashSet<>(Arrays.asList("f4", "f1")), new HashSet<>(recordKeyToFileComps.get("004")));
assertEquals(new HashSet<>(Arrays.asList("f4", "f1")), new HashSet<>(recordKeyToFileComps.get("005")));
}

View File

@@ -0,0 +1,176 @@
/*
* Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*
*/
package com.uber.hoodie.index.bloom;
import static junit.framework.TestCase.assertEquals;
import static junit.framework.TestCase.assertTrue;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Random;
import java.util.UUID;
import org.junit.Test;
/**
* Tests {@link KeyRangeLookupTree}
*/
public class TestKeyRangeLookupTree {
private static final Random RANDOM = new Random();
private KeyRangeLookupTree keyRangeLookupTree;
private Map<String, HashSet<String>> expectedMatches;
public TestKeyRangeLookupTree() {
keyRangeLookupTree = new KeyRangeLookupTree();
expectedMatches = new HashMap<>();
}
/**
* Tests for single node in the tree for different inputs.
*/
@Test
public void testFileGroupLookUpOneEntry() {
KeyRangeNode toInsert = new KeyRangeNode(Long.toString(300), Long.toString(450), UUID.randomUUID().toString());
updateExpectedMatchesToTest(toInsert);
keyRangeLookupTree.insert(toInsert);
testRangeOfInputs(290, 305);
testRangeOfInputs(390, 400);
testRangeOfInputs(445, 455);
testRangeOfInputs(600, 605);
}
/**
* Tests for many entries in the tree with same start value and different end values
*/
@Test
public void testFileGroupLookUpManyEntriesWithSameStartValue() {
String startKey = Long.toString(120);
long endKey = 250;
KeyRangeNode toInsert = new KeyRangeNode(startKey, Long.toString(endKey), UUID.randomUUID().toString());
updateExpectedMatchesToTest(toInsert);
keyRangeLookupTree.insert(toInsert);
for (int i = 0; i < 10; i++) {
endKey += 1 + RANDOM.nextInt(100);
toInsert = new KeyRangeNode(startKey, Long.toString(endKey), UUID.randomUUID().toString());
updateExpectedMatchesToTest(toInsert);
keyRangeLookupTree.insert(toInsert);
}
testRangeOfInputs(110, endKey + 5);
}
/**
* Tests for many duplicte entries in the tree
*/
@Test
public void testFileGroupLookUpManyDulicateEntries() {
KeyRangeNode toInsert = new KeyRangeNode(Long.toString(1200), Long.toString(2000), UUID.randomUUID().toString());
updateExpectedMatchesToTest(toInsert);
keyRangeLookupTree.insert(toInsert);
for (int i = 0; i < 10; i++) {
toInsert = new KeyRangeNode(Long.toString(1200), Long.toString(2000), UUID.randomUUID().toString());
updateExpectedMatchesToTest(toInsert);
keyRangeLookupTree.insert(toInsert);
}
testRangeOfInputs(1050, 1100);
testRangeOfInputs(1500, 1600);
testRangeOfInputs(1990, 2100);
}
// Tests helpers
/**
* Tests for curated entries in look up tree.
*/
@Test
public void testFileGroupLookUp() {
// testing with hand curated inputs
KeyRangeNode toInsert = new KeyRangeNode(Long.toString(500), Long.toString(600), UUID.randomUUID().toString());
updateExpectedMatchesToTest(toInsert);
keyRangeLookupTree.insert(toInsert);
toInsert = new KeyRangeNode(Long.toString(750), Long.toString(950), UUID.randomUUID().toString());
updateExpectedMatchesToTest(toInsert);
keyRangeLookupTree.insert(toInsert);
toInsert = new KeyRangeNode(Long.toString(120), Long.toString(620), UUID.randomUUID().toString());
updateExpectedMatchesToTest(toInsert);
keyRangeLookupTree.insert(toInsert);
toInsert = new KeyRangeNode(Long.toString(550), Long.toString(775), UUID.randomUUID().toString());
updateExpectedMatchesToTest(toInsert);
keyRangeLookupTree.insert(toInsert);
toInsert = new KeyRangeNode(Long.toString(725), Long.toString(850), UUID.randomUUID().toString());
updateExpectedMatchesToTest(toInsert);
keyRangeLookupTree.insert(toInsert);
toInsert = new KeyRangeNode(Long.toString(750), Long.toString(825), UUID.randomUUID().toString());
updateExpectedMatchesToTest(toInsert);
keyRangeLookupTree.insert(toInsert);
toInsert = new KeyRangeNode(Long.toString(750), Long.toString(990), UUID.randomUUID().toString());
updateExpectedMatchesToTest(toInsert);
keyRangeLookupTree.insert(toInsert);
toInsert = new KeyRangeNode(Long.toString(800), Long.toString(820), UUID.randomUUID().toString());
updateExpectedMatchesToTest(toInsert);
keyRangeLookupTree.insert(toInsert);
toInsert = new KeyRangeNode(Long.toString(200), Long.toString(550), UUID.randomUUID().toString());
updateExpectedMatchesToTest(toInsert);
keyRangeLookupTree.insert(toInsert);
toInsert = new KeyRangeNode(Long.toString(520), Long.toString(600), UUID.randomUUID().toString());
updateExpectedMatchesToTest(toInsert);
keyRangeLookupTree.insert(toInsert);
toInsert = new KeyRangeNode(Long.toString(120), Long.toString(620), UUID.randomUUID().toString());
updateExpectedMatchesToTest(toInsert);
keyRangeLookupTree.insert(toInsert);
testRangeOfInputs(110, 999);
}
/**
* Method to test the look up tree for different range of input keys.
*
* @param start starting value of the look up key
* @param end end value of the look up tree
*/
private void testRangeOfInputs(long start, long end) {
for (long i = start; i <= end; i++) {
String iStr = Long.toString(i);
if (!expectedMatches.containsKey(iStr)) {
assertEquals(Collections.EMPTY_SET, keyRangeLookupTree.getMatchingIndexFiles(iStr));
} else {
assertTrue(expectedMatches.get(iStr).equals(keyRangeLookupTree.getMatchingIndexFiles(iStr)));
}
}
}
/**
* Updates the expected matches for a given {@link KeyRangeNode}
*
* @param toInsert the {@link KeyRangeNode} to be inserted
*/
private void updateExpectedMatchesToTest(KeyRangeNode toInsert) {
long startKey = Long.parseLong(toInsert.getMinRecordKey());
long endKey = Long.parseLong(toInsert.getMaxRecordKey());
for (long i = startKey; i <= endKey; i++) {
String iStr = Long.toString(i);
if (!expectedMatches.containsKey(iStr)) {
expectedMatches.put(iStr, new HashSet<>());
}
expectedMatches.get(iStr).add(toInsert.getFileNameList().get(0));
}
}
}