
[HUDI-2502] Refactor index in hudi-client module (#3778)

- Refactor the index code to reduce lines of code and enable re-use across engines (the pattern is sketched below).
Y Ethan Guo, 2021-10-28 01:16:00 -07:00, committed by GitHub
parent e5b6b8602c, commit 0223c442ec
70 changed files with 2196 additions and 1567 deletions
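The shape of the refactor, as it appears in the hunks below: the engine-specific `FlinkHoodieBloomIndex` disappears in favor of a single `HoodieBloomIndex` that takes an engine-specific helper (`ListBasedHoodieBloomIndexHelper` for the List-backed Flink client). A minimal self-contained Java sketch of that pattern follows; all interface and method names not visible in the diff are illustrative stand-ins, not the actual Hudi API.

```java
import java.util.Collections;
import java.util.List;

// Sketch of "one index class + pluggable engine helper". Names other than
// HoodieBloomIndex/ListBasedHoodieBloomIndexHelper (seen in the diff) are hypothetical.
interface BloomIndexHelper {
  // Engine-dependent step: match record keys against candidate files.
  List<String> findMatchingFiles(List<String> recordKeys);
}

// List-backed helper, in the spirit of ListBasedHoodieBloomIndexHelper.
final class ListBasedBloomIndexHelper implements BloomIndexHelper {
  private static final ListBasedBloomIndexHelper INSTANCE = new ListBasedBloomIndexHelper();

  private ListBasedBloomIndexHelper() {
  }

  public static ListBasedBloomIndexHelper getInstance() {
    return INSTANCE;
  }

  @Override
  public List<String> findMatchingFiles(List<String> recordKeys) {
    // A real helper would consult per-file bloom filters and key ranges.
    return Collections.emptyList();
  }
}

// Engine-agnostic index: the shared lookup logic lives here exactly once,
// instead of being duplicated per engine in subclasses.
final class BloomIndexSketch {
  private final BloomIndexHelper helper;

  BloomIndexSketch(BloomIndexHelper helper) {
    this.helper = helper;
  }

  List<String> tagLocation(List<String> recordKeys) {
    return helper.findMatchingFiles(recordKeys);
  }
}
```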

File: org/apache/hudi/index/bloom/TestFlinkHoodieBloomIndex.java (hudi-flink-client tests)

@@ -21,6 +21,8 @@ package org.apache.hudi.index.bloom;
 import org.apache.hudi.common.bloom.BloomFilter;
 import org.apache.hudi.common.bloom.BloomFilterFactory;
 import org.apache.hudi.common.bloom.BloomFilterTypeCode;
+import org.apache.hudi.common.data.HoodieList;
+import org.apache.hudi.common.data.HoodieMapPair;
 import org.apache.hudi.common.model.HoodieKey;
 import org.apache.hudi.common.model.HoodieRecord;
 import org.apache.hudi.common.table.HoodieTableMetaClient;
@@ -100,7 +102,7 @@ public class TestFlinkHoodieBloomIndex extends HoodieFlinkClientTestHarness {
   @MethodSource("configParams")
   public void testLoadInvolvedFiles(boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking) throws Exception {
     HoodieWriteConfig config = makeConfig(rangePruning, treeFiltering, bucketizedChecking);
-    FlinkHoodieBloomIndex index = new FlinkHoodieBloomIndex(config);
+    HoodieBloomIndex index = new HoodieBloomIndex(config, ListBasedHoodieBloomIndexHelper.getInstance());
     HoodieTable hoodieTable = HoodieFlinkTable.create(config, context, metaClient, false);
     HoodieFlinkWriteableTestTable testTable = HoodieFlinkWriteableTestTable.of(hoodieTable, SCHEMA);
@@ -165,7 +167,7 @@ public class TestFlinkHoodieBloomIndex extends HoodieFlinkClientTestHarness {
   @MethodSource("configParams")
   public void testRangePruning(boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking) {
     HoodieWriteConfig config = makeConfig(rangePruning, treeFiltering, bucketizedChecking);
-    FlinkHoodieBloomIndex index = new FlinkHoodieBloomIndex(config);
+    HoodieBloomIndex index = new HoodieBloomIndex(config, ListBasedHoodieBloomIndexHelper.getInstance());
     final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo = new HashMap<>();
     partitionToFileIndexInfo.put("2017/10/22",
@@ -176,14 +178,14 @@ public class TestFlinkHoodieBloomIndex extends HoodieFlinkClientTestHarness {
     Map<String, List<String>> partitionRecordKeyMap = new HashMap<>();
     asList(Pair.of("2017/10/22", "003"), Pair.of("2017/10/22", "002"),
         Pair.of("2017/10/22", "005"), Pair.of("2017/10/22", "004"))
-            .forEach(t -> {
-              List<String> recordKeyList = partitionRecordKeyMap.getOrDefault(t.getLeft(), new ArrayList<>());
-              recordKeyList.add(t.getRight());
-              partitionRecordKeyMap.put(t.getLeft(), recordKeyList);
-            });
+        .forEach(t -> {
+          List<String> recordKeyList = partitionRecordKeyMap.getOrDefault(t.getLeft(), new ArrayList<>());
+          recordKeyList.add(t.getRight());
+          partitionRecordKeyMap.put(t.getLeft(), recordKeyList);
+        });
-    List<Pair<String, HoodieKey>> comparisonKeyList =
-        index.explodeRecordsWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyMap);
+    List<Pair<String, HoodieKey>> comparisonKeyList = HoodieList.getList(
+        index.explodeRecordsWithFileComparisons(partitionToFileIndexInfo, HoodieMapPair.of(partitionRecordKeyMap)));
     assertEquals(10, comparisonKeyList.size());
     java.util.Map<String, List<String>> recordKeyToFileComps = comparisonKeyList.stream()
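The `HoodieMapPair.of(...)` / `HoodieList.getList(...)` pair in the hunk above is the other half of the refactor: plain Java collections are wrapped in engine-neutral holders on the way into the shared index code and unwrapped on the way out, so the same `explodeRecordsWithFileComparisons` logic can also run over engine-native collections such as RDDs. A self-contained sketch of that wrapping idea, with hypothetical names wherever the diff does not show the real ones:

```java
import java.util.List;

// Minimal stand-in for an engine-neutral data holder in the spirit of
// org.apache.hudi.common.data.HoodieList; EngineData/ListData are hypothetical names.
abstract class EngineData<T> {
  // Materialize the data as a plain java.util.List.
  abstract List<T> collectAsList();
}

final class ListData<T> extends EngineData<T> {
  private final List<T> data;

  private ListData(List<T> data) {
    this.data = data;
  }

  // Wrap a plain List, mirroring HoodieList.of(records).
  static <T> ListData<T> of(List<T> data) {
    return new ListData<>(data);
  }

  // Unwrap back to a List, mirroring HoodieList.getList(hoodieData).
  static <T> List<T> getList(EngineData<T> holder) {
    return holder.collectAsList();
  }

  @Override
  List<T> collectAsList() {
    return data;
  }
}
```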
@@ -264,10 +266,10 @@ public class TestFlinkHoodieBloomIndex extends HoodieFlinkClientTestHarness {
     HoodieFlinkTable table = HoodieFlinkTable.create(config, context, metaClient);
     // Let's tag
-    FlinkHoodieBloomIndex bloomIndex = new FlinkHoodieBloomIndex(config);
+    HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, ListBasedHoodieBloomIndexHelper.getInstance());
     assertDoesNotThrow(() -> {
-      bloomIndex.tagLocation(records, context, table);
+      tagLocation(bloomIndex, records, table);
     }, "EmptyList should not result in IllegalArgumentException: Positive number of slices required");
   }
@@ -303,8 +305,8 @@ public class TestFlinkHoodieBloomIndex extends HoodieFlinkClientTestHarness {
     HoodieFlinkWriteableTestTable testTable = HoodieFlinkWriteableTestTable.of(hoodieTable, SCHEMA);
     // Let's tag
-    FlinkHoodieBloomIndex bloomIndex = new FlinkHoodieBloomIndex(config);
-    List<HoodieRecord> taggedRecords = bloomIndex.tagLocation(records, context, hoodieTable);
+    HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, ListBasedHoodieBloomIndexHelper.getInstance());
+    List<HoodieRecord> taggedRecords = tagLocation(bloomIndex, records, hoodieTable);
     // Should not find any files
     for (HoodieRecord record : taggedRecords) {
@@ -319,7 +321,7 @@ public class TestFlinkHoodieBloomIndex extends HoodieFlinkClientTestHarness {
     metaClient.reloadActiveTimeline();
     // We do the tag again
-    taggedRecords = bloomIndex.tagLocation(records, context, HoodieFlinkTable.create(config, context, metaClient));
+    taggedRecords = tagLocation(bloomIndex, records, HoodieFlinkTable.create(config, context, metaClient));
     // Check results
     for (HoodieRecord record : taggedRecords) {
@@ -370,10 +372,10 @@ public class TestFlinkHoodieBloomIndex extends HoodieFlinkClientTestHarness {
     HoodieFlinkWriteableTestTable testTable = HoodieFlinkWriteableTestTable.of(hoodieTable, SCHEMA);
     // Let's tag
-    FlinkHoodieBloomIndex bloomIndex = new FlinkHoodieBloomIndex(config);
+    HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, ListBasedHoodieBloomIndexHelper.getInstance());
     List<HoodieRecord> toTagRecords = new ArrayList<>();
     toTagRecords.add(new HoodieRecord(record4.getKey(), null));
-    List<HoodieRecord> taggedRecords = bloomIndex.tagLocation(toTagRecords, context, hoodieTable);
+    List<HoodieRecord> taggedRecords = tagLocation(bloomIndex, toTagRecords, hoodieTable);
     Map<HoodieKey, Option<Pair<String, String>>> recordLocations = new HashMap<>();
     for (HoodieRecord taggedRecord : taggedRecords) {
       recordLocations.put(taggedRecord.getKey(), taggedRecord.isCurrentLocationKnown()
@@ -398,7 +400,7 @@ public class TestFlinkHoodieBloomIndex extends HoodieFlinkClientTestHarness {
       taggedRecords.add(new HoodieRecord(key, null));
     }
-    taggedRecords = bloomIndex.tagLocation(toTagRecords1, context, hoodieTable);
+    taggedRecords = tagLocation(bloomIndex, toTagRecords1, hoodieTable);
     recordLocations.clear();
     for (HoodieRecord taggedRecord : taggedRecords) {
       recordLocations.put(taggedRecord.getKey(), taggedRecord.isCurrentLocationKnown()
@@ -452,8 +454,8 @@ public class TestFlinkHoodieBloomIndex extends HoodieFlinkClientTestHarness {
     metaClient = HoodieTableMetaClient.reload(metaClient);
     HoodieTable table = HoodieFlinkTable.create(config, context, metaClient);
-    FlinkHoodieBloomIndex bloomIndex = new FlinkHoodieBloomIndex(config);
-    List<HoodieRecord> taggedRecords = bloomIndex.tagLocation(records, context, table);
+    HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, ListBasedHoodieBloomIndexHelper.getInstance());
+    List<HoodieRecord> taggedRecords = tagLocation(bloomIndex, records, table);
     // Check results
     for (HoodieRecord record : taggedRecords) {

File: org/apache/hudi/testutils/HoodieFlinkClientTestHarness.java (hudi-flink-client tests)

@@ -21,6 +21,7 @@ package org.apache.hudi.testutils;
 import org.apache.hudi.client.FlinkTaskContextSupplier;
 import org.apache.hudi.client.HoodieFlinkWriteClient;
 import org.apache.hudi.client.common.HoodieFlinkEngineContext;
+import org.apache.hudi.common.data.HoodieList;
 import org.apache.hudi.common.fs.FSUtils;
 import org.apache.hudi.common.model.HoodieRecord;
 import org.apache.hudi.common.model.HoodieTableType;
@@ -29,7 +30,9 @@ import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
 import org.apache.hudi.common.testutils.HoodieCommonTestHarness;
 import org.apache.hudi.common.testutils.HoodieTestUtils;
 import org.apache.hudi.common.testutils.minicluster.HdfsTestService;
+import org.apache.hudi.index.HoodieIndex;
 import org.apache.hudi.index.bloom.TestFlinkHoodieBloomIndex;
+import org.apache.hudi.table.HoodieTable;
 import org.apache.hadoop.hdfs.DistributedFileSystem;
 import org.apache.hadoop.hdfs.MiniDFSCluster;
@@ -128,6 +131,10 @@ public class HoodieFlinkClientTestHarness extends HoodieCommonTestHarness implem
     metaClient = HoodieTestUtils.init(hadoopConf, basePath, tableType);
   }
+  protected List<HoodieRecord> tagLocation(
+      HoodieIndex index, List<HoodieRecord> records, HoodieTable table) {
+    return HoodieList.getList(index.tagLocation(HoodieList.of(records), context, table));
+  }
   /**
    * Cleanups file system.
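The `tagLocation` helper added in the last hunk is a thin bridge: existing List-based test assertions stay untouched while the refactored index API now takes and returns the wrapped form. A self-contained sketch of that bridge pattern, with every Hudi type replaced by a hypothetical stand-in:

```java
import java.util.List;

// Stand-alone sketch of the test-harness bridge: wrap the plain List, call the
// wrapper-based index API, unwrap the result. All names here are hypothetical.
public final class TagLocationBridgeSketch {

  // Stand-in for HoodieData/HoodieList.
  static final class Wrapped<T> {
    private final List<T> items;

    private Wrapped(List<T> items) {
      this.items = items;
    }

    static <T> Wrapped<T> of(List<T> items) {
      return new Wrapped<>(items);
    }

    static <T> List<T> getList(Wrapped<T> wrapped) {
      return wrapped.items;
    }
  }

  // Stand-in for HoodieIndex#tagLocation(HoodieData, context, table).
  interface Index<T> {
    Wrapped<T> tagLocation(Wrapped<T> records);
  }

  // Mirrors the added harness helper: old call sites keep passing plain Lists.
  static <T> List<T> tagLocation(Index<T> index, List<T> records) {
    return Wrapped.getList(index.tagLocation(Wrapped.of(records)));
  }

  public static void main(String[] args) {
    Index<String> identityIndex = records -> records; // trivial index for the demo
    System.out.println(tagLocation(identityIndex, List.of("a", "b"))); // prints [a, b]
  }
}
```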