[HUDI-3760] Adding capability to fetch Metadata Records by prefix (#5208)
- Adding capability to fetch Metadata Records by key prefix so that Data Skipping can fetch only the Column Stats Index records pertaining to the columns being queried, instead of reading out the whole Index. - Fixed usages of HFileScanner in HFileReader: a few code paths use the cached scanner if available; other code paths use their own HFileScanner with positional read. Brief change log: - Rebasing ColumnStatsIndexSupport to rely on HoodieBackedTableMetadata in lieu of reading through Spark DS - Adding methods enabling key-prefix lookups to HoodieFileReader, HoodieHFileReader - Wiring key-prefix lookup through LogRecordScanner impls - Cleaning up HoodieHFileReader impl Co-authored-by: sivabalan <n.siva.b@gmail.com> Co-authored-by: Sagar Sumit <sagarsumit09@gmail.com>
This commit is contained in:
@@ -18,17 +18,6 @@
|
||||
|
||||
package org.apache.hudi.io.storage;
|
||||
|
||||
import org.apache.hudi.common.bootstrap.index.HFileBootstrapIndex;
|
||||
import org.apache.hudi.common.engine.TaskContextSupplier;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.EmptyHoodieRecordPayload;
|
||||
import org.apache.hudi.common.model.HoodieAvroRecord;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.util.FileIOUtils;
|
||||
import org.apache.hudi.config.HoodieIndexConfig;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.generic.GenericData;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
@@ -39,7 +28,17 @@ import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.hbase.CellComparatorImpl;
|
||||
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
|
||||
import org.apache.hadoop.hbase.io.hfile.HFile;
|
||||
import org.apache.hadoop.hbase.util.Pair;
|
||||
import org.apache.hudi.common.bootstrap.index.HFileBootstrapIndex;
|
||||
import org.apache.hudi.common.engine.TaskContextSupplier;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.EmptyHoodieRecordPayload;
|
||||
import org.apache.hudi.common.model.HoodieAvroRecord;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.util.FileIOUtils;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.config.HoodieIndexConfig;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.params.ParameterizedTest;
|
||||
import org.junit.jupiter.params.provider.Arguments;
|
||||
@@ -51,21 +50,25 @@ import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.Spliterator;
|
||||
import java.util.Spliterators;
|
||||
import java.util.TreeMap;
|
||||
import java.util.function.Supplier;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.IntStream;
|
||||
import java.util.stream.Stream;
|
||||
import java.util.stream.StreamSupport;
|
||||
|
||||
import static org.apache.hudi.common.testutils.FileSystemTestUtils.RANDOM;
|
||||
import static org.apache.hudi.common.testutils.SchemaTestUtil.getSchemaFromResource;
|
||||
import static org.apache.hudi.common.util.CollectionUtils.toStream;
|
||||
import static org.apache.hudi.io.storage.HoodieHFileConfig.HFILE_COMPARATOR;
|
||||
import static org.apache.hudi.io.storage.HoodieHFileReader.KEY_SCHEMA;
|
||||
import static org.apache.hudi.io.storage.HoodieHFileReader.SCHEMA_KEY;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
import static org.junit.jupiter.api.Assertions.assertNull;
|
||||
@@ -124,7 +127,7 @@ public class TestHoodieHFileReaderWriter extends TestHoodieReaderWriterBase {
|
||||
FileSystem fs = getFilePath().getFileSystem(conf);
|
||||
HFile.Reader hfileReader = HoodieHFileUtils.createHFileReader(fs, getFilePath(), new CacheConfig(conf), conf);
|
||||
assertEquals(getSchemaFromResource(TestHoodieHFileReaderWriter.class, schemaPath),
|
||||
new Schema.Parser().parse(new String(hfileReader.getHFileInfo().get(KEY_SCHEMA.getBytes()))));
|
||||
new Schema.Parser().parse(new String(hfileReader.getHFileInfo().get(SCHEMA_KEY.getBytes()))));
|
||||
}
|
||||
|
||||
private static Stream<Arguments> populateMetaFieldsAndTestAvroWithMeta() {
|
||||
@@ -142,7 +145,7 @@ public class TestHoodieHFileReaderWriter extends TestHoodieReaderWriterBase {
|
||||
Schema avroSchema = getSchemaFromResource(TestHoodieOrcReaderWriter.class, "/exampleSchemaWithMetaFields.avsc");
|
||||
HoodieFileWriter<GenericRecord> writer = createWriter(avroSchema, populateMetaFields);
|
||||
List<String> keys = new ArrayList<>();
|
||||
Map<String, GenericRecord> recordMap = new HashMap<>();
|
||||
Map<String, GenericRecord> recordMap = new TreeMap<>();
|
||||
for (int i = 0; i < 100; i++) {
|
||||
GenericRecord record = new GenericData.Record(avroSchema);
|
||||
String key = String.format("%s%04d", "key", i);
|
||||
@@ -163,24 +166,30 @@ public class TestHoodieHFileReaderWriter extends TestHoodieReaderWriterBase {
|
||||
|
||||
Configuration conf = new Configuration();
|
||||
HoodieHFileReader hoodieHFileReader = (HoodieHFileReader) createReader(conf);
|
||||
List<Pair<String, IndexedRecord>> records = hoodieHFileReader.readAllRecords();
|
||||
records.forEach(entry -> assertEquals(entry.getSecond(), recordMap.get(entry.getFirst())));
|
||||
List<IndexedRecord> records = HoodieHFileReader.readAllRecords(hoodieHFileReader);
|
||||
assertEquals(new ArrayList<>(recordMap.values()), records);
|
||||
|
||||
hoodieHFileReader.close();
|
||||
|
||||
for (int i = 0; i < 2; i++) {
|
||||
int randomRowstoFetch = 5 + RANDOM.nextInt(10);
|
||||
Set<String> rowsToFetch = getRandomKeys(randomRowstoFetch, keys);
|
||||
|
||||
List<String> rowsList = new ArrayList<>(rowsToFetch);
|
||||
Collections.sort(rowsList);
|
||||
hoodieHFileReader = (HoodieHFileReader) createReader(conf);
|
||||
List<Pair<String, GenericRecord>> result = hoodieHFileReader.readRecords(rowsList);
|
||||
assertEquals(result.size(), randomRowstoFetch);
|
||||
|
||||
List<GenericRecord> expectedRecords = rowsList.stream().map(recordMap::get).collect(Collectors.toList());
|
||||
|
||||
hoodieHFileReader = (HoodieHFileReader<GenericRecord>) createReader(conf);
|
||||
List<GenericRecord> result = HoodieHFileReader.readRecords(hoodieHFileReader, rowsList);
|
||||
|
||||
assertEquals(expectedRecords, result);
|
||||
|
||||
result.forEach(entry -> {
|
||||
assertEquals(entry.getSecond(), recordMap.get(entry.getFirst()));
|
||||
if (populateMetaFields && testAvroWithMeta) {
|
||||
assertNotNull(entry.getSecond().get(HoodieRecord.RECORD_KEY_METADATA_FIELD));
|
||||
assertNotNull(entry.get(HoodieRecord.RECORD_KEY_METADATA_FIELD));
|
||||
} else {
|
||||
assertNull(entry.getSecond().get(HoodieRecord.RECORD_KEY_METADATA_FIELD));
|
||||
assertNull(entry.get(HoodieRecord.RECORD_KEY_METADATA_FIELD));
|
||||
}
|
||||
});
|
||||
hoodieHFileReader.close();
|
||||
@@ -202,7 +211,7 @@ public class TestHoodieHFileReaderWriter extends TestHoodieReaderWriterBase {
|
||||
fs.open(getFilePath()), (int) fs.getFileStatus(getFilePath()).getLen());
|
||||
// Reading byte array in HFile format, without actual file path
|
||||
HoodieHFileReader<GenericRecord> hfileReader =
|
||||
new HoodieHFileReader<>(fs, new Path(DUMMY_BASE_PATH), content);
|
||||
new HoodieHFileReader<>(fs, new Path(DUMMY_BASE_PATH), content, Option.empty());
|
||||
Schema avroSchema = getSchemaFromResource(TestHoodieReaderWriterBase.class, "/exampleSchema.avsc");
|
||||
assertEquals(NUM_RECORDS, hfileReader.getTotalRecords());
|
||||
verifySimpleRecords(hfileReader.getRecordIterator(avroSchema));
|
||||
@@ -217,7 +226,7 @@ public class TestHoodieHFileReaderWriter extends TestHoodieReaderWriterBase {
|
||||
IntStream.concat(IntStream.range(40, NUM_RECORDS * 2), IntStream.range(10, 20))
|
||||
.mapToObj(i -> "key" + String.format("%02d", i)).collect(Collectors.toList());
|
||||
Schema avroSchema = getSchemaFromResource(TestHoodieReaderWriterBase.class, "/exampleSchema.avsc");
|
||||
Iterator<GenericRecord> iterator = hfileReader.getRecordIterator(keys, avroSchema);
|
||||
Iterator<GenericRecord> iterator = hfileReader.getRecordsByKeysIterator(keys, avroSchema);
|
||||
|
||||
List<Integer> expectedIds =
|
||||
IntStream.concat(IntStream.range(40, NUM_RECORDS), IntStream.range(10, 20))
|
||||
@@ -233,6 +242,59 @@ public class TestHoodieHFileReaderWriter extends TestHoodieReaderWriterBase {
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReaderGetRecordIteratorByKeyPrefixes() throws Exception {
|
||||
writeFileWithSimpleSchema();
|
||||
HoodieHFileReader<GenericRecord> hfileReader =
|
||||
(HoodieHFileReader<GenericRecord>) createReader(new Configuration());
|
||||
|
||||
Schema avroSchema = getSchemaFromResource(TestHoodieReaderWriterBase.class, "/exampleSchema.avsc");
|
||||
|
||||
List<String> keyPrefixes = Collections.singletonList("key");
|
||||
Iterator<GenericRecord> iterator =
|
||||
hfileReader.getRecordsByKeyPrefixIterator(keyPrefixes, avroSchema);
|
||||
|
||||
List<GenericRecord> recordsByPrefix = toStream(iterator).collect(Collectors.toList());
|
||||
|
||||
List<GenericRecord> allRecords = toStream(hfileReader.getRecordIterator()).collect(Collectors.toList());
|
||||
|
||||
assertEquals(allRecords, recordsByPrefix);
|
||||
|
||||
// filter for "key1" : entries from key10 to key19 should be matched
|
||||
List<GenericRecord> expectedKey1s = allRecords.stream().filter(entry -> (entry.get("_row_key").toString()).contains("key1")).collect(Collectors.toList());
|
||||
iterator =
|
||||
hfileReader.getRecordsByKeyPrefixIterator(Collections.singletonList("key1"), avroSchema);
|
||||
recordsByPrefix =
|
||||
StreamSupport.stream(Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED), false)
|
||||
.collect(Collectors.toList());
|
||||
assertEquals(expectedKey1s, recordsByPrefix);
|
||||
|
||||
// exact match
|
||||
List<GenericRecord> expectedKey25 = allRecords.stream().filter(entry -> (entry.get("_row_key").toString()).contains("key25")).collect(Collectors.toList());
|
||||
iterator =
|
||||
hfileReader.getRecordsByKeyPrefixIterator(Collections.singletonList("key25"), avroSchema);
|
||||
recordsByPrefix =
|
||||
StreamSupport.stream(Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED), false)
|
||||
.collect(Collectors.toList());
|
||||
assertEquals(expectedKey25, recordsByPrefix);
|
||||
|
||||
// no match. key prefix is beyond entries in file.
|
||||
iterator =
|
||||
hfileReader.getRecordsByKeyPrefixIterator(Collections.singletonList("key99"), avroSchema);
|
||||
recordsByPrefix =
|
||||
StreamSupport.stream(Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED), false)
|
||||
.collect(Collectors.toList());
|
||||
assertEquals(Collections.emptyList(), recordsByPrefix);
|
||||
|
||||
// no match. but keyPrefix is in between the entries found in file.
|
||||
iterator =
|
||||
hfileReader.getRecordsByKeyPrefixIterator(Collections.singletonList("key1234"), avroSchema);
|
||||
recordsByPrefix =
|
||||
StreamSupport.stream(Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED), false)
|
||||
.collect(Collectors.toList());
|
||||
assertEquals(Collections.emptyList(), recordsByPrefix);
|
||||
}
|
||||
|
||||
@ParameterizedTest
|
||||
@ValueSource(strings = {
|
||||
"/hudi_0_9_hbase_1_2_3", "/hudi_0_10_hbase_1_2_3", "/hudi_0_11_hbase_2_4_9"})
|
||||
@@ -253,7 +315,7 @@ public class TestHoodieHFileReaderWriter extends TestHoodieReaderWriterBase {
|
||||
HoodieHFileUtils.createHFileReader(fs, new Path(DUMMY_BASE_PATH), content),
|
||||
hfilePrefix, true, HFILE_COMPARATOR.getClass(), NUM_RECORDS_FIXTURE);
|
||||
HoodieHFileReader<GenericRecord> hfileReader =
|
||||
new HoodieHFileReader<>(fs, new Path(DUMMY_BASE_PATH), content);
|
||||
new HoodieHFileReader<>(fs, new Path(DUMMY_BASE_PATH), content, Option.empty());
|
||||
Schema avroSchema = getSchemaFromResource(TestHoodieReaderWriterBase.class, "/exampleSchema.avsc");
|
||||
assertEquals(NUM_RECORDS_FIXTURE, hfileReader.getTotalRecords());
|
||||
verifySimpleRecords(hfileReader.getRecordIterator(avroSchema));
|
||||
@@ -261,7 +323,7 @@ public class TestHoodieHFileReaderWriter extends TestHoodieReaderWriterBase {
|
||||
content = readHFileFromResources(complexHFile);
|
||||
verifyHFileReader(HoodieHFileUtils.createHFileReader(fs, new Path(DUMMY_BASE_PATH), content),
|
||||
hfilePrefix, true, HFILE_COMPARATOR.getClass(), NUM_RECORDS_FIXTURE);
|
||||
hfileReader = new HoodieHFileReader<>(fs, new Path(DUMMY_BASE_PATH), content);
|
||||
hfileReader = new HoodieHFileReader<>(fs, new Path(DUMMY_BASE_PATH), content, Option.empty());
|
||||
avroSchema = getSchemaFromResource(TestHoodieReaderWriterBase.class, "/exampleSchemaWithUDT.avsc");
|
||||
assertEquals(NUM_RECORDS_FIXTURE, hfileReader.getTotalRecords());
|
||||
verifySimpleRecords(hfileReader.getRecordIterator(avroSchema));
|
||||
|
||||
@@ -38,6 +38,7 @@ import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
@@ -217,7 +218,7 @@ public abstract class TestHoodieReaderWriterBase {
|
||||
|
||||
private void verifyFilterRowKeys(HoodieFileReader<GenericRecord> hoodieReader) {
|
||||
Set<String> candidateRowKeys = IntStream.range(40, NUM_RECORDS * 2)
|
||||
.mapToObj(i -> "key" + String.format("%02d", i)).collect(Collectors.toSet());
|
||||
.mapToObj(i -> "key" + String.format("%02d", i)).collect(Collectors.toCollection(TreeSet::new));
|
||||
List<String> expectedKeys = IntStream.range(40, NUM_RECORDS)
|
||||
.mapToObj(i -> "key" + String.format("%02d", i)).sorted().collect(Collectors.toList());
|
||||
assertEquals(expectedKeys, hoodieReader.filterRowKeys(candidateRowKeys)
|
||||
|
||||
Reference in New Issue
Block a user