[HUDI-1180] Upgrade HBase to 2.4.9 (#5004)
Co-authored-by: Sagar Sumit <sagarsumit09@gmail.com>
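The substance of this upgrade is the move from the HBase 1.x HFile writer API to the HBase 2.4.x one: the comparator is no longer a KeyValue.KVComparator handed to HFile.WriterFactory.withComparator(...), but a CellComparator carried inside the HFileContext. The sketch below shows that pattern as it is adopted in HoodieHFileWriter further down in this diff; it is not code from the commit, and the block size, compression codec, and parameter names (conf, fs, path, comparator) are illustrative assumptions.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.CellComparator;
import org.apache.hadoop.hbase.io.compress.Compression;
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.io.hfile.HFileContext;
import org.apache.hadoop.hbase.io.hfile.HFileContextBuilder;

public class HBase24HFileWriterSketch {
  // Creates an HFile.Writer the HBase 2.x way: the comparator travels inside the
  // HFileContext (withCellComparator) instead of being set on the writer factory.
  public static HFile.Writer createWriter(
      Configuration conf, FileSystem fs, Path path, CellComparator comparator) throws IOException {
    HFileContext context = new HFileContextBuilder()
        .withBlockSize(1024 * 1024)                  // 1 MB blocks, an illustrative value
        .withCompression(Compression.Algorithm.GZ)   // codec also used by the Hudi test below
        .withCellComparator(comparator)              // HBase 2.x: comparator lives in the context
        .build();
    return HFile.getWriterFactory(conf, new CacheConfig(conf))
        .withPath(fs, path)
        .withFileContext(context)                    // no withComparator(...) on the factory any more
        .create();
  }
}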
@@ -548,7 +548,8 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload, I, K, O> extends
case AVRO_DATA_BLOCK:
return new HoodieAvroDataBlock(recordList, header, keyField);
case HFILE_DATA_BLOCK:
return new HoodieHFileDataBlock(recordList, header, writeConfig.getHFileCompressionAlgorithm());
return new HoodieHFileDataBlock(
recordList, header, writeConfig.getHFileCompressionAlgorithm(), new Path(writeConfig.getBasePath()));
case PARQUET_DATA_BLOCK:
return new HoodieParquetDataBlock(recordList, header, keyField, writeConfig.getParquetCompressionCodec());
default:
@@ -30,6 +30,7 @@ import org.apache.hudi.table.HoodieTable;

import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroSchemaConverter;

@@ -53,10 +54,12 @@ public class HoodieFileWriterFactory {
return newParquetFileWriter(instantTime, path, config, schema, hoodieTable, taskContextSupplier, config.populateMetaFields());
}
if (HFILE.getFileExtension().equals(extension)) {
return newHFileFileWriter(instantTime, path, config, schema, hoodieTable, taskContextSupplier);
return newHFileFileWriter(
instantTime, path, config, schema, hoodieTable.getHadoopConf(), taskContextSupplier);
}
if (ORC.getFileExtension().equals(extension)) {
return newOrcFileWriter(instantTime, path, config, schema, hoodieTable, taskContextSupplier);
return newOrcFileWriter(
instantTime, path, config, schema, hoodieTable.getHadoopConf(), taskContextSupplier);
}
throw new UnsupportedOperationException(extension + " format not supported yet.");
}
@@ -64,28 +67,29 @@ public class HoodieFileWriterFactory {
private static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieFileWriter<R> newParquetFileWriter(
String instantTime, Path path, HoodieWriteConfig config, Schema schema, HoodieTable hoodieTable,
TaskContextSupplier taskContextSupplier, boolean populateMetaFields) throws IOException {
return newParquetFileWriter(instantTime, path, config, schema, hoodieTable, taskContextSupplier, populateMetaFields, populateMetaFields);
return newParquetFileWriter(instantTime, path, config, schema, hoodieTable.getHadoopConf(),
taskContextSupplier, populateMetaFields, populateMetaFields);
}

private static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieFileWriter<R> newParquetFileWriter(
String instantTime, Path path, HoodieWriteConfig config, Schema schema, HoodieTable hoodieTable,
String instantTime, Path path, HoodieWriteConfig config, Schema schema, Configuration conf,
TaskContextSupplier taskContextSupplier, boolean populateMetaFields, boolean enableBloomFilter) throws IOException {
Option<BloomFilter> filter = enableBloomFilter ? Option.of(createBloomFilter(config)) : Option.empty();
HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter(hoodieTable.getHadoopConf()).convert(schema), schema, filter);
HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter(conf).convert(schema), schema, filter);

HoodieAvroParquetConfig parquetConfig = new HoodieAvroParquetConfig(writeSupport, config.getParquetCompressionCodec(),
config.getParquetBlockSize(), config.getParquetPageSize(), config.getParquetMaxFileSize(),
hoodieTable.getHadoopConf(), config.getParquetCompressionRatio(), config.parquetDictionaryEnabled());
conf, config.getParquetCompressionRatio(), config.parquetDictionaryEnabled());

return new HoodieParquetWriter<>(instantTime, path, parquetConfig, schema, taskContextSupplier, populateMetaFields);
}

private static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieFileWriter<R> newHFileFileWriter(
String instantTime, Path path, HoodieWriteConfig config, Schema schema, HoodieTable hoodieTable,
static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieFileWriter<R> newHFileFileWriter(
String instantTime, Path path, HoodieWriteConfig config, Schema schema, Configuration conf,
TaskContextSupplier taskContextSupplier) throws IOException {

BloomFilter filter = createBloomFilter(config);
HoodieHFileConfig hfileConfig = new HoodieHFileConfig(hoodieTable.getHadoopConf(),
HoodieHFileConfig hfileConfig = new HoodieHFileConfig(conf,
config.getHFileCompressionAlgorithm(), config.getHFileBlockSize(), config.getHFileMaxFileSize(),
HoodieHFileReader.KEY_FIELD_NAME, PREFETCH_ON_OPEN, CACHE_DATA_IN_L1, DROP_BEHIND_CACHE_COMPACTION,
filter, HFILE_COMPARATOR);
@@ -94,10 +98,10 @@ public class HoodieFileWriterFactory {
}

private static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieFileWriter<R> newOrcFileWriter(
String instantTime, Path path, HoodieWriteConfig config, Schema schema, HoodieTable hoodieTable,
String instantTime, Path path, HoodieWriteConfig config, Schema schema, Configuration conf,
TaskContextSupplier taskContextSupplier) throws IOException {
BloomFilter filter = createBloomFilter(config);
HoodieOrcConfig orcConfig = new HoodieOrcConfig(hoodieTable.getHadoopConf(), config.getOrcCompressionCodec(),
HoodieOrcConfig orcConfig = new HoodieOrcConfig(conf, config.getOrcCompressionCodec(),
config.getOrcStripeSize(), config.getOrcBlockSize(), config.getOrcMaxFileSize(), filter);
return new HoodieOrcWriter<>(instantTime, path, orcConfig, schema, taskContextSupplier);
}
@@ -21,14 +21,14 @@ package org.apache.hudi.io.storage;
import org.apache.hudi.common.bloom.BloomFilter;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.CellComparator;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.compress.Compression;
import org.apache.hadoop.hbase.io.hfile.CacheConfig;

public class HoodieHFileConfig {

public static final KeyValue.KVComparator HFILE_COMPARATOR = new HoodieHBaseKVComparator();
public static final CellComparator HFILE_COMPARATOR = new HoodieHBaseKVComparator();
public static final boolean PREFETCH_ON_OPEN = CacheConfig.DEFAULT_PREFETCH_ON_OPEN;
public static final boolean CACHE_DATA_IN_L1 = HColumnDescriptor.DEFAULT_CACHE_DATA_IN_L1;
// This is private in CacheConfig so have been copied here.
@@ -42,12 +42,12 @@ public class HoodieHFileConfig {
private final boolean dropBehindCacheCompaction;
private final Configuration hadoopConf;
private final BloomFilter bloomFilter;
private final KeyValue.KVComparator hfileComparator;
private final CellComparator hfileComparator;
private final String keyFieldName;

public HoodieHFileConfig(Configuration hadoopConf, Compression.Algorithm compressionAlgorithm, int blockSize,
long maxFileSize, String keyFieldName, boolean prefetchBlocksOnOpen, boolean cacheDataInL1,
boolean dropBehindCacheCompaction, BloomFilter bloomFilter, KeyValue.KVComparator hfileComparator) {
boolean dropBehindCacheCompaction, BloomFilter bloomFilter, CellComparator hfileComparator) {
this.hadoopConf = hadoopConf;
this.compressionAlgorithm = compressionAlgorithm;
this.blockSize = blockSize;
@@ -96,7 +96,7 @@ public class HoodieHFileConfig {
return bloomFilter;
}

public KeyValue.KVComparator getHfileComparator() {
public CellComparator getHFileComparator() {
return hfileComparator;
}
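Because the constant and field above now have the CellComparator type, any comparator Hudi supplies must satisfy the HBase 2.x Cell-based contract rather than the old KeyValue.KVComparator one. A minimal sketch of such a comparator follows; HoodieHBaseKVComparator's body is not part of this diff, so extending CellComparatorImpl and comparing on the row key only are assumptions for illustration.

import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellComparatorImpl;

// Illustrative HBase 2.x comparator: orders cells purely by row key.
public class ExampleRowKeyOnlyComparator extends CellComparatorImpl {
  @Override
  public int compare(Cell left, Cell right) {
    // Delegate to the row-key comparison provided by CellComparatorImpl.
    return compareRows(left, right);
  }
}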
@@ -25,6 +25,8 @@ import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.fs.HoodieWrapperFileSystem;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.StringUtils;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
@@ -38,8 +40,6 @@ import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.io.hfile.HFileContext;
import org.apache.hadoop.hbase.io.hfile.HFileContextBuilder;
import org.apache.hadoop.io.Writable;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.StringUtils;

import java.io.DataInput;
import java.io.DataOutput;
@@ -95,6 +95,7 @@ public class HoodieHFileWriter<T extends HoodieRecordPayload, R extends IndexedR

HFileContext context = new HFileContextBuilder().withBlockSize(hfileConfig.getBlockSize())
.withCompression(hfileConfig.getCompressionAlgorithm())
.withCellComparator(hfileConfig.getHFileComparator())
.build();

conf.set(CacheConfig.PREFETCH_BLOCKS_ON_OPEN_KEY, String.valueOf(hfileConfig.shouldPrefetchBlocksOnOpen()));
@@ -104,7 +105,6 @@ public class HoodieHFileWriter<T extends HoodieRecordPayload, R extends IndexedR
this.writer = HFile.getWriterFactory(conf, cacheConfig)
.withPath(this.fs, this.file)
.withFileContext(context)
.withComparator(hfileConfig.getHfileComparator())
.create();

writer.appendFileInfo(HoodieHFileReader.KEY_SCHEMA.getBytes(), schema.toString().getBytes());
@@ -18,71 +18,113 @@

package org.apache.hudi.io.storage;

import org.apache.hudi.common.bloom.BloomFilter;
import org.apache.hudi.common.bloom.BloomFilterFactory;
import org.apache.hudi.common.bloom.BloomFilterTypeCode;
import org.apache.hudi.common.bootstrap.index.HFileBootstrapIndex;
import org.apache.hudi.common.engine.TaskContextSupplier;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.EmptyHoodieRecordPayload;
import org.apache.hudi.common.model.HoodieAvroRecord;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.util.FileIOUtils;
import org.apache.hudi.config.HoodieIndexConfig;
import org.apache.hudi.config.HoodieWriteConfig;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.io.compress.Compression;
import org.apache.hadoop.hbase.CellComparatorImpl;
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.util.Pair;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.io.TempDir;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;
import org.junit.jupiter.params.provider.ValueSource;
import org.mockito.Mockito;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;

import static org.apache.hudi.common.testutils.FileSystemTestUtils.RANDOM;
import static org.apache.hudi.common.testutils.SchemaTestUtil.getSchemaFromResource;
import static org.apache.hudi.io.storage.HoodieHFileConfig.CACHE_DATA_IN_L1;
import static org.apache.hudi.io.storage.HoodieHFileConfig.DROP_BEHIND_CACHE_COMPACTION;
import static org.apache.hudi.io.storage.HoodieHFileConfig.HFILE_COMPARATOR;
import static org.apache.hudi.io.storage.HoodieHFileConfig.PREFETCH_ON_OPEN;
import static org.apache.hudi.io.storage.HoodieHFileReader.KEY_SCHEMA;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.mockito.Mockito.when;

public class TestHoodieHFileReaderWriter {
@TempDir File tempDir;
private Path filePath;
public class TestHoodieHFileReaderWriter extends TestHoodieReaderWriterBase {
private static final String DUMMY_BASE_PATH = "dummy_base_path";
// Number of records in HFile fixtures for compatibility tests
private static final int NUM_RECORDS_FIXTURE = 50;
private static final String SIMPLE_SCHEMA_HFILE_SUFFIX = "_simple.hfile";
private static final String COMPLEX_SCHEMA_HFILE_SUFFIX = "_complex.hfile";
private static final String BOOTSTRAP_INDEX_HFILE_SUFFIX = "_bootstrap_index_partitions.hfile";

@BeforeEach
public void setup() throws IOException {
filePath = new Path(tempDir.toString() + "tempFile.txt");
@Override
protected Path getFilePath() {
return new Path(tempDir.toString() + "/f1_1-0-1_000.hfile");
}

@AfterEach
public void clearTempFile() {
File file = new File(filePath.toString());
if (file.exists()) {
file.delete();
}
@Override
protected HoodieFileWriter<GenericRecord> createWriter(
Schema avroSchema, boolean populateMetaFields) throws Exception {
String instantTime = "000";
HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder()
.withPath(DUMMY_BASE_PATH)
.withIndexConfig(HoodieIndexConfig.newBuilder()
.bloomFilterNumEntries(1000).bloomFilterFPP(0.00001).build())
.withPopulateMetaFields(populateMetaFields)
.build();
Configuration conf = new Configuration();
TaskContextSupplier mockTaskContextSupplier = Mockito.mock(TaskContextSupplier.class);
Supplier<Integer> partitionSupplier = Mockito.mock(Supplier.class);
when(mockTaskContextSupplier.getPartitionIdSupplier()).thenReturn(partitionSupplier);
when(partitionSupplier.get()).thenReturn(10);

return HoodieFileWriterFactory.newHFileFileWriter(
instantTime, getFilePath(), writeConfig, avroSchema, conf, mockTaskContextSupplier);
}

@Override
protected HoodieFileReader<GenericRecord> createReader(
Configuration conf) throws Exception {
CacheConfig cacheConfig = new CacheConfig(conf);
return new HoodieHFileReader<>(conf, getFilePath(), cacheConfig, getFilePath().getFileSystem(conf));
}

@Override
protected void verifyMetadata(Configuration conf) throws IOException {
FileSystem fs = getFilePath().getFileSystem(conf);
HFile.Reader hfileReader = HoodieHFileUtils.createHFileReader(fs, getFilePath(), new CacheConfig(conf), conf);
assertEquals(HFILE_COMPARATOR.getClass(), hfileReader.getComparator().getClass());
assertEquals(NUM_RECORDS, hfileReader.getEntries());
}

@Override
protected void verifySchema(Configuration conf, String schemaPath) throws IOException {
FileSystem fs = getFilePath().getFileSystem(conf);
HFile.Reader hfileReader = HoodieHFileUtils.createHFileReader(fs, getFilePath(), new CacheConfig(conf), conf);
assertEquals(getSchemaFromResource(TestHoodieHFileReaderWriter.class, schemaPath),
new Schema.Parser().parse(new String(hfileReader.getHFileInfo().get(KEY_SCHEMA.getBytes()))));
}

private static Stream<Arguments> populateMetaFieldsAndTestAvroWithMeta() {
@@ -94,25 +136,11 @@ public class TestHoodieHFileReaderWriter {
}).map(Arguments::of);
}

private HoodieHFileWriter createHFileWriter(Schema avroSchema, boolean populateMetaFields) throws Exception {
BloomFilter filter = BloomFilterFactory.createBloomFilter(1000, 0.00001, -1, BloomFilterTypeCode.SIMPLE.name());
Configuration conf = new Configuration();
TaskContextSupplier mockTaskContextSupplier = Mockito.mock(TaskContextSupplier.class);
Supplier<Integer> partitionSupplier = Mockito.mock(Supplier.class);
when(mockTaskContextSupplier.getPartitionIdSupplier()).thenReturn(partitionSupplier);
when(partitionSupplier.get()).thenReturn(10);
String instantTime = "000";

HoodieHFileConfig hoodieHFileConfig = new HoodieHFileConfig(conf, Compression.Algorithm.GZ, 1024 * 1024, 120 * 1024 * 1024,
HoodieHFileReader.KEY_FIELD_NAME, PREFETCH_ON_OPEN, CACHE_DATA_IN_L1, DROP_BEHIND_CACHE_COMPACTION, filter, HFILE_COMPARATOR);
return new HoodieHFileWriter(instantTime, filePath, hoodieHFileConfig, avroSchema, mockTaskContextSupplier, populateMetaFields);
}

@ParameterizedTest
@MethodSource("populateMetaFieldsAndTestAvroWithMeta")
public void testWriteReadHFile(boolean populateMetaFields, boolean testAvroWithMeta) throws Exception {
public void testWriteReadHFileWithMetaFields(boolean populateMetaFields, boolean testAvroWithMeta) throws Exception {
Schema avroSchema = getSchemaFromResource(TestHoodieOrcReaderWriter.class, "/exampleSchemaWithMetaFields.avsc");
HoodieHFileWriter writer = createHFileWriter(avroSchema, populateMetaFields);
HoodieFileWriter<GenericRecord> writer = createWriter(avroSchema, populateMetaFields);
List<String> keys = new ArrayList<>();
Map<String, GenericRecord> recordMap = new HashMap<>();
for (int i = 0; i < 100; i++) {
@@ -134,8 +162,7 @@ public class TestHoodieHFileReaderWriter {
writer.close();

Configuration conf = new Configuration();
CacheConfig cacheConfig = new CacheConfig(conf);
HoodieHFileReader hoodieHFileReader = new HoodieHFileReader(conf, filePath, cacheConfig, filePath.getFileSystem(conf));
HoodieHFileReader hoodieHFileReader = (HoodieHFileReader) createReader(conf);
List<Pair<String, IndexedRecord>> records = hoodieHFileReader.readAllRecords();
records.forEach(entry -> assertEquals(entry.getSecond(), recordMap.get(entry.getFirst())));
hoodieHFileReader.close();
@@ -145,7 +172,7 @@ public class TestHoodieHFileReaderWriter {
Set<String> rowsToFetch = getRandomKeys(randomRowstoFetch, keys);
List<String> rowsList = new ArrayList<>(rowsToFetch);
Collections.sort(rowsList);
hoodieHFileReader = new HoodieHFileReader(conf, filePath, cacheConfig, filePath.getFileSystem(conf));
hoodieHFileReader = (HoodieHFileReader) createReader(conf);
List<Pair<String, GenericRecord>> result = hoodieHFileReader.readRecords(rowsList);
assertEquals(result.size(), randomRowstoFetch);
result.forEach(entry -> {
@@ -160,6 +187,90 @@ public class TestHoodieHFileReaderWriter {
}
}

@Override
@Test
public void testWriteReadWithEvolvedSchema() throws Exception {
// Disable the test with evolved schema for HFile since it's not supported
// TODO(HUDI-3683): fix the schema evolution for HFile
}

@Test
public void testReadHFileFormatRecords() throws Exception {
writeFileWithSimpleSchema();
FileSystem fs = FSUtils.getFs(getFilePath().toString(), new Configuration());
byte[] content = FileIOUtils.readAsByteArray(
fs.open(getFilePath()), (int) fs.getFileStatus(getFilePath()).getLen());
// Reading byte array in HFile format, without actual file path
HoodieHFileReader<GenericRecord> hfileReader =
new HoodieHFileReader<>(fs, new Path(DUMMY_BASE_PATH), content);
Schema avroSchema = getSchemaFromResource(TestHoodieReaderWriterBase.class, "/exampleSchema.avsc");
assertEquals(NUM_RECORDS, hfileReader.getTotalRecords());
verifySimpleRecords(hfileReader.getRecordIterator(avroSchema));
}

@Test
public void testReaderGetRecordIterator() throws Exception {
writeFileWithSimpleSchema();
HoodieHFileReader<GenericRecord> hfileReader =
(HoodieHFileReader<GenericRecord>) createReader(new Configuration());
List<String> keys =
IntStream.concat(IntStream.range(40, NUM_RECORDS * 2), IntStream.range(10, 20))
.mapToObj(i -> "key" + String.format("%02d", i)).collect(Collectors.toList());
Schema avroSchema = getSchemaFromResource(TestHoodieReaderWriterBase.class, "/exampleSchema.avsc");
Iterator<GenericRecord> iterator = hfileReader.getRecordIterator(keys, avroSchema);

List<Integer> expectedIds =
IntStream.concat(IntStream.range(40, NUM_RECORDS), IntStream.range(10, 20))
.boxed().collect(Collectors.toList());
int index = 0;
while (iterator.hasNext()) {
GenericRecord record = iterator.next();
String key = "key" + String.format("%02d", expectedIds.get(index));
assertEquals(key, record.get("_row_key").toString());
assertEquals(Integer.toString(expectedIds.get(index)), record.get("time").toString());
assertEquals(expectedIds.get(index), record.get("number"));
index++;
}
}

@ParameterizedTest
@ValueSource(strings = {
"/hudi_0_9_hbase_1_2_3", "/hudi_0_10_hbase_1_2_3", "/hudi_0_11_hbase_2_4_9"})
public void testHoodieHFileCompatibility(String hfilePrefix) throws IOException {
// This fixture is generated from TestHoodieReaderWriterBase#testWriteReadPrimitiveRecord()
// using different Hudi releases
String simpleHFile = hfilePrefix + SIMPLE_SCHEMA_HFILE_SUFFIX;
// This fixture is generated from TestHoodieReaderWriterBase#testWriteReadComplexRecord()
// using different Hudi releases
String complexHFile = hfilePrefix + COMPLEX_SCHEMA_HFILE_SUFFIX;
// This fixture is generated from TestBootstrapIndex#testBootstrapIndex()
// using different Hudi releases. The file is copied from .hoodie/.aux/.bootstrap/.partitions/
String bootstrapIndexFile = hfilePrefix + BOOTSTRAP_INDEX_HFILE_SUFFIX;

FileSystem fs = FSUtils.getFs(getFilePath().toString(), new Configuration());
byte[] content = readHFileFromResources(simpleHFile);
verifyHFileReader(
HoodieHFileUtils.createHFileReader(fs, new Path(DUMMY_BASE_PATH), content),
hfilePrefix, true, HFILE_COMPARATOR.getClass(), NUM_RECORDS_FIXTURE);
HoodieHFileReader<GenericRecord> hfileReader =
new HoodieHFileReader<>(fs, new Path(DUMMY_BASE_PATH), content);
Schema avroSchema = getSchemaFromResource(TestHoodieReaderWriterBase.class, "/exampleSchema.avsc");
assertEquals(NUM_RECORDS_FIXTURE, hfileReader.getTotalRecords());
verifySimpleRecords(hfileReader.getRecordIterator(avroSchema));

content = readHFileFromResources(complexHFile);
verifyHFileReader(HoodieHFileUtils.createHFileReader(fs, new Path(DUMMY_BASE_PATH), content),
hfilePrefix, true, HFILE_COMPARATOR.getClass(), NUM_RECORDS_FIXTURE);
hfileReader = new HoodieHFileReader<>(fs, new Path(DUMMY_BASE_PATH), content);
avroSchema = getSchemaFromResource(TestHoodieReaderWriterBase.class, "/exampleSchemaWithUDT.avsc");
assertEquals(NUM_RECORDS_FIXTURE, hfileReader.getTotalRecords());
verifySimpleRecords(hfileReader.getRecordIterator(avroSchema));

content = readHFileFromResources(bootstrapIndexFile);
verifyHFileReader(HoodieHFileUtils.createHFileReader(fs, new Path(DUMMY_BASE_PATH), content),
hfilePrefix, false, HFileBootstrapIndex.HoodieKVComparator.class, 4);
}

private Set<String> getRandomKeys(int count, List<String> keys) {
Set<String> rowKeys = new HashSet<>();
int totalKeys = keys.size();
@@ -171,4 +282,26 @@ public class TestHoodieHFileReaderWriter {
}
return rowKeys;
}

private byte[] readHFileFromResources(String filename) throws IOException {
long size = TestHoodieHFileReaderWriter.class
.getResource(filename).openConnection().getContentLength();
return FileIOUtils.readAsByteArray(
TestHoodieHFileReaderWriter.class.getResourceAsStream(filename), (int) size);
}

private void verifyHFileReader(
HFile.Reader reader, String hfileName, boolean mayUseDefaultComparator,
Class<?> clazz, int count) {
// HFile version is 3
assertEquals(3, reader.getTrailer().getMajorVersion());
if (mayUseDefaultComparator && hfileName.contains("hudi_0_9")) {
// Pre Hudi 0.10, the default comparator is used for metadata table HFiles
// For bootstrap index HFiles, the custom comparator is always used
assertEquals(CellComparatorImpl.class, reader.getComparator().getClass());
} else {
assertEquals(clazz, reader.getComparator().getClass());
}
assertEquals(count, reader.getEntries());
}
}
@@ -18,53 +18,40 @@

package org.apache.hudi.io.storage;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.bloom.BloomFilter;
import org.apache.hudi.common.bloom.BloomFilterFactory;
import org.apache.hudi.common.bloom.BloomFilterTypeCode;
import org.apache.hudi.common.engine.TaskContextSupplier;
import org.apache.hudi.config.HoodieStorageConfig;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.orc.CompressionKind;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;

import java.io.File;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.io.IOException;

import static org.apache.hudi.avro.HoodieAvroWriteSupport.HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY;
import static org.apache.hudi.avro.HoodieAvroWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER;
import static org.apache.hudi.avro.HoodieAvroWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER;
import static org.apache.hudi.common.testutils.SchemaTestUtil.getSchemaFromResource;
import static org.apache.hudi.io.storage.HoodieOrcConfig.AVRO_SCHEMA_METADATA_KEY;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

public class TestHoodieOrcReaderWriter {
private final Path filePath = new Path(System.getProperty("java.io.tmpdir") + "/f1_1-0-1_000.orc");
public class TestHoodieOrcReaderWriter extends TestHoodieReaderWriterBase {

@BeforeEach
@AfterEach
public void clearTempFile() {
File file = new File(filePath.toString());
if (file.exists()) {
file.delete();
}
@Override
protected Path getFilePath() {
return new Path(tempDir.toString() + "/f1_1-0-1_000.orc");
}

private HoodieOrcWriter createOrcWriter(Schema avroSchema) throws Exception {
@Override
protected HoodieFileWriter<GenericRecord> createWriter(
Schema avroSchema, boolean populateMetaFields) throws Exception {
BloomFilter filter = BloomFilterFactory.createBloomFilter(1000, 0.00001, -1, BloomFilterTypeCode.SIMPLE.name());
Configuration conf = new Configuration();
int orcStripSize = Integer.parseInt(HoodieStorageConfig.ORC_STRIPE_SIZE.defaultValue());
@@ -73,189 +60,41 @@ public class TestHoodieOrcReaderWriter {
HoodieOrcConfig config = new HoodieOrcConfig(conf, CompressionKind.ZLIB, orcStripSize, orcBlockSize, maxFileSize, filter);
TaskContextSupplier mockTaskContextSupplier = Mockito.mock(TaskContextSupplier.class);
String instantTime = "000";
return new HoodieOrcWriter(instantTime, filePath, config, avroSchema, mockTaskContextSupplier);
return new HoodieOrcWriter<>(instantTime, getFilePath(), config, avroSchema, mockTaskContextSupplier);
}

@Test
public void testWriteReadMetadata() throws Exception {
Schema avroSchema = getSchemaFromResource(TestHoodieOrcReaderWriter.class, "/exampleSchema.avsc");
HoodieOrcWriter writer = createOrcWriter(avroSchema);
for (int i = 0; i < 3; i++) {
GenericRecord record = new GenericData.Record(avroSchema);
record.put("_row_key", "key" + i);
record.put("time", Integer.toString(i));
record.put("number", i);
writer.writeAvro("key" + i, record);
}
writer.close();
@Override
protected HoodieFileReader<GenericRecord> createReader(
Configuration conf) throws Exception {
return HoodieFileReaderFactory.getFileReader(conf, getFilePath());
}

Configuration conf = new Configuration();
Reader orcReader = OrcFile.createReader(filePath, OrcFile.readerOptions(conf));
@Override
protected void verifyMetadata(Configuration conf) throws IOException {
Reader orcReader = OrcFile.createReader(getFilePath(), OrcFile.readerOptions(conf));
assertEquals(4, orcReader.getMetadataKeys().size());
assertTrue(orcReader.getMetadataKeys().contains(HOODIE_MIN_RECORD_KEY_FOOTER));
assertTrue(orcReader.getMetadataKeys().contains(HOODIE_MAX_RECORD_KEY_FOOTER));
assertTrue(orcReader.getMetadataKeys().contains(HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY));
assertTrue(orcReader.getMetadataKeys().contains(AVRO_SCHEMA_METADATA_KEY));
assertEquals(CompressionKind.ZLIB.name(), orcReader.getCompressionKind().toString());

HoodieFileReader<GenericRecord> hoodieReader = HoodieFileReaderFactory.getFileReader(conf, filePath);
BloomFilter filter = hoodieReader.readBloomFilter();
for (int i = 0; i < 3; i++) {
assertTrue(filter.mightContain("key" + i));
}
assertFalse(filter.mightContain("non-existent-key"));
assertEquals(3, hoodieReader.getTotalRecords());
String[] minMaxRecordKeys = hoodieReader.readMinMaxRecordKeys();
assertEquals(2, minMaxRecordKeys.length);
assertEquals("key0", minMaxRecordKeys[0]);
assertEquals("key2", minMaxRecordKeys[1]);
assertEquals(NUM_RECORDS, orcReader.getNumberOfRows());
}

@Test
public void testWriteReadPrimitiveRecord() throws Exception {
Schema avroSchema = getSchemaFromResource(TestHoodieOrcReaderWriter.class, "/exampleSchema.avsc");
HoodieOrcWriter writer = createOrcWriter(avroSchema);
for (int i = 0; i < 3; i++) {
GenericRecord record = new GenericData.Record(avroSchema);
record.put("_row_key", "key" + i);
record.put("time", Integer.toString(i));
record.put("number", i);
writer.writeAvro("key" + i, record);
}
writer.close();

Configuration conf = new Configuration();
Reader orcReader = OrcFile.createReader(filePath, OrcFile.readerOptions(conf));
assertEquals("struct<_row_key:string,time:string,number:int>", orcReader.getSchema().toString());
assertEquals(3, orcReader.getNumberOfRows());

HoodieFileReader<GenericRecord> hoodieReader = HoodieFileReaderFactory.getFileReader(conf, filePath);
Iterator<GenericRecord> iter = hoodieReader.getRecordIterator();
int index = 0;
while (iter.hasNext()) {
GenericRecord record = iter.next();
assertEquals("key" + index, record.get("_row_key").toString());
assertEquals(Integer.toString(index), record.get("time").toString());
assertEquals(index, record.get("number"));
index++;
@Override
protected void verifySchema(Configuration conf, String schemaPath) throws IOException {
Reader orcReader = OrcFile.createReader(getFilePath(), OrcFile.readerOptions(conf));
if ("/exampleSchema.avsc".equals(schemaPath)) {
assertEquals("struct<_row_key:string,time:string,number:int>",
orcReader.getSchema().toString());
} else if ("/exampleSchemaWithUDT.avsc".equals(schemaPath)) {
assertEquals("struct<_row_key:string,time:string,number:int,driver:struct<driver_name:string,list:array<int>,map:map<string,string>>>",
orcReader.getSchema().toString());
}
}

@Test
public void testWriteReadComplexRecord() throws Exception {
Schema avroSchema = getSchemaFromResource(TestHoodieOrcReaderWriter.class, "/exampleSchemaWithUDT.avsc");
Schema udtSchema = avroSchema.getField("driver").schema().getTypes().get(1);
HoodieOrcWriter writer = createOrcWriter(avroSchema);
for (int i = 0; i < 3; i++) {
GenericRecord record = new GenericData.Record(avroSchema);
record.put("_row_key", "key" + i);
record.put("time", Integer.toString(i));
record.put("number", i);
GenericRecord innerRecord = new GenericData.Record(udtSchema);
innerRecord.put("driver_name", "driver" + i);
innerRecord.put("list", Collections.singletonList(i));
innerRecord.put("map", Collections.singletonMap("key" + i, "value" + i));
record.put("driver", innerRecord);
writer.writeAvro("key" + i, record);
}
writer.close();

Configuration conf = new Configuration();
Reader reader = OrcFile.createReader(filePath, OrcFile.readerOptions(conf));
assertEquals("struct<_row_key:string,time:string,number:int,driver:struct<driver_name:string,list:array<int>,map:map<string,string>>>",
reader.getSchema().toString());
assertEquals(3, reader.getNumberOfRows());

HoodieFileReader<GenericRecord> hoodieReader = HoodieFileReaderFactory.getFileReader(conf, filePath);
Iterator<GenericRecord> iter = hoodieReader.getRecordIterator();
int index = 0;
while (iter.hasNext()) {
GenericRecord record = iter.next();
assertEquals("key" + index, record.get("_row_key").toString());
assertEquals(Integer.toString(index), record.get("time").toString());
assertEquals(index, record.get("number"));
GenericRecord innerRecord = (GenericRecord) record.get("driver");
assertEquals("driver" + index, innerRecord.get("driver_name").toString());
assertEquals(1, ((List<?>)innerRecord.get("list")).size());
assertEquals(index, ((List<?>)innerRecord.get("list")).get(0));
assertEquals("value" + index, ((Map<?,?>)innerRecord.get("map")).get("key" + index).toString());
index++;
}
}

@Test
public void testWriteReadWithEvolvedSchema() throws Exception {
Schema avroSchema = getSchemaFromResource(TestHoodieOrcReaderWriter.class, "/exampleSchema.avsc");
HoodieOrcWriter writer = createOrcWriter(avroSchema);
for (int i = 0; i < 3; i++) {
GenericRecord record = new GenericData.Record(avroSchema);
record.put("_row_key", "key" + i);
record.put("time", Integer.toString(i));
record.put("number", i);
writer.writeAvro("key" + i, record);
}
writer.close();

Configuration conf = new Configuration();
HoodieFileReader<GenericRecord> hoodieReader = HoodieFileReaderFactory.getFileReader(conf, filePath);
Schema evolvedSchema = getSchemaFromResource(TestHoodieOrcReaderWriter.class, "/exampleEvolvedSchema.avsc");
Iterator<GenericRecord> iter = hoodieReader.getRecordIterator(evolvedSchema);
int index = 0;
while (iter.hasNext()) {
GenericRecord record = iter.next();
assertEquals("key" + index, record.get("_row_key").toString());
assertEquals(Integer.toString(index), record.get("time").toString());
assertEquals(index, record.get("number"));
assertNull(record.get("added_field"));
index++;
}

evolvedSchema = getSchemaFromResource(TestHoodieOrcReaderWriter.class, "/exampleEvolvedSchemaChangeOrder.avsc");
iter = hoodieReader.getRecordIterator(evolvedSchema);
index = 0;
while (iter.hasNext()) {
GenericRecord record = iter.next();
assertEquals("key" + index, record.get("_row_key").toString());
assertEquals(Integer.toString(index), record.get("time").toString());
assertEquals(index, record.get("number"));
assertNull(record.get("added_field"));
index++;
}

evolvedSchema = getSchemaFromResource(TestHoodieOrcReaderWriter.class, "/exampleEvolvedSchemaColumnRequire.avsc");
iter = hoodieReader.getRecordIterator(evolvedSchema);
index = 0;
while (iter.hasNext()) {
GenericRecord record = iter.next();
assertEquals("key" + index, record.get("_row_key").toString());
assertEquals(Integer.toString(index), record.get("time").toString());
assertEquals(index, record.get("number"));
assertNull(record.get("added_field"));
index++;
}

evolvedSchema = getSchemaFromResource(TestHoodieOrcReaderWriter.class, "/exampleEvolvedSchemaColumnType.avsc");
iter = hoodieReader.getRecordIterator(evolvedSchema);
index = 0;
while (iter.hasNext()) {
GenericRecord record = iter.next();
assertEquals("key" + index, record.get("_row_key").toString());
assertEquals(Integer.toString(index), record.get("time").toString());
assertEquals(Integer.toString(index), record.get("number").toString());
assertNull(record.get("added_field"));
index++;
}

evolvedSchema = getSchemaFromResource(TestHoodieOrcReaderWriter.class, "/exampleEvolvedSchemaDeleteColumn.avsc");
iter = hoodieReader.getRecordIterator(evolvedSchema);
index = 0;
while (iter.hasNext()) {
GenericRecord record = iter.next();
assertEquals("key" + index, record.get("_row_key").toString());
assertEquals(Integer.toString(index), record.get("time").toString());
assertNull(record.get("number"));
assertNull(record.get("added_field"));
index++;
}
@Override
public void testReaderFilterRowKeys() {
// TODO(HUDI-3682): fix filterRowKeys test for ORC due to a bug in ORC logic
}
}
@@ -0,0 +1,250 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.hudi.io.storage;

import org.apache.hudi.common.bloom.BloomFilter;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;

import java.io.File;
import java.io.IOException;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

import static org.apache.hudi.common.testutils.SchemaTestUtil.getSchemaFromResource;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertTrue;

/**
* Abstract class for unit tests of {@link HoodieFileReader} and {@link HoodieFileWriter}
* for different file format
*/
public abstract class TestHoodieReaderWriterBase {
protected static final int NUM_RECORDS = 50;
@TempDir
protected File tempDir;

protected abstract Path getFilePath();

protected abstract HoodieFileWriter<GenericRecord> createWriter(
Schema avroSchema, boolean populateMetaFields) throws Exception;

protected abstract HoodieFileReader<GenericRecord> createReader(
Configuration conf) throws Exception;

protected abstract void verifyMetadata(Configuration conf) throws IOException;

protected abstract void verifySchema(Configuration conf, String schemaPath) throws IOException;

@BeforeEach
@AfterEach
public void clearTempFile() {
File file = new File(getFilePath().toString());
if (file.exists()) {
file.delete();
}
}

@Test
public void testWriteReadMetadata() throws Exception {
Schema avroSchema = getSchemaFromResource(TestHoodieReaderWriterBase.class, "/exampleSchema.avsc");
writeFileWithSimpleSchema();

Configuration conf = new Configuration();
verifyMetadata(conf);

HoodieFileReader<GenericRecord> hoodieReader = createReader(conf);
BloomFilter filter = hoodieReader.readBloomFilter();
for (int i = 0; i < NUM_RECORDS; i++) {
String key = "key" + String.format("%02d", i);
assertTrue(filter.mightContain(key));
}
assertFalse(filter.mightContain("non-existent-key"));
assertEquals(avroSchema, hoodieReader.getSchema());
assertEquals(NUM_RECORDS, hoodieReader.getTotalRecords());
String[] minMaxRecordKeys = hoodieReader.readMinMaxRecordKeys();
assertEquals(2, minMaxRecordKeys.length);
assertEquals("key00", minMaxRecordKeys[0]);
assertEquals("key" + (NUM_RECORDS - 1), minMaxRecordKeys[1]);
}

@Test
public void testWriteReadPrimitiveRecord() throws Exception {
String schemaPath = "/exampleSchema.avsc";
writeFileWithSimpleSchema();

Configuration conf = new Configuration();
verifyMetadata(conf);
verifySchema(conf, schemaPath);
verifySimpleRecords(createReader(conf).getRecordIterator());
}

@Test
public void testWriteReadComplexRecord() throws Exception {
String schemaPath = "/exampleSchemaWithUDT.avsc";
Schema avroSchema = getSchemaFromResource(TestHoodieReaderWriterBase.class, schemaPath);
Schema udtSchema = avroSchema.getField("driver").schema().getTypes().get(1);
HoodieFileWriter<GenericRecord> writer = createWriter(avroSchema, true);
for (int i = 0; i < NUM_RECORDS; i++) {
GenericRecord record = new GenericData.Record(avroSchema);
String key = "key" + String.format("%02d", i);
record.put("_row_key", key);
record.put("time", Integer.toString(i));
record.put("number", i);
GenericRecord innerRecord = new GenericData.Record(udtSchema);
innerRecord.put("driver_name", "driver" + i);
innerRecord.put("list", Collections.singletonList(i));
innerRecord.put("map", Collections.singletonMap(key, "value" + i));
record.put("driver", innerRecord);
writer.writeAvro(key, record);
}
writer.close();

Configuration conf = new Configuration();
verifyMetadata(conf);
verifySchema(conf, schemaPath);
verifyComplexRecords(createReader(conf).getRecordIterator());
}

@Test
public void testWriteReadWithEvolvedSchema() throws Exception {
writeFileWithSimpleSchema();

Configuration conf = new Configuration();
HoodieFileReader<GenericRecord> hoodieReader = createReader(conf);
String[] schemaList = new String[] {
"/exampleEvolvedSchema.avsc", "/exampleEvolvedSchemaChangeOrder.avsc",
"/exampleEvolvedSchemaColumnRequire.avsc", "/exampleEvolvedSchemaColumnType.avsc",
"/exampleEvolvedSchemaDeleteColumn.avsc"};

for (String evolvedSchemaPath : schemaList) {
verifyReaderWithSchema(evolvedSchemaPath, hoodieReader);
}
}

@Test
public void testReaderFilterRowKeys() throws Exception {
writeFileWithSimpleSchema();
Configuration conf = new Configuration();
verifyMetadata(conf);
verifyFilterRowKeys(createReader(conf));
}

protected void writeFileWithSimpleSchema() throws Exception {
Schema avroSchema = getSchemaFromResource(TestHoodieReaderWriterBase.class, "/exampleSchema.avsc");
HoodieFileWriter<GenericRecord> writer = createWriter(avroSchema, true);
for (int i = 0; i < NUM_RECORDS; i++) {
GenericRecord record = new GenericData.Record(avroSchema);
String key = "key" + String.format("%02d", i);
record.put("_row_key", key);
record.put("time", Integer.toString(i));
record.put("number", i);
writer.writeAvro(key, record);
}
writer.close();
}

protected void verifySimpleRecords(Iterator<GenericRecord> iterator) {
int index = 0;
while (iterator.hasNext()) {
GenericRecord record = iterator.next();
String key = "key" + String.format("%02d", index);
assertEquals(key, record.get("_row_key").toString());
assertEquals(Integer.toString(index), record.get("time").toString());
assertEquals(index, record.get("number"));
index++;
}
}

protected void verifyComplexRecords(Iterator<GenericRecord> iterator) {
int index = 0;
while (iterator.hasNext()) {
GenericRecord record = iterator.next();
String key = "key" + String.format("%02d", index);
assertEquals(key, record.get("_row_key").toString());
assertEquals(Integer.toString(index), record.get("time").toString());
assertEquals(index, record.get("number"));
GenericRecord innerRecord = (GenericRecord) record.get("driver");
assertEquals("driver" + index, innerRecord.get("driver_name").toString());
assertEquals(1, ((List<?>) innerRecord.get("list")).size());
assertEquals(index, ((List<?>) innerRecord.get("list")).get(0));
Map<?, ?> mapping = (Map<?, ?>) innerRecord.get("map");
boolean match = false;
for (Object innerKey : mapping.keySet()) {
// The innerKey may not be in the type of String, so we have to
// use the following logic for validation
if (innerKey.toString().equals(key)) {
assertEquals("value" + index, mapping.get(innerKey).toString());
match = true;
}
}
assertTrue(match);
index++;
}
}

private void verifyFilterRowKeys(HoodieFileReader<GenericRecord> hoodieReader) {
Set<String> candidateRowKeys = IntStream.range(40, NUM_RECORDS * 2)
.mapToObj(i -> "key" + String.format("%02d", i)).collect(Collectors.toSet());
List<String> expectedKeys = IntStream.range(40, NUM_RECORDS)
.mapToObj(i -> "key" + String.format("%02d", i)).sorted().collect(Collectors.toList());
assertEquals(expectedKeys, hoodieReader.filterRowKeys(candidateRowKeys)
.stream().sorted().collect(Collectors.toList()));
}

private void verifyReaderWithSchema(String schemaPath, HoodieFileReader<GenericRecord> hoodieReader) throws IOException {
Schema evolvedSchema = getSchemaFromResource(TestHoodieReaderWriterBase.class, schemaPath);
Iterator<GenericRecord> iter = hoodieReader.getRecordIterator(evolvedSchema);
int index = 0;
while (iter.hasNext()) {
verifyRecord(schemaPath, iter.next(), index);
index++;
}
}

private void verifyRecord(String schemaPath, GenericRecord record, int index) {
String numStr = String.format("%02d", index);
assertEquals("key" + numStr, record.get("_row_key").toString());
assertEquals(Integer.toString(index), record.get("time").toString());
if ("/exampleEvolvedSchemaColumnType.avsc".equals(schemaPath)) {
assertEquals(Integer.toString(index), record.get("number").toString());
} else if ("/exampleEvolvedSchemaDeleteColumn.avsc".equals(schemaPath)) {
assertNull(record.get("number"));
} else {
assertEquals(index, record.get("number"));
}
assertNull(record.get("added_field"));
}
}
Binary HFile test fixtures not shown.