[HUDI-1294] Adding inline read and seek based read(batch get) for hfile log blocks in metadata table (#3762)
This commit is contained in:
committed by
GitHub
parent
0223c442ec
commit
69ee790a47
@@ -0,0 +1,134 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.io.storage;
|
||||
|
||||
import org.apache.hudi.common.bloom.BloomFilter;
|
||||
import org.apache.hudi.common.bloom.BloomFilterFactory;
|
||||
import org.apache.hudi.common.bloom.BloomFilterTypeCode;
|
||||
import org.apache.hudi.common.engine.TaskContextSupplier;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.generic.GenericData;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.avro.generic.IndexedRecord;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.hbase.io.compress.Compression;
|
||||
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
|
||||
import org.apache.hadoop.hbase.util.Pair;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.io.TempDir;
|
||||
import org.mockito.Mockito;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import static org.apache.hudi.common.testutils.FileSystemTestUtils.RANDOM;
|
||||
import static org.apache.hudi.common.testutils.SchemaTestUtil.getSchemaFromResource;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
public class TestHoodieHFileReaderWriter {
|
||||
@TempDir File tempDir;
|
||||
private Path filePath;
|
||||
|
||||
@BeforeEach
|
||||
public void setup() throws IOException {
|
||||
filePath = new Path(tempDir.toString() + "tempFile.txt");
|
||||
}
|
||||
|
||||
@AfterEach
|
||||
public void clearTempFile() {
|
||||
File file = new File(filePath.toString());
|
||||
if (file.exists()) {
|
||||
file.delete();
|
||||
}
|
||||
}
|
||||
|
||||
private HoodieHFileWriter createHFileWriter(Schema avroSchema) throws Exception {
|
||||
BloomFilter filter = BloomFilterFactory.createBloomFilter(1000, 0.00001, -1, BloomFilterTypeCode.SIMPLE.name());
|
||||
Configuration conf = new Configuration();
|
||||
TaskContextSupplier mockTaskContextSupplier = Mockito.mock(TaskContextSupplier.class);
|
||||
String instantTime = "000";
|
||||
|
||||
HoodieHFileConfig hoodieHFileConfig = new HoodieHFileConfig(conf, Compression.Algorithm.GZ, 1024 * 1024, 120 * 1024 * 1024,
|
||||
filter);
|
||||
return new HoodieHFileWriter(instantTime, filePath, hoodieHFileConfig, avroSchema, mockTaskContextSupplier);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testWriteReadHFile() throws Exception {
|
||||
Schema avroSchema = getSchemaFromResource(TestHoodieOrcReaderWriter.class, "/exampleSchema.avsc");
|
||||
HoodieHFileWriter writer = createHFileWriter(avroSchema);
|
||||
List<String> keys = new ArrayList<>();
|
||||
Map<String, GenericRecord> recordMap = new HashMap<>();
|
||||
for (int i = 0; i < 100; i++) {
|
||||
GenericRecord record = new GenericData.Record(avroSchema);
|
||||
String key = String.format("%s%04d", "key", i);
|
||||
record.put("_row_key", key);
|
||||
keys.add(key);
|
||||
record.put("time", Integer.toString(RANDOM.nextInt()));
|
||||
record.put("number", i);
|
||||
writer.writeAvro(key, record);
|
||||
recordMap.put(key, record);
|
||||
}
|
||||
writer.close();
|
||||
|
||||
Configuration conf = new Configuration();
|
||||
CacheConfig cacheConfig = new CacheConfig(conf);
|
||||
HoodieHFileReader hoodieHFileReader = new HoodieHFileReader(conf, filePath, cacheConfig, filePath.getFileSystem(conf));
|
||||
List<Pair<String, IndexedRecord>> records = hoodieHFileReader.readAllRecords();
|
||||
records.forEach(entry -> assertEquals(entry.getSecond(), recordMap.get(entry.getFirst())));
|
||||
hoodieHFileReader.close();
|
||||
|
||||
for (int i = 0; i < 20; i++) {
|
||||
int randomRowstoFetch = 5 + RANDOM.nextInt(50);
|
||||
Set<String> rowsToFetch = getRandomKeys(randomRowstoFetch, keys);
|
||||
List<String> rowsList = new ArrayList<>(rowsToFetch);
|
||||
Collections.sort(rowsList);
|
||||
hoodieHFileReader = new HoodieHFileReader(conf, filePath, cacheConfig, filePath.getFileSystem(conf));
|
||||
List<Pair<String, GenericRecord>> result = hoodieHFileReader.readRecords(rowsList);
|
||||
assertEquals(result.size(), randomRowstoFetch);
|
||||
result.forEach(entry -> {
|
||||
assertEquals(entry.getSecond(), recordMap.get(entry.getFirst()));
|
||||
});
|
||||
hoodieHFileReader.close();
|
||||
}
|
||||
}
|
||||
|
||||
private Set<String> getRandomKeys(int count, List<String> keys) {
|
||||
Set<String> rowKeys = new HashSet<>();
|
||||
int totalKeys = keys.size();
|
||||
while (rowKeys.size() < count) {
|
||||
int index = RANDOM.nextInt(totalKeys);
|
||||
if (!rowKeys.contains(index)) {
|
||||
rowKeys.add(keys.get(index));
|
||||
}
|
||||
}
|
||||
return rowKeys;
|
||||
}
|
||||
}
|
||||
@@ -160,9 +160,8 @@ public class TestHoodieBackedMetadata extends TestHoodieMetadataBase {
|
||||
doRollbackAndValidate(testTable, "0000003", "0000004");
|
||||
}
|
||||
|
||||
doWriteOperationAndValidate(testTable, "0000005");
|
||||
|
||||
// trigger an upsert and validate
|
||||
// trigger a couple of upserts
|
||||
doWriteOperation(testTable, "0000005");
|
||||
doWriteOperation(testTable, "0000006");
|
||||
validateMetadata(testTable, true);
|
||||
}
|
||||
@@ -222,9 +221,9 @@ public class TestHoodieBackedMetadata extends TestHoodieMetadataBase {
|
||||
* Test various table operations sync to Metadata Table correctly.
|
||||
*/
|
||||
@ParameterizedTest
|
||||
@EnumSource(HoodieTableType.class)
|
||||
public void testTableOperations(HoodieTableType tableType) throws Exception {
|
||||
init(tableType);
|
||||
@MethodSource("bootstrapAndTableOperationTestArgs")
|
||||
public void testTableOperations(HoodieTableType tableType, boolean enableFullScan) throws Exception {
|
||||
init(tableType, true, enableFullScan);
|
||||
doWriteInsertAndUpsert(testTable);
|
||||
|
||||
// trigger an upsert
|
||||
@@ -236,7 +235,7 @@ public class TestHoodieBackedMetadata extends TestHoodieMetadataBase {
|
||||
}
|
||||
|
||||
// trigger an upsert
|
||||
doWriteOperationAndValidate(testTable, "0000005");
|
||||
doWriteOperation(testTable, "0000005");
|
||||
|
||||
// trigger clean
|
||||
doCleanAndValidate(testTable, "0000006", singletonList("0000001"));
|
||||
@@ -255,7 +254,7 @@ public class TestHoodieBackedMetadata extends TestHoodieMetadataBase {
|
||||
doWriteOperation(testTable, "0000002");
|
||||
doCleanAndValidate(testTable, "0000003", Arrays.asList("0000001"));
|
||||
if (tableType == MERGE_ON_READ) {
|
||||
doCompactionAndValidate(testTable, "0000004");
|
||||
doCompaction(testTable, "0000004");
|
||||
}
|
||||
doWriteOperation(testTable, "0000005");
|
||||
validateMetadata(testTable, emptyList(), true);
|
||||
@@ -288,7 +287,7 @@ public class TestHoodieBackedMetadata extends TestHoodieMetadataBase {
|
||||
doWriteOperationAndValidate(testTable, "0000003");
|
||||
|
||||
// trigger a commit and rollback
|
||||
doWriteOperationAndValidate(testTable, "0000004");
|
||||
doWriteOperation(testTable, "0000004");
|
||||
doRollbackAndValidate(testTable, "0000004", "0000005");
|
||||
|
||||
// trigger few upserts and validate
|
||||
@@ -297,7 +296,7 @@ public class TestHoodieBackedMetadata extends TestHoodieMetadataBase {
|
||||
}
|
||||
validateMetadata(testTable);
|
||||
|
||||
doWriteOperationAndValidate(testTable, "0000010");
|
||||
doWriteOperation(testTable, "0000010");
|
||||
|
||||
// rollback last commit and validate.
|
||||
doRollbackAndValidate(testTable, "0000010", "0000011");
|
||||
@@ -309,7 +308,7 @@ public class TestHoodieBackedMetadata extends TestHoodieMetadataBase {
|
||||
}
|
||||
|
||||
// roll back of delete
|
||||
doWriteOperationAndValidate(testTable, "0000014", DELETE);
|
||||
doWriteOperation(testTable, "0000014", DELETE);
|
||||
doRollbackAndValidate(testTable, "0000014", "0000015");
|
||||
|
||||
// rollback partial commit
|
||||
@@ -394,9 +393,9 @@ public class TestHoodieBackedMetadata extends TestHoodieMetadataBase {
|
||||
syncTableMetadata(writeConfig);
|
||||
validateMetadata(testTable);
|
||||
|
||||
doWriteOperationAndValidate(testTable, "00000003", INSERT);
|
||||
doWriteOperationAndValidate(testTable, "00000004", UPSERT);
|
||||
doWriteOperationAndValidate(testTable, "00000005", UPSERT);
|
||||
doWriteOperation(testTable, "00000003", INSERT);
|
||||
doWriteOperation(testTable, "00000004", UPSERT);
|
||||
doWriteOperation(testTable, "00000005", UPSERT);
|
||||
|
||||
// trigger compaction
|
||||
if (MERGE_ON_READ.equals(tableType)) {
|
||||
@@ -404,13 +403,13 @@ public class TestHoodieBackedMetadata extends TestHoodieMetadataBase {
|
||||
}
|
||||
|
||||
// trigger an upsert
|
||||
doWriteOperationAndValidate(testTable, "00000008");
|
||||
doWriteOperation(testTable, "00000008");
|
||||
// trigger delete
|
||||
doWriteOperationAndValidate(testTable, "00000009", DELETE);
|
||||
doWriteOperation(testTable, "00000009", DELETE);
|
||||
// trigger clean
|
||||
doCleanAndValidate(testTable, "00000010", asList("00000003", "00000004"));
|
||||
// trigger another upsert
|
||||
doWriteOperationAndValidate(testTable, "00000011");
|
||||
doWriteOperation(testTable, "00000011");
|
||||
// trigger clustering
|
||||
doClusterAndValidate(testTable, "00000012");
|
||||
|
||||
@@ -528,7 +527,6 @@ public class TestHoodieBackedMetadata extends TestHoodieMetadataBase {
|
||||
records = dataGen.generateUniqueUpdates(newCommitTime, 10);
|
||||
writeStatuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
|
||||
assertNoWriteErrors(writeStatuses);
|
||||
validateMetadata(client);
|
||||
|
||||
// Write 4 (updates and inserts)
|
||||
newCommitTime = "0000004";
|
||||
@@ -552,7 +550,6 @@ public class TestHoodieBackedMetadata extends TestHoodieMetadataBase {
|
||||
records = dataGen.generateUpdates(newCommitTime, 5);
|
||||
writeStatuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
|
||||
assertNoWriteErrors(writeStatuses);
|
||||
validateMetadata(client);
|
||||
|
||||
// Compaction
|
||||
if (metaClient.getTableType() == HoodieTableType.MERGE_ON_READ) {
|
||||
@@ -568,7 +565,6 @@ public class TestHoodieBackedMetadata extends TestHoodieMetadataBase {
|
||||
JavaRDD<HoodieKey> deleteKeys = jsc.parallelize(records, 1).map(r -> r.getKey());
|
||||
client.startCommitWithTime(newCommitTime);
|
||||
client.delete(deleteKeys, newCommitTime);
|
||||
validateMetadata(client);
|
||||
|
||||
// Clean
|
||||
newCommitTime = "0000009";
|
||||
@@ -1128,7 +1124,7 @@ public class TestHoodieBackedMetadata extends TestHoodieMetadataBase {
|
||||
Collections.sort(fsFileNames);
|
||||
Collections.sort(metadataFilenames);
|
||||
|
||||
assertEquals(fsStatuses.length, partitionToFilesMap.get(basePath + "/" + partition).length);
|
||||
assertEquals(fsStatuses.length, partitionToFilesMap.get(partitionPath.toString()).length);
|
||||
|
||||
// File sizes should be valid
|
||||
Arrays.stream(metaStatuses).forEach(s -> assertTrue(s.getLen() > 0));
|
||||
|
||||
@@ -72,6 +72,10 @@ public class TestHoodieMetadataBase extends HoodieClientTestHarness {
|
||||
}
|
||||
|
||||
/**
 * Initializes the test for the given table type by delegating to the three-argument
 * {@code init} overload with {@code enableFullScan} set to {@code true}.
 *
 * @param tableType table type forwarded to the three-argument overload
 * @param enableMetadataTable flag forwarded unchanged to the three-argument overload
 * @throws IOException if the delegated initialization fails
 */
public void init(HoodieTableType tableType, boolean enableMetadataTable) throws IOException {
|
||||
init(tableType, enableMetadataTable, true);
|
||||
}
|
||||
|
||||
public void init(HoodieTableType tableType, boolean enableMetadataTable, boolean enableFullScan) throws IOException {
|
||||
this.tableType = tableType;
|
||||
initPath();
|
||||
initSparkContexts("TestHoodieMetadata");
|
||||
@@ -80,7 +84,8 @@ public class TestHoodieMetadataBase extends HoodieClientTestHarness {
|
||||
initMetaClient(tableType);
|
||||
initTestDataGenerator();
|
||||
metadataTableBasePath = HoodieTableMetadata.getMetadataTableBasePath(basePath);
|
||||
writeConfig = getWriteConfig(true, enableMetadataTable);
|
||||
writeConfig = getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy.EAGER, true, enableMetadataTable, false,
|
||||
enableFullScan).build();
|
||||
initWriteConfigAndMetatableWriter(writeConfig, enableMetadataTable);
|
||||
}
|
||||
|
||||
@@ -256,7 +261,13 @@ public class TestHoodieMetadataBase extends HoodieClientTestHarness {
|
||||
return getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy.EAGER, autoCommit, useFileListingMetadata, enableMetrics);
|
||||
}
|
||||
|
||||
protected HoodieWriteConfig.Builder getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy policy, boolean autoCommit, boolean useFileListingMetadata, boolean enableMetrics) {
|
||||
protected HoodieWriteConfig.Builder getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy policy, boolean autoCommit, boolean useFileListingMetadata,
|
||||
boolean enableMetrics) {
|
||||
return getWriteConfigBuilder(policy, autoCommit, useFileListingMetadata, enableMetrics, true);
|
||||
}
|
||||
|
||||
protected HoodieWriteConfig.Builder getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy policy, boolean autoCommit, boolean useFileListingMetadata,
|
||||
boolean enableMetrics, boolean enableFullScan) {
|
||||
return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(TRIP_EXAMPLE_SCHEMA)
|
||||
.withParallelism(2, 2).withDeleteParallelism(2).withRollbackParallelism(2).withFinalizeWriteParallelism(2)
|
||||
.withAutoCommit(autoCommit)
|
||||
@@ -271,6 +282,7 @@ public class TestHoodieMetadataBase extends HoodieClientTestHarness {
|
||||
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
|
||||
.withMetadataConfig(HoodieMetadataConfig.newBuilder()
|
||||
.enable(useFileListingMetadata)
|
||||
.enableFullScan(enableFullScan)
|
||||
.enableMetrics(enableMetrics).build())
|
||||
.withMetricsConfig(HoodieMetricsConfig.newBuilder().on(enableMetrics)
|
||||
.withExecutorMetrics(true).build())
|
||||
|
||||
Reference in New Issue
Block a user