
[HUDI-1294] Adding inline read and seek based read(batch get) for hfile log blocks in metadata table (#3762)

Author: Sivabalan Narayanan
Date: 2021-10-29 12:12:44 -04:00
Committed by: GitHub
Parent: 0223c442ec
Commit: 69ee790a47
17 changed files with 591 additions and 142 deletions
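
In short, the metadata table's HFile log blocks can now be read two ways: a full scan that materializes every record, or a seek-based batch get over a sorted list of keys. Below is an illustrative sketch only, not code from the commit: it uses just the HoodieHFileReader calls exercised by the new test in this diff (readAllRecords and readRecords); the method name, path, and keys are hypothetical, and imports are as in that test file.

// Illustrative sketch (not part of the commit): the two read paths, using only the
// HoodieHFileReader API exercised by the new test below; imports as in that test file.
public void readPathsSketch(Configuration conf, Path hfilePath) throws Exception {
  CacheConfig cacheConfig = new CacheConfig(conf);
  HoodieHFileReader reader = new HoodieHFileReader(conf, hfilePath, cacheConfig, hfilePath.getFileSystem(conf));

  // Full scan: materialize every record in the HFile log block.
  List<Pair<String, IndexedRecord>> allRecords = reader.readAllRecords();

  // Seek-based batch get: sort the keys first so the reader can seek forward through the file once.
  List<String> keys = new ArrayList<>();
  keys.add("key0042"); // hypothetical keys
  keys.add("key0007");
  keys.add("key0019");
  Collections.sort(keys);
  List<Pair<String, GenericRecord>> matched = reader.readRecords(keys);

  reader.close();
}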

TestHoodieHFileReaderWriter.java (new file)

@@ -0,0 +1,134 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.io.storage;

import org.apache.hudi.common.bloom.BloomFilter;
import org.apache.hudi.common.bloom.BloomFilterFactory;
import org.apache.hudi.common.bloom.BloomFilterTypeCode;
import org.apache.hudi.common.engine.TaskContextSupplier;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.io.compress.Compression;
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
import org.apache.hadoop.hbase.util.Pair;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
import org.mockito.Mockito;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import static org.apache.hudi.common.testutils.FileSystemTestUtils.RANDOM;
import static org.apache.hudi.common.testutils.SchemaTestUtil.getSchemaFromResource;
import static org.junit.jupiter.api.Assertions.assertEquals;

public class TestHoodieHFileReaderWriter {
  @TempDir
  File tempDir;
  private Path filePath;

  @BeforeEach
  public void setup() throws IOException {
    // Use a path separator so the file actually lands inside the temp directory.
    filePath = new Path(tempDir.toString() + "/tempFile.txt");
  }

  @AfterEach
  public void clearTempFile() {
    File file = new File(filePath.toString());
    if (file.exists()) {
      file.delete();
    }
  }
  private HoodieHFileWriter createHFileWriter(Schema avroSchema) throws Exception {
    BloomFilter filter = BloomFilterFactory.createBloomFilter(1000, 0.00001, -1, BloomFilterTypeCode.SIMPLE.name());
    Configuration conf = new Configuration();
    TaskContextSupplier mockTaskContextSupplier = Mockito.mock(TaskContextSupplier.class);
    String instantTime = "000";
    HoodieHFileConfig hoodieHFileConfig = new HoodieHFileConfig(conf, Compression.Algorithm.GZ, 1024 * 1024, 120 * 1024 * 1024,
        filter);
    return new HoodieHFileWriter(instantTime, filePath, hoodieHFileConfig, avroSchema, mockTaskContextSupplier);
  }
  @Test
  public void testWriteReadHFile() throws Exception {
    Schema avroSchema = getSchemaFromResource(TestHoodieHFileReaderWriter.class, "/exampleSchema.avsc");
    HoodieHFileWriter writer = createHFileWriter(avroSchema);
    List<String> keys = new ArrayList<>();
    Map<String, GenericRecord> recordMap = new HashMap<>();
    for (int i = 0; i < 100; i++) {
      GenericRecord record = new GenericData.Record(avroSchema);
      String key = String.format("%s%04d", "key", i);
      record.put("_row_key", key);
      keys.add(key);
      record.put("time", Integer.toString(RANDOM.nextInt()));
      record.put("number", i);
      writer.writeAvro(key, record);
      recordMap.put(key, record);
    }
    writer.close();

    Configuration conf = new Configuration();
    CacheConfig cacheConfig = new CacheConfig(conf);
    // Full-scan read path: every record in the file should round-trip.
    HoodieHFileReader hoodieHFileReader = new HoodieHFileReader(conf, filePath, cacheConfig, filePath.getFileSystem(conf));
    List<Pair<String, IndexedRecord>> records = hoodieHFileReader.readAllRecords();
    records.forEach(entry -> assertEquals(entry.getSecond(), recordMap.get(entry.getFirst())));
    hoodieHFileReader.close();

    // Seek-based read path: batch-get random subsets of sorted keys.
    for (int i = 0; i < 20; i++) {
      int randomRowsToFetch = 5 + RANDOM.nextInt(50);
      Set<String> rowsToFetch = getRandomKeys(randomRowsToFetch, keys);
      List<String> rowsList = new ArrayList<>(rowsToFetch);
      Collections.sort(rowsList);
      hoodieHFileReader = new HoodieHFileReader(conf, filePath, cacheConfig, filePath.getFileSystem(conf));
      List<Pair<String, GenericRecord>> result = hoodieHFileReader.readRecords(rowsList);
      assertEquals(result.size(), randomRowsToFetch);
      result.forEach(entry -> assertEquals(entry.getSecond(), recordMap.get(entry.getFirst())));
      hoodieHFileReader.close();
    }
  }
  private Set<String> getRandomKeys(int count, List<String> keys) {
    Set<String> rowKeys = new HashSet<>();
    int totalKeys = keys.size();
    while (rowKeys.size() < count) {
      int index = RANDOM.nextInt(totalKeys);
      // Add the key itself; the set de-duplicates repeated picks until `count` distinct keys remain.
      rowKeys.add(keys.get(index));
    }
    return rowKeys;
  }
}

TestHoodieBackedMetadata.java

@@ -160,9 +160,8 @@ public class TestHoodieBackedMetadata extends TestHoodieMetadataBase {
       doRollbackAndValidate(testTable, "0000003", "0000004");
     }
-    doWriteOperationAndValidate(testTable, "0000005");
-    // trigger an upsert and validate
+    // trigger couple of upserts
+    doWriteOperation(testTable, "0000005");
+    doWriteOperation(testTable, "0000006");
+    validateMetadata(testTable, true);
   }
@@ -222,9 +221,9 @@ public class TestHoodieBackedMetadata extends TestHoodieMetadataBase {
    * Test various table operations sync to Metadata Table correctly.
    */
   @ParameterizedTest
-  @EnumSource(HoodieTableType.class)
-  public void testTableOperations(HoodieTableType tableType) throws Exception {
-    init(tableType);
+  @MethodSource("bootstrapAndTableOperationTestArgs")
+  public void testTableOperations(HoodieTableType tableType, boolean enableFullScan) throws Exception {
+    init(tableType, true, enableFullScan);
     doWriteInsertAndUpsert(testTable);

     // trigger an upsert
@@ -236,7 +235,7 @@ public class TestHoodieBackedMetadata extends TestHoodieMetadataBase {
     }
     // trigger an upsert
-    doWriteOperationAndValidate(testTable, "0000005");
+    doWriteOperation(testTable, "0000005");

     // trigger clean
     doCleanAndValidate(testTable, "0000006", singletonList("0000001"));
@@ -255,7 +254,7 @@ public class TestHoodieBackedMetadata extends TestHoodieMetadataBase {
     doWriteOperation(testTable, "0000002");
     doCleanAndValidate(testTable, "0000003", Arrays.asList("0000001"));
     if (tableType == MERGE_ON_READ) {
-      doCompactionAndValidate(testTable, "0000004");
+      doCompaction(testTable, "0000004");
     }
     doWriteOperation(testTable, "0000005");
     validateMetadata(testTable, emptyList(), true);
@@ -288,7 +287,7 @@ public class TestHoodieBackedMetadata extends TestHoodieMetadataBase {
     doWriteOperationAndValidate(testTable, "0000003");

     // trigger a commit and rollback
-    doWriteOperationAndValidate(testTable, "0000004");
+    doWriteOperation(testTable, "0000004");
     doRollbackAndValidate(testTable, "0000004", "0000005");

     // trigger few upserts and validate
@@ -297,7 +296,7 @@ public class TestHoodieBackedMetadata extends TestHoodieMetadataBase {
     }
     validateMetadata(testTable);

-    doWriteOperationAndValidate(testTable, "0000010");
+    doWriteOperation(testTable, "0000010");

     // rollback last commit. and validate.
     doRollbackAndValidate(testTable, "0000010", "0000011");
@@ -309,7 +308,7 @@ public class TestHoodieBackedMetadata extends TestHoodieMetadataBase {
     }

     // roll back of delete
-    doWriteOperationAndValidate(testTable, "0000014", DELETE);
+    doWriteOperation(testTable, "0000014", DELETE);
     doRollbackAndValidate(testTable, "0000014", "0000015");

     // rollback partial commit
@@ -394,9 +393,9 @@ public class TestHoodieBackedMetadata extends TestHoodieMetadataBase {
     syncTableMetadata(writeConfig);
     validateMetadata(testTable);

-    doWriteOperationAndValidate(testTable, "00000003", INSERT);
-    doWriteOperationAndValidate(testTable, "00000004", UPSERT);
-    doWriteOperationAndValidate(testTable, "00000005", UPSERT);
+    doWriteOperation(testTable, "00000003", INSERT);
+    doWriteOperation(testTable, "00000004", UPSERT);
+    doWriteOperation(testTable, "00000005", UPSERT);

     // trigger compaction
     if (MERGE_ON_READ.equals(tableType)) {
@@ -404,13 +403,13 @@ public class TestHoodieBackedMetadata extends TestHoodieMetadataBase {
     }

     // trigger an upsert
-    doWriteOperationAndValidate(testTable, "00000008");
+    doWriteOperation(testTable, "00000008");

     // trigger delete
-    doWriteOperationAndValidate(testTable, "00000009", DELETE);
+    doWriteOperation(testTable, "00000009", DELETE);

     // trigger clean
     doCleanAndValidate(testTable, "00000010", asList("00000003", "00000004"));

     // trigger another upsert
-    doWriteOperationAndValidate(testTable, "00000011");
+    doWriteOperation(testTable, "00000011");

     // trigger clustering
     doClusterAndValidate(testTable, "00000012");
@@ -528,7 +527,6 @@ public class TestHoodieBackedMetadata extends TestHoodieMetadataBase {
     records = dataGen.generateUniqueUpdates(newCommitTime, 10);
     writeStatuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
     assertNoWriteErrors(writeStatuses);
-    validateMetadata(client);

     // Write 4 (updates and inserts)
     newCommitTime = "0000004";
@@ -552,7 +550,6 @@ public class TestHoodieBackedMetadata extends TestHoodieMetadataBase {
     records = dataGen.generateUpdates(newCommitTime, 5);
     writeStatuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
     assertNoWriteErrors(writeStatuses);
-    validateMetadata(client);

     // Compaction
     if (metaClient.getTableType() == HoodieTableType.MERGE_ON_READ) {
@@ -568,7 +565,6 @@ public class TestHoodieBackedMetadata extends TestHoodieMetadataBase {
     JavaRDD<HoodieKey> deleteKeys = jsc.parallelize(records, 1).map(r -> r.getKey());
     client.startCommitWithTime(newCommitTime);
     client.delete(deleteKeys, newCommitTime);
-    validateMetadata(client);

     // Clean
     newCommitTime = "0000009";
@@ -1128,7 +1124,7 @@ public class TestHoodieBackedMetadata extends TestHoodieMetadataBase {
     Collections.sort(fsFileNames);
     Collections.sort(metadataFilenames);
-    assertEquals(fsStatuses.length, partitionToFilesMap.get(basePath + "/" + partition).length);
+    assertEquals(fsStatuses.length, partitionToFilesMap.get(partitionPath.toString()).length);

     // File sizes should be valid
     Arrays.stream(metaStatuses).forEach(s -> assertTrue(s.getLen() > 0));

TestHoodieMetadataBase.java

@@ -72,6 +72,10 @@ public class TestHoodieMetadataBase extends HoodieClientTestHarness {
   }

   public void init(HoodieTableType tableType, boolean enableMetadataTable) throws IOException {
+    init(tableType, enableMetadataTable, true);
+  }
+
+  public void init(HoodieTableType tableType, boolean enableMetadataTable, boolean enableFullScan) throws IOException {
     this.tableType = tableType;
     initPath();
     initSparkContexts("TestHoodieMetadata");
@@ -80,7 +84,8 @@ public class TestHoodieMetadataBase extends HoodieClientTestHarness {
     initMetaClient(tableType);
     initTestDataGenerator();
     metadataTableBasePath = HoodieTableMetadata.getMetadataTableBasePath(basePath);
-    writeConfig = getWriteConfig(true, enableMetadataTable);
+    writeConfig = getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy.EAGER, true, enableMetadataTable, false,
+        enableFullScan).build();
     initWriteConfigAndMetatableWriter(writeConfig, enableMetadataTable);
   }
@@ -256,7 +261,13 @@ public class TestHoodieMetadataBase extends HoodieClientTestHarness {
     return getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy.EAGER, autoCommit, useFileListingMetadata, enableMetrics);
   }

-  protected HoodieWriteConfig.Builder getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy policy, boolean autoCommit, boolean useFileListingMetadata, boolean enableMetrics) {
+  protected HoodieWriteConfig.Builder getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy policy, boolean autoCommit, boolean useFileListingMetadata,
+                                                            boolean enableMetrics) {
+    return getWriteConfigBuilder(policy, autoCommit, useFileListingMetadata, enableMetrics, true);
+  }
+
+  protected HoodieWriteConfig.Builder getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy policy, boolean autoCommit, boolean useFileListingMetadata,
+                                                            boolean enableMetrics, boolean enableFullScan) {
     return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(TRIP_EXAMPLE_SCHEMA)
         .withParallelism(2, 2).withDeleteParallelism(2).withRollbackParallelism(2).withFinalizeWriteParallelism(2)
         .withAutoCommit(autoCommit)
@@ -271,6 +282,7 @@ public class TestHoodieMetadataBase extends HoodieClientTestHarness {
         .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
         .withMetadataConfig(HoodieMetadataConfig.newBuilder()
             .enable(useFileListingMetadata)
+            .enableFullScan(enableFullScan)
             .enableMetrics(enableMetrics).build())
         .withMetricsConfig(HoodieMetricsConfig.newBuilder().on(enableMetrics)
             .withExecutorMetrics(true).build())
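
For callers, the new flag surfaces through HoodieMetadataConfig. Below is a hedged sketch of toggling the two read paths, using only the builder methods visible in the diff above (enable, enableFullScan, enableMetrics); treat it as illustrative rather than a complete production configuration.

// Sketch: route metadata reads through seek-based point lookups instead of full scans.
HoodieMetadataConfig metadataConfig = HoodieMetadataConfig.newBuilder()
    .enable(true)            // back file listings with the metadata table
    .enableFullScan(false)   // false: batch get via seeks into HFile log blocks
    .enableMetrics(false)
    .build();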