1
0

[HUDI-960] Implementation of the HFile base and log file format. (#1804)

* [HUDI-960] Implementation of the HFile base and log file format.

1. Includes HFileWriter and HFileReader
2. Includes HFileInputFormat for both snapshot and realtime input format for Hive
3. Unit test for new code
4. IT for using HFile format and querying using Hive (Presto and SparkSQL are not supported)

Advantage:
HFile file format saves data as binary key-value pairs. This implementation chooses the following values:
1. Key = Hoodie Record Key (as bytes)
2. Value = Avro encoded GenericRecord (as bytes)

HFile allows efficient lookup of a record by key or range of keys. Hence, this base file format is well suited to applications like RFC-15 and RFC-08, which will benefit from the ability to look up records by key or search in a range of keys without having to read the entire data/log format.

Limitations:
HFile storage format has certain limitations when used as a general purpose data storage format.
1. Does not have an implemented reader for Presto and SparkSQL
2. Is not a columnar file format and hence may lead to lower compression levels and greater IO on query side due to lack of column pruning


Other changes: 
 - Remove databricks/avro from pom
 - Fix HoodieClientTestUtils to no longer use scala imports/reflection based conversion etc
 - Breaking up limitFileSize(), per parquet and hfile base files
 - Added three new configs for HoodieHFileConfig - prefetchBlocksOnOpen, cacheDataInL1, dropBehindCacheCompaction
 - Throw UnsupportedException in HFileReader.getRecordKeys()
 - Updated HoodieCopyOnWriteTable to create the correct merge handle (HoodieSortedMergeHandle for HFile and HoodieMergeHandle otherwise)

* Fixing checkstyle

Co-authored-by: Vinoth Chandar <vinoth@apache.org>
This commit is contained in:
Prashant Wason
2020-08-31 08:05:59 -07:00
committed by GitHub
parent 6df8f88d86
commit 6461927eac
54 changed files with 2224 additions and 295 deletions

View File

@@ -1,30 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.common.functional;
import org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockType;
/**
 * Tests Avro log format {@link org.apache.hudi.common.table.log.block.HoodieAvroDataBlock}.
 *
 * <p>Specializes {@link TestHoodieLogFormat} by handing it
 * {@link HoodieLogBlockType#AVRO_DATA_BLOCK} as the data block type.
 */
public class TestHoodieAvroLogFormat extends TestHoodieLogFormat {

  /**
   * Creates the test instance, passing the Avro data block type to the parent constructor.
   */
  public TestHoodieAvroLogFormat() {
    super(HoodieLogBlockType.AVRO_DATA_BLOCK);
  }
}

View File

@@ -34,6 +34,7 @@ import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock;
import org.apache.hudi.common.table.log.block.HoodieCommandBlock;
import org.apache.hudi.common.table.log.block.HoodieDataBlock;
import org.apache.hudi.common.table.log.block.HoodieDeleteBlock;
import org.apache.hudi.common.table.log.block.HoodieHFileDataBlock;
import org.apache.hudi.common.table.log.block.HoodieLogBlock;
import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType;
import org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockType;
@@ -56,6 +57,7 @@ import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.EnumSource;
import org.junit.jupiter.params.provider.ValueSource;
import java.io.IOException;
@@ -82,20 +84,13 @@ import static org.junit.jupiter.api.Assertions.assertTrue;
* Tests hoodie log format {@link HoodieLogFormat}.
*/
@SuppressWarnings("Duplicates")
public abstract class TestHoodieLogFormat extends HoodieCommonTestHarness {
public class TestHoodieLogFormat extends HoodieCommonTestHarness {
private static String BASE_OUTPUT_PATH = "/tmp/";
private FileSystem fs;
private Path partitionPath;
private int bufferSize = 4096;
private HoodieLogBlockType dataBlockType;
public TestHoodieLogFormat(HoodieLogBlockType dataBlockType) {
this.dataBlockType = dataBlockType;
}
private TestHoodieLogFormat() {
}
private HoodieLogBlockType dataBlockType = HoodieLogBlockType.AVRO_DATA_BLOCK;
@BeforeAll
public static void setUpClass() throws IOException, InterruptedException {
@@ -133,8 +128,9 @@ public abstract class TestHoodieLogFormat extends HoodieCommonTestHarness {
assertEquals(1, writer.getLogFile().getLogVersion(), "Version should be 1 for new log created");
}
@Test
public void testBasicAppend() throws IOException, InterruptedException, URISyntaxException {
@ParameterizedTest
@EnumSource(names = { "AVRO_DATA_BLOCK", "HFILE_DATA_BLOCK" })
public void testBasicAppend(HoodieLogBlockType dataBlockType) throws IOException, InterruptedException, URISyntaxException {
Writer writer =
HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION)
.withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
@@ -142,7 +138,7 @@ public abstract class TestHoodieLogFormat extends HoodieCommonTestHarness {
Map<HeaderMetadataType, String> header = new HashMap<>();
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
HoodieDataBlock dataBlock = getDataBlock(records, header);
HoodieDataBlock dataBlock = getDataBlock(dataBlockType, records, header);
writer = writer.appendBlock(dataBlock);
long size = writer.getCurrentSize();
assertTrue(size > 0, "We just wrote a block - size should be > 0");
@@ -151,7 +147,8 @@ public abstract class TestHoodieLogFormat extends HoodieCommonTestHarness {
writer.close();
}
@Test
@ParameterizedTest
@EnumSource(names = { "AVRO_DATA_BLOCK", "HFILE_DATA_BLOCK" })
public void testRollover() throws IOException, InterruptedException, URISyntaxException {
Writer writer =
HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION)
@@ -335,7 +332,8 @@ public abstract class TestHoodieLogFormat extends HoodieCommonTestHarness {
assertEquals(2, statuses.length);
}
@Test
@ParameterizedTest
@EnumSource(names = { "AVRO_DATA_BLOCK", "HFILE_DATA_BLOCK" })
public void testBasicWriteAndScan() throws IOException, URISyntaxException, InterruptedException {
Writer writer =
HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION)
@@ -363,7 +361,8 @@ public abstract class TestHoodieLogFormat extends HoodieCommonTestHarness {
reader.close();
}
@Test
@ParameterizedTest
@EnumSource(names = { "AVRO_DATA_BLOCK", "HFILE_DATA_BLOCK" })
public void testBasicAppendAndRead() throws IOException, URISyntaxException, InterruptedException {
Writer writer =
HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION)
@@ -1440,9 +1439,16 @@ public abstract class TestHoodieLogFormat extends HoodieCommonTestHarness {
}
/**
 * Builds a data block for the given records and header, using the block type held in this
 * instance's {@code dataBlockType} field.
 *
 * @param records records to place in the block
 * @param header  header metadata for the block
 * @return the block produced by the overload that takes an explicit block type
 */
private HoodieDataBlock getDataBlock(List<IndexedRecord> records, Map<HeaderMetadataType, String> header) {
  final HoodieLogBlockType configuredType = this.dataBlockType;
  return getDataBlock(configuredType, records, header);
}
private HoodieDataBlock getDataBlock(HoodieLogBlockType dataBlockType, List<IndexedRecord> records,
Map<HeaderMetadataType, String> header) {
switch (dataBlockType) {
case AVRO_DATA_BLOCK:
return new HoodieAvroDataBlock(records, header);
case HFILE_DATA_BLOCK:
return new HoodieHFileDataBlock(records, header);
default:
throw new RuntimeException("Unknown data block type " + dataBlockType);
}