1
0

[HUDI-960] Implementation of the HFile base and log file format. (#1804)

* [HUDI-960] Implementation of the HFile base and log file format.

1. Includes HFileWriter and HFileReader
2. Includes HFileInputFormat for both snapshot and realtime input format for Hive
3. Unit test for new code
4. IT for using HFile format and querying using Hive (Presto and SparkSQL are not supported)

Advantage:
HFile file format saves data as binary key-value pairs. This implementation chooses the following values:
1. Key = Hoodie Record Key (as bytes)
2. Value = Avro encoded GenericRecord (as bytes)

HFile allows efficient lookup of a record by key or range of keys. Hence, this base file format is well suited to applications like RFC-15, RFC-08 which will benefit from the ability to lookup records by key or search in a range of keys without having to read the entire data/log format.

Limitations:
HFile storage format has certain limitations when used as a general purpose data storage format.
1. Does not have an implemented reader for Presto or SparkSQL
2. Is not a columnar file format and hence may lead to lower compression levels and greater IO on query side due to lack of column pruning


Other changes: 
 - Remove databricks/avro from pom
 - Fix HoodieClientTestUtils to stop using Scala imports / reflection-based conversion, etc.
 - Breaking up limitFileSize(), per parquet and hfile base files
 - Added three new configs for HoodieHFileConfig - prefetchBlocksOnOpen, cacheDataInL1, dropBehindCacheCompaction
 - Throw UnsupportedException in HFileReader.getRecordKeys()
 - Updated HoodieCopyOnWriteTable to create the correct merge handle (HoodieSortedMergeHandle for HFile and HoodieMergeHandle otherwise)

* Fixing checkstyle

Co-authored-by: Vinoth Chandar <vinoth@apache.org>
This commit is contained in:
Prashant Wason
2020-08-31 08:05:59 -07:00
committed by GitHub
parent 6df8f88d86
commit 6461927eac
54 changed files with 2224 additions and 295 deletions

View File

@@ -255,7 +255,7 @@ public abstract class ITTestBase {
try {
// save up the Hive log files for introspection
String hiveLogStr =
executeCommandStringInDocker(HIVESERVER, "cat /tmp/root/hive.log | grep -i exception -A 10 -B 5", true).getStdout().toString();
executeCommandStringInDocker(HIVESERVER, "cat /tmp/root/hive.log | grep -i exception -A 10 -B 5", false).getStdout().toString();
String filePath = System.getProperty("java.io.tmpdir") + "/" + System.currentTimeMillis() + "-hive.log";
FileIOUtils.writeStringToFile(hiveLogStr, filePath);
LOG.info("Hive log saved up at : " + filePath);

View File

@@ -23,6 +23,8 @@ import org.apache.hudi.common.util.CollectionUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.keygen.SimpleKeyGenerator;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import java.util.Collections;
@@ -77,7 +79,7 @@ public class ITTestHoodieDemo extends ITTestBase {
private static final String HIVE_INCREMENTAL_MOR_RO_COMMANDS = HOODIE_WS_ROOT + "/docker/demo/hive-incremental-mor-ro.commands";
private static final String HIVE_INCREMENTAL_MOR_RT_COMMANDS = HOODIE_WS_ROOT + "/docker/demo/hive-incremental-mor-rt.commands";
private static HoodieFileFormat baseFileFormat;
private HoodieFileFormat baseFileFormat;
private static String HIVE_SYNC_CMD_FMT =
" --enable-hive-sync --hoodie-conf hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000 "
@@ -115,6 +117,36 @@ public class ITTestHoodieDemo extends ITTestBase {
testIncrementalHiveQueryAfterCompaction();
}
@Test
@Disabled
// Runs the full demo flow (setup, two ingest batches with Hive sync, Hive
// queries, then compaction) using the HFILE base file format instead of the
// default. Disabled: Presto and SparkSQL readers for HFile are not yet
// implemented, so those query steps below are commented out.
public void testHFileDemo() throws Exception {
baseFileFormat = HoodieFileFormat.HFILE;
// TODO: Presto and SparkSQL support for HFile format
setupDemo();
// batch 1
ingestFirstBatchAndHiveSync();
testHiveAfterFirstBatch();
//testPrestoAfterFirstBatch();
//testSparkSQLAfterFirstBatch();
// batch 2
ingestSecondBatchAndHiveSync();
testHiveAfterSecondBatch();
//testPrestoAfterSecondBatch();
//testSparkSQLAfterSecondBatch();
testIncrementalHiveQueryBeforeCompaction();
//testIncrementalSparkSQLQuery();
// compaction
scheduleAndRunCompaction();
testHiveAfterSecondBatchAfterCompaction();
//testPrestoAfterSecondBatchAfterCompaction();
//testIncrementalHiveQueryAfterCompaction();
}
private void setupDemo() throws Exception {
List<String> cmds = CollectionUtils.createImmutableList("hdfs dfsadmin -safemode wait",
"hdfs dfs -mkdir -p " + HDFS_DATA_DIR,