1
0

[HUDI-684] Introduced abstraction for writing and reading different types of base file formats. (#1687)

Notable changes:
    1. HoodieFileWriter and HoodieFileReader abstractions for writer/reader side of a base file format
    2. HoodieDataBlock abstraction for creating data blocks specific to each base file format (e.g. Parquet has HoodieAvroDataBlock)
    3. All hardcoded references to Parquet / Parquet-based classes have been abstracted to call methods which accept a base file format
    4. HiveSyncTool accepts the base file format as a CLI parameter
    5. HoodieDeltaStreamer accepts the base file format as a CLI parameter
    6. HoodieSparkSqlWriter accepts the base file format as a parameter
This commit is contained in:
Prashant Wason
2020-06-25 23:46:55 -07:00
committed by GitHub
parent 5e47673341
commit 2603cfb33e
55 changed files with 1086 additions and 466 deletions

View File

@@ -18,6 +18,7 @@
package org.apache.hudi.integ;
import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.util.CollectionUtils;
import org.apache.hudi.common.util.collection.Pair;
@@ -67,6 +68,8 @@ public class ITTestHoodieDemo extends ITTestBase {
private static final String HIVE_INCREMENTAL_MOR_RO_COMMANDS = HOODIE_WS_ROOT + "/docker/demo/hive-incremental-mor-ro.commands";
private static final String HIVE_INCREMENTAL_MOR_RT_COMMANDS = HOODIE_WS_ROOT + "/docker/demo/hive-incremental-mor-rt.commands";
private static HoodieFileFormat baseFileFormat;
private static String HIVE_SYNC_CMD_FMT =
" --enable-hive-sync --hoodie-conf hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000 "
+ " --hoodie-conf hoodie.datasource.hive_sync.username=hive "
@@ -76,7 +79,9 @@ public class ITTestHoodieDemo extends ITTestBase {
+ " --hoodie-conf hoodie.datasource.hive_sync.table=%s";
@Test
public void testDemo() throws Exception {
public void testParquetDemo() throws Exception {
baseFileFormat = HoodieFileFormat.PARQUET;
setupDemo();
// batch 1
@@ -122,6 +127,7 @@ public class ITTestHoodieDemo extends ITTestBase {
List<String> cmds = CollectionUtils.createImmutableList(
"spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer " + HUDI_UTILITIES_BUNDLE
+ " --table-type COPY_ON_WRITE "
+ " --base-file-format " + baseFileFormat.toString()
+ " --source-class org.apache.hudi.utilities.sources.JsonDFSSource --source-ordering-field ts "
+ " --target-base-path " + COW_BASE_PATH + " --target-table " + COW_TABLE_NAME
+ " --props /var/demo/config/dfs-source.properties"
@@ -130,12 +136,14 @@ public class ITTestHoodieDemo extends ITTestBase {
+ " --database default"
+ " --table " + COW_TABLE_NAME
+ " --base-path " + COW_BASE_PATH
+ " --base-file-format " + baseFileFormat.toString()
+ " --user hive"
+ " --pass hive"
+ " --jdbc-url jdbc:hive2://hiveserver:10000"
+ " --partitioned-by dt",
("spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer " + HUDI_UTILITIES_BUNDLE
+ " --table-type MERGE_ON_READ "
+ " --base-file-format " + baseFileFormat.toString()
+ " --source-class org.apache.hudi.utilities.sources.JsonDFSSource --source-ordering-field ts "
+ " --target-base-path " + MOR_BASE_PATH + " --target-table " + MOR_TABLE_NAME
+ " --props /var/demo/config/dfs-source.properties"