[HUDI-684] Introduced abstraction for writing and reading different types of base file formats. (#1687)
Notable changes:
1. HoodieFileWriter and HoodieFileReader abstractions for writer/reader side of a base file format
2. HoodieDataBlock abstraction for creating data blocks specific to each base file format. (e.g. Parquet has HoodieAvroDataBlock)
3. All hardcoded references to Parquet / Parquet-based classes have been abstracted to call methods which accept a base file format
4. HiveSyncTool accepts the base file format as a CLI parameter
5. HoodieDeltaStreamer accepts the base file format as a CLI parameter
6. HoodieSparkSqlWriter accepts the base file format as a parameter
This commit is contained in:
@@ -205,7 +205,7 @@ public class DeltaSync implements Serializable {
|
||||
} else {
|
||||
this.commitTimelineOpt = Option.empty();
|
||||
HoodieTableMetaClient.initTableType(new Configuration(jssc.hadoopConfiguration()), cfg.targetBasePath,
|
||||
cfg.tableType, cfg.targetTableName, "archived", cfg.payloadClassName);
|
||||
cfg.tableType, cfg.targetTableName, "archived", cfg.payloadClassName, cfg.baseFileFormat);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -274,7 +274,7 @@ public class DeltaSync implements Serializable {
|
||||
}
|
||||
} else {
|
||||
HoodieTableMetaClient.initTableType(new Configuration(jssc.hadoopConfiguration()), cfg.targetBasePath,
|
||||
cfg.tableType, cfg.targetTableName, "archived", cfg.payloadClassName);
|
||||
cfg.tableType, cfg.targetTableName, "archived", cfg.payloadClassName, cfg.baseFileFormat);
|
||||
}
|
||||
|
||||
if (!resumeCheckpointStr.isPresent() && cfg.checkpoint != null) {
|
||||
@@ -474,7 +474,7 @@ public class DeltaSync implements Serializable {
|
||||
*/
|
||||
private void syncHive() {
|
||||
if (cfg.enableHiveSync) {
|
||||
HiveSyncConfig hiveSyncConfig = DataSourceUtils.buildHiveSyncConfig(props, cfg.targetBasePath);
|
||||
HiveSyncConfig hiveSyncConfig = DataSourceUtils.buildHiveSyncConfig(props, cfg.targetBasePath, cfg.baseFileFormat);
|
||||
LOG.info("Syncing target hoodie table with hive table(" + hiveSyncConfig.tableName + "). Hive metastore URL :"
|
||||
+ hiveSyncConfig.jdbcUrl + ", basePath :" + cfg.targetBasePath);
|
||||
new HiveSyncTool(hiveSyncConfig, new HiveConf(conf, HiveConf.class), fs).syncHoodieTable();
|
||||
|
||||
@@ -177,6 +177,9 @@ public class HoodieDeltaStreamer implements Serializable {
|
||||
@Parameter(names = {"--table-type"}, description = "Type of table. COPY_ON_WRITE (or) MERGE_ON_READ", required = true)
|
||||
public String tableType;
|
||||
|
||||
@Parameter(names = {"--base-file-format"}, description = "File format for the base files. PARQUET (or) HFILE", required = false)
|
||||
public String baseFileFormat;
|
||||
|
||||
@Parameter(names = {"--props"}, description = "path to properties file on localfs or dfs, with configurations for "
|
||||
+ "hoodie client, schema provider, key generator and data source. For hoodie client props, sane defaults are "
|
||||
+ "used, but recommend use to provide basic things like metrics endpoints, hive configs etc. For sources, refer"
|
||||
@@ -379,8 +382,20 @@ public class HoodieDeltaStreamer implements Serializable {
|
||||
// This will guarantee there is no surprise with table type
|
||||
ValidationUtils.checkArgument(tableType.equals(HoodieTableType.valueOf(cfg.tableType)),
|
||||
"Hoodie table is of type " + tableType + " but passed in CLI argument is " + cfg.tableType);
|
||||
|
||||
// Load base file format
|
||||
// This will guarantee there is no surprise with base file type
|
||||
String baseFileFormat = meta.getTableConfig().getBaseFileFormat().toString();
|
||||
ValidationUtils.checkArgument(baseFileFormat.equals(cfg.baseFileFormat) || cfg.baseFileFormat == null,
|
||||
"Hoodie table's base file format is of type " + baseFileFormat + " but passed in CLI argument is "
|
||||
+ cfg.baseFileFormat);
|
||||
cfg.baseFileFormat = meta.getTableConfig().getBaseFileFormat().toString();
|
||||
this.cfg.baseFileFormat = cfg.baseFileFormat;
|
||||
} else {
|
||||
tableType = HoodieTableType.valueOf(cfg.tableType);
|
||||
if (cfg.baseFileFormat == null) {
|
||||
cfg.baseFileFormat = "PARQUET"; // default for backward compatibility
|
||||
}
|
||||
}
|
||||
|
||||
ValidationUtils.checkArgument(!cfg.filterDupes || cfg.operation != Operation.UPSERT,
|
||||
|
||||
Reference in New Issue
Block a user