1
0

New Features in DeltaStreamer:

(1) Apply transformation when using delta-streamer to ingest data.
 (2) Add Hudi Incremental Source for Delta Streamer
 (3) Allow delta-streamer config-property to be passed as command-line
 (4) Add Hive Integration to Delta-Streamer and address Review comments
 (5) Ensure MultiPartKeysValueExtractor handles Hive-style partition descriptions
 (6) Reuse same spark session on both source and transformer
 (7) Support extracting partition fields from _hoodie_partition_path for HoodieIncrSource
 (8) Reuse Binary Avro coders
 (9) Add push-down filter for Incremental source
 (10) Add Hoodie DeltaStreamer metrics to track total time taken
This commit is contained in:
Balaji Varadarajan
2018-10-10 10:31:34 -07:00
committed by vinoth chandar
parent c70dbc13e9
commit 3a0044216c
65 changed files with 2752 additions and 911 deletions

View File

@@ -90,7 +90,7 @@ public class HoodieHiveClient {
private Connection connection;
private HoodieTimeline activeTimeline;
HoodieHiveClient(HiveSyncConfig cfg, HiveConf configuration, FileSystem fs) {
public HoodieHiveClient(HiveSyncConfig cfg, HiveConf configuration, FileSystem fs) {
this.syncConfig = cfg;
this.fs = fs;
this.metaClient = new HoodieTableMetaClient(fs.getConf(), cfg.basePath, true);
@@ -231,7 +231,7 @@ public class HoodieHiveClient {
/**
* Scan table partitions
*/
List<Partition> scanTablePartitions() throws TException {
public List<Partition> scanTablePartitions() throws TException {
return client.listPartitions(syncConfig.databaseName, syncConfig.tableName, (short) -1);
}
@@ -268,7 +268,7 @@ public class HoodieHiveClient {
/**
* Get the table schema
*/
Map<String, String> getTableSchema() {
public Map<String, String> getTableSchema() {
if (!doesTableExist()) {
throw new IllegalArgumentException(
"Failed to get schema for table " + syncConfig.tableName + " does not exist");
@@ -435,7 +435,7 @@ public class HoodieHiveClient {
/**
* @return true if the configured table exists
*/
boolean doesTableExist() {
public boolean doesTableExist() {
try {
return client.tableExists(syncConfig.databaseName, syncConfig.tableName);
} catch (TException e) {
@@ -449,7 +449,7 @@ public class HoodieHiveClient {
*
* @param s SQL to execute
*/
void updateHiveSQL(String s) {
public void updateHiveSQL(String s) {
Statement stmt = null;
try {
stmt = connection.createStatement();
@@ -468,8 +468,10 @@ public class HoodieHiveClient {
BasicDataSource ds = new HiveDataSource();
ds.setDriverClassName(HiveDriver.class.getCanonicalName());
ds.setUrl(getHiveJdbcUrlWithDefaultDBName());
ds.setUsername(syncConfig.hiveUser);
ds.setPassword(syncConfig.hivePass);
if (syncConfig.hiveUser != null) {
ds.setUsername(syncConfig.hiveUser);
ds.setPassword(syncConfig.hivePass);
}
LOG.info("Getting Hive Connection from Datasource " + ds);
try {
this.connection = ds.getConnection();
@@ -520,7 +522,7 @@ public class HoodieHiveClient {
return fs;
}
Optional<String> getLastCommitTimeSynced() {
public Optional<String> getLastCommitTimeSynced() {
// Get the last commit time from the TBLproperties
try {
Table database = client.getTable(syncConfig.databaseName, syncConfig.tableName);
@@ -532,7 +534,7 @@ public class HoodieHiveClient {
}
}
void close() {
public void close() {
try {
if (connection != null) {
connection.close();
@@ -548,7 +550,7 @@ public class HoodieHiveClient {
@SuppressWarnings("OptionalUsedAsFieldOrParameterType")
List<String> getPartitionsWrittenToSince(Optional<String> lastCommitTimeSynced) {
if (!lastCommitTimeSynced.isPresent()) {
LOG.info("Last commit time synced is not known, listing all partitions");
LOG.info("Last commit time synced is not known, listing all partitions in " + syncConfig.basePath + ",FS :" + fs);
try {
return FSUtils.getAllPartitionPaths(fs, syncConfig.basePath,
syncConfig.assumeDatePartitioning);
@@ -573,6 +575,10 @@ public class HoodieHiveClient {
}
}
List<String> getAllTables(String db) throws Exception {
return client.getAllTables(db);
}
void updateLastCommitTimeSynced() {
// Set the last commit time from the TBLproperties
String lastCommitSynced = activeTimeline.lastInstant().get().getTimestamp();

View File

@@ -16,8 +16,10 @@
package com.uber.hoodie.hive;
import com.google.common.base.Preconditions;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
/**
* Partition Key extractor treating each value delimited by slash as separate key.
@@ -27,6 +29,14 @@ public class MultiPartKeysValueExtractor implements PartitionValueExtractor {
@Override
public List<String> extractPartitionValuesInPath(String partitionPath) {
String[] splits = partitionPath.split("/");
return Arrays.asList(splits);
return Arrays.stream(splits).map(s -> {
if (s.contains("=")) {
String[] moreSplit = s.split("=");
Preconditions.checkArgument(moreSplit.length == 2,
"Partition Field (" + s + ") not in expected format");
return moreSplit[1];
}
return s;
}).collect(Collectors.toList());
}
}

View File

@@ -18,6 +18,7 @@
package com.uber.hoodie.hive;
import java.io.Serializable;
import java.util.List;
/**
@@ -28,7 +29,7 @@ import java.util.List;
* e.g. Hive table partitioned by datestr=yyyy-mm-dd and hdfs path
* /app/hoodie/dataset1/YYYY=[yyyy]/MM=[mm]/DD=[dd]
*/
public interface PartitionValueExtractor {
public interface PartitionValueExtractor extends Serializable {
List<String> extractPartitionValuesInPath(String partitionPath);
}

View File

@@ -33,12 +33,19 @@ import org.joda.time.format.DateTimeFormatter;
*/
public class SlashEncodedDayPartitionValueExtractor implements PartitionValueExtractor {
private final DateTimeFormatter dtfOut;
private transient DateTimeFormatter dtfOut;
public SlashEncodedDayPartitionValueExtractor() {
this.dtfOut = DateTimeFormat.forPattern("yyyy-MM-dd");
}
private DateTimeFormatter getDtfOut() {
if (dtfOut == null) {
dtfOut = DateTimeFormat.forPattern("yyyy-MM-dd");
}
return dtfOut;
}
@Override
public List<String> extractPartitionValuesInPath(String partitionPath) {
// partition path is expected to be in this format yyyy/mm/dd
@@ -52,6 +59,6 @@ public class SlashEncodedDayPartitionValueExtractor implements PartitionValueExt
int mm = Integer.parseInt(splits[1]);
int dd = Integer.parseInt(splits[2]);
DateTime dateTime = new DateTime(year, mm, dd, 0, 0);
return Lists.newArrayList(dtfOut.print(dateTime));
return Lists.newArrayList(getDtfOut().print(dateTime));
}
}