New Features in DeltaStreamer:
(1) Apply transformations when using DeltaStreamer to ingest data. (2) Add a Hudi Incremental Source for DeltaStreamer. (3) Allow DeltaStreamer config properties to be passed on the command line. (4) Add Hive integration to DeltaStreamer and address review comments. (5) Ensure MultiPartKeysValueExtractor handles Hive-style partition descriptions. (6) Reuse the same Spark session for both source and transformer. (7) Support extracting partition fields from _hoodie_partition_path for HoodieIncrSource. (8) Reuse binary Avro coders. (9) Add a push-down filter for the incremental source. (10) Add HoodieDeltaStreamer metrics to track total time taken.
This commit is contained in:
committed by
vinoth chandar
parent
c70dbc13e9
commit
3a0044216c
@@ -90,7 +90,7 @@ public class HoodieHiveClient {
|
||||
private Connection connection;
|
||||
private HoodieTimeline activeTimeline;
|
||||
|
||||
HoodieHiveClient(HiveSyncConfig cfg, HiveConf configuration, FileSystem fs) {
|
||||
public HoodieHiveClient(HiveSyncConfig cfg, HiveConf configuration, FileSystem fs) {
|
||||
this.syncConfig = cfg;
|
||||
this.fs = fs;
|
||||
this.metaClient = new HoodieTableMetaClient(fs.getConf(), cfg.basePath, true);
|
||||
@@ -231,7 +231,7 @@ public class HoodieHiveClient {
|
||||
/**
|
||||
* Scan table partitions
|
||||
*/
|
||||
List<Partition> scanTablePartitions() throws TException {
|
||||
public List<Partition> scanTablePartitions() throws TException {
|
||||
return client.listPartitions(syncConfig.databaseName, syncConfig.tableName, (short) -1);
|
||||
}
|
||||
|
||||
@@ -268,7 +268,7 @@ public class HoodieHiveClient {
|
||||
/**
|
||||
* Get the table schema
|
||||
*/
|
||||
Map<String, String> getTableSchema() {
|
||||
public Map<String, String> getTableSchema() {
|
||||
if (!doesTableExist()) {
|
||||
throw new IllegalArgumentException(
|
||||
"Failed to get schema for table " + syncConfig.tableName + " does not exist");
|
||||
@@ -435,7 +435,7 @@ public class HoodieHiveClient {
|
||||
/**
|
||||
* @return true if the configured table exists
|
||||
*/
|
||||
boolean doesTableExist() {
|
||||
public boolean doesTableExist() {
|
||||
try {
|
||||
return client.tableExists(syncConfig.databaseName, syncConfig.tableName);
|
||||
} catch (TException e) {
|
||||
@@ -449,7 +449,7 @@ public class HoodieHiveClient {
|
||||
*
|
||||
* @param s SQL to execute
|
||||
*/
|
||||
void updateHiveSQL(String s) {
|
||||
public void updateHiveSQL(String s) {
|
||||
Statement stmt = null;
|
||||
try {
|
||||
stmt = connection.createStatement();
|
||||
@@ -468,8 +468,10 @@ public class HoodieHiveClient {
|
||||
BasicDataSource ds = new HiveDataSource();
|
||||
ds.setDriverClassName(HiveDriver.class.getCanonicalName());
|
||||
ds.setUrl(getHiveJdbcUrlWithDefaultDBName());
|
||||
ds.setUsername(syncConfig.hiveUser);
|
||||
ds.setPassword(syncConfig.hivePass);
|
||||
if (syncConfig.hiveUser != null) {
|
||||
ds.setUsername(syncConfig.hiveUser);
|
||||
ds.setPassword(syncConfig.hivePass);
|
||||
}
|
||||
LOG.info("Getting Hive Connection from Datasource " + ds);
|
||||
try {
|
||||
this.connection = ds.getConnection();
|
||||
@@ -520,7 +522,7 @@ public class HoodieHiveClient {
|
||||
return fs;
|
||||
}
|
||||
|
||||
Optional<String> getLastCommitTimeSynced() {
|
||||
public Optional<String> getLastCommitTimeSynced() {
|
||||
// Get the last commit time from the TBLproperties
|
||||
try {
|
||||
Table database = client.getTable(syncConfig.databaseName, syncConfig.tableName);
|
||||
@@ -532,7 +534,7 @@ public class HoodieHiveClient {
|
||||
}
|
||||
}
|
||||
|
||||
void close() {
|
||||
public void close() {
|
||||
try {
|
||||
if (connection != null) {
|
||||
connection.close();
|
||||
@@ -548,7 +550,7 @@ public class HoodieHiveClient {
|
||||
@SuppressWarnings("OptionalUsedAsFieldOrParameterType")
|
||||
List<String> getPartitionsWrittenToSince(Optional<String> lastCommitTimeSynced) {
|
||||
if (!lastCommitTimeSynced.isPresent()) {
|
||||
LOG.info("Last commit time synced is not known, listing all partitions");
|
||||
LOG.info("Last commit time synced is not known, listing all partitions in " + syncConfig.basePath + ",FS :" + fs);
|
||||
try {
|
||||
return FSUtils.getAllPartitionPaths(fs, syncConfig.basePath,
|
||||
syncConfig.assumeDatePartitioning);
|
||||
@@ -573,6 +575,10 @@ public class HoodieHiveClient {
|
||||
}
|
||||
}
|
||||
|
||||
List<String> getAllTables(String db) throws Exception {
|
||||
return client.getAllTables(db);
|
||||
}
|
||||
|
||||
void updateLastCommitTimeSynced() {
|
||||
// Set the last commit time from the TBLproperties
|
||||
String lastCommitSynced = activeTimeline.lastInstant().get().getTimestamp();
|
||||
|
||||
@@ -16,8 +16,10 @@
|
||||
|
||||
package com.uber.hoodie.hive;
|
||||
|
||||
import com.google.common.base.Preconditions;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* Partition Key extractor treating each value delimited by slash as separate key.
|
||||
@@ -27,6 +29,14 @@ public class MultiPartKeysValueExtractor implements PartitionValueExtractor {
|
||||
@Override
|
||||
public List<String> extractPartitionValuesInPath(String partitionPath) {
|
||||
String[] splits = partitionPath.split("/");
|
||||
return Arrays.asList(splits);
|
||||
return Arrays.stream(splits).map(s -> {
|
||||
if (s.contains("=")) {
|
||||
String[] moreSplit = s.split("=");
|
||||
Preconditions.checkArgument(moreSplit.length == 2,
|
||||
"Partition Field (" + s + ") not in expected format");
|
||||
return moreSplit[1];
|
||||
}
|
||||
return s;
|
||||
}).collect(Collectors.toList());
|
||||
}
|
||||
}
|
||||
@@ -18,6 +18,7 @@
|
||||
|
||||
package com.uber.hoodie.hive;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
@@ -28,7 +29,7 @@ import java.util.List;
|
||||
* e.g. Hive table partitioned by datestr=yyyy-mm-dd and hdfs path
|
||||
* /app/hoodie/dataset1/YYYY=[yyyy]/MM=[mm]/DD=[dd]
|
||||
*/
|
||||
public interface PartitionValueExtractor {
|
||||
public interface PartitionValueExtractor extends Serializable {
|
||||
|
||||
List<String> extractPartitionValuesInPath(String partitionPath);
|
||||
}
|
||||
|
||||
@@ -33,12 +33,19 @@ import org.joda.time.format.DateTimeFormatter;
|
||||
*/
|
||||
public class SlashEncodedDayPartitionValueExtractor implements PartitionValueExtractor {
|
||||
|
||||
private final DateTimeFormatter dtfOut;
|
||||
private transient DateTimeFormatter dtfOut;
|
||||
|
||||
public SlashEncodedDayPartitionValueExtractor() {
|
||||
this.dtfOut = DateTimeFormat.forPattern("yyyy-MM-dd");
|
||||
}
|
||||
|
||||
private DateTimeFormatter getDtfOut() {
|
||||
if (dtfOut == null) {
|
||||
dtfOut = DateTimeFormat.forPattern("yyyy-MM-dd");
|
||||
}
|
||||
return dtfOut;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<String> extractPartitionValuesInPath(String partitionPath) {
|
||||
// partition path is expected to be in this format yyyy/mm/dd
|
||||
@@ -52,6 +59,6 @@ public class SlashEncodedDayPartitionValueExtractor implements PartitionValueExt
|
||||
int mm = Integer.parseInt(splits[1]);
|
||||
int dd = Integer.parseInt(splits[2]);
|
||||
DateTime dateTime = new DateTime(year, mm, dd, 0, 0);
|
||||
return Lists.newArrayList(dtfOut.print(dateTime));
|
||||
return Lists.newArrayList(getDtfOut().print(dateTime));
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user