1
0

Add --filter-dupes to DeltaStreamer

- Optionally filter out duplicates before inserting data
- Unit tests
This commit is contained in:
Vinoth Chandar
2018-10-03 18:02:09 +01:00
committed by vinoth chandar
parent 0a200c32e5
commit 1fca9b21cc
3 changed files with 65 additions and 1 deletion

View File

@@ -179,8 +179,20 @@ public class HoodieDeltaStreamer implements Serializable {
return new HoodieRecord<>(keyGenerator.getKey(gr), payload);
});
// Perform the write
// filter dupes if needed
HoodieWriteConfig hoodieCfg = getHoodieClientConfig();
if (cfg.filterDupes) {
// turn upserts to insert
cfg.operation = cfg.operation == Operation.UPSERT ? Operation.INSERT : cfg.operation;
records = DataSourceUtils.dropDuplicates(jssc, records, hoodieCfg);
}
if (records.isEmpty()) {
log.info("No new data, nothing to commit.. ");
return;
}
// Perform the write
HoodieWriteClient client = new HoodieWriteClient<>(jssc, hoodieCfg);
String commitTime = client.startCommit();
log.info("Starting commit : " + commitTime);
@@ -285,6 +297,10 @@ public class HoodieDeltaStreamer implements Serializable {
converter = OperationConvertor.class)
public Operation operation = Operation.UPSERT;
@Parameter(names = {"--filter-dupes"}, description = "Should duplicate records from source be dropped/filtered out"
+ "before insert/bulk-insert")
public Boolean filterDupes = false;
@Parameter(names = {"--spark-master"}, description = "spark master to use.")
public String sparkMaster = "local[2]";