1
0

Add --filter-dupes to DeltaStreamer

- Optionally filter out duplicates before inserting data
 - Unit tests
This commit is contained in:
Vinoth Chandar
2018-10-03 18:02:09 +01:00
committed by vinoth chandar
parent 0a200c32e5
commit 1fca9b21cc
3 changed files with 65 additions and 1 deletions

View File

@@ -26,6 +26,7 @@ import com.uber.hoodie.common.util.TypedProperties;
import com.uber.hoodie.config.HoodieCompactionConfig;
import com.uber.hoodie.config.HoodieIndexConfig;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.DatasetNotFoundException;
import com.uber.hoodie.exception.HoodieException;
import com.uber.hoodie.exception.HoodieNotSupportedException;
import com.uber.hoodie.index.HoodieIndex;
@@ -142,4 +143,19 @@ public class DataSourceUtils {
HoodieRecordPayload payload = DataSourceUtils.createPayload(payloadClass, gr, orderingVal);
return new HoodieRecord<>(hKey, payload);
}
@SuppressWarnings("unchecked")
public static JavaRDD<HoodieRecord> dropDuplicates(JavaSparkContext jssc,
JavaRDD<HoodieRecord> incomingHoodieRecords,
HoodieWriteConfig writeConfig) throws Exception {
try {
HoodieReadClient client = new HoodieReadClient<>(jssc, writeConfig);
return client.tagLocation(incomingHoodieRecords)
.filter(r -> !((HoodieRecord<HoodieRecordPayload>) r).isCurrentLocationKnown());
} catch (DatasetNotFoundException e) {
// this will be executed when there is no hoodie dataset yet
// so no dups to drop
return incomingHoodieRecords;
}
}
}