
Revamped Deltastreamer (#93)

* Add analytics to site

* Fix ugly favicon

* New & Improved HoodieDeltaStreamer

 - Can incrementally consume from HDFS or Kafka, with exactly-once semantics!
 - Supports JSON/Avro data; a Source can also perform custom processing
 - Source is fully pluggable, via reflection
 - Key generation is pluggable; a SimpleKeyGenerator is included
 - Schema provider is pluggable; currently file-based schemas are supported
 - Configurable field to break ties during preCombine
 - Finally, the HoodieRecordPayload can also be plugged in, to get merge types other than overwriting (see the sketch below this list)
 - Handles efficient Avro serialization in Spark
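
 A minimal sketch of the custom-merge hook above, assuming the HoodieRecordPayload contract visible in the diffs further down; the preCombine/combineAndGetUpdateValue/getInsertValue signatures and the "version" ordering field are assumptions for illustration, not part of this commit:

    import java.io.IOException;
    import java.util.Optional;

    import org.apache.avro.Schema;
    import org.apache.avro.generic.GenericRecord;
    import org.apache.avro.generic.IndexedRecord;

    import com.uber.hoodie.common.model.HoodieRecordPayload;

    // Hypothetical payload: keep the record with the larger "version" field,
    // instead of blindly overwriting the stored value.
    public class LatestVersionPayload implements HoodieRecordPayload<LatestVersionPayload> {

      // Production payloads would store serialized Avro bytes; a direct
      // reference keeps this sketch short.
      private final GenericRecord record;

      public LatestVersionPayload(GenericRecord record) {
        this.record = record;
      }

      @Override
      public LatestVersionPayload preCombine(LatestVersionPayload another) {
        // Break ties between records with the same key within one batch.
        long mine = (Long) record.get("version");
        long theirs = (Long) another.record.get("version");
        return mine >= theirs ? this : another;
      }

      @Override
      public Optional<IndexedRecord> combineAndGetUpdateValue(IndexedRecord currentValue,
          Schema schema) throws IOException {
        // Keep whichever of the stored and incoming records is newer.
        long stored = (Long) ((GenericRecord) currentValue).get("version");
        long incoming = (Long) record.get("version");
        IndexedRecord merged = incoming >= stored ? record : currentValue;
        return Optional.of(merged);
      }

      @Override
      public Optional<IndexedRecord> getInsertValue(Schema schema) {
        return Optional.of((IndexedRecord) record);
      }
    }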

 Pending:
 - Rewriting of HiveIncrPullSource
 - Hive sync via hoodie-hive
 - Cleanup & tests

* Minor fixes from master rebase

* Implementation of HiveIncrPullSource
 - Copies commit by commit from source to target

* Adding TimestampBasedKeyGenerator
 - Supports unix time & date strings (a sketch follows below)
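
 A minimal sketch of the unix-time case, shown standalone because the KeyGenerator base class is not visible in this excerpt; the field names and the daily partition format are assumptions:

    import java.time.Instant;
    import java.time.ZoneOffset;
    import java.time.format.DateTimeFormatter;

    import org.apache.avro.generic.GenericRecord;

    import com.uber.hoodie.common.model.HoodieKey;

    // Hypothetical sketch: derive the partition path from a unix-time field.
    public class UnixTimeKeyGeneratorSketch {

      private static final DateTimeFormatter PARTITION_FORMAT =
          DateTimeFormatter.ofPattern("yyyy/MM/dd").withZone(ZoneOffset.UTC);

      private final String recordKeyField;  // e.g. "id"
      private final String timestampField;  // e.g. "created_at", in unix seconds

      public UnixTimeKeyGeneratorSketch(String recordKeyField, String timestampField) {
        this.recordKeyField = recordKeyField;
        this.timestampField = timestampField;
      }

      public HoodieKey getKey(GenericRecord record) {
        String recordKey = record.get(recordKeyField).toString();
        long unixSeconds = Long.parseLong(record.get(timestampField).toString());
        // Bucket records into daily partitions, e.g. "2017/03/13".
        String partitionPath = PARTITION_FORMAT.format(Instant.ofEpochSecond(unixSeconds));
        return new HoodieKey(recordKey, partitionPath);
      }
    }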
Author:       vinoth chandar
Date:         2017-03-13 12:41:29 -07:00
Committed by: prazanna
Parent:       c3257b9680
Commit:       69d3950a32
33 changed files with 1925 additions and 263 deletions


@@ -17,6 +17,7 @@
package com.uber.hoodie.avro;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;


@@ -29,6 +29,7 @@ import java.io.IOException;
/**
* This is a payload to wrap an existing Hoodie Avro record.
* Useful to create a HoodieRecord over existing GenericRecords in a hoodie dataset (useful in compactions)
*
*/
public class HoodieAvroPayload implements HoodieRecordPayload<HoodieAvroPayload> {
private final Optional<GenericRecord> record;
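
A hedged usage sketch of the payload described above: wrapping an existing GenericRecord, e.g. one read back during a compaction. The Optional<GenericRecord> constructor is inferred from the field shown; the package paths, schema, and key values are assumptions:

    import java.util.Optional;

    import org.apache.avro.Schema;
    import org.apache.avro.generic.GenericData;
    import org.apache.avro.generic.GenericRecord;

    import com.uber.hoodie.common.model.HoodieAvroPayload;
    import com.uber.hoodie.common.model.HoodieKey;
    import com.uber.hoodie.common.model.HoodieRecord;

    public class HoodieAvroPayloadSketch {
      public static void main(String[] args) {
        // Assumed schema, for illustration only.
        Schema schema = new Schema.Parser().parse(
            "{\"type\":\"record\",\"name\":\"Trip\",\"fields\":"
                + "[{\"name\":\"id\",\"type\":\"string\"}]}");
        GenericRecord avroRecord = new GenericData.Record(schema);
        avroRecord.put("id", "trip-001");

        // Wrap the existing record in a payload, then build a HoodieRecord
        // over it, as the javadoc describes.
        HoodieAvroPayload payload = new HoodieAvroPayload(Optional.of(avroRecord));
        HoodieRecord<HoodieAvroPayload> record =
            new HoodieRecord<>(new HoodieKey("trip-001", "2017/03/13"), payload);
        System.out.println(record);
      }
    }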


@@ -16,6 +16,8 @@
package com.uber.hoodie.common.model;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.codehaus.jackson.annotate.JsonAutoDetect;
@@ -33,21 +35,29 @@ import java.util.Map;
/**
* All the metadata that gets stored along with a commit.
*/
@JsonIgnoreProperties(ignoreUnknown = true)
public class HoodieCommitMetadata implements Serializable {
private static volatile Logger log = LogManager.getLogger(HoodieCommitMetadata.class);
private HashMap<String, List<HoodieWriteStat>> partitionToWriteStats;
private HashMap<String, String> extraMetadataMap;
public HoodieCommitMetadata() {
extraMetadataMap = new HashMap<>();
partitionToWriteStats = new HashMap<>();
}
public void addWriteStat(String partitionPath, HoodieWriteStat stat) {
if (!partitionToWriteStats.containsKey(partitionPath)) {
-            partitionToWriteStats.put(partitionPath, new ArrayList<HoodieWriteStat>());
+            partitionToWriteStats.put(partitionPath, new ArrayList<>());
}
partitionToWriteStats.get(partitionPath).add(stat);
}
public void addMetadata(String metaKey, String value) {
extraMetadataMap.put(metaKey, value);
}
public List<HoodieWriteStat> getWriteStats(String partitionPath) {
return partitionToWriteStats.get(partitionPath);
}
@@ -56,6 +66,10 @@ public class HoodieCommitMetadata implements Serializable {
return partitionToWriteStats;
}
public String getMetadata(String metaKey) {
return extraMetadataMap.get(metaKey);
}
public HashMap<String, String> getFileIdAndFullPaths() {
HashMap<String, String> filePaths = new HashMap<>();
// list all partitions paths
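
A short sketch of the extra-metadata round trip added above; the checkpoint key and value are illustrative of how a DeltaStreamer could record where to resume, and HoodieWriteStat is assumed to have a default constructor:

    import com.uber.hoodie.common.model.HoodieCommitMetadata;
    import com.uber.hoodie.common.model.HoodieWriteStat;

    public class ExtraMetadataSketch {
      public static void main(String[] args) {
        HoodieCommitMetadata metadata = new HoodieCommitMetadata();

        // Per-partition write statistics, as before this change.
        metadata.addWriteStat("2017/03/13", new HoodieWriteStat());

        // New in this change: arbitrary key/value metadata on the commit.
        metadata.addMetadata("deltastreamer.checkpoint.key", "kafka-topic,0:1234");

        // Read it back later, e.g. when deciding where to resume consuming.
        String checkpoint = metadata.getMetadata("deltastreamer.checkpoint.key");
        System.out.println("Resume from: " + checkpoint);
      }
    }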