1
0

New Features in DeltaStreamer:

(1) Apply transformation when using delta-streamer to ingest data.
 (2) Add Hudi Incremental Source for Delta Streamer
 (3) Allow delta-streamer config-property to be passed as command-line
 (4) Add Hive Integration to Delta-Streamer and address Review comments
 (5) Ensure MultiPartKeysValueExtractor handles Hive-style partition descriptions
 (6) Reuse same spark session on both source and transformer
 (7) Support extracting partition fields from _hoodie_partition_path for HoodieIncrSource
 (8) Reuse Binary Avro coders
 (9) Add push down filter for Incremental source
 (10) Add Hoodie DeltaStreamer metrics to track total time taken
This commit is contained in:
Balaji Varadarajan
2018-10-10 10:31:34 -07:00
committed by vinoth chandar
parent c70dbc13e9
commit 3a0044216c
65 changed files with 2752 additions and 911 deletions

View File

@@ -15,47 +15,58 @@
"fields":[
{
"name":"fileId",
"type":["null","string"]
"type":["null","string"],
"default" : null
},
{
"name":"path",
"type":["null","string"]
"type":["null","string"],
"default" : null
},
{
"name":"prevCommit",
"type":["null","string"]
"type":["null","string"],
"default" : null
},
{
"name":"numWrites",
"type":["null","long"]
"type":["null","long"],
"default" : null
},
{
"name":"numDeletes",
"type":["null","long"]
"type":["null","long"],
"default" : null
},
{
"name":"numUpdateWrites",
"type":["null","long"]
"type":["null","long"],
"default" : null
},
{
"name":"totalWriteBytes",
"type":["null","long"]
"type":["null","long"],
"default" : null
},
{
"name":"totalWriteErrors",
"type":["null","long"]
"type":["null","long"],
"default" : null
},
{
"name":"partitionPath",
"type":["null","string"]
"type":["null","string"],
"default" : null
},
{
"name":"totalLogRecords",
"type":["null","long"]
"type":["null","long"],
"default" : null
},
{
"name":"totalLogFiles",
"type":["null","long"]
"type":["null","long"],
"default" : null
},
{
"name":"totalUpdatedRecordsCompacted",
@@ -69,15 +80,18 @@
},
{
"name":"totalLogBlocks",
"type":["null","long"]
"type":["null","long"],
"default" : null
},
{
"name":"totalCorruptLogBlock",
"type":["null","long"]
"type":["null","long"],
"default" : null
},
{
"name":"totalRollbackBlocks",
"type":["null","long"]
"type":["null","long"],
"default" : null
}
]
}

View File

@@ -17,7 +17,9 @@
package com.uber.hoodie.common.model;
import com.google.common.base.Objects;
import com.google.common.collect.ImmutableList;
import java.io.Serializable;
import java.util.List;
import java.util.Optional;
/**
@@ -31,6 +33,14 @@ public class HoodieRecord<T extends HoodieRecordPayload> implements Serializable
public static String PARTITION_PATH_METADATA_FIELD = "_hoodie_partition_path";
public static String FILENAME_METADATA_FIELD = "_hoodie_file_name";
public static final List<String> HOODIE_META_COLUMNS =
new ImmutableList.Builder<String>().add(COMMIT_TIME_METADATA_FIELD)
.add(COMMIT_SEQNO_METADATA_FIELD)
.add(RECORD_KEY_METADATA_FIELD)
.add(PARTITION_PATH_METADATA_FIELD)
.add(FILENAME_METADATA_FIELD)
.build();
/**
* Identifies the record across the table
*/

View File

@@ -73,6 +73,20 @@ public class DFSPropertiesConfiguration {
}
visitedFiles.add(file.getName());
BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(file)));
addProperties(reader);
} catch (IOException ioe) {
log.error("Error reading in properies from dfs", ioe);
throw new IllegalArgumentException("Cannot read properties from dfs", ioe);
}
}
/**
* Add properties from input stream
* @param reader Buffered Reader
* @throws IOException
*/
public void addProperties(BufferedReader reader) throws IOException {
try {
String line;
while ((line = reader.readLine()) != null) {
if (line.startsWith("#") || line.equals("") || !line.contains("=")) {
@@ -85,10 +99,8 @@ public class DFSPropertiesConfiguration {
props.setProperty(split[0], split[1]);
}
}
} finally {
reader.close();
} catch (IOException ioe) {
log.error("Error reading in properies from dfs", ioe);
throw new IllegalArgumentException("Cannot read properties from dfs", ioe);
}
}

View File

@@ -37,8 +37,8 @@ import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.BinaryDecoder;
import org.apache.avro.io.BinaryEncoder;
import org.apache.avro.io.Decoder;
import org.apache.avro.io.DecoderFactory;
import org.apache.avro.io.EncoderFactory;
import org.codehaus.jackson.JsonNode;
@@ -48,6 +48,10 @@ import org.codehaus.jackson.JsonNode;
*/
public class HoodieAvroUtils {
private static ThreadLocal<BinaryEncoder> reuseEncoder = ThreadLocal.withInitial(() -> null);
private static ThreadLocal<BinaryDecoder> reuseDecoder = ThreadLocal.withInitial(() -> null);
// All metadata fields are optional strings.
private static final Schema METADATA_FIELD_SCHEMA = Schema.createUnion(Arrays.asList(
Schema.create(Schema.Type.NULL),
@@ -62,7 +66,8 @@ public class HoodieAvroUtils {
GenericDatumWriter<GenericRecord> writer =
new GenericDatumWriter<>(record.getSchema());
ByteArrayOutputStream out = new ByteArrayOutputStream();
BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(out, null);
BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(out, reuseEncoder.get());
reuseEncoder.set(encoder);
writer.write(record, encoder);
encoder.flush();
out.close();
@@ -73,7 +78,8 @@ public class HoodieAvroUtils {
* Convert serialized bytes back into avro record
*/
public static GenericRecord bytesToAvro(byte[] bytes, Schema schema) throws IOException {
Decoder decoder = DecoderFactory.get().binaryDecoder(bytes, null);
BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(bytes, reuseDecoder.get());
reuseDecoder.set(decoder);
GenericDatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>(schema);
return reader.read(null, decoder);
}

View File

@@ -19,7 +19,10 @@
package com.uber.hoodie.common.util;
import java.io.Serializable;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
import java.util.stream.Collectors;
/**
* Type-aware extension of {@link java.util.Properties}
@@ -49,6 +52,13 @@ public class TypedProperties extends Properties implements Serializable {
return containsKey(property) ? getProperty(property) : defaultValue;
}
public List<String> getStringList(String property, String delimiter, List<String> defaultVal) {
if (!containsKey(property)) {
return defaultVal;
}
return Arrays.stream(getProperty(property).split(delimiter)).map(String::trim).collect(Collectors.toList());
}
public int getInteger(String property) {
checkKey(property);
return Integer.valueOf(getProperty(property));