
Reworking the deltastreamer tool

- Standardize version of jackson
- DFSPropertiesConfiguration replaces usage of commons PropertiesConfiguration
- Remove dependency on ConstructorUtils
- Throw an error during key generation if the ordering value is not present
- Switch to shade plugin for hoodie-utilities
- Add support for consuming Confluent Avro Kafka serdes
- Add support for the Confluent schema registry
- KafkaSource now handles skew by allocating the source limit round-robin across partitions (see the sketch below)
- Add support for BULK_INSERT operations
- Pass the payload class config properly into HoodieWriteClient
- Fix documentation to reflect the new usage
- Add tests for deltastreamer, sources, and all new util classes
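
The KafkaSource skew handling above can be pictured with a minimal sketch: split a total read budget evenly across partitions, cap each partition at its available lag, and redistribute whatever is left. Every name below is illustrative; this is not the commit's actual KafkaSource code.

import java.util.Arrays;

public class RoundRobinAllocationSketch {

  // Round-robin allocation of a total "source limit" across partitions,
  // capped by each partition's lag (events available to read).
  static long[] allocate(long sourceLimit, long[] partitionLag) {
    long[] alloc = new long[partitionLag.length];
    long remaining = sourceLimit;
    boolean progress = true;
    while (remaining > 0 && progress) {
      progress = false;
      // fair share for this round; at least 1 so the loop always advances
      long perRound = Math.max(1, remaining / partitionLag.length);
      for (int i = 0; i < partitionLag.length && remaining > 0; i++) {
        long take = Math.min(perRound, Math.min(partitionLag[i] - alloc[i], remaining));
        if (take > 0) {
          alloc[i] += take;
          remaining -= take;
          progress = true;
        }
      }
    }
    return alloc;
  }

  public static void main(String[] args) {
    long[] lag = {1000, 10, 10}; // one heavily skewed partition
    System.out.println(Arrays.toString(allocate(120, lag)));
    // prints [100, 10, 10]: the small partitions are drained fully and the
    // leftover budget flows to the hot partition, rather than the hot
    // partition consuming the entire limit up front
  }
}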
Vinoth Chandar
2018-08-04 03:35:30 -07:00
committed by vinoth chandar
parent fb95dbdedb
commit d58ddbd999
49 changed files with 1919 additions and 754 deletions

BaseAvroPayload.java

@@ -18,6 +18,7 @@
 package com.uber.hoodie;
+import com.uber.hoodie.exception.HoodieException;
 import java.io.Serializable;
 import org.apache.avro.generic.GenericRecord;
@@ -44,5 +45,8 @@ public abstract class BaseAvroPayload implements Serializable {
   public BaseAvroPayload(GenericRecord record, Comparable orderingVal) {
     this.record = record;
     this.orderingVal = orderingVal;
+    if (orderingVal == null) {
+      throw new HoodieException("Ordering value is null for record: " + record);
+    }
   }
 }

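A hedged usage sketch of the new fail-fast check, assuming OverwriteWithLatestAvroPayload (imported by the test file at the bottom of this page) extends BaseAvroPayload:

import com.uber.hoodie.OverwriteWithLatestAvroPayload;
import com.uber.hoodie.exception.HoodieException;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;

public class OrderingValCheckSketch {
  public static void main(String[] args) {
    Schema schema = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"r\",\"fields\":[{\"name\":\"ts\",\"type\":\"long\"}]}");
    GenericRecord rec = new GenericData.Record(schema);
    rec.put("ts", 1L);
    try {
      // a null ordering value is now rejected at construction time,
      // instead of surfacing later during merging
      new OverwriteWithLatestAvroPayload(rec, null);
    } catch (HoodieException e) {
      System.out.println("rejected: " + e.getMessage());
    }
  }
}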
DataSourceUtils.java

@@ -21,6 +21,8 @@ package com.uber.hoodie;
 import com.uber.hoodie.common.model.HoodieKey;
 import com.uber.hoodie.common.model.HoodieRecord;
 import com.uber.hoodie.common.model.HoodieRecordPayload;
+import com.uber.hoodie.common.util.ReflectionUtils;
+import com.uber.hoodie.common.util.TypedProperties;
 import com.uber.hoodie.config.HoodieCompactionConfig;
 import com.uber.hoodie.config.HoodieIndexConfig;
 import com.uber.hoodie.config.HoodieWriteConfig;
@@ -31,8 +33,6 @@ import java.io.IOException;
 import java.util.List;
 import java.util.Map;
 import org.apache.avro.generic.GenericRecord;
-import org.apache.commons.configuration.PropertiesConfiguration;
-import org.apache.commons.lang3.reflect.ConstructorUtils;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
@@ -72,10 +72,9 @@ public class DataSourceUtils {
    * Create a key generator class via reflection, passing in any configs needed
    */
   public static KeyGenerator createKeyGenerator(String keyGeneratorClass,
-      PropertiesConfiguration cfg) throws IOException {
+      TypedProperties props) throws IOException {
     try {
-      return (KeyGenerator) ConstructorUtils
-          .invokeConstructor(Class.forName(keyGeneratorClass), (Object) cfg);
+      return (KeyGenerator) ReflectionUtils.loadClass(keyGeneratorClass, props);
     } catch (Throwable e) {
       throw new IOException("Could not load key generator class " + keyGeneratorClass, e);
     }
@@ -87,17 +86,17 @@
   public static HoodieRecordPayload createPayload(String payloadClass, GenericRecord record,
       Comparable orderingVal) throws IOException {
     try {
-      return (HoodieRecordPayload) ConstructorUtils.invokeConstructor(Class.forName(payloadClass),
-          (Object) record, (Object) orderingVal);
+      return (HoodieRecordPayload) ReflectionUtils
+          .loadClass(payloadClass, new Class<?>[]{GenericRecord.class, Comparable.class}, record, orderingVal);
     } catch (Throwable e) {
       throw new IOException("Could not create payload for class: " + payloadClass, e);
     }
   }

-  public static void checkRequiredProperties(PropertiesConfiguration configuration,
+  public static void checkRequiredProperties(TypedProperties props,
       List<String> checkPropNames) {
     checkPropNames.stream().forEach(prop -> {
-      if (!configuration.containsKey(prop)) {
+      if (!props.containsKey(prop)) {
         throw new HoodieNotSupportedException("Required property " + prop + " is missing");
       }
     });

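A short sketch of calling the reworked helpers; the property values are invented, and the option-key accessors are the same ones this commit uses in SimpleKeyGenerator:

import java.io.IOException;
import java.util.Arrays;
import com.uber.hoodie.DataSourceUtils;
import com.uber.hoodie.DataSourceWriteOptions;
import com.uber.hoodie.KeyGenerator;
import com.uber.hoodie.common.util.TypedProperties;

public class DataSourceUtilsSketch {
  public static void main(String[] args) throws IOException {
    TypedProperties props = new TypedProperties();
    props.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "field1");
    props.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "name");

    // throws HoodieNotSupportedException if a listed property is missing
    DataSourceUtils.checkRequiredProperties(props,
        Arrays.asList(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY()));

    // reflection now goes through ReflectionUtils.loadClass(klass, props)
    KeyGenerator keyGen = DataSourceUtils.createKeyGenerator(
        "com.uber.hoodie.SimpleKeyGenerator", props);
  }
}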
KeyGenerator.java

@@ -19,9 +19,9 @@
 package com.uber.hoodie;
 import com.uber.hoodie.common.model.HoodieKey;
+import com.uber.hoodie.common.util.TypedProperties;
 import java.io.Serializable;
 import org.apache.avro.generic.GenericRecord;
-import org.apache.commons.configuration.PropertiesConfiguration;
 /**
  * Abstract class to extend for plugging in extraction of
@@ -30,9 +30,9 @@ import org.apache.commons.configuration.PropertiesConfiguration;
  */
 public abstract class KeyGenerator implements Serializable {

-  protected transient PropertiesConfiguration config;
+  protected transient TypedProperties config;

-  protected KeyGenerator(PropertiesConfiguration config) {
+  protected KeyGenerator(TypedProperties config) {
     this.config = config;
   }

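Since KeyGenerator is the pluggable extension point, a custom implementation against the new TypedProperties constructor would look roughly like this, assuming the abstract method is getKey(GenericRecord) as the test usage below suggests; the class and its config key are invented for illustration:

import com.uber.hoodie.KeyGenerator;
import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.util.TypedProperties;
import org.apache.avro.generic.GenericRecord;

// Invented example, not part of this commit: upper-cases one top-level
// field for the record key and writes everything to a single partition.
public class UpperCaseKeyGenerator extends KeyGenerator {

  private final String keyField;

  public UpperCaseKeyGenerator(TypedProperties props) {
    super(props);
    // "example.key.field" is a made-up config name
    this.keyField = props.getString("example.key.field");
  }

  @Override
  public HoodieKey getKey(GenericRecord record) {
    String recordKey = String.valueOf(record.get(keyField)).toUpperCase();
    return new HoodieKey(recordKey, "default");
  }
}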
SimpleKeyGenerator.java

@@ -19,9 +19,9 @@
 package com.uber.hoodie;
 import com.uber.hoodie.common.model.HoodieKey;
+import com.uber.hoodie.common.util.TypedProperties;
 import com.uber.hoodie.exception.HoodieException;
 import org.apache.avro.generic.GenericRecord;
-import org.apache.commons.configuration.PropertiesConfiguration;
 /**
  * Simple key generator, which takes names of fields to be used for recordKey and partitionPath as
@@ -29,14 +29,16 @@ import org.apache.commons.configuration.PropertiesConfiguration;
  */
 public class SimpleKeyGenerator extends KeyGenerator {
+  private static final String DEFAULT_PARTITION_PATH = "default";
   protected final String recordKeyField;
   protected final String partitionPathField;

-  public SimpleKeyGenerator(PropertiesConfiguration config) {
-    super(config);
-    this.recordKeyField = config.getString(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY());
-    this.partitionPathField = config
+  public SimpleKeyGenerator(TypedProperties props) {
+    super(props);
+    this.recordKeyField = props.getString(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY());
+    this.partitionPathField = props
         .getString(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY());
   }
@@ -46,7 +48,16 @@ public class SimpleKeyGenerator extends KeyGenerator {
       throw new HoodieException(
           "Unable to find field names for record key or partition path in cfg");
     }
-    return new HoodieKey(DataSourceUtils.getNestedFieldValAsString(record, recordKeyField),
-        DataSourceUtils.getNestedFieldValAsString(record, partitionPathField));
+    String recordKey = DataSourceUtils.getNestedFieldValAsString(record, recordKeyField);
+    String partitionPath;
+    try {
+      partitionPath = DataSourceUtils.getNestedFieldValAsString(record, partitionPathField);
+    } catch (HoodieException e) {
+      // if field is not found, lump it into default partition
+      partitionPath = DEFAULT_PARTITION_PATH;
+    }
+    return new HoodieKey(recordKey, partitionPath);
   }
 }

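The effect of the new fallback, sketched with the field names the test below uses:

import com.uber.hoodie.DataSourceWriteOptions;
import com.uber.hoodie.SimpleKeyGenerator;
import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.util.TypedProperties;
import org.apache.avro.generic.GenericRecord;

public class DefaultPartitionSketch {
  // for any record lacking "testNestedRecord.notThere", the partition
  // path falls back to "default" instead of failing the write
  static HoodieKey keyFor(GenericRecord record) {
    TypedProperties props = new TypedProperties();
    props.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "testNestedRecord.userId");
    props.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "testNestedRecord.notThere");
    return new SimpleKeyGenerator(props).getKey(record);
  }
}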
DefaultSource.scala

@@ -25,10 +25,10 @@ import java.util.{Optional, Properties}
 import com.uber.hoodie.DataSourceReadOptions._
 import com.uber.hoodie.DataSourceWriteOptions._
 import com.uber.hoodie.common.table.{HoodieTableConfig, HoodieTableMetaClient}
+import com.uber.hoodie.common.util.TypedProperties
 import com.uber.hoodie.config.HoodieWriteConfig
 import com.uber.hoodie.exception.HoodieException
 import org.apache.avro.generic.GenericRecord
-import org.apache.commons.configuration.PropertiesConfiguration
 import org.apache.hadoop.fs.Path
 import org.apache.log4j.LogManager
 import org.apache.spark.api.java.JavaSparkContext
@@ -121,10 +121,10 @@ class DefaultSource extends RelationProvider
     mapAsScalaMap(defaultsMap)
   }

-  def toPropertiesConfiguration(params: Map[String, String]): PropertiesConfiguration = {
-    val propCfg = new PropertiesConfiguration()
-    params.foreach(kv => propCfg.addProperty(kv._1, kv._2))
-    propCfg
+  def toProperties(params: Map[String, String]): TypedProperties = {
+    val props = new TypedProperties()
+    params.foreach(kv => props.setProperty(kv._1, kv._2))
+    props
   }
@@ -161,7 +161,7 @@ class DefaultSource extends RelationProvider
     // Convert to RDD[HoodieRecord]
     val keyGenerator = DataSourceUtils.createKeyGenerator(
       parameters(KEYGENERATOR_CLASS_OPT_KEY),
-      toPropertiesConfiguration(parameters)
+      toProperties(parameters)
     )
     val genericRecords: RDD[GenericRecord] = AvroConversionUtils.createRdd(df, structName, nameSpace)
     val hoodieRecords = genericRecords.map(gr => {

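For readers following along in Java, the Scala helper above amounts to this conversion (a sketch, not the commit's code):

import java.util.Map;
import com.uber.hoodie.common.util.TypedProperties;

public class ToPropertiesSketch {
  // copy the Spark datasource option map into a TypedProperties instance,
  // mirroring DefaultSource.toProperties
  static TypedProperties toProperties(Map<String, String> params) {
    TypedProperties props = new TypedProperties();
    params.forEach(props::setProperty);
    return props;
  }
}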
DataSourceDefaultsTest.scala

@@ -16,11 +16,10 @@
  *
  */
-import com.uber.hoodie.common.util.SchemaTestUtil
+import com.uber.hoodie.common.util.{SchemaTestUtil, TypedProperties}
 import com.uber.hoodie.exception.HoodieException
 import com.uber.hoodie.{DataSourceWriteOptions, OverwriteWithLatestAvroPayload, SimpleKeyGenerator}
 import org.apache.avro.generic.GenericRecord
-import org.apache.commons.configuration.PropertiesConfiguration
 import org.junit.Assert._
 import org.junit.{Before, Test}
 import org.scalatest.junit.AssertionsForJUnit
@@ -39,10 +38,10 @@ class DataSourceDefaultsTest extends AssertionsForJUnit {
   }

-  private def getKeyConfig(recordKeyFieldName: String, paritionPathField: String): PropertiesConfiguration = {
-    val props = new PropertiesConfiguration()
-    props.addProperty(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, recordKeyFieldName)
-    props.addProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY, paritionPathField)
+  private def getKeyConfig(recordKeyFieldName: String, paritionPathField: String): TypedProperties = {
+    val props = new TypedProperties()
+    props.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, recordKeyFieldName)
+    props.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY, paritionPathField)
     props
   }
@@ -52,24 +51,26 @@ class DataSourceDefaultsTest extends AssertionsForJUnit {
     assertEquals("field1", hk1.getRecordKey)
     assertEquals("name1", hk1.getPartitionPath)

-    // recordKey field not specified
+    // partition path field not specified
     try {
-      val props = new PropertiesConfiguration()
-      props.addProperty(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, "field1")
+      val props = new TypedProperties()
+      props.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, "field1")
       new SimpleKeyGenerator(props).getKey(baseRecord)
       fail("Should have errored out")
     } catch {
-      case e: HoodieException => {
+      case e: IllegalArgumentException => {
         // do nothing
       }
     };

-    // partitionPath field is null
+    // recordkey field not specified
     try {
-      new SimpleKeyGenerator(getKeyConfig("field1", null)).getKey(baseRecord)
+      val props = new TypedProperties()
+      props.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY, "partitionField")
+      new SimpleKeyGenerator(props).getKey(baseRecord)
       fail("Should have errored out")
     } catch {
-      case e: HoodieException => {
+      case e: IllegalArgumentException => {
         // do nothing
       }
     };
@@ -90,6 +91,11 @@ class DataSourceDefaultsTest extends AssertionsForJUnit {
       // do nothing
     }
   };
+
+    // if partition path can't be found, return default partition path
+    val hk3 = new SimpleKeyGenerator(getKeyConfig("testNestedRecord.userId", "testNestedRecord.notThere"))
+      .getKey(baseRecord);
+    assertEquals("default", hk3.getPartitionPath)
   }

   @Test def testOverwriteWithLatestAvroPayload() = {