1
0

Improving out of box experience for data source

- Fixes #246
 - Bump up default parallelism to 1500, to handle large upserts
 - Add docs on s3 configuration & tuning tips with tested spark knobs
 - Fix bug to not duplicate hoodie metadata fields when input dataframe is another hoodie dataset
 - Improve speed of ROTablePathFilter by removing directory check
 - Move to spark-avro 4.0 to handle issue with nested fields with same name
 - Keep AvroConversionUtils in sync with spark-avro 4.0
This commit is contained in:
Vinoth Chandar
2018-01-05 14:06:18 -08:00
committed by vinoth chandar
parent a97814462d
commit 85dd265b7b
8 changed files with 112 additions and 19 deletions

View File

@@ -43,6 +43,16 @@ object AvroConversionUtils {
}
}
/**
  * Derives the Avro record namespace to use for a nested element.
  *
  * Struct-typed elements get their own nested namespace (the element name is
  * appended to the current namespace, dot-separated) so that sibling nested
  * records with identical field names do not collide; every other data type
  * simply inherits the current namespace unchanged.
  *
  * @param elementDataType        Spark SQL type of the element being converted
  * @param currentRecordNamespace namespace of the enclosing record
  * @param elementName            field/element name within the enclosing record
  * @return the namespace to use when converting this element to Avro
  */
def getNewRecordNamespace(elementDataType: DataType,
currentRecordNamespace: String,
elementName: String): String = {
  elementDataType match {
    case _: StructType => s"$currentRecordNamespace.$elementName"
    case _             => currentRecordNamespace
  }
}
def createConverterToAvro(dataType: DataType,
structName: String,
recordNamespace: String): (Any) => Any = {
@@ -60,7 +70,10 @@ object AvroConversionUtils {
case DateType => (item: Any) =>
if (item == null) null else item.asInstanceOf[Date].getTime
case ArrayType(elementType, _) =>
val elementConverter = createConverterToAvro(elementType, structName, recordNamespace)
val elementConverter = createConverterToAvro(
elementType,
structName,
getNewRecordNamespace(elementType, recordNamespace, structName))
(item: Any) => {
if (item == null) {
null
@@ -77,7 +90,10 @@ object AvroConversionUtils {
}
}
case MapType(StringType, valueType, _) =>
val valueConverter = createConverterToAvro(valueType, structName, recordNamespace)
val valueConverter = createConverterToAvro(
valueType,
structName,
getNewRecordNamespace(valueType, recordNamespace, structName))
(item: Any) => {
if (item == null) {
null
@@ -94,7 +110,10 @@ object AvroConversionUtils {
val schema: Schema = SchemaConverters.convertStructToAvro(
structType, builder, recordNamespace)
val fieldConverters = structType.fields.map(field =>
createConverterToAvro(field.dataType, field.name, recordNamespace))
createConverterToAvro(
field.dataType,
field.name,
getNewRecordNamespace(field.dataType, recordNamespace, field.name)))
(item: Any) => {
if (item == null) {
null

View File

@@ -134,11 +134,16 @@ class DefaultSource extends RelationProvider
df: DataFrame): BaseRelation = {
val parameters = parametersWithWriteDefaults(optParams).toMap
val sparkContext = sqlContext.sparkContext
val path = parameters.get("path")
val tblName = parameters.get(HoodieWriteConfig.TABLE_NAME)
if (path.isEmpty || tblName.isEmpty) {
throw new HoodieException(s"'${HoodieWriteConfig.TABLE_NAME}', 'path' must be set.")
}
val serializer = sparkContext.getConf.get("spark.serializer")
if (!serializer.equals("org.apache.spark.serializer.KryoSerializer")) {
throw new HoodieException(s"${serializer} serialization is not supported by hoodie. Please use kryo.")
}
val storageType = parameters(STORAGE_TYPE_OPT_KEY)
val operation = parameters(OPERATION_OPT_KEY)
@@ -146,11 +151,12 @@ class DefaultSource extends RelationProvider
// register classes & schemas
val structName = s"${tblName.get}_record"
val nameSpace = s"hoodie.${tblName.get}"
sqlContext.sparkContext.getConf.registerKryoClasses(
sparkContext.getConf.registerKryoClasses(
Array(classOf[org.apache.avro.generic.GenericData],
classOf[org.apache.avro.Schema]))
val schema = AvroConversionUtils.convertStructTypeToAvroSchema(df.schema, structName, nameSpace)
sqlContext.sparkContext.getConf.registerAvroSchemas(schema)
sparkContext.getConf.registerAvroSchemas(schema)
log.info(s"Registered avro schema : ${schema.toString(true)}");
// Convert to RDD[HoodieRecord]
val keyGenerator = DataSourceUtils.createKeyGenerator(
@@ -167,7 +173,7 @@ class DefaultSource extends RelationProvider
val basePath = new Path(parameters.get("path").get)
val fs = basePath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration)
val fs = basePath.getFileSystem(sparkContext.hadoopConfiguration)
var exists = fs.exists(basePath)
// Handle various save modes
@@ -190,12 +196,11 @@ class DefaultSource extends RelationProvider
properties.put(HoodieTableConfig.HOODIE_TABLE_NAME_PROP_NAME, tblName.get);
properties.put(HoodieTableConfig.HOODIE_TABLE_TYPE_PROP_NAME, storageType);
properties.put(HoodieTableConfig.HOODIE_ARCHIVELOG_FOLDER_PROP_NAME, "archived");
HoodieTableMetaClient.initializePathAsHoodieDataset(
sqlContext.sparkContext.hadoopConfiguration, path.get, properties);
HoodieTableMetaClient.initializePathAsHoodieDataset(sparkContext.hadoopConfiguration, path.get, properties);
}
// Create a HoodieWriteClient & issue the write.
val client = DataSourceUtils.createHoodieClient(new JavaSparkContext(sqlContext.sparkContext),
val client = DataSourceUtils.createHoodieClient(new JavaSparkContext(sparkContext),
schema.toString,
path.get,
tblName.get,