Improving out of box experience for data source
- Fixes #246 - Bump up default parallelism to 1500, to handle large upserts - Add docs on S3 configuration & tuning tips with tested spark knobs - Fix bug to not duplicate hoodie metadata fields when input dataframe is another hoodie dataset - Improve speed of ROTablePathFilter by removing directory check - Move to spark-avro 4.0 to handle issue with nested fields with same name - Keep AvroConversionUtils in sync with spark-avro 4.0
This commit is contained in:
committed by
vinoth chandar
parent
a97814462d
commit
85dd265b7b
@@ -43,6 +43,16 @@ object AvroConversionUtils {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Derives the Avro record namespace to use for a nested element.
 *
 * Avro requires distinct namespaces to disambiguate nested records that share
 * a field name, so struct elements extend the namespace with their own name;
 * every other data type keeps the current namespace unchanged.
 *
 * @param elementDataType        Spark SQL type of the element being converted
 * @param currentRecordNamespace namespace of the enclosing record
 * @param elementName            name of the element within its parent
 * @return the namespace to use for `elementDataType`
 */
def getNewRecordNamespace(elementDataType: DataType,
                          currentRecordNamespace: String,
                          elementName: String): String =
  elementDataType match {
    // Nested structs get their own sub-namespace: "<parent>.<element>".
    case _: StructType => s"$currentRecordNamespace.$elementName"
    // Primitives, arrays, maps, etc. stay in the parent's namespace.
    case _ => currentRecordNamespace
  }
|
||||
|
||||
def createConverterToAvro(dataType: DataType,
|
||||
structName: String,
|
||||
recordNamespace: String): (Any) => Any = {
|
||||
@@ -60,7 +70,10 @@ object AvroConversionUtils {
|
||||
case DateType => (item: Any) =>
|
||||
if (item == null) null else item.asInstanceOf[Date].getTime
|
||||
case ArrayType(elementType, _) =>
|
||||
val elementConverter = createConverterToAvro(elementType, structName, recordNamespace)
|
||||
val elementConverter = createConverterToAvro(
|
||||
elementType,
|
||||
structName,
|
||||
getNewRecordNamespace(elementType, recordNamespace, structName))
|
||||
(item: Any) => {
|
||||
if (item == null) {
|
||||
null
|
||||
@@ -77,7 +90,10 @@ object AvroConversionUtils {
|
||||
}
|
||||
}
|
||||
case MapType(StringType, valueType, _) =>
|
||||
val valueConverter = createConverterToAvro(valueType, structName, recordNamespace)
|
||||
val valueConverter = createConverterToAvro(
|
||||
valueType,
|
||||
structName,
|
||||
getNewRecordNamespace(valueType, recordNamespace, structName))
|
||||
(item: Any) => {
|
||||
if (item == null) {
|
||||
null
|
||||
@@ -94,7 +110,10 @@ object AvroConversionUtils {
|
||||
val schema: Schema = SchemaConverters.convertStructToAvro(
|
||||
structType, builder, recordNamespace)
|
||||
val fieldConverters = structType.fields.map(field =>
|
||||
createConverterToAvro(field.dataType, field.name, recordNamespace))
|
||||
createConverterToAvro(
|
||||
field.dataType,
|
||||
field.name,
|
||||
getNewRecordNamespace(field.dataType, recordNamespace, field.name)))
|
||||
(item: Any) => {
|
||||
if (item == null) {
|
||||
null
|
||||
|
||||
@@ -134,11 +134,16 @@ class DefaultSource extends RelationProvider
|
||||
df: DataFrame): BaseRelation = {
|
||||
|
||||
val parameters = parametersWithWriteDefaults(optParams).toMap
|
||||
val sparkContext = sqlContext.sparkContext
|
||||
val path = parameters.get("path")
|
||||
val tblName = parameters.get(HoodieWriteConfig.TABLE_NAME)
|
||||
if (path.isEmpty || tblName.isEmpty) {
|
||||
throw new HoodieException(s"'${HoodieWriteConfig.TABLE_NAME}', 'path' must be set.")
|
||||
}
|
||||
val serializer = sparkContext.getConf.get("spark.serializer")
|
||||
if (!serializer.equals("org.apache.spark.serializer.KryoSerializer")) {
|
||||
throw new HoodieException(s"${serializer} serialization is not supported by hoodie. Please use kryo.")
|
||||
}
|
||||
|
||||
val storageType = parameters(STORAGE_TYPE_OPT_KEY)
|
||||
val operation = parameters(OPERATION_OPT_KEY)
|
||||
@@ -146,11 +151,12 @@ class DefaultSource extends RelationProvider
|
||||
// register classes & schemas
|
||||
val structName = s"${tblName.get}_record"
|
||||
val nameSpace = s"hoodie.${tblName.get}"
|
||||
sqlContext.sparkContext.getConf.registerKryoClasses(
|
||||
sparkContext.getConf.registerKryoClasses(
|
||||
Array(classOf[org.apache.avro.generic.GenericData],
|
||||
classOf[org.apache.avro.Schema]))
|
||||
val schema = AvroConversionUtils.convertStructTypeToAvroSchema(df.schema, structName, nameSpace)
|
||||
sqlContext.sparkContext.getConf.registerAvroSchemas(schema)
|
||||
sparkContext.getConf.registerAvroSchemas(schema)
|
||||
log.info(s"Registered avro schema : ${schema.toString(true)}");
|
||||
|
||||
// Convert to RDD[HoodieRecord]
|
||||
val keyGenerator = DataSourceUtils.createKeyGenerator(
|
||||
@@ -167,7 +173,7 @@ class DefaultSource extends RelationProvider
|
||||
|
||||
|
||||
val basePath = new Path(parameters.get("path").get)
|
||||
val fs = basePath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration)
|
||||
val fs = basePath.getFileSystem(sparkContext.hadoopConfiguration)
|
||||
var exists = fs.exists(basePath)
|
||||
|
||||
// Handle various save modes
|
||||
@@ -190,12 +196,11 @@ class DefaultSource extends RelationProvider
|
||||
properties.put(HoodieTableConfig.HOODIE_TABLE_NAME_PROP_NAME, tblName.get);
|
||||
properties.put(HoodieTableConfig.HOODIE_TABLE_TYPE_PROP_NAME, storageType);
|
||||
properties.put(HoodieTableConfig.HOODIE_ARCHIVELOG_FOLDER_PROP_NAME, "archived");
|
||||
HoodieTableMetaClient.initializePathAsHoodieDataset(
|
||||
sqlContext.sparkContext.hadoopConfiguration, path.get, properties);
|
||||
HoodieTableMetaClient.initializePathAsHoodieDataset(sparkContext.hadoopConfiguration, path.get, properties);
|
||||
}
|
||||
|
||||
// Create a HoodieWriteClient & issue the write.
|
||||
val client = DataSourceUtils.createHoodieClient(new JavaSparkContext(sqlContext.sparkContext),
|
||||
val client = DataSourceUtils.createHoodieClient(new JavaSparkContext(sparkContext),
|
||||
schema.toString,
|
||||
path.get,
|
||||
tblName.get,
|
||||
|
||||
Reference in New Issue
Block a user