[HUDI-1716]: Resolving default values for schema from dataframe (#2765)
- Add default values and set null as the first entry in UNION data types in the Avro schema. Co-authored-by: Aditya Tiwari <aditya.tiwari@flipkart.com>
This commit is contained in:
@@ -20,6 +20,7 @@ package org.apache.hudi.client.bootstrap;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hudi.AvroConversionUtils;
|
||||
import org.apache.hudi.avro.HoodieAvroUtils;
|
||||
import org.apache.hudi.avro.model.HoodieFileStatus;
|
||||
import org.apache.hudi.common.bootstrap.FileStatusUtils;
|
||||
@@ -29,7 +30,6 @@ import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.parquet.schema.MessageType;
|
||||
import org.apache.spark.sql.avro.SchemaConverters;
|
||||
import org.apache.spark.sql.execution.datasources.parquet.ParquetToSparkSchemaConverter;
|
||||
import org.apache.spark.sql.internal.SQLConf;
|
||||
import org.apache.spark.sql.types.StructType;
|
||||
@@ -63,6 +63,6 @@ public class HoodieSparkBootstrapSchemaProvider extends HoodieBootstrapSchemaPro
|
||||
String structName = tableName + "_record";
|
||||
String recordNamespace = "hoodie." + tableName;
|
||||
|
||||
return SchemaConverters.toAvroType(sparkSchema, false, structName, recordNamespace);
|
||||
return AvroConversionUtils.convertStructTypeToAvroSchema(sparkSchema, structName, recordNamespace);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -33,6 +33,7 @@ import org.apache.spark.sql.avro.{IncompatibleSchemaException, SchemaConverters}
|
||||
import org.apache.spark.sql.catalyst.expressions.GenericRow
|
||||
import org.apache.spark.sql.catalyst.util.DateTimeUtils
|
||||
import org.apache.spark.sql.types._
|
||||
import org.apache.hudi.AvroConversionUtils._
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
@@ -340,7 +341,7 @@ object AvroConversionHelper {
|
||||
}
|
||||
}
|
||||
case structType: StructType =>
|
||||
val schema: Schema = SchemaConverters.toAvroType(structType, nullable = false, structName, recordNamespace)
|
||||
val schema: Schema = convertStructTypeToAvroSchema(structType, structName, recordNamespace)
|
||||
val childNameSpace = if (recordNamespace != "") s"$recordNamespace.$structName" else structName
|
||||
val fieldConverters = structType.fields.map(field =>
|
||||
createConverterToAvro(
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
package org.apache.hudi
|
||||
|
||||
import org.apache.avro.Schema
|
||||
import org.apache.avro.JsonProperties
|
||||
import org.apache.avro.generic.{GenericRecord, GenericRecordBuilder, IndexedRecord}
|
||||
import org.apache.hudi.avro.HoodieAvroUtils
|
||||
import org.apache.spark.rdd.RDD
|
||||
@@ -27,6 +28,7 @@ import org.apache.spark.sql.types.StructType
|
||||
import org.apache.spark.sql.{Dataset, Row, SparkSession}
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.collection.JavaConversions._
|
||||
|
||||
object AvroConversionUtils {
|
||||
|
||||
@@ -46,10 +48,67 @@ object AvroConversionUtils {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Returns the Avro schema corresponding to the given Spark StructType.
 *
 * The raw schema produced by SchemaConverters.toAvroType is post-processed by
 * getAvroSchemaWithDefaults so that every nullable field carries an explicit
 * null default value.
 *
 * @param structType      Dataframe struct type.
 * @param structName      Avro record name.
 * @param recordNamespace Avro record namespace.
 * @return Avro schema corresponding to the given struct type, with null
 *         defaults set on its nullable fields.
 */
def convertStructTypeToAvroSchema(structType: StructType,
                                  structName: String,
                                  recordNamespace: String): Schema = {
  getAvroSchemaWithDefaults(SchemaConverters.toAvroType(structType, nullable = false, structName, recordNamespace))
}
|
||||
|
||||
/**
 * Recursively adds a default value of null to every nullable field in the
 * given Avro schema.
 *
 * Avro requires that a union's default value match the FIRST member of the
 * union, so any union containing NULL is rebuilt with the NULL schema moved to
 * the head before the null default is attached. Records, unions, maps and
 * arrays are descended into; all other schema types are returned unchanged.
 *
 * @param schema input Avro schema
 * @return Avro schema with a null default set on all nullable fields
 */
def getAvroSchemaWithDefaults(schema: Schema): Schema = {

  schema.getType match {
    case Schema.Type.RECORD => {
      val modifiedFields = schema.getFields.map(field => {
        // Recurse first so nested records/unions inside this field are fixed too.
        val newSchema = getAvroSchemaWithDefaults(field.schema())
        field.schema().getType match {
          case Schema.Type.UNION => {
            val innerTypes = newSchema.getTypes
            // `exists` short-circuits, unlike the non-lazy `|` fold it replaces,
            // and avoids shadowing this function's `schema` parameter.
            val containsNullSchema = innerTypes.exists(innerSchema => innerSchema.getType == Schema.Type.NULL)
            if (containsNullSchema) {
              // To set null as the default, the NULL schema must be the first
              // member of the union, so rebuild the union with NULL at the head.
              val restructuredNewSchema = Schema.createUnion(
                List(Schema.create(Schema.Type.NULL)) ++ innerTypes.filter(innerSchema => innerSchema.getType != Schema.Type.NULL))
              new Schema.Field(field.name(), restructuredNewSchema, field.doc(), JsonProperties.NULL_VALUE)
            } else {
              // Union without NULL: keep the (recursively fixed) schema and the
              // field's existing default, if any.
              new Schema.Field(field.name(), newSchema, field.doc(), field.defaultVal())
            }
          }
          case _ => new Schema.Field(field.name(), newSchema, field.doc(), field.defaultVal())
        }
      }).toList
      // Avro fields cannot be reused across records, so build a fresh record.
      Schema.createRecord(schema.getName, schema.getDoc, schema.getNamespace, schema.isError, modifiedFields)
    }

    case Schema.Type.UNION =>
      Schema.createUnion(schema.getTypes.map(innerSchema => getAvroSchemaWithDefaults(innerSchema)))

    case Schema.Type.MAP =>
      Schema.createMap(getAvroSchemaWithDefaults(schema.getValueType))

    case Schema.Type.ARRAY =>
      Schema.createArray(getAvroSchemaWithDefaults(schema.getElementType))

    // Primitive and fixed/enum types carry no nested fields to default.
    case _ => schema
  }
}
|
||||
|
||||
def convertAvroSchemaToStructType(avroSchema: Schema): StructType = {
|
||||
|
||||
Reference in New Issue
Block a user