The code-style rules follow google style with some changes: 1. Increase line length from 100 to 120 2. Disable JavaDoc related checkstyles as this needs more manual work. Both source and test code are checked for code-style
128 lines
4.8 KiB
Scala
128 lines
4.8 KiB
Scala
/*
|
|
* Copyright (c) 2017 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*
|
|
*
|
|
*/
|
|
package com.uber.hoodie
|
|
|
|
import java.nio.ByteBuffer
|
|
import java.sql.{Date, Timestamp}
|
|
import java.util
|
|
|
|
import com.databricks.spark.avro.SchemaConverters
|
|
import org.apache.avro.generic.GenericData.Record
|
|
import org.apache.avro.generic.GenericRecord
|
|
import org.apache.avro.{Schema, SchemaBuilder}
|
|
import org.apache.spark.rdd.RDD
|
|
import org.apache.spark.sql.types._
|
|
import org.apache.spark.sql.{DataFrame, Row}
|
|
|
|
|
|
object AvroConversionUtils {
|
|
|
|
def createRdd(df: DataFrame, structName: String, recordNamespace: String): RDD[GenericRecord] = {
|
|
val dataType = df.schema
|
|
df.rdd.mapPartitions { records =>
|
|
if (records.isEmpty) Iterator.empty
|
|
else {
|
|
val convertor = createConverterToAvro(dataType, structName, recordNamespace)
|
|
records.map { x => convertor(x).asInstanceOf[GenericRecord] }
|
|
}
|
|
}
|
|
}
|
|
|
|
def createConverterToAvro(dataType: DataType,
|
|
structName: String,
|
|
recordNamespace: String): (Any) => Any = {
|
|
dataType match {
|
|
case BinaryType => (item: Any) =>
|
|
item match {
|
|
case null => null
|
|
case bytes: Array[Byte] => ByteBuffer.wrap(bytes)
|
|
}
|
|
case ByteType | ShortType | IntegerType | LongType |
|
|
FloatType | DoubleType | StringType | BooleanType => identity
|
|
case _: DecimalType => (item: Any) => if (item == null) null else item.toString
|
|
case TimestampType => (item: Any) =>
|
|
if (item == null) null else item.asInstanceOf[Timestamp].getTime
|
|
case DateType => (item: Any) =>
|
|
if (item == null) null else item.asInstanceOf[Date].getTime
|
|
case ArrayType(elementType, _) =>
|
|
val elementConverter = createConverterToAvro(elementType, structName, recordNamespace)
|
|
(item: Any) => {
|
|
if (item == null) {
|
|
null
|
|
} else {
|
|
val sourceArray = item.asInstanceOf[Seq[Any]]
|
|
val sourceArraySize = sourceArray.size
|
|
val targetList = new util.ArrayList[Any](sourceArraySize)
|
|
var idx = 0
|
|
while (idx < sourceArraySize) {
|
|
targetList.add(elementConverter(sourceArray(idx)))
|
|
idx += 1
|
|
}
|
|
targetList
|
|
}
|
|
}
|
|
case MapType(StringType, valueType, _) =>
|
|
val valueConverter = createConverterToAvro(valueType, structName, recordNamespace)
|
|
(item: Any) => {
|
|
if (item == null) {
|
|
null
|
|
} else {
|
|
val javaMap = new util.HashMap[String, Any]()
|
|
item.asInstanceOf[Map[String, Any]].foreach { case (key, value) =>
|
|
javaMap.put(key, valueConverter(value))
|
|
}
|
|
javaMap
|
|
}
|
|
}
|
|
case structType: StructType =>
|
|
val builder = SchemaBuilder.record(structName).namespace(recordNamespace)
|
|
val schema: Schema = SchemaConverters.convertStructToAvro(
|
|
structType, builder, recordNamespace)
|
|
val fieldConverters = structType.fields.map(field =>
|
|
createConverterToAvro(field.dataType, field.name, recordNamespace))
|
|
(item: Any) => {
|
|
if (item == null) {
|
|
null
|
|
} else {
|
|
val record = new Record(schema)
|
|
val convertersIterator = fieldConverters.iterator
|
|
val fieldNamesIterator = dataType.asInstanceOf[StructType].fieldNames.iterator
|
|
val rowIterator = item.asInstanceOf[Row].toSeq.iterator
|
|
|
|
while (convertersIterator.hasNext) {
|
|
val converter = convertersIterator.next()
|
|
record.put(fieldNamesIterator.next(), converter(rowIterator.next()))
|
|
}
|
|
record
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
def convertStructTypeToAvroSchema(structType: StructType,
|
|
structName: String,
|
|
recordNamespace: String): Schema = {
|
|
val builder = SchemaBuilder.record(structName).namespace(recordNamespace)
|
|
SchemaConverters.convertStructToAvro(structType, builder, recordNamespace)
|
|
}
|
|
|
|
def convertAvroSchemaToStructType(avroSchema: Schema): StructType = {
|
|
SchemaConverters.toSqlType(avroSchema).dataType.asInstanceOf[StructType];
|
|
}
|
|
}
|