[HUDI-1296] Support Metadata Table in Spark Datasource (#4789)
* Bootstrapping initial support for Metadata Table in Spark Datasource - Consolidated Avro/Row conversion utilities to center around Spark's AvroDeserializer ; removed duplication - Bootstrapped HoodieBaseRelation - Updated HoodieMergeOnReadRDD to be able to handle Metadata Table - Modified MOR relations to be able to read different Base File formats (Parquet, HFile)
This commit is contained in:
@@ -19,7 +19,6 @@
|
||||
package org.apache.hudi;
|
||||
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.api.java.function.ReduceFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
@@ -30,14 +29,13 @@ import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder;
|
||||
import org.apache.spark.sql.catalyst.encoders.RowEncoder;
|
||||
import org.apache.spark.sql.catalyst.expressions.Attribute;
|
||||
import org.apache.spark.sql.types.StructType;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import scala.Tuple2;
|
||||
import scala.collection.JavaConversions;
|
||||
import scala.collection.JavaConverters;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* Helper class to assist in deduplicating Rows for BulkInsert with Rows.
|
||||
*/
|
||||
@@ -55,20 +53,13 @@ public class SparkRowWriteHelper {
|
||||
}
|
||||
|
||||
public Dataset<Row> deduplicateRows(Dataset<Row> inputDf, String preCombineField, boolean isGlobalIndex) {
|
||||
ExpressionEncoder encoder = getEncoder(inputDf.schema());
|
||||
|
||||
return inputDf.groupByKey(
|
||||
(MapFunction<Row, String>) value ->
|
||||
isGlobalIndex ? (value.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD)) :
|
||||
(value.getAs(HoodieRecord.PARTITION_PATH_METADATA_FIELD) + "+" + value.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD)), Encoders.STRING())
|
||||
.reduceGroups((ReduceFunction<Row>) (v1, v2) -> {
|
||||
if (((Comparable) v1.getAs(preCombineField)).compareTo(((Comparable) v2.getAs(preCombineField))) >= 0) {
|
||||
return v1;
|
||||
} else {
|
||||
return v2;
|
||||
}
|
||||
}
|
||||
).map((MapFunction<Tuple2<String, Row>, Row>) value -> value._2, encoder);
|
||||
return inputDf.groupByKey((MapFunction<Row, String>) value ->
|
||||
isGlobalIndex
|
||||
? (value.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD))
|
||||
: (value.getAs(HoodieRecord.PARTITION_PATH_METADATA_FIELD) + "+" + value.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD)), Encoders.STRING())
|
||||
.reduceGroups((ReduceFunction<Row>) (v1, v2) ->
|
||||
((Comparable) v1.getAs(preCombineField)).compareTo(v2.getAs(preCombineField)) >= 0 ? v1 : v2)
|
||||
.map((MapFunction<Tuple2<String, Row>, Row>) value -> value._2, getEncoder(inputDf.schema()));
|
||||
}
|
||||
|
||||
private ExpressionEncoder getEncoder(StructType schema) {
|
||||
|
||||
@@ -18,17 +18,30 @@
|
||||
package org.apache.hudi
|
||||
|
||||
import org.apache.avro.Schema
|
||||
|
||||
import org.apache.avro.generic.GenericRecord
|
||||
import org.apache.hadoop.conf.Configuration
|
||||
import org.apache.hadoop.fs.Path
|
||||
import org.apache.hadoop.hbase.io.hfile.CacheConfig
|
||||
import org.apache.hudi.common.config.SerializableConfiguration
|
||||
import org.apache.hudi.common.fs.FSUtils
|
||||
import org.apache.hudi.common.model.HoodieFileFormat
|
||||
import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver}
|
||||
|
||||
import org.apache.hudi.common.util.StringUtils
|
||||
import org.apache.hudi.io.storage.HoodieHFileReader
|
||||
import org.apache.hudi.metadata.HoodieTableMetadata
|
||||
import org.apache.spark.internal.Logging
|
||||
import org.apache.spark.sql.{SQLContext, SparkSession}
|
||||
import org.apache.spark.sql.avro.SchemaConverters
|
||||
import org.apache.spark.sql.sources.{BaseRelation, PrunedFilteredScan}
|
||||
import org.apache.spark.sql.catalyst.InternalRow
|
||||
import org.apache.spark.sql.execution.datasources.PartitionedFile
|
||||
import org.apache.spark.sql.sources.{BaseRelation, Filter, PrunedFilteredScan}
|
||||
import org.apache.spark.sql.types.StructType
|
||||
import org.apache.spark.sql.{SQLContext, SparkSession}
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.util.Try
|
||||
|
||||
case class HoodieTableSchema(structTypeSchema: StructType, avroSchemaStr: String)
|
||||
|
||||
/**
|
||||
* Hoodie BaseRelation which extends [[PrunedFilteredScan]].
|
||||
*/
|
||||
@@ -41,15 +54,105 @@ abstract class HoodieBaseRelation(
|
||||
|
||||
protected val sparkSession: SparkSession = sqlContext.sparkSession
|
||||
|
||||
protected val tableAvroSchema: Schema = {
|
||||
protected lazy val tableAvroSchema: Schema = {
|
||||
val schemaUtil = new TableSchemaResolver(metaClient)
|
||||
Try (schemaUtil.getTableAvroSchema).getOrElse(SchemaConverters.toAvroType(userSchema.get))
|
||||
Try(schemaUtil.getTableAvroSchema).getOrElse(
|
||||
// If there is no commit in the table, we can't get the schema
|
||||
// t/h [[TableSchemaResolver]], fallback to the provided [[userSchema]] instead.
|
||||
userSchema match {
|
||||
case Some(s) => SchemaConverters.toAvroType(s)
|
||||
case _ => throw new IllegalArgumentException("User-provided schema is required in case the table is empty")
|
||||
}
|
||||
)
|
||||
}
|
||||
|
||||
protected val tableStructSchema: StructType = AvroConversionUtils.convertAvroSchemaToStructType(tableAvroSchema)
|
||||
|
||||
protected val partitionColumns: Array[String] = metaClient.getTableConfig.getPartitionFields.orElse(Array.empty)
|
||||
|
||||
override def schema: StructType = userSchema.getOrElse(tableStructSchema)
|
||||
protected def getPrecombineFieldProperty: Option[String] =
|
||||
Option(metaClient.getTableConfig.getPreCombineField)
|
||||
.orElse(optParams.get(DataSourceWriteOptions.PRECOMBINE_FIELD.key)) match {
|
||||
// NOTE: This is required to compensate for cases when empty string is used to stub
|
||||
// property value to avoid it being set with the default value
|
||||
// TODO(HUDI-3456) cleanup
|
||||
case Some(f) if !StringUtils.isNullOrEmpty(f) => Some(f)
|
||||
case _ => None
|
||||
}
|
||||
|
||||
override def schema: StructType = tableStructSchema
|
||||
}
|
||||
|
||||
object HoodieBaseRelation {
|
||||
|
||||
def isMetadataTable(metaClient: HoodieTableMetaClient) =
|
||||
HoodieTableMetadata.isMetadataTable(metaClient.getBasePath)
|
||||
|
||||
/**
|
||||
* Returns file-reader routine accepting [[PartitionedFile]] and returning an [[Iterator]]
|
||||
* over [[InternalRow]]
|
||||
*/
|
||||
def createBaseFileReader(spark: SparkSession,
|
||||
partitionSchema: StructType,
|
||||
tableSchema: HoodieTableSchema,
|
||||
requiredSchema: HoodieTableSchema,
|
||||
filters: Seq[Filter],
|
||||
options: Map[String, String],
|
||||
hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = {
|
||||
val hfileReader = createHFileReader(
|
||||
spark = spark,
|
||||
tableSchema = tableSchema,
|
||||
requiredSchema = requiredSchema,
|
||||
filters = filters,
|
||||
options = options,
|
||||
hadoopConf = hadoopConf
|
||||
)
|
||||
val parquetReader = HoodieDataSourceHelper.buildHoodieParquetReader(
|
||||
sparkSession = spark,
|
||||
dataSchema = tableSchema.structTypeSchema,
|
||||
partitionSchema = partitionSchema,
|
||||
requiredSchema = requiredSchema.structTypeSchema,
|
||||
filters = filters,
|
||||
options = options,
|
||||
hadoopConf = hadoopConf
|
||||
)
|
||||
|
||||
partitionedFile => {
|
||||
val extension = FSUtils.getFileExtension(partitionedFile.filePath)
|
||||
if (HoodieFileFormat.PARQUET.getFileExtension.equals(extension)) {
|
||||
parquetReader.apply(partitionedFile)
|
||||
} else if (HoodieFileFormat.HFILE.getFileExtension.equals(extension)) {
|
||||
hfileReader.apply(partitionedFile)
|
||||
} else {
|
||||
throw new UnsupportedOperationException(s"Base file format not supported by Spark DataSource ($partitionedFile)")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private def createHFileReader(spark: SparkSession,
|
||||
tableSchema: HoodieTableSchema,
|
||||
requiredSchema: HoodieTableSchema,
|
||||
filters: Seq[Filter],
|
||||
options: Map[String, String],
|
||||
hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = {
|
||||
val hadoopConfBroadcast =
|
||||
spark.sparkContext.broadcast(new SerializableConfiguration(hadoopConf))
|
||||
|
||||
partitionedFile => {
|
||||
val hadoopConf = hadoopConfBroadcast.value.get()
|
||||
val reader = new HoodieHFileReader[GenericRecord](hadoopConf, new Path(partitionedFile.filePath),
|
||||
new CacheConfig(hadoopConf))
|
||||
|
||||
val requiredRowSchema = requiredSchema.structTypeSchema
|
||||
// NOTE: Schema has to be parsed at this point, since Avro's [[Schema]] aren't serializable
|
||||
// to be passed from driver to executor
|
||||
val requiredAvroSchema = new Schema.Parser().parse(requiredSchema.avroSchemaStr)
|
||||
val avroToRowConverter = AvroConversionUtils.createAvroToInternalRowConverter(requiredAvroSchema, requiredRowSchema)
|
||||
|
||||
reader.getRecordIterator(requiredAvroSchema).asScala
|
||||
.map(record => {
|
||||
avroToRowConverter.apply(record.asInstanceOf[GenericRecord]).get
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -20,18 +20,16 @@ package org.apache.hudi
|
||||
|
||||
import org.apache.hadoop.conf.Configuration
|
||||
import org.apache.hadoop.fs.FileStatus
|
||||
|
||||
import org.apache.spark.sql.SparkSession
|
||||
import org.apache.spark.sql.catalyst.InternalRow
|
||||
import org.apache.spark.sql.catalyst.expressions.{Expression, PredicateHelper, SpecificInternalRow, SubqueryExpression, UnsafeProjection}
|
||||
import org.apache.spark.sql.execution.datasources.PartitionedFile
|
||||
import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
|
||||
import org.apache.spark.sql.execution.datasources.{FilePartition, PartitionedFile}
|
||||
import org.apache.spark.sql.sources.Filter
|
||||
import org.apache.spark.sql.types.StructType
|
||||
import org.apache.spark.sql.vectorized.ColumnarBatch
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.collection.mutable.ArrayBuffer
|
||||
|
||||
object HoodieDataSourceHelper extends PredicateHelper {
|
||||
|
||||
@@ -77,14 +75,13 @@ object HoodieDataSourceHelper extends PredicateHelper {
|
||||
* Wrapper `buildReaderWithPartitionValues` of [[ParquetFileFormat]]
|
||||
* to deal with [[ColumnarBatch]] when enable parquet vectorized reader if necessary.
|
||||
*/
|
||||
def buildHoodieParquetReader(
|
||||
sparkSession: SparkSession,
|
||||
dataSchema: StructType,
|
||||
partitionSchema: StructType,
|
||||
requiredSchema: StructType,
|
||||
filters: Seq[Filter],
|
||||
options: Map[String, String],
|
||||
hadoopConf: Configuration): (PartitionedFile) => Iterator[InternalRow] = {
|
||||
def buildHoodieParquetReader(sparkSession: SparkSession,
|
||||
dataSchema: StructType,
|
||||
partitionSchema: StructType,
|
||||
requiredSchema: StructType,
|
||||
filters: Seq[Filter],
|
||||
options: Map[String, String],
|
||||
hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = {
|
||||
|
||||
val readParquetFile: PartitionedFile => Iterator[Any] = new ParquetFileFormat().buildReaderWithPartitionValues(
|
||||
sparkSession = sparkSession,
|
||||
@@ -98,11 +95,10 @@ object HoodieDataSourceHelper extends PredicateHelper {
|
||||
|
||||
file: PartitionedFile => {
|
||||
val iter = readParquetFile(file)
|
||||
val rows = iter.flatMap(_ match {
|
||||
iter.flatMap {
|
||||
case r: InternalRow => Seq(r)
|
||||
case b: ColumnarBatch => b.rowIterator().asScala
|
||||
})
|
||||
rows
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -23,14 +23,19 @@ import org.apache.avro.generic.{GenericRecord, GenericRecordBuilder}
|
||||
import org.apache.hadoop.conf.Configuration
|
||||
import org.apache.hadoop.fs.Path
|
||||
import org.apache.hudi.HoodieDataSourceHelper._
|
||||
import org.apache.hudi.HoodieMergeOnReadRDD.resolveAvroSchemaNullability
|
||||
import org.apache.hudi.MergeOnReadSnapshotRelation.getFilePath
|
||||
import org.apache.hudi.common.config.HoodieMetadataConfig
|
||||
import org.apache.hudi.common.engine.HoodieLocalEngineContext
|
||||
import org.apache.hudi.common.fs.FSUtils
|
||||
import org.apache.hudi.common.fs.FSUtils.getRelativePartitionPath
|
||||
import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner
|
||||
import org.apache.hudi.config.HoodiePayloadConfig
|
||||
import org.apache.hudi.exception.HoodieException
|
||||
import org.apache.hudi.hadoop.config.HoodieRealtimeConfig
|
||||
import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils.HOODIE_RECORD_KEY_COL_POS
|
||||
import org.apache.hudi.metadata.HoodieTableMetadata.getDataTableBasePathFromMetadataTable
|
||||
import org.apache.hudi.metadata.{HoodieBackedTableMetadata, HoodieTableMetadata}
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.sql.avro.{HoodieAvroDeserializer, HoodieAvroSerializer}
|
||||
import org.apache.spark.sql.catalyst.InternalRow
|
||||
import org.apache.spark.sql.catalyst.expressions.UnsafeProjection
|
||||
import org.apache.spark.sql.execution.datasources.PartitionedFile
|
||||
@@ -48,51 +53,38 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext,
|
||||
@transient config: Configuration,
|
||||
fullSchemaFileReader: PartitionedFile => Iterator[InternalRow],
|
||||
requiredSchemaFileReader: PartitionedFile => Iterator[InternalRow],
|
||||
tableState: HoodieMergeOnReadTableState)
|
||||
tableState: HoodieMergeOnReadTableState,
|
||||
tableSchema: HoodieTableSchema,
|
||||
requiredSchema: HoodieTableSchema)
|
||||
extends RDD[InternalRow](sc, Nil) {
|
||||
|
||||
private val confBroadcast = sc.broadcast(new SerializableWritable(config))
|
||||
private val preCombineField = tableState.preCombineField
|
||||
private val recordKeyFieldOpt = tableState.recordKeyFieldOpt
|
||||
private val payloadProps = if (preCombineField.isDefined) {
|
||||
HoodiePayloadConfig.newBuilder
|
||||
.withPayloadOrderingField(preCombineField.get)
|
||||
.build.getProps
|
||||
} else {
|
||||
new Properties()
|
||||
}
|
||||
|
||||
private val requiredSchema = tableState.requiredStructSchema
|
||||
|
||||
private val requiredFieldPosition = HoodieSparkUtils.collectFieldIndexes(requiredSchema,
|
||||
tableState.tableStructSchema
|
||||
)
|
||||
private val recordKeyField = tableState.recordKeyField
|
||||
private val payloadProps = tableState.preCombineFieldOpt
|
||||
.map(preCombineField =>
|
||||
HoodiePayloadConfig.newBuilder
|
||||
.withPayloadOrderingField(preCombineField)
|
||||
.build
|
||||
.getProps
|
||||
)
|
||||
.getOrElse(new Properties())
|
||||
|
||||
override def compute(split: Partition, context: TaskContext): Iterator[InternalRow] = {
|
||||
val mergeOnReadPartition = split.asInstanceOf[HoodieMergeOnReadPartition]
|
||||
val iter = mergeOnReadPartition.split match {
|
||||
case dataFileOnlySplit if dataFileOnlySplit.logPaths.isEmpty =>
|
||||
val rows = requiredSchemaFileReader(dataFileOnlySplit.dataFile.get)
|
||||
extractRequiredSchema(rows, requiredSchema, requiredFieldPosition)
|
||||
case dataFileOnlySplit if dataFileOnlySplit.logFiles.isEmpty =>
|
||||
requiredSchemaFileReader(dataFileOnlySplit.dataFile.get)
|
||||
case logFileOnlySplit if logFileOnlySplit.dataFile.isEmpty =>
|
||||
logFileIterator(logFileOnlySplit, getConfig)
|
||||
case skipMergeSplit if skipMergeSplit.mergeType
|
||||
.equals(DataSourceReadOptions.REALTIME_SKIP_MERGE_OPT_VAL) =>
|
||||
skipMergeFileIterator(
|
||||
skipMergeSplit,
|
||||
requiredSchemaFileReader(skipMergeSplit.dataFile.get),
|
||||
getConfig
|
||||
)
|
||||
case payloadCombineSplit if payloadCombineSplit.mergeType
|
||||
.equals(DataSourceReadOptions.REALTIME_PAYLOAD_COMBINE_OPT_VAL) =>
|
||||
payloadCombineFileIterator(
|
||||
payloadCombineSplit,
|
||||
fullSchemaFileReader(payloadCombineSplit.dataFile.get),
|
||||
getConfig
|
||||
)
|
||||
case skipMergeSplit if skipMergeSplit.mergeType.equals(DataSourceReadOptions.REALTIME_SKIP_MERGE_OPT_VAL) =>
|
||||
skipMergeFileIterator(skipMergeSplit, requiredSchemaFileReader(skipMergeSplit.dataFile.get), getConfig)
|
||||
case payloadCombineSplit
|
||||
if payloadCombineSplit.mergeType.equals(DataSourceReadOptions.REALTIME_PAYLOAD_COMBINE_OPT_VAL) =>
|
||||
payloadCombineFileIterator(payloadCombineSplit, fullSchemaFileReader(payloadCombineSplit.dataFile.get),
|
||||
getConfig)
|
||||
case _ => throw new HoodieException(s"Unable to select an Iterator to read the Hoodie MOR File Split for " +
|
||||
s"file path: ${mergeOnReadPartition.split.dataFile.get.filePath}" +
|
||||
s"log paths: ${mergeOnReadPartition.split.logPaths.toString}" +
|
||||
s"log paths: ${mergeOnReadPartition.split.logFiles.toString}" +
|
||||
s"hoodie table path: ${mergeOnReadPartition.split.tablePath}" +
|
||||
s"spark partition Index: ${mergeOnReadPartition.index}" +
|
||||
s"merge type: ${mergeOnReadPartition.split.mergeType}")
|
||||
@@ -121,12 +113,15 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext,
|
||||
|
||||
private def logFileIterator(split: HoodieMergeOnReadFileSplit,
|
||||
config: Configuration): Iterator[InternalRow] =
|
||||
new Iterator[InternalRow] with Closeable {
|
||||
private val tableAvroSchema = new Schema.Parser().parse(tableState.tableAvroSchema)
|
||||
private val requiredAvroSchema = new Schema.Parser().parse(tableState.requiredAvroSchema)
|
||||
new Iterator[InternalRow] with Closeable with SparkAdapterSupport {
|
||||
private val tableAvroSchema = new Schema.Parser().parse(tableSchema.avroSchemaStr)
|
||||
private val requiredAvroSchema = new Schema.Parser().parse(requiredSchema.avroSchemaStr)
|
||||
private val requiredFieldPosition =
|
||||
requiredSchema.structTypeSchema
|
||||
.map(f => tableAvroSchema.getField(f.name).pos()).toList
|
||||
private val recordBuilder = new GenericRecordBuilder(requiredAvroSchema)
|
||||
private val deserializer = HoodieAvroDeserializer(requiredAvroSchema, tableState.requiredStructSchema)
|
||||
private val unsafeProjection = UnsafeProjection.create(tableState.requiredStructSchema)
|
||||
private val deserializer = sparkAdapter.createAvroDeserializer(requiredAvroSchema, requiredSchema.structTypeSchema)
|
||||
private val unsafeProjection = UnsafeProjection.create(requiredSchema.structTypeSchema)
|
||||
private var logScanner = HoodieMergeOnReadRDD.scanLog(split, tableAvroSchema, config)
|
||||
private val logRecords = logScanner.getRecords
|
||||
private val logRecordsKeyIterator = logRecords.keySet().iterator().asScala
|
||||
@@ -141,9 +136,10 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext,
|
||||
// delete record found, skipping
|
||||
this.hasNext
|
||||
} else {
|
||||
val requiredAvroRecord = AvroConversionUtils
|
||||
.buildAvroRecordBySchema(curAvroRecord.get(), requiredAvroSchema, requiredFieldPosition, recordBuilder)
|
||||
recordToLoad = unsafeProjection(deserializer.deserializeData(requiredAvroRecord).asInstanceOf[InternalRow])
|
||||
val requiredAvroRecord = AvroConversionUtils.buildAvroRecordBySchema(curAvroRecord.get(), requiredAvroSchema,
|
||||
requiredFieldPosition, recordBuilder)
|
||||
val rowOpt = deserializer.deserialize(requiredAvroRecord)
|
||||
recordToLoad = unsafeProjection(rowOpt.get.asInstanceOf[InternalRow])
|
||||
true
|
||||
}
|
||||
} else {
|
||||
@@ -169,12 +165,15 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext,
|
||||
private def skipMergeFileIterator(split: HoodieMergeOnReadFileSplit,
|
||||
baseFileIterator: Iterator[InternalRow],
|
||||
config: Configuration): Iterator[InternalRow] =
|
||||
new Iterator[InternalRow] with Closeable {
|
||||
private val tableAvroSchema = new Schema.Parser().parse(tableState.tableAvroSchema)
|
||||
private val requiredAvroSchema = new Schema.Parser().parse(tableState.requiredAvroSchema)
|
||||
new Iterator[InternalRow] with Closeable with SparkAdapterSupport {
|
||||
private val tableAvroSchema = new Schema.Parser().parse(tableSchema.avroSchemaStr)
|
||||
private val requiredAvroSchema = new Schema.Parser().parse(requiredSchema.avroSchemaStr)
|
||||
private val requiredFieldPosition =
|
||||
requiredSchema.structTypeSchema
|
||||
.map(f => tableAvroSchema.getField(f.name).pos()).toList
|
||||
private val recordBuilder = new GenericRecordBuilder(requiredAvroSchema)
|
||||
private val deserializer = HoodieAvroDeserializer(requiredAvroSchema, tableState.requiredStructSchema)
|
||||
private val unsafeProjection = UnsafeProjection.create(tableState.requiredStructSchema)
|
||||
private val deserializer = sparkAdapter.createAvroDeserializer(requiredAvroSchema, requiredSchema.structTypeSchema)
|
||||
private val unsafeProjection = UnsafeProjection.create(requiredSchema.structTypeSchema)
|
||||
private var logScanner = HoodieMergeOnReadRDD.scanLog(split, tableAvroSchema, config)
|
||||
private val logRecords = logScanner.getRecords
|
||||
private val logRecordsKeyIterator = logRecords.keySet().iterator().asScala
|
||||
@@ -185,7 +184,7 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext,
|
||||
override def hasNext: Boolean = {
|
||||
if (baseFileIterator.hasNext) {
|
||||
val curRow = baseFileIterator.next()
|
||||
recordToLoad = unsafeProjection(createInternalRowWithSchema(curRow, requiredSchema, requiredFieldPosition))
|
||||
recordToLoad = unsafeProjection(curRow)
|
||||
true
|
||||
} else {
|
||||
if (logRecordsKeyIterator.hasNext) {
|
||||
@@ -195,9 +194,10 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext,
|
||||
// delete record found, skipping
|
||||
this.hasNext
|
||||
} else {
|
||||
val requiredAvroRecord = AvroConversionUtils
|
||||
.buildAvroRecordBySchema(curAvroRecord.get(), requiredAvroSchema, requiredFieldPosition, recordBuilder)
|
||||
recordToLoad = unsafeProjection(deserializer.deserializeData(requiredAvroRecord).asInstanceOf[InternalRow])
|
||||
val requiredAvroRecord = AvroConversionUtils.buildAvroRecordBySchema(curAvroRecord.get(), requiredAvroSchema,
|
||||
requiredFieldPosition, recordBuilder)
|
||||
val rowOpt = deserializer.deserialize(requiredAvroRecord)
|
||||
recordToLoad = unsafeProjection(rowOpt.get.asInstanceOf[InternalRow])
|
||||
true
|
||||
}
|
||||
} else {
|
||||
@@ -224,18 +224,22 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext,
|
||||
private def payloadCombineFileIterator(split: HoodieMergeOnReadFileSplit,
|
||||
baseFileIterator: Iterator[InternalRow],
|
||||
config: Configuration): Iterator[InternalRow] =
|
||||
new Iterator[InternalRow] with Closeable {
|
||||
private val tableAvroSchema = new Schema.Parser().parse(tableState.tableAvroSchema)
|
||||
private val requiredAvroSchema = new Schema.Parser().parse(tableState.requiredAvroSchema)
|
||||
private val serializer = HoodieAvroSerializer(tableState.tableStructSchema, tableAvroSchema, false)
|
||||
private val requiredDeserializer = HoodieAvroDeserializer(requiredAvroSchema, tableState.requiredStructSchema)
|
||||
new Iterator[InternalRow] with Closeable with SparkAdapterSupport {
|
||||
private val tableAvroSchema = new Schema.Parser().parse(tableSchema.avroSchemaStr)
|
||||
private val requiredAvroSchema = new Schema.Parser().parse(requiredSchema.avroSchemaStr)
|
||||
private val requiredFieldPosition =
|
||||
requiredSchema.structTypeSchema
|
||||
.map(f => tableAvroSchema.getField(f.name).pos()).toList
|
||||
private val serializer = sparkAdapter.createAvroSerializer(tableSchema.structTypeSchema, tableAvroSchema,
|
||||
resolveAvroSchemaNullability(tableAvroSchema))
|
||||
private val requiredDeserializer = sparkAdapter.createAvroDeserializer(requiredAvroSchema, requiredSchema.structTypeSchema)
|
||||
private val recordBuilder = new GenericRecordBuilder(requiredAvroSchema)
|
||||
private val unsafeProjection = UnsafeProjection.create(tableState.requiredStructSchema)
|
||||
private val unsafeProjection = UnsafeProjection.create(requiredSchema.structTypeSchema)
|
||||
private var logScanner = HoodieMergeOnReadRDD.scanLog(split, tableAvroSchema, config)
|
||||
private val logRecords = logScanner.getRecords
|
||||
private val logRecordsKeyIterator = logRecords.keySet().iterator().asScala
|
||||
private val keyToSkip = mutable.Set.empty[String]
|
||||
private val recordKeyPosition = if (recordKeyFieldOpt.isEmpty) HOODIE_RECORD_KEY_COL_POS else tableState.tableStructSchema.fieldIndex(recordKeyFieldOpt.get)
|
||||
private val recordKeyPosition = tableSchema.structTypeSchema.fieldIndex(recordKeyField)
|
||||
|
||||
private var recordToLoad: InternalRow = _
|
||||
|
||||
@@ -253,20 +257,15 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext,
|
||||
this.hasNext
|
||||
} else {
|
||||
// load merged record as InternalRow with required schema
|
||||
val requiredAvroRecord = AvroConversionUtils
|
||||
.buildAvroRecordBySchema(
|
||||
mergedAvroRecord.get(),
|
||||
requiredAvroSchema,
|
||||
requiredFieldPosition,
|
||||
recordBuilder
|
||||
)
|
||||
recordToLoad = unsafeProjection(requiredDeserializer
|
||||
.deserializeData(requiredAvroRecord).asInstanceOf[InternalRow])
|
||||
val requiredAvroRecord = AvroConversionUtils.buildAvroRecordBySchema(mergedAvroRecord.get(), requiredAvroSchema,
|
||||
requiredFieldPosition, recordBuilder)
|
||||
val rowOpt = requiredDeserializer.deserialize(requiredAvroRecord)
|
||||
recordToLoad = unsafeProjection(rowOpt.get.asInstanceOf[InternalRow])
|
||||
true
|
||||
}
|
||||
} else {
|
||||
// No merge needed, load current row with required schema
|
||||
recordToLoad = unsafeProjection(createInternalRowWithSchema(curRow, requiredSchema, requiredFieldPosition))
|
||||
recordToLoad = unsafeProjection(createInternalRowWithSchema(curRow, requiredSchema.structTypeSchema, requiredFieldPosition))
|
||||
true
|
||||
}
|
||||
} else {
|
||||
@@ -287,8 +286,8 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext,
|
||||
requiredFieldPosition,
|
||||
recordBuilder
|
||||
)
|
||||
recordToLoad = unsafeProjection(requiredDeserializer
|
||||
.deserializeData(requiredAvroRecord).asInstanceOf[InternalRow])
|
||||
val rowOpt = requiredDeserializer.deserialize(requiredAvroRecord)
|
||||
recordToLoad = unsafeProjection(rowOpt.get.asInstanceOf[InternalRow])
|
||||
true
|
||||
}
|
||||
}
|
||||
@@ -312,8 +311,8 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext,
|
||||
|
||||
private def mergeRowWithLog(curRow: InternalRow, curKey: String) = {
|
||||
val historyAvroRecord = serializer.serialize(curRow).asInstanceOf[GenericRecord]
|
||||
logRecords.get(curKey).getData.combineAndGetUpdateValue(
|
||||
historyAvroRecord, tableAvroSchema, payloadProps)
|
||||
logRecords.get(curKey).getData
|
||||
.combineAndGetUpdateValue(historyAvroRecord, tableAvroSchema, payloadProps)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -323,32 +322,60 @@ private object HoodieMergeOnReadRDD {
|
||||
|
||||
def scanLog(split: HoodieMergeOnReadFileSplit, logSchema: Schema, config: Configuration): HoodieMergedLogRecordScanner = {
|
||||
val fs = FSUtils.getFs(split.tablePath, config)
|
||||
val partitionPath: String = if (split.logPaths.isEmpty || split.logPaths.get.asJava.isEmpty) {
|
||||
null
|
||||
val logFiles = split.logFiles.get
|
||||
|
||||
if (HoodieTableMetadata.isMetadataTable(split.tablePath)) {
|
||||
val metadataConfig = HoodieMetadataConfig.newBuilder().enable(true).build()
|
||||
val dataTableBasePath = getDataTableBasePathFromMetadataTable(split.tablePath)
|
||||
val metadataTable = new HoodieBackedTableMetadata(
|
||||
new HoodieLocalEngineContext(config), metadataConfig,
|
||||
dataTableBasePath,
|
||||
config.get(HoodieRealtimeConfig.SPILLABLE_MAP_BASE_PATH_PROP, HoodieRealtimeConfig.DEFAULT_SPILLABLE_MAP_BASE_PATH))
|
||||
|
||||
// NOTE: In case of Metadata Table partition path equates to partition name (since there's just one level
|
||||
// of indirection among MT partitions)
|
||||
val relativePartitionPath = getRelativePartitionPath(new Path(split.tablePath), getPartitionPath(split))
|
||||
metadataTable.getLogRecordScanner(logFiles.asJava, relativePartitionPath).getLeft
|
||||
} else {
|
||||
new Path(split.logPaths.get.asJava.get(0)).getParent.getName
|
||||
val logRecordScannerBuilder = HoodieMergedLogRecordScanner.newBuilder()
|
||||
.withFileSystem(fs)
|
||||
.withBasePath(split.tablePath)
|
||||
.withLogFilePaths(split.logFiles.get.map(logFile => getFilePath(logFile.getPath)).asJava)
|
||||
.withReaderSchema(logSchema)
|
||||
.withLatestInstantTime(split.latestCommit)
|
||||
.withReadBlocksLazily(
|
||||
Try(config.get(HoodieRealtimeConfig.COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP,
|
||||
HoodieRealtimeConfig.DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED).toBoolean)
|
||||
.getOrElse(false))
|
||||
.withReverseReader(false)
|
||||
.withBufferSize(
|
||||
config.getInt(HoodieRealtimeConfig.MAX_DFS_STREAM_BUFFER_SIZE_PROP,
|
||||
HoodieRealtimeConfig.DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE))
|
||||
.withMaxMemorySizeInBytes(split.maxCompactionMemoryInBytes)
|
||||
.withSpillableMapBasePath(
|
||||
config.get(HoodieRealtimeConfig.SPILLABLE_MAP_BASE_PATH_PROP,
|
||||
HoodieRealtimeConfig.DEFAULT_SPILLABLE_MAP_BASE_PATH))
|
||||
|
||||
if (logFiles.nonEmpty) {
|
||||
logRecordScannerBuilder.withPartition(getRelativePartitionPath(new Path(split.tablePath), logFiles.head.getPath.getParent))
|
||||
}
|
||||
|
||||
logRecordScannerBuilder.build()
|
||||
}
|
||||
val logRecordScannerBuilder = HoodieMergedLogRecordScanner.newBuilder()
|
||||
.withFileSystem(fs)
|
||||
.withBasePath(split.tablePath)
|
||||
.withLogFilePaths(split.logPaths.get.asJava)
|
||||
.withReaderSchema(logSchema)
|
||||
.withLatestInstantTime(split.latestCommit)
|
||||
.withReadBlocksLazily(
|
||||
Try(config.get(HoodieRealtimeConfig.COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP,
|
||||
HoodieRealtimeConfig.DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED).toBoolean)
|
||||
.getOrElse(false))
|
||||
.withReverseReader(false)
|
||||
.withBufferSize(
|
||||
config.getInt(HoodieRealtimeConfig.MAX_DFS_STREAM_BUFFER_SIZE_PROP,
|
||||
HoodieRealtimeConfig.DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE))
|
||||
.withMaxMemorySizeInBytes(split.maxCompactionMemoryInBytes)
|
||||
.withSpillableMapBasePath(
|
||||
config.get(HoodieRealtimeConfig.SPILLABLE_MAP_BASE_PATH_PROP,
|
||||
HoodieRealtimeConfig.DEFAULT_SPILLABLE_MAP_BASE_PATH))
|
||||
if (partitionPath != null) {
|
||||
logRecordScannerBuilder.withPartition(partitionPath)
|
||||
}
|
||||
|
||||
private def getPartitionPath(split: HoodieMergeOnReadFileSplit): Path = {
|
||||
// Determine partition path as an immediate parent folder of either
|
||||
// - The base file
|
||||
// - Some log file
|
||||
split.dataFile.map(baseFile => new Path(baseFile.filePath))
|
||||
.getOrElse(split.logFiles.get.head.getPath)
|
||||
.getParent
|
||||
}
|
||||
|
||||
private def resolveAvroSchemaNullability(schema: Schema) = {
|
||||
AvroConversionUtils.resolveAvroTypeNullability(schema) match {
|
||||
case (nullable, _) => nullable
|
||||
}
|
||||
logRecordScannerBuilder.build()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,8 +17,10 @@
|
||||
|
||||
package org.apache.hudi
|
||||
|
||||
import org.apache.hadoop.conf.Configuration
|
||||
import org.apache.hadoop.fs.{GlobPattern, Path}
|
||||
import org.apache.hadoop.mapred.JobConf
|
||||
import org.apache.hudi.HoodieBaseRelation.createBaseFileReader
|
||||
import org.apache.hudi.common.model.HoodieRecord
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient
|
||||
import org.apache.hudi.common.table.view.HoodieTableFileSystemView
|
||||
@@ -35,17 +37,17 @@ import org.apache.spark.sql.{Row, SQLContext}
|
||||
import scala.collection.JavaConversions._
|
||||
|
||||
/**
|
||||
* Experimental.
|
||||
* Relation, that implements the Hoodie incremental view for Merge On Read table.
|
||||
*
|
||||
*/
|
||||
* Experimental.
|
||||
* Relation, that implements the Hoodie incremental view for Merge On Read table.
|
||||
*
|
||||
*/
|
||||
class MergeOnReadIncrementalRelation(sqlContext: SQLContext,
|
||||
val optParams: Map[String, String],
|
||||
val userSchema: Option[StructType],
|
||||
val metaClient: HoodieTableMetaClient)
|
||||
extends HoodieBaseRelation(sqlContext, metaClient, optParams, userSchema) {
|
||||
|
||||
private val conf = sqlContext.sparkContext.hadoopConfiguration
|
||||
private val conf = new Configuration(sqlContext.sparkContext.hadoopConfiguration)
|
||||
private val jobConf = new JobConf(conf)
|
||||
|
||||
private val commitTimeline = metaClient.getCommitsAndCompactionTimeline.filterCompletedInstants()
|
||||
@@ -75,84 +77,89 @@ class MergeOnReadIncrementalRelation(sqlContext: SQLContext,
|
||||
|
||||
private val fileIndex = if (commitsToReturn.isEmpty) List() else buildFileIndex()
|
||||
|
||||
private val preCombineField = {
|
||||
val preCombineFieldFromTableConfig = metaClient.getTableConfig.getPreCombineField
|
||||
if (preCombineFieldFromTableConfig != null) {
|
||||
Some(preCombineFieldFromTableConfig)
|
||||
} else {
|
||||
// get preCombineFiled from the options if this is a old table which have not store
|
||||
// the field to hoodie.properties
|
||||
optParams.get(DataSourceReadOptions.READ_PRE_COMBINE_FIELD.key)
|
||||
}
|
||||
private val preCombineFieldOpt = getPrecombineFieldProperty
|
||||
|
||||
// Record filters making sure that only records w/in the requested bounds are being fetched as part of the
|
||||
// scan collected by this relation
|
||||
private lazy val incrementalSpanRecordsFilters: Seq[Filter] = {
|
||||
val isNotNullFilter = IsNotNull(HoodieRecord.COMMIT_TIME_METADATA_FIELD)
|
||||
val largerThanFilter = GreaterThanOrEqual(HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitsToReturn.head.getTimestamp)
|
||||
val lessThanFilter = LessThanOrEqual(HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitsToReturn.last.getTimestamp)
|
||||
Seq(isNotNullFilter, largerThanFilter, lessThanFilter)
|
||||
}
|
||||
|
||||
private lazy val mandatoryColumns = {
|
||||
// NOTE: This columns are required for Incremental flow to be able to handle the rows properly, even in
|
||||
// cases when no columns are requested to be fetched (for ex, when using {@code count()} API)
|
||||
Seq(HoodieRecord.RECORD_KEY_METADATA_FIELD, HoodieRecord.COMMIT_TIME_METADATA_FIELD) ++
|
||||
preCombineFieldOpt.map(Seq(_)).getOrElse(Seq())
|
||||
}
|
||||
|
||||
override def needConversion: Boolean = false
|
||||
|
||||
override def unhandledFilters(filters: Array[Filter]): Array[Filter] = {
|
||||
if (fileIndex.isEmpty) {
|
||||
filters
|
||||
} else {
|
||||
val isNotNullFilter = IsNotNull(HoodieRecord.COMMIT_TIME_METADATA_FIELD)
|
||||
val largerThanFilter = GreaterThanOrEqual(HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitsToReturn.head.getTimestamp)
|
||||
val lessThanFilter = LessThanOrEqual(HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitsToReturn.last.getTimestamp)
|
||||
filters :+ isNotNullFilter :+ largerThanFilter :+ lessThanFilter
|
||||
}
|
||||
}
|
||||
|
||||
override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = {
|
||||
if (fileIndex.isEmpty) {
|
||||
sqlContext.sparkContext.emptyRDD[Row]
|
||||
} else {
|
||||
logDebug(s"buildScan requiredColumns = ${requiredColumns.mkString(",")}")
|
||||
logDebug(s"buildScan filters = ${filters.mkString(",")}")
|
||||
|
||||
// config to ensure the push down filter for parquet will be applied.
|
||||
sqlContext.sparkSession.sessionState.conf.setConfString("spark.sql.parquet.filterPushdown", "true")
|
||||
sqlContext.sparkSession.sessionState.conf.setConfString("spark.sql.parquet.recordLevelFilter.enabled", "true")
|
||||
sqlContext.sparkSession.sessionState.conf.setConfString("spark.sql.parquet.enableVectorizedReader", "false")
|
||||
val pushDownFilter = {
|
||||
val isNotNullFilter = IsNotNull(HoodieRecord.COMMIT_TIME_METADATA_FIELD)
|
||||
val largerThanFilter = GreaterThanOrEqual(HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitsToReturn.head.getTimestamp)
|
||||
val lessThanFilter = LessThanOrEqual(HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitsToReturn.last.getTimestamp)
|
||||
filters :+ isNotNullFilter :+ largerThanFilter :+ lessThanFilter
|
||||
}
|
||||
|
||||
val fetchedColumns: Array[String] = appendMandatoryColumns(requiredColumns)
|
||||
|
||||
val (requiredAvroSchema, requiredStructSchema) =
|
||||
HoodieSparkUtils.getRequiredSchema(tableAvroSchema, requiredColumns)
|
||||
HoodieSparkUtils.getRequiredSchema(tableAvroSchema, fetchedColumns)
|
||||
|
||||
val hoodieTableState = HoodieMergeOnReadTableState(
|
||||
tableStructSchema,
|
||||
requiredStructSchema,
|
||||
tableAvroSchema.toString,
|
||||
requiredAvroSchema.toString,
|
||||
fileIndex,
|
||||
preCombineField,
|
||||
Option.empty
|
||||
)
|
||||
val fullSchemaParquetReader = HoodieDataSourceHelper.buildHoodieParquetReader(
|
||||
sparkSession = sqlContext.sparkSession,
|
||||
dataSchema = tableStructSchema,
|
||||
partitionSchema = StructType(Nil),
|
||||
requiredSchema = tableStructSchema,
|
||||
filters = pushDownFilter,
|
||||
val partitionSchema = StructType(Nil)
|
||||
val tableSchema = HoodieTableSchema(tableStructSchema, tableAvroSchema.toString)
|
||||
val requiredSchema = HoodieTableSchema(requiredStructSchema, requiredAvroSchema.toString)
|
||||
|
||||
val fullSchemaParquetReader = createBaseFileReader(
|
||||
spark = sqlContext.sparkSession,
|
||||
partitionSchema = partitionSchema,
|
||||
tableSchema = tableSchema,
|
||||
requiredSchema = tableSchema,
|
||||
// This file-reader is used to read base file records, subsequently merging them with the records
|
||||
// stored in delta-log files. As such, we have to read _all_ records from the base file, while avoiding
|
||||
// applying any user-defined filtering _before_ we complete combining them w/ delta-log records (to make sure that
|
||||
// we combine them correctly)
|
||||
//
|
||||
// The only filtering applicable here is the filtering to make sure we're only fetching records that
|
||||
// fall into incremental span of the timeline being queried
|
||||
filters = incrementalSpanRecordsFilters,
|
||||
options = optParams,
|
||||
hadoopConf = sqlContext.sparkSession.sessionState.newHadoopConf()
|
||||
// NOTE: We have to fork the Hadoop Config here as Spark will be modifying it
|
||||
// to configure Parquet reader appropriately
|
||||
hadoopConf = new Configuration(conf)
|
||||
)
|
||||
|
||||
val requiredSchemaParquetReader = HoodieDataSourceHelper.buildHoodieParquetReader(
|
||||
sparkSession = sqlContext.sparkSession,
|
||||
dataSchema = tableStructSchema,
|
||||
partitionSchema = StructType(Nil),
|
||||
requiredSchema = tableStructSchema,
|
||||
filters = pushDownFilter,
|
||||
val requiredSchemaParquetReader = createBaseFileReader(
|
||||
spark = sqlContext.sparkSession,
|
||||
partitionSchema = partitionSchema,
|
||||
tableSchema = tableSchema,
|
||||
requiredSchema = requiredSchema,
|
||||
filters = filters ++ incrementalSpanRecordsFilters,
|
||||
options = optParams,
|
||||
hadoopConf = sqlContext.sparkSession.sessionState.newHadoopConf()
|
||||
// NOTE: We have to fork the Hadoop Config here as Spark will be modifying it
|
||||
// to configure Parquet reader appropriately
|
||||
hadoopConf = new Configuration(conf)
|
||||
)
|
||||
|
||||
val hoodieTableState = HoodieMergeOnReadTableState(fileIndex, HoodieRecord.RECORD_KEY_METADATA_FIELD, preCombineFieldOpt)
|
||||
|
||||
// TODO implement incremental span record filtering w/in RDD to make sure returned iterator is appropriately
|
||||
// filtered, since file-reader might not be capable to perform filtering
|
||||
val rdd = new HoodieMergeOnReadRDD(
|
||||
sqlContext.sparkContext,
|
||||
jobConf,
|
||||
fullSchemaParquetReader,
|
||||
requiredSchemaParquetReader,
|
||||
hoodieTableState
|
||||
hoodieTableState,
|
||||
tableSchema,
|
||||
requiredSchema
|
||||
)
|
||||
rdd.asInstanceOf[RDD[Row]]
|
||||
}
|
||||
@@ -206,10 +213,9 @@ class MergeOnReadIncrementalRelation(sqlContext: SQLContext,
|
||||
}
|
||||
|
||||
val logPath = if (f.getLatestFileSlice.isPresent) {
|
||||
//If log path doesn't exist, we still include an empty path to avoid using
|
||||
// If log path doesn't exist, we still include an empty path to avoid using
|
||||
// the default parquet reader to ensure the push down filter will be applied.
|
||||
Option(f.getLatestFileSlice.get().getLogFiles.iterator().toList
|
||||
.map(logfile => logfile.getPath.toString))
|
||||
Option(f.getLatestFileSlice.get().getLogFiles.iterator().toList)
|
||||
}
|
||||
else {
|
||||
Option.empty
|
||||
@@ -219,4 +225,9 @@ class MergeOnReadIncrementalRelation(sqlContext: SQLContext,
|
||||
latestCommit, metaClient.getBasePath, maxCompactionMemoryInBytes, mergeType)
|
||||
})
|
||||
}
|
||||
|
||||
private def appendMandatoryColumns(requestedColumns: Array[String]): Array[String] = {
|
||||
val missing = mandatoryColumns.filter(col => !requestedColumns.contains(col))
|
||||
requestedColumns ++ missing
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,43 +18,37 @@
|
||||
|
||||
package org.apache.hudi
|
||||
|
||||
import org.apache.hudi.common.model.HoodieLogFile
|
||||
import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver}
|
||||
import org.apache.hadoop.conf.Configuration
|
||||
import org.apache.hadoop.fs.Path
|
||||
import org.apache.hadoop.mapred.JobConf
|
||||
import org.apache.hudi.HoodieBaseRelation.{createBaseFileReader, isMetadataTable}
|
||||
import org.apache.hudi.common.model.{HoodieLogFile, HoodieRecord}
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient
|
||||
import org.apache.hudi.common.table.view.HoodieTableFileSystemView
|
||||
import org.apache.hudi.hadoop.utils.HoodieRealtimeInputFormatUtils
|
||||
import org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils.getMaxCompactionMemoryInBytes
|
||||
|
||||
import org.apache.hadoop.fs.Path
|
||||
import org.apache.hadoop.mapred.JobConf
|
||||
|
||||
import org.apache.spark.internal.Logging
|
||||
import org.apache.hudi.metadata.HoodieMetadataPayload
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.sql.avro.SchemaConverters
|
||||
import org.apache.spark.sql.catalyst.InternalRow
|
||||
import org.apache.spark.sql.catalyst.expressions.Expression
|
||||
import org.apache.spark.sql.execution.datasources.{FileStatusCache, PartitionedFile}
|
||||
import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
|
||||
import org.apache.spark.sql.hudi.HoodieSqlCommonUtils
|
||||
import org.apache.spark.sql.{Row, SQLContext}
|
||||
import org.apache.spark.sql.sources.{BaseRelation, Filter, PrunedFilteredScan}
|
||||
import org.apache.spark.sql.sources.Filter
|
||||
import org.apache.spark.sql.types.StructType
|
||||
import org.apache.spark.sql.{Row, SQLContext}
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
case class HoodieMergeOnReadFileSplit(dataFile: Option[PartitionedFile],
|
||||
logPaths: Option[List[String]],
|
||||
logFiles: Option[List[HoodieLogFile]],
|
||||
latestCommit: String,
|
||||
tablePath: String,
|
||||
maxCompactionMemoryInBytes: Long,
|
||||
mergeType: String)
|
||||
|
||||
case class HoodieMergeOnReadTableState(tableStructSchema: StructType,
|
||||
requiredStructSchema: StructType,
|
||||
tableAvroSchema: String,
|
||||
requiredAvroSchema: String,
|
||||
hoodieRealtimeFileSplits: List[HoodieMergeOnReadFileSplit],
|
||||
preCombineField: Option[String],
|
||||
recordKeyFieldOpt: Option[String])
|
||||
case class HoodieMergeOnReadTableState(hoodieRealtimeFileSplits: List[HoodieMergeOnReadFileSplit],
|
||||
recordKeyField: String,
|
||||
preCombineFieldOpt: Option[String])
|
||||
|
||||
class MergeOnReadSnapshotRelation(sqlContext: SQLContext,
|
||||
optParams: Map[String, String],
|
||||
@@ -63,7 +57,7 @@ class MergeOnReadSnapshotRelation(sqlContext: SQLContext,
|
||||
val metaClient: HoodieTableMetaClient)
|
||||
extends HoodieBaseRelation(sqlContext, metaClient, optParams, userSchema) {
|
||||
|
||||
private val conf = sqlContext.sparkContext.hadoopConfiguration
|
||||
private val conf = new Configuration(sqlContext.sparkContext.hadoopConfiguration)
|
||||
private val jobConf = new JobConf(conf)
|
||||
|
||||
private val mergeType = optParams.getOrElse(
|
||||
@@ -72,19 +66,21 @@ class MergeOnReadSnapshotRelation(sqlContext: SQLContext,
|
||||
|
||||
private val maxCompactionMemoryInBytes = getMaxCompactionMemoryInBytes(jobConf)
|
||||
|
||||
private val preCombineField = {
|
||||
val preCombineFieldFromTableConfig = metaClient.getTableConfig.getPreCombineField
|
||||
if (preCombineFieldFromTableConfig != null) {
|
||||
Some(preCombineFieldFromTableConfig)
|
||||
} else {
|
||||
// get preCombineFiled from the options if this is a old table which have not store
|
||||
// the field to hoodie.properties
|
||||
optParams.get(DataSourceReadOptions.READ_PRE_COMBINE_FIELD.key)
|
||||
}
|
||||
// If meta fields are enabled, always prefer key from the meta field as opposed to user-specified one
|
||||
// NOTE: This is historical behavior which is preserved as is
|
||||
private val recordKeyField = {
|
||||
if (metaClient.getTableConfig.populateMetaFields()) HoodieRecord.RECORD_KEY_METADATA_FIELD
|
||||
else metaClient.getTableConfig.getRecordKeyFieldProp
|
||||
}
|
||||
private var recordKeyFieldOpt = Option.empty[String]
|
||||
if (!metaClient.getTableConfig.populateMetaFields()) {
|
||||
recordKeyFieldOpt = Option(metaClient.getTableConfig.getRecordKeyFieldProp)
|
||||
|
||||
private val preCombineFieldOpt = getPrecombineFieldProperty
|
||||
|
||||
private lazy val mandatoryColumns = {
|
||||
if (isMetadataTable(metaClient)) {
|
||||
Seq(HoodieMetadataPayload.KEY_FIELD_NAME, HoodieMetadataPayload.SCHEMA_FIELD_NAME_TYPE)
|
||||
} else {
|
||||
Seq(recordKeyField) ++ preCombineFieldOpt.map(Seq(_)).getOrElse(Seq())
|
||||
}
|
||||
}
|
||||
|
||||
override def needConversion: Boolean = false
|
||||
@@ -96,45 +92,56 @@ class MergeOnReadSnapshotRelation(sqlContext: SQLContext,
|
||||
log.debug(s" buildScan requiredColumns = ${requiredColumns.mkString(",")}")
|
||||
log.debug(s" buildScan filters = ${filters.mkString(",")}")
|
||||
|
||||
// NOTE: In case list of requested columns doesn't contain the Primary Key one, we
|
||||
// have to add it explicitly so that
|
||||
// - Merging could be performed correctly
|
||||
// - In case 0 columns are to be fetched (for ex, when doing {@code count()} on Spark's [[Dataset]],
|
||||
// Spark still fetches all the rows to execute the query correctly
|
||||
//
|
||||
// It's okay to return columns that have not been requested by the caller, as those nevertheless will be
|
||||
// filtered out upstream
|
||||
val fetchedColumns: Array[String] = appendMandatoryColumns(requiredColumns)
|
||||
|
||||
val (requiredAvroSchema, requiredStructSchema) =
|
||||
HoodieSparkUtils.getRequiredSchema(tableAvroSchema, requiredColumns)
|
||||
HoodieSparkUtils.getRequiredSchema(tableAvroSchema, fetchedColumns)
|
||||
val fileIndex = buildFileIndex(filters)
|
||||
val hoodieTableState = HoodieMergeOnReadTableState(
|
||||
tableStructSchema,
|
||||
requiredStructSchema,
|
||||
tableAvroSchema.toString,
|
||||
requiredAvroSchema.toString,
|
||||
fileIndex,
|
||||
preCombineField,
|
||||
recordKeyFieldOpt
|
||||
)
|
||||
val fullSchemaParquetReader = HoodieDataSourceHelper.buildHoodieParquetReader(
|
||||
sparkSession = sqlContext.sparkSession,
|
||||
dataSchema = tableStructSchema,
|
||||
partitionSchema = StructType(Nil),
|
||||
requiredSchema = tableStructSchema,
|
||||
|
||||
val partitionSchema = StructType(Nil)
|
||||
val tableSchema = HoodieTableSchema(tableStructSchema, tableAvroSchema.toString)
|
||||
val requiredSchema = HoodieTableSchema(requiredStructSchema, requiredAvroSchema.toString)
|
||||
|
||||
val fullSchemaParquetReader = createBaseFileReader(
|
||||
spark = sqlContext.sparkSession,
|
||||
partitionSchema = partitionSchema,
|
||||
tableSchema = tableSchema,
|
||||
requiredSchema = tableSchema,
|
||||
// This file-reader is used to read base file records, subsequently merging them with the records
|
||||
// stored in delta-log files. As such, we have to read _all_ records from the base file, while avoiding
|
||||
// applying any filtering _before_ we complete combining them w/ delta-log records (to make sure that
|
||||
// we combine them correctly)
|
||||
filters = Seq.empty,
|
||||
options = optParams,
|
||||
hadoopConf = sqlContext.sparkSession.sessionState.newHadoopConf()
|
||||
// NOTE: We have to fork the Hadoop Config here as Spark will be modifying it
|
||||
// to configure Parquet reader appropriately
|
||||
hadoopConf = new Configuration(conf)
|
||||
)
|
||||
|
||||
val requiredSchemaParquetReader = HoodieDataSourceHelper.buildHoodieParquetReader(
|
||||
sparkSession = sqlContext.sparkSession,
|
||||
dataSchema = tableStructSchema,
|
||||
partitionSchema = StructType(Nil),
|
||||
requiredSchema = tableStructSchema,
|
||||
val requiredSchemaParquetReader = createBaseFileReader(
|
||||
spark = sqlContext.sparkSession,
|
||||
partitionSchema = partitionSchema,
|
||||
tableSchema = tableSchema,
|
||||
requiredSchema = requiredSchema,
|
||||
filters = filters,
|
||||
options = optParams,
|
||||
hadoopConf = sqlContext.sparkSession.sessionState.newHadoopConf()
|
||||
// NOTE: We have to fork the Hadoop Config here as Spark will be modifying it
|
||||
// to configure Parquet reader appropriately
|
||||
hadoopConf = new Configuration(conf)
|
||||
)
|
||||
|
||||
val rdd = new HoodieMergeOnReadRDD(
|
||||
sqlContext.sparkContext,
|
||||
jobConf,
|
||||
fullSchemaParquetReader,
|
||||
requiredSchemaParquetReader,
|
||||
hoodieTableState
|
||||
)
|
||||
val tableState = HoodieMergeOnReadTableState(fileIndex, recordKeyField, preCombineFieldOpt)
|
||||
|
||||
val rdd = new HoodieMergeOnReadRDD(sqlContext.sparkContext, jobConf, fullSchemaParquetReader,
|
||||
requiredSchemaParquetReader, tableState, tableSchema, requiredSchema)
|
||||
|
||||
rdd.asInstanceOf[RDD[Row]]
|
||||
}
|
||||
|
||||
@@ -214,8 +221,7 @@ class MergeOnReadSnapshotRelation(sqlContext: SQLContext,
|
||||
Option.empty
|
||||
}
|
||||
|
||||
val logPaths = fileSlice.getLogFiles.sorted(HoodieLogFile.getLogFileComparator).iterator().asScala
|
||||
.map(logFile => MergeOnReadSnapshotRelation.getFilePath(logFile.getPath)).toList
|
||||
val logPaths = fileSlice.getLogFiles.sorted(HoodieLogFile.getLogFileComparator).iterator().asScala.toList
|
||||
val logPathsOptional = if (logPaths.isEmpty) Option.empty else Option(logPaths)
|
||||
|
||||
HoodieMergeOnReadFileSplit(partitionedFile, logPathsOptional, queryInstant, metaClient.getBasePath,
|
||||
@@ -225,6 +231,11 @@ class MergeOnReadSnapshotRelation(sqlContext: SQLContext,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private def appendMandatoryColumns(requestedColumns: Array[String]): Array[String] = {
|
||||
val missing = mandatoryColumns.filter(col => !requestedColumns.contains(col))
|
||||
requestedColumns ++ missing
|
||||
}
|
||||
}
|
||||
|
||||
object MergeOnReadSnapshotRelation {
|
||||
|
||||
@@ -1,48 +0,0 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.spark.sql.avro
|
||||
|
||||
import org.apache.avro.Schema
|
||||
|
||||
import org.apache.hudi.HoodieSparkUtils
|
||||
|
||||
import org.apache.spark.sql.types.DataType
|
||||
|
||||
/**
|
||||
* This is to be compatible with the type returned by Spark 3.1
|
||||
* and other spark versions for AvroDeserializer
|
||||
*/
|
||||
case class HoodieAvroDeserializer(rootAvroType: Schema, rootCatalystType: DataType) {
|
||||
|
||||
private val avroDeserializer = if (HoodieSparkUtils.isSpark3_2) {
|
||||
// SPARK-34404: As of Spark3.2, there is no AvroDeserializer's constructor with Schema and DataType arguments.
|
||||
// So use the reflection to get AvroDeserializer instance.
|
||||
val constructor = classOf[AvroDeserializer].getConstructor(classOf[Schema], classOf[DataType], classOf[String])
|
||||
constructor.newInstance(rootAvroType, rootCatalystType, "EXCEPTION")
|
||||
} else {
|
||||
val constructor = classOf[AvroDeserializer].getConstructor(classOf[Schema], classOf[DataType])
|
||||
constructor.newInstance(rootAvroType, rootCatalystType)
|
||||
}
|
||||
|
||||
def deserializeData(data: Any): Any = {
|
||||
avroDeserializer.deserialize(data) match {
|
||||
case Some(r) => r // As of spark 3.1, this will return data wrapped with Option, so we fetch the data.
|
||||
case o => o // for other spark version, return the data directly.
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -20,9 +20,10 @@ package org.apache.spark.sql.avro
|
||||
import org.apache.avro.Schema
|
||||
import org.apache.spark.sql.types.DataType
|
||||
|
||||
/**
|
||||
* As AvroSerializer cannot be access out of the spark.sql.avro package since spark 3.1, we define
|
||||
* this class to be accessed by other class.
|
||||
*/
|
||||
case class HoodieAvroSerializer(rootCatalystType: DataType, rootAvroType: Schema, nullable: Boolean)
|
||||
extends AvroSerializer(rootCatalystType, rootAvroType, nullable)
|
||||
class HoodieAvroSerializer(rootCatalystType: DataType, rootAvroType: Schema, nullable: Boolean)
|
||||
extends HoodieAvroSerializerTrait {
|
||||
|
||||
val avroSerializer = new AvroSerializer(rootCatalystType, rootAvroType, nullable)
|
||||
|
||||
override def serialize(catalystData: Any): Any = avroSerializer.serialize(catalystData)
|
||||
}
|
||||
|
||||
@@ -25,17 +25,15 @@ import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient}
|
||||
import org.apache.hudi.common.util.ValidationUtils
|
||||
import org.apache.hudi.keygen.ComplexKeyGenerator
|
||||
import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory
|
||||
|
||||
import org.apache.spark.internal.Logging
|
||||
import org.apache.spark.sql.{AnalysisException, SparkSession}
|
||||
import org.apache.spark.sql.avro.SchemaConverters
|
||||
import org.apache.spark.sql.catalyst.TableIdentifier
|
||||
import org.apache.spark.sql.hudi.HoodieOptionConfig
|
||||
import org.apache.spark.sql.hudi.HoodieSqlCommonUtils._
|
||||
import org.apache.spark.sql.types.{StructField, StructType}
|
||||
import org.apache.spark.sql.{AnalysisException, SparkSession}
|
||||
|
||||
import java.util.{Locale, Properties}
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.collection.mutable
|
||||
|
||||
|
||||
@@ -42,17 +42,21 @@ trait ProvidesHoodieConfig extends Logging {
|
||||
val tableConfig = hoodieCatalogTable.tableConfig
|
||||
val tableId = hoodieCatalogTable.table.identifier
|
||||
|
||||
// NOTE: Here we fallback to "" to make sure that null value is not overridden with
|
||||
// default value ("ts")
|
||||
// TODO(HUDI-3456) clean up
|
||||
val preCombineField = Option(tableConfig.getPreCombineField).getOrElse("")
|
||||
|
||||
require(hoodieCatalogTable.primaryKeys.nonEmpty,
|
||||
s"There are no primary key in table ${hoodieCatalogTable.table.identifier}, cannot execute update operator")
|
||||
val enableHive = isEnableHive(sparkSession)
|
||||
|
||||
withSparkConf(sparkSession, catalogProperties) {
|
||||
Map(
|
||||
Map.apply(
|
||||
"path" -> hoodieCatalogTable.tableLocation,
|
||||
RECORDKEY_FIELD.key -> hoodieCatalogTable.primaryKeys.mkString(","),
|
||||
PRECOMBINE_FIELD.key -> preCombineField,
|
||||
TBL_NAME.key -> hoodieCatalogTable.tableName,
|
||||
PRECOMBINE_FIELD.key -> preCombineField,
|
||||
HIVE_STYLE_PARTITIONING.key -> tableConfig.getHiveStylePartitioningEnable,
|
||||
URL_ENCODE_PARTITIONING.key -> tableConfig.getUrlEncodePartitioning,
|
||||
KEYGENERATOR_CLASS_NAME.key -> classOf[SqlKeyGenerator].getCanonicalName,
|
||||
@@ -70,6 +74,7 @@ trait ProvidesHoodieConfig extends Logging {
|
||||
HoodieWriteConfig.UPSERT_PARALLELISM_VALUE.key -> "200",
|
||||
SqlKeyGenerator.PARTITION_SCHEMA -> hoodieCatalogTable.partitionSchema.toDDL
|
||||
)
|
||||
.filter { case(_, v) => v != null }
|
||||
}
|
||||
}
|
||||
|
||||
@@ -98,8 +103,12 @@ trait ProvidesHoodieConfig extends Logging {
|
||||
val options = hoodieCatalogTable.catalogProperties ++ tableConfig.getProps.asScala.toMap ++ extraOptions
|
||||
val parameters = withSparkConf(sparkSession, options)()
|
||||
|
||||
val preCombineColumn = hoodieCatalogTable.preCombineKey.getOrElse("")
|
||||
val partitionFields = hoodieCatalogTable.partitionFields.mkString(",")
|
||||
val partitionFieldsStr = hoodieCatalogTable.partitionFields.mkString(",")
|
||||
|
||||
// NOTE: Here we fallback to "" to make sure that null value is not overridden with
|
||||
// default value ("ts")
|
||||
// TODO(HUDI-3456) clean up
|
||||
val preCombineField = hoodieCatalogTable.preCombineKey.getOrElse("")
|
||||
|
||||
val hiveStylePartitioningEnable = Option(tableConfig.getHiveStylePartitioningEnable).getOrElse("true")
|
||||
val urlEncodePartitioning = Option(tableConfig.getUrlEncodePartitioning).getOrElse("false")
|
||||
@@ -115,7 +124,7 @@ trait ProvidesHoodieConfig extends Logging {
|
||||
DataSourceWriteOptions.SQL_INSERT_MODE.defaultValue()))
|
||||
val isNonStrictMode = insertMode == InsertMode.NON_STRICT
|
||||
val isPartitionedTable = hoodieCatalogTable.partitionFields.nonEmpty
|
||||
val hasPrecombineColumn = preCombineColumn.nonEmpty
|
||||
val hasPrecombineColumn = hoodieCatalogTable.preCombineKey.nonEmpty
|
||||
val operation =
|
||||
(enableBulkInsert, isOverwrite, dropDuplicate, isNonStrictMode, isPartitionedTable) match {
|
||||
case (true, _, _, false, _) =>
|
||||
@@ -147,37 +156,41 @@ trait ProvidesHoodieConfig extends Logging {
|
||||
} else {
|
||||
classOf[OverwriteWithLatestAvroPayload].getCanonicalName
|
||||
}
|
||||
logInfo(s"insert statement use write operation type: $operation, payloadClass: $payloadClassName")
|
||||
|
||||
logInfo(s"Insert statement use write operation type: $operation, payloadClass: $payloadClassName")
|
||||
|
||||
val enableHive = isEnableHive(sparkSession)
|
||||
|
||||
withSparkConf(sparkSession, options) {
|
||||
Map(
|
||||
"path" -> path,
|
||||
TABLE_TYPE.key -> tableType,
|
||||
TBL_NAME.key -> hoodieCatalogTable.tableName,
|
||||
PRECOMBINE_FIELD.key -> preCombineColumn,
|
||||
OPERATION.key -> operation,
|
||||
HIVE_STYLE_PARTITIONING.key -> hiveStylePartitioningEnable,
|
||||
URL_ENCODE_PARTITIONING.key -> urlEncodePartitioning,
|
||||
KEYGENERATOR_CLASS_NAME.key -> classOf[SqlKeyGenerator].getCanonicalName,
|
||||
SqlKeyGenerator.ORIGIN_KEYGEN_CLASS_NAME -> keyGeneratorClassName,
|
||||
RECORDKEY_FIELD.key -> hoodieCatalogTable.primaryKeys.mkString(","),
|
||||
PARTITIONPATH_FIELD.key -> partitionFields,
|
||||
PRECOMBINE_FIELD.key -> preCombineField,
|
||||
PARTITIONPATH_FIELD.key -> partitionFieldsStr,
|
||||
PAYLOAD_CLASS_NAME.key -> payloadClassName,
|
||||
ENABLE_ROW_WRITER.key -> enableBulkInsert.toString,
|
||||
HoodieWriteConfig.COMBINE_BEFORE_INSERT.key -> String.valueOf(hasPrecombineColumn),
|
||||
HIVE_PARTITION_FIELDS.key -> partitionFieldsStr,
|
||||
META_SYNC_ENABLED.key -> enableHive.toString,
|
||||
HIVE_SYNC_MODE.key -> HiveSyncMode.HMS.name(),
|
||||
HIVE_USE_JDBC.key -> "false",
|
||||
HIVE_DATABASE.key -> hoodieCatalogTable.table.identifier.database.getOrElse("default"),
|
||||
HIVE_TABLE.key -> hoodieCatalogTable.table.identifier.table,
|
||||
HIVE_SUPPORT_TIMESTAMP_TYPE.key -> "true",
|
||||
HIVE_PARTITION_FIELDS.key -> partitionFields,
|
||||
HIVE_PARTITION_EXTRACTOR_CLASS.key -> classOf[MultiPartKeysValueExtractor].getCanonicalName,
|
||||
HoodieWriteConfig.INSERT_PARALLELISM_VALUE.key -> "200",
|
||||
HoodieWriteConfig.UPSERT_PARALLELISM_VALUE.key -> "200",
|
||||
SqlKeyGenerator.PARTITION_SCHEMA -> hoodieCatalogTable.partitionSchema.toDDL
|
||||
)
|
||||
.filter { case (_, v) => v != null }
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -17,7 +17,6 @@
|
||||
|
||||
package org.apache.spark.sql.hudi.command.payload
|
||||
|
||||
import java.util.UUID
|
||||
import org.apache.avro.generic.{GenericRecord, IndexedRecord}
|
||||
import org.apache.hudi.sql.IExpressionEvaluator
|
||||
import org.apache.spark.executor.InputMetrics
|
||||
@@ -37,6 +36,8 @@ import org.apache.spark.{TaskContext, TaskKilledException}
|
||||
import org.codehaus.commons.compiler.CompileException
|
||||
import org.codehaus.janino.{ClassBodyEvaluator, InternalCompilerException}
|
||||
|
||||
import java.util.UUID
|
||||
|
||||
/**
|
||||
* Do CodeGen for expression based on IndexedRecord.
|
||||
* The mainly difference with the spark's CodeGen for expression is that
|
||||
|
||||
@@ -17,14 +17,9 @@
|
||||
|
||||
package org.apache.spark.sql.hudi.command.payload
|
||||
|
||||
import java.util.{Base64, Properties}
|
||||
import java.util.concurrent.Callable
|
||||
|
||||
import com.google.common.cache.CacheBuilder
|
||||
|
||||
import org.apache.avro.Schema
|
||||
import org.apache.avro.generic.{GenericData, GenericRecord, IndexedRecord}
|
||||
|
||||
import org.apache.hudi.AvroConversionUtils
|
||||
import org.apache.hudi.DataSourceWriteOptions._
|
||||
import org.apache.hudi.avro.HoodieAvroUtils
|
||||
@@ -34,13 +29,14 @@ import org.apache.hudi.common.util.{ValidationUtils, Option => HOption}
|
||||
import org.apache.hudi.config.HoodieWriteConfig
|
||||
import org.apache.hudi.io.HoodieWriteHandle
|
||||
import org.apache.hudi.sql.IExpressionEvaluator
|
||||
|
||||
import org.apache.spark.sql.avro.{AvroSerializer, HoodieAvroSerializer, SchemaConverters}
|
||||
import org.apache.spark.sql.avro.{AvroSerializer, SchemaConverters}
|
||||
import org.apache.spark.sql.catalyst.expressions.Expression
|
||||
import org.apache.spark.sql.hudi.SerDeUtils
|
||||
import org.apache.spark.sql.hudi.command.payload.ExpressionPayload.getEvaluator
|
||||
import org.apache.spark.sql.types.{StructField, StructType}
|
||||
|
||||
import java.util.concurrent.Callable
|
||||
import java.util.{Base64, Properties}
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.collection.mutable.ArrayBuffer
|
||||
|
||||
@@ -315,7 +311,7 @@ object ExpressionPayload {
|
||||
val conditionEvaluator = ExpressionCodeGen.doCodeGen(Seq(condition), conditionSerializer)
|
||||
|
||||
val assignSqlType = AvroConversionUtils.convertAvroSchemaToStructType(writeSchema)
|
||||
val assignSerializer = new HoodieAvroSerializer(assignSqlType, writeSchema, false)
|
||||
val assignSerializer = new AvroSerializer(assignSqlType, writeSchema, false)
|
||||
val assignmentEvaluator = ExpressionCodeGen.doCodeGen(assignments, assignSerializer)
|
||||
conditionEvaluator -> assignmentEvaluator
|
||||
}
|
||||
|
||||
@@ -17,22 +17,19 @@
|
||||
|
||||
package org.apache.spark.sql.hudi.command.payload
|
||||
|
||||
import org.apache.avro.generic.IndexedRecord
|
||||
import org.apache.avro.Schema
|
||||
|
||||
import org.apache.hudi.AvroConversionUtils
|
||||
|
||||
import org.apache.spark.sql.avro.HoodieAvroDeserializer
|
||||
import org.apache.avro.generic.IndexedRecord
|
||||
import org.apache.hudi.{AvroConversionUtils, SparkAdapterSupport}
|
||||
import org.apache.spark.sql.catalyst.InternalRow
|
||||
|
||||
/**
|
||||
* A sql typed record which will convert the avro field to sql typed value.
|
||||
*/
|
||||
class SqlTypedRecord(val record: IndexedRecord) extends IndexedRecord {
|
||||
class SqlTypedRecord(val record: IndexedRecord) extends IndexedRecord with SparkAdapterSupport {
|
||||
|
||||
private lazy val sqlType = AvroConversionUtils.convertAvroSchemaToStructType(getSchema)
|
||||
private lazy val avroDeserializer = HoodieAvroDeserializer(record.getSchema, sqlType)
|
||||
private lazy val sqlRow = avroDeserializer.deserializeData(record).asInstanceOf[InternalRow]
|
||||
private lazy val avroDeserializer = sparkAdapter.createAvroDeserializer(record.getSchema, sqlType)
|
||||
private lazy val sqlRow = avroDeserializer.deserialize(record).get.asInstanceOf[InternalRow]
|
||||
|
||||
override def put(i: Int, v: Any): Unit = {
|
||||
record.put(i, v)
|
||||
|
||||
Reference in New Issue
Block a user