[HUDI-3979] Optimize out mandatory columns when no merging is performed (#5430)
For MOR, when no merging is performed there is no point in reading either primary-key or pre-combine-key values (unless query is referencing these). Avoiding reading these allows to potentially save substantial resources wasted for reading it out.
This commit is contained in:
@@ -60,9 +60,7 @@ class BaseFileOnlyRelation(sqlContext: SQLContext,
|
|||||||
override protected val shouldExtractPartitionValuesFromPartitionPath: Boolean =
|
override protected val shouldExtractPartitionValuesFromPartitionPath: Boolean =
|
||||||
internalSchemaOpt.isEmpty
|
internalSchemaOpt.isEmpty
|
||||||
|
|
||||||
override lazy val mandatoryFields: Seq[String] =
|
override lazy val mandatoryFields: Seq[String] = Seq.empty
|
||||||
// TODO reconcile, record's key shouldn't be mandatory for base-file only relation
|
|
||||||
Seq(recordKeyField)
|
|
||||||
|
|
||||||
override def imbueConfigs(sqlContext: SQLContext): Unit = {
|
override def imbueConfigs(sqlContext: SQLContext): Unit = {
|
||||||
super.imbueConfigs(sqlContext)
|
super.imbueConfigs(sqlContext)
|
||||||
@@ -73,6 +71,7 @@ class BaseFileOnlyRelation(sqlContext: SQLContext,
|
|||||||
partitionSchema: StructType,
|
partitionSchema: StructType,
|
||||||
dataSchema: HoodieTableSchema,
|
dataSchema: HoodieTableSchema,
|
||||||
requiredSchema: HoodieTableSchema,
|
requiredSchema: HoodieTableSchema,
|
||||||
|
requestedColumns: Array[String],
|
||||||
filters: Array[Filter]): HoodieUnsafeRDD = {
|
filters: Array[Filter]): HoodieUnsafeRDD = {
|
||||||
|
|
||||||
val baseFileReader = createBaseFileReader(
|
val baseFileReader = createBaseFileReader(
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ import org.apache.hadoop.conf.Configuration
|
|||||||
import org.apache.hadoop.fs.{FileStatus, Path}
|
import org.apache.hadoop.fs.{FileStatus, Path}
|
||||||
import org.apache.hadoop.hbase.io.hfile.CacheConfig
|
import org.apache.hadoop.hbase.io.hfile.CacheConfig
|
||||||
import org.apache.hadoop.mapred.JobConf
|
import org.apache.hadoop.mapred.JobConf
|
||||||
import org.apache.hudi.HoodieBaseRelation.{convertToAvroSchema, createHFileReader, generateUnsafeProjection, getPartitionPath, projectSchema}
|
import org.apache.hudi.HoodieBaseRelation.{BaseFileReader, convertToAvroSchema, createHFileReader, generateUnsafeProjection, getPartitionPath, projectSchema}
|
||||||
import org.apache.hudi.HoodieConversionUtils.toScalaOption
|
import org.apache.hudi.HoodieConversionUtils.toScalaOption
|
||||||
import org.apache.hudi.avro.HoodieAvroUtils
|
import org.apache.hudi.avro.HoodieAvroUtils
|
||||||
import org.apache.hudi.client.utils.SparkInternalSchemaConverter
|
import org.apache.hudi.client.utils.SparkInternalSchemaConverter
|
||||||
@@ -204,6 +204,10 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext,
|
|||||||
shouldOmitPartitionColumns || shouldExtractPartitionValueFromPath
|
shouldOmitPartitionColumns || shouldExtractPartitionValueFromPath
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* NOTE: This fields are accessed by [[NestedSchemaPruning]] component which is only enabled for
|
||||||
|
* Spark >= 3.1
|
||||||
|
*/
|
||||||
lazy val (fileFormat: FileFormat, fileFormatClassName: String) =
|
lazy val (fileFormat: FileFormat, fileFormatClassName: String) =
|
||||||
metaClient.getTableConfig.getBaseFileFormat match {
|
metaClient.getTableConfig.getBaseFileFormat match {
|
||||||
case HoodieFileFormat.ORC => (new OrcFileFormat, "orc")
|
case HoodieFileFormat.ORC => (new OrcFileFormat, "orc")
|
||||||
@@ -258,12 +262,11 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext,
|
|||||||
*
|
*
|
||||||
* Check scala-doc for [[shouldExtractPartitionValuesFromPartitionPath]] for more details
|
* Check scala-doc for [[shouldExtractPartitionValuesFromPartitionPath]] for more details
|
||||||
*/
|
*/
|
||||||
def dataSchema: StructType =
|
def dataSchema: StructType = if (shouldExtractPartitionValuesFromPartitionPath) {
|
||||||
if (shouldExtractPartitionValuesFromPartitionPath) {
|
prunePartitionColumns(tableStructSchema)
|
||||||
prunePartitionColumns(tableStructSchema)
|
} else {
|
||||||
} else {
|
tableStructSchema
|
||||||
tableStructSchema
|
}
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Determines whether relation's schema could be pruned by Spark's Optimizer
|
* Determines whether relation's schema could be pruned by Spark's Optimizer
|
||||||
@@ -346,7 +349,7 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext,
|
|||||||
if (fileSplits.isEmpty) {
|
if (fileSplits.isEmpty) {
|
||||||
sparkSession.sparkContext.emptyRDD
|
sparkSession.sparkContext.emptyRDD
|
||||||
} else {
|
} else {
|
||||||
val rdd = composeRDD(fileSplits, partitionSchema, dataSchema, requiredDataSchema, filters)
|
val rdd = composeRDD(fileSplits, partitionSchema, dataSchema, requiredDataSchema, targetColumns, filters)
|
||||||
|
|
||||||
// NOTE: In case when partition columns have been pruned from the required schema, we have to project
|
// NOTE: In case when partition columns have been pruned from the required schema, we have to project
|
||||||
// the rows from the pruned schema back into the one expected by the caller
|
// the rows from the pruned schema back into the one expected by the caller
|
||||||
@@ -369,17 +372,19 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext,
|
|||||||
/**
|
/**
|
||||||
* Composes RDD provided file splits to read from, table and partition schemas, data filters to be applied
|
* Composes RDD provided file splits to read from, table and partition schemas, data filters to be applied
|
||||||
*
|
*
|
||||||
* @param fileSplits file splits to be handled by the RDD
|
* @param fileSplits file splits to be handled by the RDD
|
||||||
* @param partitionSchema target table's partition schema
|
* @param partitionSchema target table's partition schema
|
||||||
* @param dataSchema target table's data files' schema
|
* @param dataSchema target table's data files' schema
|
||||||
* @param requiredSchema projected schema required by the reader
|
* @param requiredSchema projected schema required by the reader
|
||||||
* @param filters data filters to be applied
|
* @param requestedColumns columns requested by the query
|
||||||
|
* @param filters data filters to be applied
|
||||||
* @return instance of RDD (implementing [[HoodieUnsafeRDD]])
|
* @return instance of RDD (implementing [[HoodieUnsafeRDD]])
|
||||||
*/
|
*/
|
||||||
protected def composeRDD(fileSplits: Seq[FileSplit],
|
protected def composeRDD(fileSplits: Seq[FileSplit],
|
||||||
partitionSchema: StructType,
|
partitionSchema: StructType,
|
||||||
dataSchema: HoodieTableSchema,
|
dataSchema: HoodieTableSchema,
|
||||||
requiredSchema: HoodieTableSchema,
|
requiredSchema: HoodieTableSchema,
|
||||||
|
requestedColumns: Array[String],
|
||||||
filters: Array[Filter]): HoodieUnsafeRDD
|
filters: Array[Filter]): HoodieUnsafeRDD
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -551,37 +556,48 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext,
|
|||||||
requiredSchema: HoodieTableSchema,
|
requiredSchema: HoodieTableSchema,
|
||||||
filters: Seq[Filter],
|
filters: Seq[Filter],
|
||||||
options: Map[String, String],
|
options: Map[String, String],
|
||||||
hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = {
|
hadoopConf: Configuration): BaseFileReader = {
|
||||||
val hfileReader = createHFileReader(
|
val tableBaseFileFormat = tableConfig.getBaseFileFormat
|
||||||
spark = spark,
|
|
||||||
dataSchema = dataSchema,
|
|
||||||
requiredSchema = requiredSchema,
|
|
||||||
filters = filters,
|
|
||||||
options = options,
|
|
||||||
hadoopConf = hadoopConf
|
|
||||||
)
|
|
||||||
|
|
||||||
val parquetReader = HoodieDataSourceHelper.buildHoodieParquetReader(
|
// NOTE: PLEASE READ CAREFULLY
|
||||||
sparkSession = spark,
|
// Lambda returned from this method is going to be invoked on the executor, and therefore
|
||||||
dataSchema = dataSchema.structTypeSchema,
|
// we have to eagerly initialize all of the readers even though only one specific to the type
|
||||||
partitionSchema = partitionSchema,
|
// of the file being read will be used. This is required to avoid serialization of the whole
|
||||||
requiredSchema = requiredSchema.structTypeSchema,
|
// relation (containing file-index for ex) and passing it to the executor
|
||||||
filters = filters,
|
val reader = tableBaseFileFormat match {
|
||||||
options = options,
|
case HoodieFileFormat.PARQUET =>
|
||||||
hadoopConf = hadoopConf,
|
HoodieDataSourceHelper.buildHoodieParquetReader(
|
||||||
// We're delegating to Spark to append partition values to every row only in cases
|
sparkSession = spark,
|
||||||
// when these corresponding partition-values are not persisted w/in the data file itself
|
dataSchema = dataSchema.structTypeSchema,
|
||||||
appendPartitionValues = shouldExtractPartitionValuesFromPartitionPath
|
partitionSchema = partitionSchema,
|
||||||
)
|
requiredSchema = requiredSchema.structTypeSchema,
|
||||||
|
filters = filters,
|
||||||
|
options = options,
|
||||||
|
hadoopConf = hadoopConf,
|
||||||
|
// We're delegating to Spark to append partition values to every row only in cases
|
||||||
|
// when these corresponding partition-values are not persisted w/in the data file itself
|
||||||
|
appendPartitionValues = shouldExtractPartitionValuesFromPartitionPath
|
||||||
|
)
|
||||||
|
|
||||||
|
case HoodieFileFormat.HFILE =>
|
||||||
|
createHFileReader(
|
||||||
|
spark = spark,
|
||||||
|
dataSchema = dataSchema,
|
||||||
|
requiredSchema = requiredSchema,
|
||||||
|
filters = filters,
|
||||||
|
options = options,
|
||||||
|
hadoopConf = hadoopConf
|
||||||
|
)
|
||||||
|
|
||||||
|
case _ => throw new UnsupportedOperationException(s"Base file format is not currently supported ($tableBaseFileFormat)")
|
||||||
|
}
|
||||||
|
|
||||||
partitionedFile => {
|
partitionedFile => {
|
||||||
val extension = FSUtils.getFileExtension(partitionedFile.filePath)
|
val extension = FSUtils.getFileExtension(partitionedFile.filePath)
|
||||||
if (HoodieFileFormat.PARQUET.getFileExtension.equals(extension)) {
|
if (tableBaseFileFormat.getFileExtension.equals(extension)) {
|
||||||
parquetReader.apply(partitionedFile)
|
reader.apply(partitionedFile)
|
||||||
} else if (HoodieFileFormat.HFILE.getFileExtension.equals(extension)) {
|
|
||||||
hfileReader.apply(partitionedFile)
|
|
||||||
} else {
|
} else {
|
||||||
throw new UnsupportedOperationException(s"Base file format not supported by Spark DataSource ($partitionedFile)")
|
throw new UnsupportedOperationException(s"Invalid base-file format ($extension), expected ($tableBaseFileFormat)")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -629,6 +645,8 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext,
|
|||||||
|
|
||||||
object HoodieBaseRelation extends SparkAdapterSupport {
|
object HoodieBaseRelation extends SparkAdapterSupport {
|
||||||
|
|
||||||
|
type BaseFileReader = PartitionedFile => Iterator[InternalRow]
|
||||||
|
|
||||||
private def generateUnsafeProjection(from: StructType, to: StructType) =
|
private def generateUnsafeProjection(from: StructType, to: StructType) =
|
||||||
sparkAdapter.getCatalystExpressionUtils().generateUnsafeProjection(from, to)
|
sparkAdapter.getCatalystExpressionUtils().generateUnsafeProjection(from, to)
|
||||||
|
|
||||||
@@ -676,7 +694,7 @@ object HoodieBaseRelation extends SparkAdapterSupport {
|
|||||||
requiredSchema: HoodieTableSchema,
|
requiredSchema: HoodieTableSchema,
|
||||||
filters: Seq[Filter],
|
filters: Seq[Filter],
|
||||||
options: Map[String, String],
|
options: Map[String, String],
|
||||||
hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = {
|
hadoopConf: Configuration): BaseFileReader = {
|
||||||
val hadoopConfBroadcast =
|
val hadoopConfBroadcast =
|
||||||
spark.sparkContext.broadcast(new SerializableConfiguration(hadoopConf))
|
spark.sparkContext.broadcast(new SerializableConfiguration(hadoopConf))
|
||||||
|
|
||||||
|
|||||||
@@ -23,6 +23,7 @@ import org.apache.avro.generic.{GenericRecord, GenericRecordBuilder, IndexedReco
|
|||||||
import org.apache.hadoop.conf.Configuration
|
import org.apache.hadoop.conf.Configuration
|
||||||
import org.apache.hadoop.fs.Path
|
import org.apache.hadoop.fs.Path
|
||||||
import org.apache.hadoop.mapred.JobConf
|
import org.apache.hadoop.mapred.JobConf
|
||||||
|
import org.apache.hudi.HoodieBaseRelation.BaseFileReader
|
||||||
import org.apache.hudi.HoodieConversionUtils.{toJavaOption, toScalaOption}
|
import org.apache.hudi.HoodieConversionUtils.{toJavaOption, toScalaOption}
|
||||||
import org.apache.hudi.HoodieMergeOnReadRDD.{AvroDeserializerSupport, collectFieldOrdinals, getPartitionPath, projectAvro, projectAvroUnsafe, projectRowUnsafe, resolveAvroSchemaNullability}
|
import org.apache.hudi.HoodieMergeOnReadRDD.{AvroDeserializerSupport, collectFieldOrdinals, getPartitionPath, projectAvro, projectAvroUnsafe, projectRowUnsafe, resolveAvroSchemaNullability}
|
||||||
import org.apache.hudi.common.config.{HoodieCommonConfig, HoodieMetadataConfig}
|
import org.apache.hudi.common.config.{HoodieCommonConfig, HoodieMetadataConfig}
|
||||||
@@ -55,11 +56,14 @@ import scala.util.Try
|
|||||||
|
|
||||||
case class HoodieMergeOnReadPartition(index: Int, split: HoodieMergeOnReadFileSplit) extends Partition
|
case class HoodieMergeOnReadPartition(index: Int, split: HoodieMergeOnReadFileSplit) extends Partition
|
||||||
|
|
||||||
|
case class HoodieMergeOnReadBaseFileReaders(fullSchemaFileReader: BaseFileReader,
|
||||||
|
requiredSchemaFileReaderForMerging: BaseFileReader,
|
||||||
|
requiredSchemaFileReaderForNoMerging: BaseFileReader)
|
||||||
|
|
||||||
class HoodieMergeOnReadRDD(@transient sc: SparkContext,
|
class HoodieMergeOnReadRDD(@transient sc: SparkContext,
|
||||||
@transient config: Configuration,
|
@transient config: Configuration,
|
||||||
fullSchemaFileReader: PartitionedFile => Iterator[InternalRow],
|
fileReaders: HoodieMergeOnReadBaseFileReaders,
|
||||||
requiredSchemaFileReader: PartitionedFile => Iterator[InternalRow],
|
dataSchema: HoodieTableSchema,
|
||||||
tableSchema: HoodieTableSchema,
|
|
||||||
requiredSchema: HoodieTableSchema,
|
requiredSchema: HoodieTableSchema,
|
||||||
tableState: HoodieTableState,
|
tableState: HoodieTableState,
|
||||||
mergeType: String,
|
mergeType: String,
|
||||||
@@ -86,13 +90,13 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext,
|
|||||||
val mergeOnReadPartition = split.asInstanceOf[HoodieMergeOnReadPartition]
|
val mergeOnReadPartition = split.asInstanceOf[HoodieMergeOnReadPartition]
|
||||||
val iter = mergeOnReadPartition.split match {
|
val iter = mergeOnReadPartition.split match {
|
||||||
case dataFileOnlySplit if dataFileOnlySplit.logFiles.isEmpty =>
|
case dataFileOnlySplit if dataFileOnlySplit.logFiles.isEmpty =>
|
||||||
requiredSchemaFileReader.apply(dataFileOnlySplit.dataFile.get)
|
fileReaders.requiredSchemaFileReaderForNoMerging.apply(dataFileOnlySplit.dataFile.get)
|
||||||
|
|
||||||
case logFileOnlySplit if logFileOnlySplit.dataFile.isEmpty =>
|
case logFileOnlySplit if logFileOnlySplit.dataFile.isEmpty =>
|
||||||
new LogFileIterator(logFileOnlySplit, getConfig)
|
new LogFileIterator(logFileOnlySplit, getConfig)
|
||||||
|
|
||||||
case split if mergeType.equals(DataSourceReadOptions.REALTIME_SKIP_MERGE_OPT_VAL) =>
|
case split if mergeType.equals(DataSourceReadOptions.REALTIME_SKIP_MERGE_OPT_VAL) =>
|
||||||
val baseFileIterator = requiredSchemaFileReader.apply(split.dataFile.get)
|
val baseFileIterator = fileReaders.requiredSchemaFileReaderForNoMerging.apply(split.dataFile.get)
|
||||||
new SkipMergeIterator(split, baseFileIterator, getConfig)
|
new SkipMergeIterator(split, baseFileIterator, getConfig)
|
||||||
|
|
||||||
case split if mergeType.equals(DataSourceReadOptions.REALTIME_PAYLOAD_COMBINE_OPT_VAL) =>
|
case split if mergeType.equals(DataSourceReadOptions.REALTIME_PAYLOAD_COMBINE_OPT_VAL) =>
|
||||||
@@ -126,9 +130,9 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext,
|
|||||||
// then we can avoid reading and parsing the records w/ _full_ schema, and instead only
|
// then we can avoid reading and parsing the records w/ _full_ schema, and instead only
|
||||||
// rely on projected one, nevertheless being able to perform merging correctly
|
// rely on projected one, nevertheless being able to perform merging correctly
|
||||||
if (!whitelistedPayloadClasses.contains(tableState.recordPayloadClassName))
|
if (!whitelistedPayloadClasses.contains(tableState.recordPayloadClassName))
|
||||||
(fullSchemaFileReader(split.dataFile.get), tableSchema)
|
(fileReaders.fullSchemaFileReader(split.dataFile.get), dataSchema)
|
||||||
else
|
else
|
||||||
(requiredSchemaFileReader(split.dataFile.get), requiredSchema)
|
(fileReaders.requiredSchemaFileReaderForMerging(split.dataFile.get), requiredSchema)
|
||||||
}
|
}
|
||||||
|
|
||||||
override protected def getPartitions: Array[Partition] =
|
override protected def getPartitions: Array[Partition] =
|
||||||
@@ -152,7 +156,7 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext,
|
|||||||
protected override val requiredAvroSchema: Schema = new Schema.Parser().parse(requiredSchema.avroSchemaStr)
|
protected override val requiredAvroSchema: Schema = new Schema.Parser().parse(requiredSchema.avroSchemaStr)
|
||||||
protected override val requiredStructTypeSchema: StructType = requiredSchema.structTypeSchema
|
protected override val requiredStructTypeSchema: StructType = requiredSchema.structTypeSchema
|
||||||
|
|
||||||
protected val logFileReaderAvroSchema: Schema = new Schema.Parser().parse(tableSchema.avroSchemaStr)
|
protected val logFileReaderAvroSchema: Schema = new Schema.Parser().parse(dataSchema.avroSchemaStr)
|
||||||
|
|
||||||
protected val recordBuilder: GenericRecordBuilder = new GenericRecordBuilder(requiredAvroSchema)
|
protected val recordBuilder: GenericRecordBuilder = new GenericRecordBuilder(requiredAvroSchema)
|
||||||
protected var recordToLoad: InternalRow = _
|
protected var recordToLoad: InternalRow = _
|
||||||
@@ -167,7 +171,7 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext,
|
|||||||
private val requiredSchemaFieldOrdinals: List[Int] = collectFieldOrdinals(requiredAvroSchema, logFileReaderAvroSchema)
|
private val requiredSchemaFieldOrdinals: List[Int] = collectFieldOrdinals(requiredAvroSchema, logFileReaderAvroSchema)
|
||||||
|
|
||||||
private var logScanner = {
|
private var logScanner = {
|
||||||
val internalSchema = tableSchema.internalSchema.getOrElse(InternalSchema.getEmptyInternalSchema)
|
val internalSchema = dataSchema.internalSchema.getOrElse(InternalSchema.getEmptyInternalSchema)
|
||||||
HoodieMergeOnReadRDD.scanLog(split.logFiles, getPartitionPath(split), logFileReaderAvroSchema, tableState,
|
HoodieMergeOnReadRDD.scanLog(split.logFiles, getPartitionPath(split), logFileReaderAvroSchema, tableState,
|
||||||
maxCompactionMemoryInBytes, config, internalSchema)
|
maxCompactionMemoryInBytes, config, internalSchema)
|
||||||
}
|
}
|
||||||
@@ -232,7 +236,7 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext,
|
|||||||
override def hasNext: Boolean = {
|
override def hasNext: Boolean = {
|
||||||
if (baseFileIterator.hasNext) {
|
if (baseFileIterator.hasNext) {
|
||||||
val curRow = baseFileIterator.next()
|
val curRow = baseFileIterator.next()
|
||||||
recordToLoad = unsafeProjection(curRow)
|
recordToLoad = curRow
|
||||||
true
|
true
|
||||||
} else {
|
} else {
|
||||||
super[LogFileIterator].hasNext
|
super[LogFileIterator].hasNext
|
||||||
|
|||||||
@@ -61,6 +61,7 @@ class MergeOnReadIncrementalRelation(sqlContext: SQLContext,
|
|||||||
partitionSchema: StructType,
|
partitionSchema: StructType,
|
||||||
dataSchema: HoodieTableSchema,
|
dataSchema: HoodieTableSchema,
|
||||||
requiredSchema: HoodieTableSchema,
|
requiredSchema: HoodieTableSchema,
|
||||||
|
requestedColumns: Array[String],
|
||||||
filters: Array[Filter]): HoodieMergeOnReadRDD = {
|
filters: Array[Filter]): HoodieMergeOnReadRDD = {
|
||||||
val fullSchemaParquetReader = createBaseFileReader(
|
val fullSchemaParquetReader = createBaseFileReader(
|
||||||
spark = sqlContext.sparkSession,
|
spark = sqlContext.sparkSession,
|
||||||
@@ -81,23 +82,25 @@ class MergeOnReadIncrementalRelation(sqlContext: SQLContext,
|
|||||||
hadoopConf = embedInternalSchema(new Configuration(conf), internalSchemaOpt)
|
hadoopConf = embedInternalSchema(new Configuration(conf), internalSchemaOpt)
|
||||||
)
|
)
|
||||||
|
|
||||||
val requiredSchemaParquetReader = createBaseFileReader(
|
val (requiredSchemaBaseFileReaderMerging, requiredSchemaBaseFileReaderNoMerging) =
|
||||||
spark = sqlContext.sparkSession,
|
createMergeOnReadBaseFileReaders(partitionSchema, dataSchema, requiredSchema, requestedColumns, filters ++ incrementalSpanRecordFilters)
|
||||||
partitionSchema = partitionSchema,
|
|
||||||
dataSchema = dataSchema,
|
|
||||||
requiredSchema = requiredSchema,
|
|
||||||
filters = filters ++ incrementalSpanRecordFilters,
|
|
||||||
options = optParams,
|
|
||||||
// NOTE: We have to fork the Hadoop Config here as Spark will be modifying it
|
|
||||||
// to configure Parquet reader appropriately
|
|
||||||
hadoopConf = embedInternalSchema(new Configuration(conf), requiredSchema.internalSchema)
|
|
||||||
)
|
|
||||||
|
|
||||||
val hoodieTableState = getTableState
|
val hoodieTableState = getTableState
|
||||||
// TODO(HUDI-3639) implement incremental span record filtering w/in RDD to make sure returned iterator is appropriately
|
// TODO(HUDI-3639) implement incremental span record filtering w/in RDD to make sure returned iterator is appropriately
|
||||||
// filtered, since file-reader might not be capable to perform filtering
|
// filtered, since file-reader might not be capable to perform filtering
|
||||||
new HoodieMergeOnReadRDD(sqlContext.sparkContext, jobConf, fullSchemaParquetReader, requiredSchemaParquetReader,
|
new HoodieMergeOnReadRDD(
|
||||||
dataSchema, requiredSchema, hoodieTableState, mergeType, fileSplits)
|
sqlContext.sparkContext,
|
||||||
|
config = jobConf,
|
||||||
|
fileReaders = HoodieMergeOnReadBaseFileReaders(
|
||||||
|
fullSchemaFileReader = fullSchemaParquetReader,
|
||||||
|
requiredSchemaFileReaderForMerging = requiredSchemaBaseFileReaderMerging,
|
||||||
|
requiredSchemaFileReaderForNoMerging = requiredSchemaBaseFileReaderNoMerging
|
||||||
|
),
|
||||||
|
dataSchema = dataSchema,
|
||||||
|
requiredSchema = requiredSchema,
|
||||||
|
tableState = hoodieTableState,
|
||||||
|
mergeType = mergeType,
|
||||||
|
fileSplits = fileSplits)
|
||||||
}
|
}
|
||||||
|
|
||||||
override protected def collectFileSplits(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): List[HoodieMergeOnReadFileSplit] = {
|
override protected def collectFileSplits(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): List[HoodieMergeOnReadFileSplit] = {
|
||||||
|
|||||||
@@ -20,14 +20,17 @@ package org.apache.hudi
|
|||||||
|
|
||||||
import org.apache.hadoop.conf.Configuration
|
import org.apache.hadoop.conf.Configuration
|
||||||
import org.apache.hadoop.fs.Path
|
import org.apache.hadoop.fs.Path
|
||||||
|
import org.apache.hudi.HoodieBaseRelation.{BaseFileReader, convertToAvroSchema}
|
||||||
import org.apache.hudi.HoodieConversionUtils.toScalaOption
|
import org.apache.hudi.HoodieConversionUtils.toScalaOption
|
||||||
import org.apache.hudi.MergeOnReadSnapshotRelation.getFilePath
|
import org.apache.hudi.MergeOnReadSnapshotRelation.getFilePath
|
||||||
|
import org.apache.hudi.avro.HoodieAvroUtils
|
||||||
import org.apache.hudi.common.fs.FSUtils.getRelativePartitionPath
|
import org.apache.hudi.common.fs.FSUtils.getRelativePartitionPath
|
||||||
import org.apache.hudi.common.model.{FileSlice, HoodieLogFile}
|
import org.apache.hudi.common.model.{FileSlice, HoodieLogFile}
|
||||||
import org.apache.hudi.common.table.HoodieTableMetaClient
|
import org.apache.hudi.common.table.HoodieTableMetaClient
|
||||||
import org.apache.hudi.common.table.view.HoodieTableFileSystemView
|
import org.apache.hudi.common.table.view.HoodieTableFileSystemView
|
||||||
import org.apache.spark.execution.datasources.HoodieInMemoryFileIndex
|
import org.apache.spark.execution.datasources.HoodieInMemoryFileIndex
|
||||||
import org.apache.spark.sql.SQLContext
|
import org.apache.spark.sql.SQLContext
|
||||||
|
import org.apache.spark.sql.catalyst.InternalRow
|
||||||
import org.apache.spark.sql.catalyst.expressions.Expression
|
import org.apache.spark.sql.catalyst.expressions.Expression
|
||||||
import org.apache.spark.sql.execution.datasources.PartitionedFile
|
import org.apache.spark.sql.execution.datasources.PartitionedFile
|
||||||
import org.apache.spark.sql.sources.Filter
|
import org.apache.spark.sql.sources.Filter
|
||||||
@@ -47,9 +50,27 @@ class MergeOnReadSnapshotRelation(sqlContext: SQLContext,
|
|||||||
|
|
||||||
override type FileSplit = HoodieMergeOnReadFileSplit
|
override type FileSplit = HoodieMergeOnReadFileSplit
|
||||||
|
|
||||||
override lazy val mandatoryFields: Seq[String] =
|
/**
|
||||||
|
* NOTE: These are the fields that are required to properly fulfil Merge-on-Read (MOR)
|
||||||
|
* semantic:
|
||||||
|
*
|
||||||
|
* <ol>
|
||||||
|
* <li>Primary key is required to make sure we're able to correlate records from the base
|
||||||
|
* file with the updated records from the delta-log file</li>
|
||||||
|
* <li>Pre-combine key is required to properly perform the combining (or merging) of the
|
||||||
|
* existing and updated records</li>
|
||||||
|
* </ol>
|
||||||
|
*
|
||||||
|
* However, in cases when merging is NOT performed (for ex, if file-group only contains base
|
||||||
|
* files but no delta-log files, or if the query-type is equal to [["skip_merge"]]) neither
|
||||||
|
* of primary-key or pre-combine-key are required to be fetched from storage (unless requested
|
||||||
|
* by the query), therefore saving on throughput
|
||||||
|
*/
|
||||||
|
protected lazy val mandatoryFieldsForMerging: Seq[String] =
|
||||||
Seq(recordKeyField) ++ preCombineFieldOpt.map(Seq(_)).getOrElse(Seq())
|
Seq(recordKeyField) ++ preCombineFieldOpt.map(Seq(_)).getOrElse(Seq())
|
||||||
|
|
||||||
|
override lazy val mandatoryFields: Seq[String] = mandatoryFieldsForMerging
|
||||||
|
|
||||||
protected val mergeType: String = optParams.getOrElse(DataSourceReadOptions.REALTIME_MERGE.key,
|
protected val mergeType: String = optParams.getOrElse(DataSourceReadOptions.REALTIME_MERGE.key,
|
||||||
DataSourceReadOptions.REALTIME_MERGE.defaultValue)
|
DataSourceReadOptions.REALTIME_MERGE.defaultValue)
|
||||||
|
|
||||||
@@ -62,8 +83,9 @@ class MergeOnReadSnapshotRelation(sqlContext: SQLContext,
|
|||||||
partitionSchema: StructType,
|
partitionSchema: StructType,
|
||||||
dataSchema: HoodieTableSchema,
|
dataSchema: HoodieTableSchema,
|
||||||
requiredSchema: HoodieTableSchema,
|
requiredSchema: HoodieTableSchema,
|
||||||
|
requestedColumns: Array[String],
|
||||||
filters: Array[Filter]): HoodieMergeOnReadRDD = {
|
filters: Array[Filter]): HoodieMergeOnReadRDD = {
|
||||||
val fullSchemaParquetReader = createBaseFileReader(
|
val fullSchemaBaseFileReader = createBaseFileReader(
|
||||||
spark = sqlContext.sparkSession,
|
spark = sqlContext.sparkSession,
|
||||||
partitionSchema = partitionSchema,
|
partitionSchema = partitionSchema,
|
||||||
dataSchema = dataSchema,
|
dataSchema = dataSchema,
|
||||||
@@ -79,21 +101,23 @@ class MergeOnReadSnapshotRelation(sqlContext: SQLContext,
|
|||||||
hadoopConf = embedInternalSchema(new Configuration(conf), internalSchemaOpt)
|
hadoopConf = embedInternalSchema(new Configuration(conf), internalSchemaOpt)
|
||||||
)
|
)
|
||||||
|
|
||||||
val requiredSchemaParquetReader = createBaseFileReader(
|
val (requiredSchemaBaseFileReaderMerging, requiredSchemaBaseFileReaderNoMerging) =
|
||||||
spark = sqlContext.sparkSession,
|
createMergeOnReadBaseFileReaders(partitionSchema, dataSchema, requiredSchema, requestedColumns, filters)
|
||||||
partitionSchema = partitionSchema,
|
|
||||||
dataSchema = dataSchema,
|
|
||||||
requiredSchema = requiredSchema,
|
|
||||||
filters = filters,
|
|
||||||
options = optParams,
|
|
||||||
// NOTE: We have to fork the Hadoop Config here as Spark will be modifying it
|
|
||||||
// to configure Parquet reader appropriately
|
|
||||||
hadoopConf = embedInternalSchema(new Configuration(conf), requiredSchema.internalSchema)
|
|
||||||
)
|
|
||||||
|
|
||||||
val tableState = getTableState
|
val tableState = getTableState
|
||||||
new HoodieMergeOnReadRDD(sqlContext.sparkContext, jobConf, fullSchemaParquetReader, requiredSchemaParquetReader,
|
new HoodieMergeOnReadRDD(
|
||||||
dataSchema, requiredSchema, tableState, mergeType, fileSplits)
|
sqlContext.sparkContext,
|
||||||
|
config = jobConf,
|
||||||
|
fileReaders = HoodieMergeOnReadBaseFileReaders(
|
||||||
|
fullSchemaFileReader = fullSchemaBaseFileReader,
|
||||||
|
requiredSchemaFileReaderForMerging = requiredSchemaBaseFileReaderMerging,
|
||||||
|
requiredSchemaFileReaderForNoMerging = requiredSchemaBaseFileReaderNoMerging
|
||||||
|
),
|
||||||
|
dataSchema = dataSchema,
|
||||||
|
requiredSchema = requiredSchema,
|
||||||
|
tableState = tableState,
|
||||||
|
mergeType = mergeType,
|
||||||
|
fileSplits = fileSplits)
|
||||||
}
|
}
|
||||||
|
|
||||||
protected override def collectFileSplits(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): List[HoodieMergeOnReadFileSplit] = {
|
protected override def collectFileSplits(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): List[HoodieMergeOnReadFileSplit] = {
|
||||||
@@ -122,6 +146,61 @@ class MergeOnReadSnapshotRelation(sqlContext: SQLContext,
|
|||||||
HoodieMergeOnReadFileSplit(partitionedBaseFile, logFiles)
|
HoodieMergeOnReadFileSplit(partitionedBaseFile, logFiles)
|
||||||
}.toList
|
}.toList
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected def createMergeOnReadBaseFileReaders(partitionSchema: StructType,
|
||||||
|
dataSchema: HoodieTableSchema,
|
||||||
|
requiredDataSchema: HoodieTableSchema,
|
||||||
|
requestedColumns: Array[String],
|
||||||
|
filters: Array[Filter]): (BaseFileReader, BaseFileReader) = {
|
||||||
|
val requiredSchemaFileReaderMerging = createBaseFileReader(
|
||||||
|
spark = sqlContext.sparkSession,
|
||||||
|
partitionSchema = partitionSchema,
|
||||||
|
dataSchema = dataSchema,
|
||||||
|
requiredSchema = requiredDataSchema,
|
||||||
|
filters = filters,
|
||||||
|
options = optParams,
|
||||||
|
// NOTE: We have to fork the Hadoop Config here as Spark will be modifying it
|
||||||
|
// to configure Parquet reader appropriately
|
||||||
|
hadoopConf = embedInternalSchema(new Configuration(conf), requiredDataSchema.internalSchema)
|
||||||
|
)
|
||||||
|
|
||||||
|
// Check whether fields required for merging were also requested to be fetched
|
||||||
|
// by the query:
|
||||||
|
// - In case they were, there's no optimization we could apply here (we will have
|
||||||
|
// to fetch such fields)
|
||||||
|
// - In case they were not, we will provide 2 separate file-readers
|
||||||
|
// a) One which would be applied to file-groups w/ delta-logs (merging)
|
||||||
|
// b) One which would be applied to file-groups w/ no delta-logs or
|
||||||
|
// in case query-mode is skipping merging
|
||||||
|
val requiredColumns = mandatoryFieldsForMerging.map(HoodieAvroUtils.getRootLevelFieldName)
|
||||||
|
if (requiredColumns.forall(requestedColumns.contains)) {
|
||||||
|
(requiredSchemaFileReaderMerging, requiredSchemaFileReaderMerging)
|
||||||
|
} else {
|
||||||
|
val prunedRequiredSchema = {
|
||||||
|
val superfluousColumnNames = requiredColumns.filterNot(requestedColumns.contains)
|
||||||
|
val prunedStructSchema =
|
||||||
|
StructType(requiredDataSchema.structTypeSchema.fields
|
||||||
|
.filterNot(f => superfluousColumnNames.contains(f.name)))
|
||||||
|
|
||||||
|
HoodieTableSchema(prunedStructSchema, convertToAvroSchema(prunedStructSchema).toString)
|
||||||
|
}
|
||||||
|
|
||||||
|
val requiredSchemaFileReaderNoMerging = createBaseFileReader(
|
||||||
|
spark = sqlContext.sparkSession,
|
||||||
|
partitionSchema = partitionSchema,
|
||||||
|
dataSchema = dataSchema,
|
||||||
|
requiredSchema = prunedRequiredSchema,
|
||||||
|
filters = filters,
|
||||||
|
options = optParams,
|
||||||
|
// NOTE: We have to fork the Hadoop Config here as Spark will be modifying it
|
||||||
|
// to configure Parquet reader appropriately
|
||||||
|
hadoopConf = embedInternalSchema(new Configuration(conf), requiredDataSchema.internalSchema)
|
||||||
|
)
|
||||||
|
|
||||||
|
(requiredSchemaFileReaderMerging, requiredSchemaFileReaderNoMerging)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
object MergeOnReadSnapshotRelation {
|
object MergeOnReadSnapshotRelation {
|
||||||
|
|||||||
@@ -51,6 +51,4 @@ class TestHoodieRelations {
|
|||||||
requiredStructSchema.fields.toSeq
|
requiredStructSchema.fields.toSeq
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -54,7 +54,7 @@ class TestParquetColumnProjection extends SparkClientFunctionalTestHarness with
|
|||||||
DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.key -> classOf[NonpartitionedKeyGenerator].getName
|
DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.key -> classOf[NonpartitionedKeyGenerator].getName
|
||||||
)
|
)
|
||||||
|
|
||||||
@Disabled("HUDI-3896")
|
@Disabled("Currently disabled b/c of the fallback to HadoopFsRelation")
|
||||||
@Test
|
@Test
|
||||||
def testBaseFileOnlyViewRelation(): Unit = {
|
def testBaseFileOnlyViewRelation(): Unit = {
|
||||||
val tablePath = s"$basePath/cow"
|
val tablePath = s"$basePath/cow"
|
||||||
|
|||||||
@@ -24,6 +24,7 @@ import org.apache.hudi.config.HoodieWriteConfig
|
|||||||
import org.apache.hudi.exception.HoodieDuplicateKeyException
|
import org.apache.hudi.exception.HoodieDuplicateKeyException
|
||||||
import org.apache.hudi.keygen.ComplexKeyGenerator
|
import org.apache.hudi.keygen.ComplexKeyGenerator
|
||||||
import org.apache.spark.sql.SaveMode
|
import org.apache.spark.sql.SaveMode
|
||||||
|
import org.apache.spark.sql.internal.SQLConf
|
||||||
|
|
||||||
import java.io.File
|
import java.io.File
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user