[HUDI-3934] Fix Spark32HoodieParquetFileFormat not being compatible w/ Spark 3.2.0 (#5378)
- Due to the fact that Spark 3.2.1 is non-BWC w/ 3.2.0, we have to handle all these incompatibilities in Spark32HoodieParquetFileFormat. This PR is addressing that. Co-authored-by: Raymond Xu <2701446+xushiyan@users.noreply.github.com>
This commit is contained in:
@@ -53,13 +53,15 @@ object HoodieSparkUtils extends SparkAdapterSupport {
|
|||||||
|
|
||||||
def isSpark3_1: Boolean = SPARK_VERSION.startsWith("3.1")
|
def isSpark3_1: Boolean = SPARK_VERSION.startsWith("3.1")
|
||||||
|
|
||||||
|
def gteqSpark3_1: Boolean = SPARK_VERSION > "3.1"
|
||||||
|
|
||||||
|
def gteqSpark3_1_3: Boolean = SPARK_VERSION >= "3.1.3"
|
||||||
|
|
||||||
def isSpark3_2: Boolean = SPARK_VERSION.startsWith("3.2")
|
def isSpark3_2: Boolean = SPARK_VERSION.startsWith("3.2")
|
||||||
|
|
||||||
def gteqSpark3_2: Boolean = SPARK_VERSION > "3.2"
|
def gteqSpark3_2: Boolean = SPARK_VERSION > "3.2"
|
||||||
|
|
||||||
def gteqSpark3_1: Boolean = SPARK_VERSION > "3.1"
|
def gteqSpark3_2_1: Boolean = SPARK_VERSION >= "3.2.1"
|
||||||
|
|
||||||
def gteqSpark3_1_3: Boolean = SPARK_VERSION >= "3.1.3"
|
|
||||||
|
|
||||||
def getMetaSchema: StructType = {
|
def getMetaSchema: StructType = {
|
||||||
StructType(HoodieRecord.HOODIE_META_COLUMNS.asScala.map(col => {
|
StructType(HoodieRecord.HOODIE_META_COLUMNS.asScala.map(col => {
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ import org.apache.spark.SPARK_VERSION
|
|||||||
import org.apache.spark.sql.avro.{HoodieAvroDeserializer, HoodieAvroSerializer, HoodieSpark3_1AvroDeserializer, HoodieSpark3_1AvroSerializer}
|
import org.apache.spark.sql.avro.{HoodieAvroDeserializer, HoodieAvroSerializer, HoodieSpark3_1AvroDeserializer, HoodieSpark3_1AvroSerializer}
|
||||||
import org.apache.spark.sql.catalyst.plans.logical._
|
import org.apache.spark.sql.catalyst.plans.logical._
|
||||||
import org.apache.spark.sql.catalyst.rules.Rule
|
import org.apache.spark.sql.catalyst.rules.Rule
|
||||||
import org.apache.spark.sql.execution.datasources.parquet.{ParquetFileFormat, Spark312HoodieParquetFileFormat}
|
import org.apache.spark.sql.execution.datasources.parquet.{ParquetFileFormat, Spark31HoodieParquetFileFormat}
|
||||||
import org.apache.spark.sql.hudi.SparkAdapter
|
import org.apache.spark.sql.hudi.SparkAdapter
|
||||||
import org.apache.spark.sql.types.DataType
|
import org.apache.spark.sql.types.DataType
|
||||||
import org.apache.spark.sql.{HoodieCatalystExpressionUtils, HoodieSpark3_1CatalystExpressionUtils, SparkSession}
|
import org.apache.spark.sql.{HoodieCatalystExpressionUtils, HoodieSpark3_1CatalystExpressionUtils, SparkSession}
|
||||||
@@ -55,6 +55,6 @@ class Spark3_1Adapter extends BaseSpark3Adapter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
override def createHoodieParquetFileFormat(appendPartitionValues: Boolean): Option[ParquetFileFormat] = {
|
override def createHoodieParquetFileFormat(appendPartitionValues: Boolean): Option[ParquetFileFormat] = {
|
||||||
Some(new Spark312HoodieParquetFileFormat(appendPartitionValues))
|
Some(new Spark31HoodieParquetFileFormat(appendPartitionValues))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -25,7 +25,7 @@ import org.apache.hudi.HoodieSparkUtils
|
|||||||
import org.apache.hudi.client.utils.SparkInternalSchemaConverter
|
import org.apache.hudi.client.utils.SparkInternalSchemaConverter
|
||||||
import org.apache.hudi.common.fs.FSUtils
|
import org.apache.hudi.common.fs.FSUtils
|
||||||
import org.apache.hudi.common.util.StringUtils.isNullOrEmpty
|
import org.apache.hudi.common.util.StringUtils.isNullOrEmpty
|
||||||
import org.apache.hudi.common.util.{InternalSchemaCache, StringUtils}
|
import org.apache.hudi.common.util.{InternalSchemaCache, ReflectionUtils, StringUtils}
|
||||||
import org.apache.hudi.common.util.collection.Pair
|
import org.apache.hudi.common.util.collection.Pair
|
||||||
import org.apache.hudi.internal.schema.InternalSchema
|
import org.apache.hudi.internal.schema.InternalSchema
|
||||||
import org.apache.hudi.internal.schema.action.InternalSchemaMerger
|
import org.apache.hudi.internal.schema.action.InternalSchemaMerger
|
||||||
@@ -41,7 +41,7 @@ import org.apache.spark.sql.catalyst.InternalRow
|
|||||||
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection
|
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection
|
||||||
import org.apache.spark.sql.catalyst.expressions.{Cast, JoinedRow}
|
import org.apache.spark.sql.catalyst.expressions.{Cast, JoinedRow}
|
||||||
import org.apache.spark.sql.catalyst.util.DateTimeUtils
|
import org.apache.spark.sql.catalyst.util.DateTimeUtils
|
||||||
import org.apache.spark.sql.execution.datasources.parquet.Spark312HoodieParquetFileFormat.{createParquetFilters, pruneInternalSchema, rebuildFilterFromParquet}
|
import org.apache.spark.sql.execution.datasources.parquet.Spark31HoodieParquetFileFormat.{createParquetFilters, pruneInternalSchema, rebuildFilterFromParquet}
|
||||||
import org.apache.spark.sql.execution.datasources.{DataSourceUtils, PartitionedFile, RecordReaderIterator}
|
import org.apache.spark.sql.execution.datasources.{DataSourceUtils, PartitionedFile, RecordReaderIterator}
|
||||||
import org.apache.spark.sql.internal.SQLConf
|
import org.apache.spark.sql.internal.SQLConf
|
||||||
import org.apache.spark.sql.sources._
|
import org.apache.spark.sql.sources._
|
||||||
@@ -61,7 +61,7 @@ import java.net.URI
|
|||||||
* <li>Schema on-read</li>
|
* <li>Schema on-read</li>
|
||||||
* </ol>
|
* </ol>
|
||||||
*/
|
*/
|
||||||
class Spark312HoodieParquetFileFormat(private val shouldAppendPartitionValues: Boolean) extends ParquetFileFormat {
|
class Spark31HoodieParquetFileFormat(private val shouldAppendPartitionValues: Boolean) extends ParquetFileFormat {
|
||||||
|
|
||||||
override def buildReaderWithPartitionValues(sparkSession: SparkSession,
|
override def buildReaderWithPartitionValues(sparkSession: SparkSession,
|
||||||
dataSchema: StructType,
|
dataSchema: StructType,
|
||||||
@@ -154,8 +154,8 @@ class Spark312HoodieParquetFileFormat(private val shouldAppendPartitionValues: B
|
|||||||
val shouldUseInternalSchema = !isNullOrEmpty(internalSchemaStr) && querySchemaOption.isPresent
|
val shouldUseInternalSchema = !isNullOrEmpty(internalSchemaStr) && querySchemaOption.isPresent
|
||||||
|
|
||||||
val tablePath = sharedConf.get(SparkInternalSchemaConverter.HOODIE_TABLE_PATH)
|
val tablePath = sharedConf.get(SparkInternalSchemaConverter.HOODIE_TABLE_PATH)
|
||||||
val commitInstantTime = FSUtils.getCommitTime(filePath.getName).toLong;
|
|
||||||
val fileSchema = if (shouldUseInternalSchema) {
|
val fileSchema = if (shouldUseInternalSchema) {
|
||||||
|
val commitInstantTime = FSUtils.getCommitTime(filePath.getName).toLong;
|
||||||
val validCommits = sharedConf.get(SparkInternalSchemaConverter.HOODIE_VALID_COMMITS_LIST)
|
val validCommits = sharedConf.get(SparkInternalSchemaConverter.HOODIE_VALID_COMMITS_LIST)
|
||||||
InternalSchemaCache.getInternalSchemaByVersionId(commitInstantTime, tablePath, sharedConf, if (validCommits == null) "" else validCommits)
|
InternalSchemaCache.getInternalSchemaByVersionId(commitInstantTime, tablePath, sharedConf, if (validCommits == null) "" else validCommits)
|
||||||
} else {
|
} else {
|
||||||
@@ -223,13 +223,17 @@ class Spark312HoodieParquetFileFormat(private val shouldAppendPartitionValues: B
|
|||||||
|
|
||||||
// Clone new conf
|
// Clone new conf
|
||||||
val hadoopAttemptConf = new Configuration(broadcastedHadoopConf.value.value)
|
val hadoopAttemptConf = new Configuration(broadcastedHadoopConf.value.value)
|
||||||
var typeChangeInfos: java.util.Map[Integer, Pair[DataType, DataType]] = new java.util.HashMap()
|
var typeChangeInfos: java.util.Map[Integer, Pair[DataType, DataType]] = if (shouldUseInternalSchema) {
|
||||||
if (shouldUseInternalSchema) {
|
|
||||||
val mergedInternalSchema = new InternalSchemaMerger(fileSchema, querySchemaOption.get(), true, true).mergeSchema()
|
val mergedInternalSchema = new InternalSchemaMerger(fileSchema, querySchemaOption.get(), true, true).mergeSchema()
|
||||||
val mergedSchema = SparkInternalSchemaConverter.constructSparkSchemaFromInternalSchema(mergedInternalSchema)
|
val mergedSchema = SparkInternalSchemaConverter.constructSparkSchemaFromInternalSchema(mergedInternalSchema)
|
||||||
typeChangeInfos = SparkInternalSchemaConverter.collectTypeChangedCols(querySchemaOption.get(), mergedInternalSchema)
|
|
||||||
hadoopAttemptConf.set(ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA, mergedSchema.json)
|
hadoopAttemptConf.set(ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA, mergedSchema.json)
|
||||||
|
|
||||||
|
SparkInternalSchemaConverter.collectTypeChangedCols(querySchemaOption.get(), mergedInternalSchema)
|
||||||
|
} else {
|
||||||
|
new java.util.HashMap()
|
||||||
}
|
}
|
||||||
|
|
||||||
val hadoopAttemptContext =
|
val hadoopAttemptContext =
|
||||||
new TaskAttemptContextImpl(hadoopAttemptConf, attemptId)
|
new TaskAttemptContextImpl(hadoopAttemptConf, attemptId)
|
||||||
|
|
||||||
@@ -329,9 +333,7 @@ class Spark312HoodieParquetFileFormat(private val shouldAppendPartitionValues: B
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
object Spark312HoodieParquetFileFormat {
|
object Spark31HoodieParquetFileFormat {
|
||||||
|
|
||||||
val PARQUET_FILTERS_CLASS_NAME = "org.apache.spark.sql.execution.datasources.parquet.ParquetFilters"
|
|
||||||
|
|
||||||
def pruneInternalSchema(internalSchemaStr: String, requiredSchema: StructType): String = {
|
def pruneInternalSchema(internalSchemaStr: String, requiredSchema: StructType): String = {
|
||||||
val querySchemaOption = SerDeHelper.fromJson(internalSchemaStr)
|
val querySchemaOption = SerDeHelper.fromJson(internalSchemaStr)
|
||||||
@@ -343,10 +345,11 @@ object Spark312HoodieParquetFileFormat {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private def createParquetFilters(arg: Any*): ParquetFilters = {
|
private def createParquetFilters(args: Any*): ParquetFilters = {
|
||||||
val clazz = Class.forName(PARQUET_FILTERS_CLASS_NAME, true, Thread.currentThread().getContextClassLoader)
|
// ParquetFilters bears a single ctor (in Spark 3.1)
|
||||||
val ctor = clazz.getConstructors.head
|
val ctor = classOf[ParquetFilters].getConstructors.head
|
||||||
ctor.newInstance(arg.map(_.asInstanceOf[AnyRef]): _*).asInstanceOf[ParquetFilters]
|
ctor.newInstance(args.map(_.asInstanceOf[AnyRef]): _*)
|
||||||
|
.asInstanceOf[ParquetFilters]
|
||||||
}
|
}
|
||||||
|
|
||||||
private def rebuildFilterFromParquet(oldFilter: Filter, fileSchema: InternalSchema, querySchema: InternalSchema): Filter = {
|
private def rebuildFilterFromParquet(oldFilter: Filter, fileSchema: InternalSchema, querySchema: InternalSchema): Filter = {
|
||||||
@@ -0,0 +1,77 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.spark.sql.execution.datasources.parquet
|
||||||
|
|
||||||
|
import org.apache.spark.sql.SPARK_VERSION_METADATA_KEY
|
||||||
|
import org.apache.spark.sql.internal.SQLConf
|
||||||
|
import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy
|
||||||
|
import org.apache.spark.util.Utils
|
||||||
|
|
||||||
|
object Spark32DataSourceUtils {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* NOTE: This method was copied from Spark 3.2.0, and is required to maintain runtime
|
||||||
|
* compatibility against Spark 3.2.0
|
||||||
|
*/
|
||||||
|
// scalastyle:off
|
||||||
|
def int96RebaseMode(lookupFileMeta: String => String,
|
||||||
|
modeByConfig: String): LegacyBehaviorPolicy.Value = {
|
||||||
|
if (Utils.isTesting && SQLConf.get.getConfString("spark.test.forceNoRebase", "") == "true") {
|
||||||
|
return LegacyBehaviorPolicy.CORRECTED
|
||||||
|
}
|
||||||
|
// If there is no version, we return the mode specified by the config.
|
||||||
|
Option(lookupFileMeta(SPARK_VERSION_METADATA_KEY)).map { version =>
|
||||||
|
// Files written by Spark 3.0 and earlier follow the legacy hybrid calendar and we need to
|
||||||
|
// rebase the INT96 timestamp values.
|
||||||
|
// Files written by Spark 3.1 and latter may also need the rebase if they were written with
|
||||||
|
// the "LEGACY" rebase mode.
|
||||||
|
if (version < "3.1.0" || lookupFileMeta("org.apache.spark.legacyINT96") != null) {
|
||||||
|
LegacyBehaviorPolicy.LEGACY
|
||||||
|
} else {
|
||||||
|
LegacyBehaviorPolicy.CORRECTED
|
||||||
|
}
|
||||||
|
}.getOrElse(LegacyBehaviorPolicy.withName(modeByConfig))
|
||||||
|
}
|
||||||
|
// scalastyle:on
|
||||||
|
|
||||||
|
/**
|
||||||
|
* NOTE: This method was copied from Spark 3.2.0, and is required to maintain runtime
|
||||||
|
* compatibility against Spark 3.2.0
|
||||||
|
*/
|
||||||
|
// scalastyle:off
|
||||||
|
def datetimeRebaseMode(lookupFileMeta: String => String,
|
||||||
|
modeByConfig: String): LegacyBehaviorPolicy.Value = {
|
||||||
|
if (Utils.isTesting && SQLConf.get.getConfString("spark.test.forceNoRebase", "") == "true") {
|
||||||
|
return LegacyBehaviorPolicy.CORRECTED
|
||||||
|
}
|
||||||
|
// If there is no version, we return the mode specified by the config.
|
||||||
|
Option(lookupFileMeta(SPARK_VERSION_METADATA_KEY)).map { version =>
|
||||||
|
// Files written by Spark 2.4 and earlier follow the legacy hybrid calendar and we need to
|
||||||
|
// rebase the datetime values.
|
||||||
|
// Files written by Spark 3.0 and latter may also need the rebase if they were written with
|
||||||
|
// the "LEGACY" rebase mode.
|
||||||
|
if (version < "3.0.0" || lookupFileMeta("org.apache.spark.legacyDateTime") != null) {
|
||||||
|
LegacyBehaviorPolicy.LEGACY
|
||||||
|
} else {
|
||||||
|
LegacyBehaviorPolicy.CORRECTED
|
||||||
|
}
|
||||||
|
}.getOrElse(LegacyBehaviorPolicy.withName(modeByConfig))
|
||||||
|
}
|
||||||
|
// scalastyle:on
|
||||||
|
|
||||||
|
}
|
||||||
@@ -22,6 +22,7 @@ import org.apache.hadoop.fs.Path
|
|||||||
import org.apache.hadoop.mapred.FileSplit
|
import org.apache.hadoop.mapred.FileSplit
|
||||||
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
|
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
|
||||||
import org.apache.hadoop.mapreduce.{JobID, TaskAttemptID, TaskID, TaskType}
|
import org.apache.hadoop.mapreduce.{JobID, TaskAttemptID, TaskID, TaskType}
|
||||||
|
import org.apache.hudi.HoodieSparkUtils
|
||||||
import org.apache.hudi.client.utils.SparkInternalSchemaConverter
|
import org.apache.hudi.client.utils.SparkInternalSchemaConverter
|
||||||
import org.apache.hudi.common.fs.FSUtils
|
import org.apache.hudi.common.fs.FSUtils
|
||||||
import org.apache.hudi.common.util.InternalSchemaCache
|
import org.apache.hudi.common.util.InternalSchemaCache
|
||||||
@@ -37,10 +38,10 @@ import org.apache.parquet.hadoop.{ParquetInputFormat, ParquetRecordReader}
|
|||||||
import org.apache.spark.TaskContext
|
import org.apache.spark.TaskContext
|
||||||
import org.apache.spark.sql.SparkSession
|
import org.apache.spark.sql.SparkSession
|
||||||
import org.apache.spark.sql.catalyst.InternalRow
|
import org.apache.spark.sql.catalyst.InternalRow
|
||||||
import org.apache.spark.sql.catalyst.expressions.{Cast, JoinedRow}
|
|
||||||
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection
|
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection
|
||||||
|
import org.apache.spark.sql.catalyst.expressions.{Cast, JoinedRow}
|
||||||
import org.apache.spark.sql.catalyst.util.DateTimeUtils
|
import org.apache.spark.sql.catalyst.util.DateTimeUtils
|
||||||
import org.apache.spark.sql.execution.datasources.parquet.Spark32HoodieParquetFileFormat.{pruneInternalSchema, rebuildFilterFromParquet}
|
import org.apache.spark.sql.execution.datasources.parquet.Spark32HoodieParquetFileFormat._
|
||||||
import org.apache.spark.sql.execution.datasources.{DataSourceUtils, PartitionedFile, RecordReaderIterator}
|
import org.apache.spark.sql.execution.datasources.{DataSourceUtils, PartitionedFile, RecordReaderIterator}
|
||||||
import org.apache.spark.sql.internal.SQLConf
|
import org.apache.spark.sql.internal.SQLConf
|
||||||
import org.apache.spark.sql.sources._
|
import org.apache.spark.sql.sources._
|
||||||
@@ -148,8 +149,8 @@ class Spark32HoodieParquetFileFormat(private val shouldAppendPartitionValues: Bo
|
|||||||
val shouldUseInternalSchema = !isNullOrEmpty(internalSchemaStr) && querySchemaOption.isPresent
|
val shouldUseInternalSchema = !isNullOrEmpty(internalSchemaStr) && querySchemaOption.isPresent
|
||||||
|
|
||||||
val tablePath = sharedConf.get(SparkInternalSchemaConverter.HOODIE_TABLE_PATH)
|
val tablePath = sharedConf.get(SparkInternalSchemaConverter.HOODIE_TABLE_PATH)
|
||||||
val commitInstantTime = FSUtils.getCommitTime(filePath.getName).toLong;
|
|
||||||
val fileSchema = if (shouldUseInternalSchema) {
|
val fileSchema = if (shouldUseInternalSchema) {
|
||||||
|
val commitInstantTime = FSUtils.getCommitTime(filePath.getName).toLong;
|
||||||
val validCommits = sharedConf.get(SparkInternalSchemaConverter.HOODIE_VALID_COMMITS_LIST)
|
val validCommits = sharedConf.get(SparkInternalSchemaConverter.HOODIE_VALID_COMMITS_LIST)
|
||||||
InternalSchemaCache.getInternalSchemaByVersionId(commitInstantTime, tablePath, sharedConf, if (validCommits == null) "" else validCommits)
|
InternalSchemaCache.getInternalSchemaByVersionId(commitInstantTime, tablePath, sharedConf, if (validCommits == null) "" else validCommits)
|
||||||
} else {
|
} else {
|
||||||
@@ -158,21 +159,38 @@ class Spark32HoodieParquetFileFormat(private val shouldAppendPartitionValues: Bo
|
|||||||
|
|
||||||
lazy val footerFileMetaData =
|
lazy val footerFileMetaData =
|
||||||
ParquetFooterReader.readFooter(sharedConf, filePath, SKIP_ROW_GROUPS).getFileMetaData
|
ParquetFooterReader.readFooter(sharedConf, filePath, SKIP_ROW_GROUPS).getFileMetaData
|
||||||
val datetimeRebaseSpec = DataSourceUtils.datetimeRebaseSpec(
|
|
||||||
footerFileMetaData.getKeyValueMetaData.get,
|
|
||||||
datetimeRebaseModeInRead)
|
|
||||||
// Try to push down filters when filter push-down is enabled.
|
// Try to push down filters when filter push-down is enabled.
|
||||||
val pushed = if (enableParquetFilterPushDown) {
|
val pushed = if (enableParquetFilterPushDown) {
|
||||||
val parquetSchema = footerFileMetaData.getSchema
|
val parquetSchema = footerFileMetaData.getSchema
|
||||||
val parquetFilters = new ParquetFilters(
|
val parquetFilters = if (HoodieSparkUtils.gteqSpark3_2_1) {
|
||||||
parquetSchema,
|
// NOTE: Below code could only be compiled against >= Spark 3.2.1,
|
||||||
pushDownDate,
|
// and unfortunately won't compile against Spark 3.2.0
|
||||||
pushDownTimestamp,
|
// However this code is runtime-compatible w/ both Spark 3.2.0 and >= Spark 3.2.1
|
||||||
pushDownDecimal,
|
val datetimeRebaseSpec =
|
||||||
pushDownStringStartWith,
|
DataSourceUtils.datetimeRebaseSpec(footerFileMetaData.getKeyValueMetaData.get, datetimeRebaseModeInRead)
|
||||||
pushDownInFilterThreshold,
|
new ParquetFilters(
|
||||||
isCaseSensitive,
|
parquetSchema,
|
||||||
datetimeRebaseSpec)
|
pushDownDate,
|
||||||
|
pushDownTimestamp,
|
||||||
|
pushDownDecimal,
|
||||||
|
pushDownStringStartWith,
|
||||||
|
pushDownInFilterThreshold,
|
||||||
|
isCaseSensitive,
|
||||||
|
datetimeRebaseSpec)
|
||||||
|
} else {
|
||||||
|
// Spark 3.2.0
|
||||||
|
val datetimeRebaseMode =
|
||||||
|
Spark32DataSourceUtils.datetimeRebaseMode(footerFileMetaData.getKeyValueMetaData.get, datetimeRebaseModeInRead)
|
||||||
|
createParquetFilters(
|
||||||
|
parquetSchema,
|
||||||
|
pushDownDate,
|
||||||
|
pushDownTimestamp,
|
||||||
|
pushDownDecimal,
|
||||||
|
pushDownStringStartWith,
|
||||||
|
pushDownInFilterThreshold,
|
||||||
|
isCaseSensitive,
|
||||||
|
datetimeRebaseMode)
|
||||||
|
}
|
||||||
filters.map(rebuildFilterFromParquet(_, fileSchema, querySchemaOption.orElse(null)))
|
filters.map(rebuildFilterFromParquet(_, fileSchema, querySchemaOption.orElse(null)))
|
||||||
// Collects all converted Parquet filter predicates. Notice that not all predicates can be
|
// Collects all converted Parquet filter predicates. Notice that not all predicates can be
|
||||||
// converted (`ParquetFilters.createFilter` returns an `Option`). That's why a `flatMap`
|
// converted (`ParquetFilters.createFilter` returns an `Option`). That's why a `flatMap`
|
||||||
@@ -198,21 +216,21 @@ class Spark32HoodieParquetFileFormat(private val shouldAppendPartitionValues: Bo
|
|||||||
None
|
None
|
||||||
}
|
}
|
||||||
|
|
||||||
val int96RebaseSpec = DataSourceUtils.int96RebaseSpec(
|
|
||||||
footerFileMetaData.getKeyValueMetaData.get,
|
|
||||||
int96RebaseModeInRead)
|
|
||||||
|
|
||||||
val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0)
|
val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0)
|
||||||
|
|
||||||
// Clone new conf
|
// Clone new conf
|
||||||
val hadoopAttemptConf = new Configuration(broadcastedHadoopConf.value.value)
|
val hadoopAttemptConf = new Configuration(broadcastedHadoopConf.value.value)
|
||||||
var typeChangeInfos: java.util.Map[Integer, Pair[DataType, DataType]] = new java.util.HashMap()
|
val typeChangeInfos: java.util.Map[Integer, Pair[DataType, DataType]] = if (shouldUseInternalSchema) {
|
||||||
if (shouldUseInternalSchema) {
|
|
||||||
val mergedInternalSchema = new InternalSchemaMerger(fileSchema, querySchemaOption.get(), true, true).mergeSchema()
|
val mergedInternalSchema = new InternalSchemaMerger(fileSchema, querySchemaOption.get(), true, true).mergeSchema()
|
||||||
val mergedSchema = SparkInternalSchemaConverter.constructSparkSchemaFromInternalSchema(mergedInternalSchema)
|
val mergedSchema = SparkInternalSchemaConverter.constructSparkSchemaFromInternalSchema(mergedInternalSchema)
|
||||||
typeChangeInfos = SparkInternalSchemaConverter.collectTypeChangedCols(querySchemaOption.get(), mergedInternalSchema)
|
|
||||||
hadoopAttemptConf.set(ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA, mergedSchema.json)
|
hadoopAttemptConf.set(ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA, mergedSchema.json)
|
||||||
|
|
||||||
|
SparkInternalSchemaConverter.collectTypeChangedCols(querySchemaOption.get(), mergedInternalSchema)
|
||||||
|
} else {
|
||||||
|
new java.util.HashMap()
|
||||||
}
|
}
|
||||||
|
|
||||||
val hadoopAttemptContext =
|
val hadoopAttemptContext =
|
||||||
new TaskAttemptContextImpl(hadoopAttemptConf, attemptId)
|
new TaskAttemptContextImpl(hadoopAttemptConf, attemptId)
|
||||||
|
|
||||||
@@ -225,6 +243,10 @@ class Spark32HoodieParquetFileFormat(private val shouldAppendPartitionValues: Bo
|
|||||||
if (enableVectorizedReader) {
|
if (enableVectorizedReader) {
|
||||||
val vectorizedReader =
|
val vectorizedReader =
|
||||||
if (shouldUseInternalSchema) {
|
if (shouldUseInternalSchema) {
|
||||||
|
val int96RebaseSpec =
|
||||||
|
DataSourceUtils.int96RebaseSpec(footerFileMetaData.getKeyValueMetaData.get, int96RebaseModeInRead)
|
||||||
|
val datetimeRebaseSpec =
|
||||||
|
DataSourceUtils.datetimeRebaseSpec(footerFileMetaData.getKeyValueMetaData.get, datetimeRebaseModeInRead)
|
||||||
new Spark32HoodieVectorizedParquetRecordReader(
|
new Spark32HoodieVectorizedParquetRecordReader(
|
||||||
convertTz.orNull,
|
convertTz.orNull,
|
||||||
datetimeRebaseSpec.mode.toString,
|
datetimeRebaseSpec.mode.toString,
|
||||||
@@ -234,7 +256,14 @@ class Spark32HoodieParquetFileFormat(private val shouldAppendPartitionValues: Bo
|
|||||||
enableOffHeapColumnVector && taskContext.isDefined,
|
enableOffHeapColumnVector && taskContext.isDefined,
|
||||||
capacity,
|
capacity,
|
||||||
typeChangeInfos)
|
typeChangeInfos)
|
||||||
} else {
|
} else if (HoodieSparkUtils.gteqSpark3_2_1) {
|
||||||
|
// NOTE: Below code could only be compiled against >= Spark 3.2.1,
|
||||||
|
// and unfortunately won't compile against Spark 3.2.0
|
||||||
|
// However this code is runtime-compatible w/ both Spark 3.2.0 and >= Spark 3.2.1
|
||||||
|
val int96RebaseSpec =
|
||||||
|
DataSourceUtils.int96RebaseSpec(footerFileMetaData.getKeyValueMetaData.get, int96RebaseModeInRead)
|
||||||
|
val datetimeRebaseSpec =
|
||||||
|
DataSourceUtils.datetimeRebaseSpec(footerFileMetaData.getKeyValueMetaData.get, datetimeRebaseModeInRead)
|
||||||
new VectorizedParquetRecordReader(
|
new VectorizedParquetRecordReader(
|
||||||
convertTz.orNull,
|
convertTz.orNull,
|
||||||
datetimeRebaseSpec.mode.toString,
|
datetimeRebaseSpec.mode.toString,
|
||||||
@@ -243,7 +272,20 @@ class Spark32HoodieParquetFileFormat(private val shouldAppendPartitionValues: Bo
|
|||||||
int96RebaseSpec.timeZone,
|
int96RebaseSpec.timeZone,
|
||||||
enableOffHeapColumnVector && taskContext.isDefined,
|
enableOffHeapColumnVector && taskContext.isDefined,
|
||||||
capacity)
|
capacity)
|
||||||
|
} else {
|
||||||
|
// Spark 3.2.0
|
||||||
|
val datetimeRebaseMode =
|
||||||
|
Spark32DataSourceUtils.datetimeRebaseMode(footerFileMetaData.getKeyValueMetaData.get, datetimeRebaseModeInRead)
|
||||||
|
val int96RebaseMode =
|
||||||
|
Spark32DataSourceUtils.int96RebaseMode(footerFileMetaData.getKeyValueMetaData.get, int96RebaseModeInRead)
|
||||||
|
createVectorizedParquetRecordReader(
|
||||||
|
convertTz.orNull,
|
||||||
|
datetimeRebaseMode.toString,
|
||||||
|
int96RebaseMode.toString,
|
||||||
|
enableOffHeapColumnVector && taskContext.isDefined,
|
||||||
|
capacity)
|
||||||
}
|
}
|
||||||
|
|
||||||
// SPARK-37089: We cannot register a task completion listener to close this iterator here
|
// SPARK-37089: We cannot register a task completion listener to close this iterator here
|
||||||
// because downstream exec nodes have already registered their listeners. Since listeners
|
// because downstream exec nodes have already registered their listeners. Since listeners
|
||||||
// are executed in reverse order of registration, a listener registered here would close the
|
// are executed in reverse order of registration, a listener registered here would close the
|
||||||
@@ -279,12 +321,32 @@ class Spark32HoodieParquetFileFormat(private val shouldAppendPartitionValues: Bo
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
logDebug(s"Falling back to parquet-mr")
|
logDebug(s"Falling back to parquet-mr")
|
||||||
// ParquetRecordReader returns InternalRow
|
val readSupport = if (HoodieSparkUtils.gteqSpark3_2_1) {
|
||||||
val readSupport = new ParquetReadSupport(
|
// ParquetRecordReader returns InternalRow
|
||||||
convertTz,
|
// NOTE: Below code could only be compiled against >= Spark 3.2.1,
|
||||||
enableVectorizedReader = false,
|
// and unfortunately won't compile against Spark 3.2.0
|
||||||
datetimeRebaseSpec,
|
// However this code is runtime-compatible w/ both Spark 3.2.0 and >= Spark 3.2.1
|
||||||
int96RebaseSpec)
|
val int96RebaseSpec =
|
||||||
|
DataSourceUtils.int96RebaseSpec(footerFileMetaData.getKeyValueMetaData.get, int96RebaseModeInRead)
|
||||||
|
val datetimeRebaseSpec =
|
||||||
|
DataSourceUtils.datetimeRebaseSpec(footerFileMetaData.getKeyValueMetaData.get, datetimeRebaseModeInRead)
|
||||||
|
new ParquetReadSupport(
|
||||||
|
convertTz,
|
||||||
|
enableVectorizedReader = false,
|
||||||
|
datetimeRebaseSpec,
|
||||||
|
int96RebaseSpec)
|
||||||
|
} else {
|
||||||
|
val datetimeRebaseMode =
|
||||||
|
Spark32DataSourceUtils.datetimeRebaseMode(footerFileMetaData.getKeyValueMetaData.get, datetimeRebaseModeInRead)
|
||||||
|
val int96RebaseMode =
|
||||||
|
Spark32DataSourceUtils.int96RebaseMode(footerFileMetaData.getKeyValueMetaData.get, int96RebaseModeInRead)
|
||||||
|
createParquetReadSupport(
|
||||||
|
convertTz,
|
||||||
|
/* enableVectorizedReader = */ false,
|
||||||
|
datetimeRebaseMode,
|
||||||
|
int96RebaseMode)
|
||||||
|
}
|
||||||
|
|
||||||
val reader = if (pushed.isDefined && enableRecordFilter) {
|
val reader = if (pushed.isDefined && enableRecordFilter) {
|
||||||
val parquetFilter = FilterCompat.get(pushed.get, null)
|
val parquetFilter = FilterCompat.get(pushed.get, null)
|
||||||
new ParquetRecordReader[InternalRow](readSupport, parquetFilter)
|
new ParquetRecordReader[InternalRow](readSupport, parquetFilter)
|
||||||
@@ -332,10 +394,47 @@ class Spark32HoodieParquetFileFormat(private val shouldAppendPartitionValues: Bo
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
object Spark32HoodieParquetFileFormat {
|
object Spark32HoodieParquetFileFormat {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* NOTE: This method is specific to Spark 3.2.0
|
||||||
|
*/
|
||||||
|
private def createParquetFilters(args: Any*): ParquetFilters = {
|
||||||
|
// NOTE: ParquetFilters ctor args contain Scala enum, therefore we can't look it
|
||||||
|
// up by arg types, and have to instead rely on the number of args based on individual class;
|
||||||
|
// the ctor order is not guaranteed
|
||||||
|
val ctor = classOf[ParquetFilters].getConstructors.maxBy(_.getParameterCount)
|
||||||
|
ctor.newInstance(args.map(_.asInstanceOf[AnyRef]): _*)
|
||||||
|
.asInstanceOf[ParquetFilters]
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* NOTE: This method is specific to Spark 3.2.0
|
||||||
|
*/
|
||||||
|
private def createParquetReadSupport(args: Any*): ParquetReadSupport = {
|
||||||
|
// NOTE: ParquetReadSupport ctor args contain Scala enum, therefore we can't look it
|
||||||
|
// up by arg types, and have to instead rely on the number of args based on individual class;
|
||||||
|
// the ctor order is not guaranteed
|
||||||
|
val ctor = classOf[ParquetReadSupport].getConstructors.maxBy(_.getParameterCount)
|
||||||
|
ctor.newInstance(args.map(_.asInstanceOf[AnyRef]): _*)
|
||||||
|
.asInstanceOf[ParquetReadSupport]
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* NOTE: This method is specific to Spark 3.2.0
|
||||||
|
*/
|
||||||
|
private def createVectorizedParquetRecordReader(args: Any*): VectorizedParquetRecordReader = {
|
||||||
|
// NOTE: ParquetReadSupport ctor args contain Scala enum, therefore we can't look it
|
||||||
|
// up by arg types, and have to instead rely on the number of args based on individual class;
|
||||||
|
// the ctor order is not guaranteed
|
||||||
|
val ctor = classOf[VectorizedParquetRecordReader].getConstructors.maxBy(_.getParameterCount)
|
||||||
|
ctor.newInstance(args.map(_.asInstanceOf[AnyRef]): _*)
|
||||||
|
.asInstanceOf[VectorizedParquetRecordReader]
|
||||||
|
}
|
||||||
|
|
||||||
def pruneInternalSchema(internalSchemaStr: String, requiredSchema: StructType): String = {
|
def pruneInternalSchema(internalSchemaStr: String, requiredSchema: StructType): String = {
|
||||||
val querySchemaOption = SerDeHelper.fromJson(internalSchemaStr)
|
val querySchemaOption = SerDeHelper.fromJson(internalSchemaStr)
|
||||||
if (querySchemaOption.isPresent && requiredSchema.nonEmpty) {
|
if (querySchemaOption.isPresent && requiredSchema.nonEmpty) {
|
||||||
|
|||||||
Reference in New Issue
Block a user