[HUDI-3204] fix problem that spark on TimestampKeyGenerator has no re… (#4714)
This commit is contained in:
@@ -18,20 +18,30 @@
|
||||
package org.apache.hudi
|
||||
|
||||
import org.apache.hadoop.fs.{FileStatus, Path}
|
||||
|
||||
import org.apache.hudi.HoodieFileIndex.getConfigProperties
|
||||
import org.apache.hudi.common.config.{HoodieMetadataConfig, TypedProperties}
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient
|
||||
import org.apache.hudi.common.util.StringUtils
|
||||
import org.apache.hudi.keygen.constant.KeyGeneratorOptions
|
||||
import org.apache.hudi.keygen.{TimestampBasedAvroKeyGenerator, TimestampBasedKeyGenerator}
|
||||
|
||||
import org.apache.spark.internal.Logging
|
||||
import org.apache.spark.sql.catalyst.InternalRow
|
||||
import org.apache.spark.sql.catalyst.expressions.{And, Expression}
|
||||
import org.apache.spark.sql.catalyst.expressions.{And, Expression, Literal}
|
||||
import org.apache.spark.sql.execution.datasources.{FileIndex, FileStatusCache, NoopCache, PartitionDirectory}
|
||||
import org.apache.spark.sql.hudi.DataSkippingUtils.createColumnStatsIndexFilterExpr
|
||||
import org.apache.spark.sql.hudi.HoodieSqlCommonUtils
|
||||
import org.apache.spark.sql.internal.SQLConf
|
||||
import org.apache.spark.sql.types.StructType
|
||||
import org.apache.spark.sql.types.{StringType, StructType}
|
||||
import org.apache.spark.sql.{AnalysisException, Column, SparkSession}
|
||||
import org.apache.spark.unsafe.types.UTF8String
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.util.{Failure, Success, Try}
|
||||
import scala.util.control.NonFatal
|
||||
|
||||
import java.text.SimpleDateFormat
|
||||
|
||||
/**
|
||||
* A file index which support partition prune for hoodie snapshot and read-optimized query.
|
||||
@@ -102,6 +112,10 @@ case class HoodieFileIndex(spark: SparkSession,
|
||||
* @return list of PartitionDirectory containing partition to base files mapping
|
||||
*/
|
||||
override def listFiles(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): Seq[PartitionDirectory] = {
|
||||
|
||||
val convertedPartitionFilters =
|
||||
HoodieFileIndex.convertFilterForTimestampKeyGenerator(metaClient, partitionFilters)
|
||||
|
||||
// Look up candidate files names in the col-stats index, if all of the following conditions are true
|
||||
// - Data-skipping is enabled
|
||||
// - Col-Stats Index is present
|
||||
@@ -135,7 +149,7 @@ case class HoodieFileIndex(spark: SparkSession,
|
||||
Seq(PartitionDirectory(InternalRow.empty, candidateFiles))
|
||||
} else {
|
||||
// Prune the partition path by the partition filters
|
||||
val prunedPartitions = prunePartition(cachedAllInputFileSlices.keySet.asScala.toSeq, partitionFilters)
|
||||
val prunedPartitions = prunePartition(cachedAllInputFileSlices.keySet.asScala.toSeq, convertedPartitionFilters)
|
||||
var totalFileSize = 0
|
||||
var candidateFileSize = 0
|
||||
|
||||
@@ -266,7 +280,7 @@ case class HoodieFileIndex(spark: SparkSession,
|
||||
}
|
||||
}
|
||||
|
||||
object HoodieFileIndex {
|
||||
object HoodieFileIndex extends Logging {
|
||||
|
||||
def getConfigProperties(spark: SparkSession, options: Map[String, String]) = {
|
||||
val sqlConf: SQLConf = spark.sessionState.conf
|
||||
@@ -281,6 +295,41 @@ object HoodieFileIndex {
|
||||
properties
|
||||
}
|
||||
|
||||
def convertFilterForTimestampKeyGenerator(metaClient: HoodieTableMetaClient,
|
||||
partitionFilters: Seq[Expression]): Seq[Expression] = {
|
||||
|
||||
val tableConfig = metaClient.getTableConfig
|
||||
val keyGenerator = tableConfig.getKeyGeneratorClassName
|
||||
|
||||
if (keyGenerator != null && (keyGenerator.equals(classOf[TimestampBasedKeyGenerator].getCanonicalName) ||
|
||||
keyGenerator.equals(classOf[TimestampBasedAvroKeyGenerator].getCanonicalName))) {
|
||||
val inputFormat = tableConfig.getString(KeyGeneratorOptions.Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP)
|
||||
val outputFormat = tableConfig.getString(KeyGeneratorOptions.Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP)
|
||||
if (StringUtils.isNullOrEmpty(inputFormat) || StringUtils.isNullOrEmpty(outputFormat) ||
|
||||
inputFormat.equals(outputFormat)) {
|
||||
partitionFilters
|
||||
} else {
|
||||
try {
|
||||
val inDateFormat = new SimpleDateFormat(inputFormat)
|
||||
val outDateFormat = new SimpleDateFormat(outputFormat)
|
||||
partitionFilters.toArray.map {
|
||||
_.transformDown {
|
||||
case Literal(value, dataType) if dataType.isInstanceOf[StringType] =>
|
||||
val converted = outDateFormat.format(inDateFormat.parse(value.toString))
|
||||
Literal(UTF8String.fromString(converted), StringType)
|
||||
}
|
||||
}
|
||||
} catch {
|
||||
case NonFatal(e) =>
|
||||
logWarning("Fail to convert filters for TimestampBaseAvroKeyGenerator.")
|
||||
partitionFilters
|
||||
}
|
||||
}
|
||||
} else {
|
||||
partitionFilters
|
||||
}
|
||||
}
|
||||
|
||||
private def getQueryPath(options: Map[String, String]) = {
|
||||
new Path(options.getOrElse("path", "'path' option required"))
|
||||
}
|
||||
|
||||
@@ -41,6 +41,7 @@ import org.apache.hudi.execution.bulkinsert.{BulkInsertInternalPartitionerWithRo
|
||||
import org.apache.hudi.hive.{HiveSyncConfig, HiveSyncTool}
|
||||
import org.apache.hudi.index.SparkHoodieIndexFactory
|
||||
import org.apache.hudi.internal.DataSourceInternalWriterHelper
|
||||
import org.apache.hudi.keygen.{TimestampBasedAvroKeyGenerator, TimestampBasedKeyGenerator}
|
||||
import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory
|
||||
import org.apache.hudi.sync.common.AbstractSyncTool
|
||||
import org.apache.hudi.table.BulkInsertPartitioner
|
||||
@@ -91,6 +92,9 @@ object HoodieSparkSqlWriter {
|
||||
validateTableConfig(sqlContext.sparkSession, optParams, tableConfig)
|
||||
|
||||
val (parameters, hoodieConfig) = mergeParamsAndGetHoodieConfig(optParams, tableConfig)
|
||||
val originKeyGeneratorClassName = HoodieWriterUtils.getOriginKeyGenerator(parameters)
|
||||
val timestampKeyGeneratorConfigs = extractConfigsRelatedToTimestmapBasedKeyGenerator(
|
||||
originKeyGeneratorClassName, parameters)
|
||||
val databaseName = hoodieConfig.getStringOrDefault(HoodieTableConfig.DATABASE_NAME, "")
|
||||
val tblName = hoodieConfig.getStringOrThrow(HoodieWriteConfig.TBL_NAME,
|
||||
s"'${HoodieWriteConfig.TBL_NAME.key}' must be set.").trim
|
||||
@@ -153,7 +157,8 @@ object HoodieSparkSqlWriter {
|
||||
.setPartitionFields(partitionColumns)
|
||||
.setPopulateMetaFields(populateMetaFields)
|
||||
.setRecordKeyFields(hoodieConfig.getString(RECORDKEY_FIELD))
|
||||
.setKeyGeneratorClassProp(HoodieWriterUtils.getOriginKeyGenerator(parameters))
|
||||
.setKeyGeneratorClassProp(originKeyGeneratorClassName)
|
||||
.set(timestampKeyGeneratorConfigs)
|
||||
.setHiveStylePartitioningEnable(hoodieConfig.getBoolean(HIVE_STYLE_PARTITIONING))
|
||||
.setUrlEncodePartitioning(hoodieConfig.getBoolean(URL_ENCODE_PARTITIONING))
|
||||
.setCommitTimezone(HoodieTimelineTimeZone.valueOf(hoodieConfig.getStringOrDefault(HoodieTableConfig.TIMELINE_TIMEZONE)))
|
||||
@@ -757,4 +762,14 @@ object HoodieSparkSqlWriter {
|
||||
val params = mergedParams.toMap
|
||||
(params, HoodieWriterUtils.convertMapToHoodieConfig(params))
|
||||
}
|
||||
|
||||
private def extractConfigsRelatedToTimestmapBasedKeyGenerator(keyGenerator: String,
|
||||
params: Map[String, String]): Map[String, String] = {
|
||||
if (keyGenerator.equals(classOf[TimestampBasedKeyGenerator].getCanonicalName) ||
|
||||
keyGenerator.equals(classOf[TimestampBasedAvroKeyGenerator].getCanonicalName)) {
|
||||
params.filterKeys(HoodieTableConfig.PERSISTED_CONFIG_LIST.contains)
|
||||
} else {
|
||||
Map.empty
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -187,10 +187,12 @@ class MergeOnReadSnapshotRelation(sqlContext: SQLContext,
|
||||
val partitionFilters = filters.filter(f => f.references.forall(p => partitionColumns.contains(p)))
|
||||
val partitionFilterExpression =
|
||||
HoodieSparkUtils.convertToCatalystExpressions(partitionFilters, tableStructSchema)
|
||||
val convertedPartitionFilterExpression =
|
||||
HoodieFileIndex.convertFilterForTimestampKeyGenerator(metaClient, partitionFilterExpression.toSeq)
|
||||
|
||||
// If convert success to catalyst expression, use the partition prune
|
||||
val fileSlices = if (partitionFilterExpression.isDefined) {
|
||||
hoodieFileIndex.listFileSlices(Seq(partitionFilterExpression.get))
|
||||
val fileSlices = if (convertedPartitionFilterExpression.nonEmpty) {
|
||||
hoodieFileIndex.listFileSlices(convertedPartitionFilterExpression)
|
||||
} else {
|
||||
hoodieFileIndex.listFileSlices(Seq.empty[Expression])
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user