1
0

[HUDI-2243] Support Time Travel Query For Hoodie Table (#3360)

This commit is contained in:
pengzhiwei
2021-08-08 07:07:22 +08:00
committed by GitHub
parent 55d2e786db
commit 32a50d8ddb
5 changed files with 296 additions and 12 deletions

View File

@@ -18,14 +18,13 @@
package org.apache.hudi
import java.util.Properties
import scala.collection.JavaConverters._
import org.apache.hadoop.fs.{FileStatus, Path}
import org.apache.hudi.DataSourceReadOptions.{QUERY_TYPE, QUERY_TYPE_SNAPSHOT_OPT_VAL}
import org.apache.hudi.client.common.HoodieSparkEngineContext
import org.apache.hudi.common.config.HoodieMetadataConfig
import org.apache.hudi.common.fs.FSUtils
import org.apache.hudi.common.model.{FileSlice, HoodieLogFile}
import org.apache.hudi.common.model.FileSlice
import org.apache.hudi.common.model.HoodieTableType.MERGE_ON_READ
import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver}
import org.apache.hudi.common.table.view.{FileSystemViewStorageConfig, HoodieTableFileSystemView}
@@ -37,6 +36,7 @@ import org.apache.spark.sql.avro.SchemaConverters
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, BoundReference, Expression, InterpretedPredicate}
import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils}
import org.apache.spark.sql.execution.datasources.{FileIndex, FileStatusCache, NoopCache, PartitionDirectory}
import org.apache.spark.sql.hudi.HoodieSqlUtils
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.StructType
import org.apache.spark.unsafe.types.UTF8String
@@ -81,6 +81,9 @@ case class HoodieFileIndex(
private val tableType = metaClient.getTableType
private val specifiedQueryInstant = options.get(DataSourceReadOptions.TIME_TRAVEL_AS_OF_INSTANT.key)
.map(HoodieSqlUtils.formatQueryInstant)
/**
* Get the schema of the table.
*/
@@ -214,15 +217,23 @@ case class HoodieFileIndex(
metaClient.reloadActiveTimeline()
val activeInstants = metaClient.getActiveTimeline.getCommitsTimeline.filterCompletedInstants
val latestInstant = activeInstants.lastInstant()
fileSystemView = new HoodieTableFileSystemView(metaClient, activeInstants, allFiles)
val queryInstant = if (specifiedQueryInstant.isDefined) {
specifiedQueryInstant
} else if (latestInstant.isPresent) {
Some(latestInstant.get.getTimestamp)
} else {
None
}
(tableType, queryType) match {
case (MERGE_ON_READ, QUERY_TYPE_SNAPSHOT_OPT_VAL) =>
// Fetch and store latest base and log files, and their sizes
cachedAllInputFileSlices = partitionFiles.map(p => {
val latestSlices = if (activeInstants.lastInstant().isPresent) {
fileSystemView.getLatestMergedFileSlicesBeforeOrOn(p._1.partitionPath,
activeInstants.lastInstant().get().getTimestamp).iterator().asScala.toSeq
val latestSlices = if (latestInstant.isPresent) {
fileSystemView.getLatestMergedFileSlicesBeforeOrOn(p._1.partitionPath, queryInstant.get)
.iterator().asScala.toSeq
} else {
Seq()
}
@@ -238,7 +249,12 @@ case class HoodieFileIndex(
case (_, _) =>
// Fetch and store latest base files and its sizes
cachedAllInputFileSlices = partitionFiles.map(p => {
(p._1, fileSystemView.getLatestFileSlices(p._1.partitionPath).iterator().asScala.toSeq)
val fileSlices = specifiedQueryInstant
.map(instant =>
fileSystemView.getLatestFileSlicesBeforeOrOn(p._1.partitionPath, instant, true))
.getOrElse(fileSystemView.getLatestFileSlices(p._1.partitionPath))
.iterator().asScala.toSeq
(p._1, fileSlices)
})
cachedFileSize = cachedAllInputFileSlices.values.flatten.map(_.getBaseFile.get().getFileLen).sum
}
@@ -246,7 +262,7 @@ case class HoodieFileIndex(
// If the partition value contains InternalRow.empty, we query it as a non-partitioned table.
queryAsNonePartitionedTable = partitionFiles.keys.exists(p => p.values == InternalRow.empty)
val flushSpend = System.currentTimeMillis() - startTime
logInfo(s"Refresh for table ${metaClient.getTableConfig.getTableName}," +
logInfo(s"Refresh table ${metaClient.getTableConfig.getTableName}," +
s" spend: $flushSpend ms")
}

View File

@@ -32,6 +32,7 @@ import org.apache.spark.sql.avro.SchemaConverters
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.{FileStatusCache, PartitionedFile}
import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
import org.apache.spark.sql.hudi.HoodieSqlUtils
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.sources.{BaseRelation, Filter, PrunedFilteredScan}
import org.apache.spark.sql.types.StructType
@@ -97,6 +98,9 @@ class MergeOnReadSnapshotRelation(val sqlContext: SQLContext,
override def needConversion: Boolean = false
private val specifiedQueryInstant = optParams.get(DataSourceReadOptions.TIME_TRAVEL_AS_OF_INSTANT.key)
.map(HoodieSqlUtils.formatQueryInstant)
override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = {
log.debug(s" buildScan requiredColumns = ${requiredColumns.mkString(",")}")
log.debug(s" buildScan filters = ${filters.mkString(",")}")
@@ -159,7 +163,7 @@ class MergeOnReadSnapshotRelation(val sqlContext: SQLContext,
if (!lastInstant.isPresent) { // Return empty list if the table has no commit
List.empty
} else {
val latestCommit = lastInstant.get().getTimestamp
val queryInstant = specifiedQueryInstant.getOrElse(lastInstant.get().getTimestamp)
val baseAndLogsList = HoodieRealtimeInputFormatUtils.groupLogsByBaseFile(conf, partitionPaths.asJava).asScala
val fileSplits = baseAndLogsList.map(kv => {
val baseFile = kv.getLeft
@@ -174,7 +178,7 @@ class MergeOnReadSnapshotRelation(val sqlContext: SQLContext,
} else {
None
}
HoodieMergeOnReadFileSplit(baseDataPath, logPaths, latestCommit,
HoodieMergeOnReadFileSplit(baseDataPath, logPaths, queryInstant,
metaClient.getBasePath, maxCompactionMemoryInBytes, mergeType)
}).toList
fileSplits
@@ -203,8 +207,9 @@ class MergeOnReadSnapshotRelation(val sqlContext: SQLContext,
List.empty[HoodieMergeOnReadFileSplit]
} else {
val fileSplits = fileSlices.values.flatten.map(fileSlice => {
val latestCommit = metaClient.getActiveTimeline.getCommitsTimeline
val latestInstant = metaClient.getActiveTimeline.getCommitsTimeline
.filterCompletedInstants.lastInstant().get().getTimestamp
val queryInstant = specifiedQueryInstant.getOrElse(latestInstant)
val partitionedFile = if (fileSlice.getBaseFile.isPresent) {
val baseFile = fileSlice.getBaseFile.get()
@@ -217,7 +222,7 @@ class MergeOnReadSnapshotRelation(val sqlContext: SQLContext,
val logPaths = fileSlice.getLogFiles.sorted(HoodieLogFile.getLogFileComparator).iterator().asScala
.map(logFile => MergeOnReadSnapshotRelation.getFilePath(logFile.getPath)).toList
val logPathsOptional = if (logPaths.isEmpty) Option.empty else Option(logPaths)
HoodieMergeOnReadFileSplit(partitionedFile, logPathsOptional, latestCommit, metaClient.getBasePath,
HoodieMergeOnReadFileSplit(partitionedFile, logPathsOptional, queryInstant, metaClient.getBasePath,
maxCompactionMemoryInBytes, mergeType)
}).toList
fileSplits

View File

@@ -19,12 +19,13 @@ package org.apache.spark.sql.hudi
import scala.collection.JavaConverters._
import java.net.URI
import java.util.Locale
import java.util.{Date, Locale}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hudi.SparkAdapterSupport
import org.apache.hudi.common.model.HoodieRecord
import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver}
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline
import org.apache.spark.SPARK_VERSION
import org.apache.spark.sql.avro.SchemaConverters
import org.apache.spark.sql.{Column, DataFrame, SparkSession}
@@ -37,9 +38,12 @@ import org.apache.spark.sql.execution.datasources.LogicalRelation
import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf}
import org.apache.spark.sql.types.{DataType, NullType, StringType, StructField, StructType}
import java.text.SimpleDateFormat
import scala.collection.immutable.Map
object HoodieSqlUtils extends SparkAdapterSupport {
private val defaultDateTimeFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
private val defaultDateFormat = new SimpleDateFormat("yyyy-MM-dd")
/**
 * Returns true if the catalog table's data source provider is "hudi"
 * (compared case-insensitively); a missing provider yields false.
 */
def isHoodieTable(table: CatalogTable): Boolean = {
  // Use Option.exists instead of `.orNull == "hudi"`: same result for both
  // Some and None, but avoids materializing and comparing against null.
  table.provider.exists(_.toLowerCase(Locale.ROOT) == "hudi")
}
@@ -224,4 +228,26 @@ object HoodieSqlUtils extends SparkAdapterSupport {
/**
 * Whether this Spark session is configured with the Hive catalog
 * implementation (i.e. `spark.sql.catalogImplementation` == "hive").
 */
def isEnableHive(sparkSession: SparkSession): Boolean = {
  val catalogImpl = sparkSession.sessionState.conf.getConf(StaticSQLConf.CATALOG_IMPLEMENTATION)
  catalogImpl == "hive"
}
/**
 * Normalize a user-supplied query instant into Hudi's commit time format
 * ("yyyyMMddHHmmss") for time travel queries. Three input formats are
 * supported, distinguished by string length:
 *   1. "yyyy-MM-dd HH:mm:ss" (length 19) — parsed and reformatted
 *   2. "yyyyMMddHHmmss"      (length 14) — already the commit format; only
 *      validated here and returned unchanged
 *   3. "yyyy-MM-dd"          (length 10) — converted to "yyyyMMdd000000"
 *
 * @param queryInstant the raw instant string from the query option
 * @return the instant in commit time format
 * @throws IllegalArgumentException if the input matches none of the
 *         supported formats
 */
def formatQueryInstant(queryInstant: String): String = {
  // SimpleDateFormat is not thread-safe, so build the parsers locally rather
  // than sharing object-level instances across concurrently planned queries.
  // NOTE(review): HoodieActiveTimeline.COMMIT_FORMATTER is still a shared
  // formatter — confirm accesses here are single-threaded or synchronized.
  queryInstant.length match {
    case 19 => // yyyy-MM-dd HH:mm:ss
      HoodieActiveTimeline.COMMIT_FORMATTER.format(
        new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").parse(queryInstant))
    case 14 => // yyyyMMddHHmmss: parse only to validate, then pass through
      HoodieActiveTimeline.COMMIT_FORMATTER.parse(queryInstant)
      queryInstant
    case 10 => // yyyy-MM-dd, interpreted as midnight of that day
      HoodieActiveTimeline.COMMIT_FORMATTER.format(
        new SimpleDateFormat("yyyy-MM-dd").parse(queryInstant))
    case _ =>
      // Fixed: the old message advertised a non-existent format
      // 'yyyy-MM-dd: HH:mm:ss' (spurious colon) and lacked a separator space.
      throw new IllegalArgumentException(s"Unsupported query instant time format: $queryInstant, " +
        "supported time formats are: 'yyyy-MM-dd HH:mm:ss' or 'yyyy-MM-dd' or 'yyyyMMddHHmmss'")
  }
}
}