[HUDI-3221] Support querying a table as of a savepoint (#4720)
This commit is contained in:
@@ -20,14 +20,15 @@ package org.apache.spark.sql.hudi.analysis
|
||||
import org.apache.hudi.DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL
|
||||
import org.apache.hudi.common.model.HoodieRecord
|
||||
import org.apache.hudi.common.util.ReflectionUtils
|
||||
import org.apache.hudi.{HoodieSparkUtils, SparkAdapterSupport}
|
||||
import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedStar}
|
||||
import org.apache.hudi.{DataSourceReadOptions, HoodieSparkUtils, SparkAdapterSupport}
|
||||
import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedRelation, UnresolvedStar}
|
||||
import org.apache.spark.sql.catalyst.catalog.{CatalogUtils, HoodieCatalogTable}
|
||||
import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeReference, Expression, GenericInternalRow, Literal, NamedExpression}
|
||||
import org.apache.spark.sql.catalyst.plans.Inner
|
||||
import org.apache.spark.sql.catalyst.plans.logical._
|
||||
import org.apache.spark.sql.catalyst.rules.Rule
|
||||
import org.apache.spark.sql.execution.command._
|
||||
import org.apache.spark.sql.execution.datasources.{CreateTable, LogicalRelation}
|
||||
import org.apache.spark.sql.execution.datasources.{CreateTable, DataSource, LogicalRelation}
|
||||
import org.apache.spark.sql.hudi.HoodieSqlCommonUtils.{getTableIdentifier, removeMetaFields}
|
||||
import org.apache.spark.sql.hudi.HoodieSqlUtils._
|
||||
import org.apache.spark.sql.hudi.command._
|
||||
@@ -113,6 +114,7 @@ case class HoodieAnalysis(sparkSession: SparkSession) extends Rule[LogicalPlan]
|
||||
case _ =>
|
||||
l
|
||||
}
|
||||
|
||||
// Convert to CreateHoodieTableAsSelectCommand
|
||||
case CreateTable(table, mode, Some(query))
|
||||
if query.resolved && sparkAdapter.isHoodieTable(table) =>
|
||||
@@ -396,6 +398,37 @@ case class HoodieResolveReferences(sparkSession: SparkSession) extends Rule[Logi
|
||||
l
|
||||
}
|
||||
|
||||
case l if sparkAdapter.isRelationTimeTravel(l) =>
|
||||
val (plan: UnresolvedRelation, timestamp, version) =
|
||||
sparkAdapter.getRelationTimeTravel(l).get
|
||||
|
||||
if (timestamp.isEmpty && version.nonEmpty) {
|
||||
throw new AnalysisException(
|
||||
"version expression is not supported for time travel")
|
||||
}
|
||||
|
||||
val tableIdentifier = sparkAdapter.toTableIdentifier(plan)
|
||||
if (sparkAdapter.isHoodieTable(tableIdentifier, sparkSession)) {
|
||||
val hoodieCatalogTable = HoodieCatalogTable(sparkSession, tableIdentifier)
|
||||
val table = hoodieCatalogTable.table
|
||||
val pathOption = table.storage.locationUri.map("path" -> CatalogUtils.URIToString(_))
|
||||
val instantOption = Map(
|
||||
DataSourceReadOptions.TIME_TRAVEL_AS_OF_INSTANT.key -> timestamp.get.toString())
|
||||
val dataSource =
|
||||
DataSource(
|
||||
sparkSession,
|
||||
userSpecifiedSchema = if (table.schema.isEmpty) None else Some(table.schema),
|
||||
partitionColumns = table.partitionColumnNames,
|
||||
bucketSpec = table.bucketSpec,
|
||||
className = table.provider.get,
|
||||
options = table.storage.properties ++ pathOption ++ instantOption,
|
||||
catalogTable = Some(table))
|
||||
|
||||
LogicalRelation(dataSource.resolveRelation(checkFilesExist = false), table)
|
||||
} else {
|
||||
l
|
||||
}
|
||||
|
||||
case p => p
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,241 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.spark.sql.hudi
|
||||
|
||||
import org.apache.hudi.HoodieSparkUtils
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient
|
||||
|
||||
/**
 * SQL-level tests for Hudi time travel queries using Spark's
 * `TIMESTAMP AS OF '<instant>'` syntax (HUDI-3221).
 *
 * Every test is guarded by `HoodieSparkUtils.gteqSpark3_2` because the
 * `TIMESTAMP AS OF` relation syntax is only parsed by Spark 3.2+; on older
 * Spark versions the test body is skipped entirely.
 */
class TestTimeTravelTable extends TestHoodieSqlBase {

  test("Test Insert and Update Record with time travel") {
    if (HoodieSparkUtils.gteqSpark3_2) {
      withTempDir { tmp =>
        val tableName1 = generateTableName
        spark.sql(
          s"""
             |create table $tableName1 (
             | id int,
             | name string,
             | price double,
             | ts long
             |) using hudi
             | tblproperties (
             | type = 'cow',
             | primaryKey = 'id',
             | preCombineField = 'ts'
             | )
             | location '${tmp.getCanonicalPath}/$tableName1'
       """.stripMargin)

        spark.sql(s"insert into $tableName1 values(1, 'a1', 10, 1000)")

        // Read the timeline directly so we can capture the commit instant
        // produced by the first insert; that instant is the time-travel target.
        val metaClient1 = HoodieTableMetaClient.builder()
          .setBasePath(s"${tmp.getCanonicalPath}/$tableName1")
          .setConf(spark.sessionState.newHadoopConf())
          .build()

        val instant1 = metaClient1.getActiveTimeline.getAllCommitsTimeline
          .lastInstant().get().getTimestamp

        // Second write updates the same key (id = 1) via preCombine on ts.
        spark.sql(s"insert into $tableName1 values(1, 'a2', 20, 2000)")

        // A plain snapshot query must see only the latest version of the record.
        checkAnswer(s"select id, name, price, ts from $tableName1")(
          Seq(1, "a2", 20.0, 2000)
        )

        // Time traveling to instant1 must return the pre-update record.
        checkAnswer(
          s"select id, name, price, ts from $tableName1 TIMESTAMP AS OF '$instant1'")(
          Seq(1, "a1", 10.0, 1000)
        )
      }
    }
  }

  test("Test Insert Into Records with time travel To new Table") {
    if (HoodieSparkUtils.gteqSpark3_2) {
      withTempDir { tmp =>
        // Create Non-Partitioned table that will serve as the time-travel source.
        val tableName1 = generateTableName
        spark.sql(
          s"""
             |create table $tableName1 (
             | id int,
             | name string,
             | price double,
             | ts long
             |) using hudi
             | tblproperties (
             | type = 'cow',
             | primaryKey = 'id',
             | preCombineField = 'ts'
             | )
             | location '${tmp.getCanonicalPath}/$tableName1'
       """.stripMargin)

        spark.sql(s"insert into $tableName1 values(1, 'a1', 10, 1000)")

        // Capture the instant of the first (and only) commit on the source table.
        val metaClient1 = HoodieTableMetaClient.builder()
          .setBasePath(s"${tmp.getCanonicalPath}/$tableName1")
          .setConf(spark.sessionState.newHadoopConf())
          .build()

        val instant1 = metaClient1.getActiveTimeline.getAllCommitsTimeline
          .lastInstant().get().getTimestamp

        // Create a partitioned table as the insert target.
        val tableName2 = generateTableName
        spark.sql(
          s"""
             |create table $tableName2 (
             | id int,
             | name string,
             | price double,
             | ts long,
             | dt string
             |) using hudi
             | tblproperties (primaryKey = 'id')
             | partitioned by (dt)
             | location '${tmp.getCanonicalPath}/$tableName2'
       """.stripMargin)

        // Insert into dynamic partition, selecting from a time-travel relation.
        spark.sql(
          s"""
             | insert into $tableName2
             | select id, name, price, ts, '2022-02-14' as dt
             | from $tableName1 TIMESTAMP AS OF '$instant1'
       """.stripMargin)
        checkAnswer(s"select id, name, price, ts, dt from $tableName2")(
          Seq(1, "a1", 10.0, 1000, "2022-02-14")
        )

        // Insert into static partition, again sourced via time travel.
        spark.sql(
          s"""
             | insert into $tableName2 partition(dt = '2022-02-15')
             | select 2 as id, 'a2' as name, price, ts
             | from $tableName1 TIMESTAMP AS OF '$instant1'
       """.stripMargin)
        checkAnswer(
          s"select id, name, price, ts, dt from $tableName2")(
          Seq(1, "a1", 10.0, 1000, "2022-02-14"),
          Seq(2, "a2", 10.0, 1000, "2022-02-15")
        )
      }
    }
  }

  test("Test Two Table's Union Join with time travel") {
    if (HoodieSparkUtils.gteqSpark3_2) {
      withTempDir { tmp =>
        // Exercise both table types so the MOR read path is covered too.
        Seq("cow", "mor").foreach { tableType =>
          val tableName = generateTableName

          val basePath = tmp.getCanonicalPath
          val tableName1 = tableName + "_1"
          val tableName2 = tableName + "_2"
          val path1 = s"$basePath/$tableName1"
          val path2 = s"$basePath/$tableName2"

          spark.sql(
            s"""
               |create table $tableName1 (
               | id int,
               | name string,
               | price double,
               | ts long
               |) using hudi
               | tblproperties (
               | type = '$tableType',
               | primaryKey = 'id',
               | preCombineField = 'ts'
               | )
               | location '$path1'
       """.stripMargin)

          spark.sql(
            s"""
               |create table $tableName2 (
               | id int,
               | name string,
               | price double,
               | ts long
               |) using hudi
               | tblproperties (
               | type = '$tableType',
               | primaryKey = 'id',
               | preCombineField = 'ts'
               | )
               | location '$path2'
       """.stripMargin)

          spark.sql(s"insert into $tableName1 values(1, 'a1', 10, 1000)")
          spark.sql(s"insert into $tableName1 values(2, 'a2', 20, 1000)")

          // Sanity-check the snapshot state of each table before time traveling.
          // (The original test asserted this query twice verbatim; the duplicate
          // assertion was removed.)
          checkAnswer(s"select id, name, price, ts from $tableName1")(
            Seq(1, "a1", 10.0, 1000),
            Seq(2, "a2", 20.0, 1000)
          )

          spark.sql(s"insert into $tableName2 values(3, 'a3', 10, 1000)")
          spark.sql(s"insert into $tableName2 values(4, 'a4', 20, 1000)")

          checkAnswer(s"select id, name, price, ts from $tableName2")(
            Seq(3, "a3", 10.0, 1000),
            Seq(4, "a4", 20.0, 1000)
          )

          // Capture the latest commit instant of each table independently;
          // a union query may time travel each relation to a different instant.
          val metaClient1 = HoodieTableMetaClient.builder()
            .setBasePath(path1)
            .setConf(spark.sessionState.newHadoopConf())
            .build()

          val metaClient2 = HoodieTableMetaClient.builder()
            .setBasePath(path2)
            .setConf(spark.sessionState.newHadoopConf())
            .build()

          val instant1 = metaClient1.getActiveTimeline.getAllCommitsTimeline
            .lastInstant().get().getTimestamp

          val instant2 = metaClient2.getActiveTimeline.getAllCommitsTimeline
            .lastInstant().get().getTimestamp

          // Union of two independently time-traveled relations with filters.
          val sql =
            s"""
               |select id, name, price, ts from $tableName1 TIMESTAMP AS OF '$instant1' where id=1
               |union
               |select id, name, price, ts from $tableName2 TIMESTAMP AS OF '$instant2' where id>1
               |""".stripMargin

          checkAnswer(sql)(
            Seq(1, "a1", 10.0, 1000),
            Seq(3, "a3", 10.0, 1000),
            Seq(4, "a4", 20.0, 1000)
          )
        }
      }
    }
  }
}
|
||||
Reference in New Issue
Block a user