1
0

[HUDI-3221] Support querying a table as of a savepoint (#4720)

This commit is contained in:
ForwardXu
2022-03-09 02:02:34 +08:00
committed by GitHub
parent 575bc63468
commit 08fd80c913
16 changed files with 5903 additions and 6 deletions

View File

@@ -27,7 +27,9 @@ import org.apache.spark.sql.hudi.SparkAdapter
trait SparkAdapterSupport { trait SparkAdapterSupport {
lazy val sparkAdapter: SparkAdapter = { lazy val sparkAdapter: SparkAdapter = {
val adapterClass = if (HoodieSparkUtils.isSpark3) { val adapterClass = if (HoodieSparkUtils.gteqSpark3_2) {
"org.apache.spark.sql.adapter.Spark3_2Adapter"
} else if (HoodieSparkUtils.isSpark3_0 || HoodieSparkUtils.isSpark3_1) {
"org.apache.spark.sql.adapter.Spark3Adapter" "org.apache.spark.sql.adapter.Spark3Adapter"
} else { } else {
"org.apache.spark.sql.adapter.Spark2Adapter" "org.apache.spark.sql.adapter.Spark2Adapter"

View File

@@ -85,6 +85,16 @@ trait SparkAdapter extends Serializable {
def getInsertIntoChildren(plan: LogicalPlan): def getInsertIntoChildren(plan: LogicalPlan):
Option[(LogicalPlan, Map[String, Option[String]], LogicalPlan, Boolean, Boolean)] Option[(LogicalPlan, Map[String, Option[String]], LogicalPlan, Boolean, Boolean)]
/**
* if the logical plan is a TimeTravelRelation LogicalPlan.
*/
def isRelationTimeTravel(plan: LogicalPlan): Boolean
/**
* Get the member of the TimeTravelRelation LogicalPlan.
*/
def getRelationTimeTravel(plan: LogicalPlan): Option[(LogicalPlan, Option[Expression], Option[String])]
/** /**
* Create a Insert Into LogicalPlan. * Create a Insert Into LogicalPlan.
*/ */

View File

@@ -36,3 +36,21 @@ file that supports spark sql on spark 2.x version.
has no class since hudi only supports spark 2.4.4 version, and it acts as the placeholder when packaging hudi-spark-bundle module. has no class since hudi only supports spark 2.4.4 version, and it acts as the placeholder when packaging hudi-spark-bundle module.
* hudi-spark3-common is the module that contains the code that would be reused between spark3.x versions. * hudi-spark3-common is the module that contains the code that would be reused between spark3.x versions.
* hudi-spark-common is the module that contains the code that would be reused between spark2.x and spark3.x versions. * hudi-spark-common is the module that contains the code that would be reused between spark2.x and spark3.x versions.
## Description of Time Travel
* `HoodieSpark3_2ExtendedSqlAstBuilder` has comments in the code forked from Spark 3.2's `org.apache.spark.sql.catalyst.parser.AstBuilder`, plus an additional `withTimeTravel` method.
* `SqlBase.g4` has comments in the code forked from Spark 3.2's parser, and adds the Spark SQL syntax `TIMESTAMP AS OF` and `VERSION AS OF`.
### Time Travel Support Spark Version:
| version | support |
| ------ | ------- |
| 2.4.x | No |
| 3.0.x | No |
| 3.1.2 | No |
| 3.2.0 | Yes |
### About upgrading Time Travel:
Spark 3.3 supports time travel syntax natively; see [SPARK-37219](https://issues.apache.org/jira/browse/SPARK-37219).
Once Spark 3.3 is released, the following files will be removed:
* hudi-spark3's `HoodieSpark3_2ExtendedSqlAstBuilder.scala`, `HoodieSpark3_2ExtendedSqlParser.scala`, `TimeTravelRelation.scala`, `SqlBase.g4`, `HoodieSqlBase.g4`

View File

@@ -20,14 +20,15 @@ package org.apache.spark.sql.hudi.analysis
import org.apache.hudi.DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL import org.apache.hudi.DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL
import org.apache.hudi.common.model.HoodieRecord import org.apache.hudi.common.model.HoodieRecord
import org.apache.hudi.common.util.ReflectionUtils import org.apache.hudi.common.util.ReflectionUtils
import org.apache.hudi.{HoodieSparkUtils, SparkAdapterSupport} import org.apache.hudi.{DataSourceReadOptions, HoodieSparkUtils, SparkAdapterSupport}
import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedStar} import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedRelation, UnresolvedStar}
import org.apache.spark.sql.catalyst.catalog.{CatalogUtils, HoodieCatalogTable}
import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeReference, Expression, GenericInternalRow, Literal, NamedExpression} import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeReference, Expression, GenericInternalRow, Literal, NamedExpression}
import org.apache.spark.sql.catalyst.plans.Inner import org.apache.spark.sql.catalyst.plans.Inner
import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.command._ import org.apache.spark.sql.execution.command._
import org.apache.spark.sql.execution.datasources.{CreateTable, LogicalRelation} import org.apache.spark.sql.execution.datasources.{CreateTable, DataSource, LogicalRelation}
import org.apache.spark.sql.hudi.HoodieSqlCommonUtils.{getTableIdentifier, removeMetaFields} import org.apache.spark.sql.hudi.HoodieSqlCommonUtils.{getTableIdentifier, removeMetaFields}
import org.apache.spark.sql.hudi.HoodieSqlUtils._ import org.apache.spark.sql.hudi.HoodieSqlUtils._
import org.apache.spark.sql.hudi.command._ import org.apache.spark.sql.hudi.command._
@@ -113,6 +114,7 @@ case class HoodieAnalysis(sparkSession: SparkSession) extends Rule[LogicalPlan]
case _ => case _ =>
l l
} }
// Convert to CreateHoodieTableAsSelectCommand // Convert to CreateHoodieTableAsSelectCommand
case CreateTable(table, mode, Some(query)) case CreateTable(table, mode, Some(query))
if query.resolved && sparkAdapter.isHoodieTable(table) => if query.resolved && sparkAdapter.isHoodieTable(table) =>
@@ -396,6 +398,37 @@ case class HoodieResolveReferences(sparkSession: SparkSession) extends Rule[Logi
l l
} }
case l if sparkAdapter.isRelationTimeTravel(l) =>
val (plan: UnresolvedRelation, timestamp, version) =
sparkAdapter.getRelationTimeTravel(l).get
if (timestamp.isEmpty && version.nonEmpty) {
throw new AnalysisException(
"version expression is not supported for time travel")
}
val tableIdentifier = sparkAdapter.toTableIdentifier(plan)
if (sparkAdapter.isHoodieTable(tableIdentifier, sparkSession)) {
val hoodieCatalogTable = HoodieCatalogTable(sparkSession, tableIdentifier)
val table = hoodieCatalogTable.table
val pathOption = table.storage.locationUri.map("path" -> CatalogUtils.URIToString(_))
val instantOption = Map(
DataSourceReadOptions.TIME_TRAVEL_AS_OF_INSTANT.key -> timestamp.get.toString())
val dataSource =
DataSource(
sparkSession,
userSpecifiedSchema = if (table.schema.isEmpty) None else Some(table.schema),
partitionColumns = table.partitionColumnNames,
bucketSpec = table.bucketSpec,
className = table.provider.get,
options = table.storage.properties ++ pathOption ++ instantOption,
catalogTable = Some(table))
LogicalRelation(dataSource.resolveRelation(checkFilesExist = false), table)
} else {
l
}
case p => p case p => p
} }

View File

@@ -0,0 +1,241 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.hudi
import org.apache.hudi.HoodieSparkUtils
import org.apache.hudi.common.table.HoodieTableMetaClient
/**
 * SQL tests for the `TIMESTAMP AS OF` time travel syntax on Hudi tables.
 *
 * NOTE(review): time travel is only parsed by the Hudi extended parser on
 * Spark 3.2+, hence every test is guarded by `HoodieSparkUtils.gteqSpark3_2`
 * and silently becomes a no-op on older Spark versions.
 */
class TestTimeTravelTable extends TestHoodieSqlBase {
// Basic round trip: write, capture the commit instant, overwrite, then
// verify that querying `TIMESTAMP AS OF` the first instant sees the old row.
test("Test Insert and Update Record with time travel") {
if (HoodieSparkUtils.gteqSpark3_2) {
withTempDir { tmp =>
val tableName1 = generateTableName
spark.sql(
s"""
|create table $tableName1 (
| id int,
| name string,
| price double,
| ts long
|) using hudi
| tblproperties (
| type = 'cow',
| primaryKey = 'id',
| preCombineField = 'ts'
| )
| location '${tmp.getCanonicalPath}/$tableName1'
""".stripMargin)
spark.sql(s"insert into $tableName1 values(1, 'a1', 10, 1000)")
// Read the timeline directly to get the commit timestamp of the first insert.
val metaClient1 = HoodieTableMetaClient.builder()
.setBasePath(s"${tmp.getCanonicalPath}/$tableName1")
.setConf(spark.sessionState.newHadoopConf())
.build()
val instant1 = metaClient1.getActiveTimeline.getAllCommitsTimeline
.lastInstant().get().getTimestamp
// Upsert the same key (id=1), so the latest snapshot only shows 'a2'.
spark.sql(s"insert into $tableName1 values(1, 'a2', 20, 2000)")
checkAnswer(s"select id, name, price, ts from $tableName1")(
Seq(1, "a2", 20.0, 2000)
)
// time travel from instant1
checkAnswer(
s"select id, name, price, ts from $tableName1 TIMESTAMP AS OF '$instant1'")(
Seq(1, "a1", 10.0, 1000)
)
}
}
}
// Verifies a time-travel SELECT can feed an INSERT INTO another table,
// for both dynamic and static partition inserts.
test("Test Insert Into Records with time travel To new Table") {
if (HoodieSparkUtils.gteqSpark3_2) {
withTempDir { tmp =>
// Create Non-Partitioned table
val tableName1 = generateTableName
spark.sql(
s"""
|create table $tableName1 (
| id int,
| name string,
| price double,
| ts long
|) using hudi
| tblproperties (
| type = 'cow',
| primaryKey = 'id',
| preCombineField = 'ts'
| )
| location '${tmp.getCanonicalPath}/$tableName1'
""".stripMargin)
spark.sql(s"insert into $tableName1 values(1, 'a1', 10, 1000)")
// Capture the instant of the only commit on the source table.
val metaClient1 = HoodieTableMetaClient.builder()
.setBasePath(s"${tmp.getCanonicalPath}/$tableName1")
.setConf(spark.sessionState.newHadoopConf())
.build()
val instant1 = metaClient1.getActiveTimeline.getAllCommitsTimeline
.lastInstant().get().getTimestamp
val tableName2 = generateTableName
// Create a partitioned table
spark.sql(
s"""
|create table $tableName2 (
| id int,
| name string,
| price double,
| ts long,
| dt string
|) using hudi
| tblproperties (primaryKey = 'id')
| partitioned by (dt)
| location '${tmp.getCanonicalPath}/$tableName2'
""".stripMargin)
// Insert into dynamic partition
spark.sql(
s"""
| insert into $tableName2
| select id, name, price, ts, '2022-02-14' as dt
| from $tableName1 TIMESTAMP AS OF '$instant1'
""".stripMargin)
checkAnswer(s"select id, name, price, ts, dt from $tableName2")(
Seq(1, "a1", 10.0, 1000, "2022-02-14")
)
// Insert into static partition
spark.sql(
s"""
| insert into $tableName2 partition(dt = '2022-02-15')
| select 2 as id, 'a2' as name, price, ts
| from $tableName1 TIMESTAMP AS OF '$instant1'
""".stripMargin)
checkAnswer(
s"select id, name, price, ts, dt from $tableName2")(
Seq(1, "a1", 10.0, 1000, "2022-02-14"),
Seq(2, "a2", 10.0, 1000, "2022-02-15")
)
}
}
}
// Verifies two time-travel relations can be combined in a single UNION
// query, for both COW and MOR table types.
test("Test Two Table's Union Join with time travel") {
if (HoodieSparkUtils.gteqSpark3_2) {
withTempDir { tmp =>
Seq("cow", "mor").foreach { tableType =>
val tableName = generateTableName
val basePath = tmp.getCanonicalPath
val tableName1 = tableName + "_1"
val tableName2 = tableName + "_2"
val path1 = s"$basePath/$tableName1"
val path2 = s"$basePath/$tableName2"
spark.sql(
s"""
|create table $tableName1 (
| id int,
| name string,
| price double,
| ts long
|) using hudi
| tblproperties (
| type = '$tableType',
| primaryKey = 'id',
| preCombineField = 'ts'
| )
| location '$path1'
""".stripMargin)
spark.sql(
s"""
|create table $tableName2 (
| id int,
| name string,
| price double,
| ts long
|) using hudi
| tblproperties (
| type = '$tableType',
| primaryKey = 'id',
| preCombineField = 'ts'
| )
| location '$path2'
""".stripMargin)
spark.sql(s"insert into $tableName1 values(1, 'a1', 10, 1000)")
spark.sql(s"insert into $tableName1 values(2, 'a2', 20, 1000)")
checkAnswer(s"select id, name, price, ts from $tableName1")(
Seq(1, "a1", 10.0, 1000),
Seq(2, "a2", 20.0, 1000)
)
checkAnswer(s"select id, name, price, ts from $tableName1")(
Seq(1, "a1", 10.0, 1000),
Seq(2, "a2", 20.0, 1000)
)
spark.sql(s"insert into $tableName2 values(3, 'a3', 10, 1000)")
spark.sql(s"insert into $tableName2 values(4, 'a4', 20, 1000)")
checkAnswer(s"select id, name, price, ts from $tableName2")(
Seq(3, "a3", 10.0, 1000),
Seq(4, "a4", 20.0, 1000)
)
// Capture the latest commit instant of each table independently.
val metaClient1 = HoodieTableMetaClient.builder()
.setBasePath(path1)
.setConf(spark.sessionState.newHadoopConf())
.build()
val metaClient2 = HoodieTableMetaClient.builder()
.setBasePath(path2)
.setConf(spark.sessionState.newHadoopConf())
.build()
val instant1 = metaClient1.getActiveTimeline.getAllCommitsTimeline
.lastInstant().get().getTimestamp
val instant2 = metaClient2.getActiveTimeline.getAllCommitsTimeline
.lastInstant().get().getTimestamp
// Each branch of the union time-travels its own table to its own instant.
val sql =
s"""
|select id, name, price, ts from $tableName1 TIMESTAMP AS OF '$instant1' where id=1
|union
|select id, name, price, ts from $tableName2 TIMESTAMP AS OF '$instant2' where id>1
|""".stripMargin
checkAnswer(sql)(
Seq(1, "a1", 10.0, 1000),
Seq(3, "a3", 10.0, 1000),
Seq(4, "a4", 20.0, 1000)
)
}
}
}
}
}

View File

@@ -87,6 +87,7 @@ assignmentList
assignment assignment
: key=qualifiedName EQ value=expression : key=qualifiedName EQ value=expression
; ;
qualifiedNameList qualifiedNameList
: qualifiedName (',' qualifiedName)* : qualifiedName (',' qualifiedName)*
; ;

View File

@@ -137,4 +137,18 @@ class Spark2Adapter extends SparkAdapter {
closePartition() closePartition()
partitions.toSeq partitions.toSeq
} }
/**
* if the logical plan is a TimeTravelRelation LogicalPlan.
*/
override def isRelationTimeTravel(plan: LogicalPlan): Boolean = {
false
}
/**
* Get the member of the TimeTravelRelation LogicalPlan.
*/
override def getRelationTimeTravel(plan: LogicalPlan): Option[(LogicalPlan, Option[Expression], Option[String])] = {
throw new IllegalStateException(s"Should not call getRelationTimeTravel for spark2")
}
} }

View File

@@ -123,4 +123,18 @@ class Spark3Adapter extends SparkAdapter {
case _=> false case _=> false
} }
} }
/**
* if the logical plan is a TimeTravelRelation LogicalPlan.
*/
override def isRelationTimeTravel(plan: LogicalPlan): Boolean = {
false
}
/**
* Get the member of the TimeTravelRelation LogicalPlan.
*/
override def getRelationTimeTravel(plan: LogicalPlan): Option[(LogicalPlan, Option[Expression], Option[String])] = {
throw new IllegalStateException(s"Should not call getRelationTimeTravel for spark3.1.x")
}
} }

View File

@@ -144,6 +144,24 @@
<groupId>org.jacoco</groupId> <groupId>org.jacoco</groupId>
<artifactId>jacoco-maven-plugin</artifactId> <artifactId>jacoco-maven-plugin</artifactId>
</plugin> </plugin>
<plugin>
<groupId>org.antlr</groupId>
<artifactId>antlr4-maven-plugin</artifactId>
<version>${antlr.version}</version>
<executions>
<execution>
<goals>
<goal>antlr4</goal>
</goals>
</execution>
</executions>
<configuration>
<visitor>true</visitor>
<listener>true</listener>
<sourceDirectory>../hudi-spark3/src/main/antlr4</sourceDirectory>
<libDirectory>../hudi-spark3/src/main/antlr4/imports</libDirectory>
</configuration>
</plugin>
</plugins> </plugins>
</build> </build>

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,33 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Hudi's extended SQL grammar. It imports the forked Spark SqlBase grammar
// (which carries the TIMESTAMP AS OF / VERSION AS OF additions) and only
// overrides the statement entry points it cares about.
grammar HoodieSqlBase;
// Reuse every rule from the forked Spark 3.2 SqlBase grammar.
import SqlBase;
// Entry point: a single SQL statement followed by end-of-input.
singleStatement
: statement EOF
;
// Recognized statement shapes; anything else falls through to #passThrough,
// which the extended parser delegates back to Spark's own parser.
statement
: query #queryStatement
| ctes? dmlStatementNoWith #dmlStatement
| createTableHeader ('(' colTypeList ')')? tableProvider?
createTableClauses
(AS? query)? #createTable
| .*? #passThrough
;

View File

@@ -0,0 +1,54 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.adapter
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.parser.ParserInterface
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.parser.HoodieSpark3_2ExtendedSqlParser
/**
 * Spark 3.2 specific adapter. On top of the shared Spark3 adapter it adds
 * support for the time-travel logical plan and wires in the Hudi extended
 * SQL parser that understands `TIMESTAMP AS OF` / `VERSION AS OF`.
 */
class Spark3_2Adapter extends Spark3Adapter {

  /** Returns true iff the given plan is a [[TimeTravelRelation]]. */
  override def isRelationTimeTravel(plan: LogicalPlan): Boolean = plan match {
    case _: TimeTravelRelation => true
    case _ => false
  }

  /**
   * Destructures a [[TimeTravelRelation]] into (table, timestamp, version);
   * returns None for any other plan node.
   */
  override def getRelationTimeTravel(plan: LogicalPlan): Option[(LogicalPlan, Option[Expression], Option[String])] = {
    plan match {
      case TimeTravelRelation(table, timestamp, version) =>
        Some((table, timestamp, version))
      case _ =>
        None
    }
  }

  /** Supplies the factory for Hudi's extended parser on Spark 3.2. */
  override def createExtendedSparkParser: Option[(SparkSession, ParserInterface) => ParserInterface] = {
    Some { (spark: SparkSession, delegate: ParserInterface) =>
      new HoodieSpark3_2ExtendedSqlParser(spark, delegate)
    }
  }
}

View File

@@ -0,0 +1,33 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.catalyst.plans.logical
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
// Unresolved logical plan node produced by the extended parser for a
// `table TIMESTAMP AS OF ts` / `VERSION AS OF v` clause. It carries the
// raw relation plus the optional timestamp expression / version string;
// the Hudi analysis rule later rewrites it into a concrete relation.
// NOTE(review): `resolved` is forced to false so the analyzer rule always
// gets a chance to rewrite this node.
case class TimeTravelRelation(
table: LogicalPlan,
timestamp: Option[Expression],
version: Option[String]) extends Command {
// Leaf-like: the wrapped `table` is intentionally not exposed as a child.
override def children: Seq[LogicalPlan] = Seq.empty
override def output: Seq[Attribute] = Nil
override lazy val resolved: Boolean = false
// No children, so rewrites with new children are identity.
def withNewChildrenInternal(newChildren: IndexedSeq[LogicalPlan]): LogicalPlan = this
}

View File

@@ -0,0 +1,176 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.parser
import org.antlr.v4.runtime._
import org.antlr.v4.runtime.atn.PredictionMode
import org.antlr.v4.runtime.misc.{Interval, ParseCancellationException}
import org.antlr.v4.runtime.tree.TerminalNodeImpl
import org.apache.hudi.spark.sql.parser.HoodieSqlBaseParser.{NonReservedContext, QuotedIdentifierContext}
import org.apache.hudi.spark.sql.parser.{HoodieSqlBaseBaseListener, HoodieSqlBaseLexer, HoodieSqlBaseParser}
import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.parser.{ParseErrorListener, ParseException, ParserInterface}
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.trees.Origin
import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier}
import org.apache.spark.sql.types._
import org.apache.spark.sql.{AnalysisException, SparkSession}
/**
 * Hudi's extended SQL parser for Spark 3.2. It parses statements with the
 * generated HoodieSqlBase ANTLR parser (which adds TIMESTAMP AS OF /
 * VERSION AS OF), and delegates everything it does not recognize to the
 * underlying Spark parser.
 *
 * @param session  the active SparkSession, used to read SQL configuration
 * @param delegate Spark's own parser, used as the fallback for all other
 *                 parse entry points
 */
class HoodieSpark3_2ExtendedSqlParser(session: SparkSession, delegate: ParserInterface)
extends ParserInterface with Logging {
// SQL conf drives ANTLR flags below (ANSI keywords, exponent literals).
private lazy val conf = session.sqlContext.conf
private lazy val builder = new HoodieSpark3_2ExtendedSqlAstBuilder(conf, delegate)
// Parse with the extended grammar first; if the AST builder does not
// produce a LogicalPlan (e.g. a #passThrough rule matched), re-parse with
// the delegate Spark parser.
override def parsePlan(sqlText: String): LogicalPlan = parse(sqlText) { parser =>
builder.visit(parser.singleStatement()) match {
case plan: LogicalPlan => plan
case _=> delegate.parsePlan(sqlText)
}
}
// All non-plan entry points are delegated untouched to Spark's parser.
override def parseExpression(sqlText: String): Expression = delegate.parseExpression(sqlText)
override def parseTableIdentifier(sqlText: String): TableIdentifier =
delegate.parseTableIdentifier(sqlText)
override def parseFunctionIdentifier(sqlText: String): FunctionIdentifier =
delegate.parseFunctionIdentifier(sqlText)
override def parseTableSchema(sqlText: String): StructType = delegate.parseTableSchema(sqlText)
override def parseDataType(sqlText: String): DataType = delegate.parseDataType(sqlText)
/**
 * Runs `toResult` against an ANTLR parser built over `command`, using the
 * standard Spark two-phase strategy: try the faster SLL prediction mode
 * first and fall back to full LL mode if that fails.
 */
protected def parse[T](command: String)(toResult: HoodieSqlBaseParser => T): T = {
logDebug(s"Parsing command: $command")
// Wrap the input so the lexer sees upper-cased lookahead (case-insensitive SQL).
val lexer = new HoodieSqlBaseLexer(new UpperCaseCharStream(CharStreams.fromString(command)))
lexer.removeErrorListeners()
lexer.addErrorListener(ParseErrorListener)
val tokenStream = new CommonTokenStream(lexer)
val parser = new HoodieSqlBaseParser(tokenStream)
parser.addParseListener(PostProcessor)
parser.removeErrorListeners()
parser.addErrorListener(ParseErrorListener)
// parser.legacy_setops_precedence_enabled = conf.setOpsPrecedenceEnforced
parser.legacy_exponent_literal_as_decimal_enabled = conf.exponentLiteralAsDecimalEnabled
parser.SQL_standard_keyword_behavior = conf.ansiEnabled
try {
try {
// first, try parsing with potentially faster SLL mode
parser.getInterpreter.setPredictionMode(PredictionMode.SLL)
toResult(parser)
}
catch {
case e: ParseCancellationException =>
// if we fail, parse with LL mode
tokenStream.seek(0) // rewind input stream
parser.reset()
// Try Again.
parser.getInterpreter.setPredictionMode(PredictionMode.LL)
toResult(parser)
}
}
catch {
// Attach the original SQL text to parse errors that lack it, and convert
// AnalysisExceptions raised during parsing into positioned ParseExceptions.
case e: ParseException if e.command.isDefined =>
throw e
case e: ParseException =>
throw e.withCommand(command)
case e: AnalysisException =>
val position = Origin(e.line, e.startPosition)
throw new ParseException(Option(command), e.message, position, position)
}
}
override def parseMultipartIdentifier(sqlText: String): Seq[String] = {
delegate.parseMultipartIdentifier(sqlText)
}
}
/**
 * Fork from `org.apache.spark.sql.catalyst.parser.UpperCaseCharStream`.
 *
 * A CharStream decorator that upper-cases lookahead characters (LA) so the
 * lexer matches SQL keywords case-insensitively, while `getText` still
 * returns the original-cased input for identifiers and literals.
 */
class UpperCaseCharStream(wrapped: CodePointCharStream) extends CharStream {
override def consume(): Unit = wrapped.consume
override def getSourceName(): String = wrapped.getSourceName
override def index(): Int = wrapped.index
override def mark(): Int = wrapped.mark
override def release(marker: Int): Unit = wrapped.release(marker)
override def seek(where: Int): Unit = wrapped.seek(where)
override def size(): Int = wrapped.size
override def getText(interval: Interval): String = {
// ANTLR 4.7's CodePointCharStream implementations have bugs when
// getText() is called with an empty stream, or intervals where
// the start > end. See
// https://github.com/antlr/antlr4/commit/ac9f7530 for one fix
// that is not yet in a released ANTLR artifact.
if (size() > 0 && (interval.b - interval.a >= 0)) {
wrapped.getText(interval)
} else {
""
}
}
// scalastyle:off
override def LA(i: Int): Int = {
// scalastyle:on
// Upper-case lookahead only; EOF (and 0) pass through unchanged.
val la = wrapped.LA(i)
if (la == 0 || la == IntStream.EOF) la
else Character.toUpperCase(la)
}
}
/**
 * Fork from `org.apache.spark.sql.catalyst.parser.PostProcessor`.
 *
 * Parse-tree listener that rewrites certain tokens into IDENTIFIER tokens
 * after the parser matches them: strips back-ticks from quoted identifiers
 * and lets non-reserved keywords act as plain identifiers.
 */
case object PostProcessor extends HoodieSqlBaseBaseListener {
/** Remove the back ticks from an Identifier. */
override def exitQuotedIdentifier(ctx: QuotedIdentifierContext): Unit = {
replaceTokenByIdentifier(ctx, 1) { token =>
// Remove the double back ticks in the string.
token.setText(token.getText.replace("``", "`"))
token
}
}
/** Treat non-reserved keywords as Identifiers. */
override def exitNonReserved(ctx: NonReservedContext): Unit = {
replaceTokenByIdentifier(ctx, 0)(identity)
}
// Replaces the last child of ctx's parent with a new IDENTIFIER terminal
// built from ctx's first token, trimming `stripMargins` characters off each
// end (used to drop the surrounding back-ticks), then applies `f`.
private def replaceTokenByIdentifier(
ctx: ParserRuleContext,
stripMargins: Int)(
f: CommonToken => CommonToken = identity): Unit = {
val parent = ctx.getParent
parent.removeLastChild()
val token = ctx.getChild(0).getPayload.asInstanceOf[Token]
val newToken = new CommonToken(
new org.antlr.v4.runtime.misc.Pair(token.getTokenSource, token.getInputStream),
HoodieSqlBaseParser.IDENTIFIER,
token.getChannel,
token.getStartIndex + stripMargins,
token.getStopIndex - stripMargins)
parent.addChild(new TerminalNodeImpl(f(newToken)))
}
}

View File

@@ -27,7 +27,7 @@
<check level="error" class="org.scalastyle.file.FileTabChecker" enabled="true"/> <check level="error" class="org.scalastyle.file.FileTabChecker" enabled="true"/>
<check level="error" class="org.scalastyle.file.FileLengthChecker" enabled="true"> <check level="error" class="org.scalastyle.file.FileLengthChecker" enabled="true">
<parameters> <parameters>
<parameter name="maxFileLength"><![CDATA[1000]]></parameter> <parameter name="maxFileLength"><![CDATA[5000]]></parameter>
</parameters> </parameters>
</check> </check>
<check level="error" class="org.scalastyle.scalariform.SpacesAfterPlusChecker" enabled="true"/> <check level="error" class="org.scalastyle.scalariform.SpacesAfterPlusChecker" enabled="true"/>
@@ -113,7 +113,7 @@
</check> </check>
<check level="error" class="org.scalastyle.scalariform.NumberOfMethodsInTypeChecker" enabled="true"> <check level="error" class="org.scalastyle.scalariform.NumberOfMethodsInTypeChecker" enabled="true">
<parameters> <parameters>
<parameter name="maxMethods"><![CDATA[40]]></parameter> <parameter name="maxMethods"><![CDATA[500]]></parameter>
</parameters> </parameters>
</check> </check>
<check level="error" class="org.scalastyle.scalariform.PublicMethodsHaveTypeChecker" enabled="false"/> <check level="error" class="org.scalastyle.scalariform.PublicMethodsHaveTypeChecker" enabled="false"/>