1
0

[HUDI-3445] Support Clustering Command Based on Call Procedure Command for Spark SQL (#4901)

* [HUDI-3445] Clustering Command Based on Call Procedure Command for Spark SQL

* [HUDI-3445] Clustering Command Based on Call Procedure Command for Spark SQL

* [HUDI-3445] Clustering Command Based on Call Procedure Command for Spark SQL

Co-authored-by: shibei <huberylee.li@alibaba-inc.com>
This commit is contained in:
shibei
2022-03-04 09:33:16 +08:00
committed by GitHub
parent be9a264885
commit 62f534d002
20 changed files with 909 additions and 247 deletions

View File

@@ -19,12 +19,10 @@ package org.apache.hudi
import org.apache.hudi.index.columnstats.ColumnStatsIndexHelper
import org.apache.hudi.testutils.HoodieClientTestBase
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
import org.apache.spark.sql.catalyst.expressions.{Expression, Not}
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LocalRelation}
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.hudi.DataSkippingUtils
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType, VarcharType}
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Column, SparkSession}
import org.junit.jupiter.api.Assertions.assertEquals
import org.junit.jupiter.api.BeforeEach
@@ -75,7 +73,7 @@ class TestDataSkippingUtils extends HoodieClientTestBase {
@ParameterizedTest
@MethodSource(Array("testBaseLookupFilterExpressionsSource", "testAdvancedLookupFilterExpressionsSource"))
def testLookupFilterExpressions(sourceExpr: String, input: Seq[IndexRow], output: Seq[String]): Unit = {
val resolvedExpr: Expression = resolveFilterExpr(sourceExpr, sourceTableSchema)
val resolvedExpr: Expression = HoodieCommonUtils.resolveFilterExpr(spark, sourceExpr, sourceTableSchema)
val lookupFilter = DataSkippingUtils.createColumnStatsIndexFilterExpr(resolvedExpr, indexSchema)
@@ -96,7 +94,7 @@ class TestDataSkippingUtils extends HoodieClientTestBase {
@ParameterizedTest
@MethodSource(Array("testStringsLookupFilterExpressionsSource"))
def testStringsLookupFilterExpressions(sourceExpr: Expression, input: Seq[IndexRow], output: Seq[String]): Unit = {
val resolvedExpr = resolveFilterExpr(sourceExpr, sourceTableSchema)
val resolvedExpr = HoodieCommonUtils.resolveFilterExpr(spark, sourceExpr, sourceTableSchema)
val lookupFilter = DataSkippingUtils.createColumnStatsIndexFilterExpr(resolvedExpr, indexSchema)
val spark2 = spark
@@ -112,27 +110,6 @@ class TestDataSkippingUtils extends HoodieClientTestBase {
assertEquals(output, rows)
}
/** Parses `exprString` with the session's SQL parser, then resolves the
 * resulting expression against `tableSchema` via the overload below. */
private def resolveFilterExpr(exprString: String, tableSchema: StructType): Expression =
  resolveFilterExpr(spark.sessionState.sqlParser.parseExpression(exprString), tableSchema)
/**
 * Resolves attribute references in `expr` against the columns of `tableSchema`
 * by running the analyzer's `ResolveReferences` rule over a synthetic `Filter`
 * on a `LocalRelation`, then fails fast if anything is still unresolved.
 */
private def resolveFilterExpr(expr: Expression, tableSchema: StructType): Expression = {
  val fields = tableSchema.fields
  val analyzedFilter = spark.sessionState.analyzer.ResolveReferences(
    Filter(expr, LocalRelation(fields.head, fields.tail: _*))
  ).asInstanceOf[Filter]
  checkForUnresolvedRefs(analyzedFilter.condition)
}
/** Recursively walks `resolvedExpr` and throws `IllegalStateException` if any
 * `UnresolvedAttribute` remains after analysis; returns the expression otherwise. */
def checkForUnresolvedRefs(resolvedExpr: Expression): Expression =
  resolvedExpr match {
    case _: UnresolvedAttribute => throw new IllegalStateException("unresolved attribute")
    case other => other.mapChildren(checkForUnresolvedRefs)
  }
}
object TestDataSkippingUtils {

View File

@@ -17,6 +17,13 @@
package org.apache.spark.sql.hudi
import org.apache.hadoop.fs.Path
import org.apache.hudi.common.table.timeline.{HoodieActiveTimeline, HoodieTimeline}
import org.apache.hudi.common.util.{Option => HOption}
import org.apache.hudi.{HoodieCommonUtils, HoodieDataSourceHelpers}
import scala.collection.JavaConverters.asScalaIteratorConverter
class TestCallProcedure extends TestHoodieSqlBase {
test("Test Call show_commits Procedure") {
@@ -129,4 +136,222 @@ class TestCallProcedure extends TestHoodieSqlBase {
assertResult(1){commits.length}
}
}
// End-to-end test of the `run_clustering` / `show_clustering` call procedures
// addressed by table name, exercised for both COW and MOR table types.
test("Test Call run_clustering Procedure By Table") {
withTempDir { tmp =>
Seq("cow", "mor").foreach { tableType =>
val tableName = generateTableName
val basePath = s"${tmp.getCanonicalPath}/$tableName"
spark.sql(
s"""
|create table $tableName (
| id int,
| name string,
| price double,
| ts long
|) using hudi
| options (
| primaryKey ='id',
| type = '$tableType',
| preCombineField = 'ts'
| )
| partitioned by(ts)
| location '$basePath'
""".stripMargin)
// Three separate inserts => three commits, one per `ts` partition.
spark.sql(s"insert into $tableName values(1, 'a1', 10, 1000)")
spark.sql(s"insert into $tableName values(2, 'a2', 10, 1001)")
spark.sql(s"insert into $tableName values(3, 'a3', 10, 1002)")
val client = HoodieCommonUtils.createHoodieClientFromPath(spark, basePath, Map.empty)
// Generate the first clustering plan (show_clustering below reports it with count 3)
val firstScheduleInstant = HoodieActiveTimeline.createNewInstantTime
client.scheduleClusteringAtInstant(firstScheduleInstant, HOption.empty())
// Generate the second clustering plan, covering only the commit made after the first plan
spark.sql(s"insert into $tableName values(4, 'a4', 10, 1003)")
val secondScheduleInstant = HoodieActiveTimeline.createNewInstantTime
client.scheduleClusteringAtInstant(secondScheduleInstant, HOption.empty())
// Both pending plans are listed, each with a count (presumably of input groups).
checkAnswer(s"call show_clustering('$tableName')")(
Seq(firstScheduleInstant, 3),
Seq(secondScheduleInstant, 1)
)
// Run clustering for all plans generated above; no new clustering
// instant should be scheduled because there is no commit after the
// second clustering plan was generated.
spark.sql(s"call run_clustering(table => '$tableName', order => 'ts')")
// No new commits strictly after the second schedule instant.
val fs = new Path(basePath).getFileSystem(spark.sessionState.newHadoopConf())
assertResult(false)(HoodieDataSourceHelpers.hasNewCommits(fs, basePath, secondScheduleInstant))
// Clustering must not lose or alter any rows.
checkAnswer(s"select id, name, price, ts from $tableName order by id")(
Seq(1, "a1", 10.0, 1000),
Seq(2, "a2", 10.0, 1001),
Seq(3, "a3", 10.0, 1002),
Seq(4, "a4", 10.0, 1003)
)
// After clustering there should be no pending clustering.
checkAnswer(s"call show_clustering(table => '$tableName')")()
// Check the number of finished clustering instants (completed replace commits)
val finishedClustering = HoodieDataSourceHelpers.allCompletedCommitsCompactions(fs, basePath)
.getInstants
.iterator().asScala
.filter(p => p.getAction == HoodieTimeline.REPLACE_COMMIT_ACTION)
.toSeq
assertResult(2)(finishedClustering.size)
// Do clustering without manual schedule (which will do the schedule itself if no pending clustering exists)
spark.sql(s"insert into $tableName values(5, 'a5', 10, 1004)")
spark.sql(s"insert into $tableName values(6, 'a6', 10, 1005)")
spark.sql(s"call run_clustering(table => '$tableName', order => 'ts')")
val thirdClusteringInstant = HoodieDataSourceHelpers.allCompletedCommitsCompactions(fs, basePath)
.findInstantsAfter(secondScheduleInstant)
.getInstants
.iterator().asScala
.filter(p => p.getAction == HoodieTimeline.REPLACE_COMMIT_ACTION)
.toSeq
// Should have a new replace commit after the second clustering command.
assertResult(1)(thirdClusteringInstant.size)
// All six rows remain readable after the auto-scheduled clustering.
checkAnswer(s"select id, name, price, ts from $tableName order by id")(
Seq(1, "a1", 10.0, 1000),
Seq(2, "a2", 10.0, 1001),
Seq(3, "a3", 10.0, 1002),
Seq(4, "a4", 10.0, 1003),
Seq(5, "a5", 10.0, 1004),
Seq(6, "a6", 10.0, 1005)
)
}
}
}
// End-to-end test of `run_clustering` / `show_clustering` addressed by base
// path (rather than table name), exercised for both COW and MOR table types.
test("Test Call run_clustering Procedure By Path") {
  withTempDir { tmp =>
    Seq("cow", "mor").foreach { tableType =>
      val tableName = generateTableName
      val basePath = s"${tmp.getCanonicalPath}/$tableName"
      spark.sql(
        s"""
|create table $tableName (
| id int,
| name string,
| price double,
| ts long
|) using hudi
| options (
| primaryKey ='id',
| type = '$tableType',
| preCombineField = 'ts'
| )
| partitioned by(ts)
| location '$basePath'
""".stripMargin)
      // Running clustering on an empty table is a no-op: nothing is scheduled,
      // so show_clustering returns no rows.
      spark.sql(s"call run_clustering(path => '$basePath')")
      checkAnswer(s"call show_clustering(path => '$basePath')")()

      // Three separate inserts => three commits, one per `ts` partition.
      spark.sql(s"insert into $tableName values(1, 'a1', 10, 1000)")
      spark.sql(s"insert into $tableName values(2, 'a2', 10, 1001)")
      spark.sql(s"insert into $tableName values(3, 'a3', 10, 1002)")

      val client = HoodieCommonUtils.createHoodieClientFromPath(spark, basePath, Map.empty)
      // Manually schedule the first clustering plan; show_clustering reports it with count 3.
      val firstScheduleInstant = HoodieActiveTimeline.createNewInstantTime
      client.scheduleClusteringAtInstant(firstScheduleInstant, HOption.empty())
      checkAnswer(s"call show_clustering(path => '$basePath')")(
        Seq(firstScheduleInstant, 3)
      )

      val fs = new Path(basePath).getFileSystem(spark.sessionState.newHadoopConf())
      // Counts completed replace-commit (clustering) instants on the timeline.
      def countFinishedClustering(): Int =
        HoodieDataSourceHelpers.allCompletedCommitsCompactions(fs, basePath)
          .getInstants
          .iterator().asScala
          .count(p => p.getAction == HoodieTimeline.REPLACE_COMMIT_ACTION)

      // Execute the pending clustering plan; data must stay intact.
      spark.sql(s"call run_clustering(path => '$basePath', order => 'ts')")
      checkAnswer(s"select id, name, price, ts from $tableName order by id")(
        Seq(1, "a1", 10.0, 1000),
        Seq(2, "a2", 10.0, 1001),
        Seq(3, "a3", 10.0, 1002)
      )
      // The original code discarded this boolean; assert it like the by-table
      // test does: the plan executes at `firstScheduleInstant` itself, so no
      // commits strictly after it are expected.
      assertResult(false)(HoodieDataSourceHelpers.hasNewCommits(fs, basePath, firstScheduleInstant))
      assertResult(1)(countFinishedClustering())

      // Do clustering without manual schedule (the procedure schedules a plan
      // itself when none is pending), limited here by a partition predicate.
      spark.sql(s"insert into $tableName values(4, 'a4', 10, 1003)")
      spark.sql(s"insert into $tableName values(5, 'a5', 10, 1004)")
      spark.sql(s"call run_clustering(table => '$tableName', predicate => 'ts >= 1003L')")
      checkAnswer(s"select id, name, price, ts from $tableName order by id")(
        Seq(1, "a1", 10.0, 1000),
        Seq(2, "a2", 10.0, 1001),
        Seq(3, "a3", 10.0, 1002),
        Seq(4, "a4", 10.0, 1003),
        Seq(5, "a5", 10.0, 1004)
      )
      assertResult(2)(countFinishedClustering())
    }
  }
}
// Verifies that `run_clustering` honors a partition predicate: only partitions
// matching `ts <= 1001L` contribute input groups to the clustering plan.
test("Test Call run_clustering Procedure With Partition Pruning") {
withTempDir { tmp =>
Seq("cow", "mor").foreach { tableType =>
val tableName = generateTableName
val basePath = s"${tmp.getCanonicalPath}/$tableName"
spark.sql(
s"""
|create table $tableName (
| id int,
| name string,
| price double,
| ts long
|) using hudi
| options (
| primaryKey ='id',
| type = '$tableType',
| preCombineField = 'ts'
| )
| partitioned by(ts)
| location '$basePath'
""".stripMargin)
// Three separate inserts => three commits, one per `ts` partition.
spark.sql(s"insert into $tableName values(1, 'a1', 10, 1000)")
spark.sql(s"insert into $tableName values(2, 'a2', 10, 1001)")
spark.sql(s"insert into $tableName values(3, 'a3', 10, 1002)")
// Do clustering table with partition predicate
spark.sql(s"call run_clustering(table => '$tableName', predicate => 'ts <= 1001L', order => 'ts')")
// Check the num of completed clustering instant (replace commits on the timeline)
val fs = new Path(basePath).getFileSystem(spark.sessionState.newHadoopConf())
val clusteringInstants = HoodieDataSourceHelpers.allCompletedCommitsCompactions(fs, basePath)
.getInstants
.iterator().asScala
.filter(p => p.getAction == HoodieTimeline.REPLACE_COMMIT_ACTION)
.toSeq
assertResult(1)(clusteringInstants.size)
val clusteringInstant = clusteringInstants.head
val clusteringPlan = HoodieDataSourceHelpers.getClusteringPlan(fs, basePath, clusteringInstant.getTimestamp)
assertResult(true)(clusteringPlan.isPresent)
// Only the two partitions matching the predicate (ts=1000, ts=1001) supply
// input groups; ts=1002 is pruned out of the plan.
assertResult(2)(clusteringPlan.get().getInputGroups.size())
// No pending clustering remains after execution.
checkAnswer(s"call show_clustering(table => '$tableName')")()
// All rows remain readable and unchanged after clustering.
checkAnswer(s"select id, name, price, ts from $tableName order by id")(
Seq(1, "a1", 10.0, 1000),
Seq(2, "a2", 10.0, 1001),
Seq(3, "a3", 10.0, 1002)
)
}
}
}
}