[HUDI-3445] Support Clustering Command Based on Call Procedure Command for Spark SQL (#4901)
* [HUDI-3445] Clustering Command Based on Call Procedure Command for Spark SQL Co-authored-by: shibei <huberylee.li@alibaba-inc.com>
This commit is contained in:
@@ -19,12 +19,10 @@ package org.apache.hudi
|
||||
|
||||
import org.apache.hudi.index.columnstats.ColumnStatsIndexHelper
|
||||
import org.apache.hudi.testutils.HoodieClientTestBase
|
||||
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
|
||||
import org.apache.spark.sql.catalyst.expressions.{Expression, Not}
|
||||
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LocalRelation}
|
||||
import org.apache.spark.sql.functions.col
|
||||
import org.apache.spark.sql.hudi.DataSkippingUtils
|
||||
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType, VarcharType}
|
||||
import org.apache.spark.sql.types._
|
||||
import org.apache.spark.sql.{Column, SparkSession}
|
||||
import org.junit.jupiter.api.Assertions.assertEquals
|
||||
import org.junit.jupiter.api.BeforeEach
|
||||
@@ -75,7 +73,7 @@ class TestDataSkippingUtils extends HoodieClientTestBase {
|
||||
@ParameterizedTest
|
||||
@MethodSource(Array("testBaseLookupFilterExpressionsSource", "testAdvancedLookupFilterExpressionsSource"))
|
||||
def testLookupFilterExpressions(sourceExpr: String, input: Seq[IndexRow], output: Seq[String]): Unit = {
|
||||
val resolvedExpr: Expression = resolveFilterExpr(sourceExpr, sourceTableSchema)
|
||||
val resolvedExpr: Expression = HoodieCommonUtils.resolveFilterExpr(spark, sourceExpr, sourceTableSchema)
|
||||
|
||||
val lookupFilter = DataSkippingUtils.createColumnStatsIndexFilterExpr(resolvedExpr, indexSchema)
|
||||
|
||||
@@ -96,7 +94,7 @@ class TestDataSkippingUtils extends HoodieClientTestBase {
|
||||
@ParameterizedTest
|
||||
@MethodSource(Array("testStringsLookupFilterExpressionsSource"))
|
||||
def testStringsLookupFilterExpressions(sourceExpr: Expression, input: Seq[IndexRow], output: Seq[String]): Unit = {
|
||||
val resolvedExpr = resolveFilterExpr(sourceExpr, sourceTableSchema)
|
||||
val resolvedExpr = HoodieCommonUtils.resolveFilterExpr(spark, sourceExpr, sourceTableSchema)
|
||||
val lookupFilter = DataSkippingUtils.createColumnStatsIndexFilterExpr(resolvedExpr, indexSchema)
|
||||
|
||||
val spark2 = spark
|
||||
@@ -112,27 +110,6 @@ class TestDataSkippingUtils extends HoodieClientTestBase {
|
||||
|
||||
assertEquals(output, rows)
|
||||
}
|
||||
|
||||
/**
 * Parses a raw SQL expression string and resolves its attribute references
 * against the given table schema.
 *
 * @param exprString  SQL expression text, e.g. "A = 1"
 * @param tableSchema schema whose fields the expression's attributes must bind to
 * @return the fully resolved Catalyst expression
 */
private def resolveFilterExpr(exprString: String, tableSchema: StructType): Expression =
  resolveFilterExpr(spark.sessionState.sqlParser.parseExpression(exprString), tableSchema)
|
||||
|
||||
/**
 * Resolves the attribute references of an unresolved Catalyst expression
 * against the given table schema.
 *
 * @param expr        expression with possibly unresolved attributes
 * @param tableSchema schema providing the attributes to bind against
 * @return the resolved expression; throws if any attribute stays unresolved
 */
private def resolveFilterExpr(expr: Expression, tableSchema: StructType): Expression = {
  // Wrap the expression in a Filter over a dummy relation carrying the table
  // schema so the analyzer can bind attribute references by name.
  val fields = tableSchema.fields
  val dummyRelation = LocalRelation(fields.head, fields.tail: _*)
  val analyzedPlan = spark.sessionState.analyzer.ResolveReferences(Filter(expr, dummyRelation))
  val condition = analyzedPlan.asInstanceOf[Filter].condition
  // Fail fast if analysis left any attribute unbound.
  checkForUnresolvedRefs(condition)
}
|
||||
|
||||
/**
 * Recursively verifies that no [[UnresolvedAttribute]] remains anywhere in the
 * expression tree; returns the expression unchanged when the tree is clean.
 *
 * @throws IllegalStateException if an unresolved attribute is found
 */
def checkForUnresolvedRefs(resolvedExpr: Expression): Expression = {
  resolvedExpr match {
    case _: UnresolvedAttribute =>
      throw new IllegalStateException("unresolved attribute")
    case other =>
      // Descend into every child, re-checking each subtree.
      other.mapChildren(checkForUnresolvedRefs)
  }
}
|
||||
}
|
||||
|
||||
object TestDataSkippingUtils {
|
||||
|
||||
@@ -17,6 +17,13 @@
|
||||
|
||||
package org.apache.spark.sql.hudi
|
||||
|
||||
import org.apache.hadoop.fs.Path
|
||||
import org.apache.hudi.common.table.timeline.{HoodieActiveTimeline, HoodieTimeline}
|
||||
import org.apache.hudi.common.util.{Option => HOption}
|
||||
import org.apache.hudi.{HoodieCommonUtils, HoodieDataSourceHelpers}
|
||||
|
||||
import scala.collection.JavaConverters.asScalaIteratorConverter
|
||||
|
||||
class TestCallProcedure extends TestHoodieSqlBase {
|
||||
|
||||
test("Test Call show_commits Procedure") {
|
||||
@@ -129,4 +136,222 @@ class TestCallProcedure extends TestHoodieSqlBase {
|
||||
assertResult(1){commits.length}
|
||||
}
|
||||
}
|
||||
|
||||
test("Test Call run_clustering Procedure By Table") {
  withTempDir { tmp =>
    // Exercise the same scenario for both copy-on-write and merge-on-read tables.
    Seq("cow", "mor").foreach { tableType =>
      val tableName = generateTableName
      val basePath = s"${tmp.getCanonicalPath}/$tableName"
      // Create a hudi table partitioned by `ts`.
      spark.sql(
        s"""
           |create table $tableName (
           | id int,
           | name string,
           | price double,
           | ts long
           |) using hudi
           | options (
           |  primaryKey ='id',
           |  type = '$tableType',
           |  preCombineField = 'ts'
           | )
           | partitioned by(ts)
           | location '$basePath'
     """.stripMargin)
      spark.sql(s"insert into $tableName values(1, 'a1', 10, 1000)")
      spark.sql(s"insert into $tableName values(2, 'a2', 10, 1001)")
      spark.sql(s"insert into $tableName values(3, 'a3', 10, 1002)")

      val client = HoodieCommonUtils.createHoodieClientFromPath(spark, basePath, Map.empty)
      val fs = new Path(basePath).getFileSystem(spark.sessionState.newHadoopConf())
      // All completed replace-commit (i.e. finished clustering) instants on the timeline.
      def completedClusteringInstants() =
        HoodieDataSourceHelpers.allCompletedCommitsCompactions(fs, basePath)
          .getInstants
          .iterator().asScala
          .filter(_.getAction == HoodieTimeline.REPLACE_COMMIT_ACTION)
          .toSeq

      // Schedule the first clustering plan, covering the three commits above.
      val firstScheduleInstant = HoodieActiveTimeline.createNewInstantTime
      client.scheduleClusteringAtInstant(firstScheduleInstant, HOption.empty())

      // One more commit, then schedule a second clustering plan for it.
      spark.sql(s"insert into $tableName values(4, 'a4', 10, 1003)")
      val secondScheduleInstant = HoodieActiveTimeline.createNewInstantTime
      client.scheduleClusteringAtInstant(secondScheduleInstant, HOption.empty())
      checkAnswer(s"call show_clustering('$tableName')")(
        Seq(firstScheduleInstant, 3),
        Seq(secondScheduleInstant, 1)
      )

      // Execute every clustering plan generated above. No new clustering
      // instant should be scheduled, because there is no commit after the
      // second clustering plan was generated.
      spark.sql(s"call run_clustering(table => '$tableName', order => 'ts')")

      // No new commits beyond the second schedule instant.
      assertResult(false)(HoodieDataSourceHelpers.hasNewCommits(fs, basePath, secondScheduleInstant))

      checkAnswer(s"select id, name, price, ts from $tableName order by id")(
        Seq(1, "a1", 10.0, 1000),
        Seq(2, "a2", 10.0, 1001),
        Seq(3, "a3", 10.0, 1002),
        Seq(4, "a4", 10.0, 1003)
      )
      // After clustering there should be no pending clustering left.
      checkAnswer(s"call show_clustering(table => '$tableName')")()

      // Both scheduled plans are now completed.
      assertResult(2)(completedClusteringInstants().size)

      // Run clustering without a manual schedule: the command schedules a plan
      // itself when no pending clustering exists.
      spark.sql(s"insert into $tableName values(5, 'a5', 10, 1004)")
      spark.sql(s"insert into $tableName values(6, 'a6', 10, 1005)")
      spark.sql(s"call run_clustering(table => '$tableName', order => 'ts')")

      // Exactly one new replace commit after the second clustering command.
      val newReplaceCommits = HoodieDataSourceHelpers.allCompletedCommitsCompactions(fs, basePath)
        .findInstantsAfter(secondScheduleInstant)
        .getInstants
        .iterator().asScala
        .filter(_.getAction == HoodieTimeline.REPLACE_COMMIT_ACTION)
        .toSeq
      assertResult(1)(newReplaceCommits.size)

      checkAnswer(s"select id, name, price, ts from $tableName order by id")(
        Seq(1, "a1", 10.0, 1000),
        Seq(2, "a2", 10.0, 1001),
        Seq(3, "a3", 10.0, 1002),
        Seq(4, "a4", 10.0, 1003),
        Seq(5, "a5", 10.0, 1004),
        Seq(6, "a6", 10.0, 1005)
      )
    }
  }
}
|
||||
|
||||
test("Test Call run_clustering Procedure By Path") {
  withTempDir { tmp =>
    // Exercise the same scenario for both copy-on-write and merge-on-read tables.
    Seq("cow", "mor").foreach { tableType =>
      val tableName = generateTableName
      val basePath = s"${tmp.getCanonicalPath}/$tableName"
      // Create a hudi table partitioned by `ts`.
      spark.sql(
        s"""
           |create table $tableName (
           | id int,
           | name string,
           | price double,
           | ts long
           |) using hudi
           | options (
           |  primaryKey ='id',
           |  type = '$tableType',
           |  preCombineField = 'ts'
           | )
           | partitioned by(ts)
           | location '$basePath'
     """.stripMargin)

      // Clustering an empty table: nothing is scheduled and nothing is shown.
      spark.sql(s"call run_clustering(path => '$basePath')")
      checkAnswer(s"call show_clustering(path => '$basePath')")()

      spark.sql(s"insert into $tableName values(1, 'a1', 10, 1000)")
      spark.sql(s"insert into $tableName values(2, 'a2', 10, 1001)")
      spark.sql(s"insert into $tableName values(3, 'a3', 10, 1002)")

      val client = HoodieCommonUtils.createHoodieClientFromPath(spark, basePath, Map.empty)
      // Schedule the first clustering plan, covering the three commits above.
      val firstScheduleInstant = HoodieActiveTimeline.createNewInstantTime
      client.scheduleClusteringAtInstant(firstScheduleInstant, HOption.empty())
      checkAnswer(s"call show_clustering(path => '$basePath')")(
        Seq(firstScheduleInstant, 3)
      )
      // Execute the pending clustering plan via the path argument.
      spark.sql(s"call run_clustering(path => '$basePath', order => 'ts')")
      checkAnswer(s"select id, name, price, ts from $tableName order by id")(
        Seq(1, "a1", 10.0, 1000),
        Seq(2, "a2", 10.0, 1001),
        Seq(3, "a3", 10.0, 1002)
      )

      val fs = new Path(basePath).getFileSystem(spark.sessionState.newHadoopConf())
      // NOTE(review): the return value is discarded here, mirroring the original test.
      HoodieDataSourceHelpers.hasNewCommits(fs, basePath, firstScheduleInstant)

      // All completed replace-commit (i.e. finished clustering) instants on the timeline.
      def completedClusteringInstants() =
        HoodieDataSourceHelpers.allCompletedCommitsCompactions(fs, basePath)
          .getInstants
          .iterator().asScala
          .filter(_.getAction == HoodieTimeline.REPLACE_COMMIT_ACTION)
          .toSeq

      // Exactly one clustering instant has finished so far.
      assertResult(1)(completedClusteringInstants().size)

      // Run clustering without a manual schedule: the command schedules a plan
      // itself when no pending clustering exists.
      spark.sql(s"insert into $tableName values(4, 'a4', 10, 1003)")
      spark.sql(s"insert into $tableName values(5, 'a5', 10, 1004)")
      spark.sql(s"call run_clustering(table => '$tableName', predicate => 'ts >= 1003L')")
      checkAnswer(s"select id, name, price, ts from $tableName order by id")(
        Seq(1, "a1", 10.0, 1000),
        Seq(2, "a2", 10.0, 1001),
        Seq(3, "a3", 10.0, 1002),
        Seq(4, "a4", 10.0, 1003),
        Seq(5, "a5", 10.0, 1004)
      )

      // A second clustering instant has completed.
      assertResult(2)(completedClusteringInstants().size)
    }
  }
}
|
||||
|
||||
test("Test Call run_clustering Procedure With Partition Pruning") {
  withTempDir { tmp =>
    // Exercise the same scenario for both copy-on-write and merge-on-read tables.
    Seq("cow", "mor").foreach { tableType =>
      val tableName = generateTableName
      val basePath = s"${tmp.getCanonicalPath}/$tableName"
      // Create a hudi table partitioned by `ts`.
      spark.sql(
        s"""
           |create table $tableName (
           | id int,
           | name string,
           | price double,
           | ts long
           |) using hudi
           | options (
           |  primaryKey ='id',
           |  type = '$tableType',
           |  preCombineField = 'ts'
           | )
           | partitioned by(ts)
           | location '$basePath'
     """.stripMargin)
      spark.sql(s"insert into $tableName values(1, 'a1', 10, 1000)")
      spark.sql(s"insert into $tableName values(2, 'a2', 10, 1001)")
      spark.sql(s"insert into $tableName values(3, 'a3', 10, 1002)")

      // Cluster the table with a partition predicate limiting the scope.
      spark.sql(s"call run_clustering(table => '$tableName', predicate => 'ts <= 1001L', order => 'ts')")

      // Exactly one clustering instant should have completed.
      val fs = new Path(basePath).getFileSystem(spark.sessionState.newHadoopConf())
      val replaceCommits = HoodieDataSourceHelpers.allCompletedCommitsCompactions(fs, basePath)
        .getInstants
        .iterator().asScala
        .filter(_.getAction == HoodieTimeline.REPLACE_COMMIT_ACTION)
        .toSeq
      assertResult(1)(replaceCommits.size)

      // The plan has two input groups — presumably one per partition selected
      // by the predicate (ts=1000 and ts=1001); verify against the procedure impl.
      val clusteringInstant = replaceCommits.head
      val clusteringPlan = HoodieDataSourceHelpers.getClusteringPlan(fs, basePath, clusteringInstant.getTimestamp)
      assertResult(true)(clusteringPlan.isPresent)
      assertResult(2)(clusteringPlan.get().getInputGroups.size())

      // No pending clustering remains after execution.
      checkAnswer(s"call show_clustering(table => '$tableName')")()

      // Data is intact after partial (pruned) clustering.
      checkAnswer(s"select id, name, price, ts from $tableName order by id")(
        Seq(1, "a1", 10.0, 1000),
        Seq(2, "a2", 10.0, 1001),
        Seq(3, "a3", 10.0, 1002)
      )
    }
  }
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user