[HUDI-3567] Refactor HoodieCommonUtils to make code more reasonable (#4982)
@@ -22,8 +22,8 @@ import org.apache.hudi.testutils.HoodieClientTestBase
 import org.apache.spark.sql.catalyst.expressions.{Expression, Not}
 import org.apache.spark.sql.functions.col
 import org.apache.spark.sql.hudi.DataSkippingUtils
-import org.apache.spark.sql.types._
-import org.apache.spark.sql.{Column, SparkSession}
+import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType, VarcharType}
+import org.apache.spark.sql.{Column, HoodieCatalystExpressionUtils, SparkSession}
 import org.junit.jupiter.api.Assertions.assertEquals
 import org.junit.jupiter.api.BeforeEach
 import org.junit.jupiter.params.ParameterizedTest
@@ -73,8 +73,7 @@ class TestDataSkippingUtils extends HoodieClientTestBase {
   @ParameterizedTest
   @MethodSource(Array("testBaseLookupFilterExpressionsSource", "testAdvancedLookupFilterExpressionsSource"))
   def testLookupFilterExpressions(sourceExpr: String, input: Seq[IndexRow], output: Seq[String]): Unit = {
-    val resolvedExpr: Expression = HoodieCommonUtils.resolveFilterExpr(spark, sourceExpr, sourceTableSchema)
-
+    val resolvedExpr: Expression = HoodieCatalystExpressionUtils.resolveFilterExpr(spark, sourceExpr, sourceTableSchema)
     val lookupFilter = DataSkippingUtils.createColumnStatsIndexFilterExpr(resolvedExpr, indexSchema)
 
     val spark2 = spark
@@ -94,7 +93,7 @@ class TestDataSkippingUtils extends HoodieClientTestBase {
   @ParameterizedTest
   @MethodSource(Array("testStringsLookupFilterExpressionsSource"))
   def testStringsLookupFilterExpressions(sourceExpr: Expression, input: Seq[IndexRow], output: Seq[String]): Unit = {
-    val resolvedExpr = HoodieCommonUtils.resolveFilterExpr(spark, sourceExpr, sourceTableSchema)
+    val resolvedExpr = HoodieCatalystExpressionUtils.resolveFilterExpr(spark, sourceExpr, sourceTableSchema)
     val lookupFilter = DataSkippingUtils.createColumnStatsIndexFilterExpr(resolvedExpr, indexSchema)
 
     val spark2 = spark

@@ -17,13 +17,6 @@
 
 package org.apache.spark.sql.hudi
 
-import org.apache.hadoop.fs.Path
-import org.apache.hudi.common.table.timeline.{HoodieActiveTimeline, HoodieTimeline}
-import org.apache.hudi.common.util.{Option => HOption}
-import org.apache.hudi.{HoodieCommonUtils, HoodieDataSourceHelpers}
-
-import scala.collection.JavaConverters.asScalaIteratorConverter
-
 class TestCallProcedure extends TestHoodieSqlBase {
 
   test("Test Call show_commits Procedure") {
@@ -136,222 +129,4 @@ class TestCallProcedure extends TestHoodieSqlBase {
       assertResult(1){commits.length}
     }
   }
-
-  test("Test Call run_clustering Procedure By Table") {
-    withTempDir { tmp =>
-      Seq("cow", "mor").foreach { tableType =>
-        val tableName = generateTableName
-        val basePath = s"${tmp.getCanonicalPath}/$tableName"
-        spark.sql(
-          s"""
-             |create table $tableName (
-             |  id int,
-             |  name string,
-             |  price double,
-             |  ts long
-             |) using hudi
-             | options (
-             |  primaryKey ='id',
-             |  type = '$tableType',
-             |  preCombineField = 'ts'
-             | )
-             | partitioned by(ts)
-             | location '$basePath'
-       """.stripMargin)
-        spark.sql(s"insert into $tableName values(1, 'a1', 10, 1000)")
-        spark.sql(s"insert into $tableName values(2, 'a2', 10, 1001)")
-        spark.sql(s"insert into $tableName values(3, 'a3', 10, 1002)")
-        val client = HoodieCommonUtils.createHoodieClientFromPath(spark, basePath, Map.empty)
-        // Generate the first clustering plan
-        val firstScheduleInstant = HoodieActiveTimeline.createNewInstantTime
-        client.scheduleClusteringAtInstant(firstScheduleInstant, HOption.empty())
-
-        // Generate the second clustering plan
-        spark.sql(s"insert into $tableName values(4, 'a4', 10, 1003)")
-        val secondScheduleInstant = HoodieActiveTimeline.createNewInstantTime
-        client.scheduleClusteringAtInstant(secondScheduleInstant, HOption.empty())
-        checkAnswer(s"call show_clustering('$tableName')")(
-          Seq(firstScheduleInstant, 3),
-          Seq(secondScheduleInstant, 1)
-        )
-
-        // Run clustering for all the clustering plans generated above; no new clustering
-        // instant will be generated because there is no commit after the second
-        // clustering plan was scheduled
-        spark.sql(s"call run_clustering(table => '$tableName', order => 'ts')")
-
-        // No new commits
-        val fs = new Path(basePath).getFileSystem(spark.sessionState.newHadoopConf())
-        assertResult(false)(HoodieDataSourceHelpers.hasNewCommits(fs, basePath, secondScheduleInstant))
-
-        checkAnswer(s"select id, name, price, ts from $tableName order by id")(
-          Seq(1, "a1", 10.0, 1000),
-          Seq(2, "a2", 10.0, 1001),
-          Seq(3, "a3", 10.0, 1002),
-          Seq(4, "a4", 10.0, 1003)
-        )
-        // After clustering there should be no pending clustering.
-        checkAnswer(s"call show_clustering(table => '$tableName')")()
-
-        // Check the number of finished clustering instants
-        val finishedClustering = HoodieDataSourceHelpers.allCompletedCommitsCompactions(fs, basePath)
-          .getInstants
-          .iterator().asScala
-          .filter(p => p.getAction == HoodieTimeline.REPLACE_COMMIT_ACTION)
-          .toSeq
-        assertResult(2)(finishedClustering.size)
-
-        // Run clustering without a manual schedule (a new plan is scheduled automatically if no pending clustering exists)
-        spark.sql(s"insert into $tableName values(5, 'a5', 10, 1004)")
-        spark.sql(s"insert into $tableName values(6, 'a6', 10, 1005)")
-        spark.sql(s"call run_clustering(table => '$tableName', order => 'ts')")
-
-        val thirdClusteringInstant = HoodieDataSourceHelpers.allCompletedCommitsCompactions(fs, basePath)
-          .findInstantsAfter(secondScheduleInstant)
-          .getInstants
-          .iterator().asScala
-          .filter(p => p.getAction == HoodieTimeline.REPLACE_COMMIT_ACTION)
-          .toSeq
-        // Should have a new replace commit after the second clustering command.
-        assertResult(1)(thirdClusteringInstant.size)
-
-        checkAnswer(s"select id, name, price, ts from $tableName order by id")(
-          Seq(1, "a1", 10.0, 1000),
-          Seq(2, "a2", 10.0, 1001),
-          Seq(3, "a3", 10.0, 1002),
-          Seq(4, "a4", 10.0, 1003),
-          Seq(5, "a5", 10.0, 1004),
-          Seq(6, "a6", 10.0, 1005)
-        )
-      }
-    }
-  }
-
-  test("Test Call run_clustering Procedure By Path") {
-    withTempDir { tmp =>
-      Seq("cow", "mor").foreach { tableType =>
-        val tableName = generateTableName
-        val basePath = s"${tmp.getCanonicalPath}/$tableName"
-        spark.sql(
-          s"""
-             |create table $tableName (
-             |  id int,
-             |  name string,
-             |  price double,
-             |  ts long
-             |) using hudi
-             | options (
-             |  primaryKey ='id',
-             |  type = '$tableType',
-             |  preCombineField = 'ts'
-             | )
-             | partitioned by(ts)
-             | location '$basePath'
-       """.stripMargin)
-
-        spark.sql(s"call run_clustering(path => '$basePath')")
-        checkAnswer(s"call show_clustering(path => '$basePath')")()
-
-        spark.sql(s"insert into $tableName values(1, 'a1', 10, 1000)")
-        spark.sql(s"insert into $tableName values(2, 'a2', 10, 1001)")
-        spark.sql(s"insert into $tableName values(3, 'a3', 10, 1002)")
-        val client = HoodieCommonUtils.createHoodieClientFromPath(spark, basePath, Map.empty)
-        // Generate the first clustering plan
-        val firstScheduleInstant = HoodieActiveTimeline.createNewInstantTime
-        client.scheduleClusteringAtInstant(firstScheduleInstant, HOption.empty())
-        checkAnswer(s"call show_clustering(path => '$basePath')")(
-          Seq(firstScheduleInstant, 3)
-        )
-        // Run clustering for all the pending clustering plans
-        spark.sql(s"call run_clustering(path => '$basePath', order => 'ts')")
-        checkAnswer(s"select id, name, price, ts from $tableName order by id")(
-          Seq(1, "a1", 10.0, 1000),
-          Seq(2, "a2", 10.0, 1001),
-          Seq(3, "a3", 10.0, 1002)
-        )
-        val fs = new Path(basePath).getFileSystem(spark.sessionState.newHadoopConf())
-        HoodieDataSourceHelpers.hasNewCommits(fs, basePath, firstScheduleInstant)
-
-        // Check the number of finished clustering instants
-        var finishedClustering = HoodieDataSourceHelpers.allCompletedCommitsCompactions(fs, basePath)
-          .getInstants
-          .iterator().asScala
-          .filter(p => p.getAction == HoodieTimeline.REPLACE_COMMIT_ACTION)
-          .toSeq
-        assertResult(1)(finishedClustering.size)
-
-        // Run clustering without a manual schedule (a new plan is scheduled automatically if no pending clustering exists)
-        spark.sql(s"insert into $tableName values(4, 'a4', 10, 1003)")
-        spark.sql(s"insert into $tableName values(5, 'a5', 10, 1004)")
-        spark.sql(s"call run_clustering(table => '$tableName', predicate => 'ts >= 1003L')")
-        checkAnswer(s"select id, name, price, ts from $tableName order by id")(
-          Seq(1, "a1", 10.0, 1000),
-          Seq(2, "a2", 10.0, 1001),
-          Seq(3, "a3", 10.0, 1002),
-          Seq(4, "a4", 10.0, 1003),
-          Seq(5, "a5", 10.0, 1004)
-        )
-
-        finishedClustering = HoodieDataSourceHelpers.allCompletedCommitsCompactions(fs, basePath)
-          .getInstants
-          .iterator().asScala
-          .filter(p => p.getAction == HoodieTimeline.REPLACE_COMMIT_ACTION)
-          .toSeq
-        assertResult(2)(finishedClustering.size)
-      }
-    }
-  }
-
-  test("Test Call run_clustering Procedure With Partition Pruning") {
-    withTempDir { tmp =>
-      Seq("cow", "mor").foreach { tableType =>
-        val tableName = generateTableName
-        val basePath = s"${tmp.getCanonicalPath}/$tableName"
-        spark.sql(
-          s"""
-             |create table $tableName (
-             |  id int,
-             |  name string,
-             |  price double,
-             |  ts long
-             |) using hudi
-             | options (
-             |  primaryKey ='id',
-             |  type = '$tableType',
-             |  preCombineField = 'ts'
-             | )
-             | partitioned by(ts)
-             | location '$basePath'
-       """.stripMargin)
-        spark.sql(s"insert into $tableName values(1, 'a1', 10, 1000)")
-        spark.sql(s"insert into $tableName values(2, 'a2', 10, 1001)")
-        spark.sql(s"insert into $tableName values(3, 'a3', 10, 1002)")
-
-        // Run clustering on the table with a partition predicate
-        spark.sql(s"call run_clustering(table => '$tableName', predicate => 'ts <= 1001L', order => 'ts')")
-
-        // Check the number of completed clustering instants
-        val fs = new Path(basePath).getFileSystem(spark.sessionState.newHadoopConf())
-        val clusteringInstants = HoodieDataSourceHelpers.allCompletedCommitsCompactions(fs, basePath)
-          .getInstants
-          .iterator().asScala
-          .filter(p => p.getAction == HoodieTimeline.REPLACE_COMMIT_ACTION)
-          .toSeq
-        assertResult(1)(clusteringInstants.size)
-
-        val clusteringInstant = clusteringInstants.head
-        val clusteringPlan = HoodieDataSourceHelpers.getClusteringPlan(fs, basePath, clusteringInstant.getTimestamp)
-        assertResult(true)(clusteringPlan.isPresent)
-        assertResult(2)(clusteringPlan.get().getInputGroups.size())
-
-        checkAnswer(s"call show_clustering(table => '$tableName')")()
-
-        checkAnswer(s"select id, name, price, ts from $tableName order by id")(
-          Seq(1, "a1", 10.0, 1000),
-          Seq(2, "a2", 10.0, 1001),
-          Seq(3, "a3", 10.0, 1002)
-        )
-      }
-    }
-  }
 }

@@ -0,0 +1,344 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.hudi
+
+import org.apache.hadoop.fs.Path
+import org.apache.hudi.common.table.timeline.{HoodieActiveTimeline, HoodieTimeline}
+import org.apache.hudi.common.util.{Option => HOption}
+import org.apache.hudi.{HoodieCLIUtils, HoodieDataSourceHelpers}
+
+import scala.collection.JavaConverters.asScalaIteratorConverter
+
+class TestRunClusteringProcedure extends TestHoodieSqlBase {
+
+  test("Test Call run_clustering Procedure By Table") {
+    withTempDir { tmp =>
+      Seq("cow", "mor").foreach { tableType =>
+        val tableName = generateTableName
+        val basePath = s"${tmp.getCanonicalPath}/$tableName"
+        spark.sql(
+          s"""
+             |create table $tableName (
+             |  id int,
+             |  name string,
+             |  price double,
+             |  ts long
+             |) using hudi
+             | options (
+             |  primaryKey ='id',
+             |  type = '$tableType',
+             |  preCombineField = 'ts'
+             | )
+             | partitioned by(ts)
+             | location '$basePath'
+       """.stripMargin)
+        spark.sql(s"insert into $tableName values(1, 'a1', 10, 1000)")
+        spark.sql(s"insert into $tableName values(2, 'a2', 10, 1001)")
+        spark.sql(s"insert into $tableName values(3, 'a3', 10, 1002)")
+        val client = HoodieCLIUtils.createHoodieClientFromPath(spark, basePath, Map.empty)
+        // Generate the first clustering plan
+        val firstScheduleInstant = HoodieActiveTimeline.createNewInstantTime
+        client.scheduleClusteringAtInstant(firstScheduleInstant, HOption.empty())
+
+        // Generate the second clustering plan
+        spark.sql(s"insert into $tableName values(4, 'a4', 10, 1003)")
+        val secondScheduleInstant = HoodieActiveTimeline.createNewInstantTime
+        client.scheduleClusteringAtInstant(secondScheduleInstant, HOption.empty())
+        checkAnswer(s"call show_clustering('$tableName')")(
+          Seq(firstScheduleInstant, 3),
+          Seq(secondScheduleInstant, 1)
+        )
+
+        // Run clustering for all the clustering plans generated above; no new clustering
+        // instant will be generated because there is no commit after the second
+        // clustering plan was scheduled
+        spark.sql(s"call run_clustering(table => '$tableName', order => 'ts')")
+
+        // No new commits
+        val fs = new Path(basePath).getFileSystem(spark.sessionState.newHadoopConf())
+        assertResult(false)(HoodieDataSourceHelpers.hasNewCommits(fs, basePath, secondScheduleInstant))
+
+        checkAnswer(s"select id, name, price, ts from $tableName order by id")(
+          Seq(1, "a1", 10.0, 1000),
+          Seq(2, "a2", 10.0, 1001),
+          Seq(3, "a3", 10.0, 1002),
+          Seq(4, "a4", 10.0, 1003)
+        )
+        // After clustering there should be no pending clustering.
+        checkAnswer(s"call show_clustering(table => '$tableName')")()
+
+        // Check the number of finished clustering instants
+        val finishedClustering = HoodieDataSourceHelpers.allCompletedCommitsCompactions(fs, basePath)
+          .getInstants
+          .iterator().asScala
+          .filter(p => p.getAction == HoodieTimeline.REPLACE_COMMIT_ACTION)
+          .toSeq
+        assertResult(2)(finishedClustering.size)
+
+        // Run clustering without a manual schedule (a new plan is scheduled automatically if no pending clustering exists)
+        spark.sql(s"insert into $tableName values(5, 'a5', 10, 1004)")
+        spark.sql(s"insert into $tableName values(6, 'a6', 10, 1005)")
+        spark.sql(s"call run_clustering(table => '$tableName', order => 'ts')")
+
+        val thirdClusteringInstant = HoodieDataSourceHelpers.allCompletedCommitsCompactions(fs, basePath)
+          .findInstantsAfter(secondScheduleInstant)
+          .getInstants
+          .iterator().asScala
+          .filter(p => p.getAction == HoodieTimeline.REPLACE_COMMIT_ACTION)
+          .toSeq
+        // Should have a new replace commit after the second clustering command.
+        assertResult(1)(thirdClusteringInstant.size)
+
+        checkAnswer(s"select id, name, price, ts from $tableName order by id")(
+          Seq(1, "a1", 10.0, 1000),
+          Seq(2, "a2", 10.0, 1001),
+          Seq(3, "a3", 10.0, 1002),
+          Seq(4, "a4", 10.0, 1003),
+          Seq(5, "a5", 10.0, 1004),
+          Seq(6, "a6", 10.0, 1005)
+        )
+      }
+    }
+  }
+
+  test("Test Call run_clustering Procedure By Path") {
+    withTempDir { tmp =>
+      Seq("cow", "mor").foreach { tableType =>
+        val tableName = generateTableName
+        val basePath = s"${tmp.getCanonicalPath}/$tableName"
+        spark.sql(
+          s"""
+             |create table $tableName (
+             |  id int,
+             |  name string,
+             |  price double,
+             |  ts long
+             |) using hudi
+             | options (
+             |  primaryKey ='id',
+             |  type = '$tableType',
+             |  preCombineField = 'ts'
+             | )
+             | partitioned by(ts)
+             | location '$basePath'
+       """.stripMargin)
+
+        spark.sql(s"call run_clustering(path => '$basePath')")
+        checkAnswer(s"call show_clustering(path => '$basePath')")()
+
+        spark.sql(s"insert into $tableName values(1, 'a1', 10, 1000)")
+        spark.sql(s"insert into $tableName values(2, 'a2', 10, 1001)")
+        spark.sql(s"insert into $tableName values(3, 'a3', 10, 1002)")
+        val client = HoodieCLIUtils.createHoodieClientFromPath(spark, basePath, Map.empty)
+        // Generate the first clustering plan
+        val firstScheduleInstant = HoodieActiveTimeline.createNewInstantTime
+        client.scheduleClusteringAtInstant(firstScheduleInstant, HOption.empty())
+        checkAnswer(s"call show_clustering(path => '$basePath')")(
+          Seq(firstScheduleInstant, 3)
+        )
+        // Run clustering for all the pending clustering plans
+        spark.sql(s"call run_clustering(path => '$basePath', order => 'ts')")
+        checkAnswer(s"select id, name, price, ts from $tableName order by id")(
+          Seq(1, "a1", 10.0, 1000),
+          Seq(2, "a2", 10.0, 1001),
+          Seq(3, "a3", 10.0, 1002)
+        )
+        val fs = new Path(basePath).getFileSystem(spark.sessionState.newHadoopConf())
+        HoodieDataSourceHelpers.hasNewCommits(fs, basePath, firstScheduleInstant)
+
+        // Check the number of finished clustering instants
+        var finishedClustering = HoodieDataSourceHelpers.allCompletedCommitsCompactions(fs, basePath)
+          .getInstants
+          .iterator().asScala
+          .filter(p => p.getAction == HoodieTimeline.REPLACE_COMMIT_ACTION)
+          .toSeq
+        assertResult(1)(finishedClustering.size)
+
+        // Run clustering without a manual schedule (a new plan is scheduled automatically if no pending clustering exists)
+        spark.sql(s"insert into $tableName values(4, 'a4', 10, 1003)")
+        spark.sql(s"insert into $tableName values(5, 'a5', 10, 1004)")
+        spark.sql(s"call run_clustering(table => '$tableName', predicate => 'ts >= 1003L')")
+        checkAnswer(s"select id, name, price, ts from $tableName order by id")(
+          Seq(1, "a1", 10.0, 1000),
+          Seq(2, "a2", 10.0, 1001),
+          Seq(3, "a3", 10.0, 1002),
+          Seq(4, "a4", 10.0, 1003),
+          Seq(5, "a5", 10.0, 1004)
+        )
+
+        finishedClustering = HoodieDataSourceHelpers.allCompletedCommitsCompactions(fs, basePath)
+          .getInstants
+          .iterator().asScala
+          .filter(p => p.getAction == HoodieTimeline.REPLACE_COMMIT_ACTION)
+          .toSeq
+        assertResult(2)(finishedClustering.size)
+      }
+    }
+  }
+
+  test("Test Call run_clustering Procedure With Partition Pruning") {
+    withTempDir { tmp =>
+      Seq("cow", "mor").foreach { tableType =>
+        val tableName = generateTableName
+        val basePath = s"${tmp.getCanonicalPath}/$tableName"
+        spark.sql(
+          s"""
+             |create table $tableName (
+             |  id int,
+             |  name string,
+             |  price double,
+             |  ts long
+             |) using hudi
+             | options (
+             |  primaryKey ='id',
+             |  type = '$tableType',
+             |  preCombineField = 'ts'
+             | )
+             | partitioned by(ts)
+             | location '$basePath'
+       """.stripMargin)
+
+        val fs = new Path(basePath).getFileSystem(spark.sessionState.newHadoopConf())
+
+        // Test partition pruning with a single predicate
+        {
+          spark.sql(s"insert into $tableName values(1, 'a1', 10, 1000)")
+          spark.sql(s"insert into $tableName values(2, 'a2', 10, 1001)")
+          spark.sql(s"insert into $tableName values(3, 'a3', 10, 1002)")
+
+          checkException(
+            s"call run_clustering(table => '$tableName', predicate => 'ts <= 1001L and id = 10', order => 'ts')"
+          )("Only partition predicates are allowed")
+
+          // Run clustering on the table with a partition predicate
+          spark.sql(s"call run_clustering(table => '$tableName', predicate => 'ts <= 1001L', order => 'ts')")
+
+          // There is 1 completed clustering instant
+          val clusteringInstants = HoodieDataSourceHelpers.allCompletedCommitsCompactions(fs, basePath)
+            .getInstants
+            .iterator().asScala
+            .filter(p => p.getAction == HoodieTimeline.REPLACE_COMMIT_ACTION)
+            .toSeq
+          assertResult(1)(clusteringInstants.size)
+
+          // The latest clustering should contain 2 file groups
+          val clusteringInstant = clusteringInstants.last
+          val clusteringPlan = HoodieDataSourceHelpers.getClusteringPlan(fs, basePath, clusteringInstant.getTimestamp)
+          assertResult(true)(clusteringPlan.isPresent)
+          assertResult(2)(clusteringPlan.get().getInputGroups.size())
+
+          // No pending clustering instant
+          checkAnswer(s"call show_clustering(table => '$tableName')")()
+
+          checkAnswer(s"select id, name, price, ts from $tableName order by id")(
+            Seq(1, "a1", 10.0, 1000),
+            Seq(2, "a2", 10.0, 1001),
+            Seq(3, "a3", 10.0, 1002)
+          )
+        }
+
+        // Test partition pruning with {@code And} predicates
+        {
+          spark.sql(s"insert into $tableName values(4, 'a4', 10, 1003)")
+          spark.sql(s"insert into $tableName values(5, 'a5', 10, 1004)")
+          spark.sql(s"insert into $tableName values(6, 'a6', 10, 1005)")
+
+          checkException(
+            s"call run_clustering(table => '$tableName', predicate => 'ts > 1001L and ts <= 1005L and id = 10', order => 'ts')"
+          )("Only partition predicates are allowed")
+
+          // Run clustering on the table with a partition predicate
+          spark.sql(s"call run_clustering(table => '$tableName', predicate => 'ts > 1001L and ts <= 1005L', order => 'ts')")
+
+          // There are 2 completed clustering instants
+          val clusteringInstants = HoodieDataSourceHelpers.allCompletedCommitsCompactions(fs, basePath)
+            .getInstants
+            .iterator().asScala
+            .filter(p => p.getAction == HoodieTimeline.REPLACE_COMMIT_ACTION)
+            .toSeq
+          assertResult(2)(clusteringInstants.size)
+
+          // The latest clustering should contain 4 file groups (1002, 1003, 1004, 1005)
+          val clusteringInstant = clusteringInstants.last
+          val clusteringPlan = HoodieDataSourceHelpers.getClusteringPlan(fs, basePath, clusteringInstant.getTimestamp)
+          assertResult(true)(clusteringPlan.isPresent)
+          assertResult(4)(clusteringPlan.get().getInputGroups.size())
+
+          // No pending clustering instant
+          checkAnswer(s"call show_clustering(table => '$tableName')")()
+
+          checkAnswer(s"select id, name, price, ts from $tableName order by id")(
+            Seq(1, "a1", 10.0, 1000),
+            Seq(2, "a2", 10.0, 1001),
+            Seq(3, "a3", 10.0, 1002),
+            Seq(4, "a4", 10.0, 1003),
+            Seq(5, "a5", 10.0, 1004),
+            Seq(6, "a6", 10.0, 1005)
+          )
+        }
+
+        // Test partition pruning with {@code And}-{@code Or} predicates
+        {
+          spark.sql(s"insert into $tableName values(7, 'a7', 10, 1006)")
+          spark.sql(s"insert into $tableName values(8, 'a8', 10, 1007)")
+          spark.sql(s"insert into $tableName values(9, 'a9', 10, 1008)")
+          spark.sql(s"insert into $tableName values(10, 'a10', 10, 1009)")
+
+          checkException(
+            s"call run_clustering(table => '$tableName', predicate => 'ts < 1007L or ts >= 1008L or id = 10', order => 'ts')"
+          )("Only partition predicates are allowed")
+
+          // Run clustering on the table with a partition predicate
+          spark.sql(s"call run_clustering(table => '$tableName', predicate => '(ts >= 1006L and ts < 1008L) or ts >= 1009L', order => 'ts')")
+
+          // There are 3 completed clustering instants
+          val clusteringInstants = HoodieDataSourceHelpers.allCompletedCommitsCompactions(fs, basePath)
+            .getInstants
+            .iterator().asScala
+            .filter(p => p.getAction == HoodieTimeline.REPLACE_COMMIT_ACTION)
+            .toSeq
+          assertResult(3)(clusteringInstants.size)
+
+          // The latest clustering should contain 3 file groups (1006, 1007, 1009)
+          val clusteringInstant = clusteringInstants.last
+          val clusteringPlan = HoodieDataSourceHelpers.getClusteringPlan(fs, basePath, clusteringInstant.getTimestamp)
+          assertResult(true)(clusteringPlan.isPresent)
+          assertResult(3)(clusteringPlan.get().getInputGroups.size())
+
+          // No pending clustering instant
+          checkAnswer(s"call show_clustering(table => '$tableName')")()
+
+          checkAnswer(s"select id, name, price, ts from $tableName order by id")(
+            Seq(1, "a1", 10.0, 1000),
+            Seq(2, "a2", 10.0, 1001),
+            Seq(3, "a3", 10.0, 1002),
+            Seq(4, "a4", 10.0, 1003),
+            Seq(5, "a5", 10.0, 1004),
+            Seq(6, "a6", 10.0, 1005),
+            Seq(7, "a7", 10.0, 1006),
+            Seq(8, "a8", 10.0, 1007),
+            Seq(9, "a9", 10.0, 1008),
+            Seq(10, "a10", 10.0, 1009)
+          )
+        }
+      }
+    }
+  }
+}