[HUDI-4116] Unify clustering/compaction related procedures' output type (#5620)
* Unify clustering/compaction related procedures' output type * Address review comments
This commit is contained in:
@@ -20,10 +20,9 @@
|
||||
package org.apache.spark.sql.hudi.procedure
|
||||
|
||||
import org.apache.hadoop.fs.Path
|
||||
import org.apache.hudi.common.table.timeline.{HoodieActiveTimeline, HoodieTimeline}
|
||||
import org.apache.hudi.common.table.timeline.{HoodieActiveTimeline, HoodieInstant, HoodieTimeline}
|
||||
import org.apache.hudi.common.util.{Option => HOption}
|
||||
import org.apache.hudi.{HoodieCLIUtils, HoodieDataSourceHelpers}
|
||||
|
||||
import org.apache.spark.sql.hudi.HoodieSparkSqlTestBase
|
||||
|
||||
import scala.collection.JavaConverters.asScalaIteratorConverter
|
||||
@@ -64,28 +63,22 @@ class TestClusteringProcedure extends HoodieSparkSqlTestBase {
|
||||
val secondScheduleInstant = HoodieActiveTimeline.createNewInstantTime
|
||||
client.scheduleClusteringAtInstant(secondScheduleInstant, HOption.empty())
|
||||
checkAnswer(s"call show_clustering('$tableName')")(
|
||||
Seq(firstScheduleInstant, 3),
|
||||
Seq(secondScheduleInstant, 1)
|
||||
Seq(secondScheduleInstant, 1, HoodieInstant.State.REQUESTED.name(), "*"),
|
||||
Seq(firstScheduleInstant, 3, HoodieInstant.State.REQUESTED.name(), "*")
|
||||
)
|
||||
|
||||
// Do clustering for all clustering plans generated above, and no new clustering
|
||||
// instant will be generated because there is no commit after the second
|
||||
// clustering plan generated
|
||||
spark.sql(s"call run_clustering(table => '$tableName', order => 'ts')")
|
||||
checkAnswer(s"call run_clustering(table => '$tableName', order => 'ts', show_involved_partition => true)")(
|
||||
Seq(secondScheduleInstant, 1, HoodieInstant.State.COMPLETED.name(), "ts=1003"),
|
||||
Seq(firstScheduleInstant, 3, HoodieInstant.State.COMPLETED.name(), "ts=1000,ts=1001,ts=1002")
|
||||
)
|
||||
|
||||
// No new commits
|
||||
val fs = new Path(basePath).getFileSystem(spark.sessionState.newHadoopConf())
|
||||
assertResult(false)(HoodieDataSourceHelpers.hasNewCommits(fs, basePath, secondScheduleInstant))
|
||||
|
||||
checkAnswer(s"select id, name, price, ts from $tableName order by id")(
|
||||
Seq(1, "a1", 10.0, 1000),
|
||||
Seq(2, "a2", 10.0, 1001),
|
||||
Seq(3, "a3", 10.0, 1002),
|
||||
Seq(4, "a4", 10.0, 1003)
|
||||
)
|
||||
// After clustering there should be no pending clustering.
|
||||
checkAnswer(s"call show_clustering(table => '$tableName')")()
|
||||
|
||||
// Check the number of finished clustering instants
|
||||
val finishedClustering = HoodieDataSourceHelpers.allCompletedCommitsCompactions(fs, basePath)
|
||||
.getInstants
|
||||
@@ -94,10 +87,23 @@ class TestClusteringProcedure extends HoodieSparkSqlTestBase {
|
||||
.toSeq
|
||||
assertResult(2)(finishedClustering.size)
|
||||
|
||||
checkAnswer(s"select id, name, price, ts from $tableName order by id")(
|
||||
Seq(1, "a1", 10.0, 1000),
|
||||
Seq(2, "a2", 10.0, 1001),
|
||||
Seq(3, "a3", 10.0, 1002),
|
||||
Seq(4, "a4", 10.0, 1003)
|
||||
)
|
||||
|
||||
// After clustering there should be no pending clustering and all clustering instants should be completed
|
||||
checkAnswer(s"call show_clustering(table => '$tableName')")(
|
||||
Seq(secondScheduleInstant, 1, HoodieInstant.State.COMPLETED.name(), "*"),
|
||||
Seq(firstScheduleInstant, 3, HoodieInstant.State.COMPLETED.name(), "*")
|
||||
)
|
||||
|
||||
// Do clustering without a manual schedule (which will do the schedule if no pending clustering exists)
|
||||
spark.sql(s"insert into $tableName values(5, 'a5', 10, 1004)")
|
||||
spark.sql(s"insert into $tableName values(6, 'a6', 10, 1005)")
|
||||
spark.sql(s"call run_clustering(table => '$tableName', order => 'ts')")
|
||||
spark.sql(s"call run_clustering(table => '$tableName', order => 'ts', show_involved_partition => true)").show()
|
||||
|
||||
val thirdClusteringInstant = HoodieDataSourceHelpers.allCompletedCommitsCompactions(fs, basePath)
|
||||
.findInstantsAfter(secondScheduleInstant)
|
||||
@@ -142,7 +148,7 @@ class TestClusteringProcedure extends HoodieSparkSqlTestBase {
|
||||
| location '$basePath'
|
||||
""".stripMargin)
|
||||
|
||||
spark.sql(s"call run_clustering(path => '$basePath')")
|
||||
spark.sql(s"call run_clustering(path => '$basePath')").show()
|
||||
checkAnswer(s"call show_clustering(path => '$basePath')")()
|
||||
|
||||
spark.sql(s"insert into $tableName values(1, 'a1', 10, 1000)")
|
||||
@@ -152,18 +158,22 @@ class TestClusteringProcedure extends HoodieSparkSqlTestBase {
|
||||
// Generate the first clustering plan
|
||||
val firstScheduleInstant = HoodieActiveTimeline.createNewInstantTime
|
||||
client.scheduleClusteringAtInstant(firstScheduleInstant, HOption.empty())
|
||||
checkAnswer(s"call show_clustering(path => '$basePath')")(
|
||||
Seq(firstScheduleInstant, 3)
|
||||
checkAnswer(s"call show_clustering(path => '$basePath', show_involved_partition => true)")(
|
||||
Seq(firstScheduleInstant, 3, HoodieInstant.State.REQUESTED.name(), "ts=1000,ts=1001,ts=1002")
|
||||
)
|
||||
// Do clustering for all the clustering plans
|
||||
spark.sql(s"call run_clustering(path => '$basePath', order => 'ts')")
|
||||
checkAnswer(s"call run_clustering(path => '$basePath', order => 'ts')")(
|
||||
Seq(firstScheduleInstant, 3, HoodieInstant.State.COMPLETED.name(), "*")
|
||||
)
|
||||
|
||||
checkAnswer(s"select id, name, price, ts from $tableName order by id")(
|
||||
Seq(1, "a1", 10.0, 1000),
|
||||
Seq(2, "a2", 10.0, 1001),
|
||||
Seq(3, "a3", 10.0, 1002)
|
||||
)
|
||||
|
||||
val fs = new Path(basePath).getFileSystem(spark.sessionState.newHadoopConf())
|
||||
HoodieDataSourceHelpers.hasNewCommits(fs, basePath, firstScheduleInstant)
|
||||
assertResult(false)(HoodieDataSourceHelpers.hasNewCommits(fs, basePath, firstScheduleInstant))
|
||||
|
||||
// Check the number of finished clustering instants
|
||||
var finishedClustering = HoodieDataSourceHelpers.allCompletedCommitsCompactions(fs, basePath)
|
||||
@@ -176,7 +186,12 @@ class TestClusteringProcedure extends HoodieSparkSqlTestBase {
|
||||
// Do clustering without a manual schedule (which will do the schedule if no pending clustering exists)
|
||||
spark.sql(s"insert into $tableName values(4, 'a4', 10, 1003)")
|
||||
spark.sql(s"insert into $tableName values(5, 'a5', 10, 1004)")
|
||||
spark.sql(s"call run_clustering(table => '$tableName', predicate => 'ts >= 1003L')")
|
||||
val resultA = spark.sql(s"call run_clustering(table => '$tableName', predicate => 'ts >= 1003L', show_involved_partition => true)")
|
||||
.collect()
|
||||
.map(row => Seq(row.getString(0), row.getInt(1), row.getString(2), row.getString(3)))
|
||||
assertResult(1)(resultA.length)
|
||||
assertResult("ts=1003,ts=1004")(resultA(0)(3))
|
||||
|
||||
checkAnswer(s"select id, name, price, ts from $tableName order by id")(
|
||||
Seq(1, "a1", 10.0, 1000),
|
||||
Seq(2, "a2", 10.0, 1001),
|
||||
@@ -220,6 +235,8 @@ class TestClusteringProcedure extends HoodieSparkSqlTestBase {
|
||||
val fs = new Path(basePath).getFileSystem(spark.sessionState.newHadoopConf())
|
||||
|
||||
// Test partition pruning with single predicate
|
||||
var resultA: Array[Seq[Any]] = Array.empty
|
||||
|
||||
{
|
||||
spark.sql(s"insert into $tableName values(1, 'a1', 10, 1000)")
|
||||
spark.sql(s"insert into $tableName values(2, 'a2', 10, 1001)")
|
||||
@@ -230,7 +247,11 @@ class TestClusteringProcedure extends HoodieSparkSqlTestBase {
|
||||
)("Only partition predicates are allowed")
|
||||
|
||||
// Do clustering on the table with a partition predicate
|
||||
spark.sql(s"call run_clustering(table => '$tableName', predicate => 'ts <= 1001L', order => 'ts')")
|
||||
resultA = spark.sql(s"call run_clustering(table => '$tableName', predicate => 'ts <= 1001L', order => 'ts', show_involved_partition => true)")
|
||||
.collect()
|
||||
.map(row => Seq(row.getString(0), row.getInt(1), row.getString(2), row.getString(3)))
|
||||
assertResult(1)(resultA.length)
|
||||
assertResult("ts=1000,ts=1001")(resultA(0)(3))
|
||||
|
||||
// There is 1 completed clustering instant
|
||||
val clusteringInstants = HoodieDataSourceHelpers.allCompletedCommitsCompactions(fs, basePath)
|
||||
@@ -245,9 +266,12 @@ class TestClusteringProcedure extends HoodieSparkSqlTestBase {
|
||||
val clusteringPlan = HoodieDataSourceHelpers.getClusteringPlan(fs, basePath, clusteringInstant.getTimestamp)
|
||||
assertResult(true)(clusteringPlan.isPresent)
|
||||
assertResult(2)(clusteringPlan.get().getInputGroups.size())
|
||||
assertResult(resultA(0)(1))(clusteringPlan.get().getInputGroups.size())
|
||||
|
||||
// No pending clustering instant
|
||||
checkAnswer(s"call show_clustering(table => '$tableName')")()
|
||||
// All clustering instants are completed
|
||||
checkAnswer(s"call show_clustering(table => '$tableName', show_involved_partition => true)")(
|
||||
Seq(resultA(0).head, resultA(0)(1), HoodieInstant.State.COMPLETED.name(), "ts=1000,ts=1001")
|
||||
)
|
||||
|
||||
checkAnswer(s"select id, name, price, ts from $tableName order by id")(
|
||||
Seq(1, "a1", 10.0, 1000),
|
||||
@@ -257,6 +281,8 @@ class TestClusteringProcedure extends HoodieSparkSqlTestBase {
|
||||
}
|
||||
|
||||
// Test partition pruning with {@code And} predicates
|
||||
var resultB: Array[Seq[Any]] = Array.empty
|
||||
|
||||
{
|
||||
spark.sql(s"insert into $tableName values(4, 'a4', 10, 1003)")
|
||||
spark.sql(s"insert into $tableName values(5, 'a5', 10, 1004)")
|
||||
@@ -267,7 +293,11 @@ class TestClusteringProcedure extends HoodieSparkSqlTestBase {
|
||||
)("Only partition predicates are allowed")
|
||||
|
||||
// Do clustering on the table with a partition predicate
|
||||
spark.sql(s"call run_clustering(table => '$tableName', predicate => 'ts > 1001L and ts <= 1005L', order => 'ts')")
|
||||
resultB = spark.sql(s"call run_clustering(table => '$tableName', predicate => 'ts > 1001L and ts <= 1005L', order => 'ts', show_involved_partition => true)")
|
||||
.collect()
|
||||
.map(row => Seq(row.getString(0), row.getInt(1), row.getString(2), row.getString(3)))
|
||||
assertResult(1)(resultB.length)
|
||||
assertResult("ts=1002,ts=1003,ts=1004,ts=1005")(resultB(0)(3))
|
||||
|
||||
// There are 2 completed clustering instants
|
||||
val clusteringInstants = HoodieDataSourceHelpers.allCompletedCommitsCompactions(fs, basePath)
|
||||
@@ -283,8 +313,11 @@ class TestClusteringProcedure extends HoodieSparkSqlTestBase {
|
||||
assertResult(true)(clusteringPlan.isPresent)
|
||||
assertResult(4)(clusteringPlan.get().getInputGroups.size())
|
||||
|
||||
// No pending clustering instant
|
||||
checkAnswer(s"call show_clustering(table => '$tableName')")()
|
||||
// All clustering instants are completed
|
||||
checkAnswer(s"call show_clustering(table => '$tableName', show_involved_partition => true)")(
|
||||
Seq(resultA(0).head, resultA(0)(1), HoodieInstant.State.COMPLETED.name(), "ts=1000,ts=1001"),
|
||||
Seq(resultB(0).head, resultB(0)(1), HoodieInstant.State.COMPLETED.name(), "ts=1002,ts=1003,ts=1004,ts=1005")
|
||||
)
|
||||
|
||||
checkAnswer(s"select id, name, price, ts from $tableName order by id")(
|
||||
Seq(1, "a1", 10.0, 1000),
|
||||
@@ -297,6 +330,8 @@ class TestClusteringProcedure extends HoodieSparkSqlTestBase {
|
||||
}
|
||||
|
||||
// Test partition pruning with {@code And}-{@code Or} predicates
|
||||
var resultC: Array[Seq[Any]] = Array.empty
|
||||
|
||||
{
|
||||
spark.sql(s"insert into $tableName values(7, 'a7', 10, 1006)")
|
||||
spark.sql(s"insert into $tableName values(8, 'a8', 10, 1007)")
|
||||
@@ -308,7 +343,11 @@ class TestClusteringProcedure extends HoodieSparkSqlTestBase {
|
||||
)("Only partition predicates are allowed")
|
||||
|
||||
// Do clustering on the table with a partition predicate
|
||||
spark.sql(s"call run_clustering(table => '$tableName', predicate => '(ts >= 1006L and ts < 1008L) or ts >= 1009L', order => 'ts')")
|
||||
resultC = spark.sql(s"call run_clustering(table => '$tableName', predicate => '(ts >= 1006L and ts < 1008L) or ts >= 1009L', order => 'ts', show_involved_partition => true)")
|
||||
.collect()
|
||||
.map(row => Seq(row.getString(0), row.getInt(1), row.getString(2), row.getString(3)))
|
||||
assertResult(1)(resultC.length)
|
||||
assertResult("ts=1006,ts=1007,ts=1009")(resultC(0)(3))
|
||||
|
||||
// There are 3 completed clustering instants
|
||||
val clusteringInstants = HoodieDataSourceHelpers.allCompletedCommitsCompactions(fs, basePath)
|
||||
@@ -324,8 +363,12 @@ class TestClusteringProcedure extends HoodieSparkSqlTestBase {
|
||||
assertResult(true)(clusteringPlan.isPresent)
|
||||
assertResult(3)(clusteringPlan.get().getInputGroups.size())
|
||||
|
||||
// No pending clustering instant
|
||||
checkAnswer(s"call show_clustering(table => '$tableName')")()
|
||||
// All clustering instants are completed
|
||||
checkAnswer(s"call show_clustering(table => '$tableName', show_involved_partition => true)")(
|
||||
Seq(resultA(0).head, resultA(0)(1), HoodieInstant.State.COMPLETED.name(), "ts=1000,ts=1001"),
|
||||
Seq(resultB(0).head, resultB(0)(1), HoodieInstant.State.COMPLETED.name(), "ts=1002,ts=1003,ts=1004,ts=1005"),
|
||||
Seq(resultC(0).head, resultC(0)(1), HoodieInstant.State.COMPLETED.name(), "ts=1006,ts=1007,ts=1009")
|
||||
)
|
||||
|
||||
checkAnswer(s"select id, name, price, ts from $tableName order by id")(
|
||||
Seq(1, "a1", 10.0, 1000),
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
|
||||
package org.apache.spark.sql.hudi.procedure
|
||||
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant
|
||||
import org.apache.spark.sql.hudi.HoodieSparkSqlTestBase
|
||||
|
||||
class TestCompactionProcedure extends HoodieSparkSqlTestBase {
|
||||
@@ -48,22 +49,52 @@ class TestCompactionProcedure extends HoodieSparkSqlTestBase {
|
||||
spark.sql(s"insert into $tableName values(4, 'a4', 10, 1000)")
|
||||
spark.sql(s"update $tableName set price = 11 where id = 1")
|
||||
|
||||
spark.sql(s"call run_compaction(op => 'schedule', table => '$tableName')")
|
||||
// Schedule the first compaction
|
||||
val resultA = spark.sql(s"call run_compaction(op => 'schedule', table => '$tableName')")
|
||||
.collect()
|
||||
.map(row => Seq(row.getString(0), row.getInt(1), row.getString(2)))
|
||||
|
||||
spark.sql(s"update $tableName set price = 12 where id = 2")
|
||||
spark.sql(s"call run_compaction('schedule', '$tableName')")
|
||||
val compactionRows = spark.sql(s"call show_compaction(table => '$tableName', limit => 10)").collect()
|
||||
val timestamps = compactionRows.map(_.getString(0))
|
||||
|
||||
// Schedule the second compaction
|
||||
val resultB = spark.sql(s"call run_compaction('schedule', '$tableName')")
|
||||
.collect()
|
||||
.map(row => Seq(row.getString(0), row.getInt(1), row.getString(2)))
|
||||
|
||||
assertResult(1)(resultA.length)
|
||||
assertResult(1)(resultB.length)
|
||||
val showCompactionSql: String = s"call show_compaction(table => '$tableName', limit => 10)"
|
||||
checkAnswer(showCompactionSql)(
|
||||
resultA(0),
|
||||
resultB(0)
|
||||
)
|
||||
|
||||
val compactionRows = spark.sql(showCompactionSql).collect()
|
||||
val timestamps = compactionRows.map(_.getString(0)).sorted
|
||||
assertResult(2)(timestamps.length)
|
||||
|
||||
spark.sql(s"call run_compaction(op => 'run', table => '$tableName', timestamp => ${timestamps(1)})")
|
||||
// Actually execute the second scheduled compaction instant
|
||||
checkAnswer(s"call run_compaction(op => 'run', table => '$tableName', timestamp => ${timestamps(1)})")(
|
||||
Seq(resultB(0).head, resultB(0)(1), HoodieInstant.State.COMPLETED.name())
|
||||
)
|
||||
checkAnswer(s"select id, name, price, ts from $tableName order by id")(
|
||||
Seq(1, "a1", 11.0, 1000),
|
||||
Seq(2, "a2", 12.0, 1000),
|
||||
Seq(3, "a3", 10.0, 1000),
|
||||
Seq(4, "a4", 10.0, 1000)
|
||||
)
|
||||
assertResult(1)(spark.sql(s"call show_compaction('$tableName')").collect().length)
|
||||
spark.sql(s"call run_compaction(op => 'run', table => '$tableName', timestamp => ${timestamps(0)})")
|
||||
|
||||
// A compaction action eventually becomes a commit when completed, so show_compaction
|
||||
// can only see the first scheduled compaction instant
|
||||
val resultC = spark.sql(s"call show_compaction('$tableName')")
|
||||
.collect()
|
||||
.map(row => Seq(row.getString(0), row.getInt(1), row.getString(2)))
|
||||
assertResult(1)(resultC.length)
|
||||
assertResult(resultA)(resultC)
|
||||
|
||||
checkAnswer(s"call run_compaction(op => 'run', table => '$tableName', timestamp => ${timestamps(0)})")(
|
||||
Seq(resultA(0).head, resultA(0)(1), HoodieInstant.State.COMPLETED.name())
|
||||
)
|
||||
checkAnswer(s"select id, name, price, ts from $tableName order by id")(
|
||||
Seq(1, "a1", 11.0, 1000),
|
||||
Seq(2, "a2", 12.0, 1000),
|
||||
@@ -98,25 +129,40 @@ class TestCompactionProcedure extends HoodieSparkSqlTestBase {
|
||||
spark.sql(s"insert into $tableName values(3, 'a3', 10, 1000)")
|
||||
spark.sql(s"update $tableName set price = 11 where id = 1")
|
||||
|
||||
spark.sql(s"call run_compaction(op => 'run', path => '${tmp.getCanonicalPath}')")
|
||||
checkAnswer(s"call run_compaction(op => 'run', path => '${tmp.getCanonicalPath}')")()
|
||||
checkAnswer(s"select id, name, price, ts from $tableName order by id")(
|
||||
Seq(1, "a1", 11.0, 1000),
|
||||
Seq(2, "a2", 10.0, 1000),
|
||||
Seq(3, "a3", 10.0, 1000)
|
||||
)
|
||||
assertResult(0)(spark.sql(s"call show_compaction(path => '${tmp.getCanonicalPath}')").collect().length)
|
||||
// schedule compaction first
|
||||
|
||||
spark.sql(s"update $tableName set price = 12 where id = 1")
|
||||
spark.sql(s"call run_compaction(op=> 'schedule', path => '${tmp.getCanonicalPath}')")
|
||||
|
||||
// schedule compaction second
|
||||
// Schedule the first compaction
|
||||
val resultA = spark.sql(s"call run_compaction(op=> 'schedule', path => '${tmp.getCanonicalPath}')")
|
||||
.collect()
|
||||
.map(row => Seq(row.getString(0), row.getInt(1), row.getString(2)))
|
||||
|
||||
spark.sql(s"update $tableName set price = 12 where id = 2")
|
||||
spark.sql(s"call run_compaction(op => 'schedule', path => '${tmp.getCanonicalPath}')")
|
||||
|
||||
// show compaction
|
||||
assertResult(2)(spark.sql(s"call show_compaction(path => '${tmp.getCanonicalPath}')").collect().length)
|
||||
// run compaction for all the scheduled compaction
|
||||
spark.sql(s"call run_compaction(op => 'run', path => '${tmp.getCanonicalPath}')")
|
||||
// Schedule the second compaction
|
||||
val resultB = spark.sql(s"call run_compaction(op => 'schedule', path => '${tmp.getCanonicalPath}')")
|
||||
.collect()
|
||||
.map(row => Seq(row.getString(0), row.getInt(1), row.getString(2)))
|
||||
|
||||
assertResult(1)(resultA.length)
|
||||
assertResult(1)(resultB.length)
|
||||
checkAnswer(s"call show_compaction(path => '${tmp.getCanonicalPath}')")(
|
||||
resultA(0),
|
||||
resultB(0)
|
||||
)
|
||||
|
||||
// Run compaction for all the scheduled compactions
|
||||
checkAnswer(s"call run_compaction(op => 'run', path => '${tmp.getCanonicalPath}')")(
|
||||
Seq(resultA(0).head, resultA(0)(1), HoodieInstant.State.COMPLETED.name()),
|
||||
Seq(resultB(0).head, resultB(0)(1), HoodieInstant.State.COMPLETED.name())
|
||||
)
|
||||
|
||||
checkAnswer(s"select id, name, price, ts from $tableName order by id")(
|
||||
Seq(1, "a1", 12.0, 1000),
|
||||
|
||||
Reference in New Issue
Block a user