1
0

[HUDI-2182] Support Compaction Command For Spark Sql (#3277)

This commit is contained in:
pengzhiwei
2021-08-06 15:12:10 +08:00
committed by GitHub
parent 20feb1a897
commit 3f8ca1a355
20 changed files with 811 additions and 21 deletions

View File

@@ -0,0 +1,129 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.hudi
/**
 * SQL-level tests for the `schedule compaction`, `show compaction` and `run compaction`
 * statements, addressing the table once by its name and once by its storage path.
 */
class TestCompactionTable extends TestHoodieSqlBase {

  /**
   * Creates a `mor` Hudi table at `location`, sets `hoodie.parquet.max.file.size` to 10000 and
   * inserts rows with ids `1..rowCount`, each through its own `insert` statement.
   *
   * NOTE(review): the `set` statement is not reverted anywhere in this suite, so the value
   * presumably stays on the session after the test -- confirm the base class isolates it.
   *
   * @param tableName name of the table to create
   * @param location  path used as the table location
   * @param rowCount  number of seed rows to insert
   */
  private def createSeededMorTable(tableName: String, location: String, rowCount: Int): Unit = {
    // The literal is closed on a margin line so that stripMargin yields a string ending in ")\n".
    spark.sql(
      s"""
         |create table $tableName (
         | id int,
         | name string,
         | price double,
         | ts long
         |) using hudi
         | location '$location'
         | options (
         | primaryKey ='id',
         | type = 'mor',
         | preCombineField = 'ts'
         | )
         |""".stripMargin)
    spark.sql("set hoodie.parquet.max.file.size = 10000")
    (1 to rowCount).foreach { id =>
      spark.sql(s"insert into $tableName values($id, 'a$id', 10, 1000)")
    }
  }

  /**
   * Counts the rows returned by `show compaction on <target>`.
   *
   * @param target table name, or an already single-quoted table path
   */
  private def pendingCompactionCount(target: String): Int =
    spark.sql(s"show compaction on $target").collect().length

  test("Test compaction table") {
    withTempDir { tmp =>
      val tableName = generateTableName
      val verifyQuery = s"select id, name, price, ts from $tableName order by id"
      createSeededMorTable(tableName, tmp.getCanonicalPath, 4)

      // Two updates, each followed by its own scheduled compaction.
      spark.sql(s"update $tableName set price = 11 where id = 1")
      spark.sql(s"schedule compaction on $tableName")
      spark.sql(s"update $tableName set price = 12 where id = 2")
      spark.sql(s"schedule compaction on $tableName")

      // The first column of each `show compaction` row is used as the instant for `run ... at`.
      val listedInstants =
        spark.sql(s"show compaction on $tableName limit 10").collect().map(_.getString(0))
      assertResult(2)(listedInstants.length)
      val listedFirst = listedInstants(0)
      val listedSecond = listedInstants(1)

      // Run the instant that was listed second; exactly one compaction must remain listed.
      spark.sql(s"run compaction on $tableName at $listedSecond")
      checkAnswer(verifyQuery)(
        Seq(1, "a1", 11.0, 1000),
        Seq(2, "a2", 12.0, 1000),
        Seq(3, "a3", 10.0, 1000),
        Seq(4, "a4", 10.0, 1000)
      )
      assertResult(1)(pendingCompactionCount(tableName))

      // Run the remaining instant; the query result is unchanged and nothing is listed anymore.
      spark.sql(s"run compaction on $tableName at $listedFirst")
      checkAnswer(verifyQuery)(
        Seq(1, "a1", 11.0, 1000),
        Seq(2, "a2", 12.0, 1000),
        Seq(3, "a3", 10.0, 1000),
        Seq(4, "a4", 10.0, 1000)
      )
      assertResult(0)(pendingCompactionCount(tableName))
    }
  }

  test("Test compaction path") {
    withTempDir { tmp =>
      val tableName = generateTableName
      val basePath = tmp.getCanonicalPath
      // The compaction statements below address the table by its single-quoted path.
      val quotedPath = s"'$basePath'"
      val verifyQuery = s"select id, name, price, ts from $tableName order by id"
      createSeededMorTable(tableName, basePath, 3)

      // `run compaction` without an instant, issued before anything has been scheduled.
      spark.sql(s"update $tableName set price = 11 where id = 1")
      spark.sql(s"run compaction on $quotedPath")
      checkAnswer(verifyQuery)(
        Seq(1, "a1", 11.0, 1000),
        Seq(2, "a2", 10.0, 1000),
        Seq(3, "a3", 10.0, 1000)
      )
      assertResult(0)(pendingCompactionCount(quotedPath))

      // Schedule the first compaction.
      spark.sql(s"update $tableName set price = 12 where id = 1")
      spark.sql(s"schedule compaction on $quotedPath")
      // Schedule the second compaction.
      spark.sql(s"update $tableName set price = 12 where id = 2")
      spark.sql(s"schedule compaction on $quotedPath")
      // Both scheduled compactions must be listed.
      assertResult(2)(pendingCompactionCount(quotedPath))

      // Run every scheduled compaction at once; afterwards none is listed.
      spark.sql(s"run compaction on $quotedPath")
      checkAnswer(verifyQuery)(
        Seq(1, "a1", 12.0, 1000),
        Seq(2, "a2", 12.0, 1000),
        Seq(3, "a3", 10.0, 1000)
      )
      assertResult(0)(pendingCompactionCount(quotedPath))

      // An instant that does not exist must be rejected with this message.
      checkException(s"run compaction on $quotedPath at 12345")(
        s"Compaction instant: 12345 is not found in $basePath, Available pending compaction instants are: "
      )
    }
  }
}