[HUDI-4081][HUDI-4472] Addressing Spark SQL vs Spark DS performance gap (#6213)
This commit is contained in:
@@ -145,11 +145,23 @@ class HoodieSparkSqlTestBase extends FunSuite with BeforeAndAfterAll {
|
||||
assertResult(true)(hasException)
|
||||
}
|
||||
|
||||
|
||||
protected def removeQuotes(value: Any): Any = {
|
||||
def dropTypeLiteralPrefix(value: Any): Any = {
|
||||
value match {
|
||||
case s: String => s.stripPrefix("'").stripSuffix("'")
|
||||
case _=> value
|
||||
case s: String =>
|
||||
s.stripPrefix("DATE").stripPrefix("TIMESTAMP").stripPrefix("X")
|
||||
case _ => value
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Reduces a SQL literal used in the tests to the raw text it carries.
 *
 * For a String input, the type-literal prefix (like "DATE" or "TIMESTAMP") is dropped
 * first via `dropTypeLiteralPrefix`, and then one leading and one trailing single quote
 * are removed. Any non-String input is handed back untouched.
 *
 * @param value literal as written in the test (a String literal or any other value)
 * @return the unwrapped text for String inputs; `value` itself otherwise
 */
protected def extractRawValue(value: Any): Any = value match {
  case literal: String =>
    // Data-type prefixes like "DATE", "TIMESTAMP" have to go before the quotes,
    // since the opening quote only becomes the first character once they are removed
    val withoutTypePrefix = dropTypeLiteralPrefix(literal).asInstanceOf[String]
    withoutTypePrefix.stripPrefix("'").stripSuffix("'")
  case other => other
}
|
||||
|
||||
|
||||
@@ -18,6 +18,7 @@
|
||||
package org.apache.spark.sql.hudi
|
||||
|
||||
import org.apache.hudi.DataSourceWriteOptions._
|
||||
import org.apache.hudi.HoodieSparkUtils.isSpark2
|
||||
import org.apache.hudi.config.HoodieWriteConfig
|
||||
import org.apache.hudi.keygen.SimpleKeyGenerator
|
||||
import org.apache.spark.sql.SaveMode
|
||||
@@ -93,11 +94,20 @@ class TestDeleteTable extends HoodieSparkSqlTestBase {
|
||||
""".stripMargin)
|
||||
|
||||
// insert data to table
|
||||
spark.sql(
|
||||
s"""
|
||||
|insert into $tableName
|
||||
|values (1, 'a1', 10.0, 1000), (2, 'a2', 20.0, 1000), (3, 'a2', 30.0, 1000)
|
||||
""".stripMargin)
|
||||
if (isSpark2) {
|
||||
spark.sql(
|
||||
s"""
|
||||
|insert into $tableName
|
||||
|values (1, 'a1', cast(10.0 as double), 1000), (2, 'a2', cast(20.0 as double), 1000), (3, 'a2', cast(30.0 as double), 1000)
|
||||
|""".stripMargin)
|
||||
} else {
|
||||
spark.sql(
|
||||
s"""
|
||||
|insert into $tableName
|
||||
|values (1, 'a1', 10.0, 1000), (2, 'a2', 20.0, 1000), (3, 'a2', 30.0, 1000)
|
||||
|""".stripMargin)
|
||||
}
|
||||
|
||||
checkAnswer(s"select id, name, price, ts from $tableName")(
|
||||
Seq(1, "a1", 10.0, 1000),
|
||||
Seq(2, "a2", 20.0, 1000),
|
||||
@@ -132,11 +142,20 @@ class TestDeleteTable extends HoodieSparkSqlTestBase {
|
||||
""".stripMargin)
|
||||
|
||||
// insert data to table
|
||||
spark.sql(
|
||||
s"""
|
||||
|insert into $ptTableName
|
||||
|values (1, 'a1', 10.0, 1000, "2021"), (2, 'a2', 20.0, 1000, "2021"), (3, 'a2', 30.0, 1000, "2022")
|
||||
""".stripMargin)
|
||||
if (isSpark2) {
|
||||
spark.sql(
|
||||
s"""
|
||||
|insert into $ptTableName
|
||||
|values (1, 'a1', cast(10.0 as double), 1000, "2021"), (2, 'a2', cast(20.0 as double), 1000, "2021"), (3, 'a2', cast(30.0 as double), 1000, "2022")
|
||||
|""".stripMargin)
|
||||
} else {
|
||||
spark.sql(
|
||||
s"""
|
||||
|insert into $ptTableName
|
||||
|values (1, 'a1', 10.0, 1000, "2021"), (2, 'a2', 20.0, 1000, "2021"), (3, 'a2', 30.0, 1000, "2022")
|
||||
|""".stripMargin)
|
||||
}
|
||||
|
||||
checkAnswer(s"select id, name, price, ts, pt from $ptTableName")(
|
||||
Seq(1, "a1", 10.0, 1000, "2021"),
|
||||
Seq(2, "a2", 20.0, 1000, "2021"),
|
||||
|
||||
@@ -24,6 +24,7 @@ import org.apache.hudi.config.HoodieWriteConfig
|
||||
import org.apache.hudi.exception.HoodieDuplicateKeyException
|
||||
import org.apache.hudi.keygen.ComplexKeyGenerator
|
||||
import org.apache.spark.sql.SaveMode
|
||||
import org.apache.spark.sql.internal.SQLConf
|
||||
|
||||
import java.io.File
|
||||
|
||||
@@ -396,8 +397,8 @@ class TestInsertTable extends HoodieSparkSqlTestBase {
|
||||
("string", "'1000'"),
|
||||
("int", 1000),
|
||||
("bigint", 10000),
|
||||
("timestamp", "'2021-05-20 00:00:00'"),
|
||||
("date", "'2021-05-20'")
|
||||
("timestamp", "TIMESTAMP'2021-05-20 00:00:00'"),
|
||||
("date", "DATE'2021-05-20'")
|
||||
)
|
||||
typeAndValue.foreach { case (partitionType, partitionValue) =>
|
||||
val tableName = generateTableName
|
||||
@@ -409,8 +410,8 @@ class TestInsertTable extends HoodieSparkSqlTestBase {
|
||||
test("Test TimestampType Partition Column With Consistent Logical Timestamp Enabled") {
|
||||
withTempDir { tmp =>
|
||||
val typeAndValue = Seq(
|
||||
("timestamp", "'2021-05-20 00:00:00'"),
|
||||
("date", "'2021-05-20'")
|
||||
("timestamp", "TIMESTAMP'2021-05-20 00:00:00'"),
|
||||
("date", "DATE'2021-05-20'")
|
||||
)
|
||||
typeAndValue.foreach { case (partitionType, partitionValue) =>
|
||||
val tableName = generateTableName
|
||||
@@ -433,11 +434,12 @@ class TestInsertTable extends HoodieSparkSqlTestBase {
|
||||
| partitioned by (dt)
|
||||
| location '${tmp.getCanonicalPath}/$tableName'
|
||||
""".stripMargin)
|
||||
spark.sql(s"insert into $tableName partition(dt = $partitionValue) select 1, 'a1', 10")
|
||||
// NOTE: We have to drop type-literal prefix since Spark doesn't parse type literals appropriately
|
||||
spark.sql(s"insert into $tableName partition(dt = ${dropTypeLiteralPrefix(partitionValue)}) select 1, 'a1', 10")
|
||||
spark.sql(s"insert into $tableName select 2, 'a2', 10, $partitionValue")
|
||||
checkAnswer(s"select id, name, price, cast(dt as string) from $tableName order by id")(
|
||||
Seq(1, "a1", 10, removeQuotes(partitionValue).toString),
|
||||
Seq(2, "a2", 10, removeQuotes(partitionValue).toString)
|
||||
Seq(1, "a1", 10, extractRawValue(partitionValue).toString),
|
||||
Seq(2, "a2", 10, extractRawValue(partitionValue).toString)
|
||||
)
|
||||
}
|
||||
|
||||
@@ -481,14 +483,17 @@ class TestInsertTable extends HoodieSparkSqlTestBase {
|
||||
| tblproperties (primaryKey = 'id')
|
||||
| partitioned by (dt)
|
||||
""".stripMargin)
|
||||
checkException(s"insert into $tableName partition(dt = '2021-06-20')" +
|
||||
s" select 1, 'a1', 10, '2021-06-20'") (
|
||||
"assertion failed: Required select columns count: 4, Current select columns(including static partition column)" +
|
||||
" count: 5,columns: (1,a1,10,2021-06-20,dt)"
|
||||
checkException(s"insert into $tableName partition(dt = '2021-06-20') select 1, 'a1', 10, '2021-06-20'") (
|
||||
"Expected table's schema: " +
|
||||
"[StructField(id,IntegerType,true), StructField(name,StringType,true), StructField(price,DoubleType,true), StructField(dt,StringType,true)], " +
|
||||
"query's output (including static partition values): " +
|
||||
"[StructField(1,IntegerType,false), StructField(a1,StringType,false), StructField(10,IntegerType,false), StructField(2021-06-20,StringType,false), StructField(dt,StringType,true)]"
|
||||
)
|
||||
checkException(s"insert into $tableName select 1, 'a1', 10")(
|
||||
"assertion failed: Required select columns count: 4, Current select columns(including static partition column)" +
|
||||
" count: 3,columns: (1,a1,10)"
|
||||
"Expected table's schema: " +
|
||||
"[StructField(id,IntegerType,true), StructField(name,StringType,true), StructField(price,DoubleType,true), StructField(dt,StringType,true)], " +
|
||||
"query's output (including static partition values): " +
|
||||
"[StructField(1,IntegerType,false), StructField(a1,StringType,false), StructField(10,IntegerType,false)]"
|
||||
)
|
||||
spark.sql("set hoodie.sql.bulk.insert.enable = true")
|
||||
spark.sql("set hoodie.sql.insert.mode = strict")
|
||||
|
||||
@@ -908,7 +908,7 @@ class TestMergeIntoTable extends HoodieSparkSqlTestBase {
|
||||
| when not matched then insert *
|
||||
|""".stripMargin)
|
||||
checkAnswer(s"select id, name, cast(value as string), ts from $tableName")(
|
||||
Seq(1, "a1", removeQuotes(dataValue), 1000)
|
||||
Seq(1, "a1", extractRawValue(dataValue), 1000)
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
|
||||
package org.apache.spark.sql.hudi
|
||||
|
||||
import org.apache.hudi.HoodieSparkUtils.isSpark2
|
||||
import org.apache.hudi.common.util.PartitionPathEncodeUtils.DEFAULT_PARTITION_PATH
|
||||
|
||||
class TestShowPartitions extends HoodieSparkSqlTestBase {
|
||||
@@ -84,11 +85,22 @@ class TestShowPartitions extends HoodieSparkSqlTestBase {
|
||||
checkAnswer(s"show partitions $tableName partition(dt='2021-01-02')")(Seq("dt=2021-01-02"))
|
||||
|
||||
// Insert into null partition
|
||||
spark.sql(
|
||||
s"""
|
||||
| insert into $tableName
|
||||
| select 3 as id, 'a3' as name, 10 as price, 1000 as ts, null as dt
|
||||
if (isSpark2) {
|
||||
// Spark 2 isn't able to convert NullType to any other type w/ appropriate nullability, so
|
||||
// explicit cast is required
|
||||
spark.sql(
|
||||
s"""
|
||||
| insert into $tableName
|
||||
| select 3 as id, 'a3' as name, 10 as price, 1000 as ts, cast(null as string) as dt
|
||||
""".stripMargin)
|
||||
} else {
|
||||
spark.sql(
|
||||
s"""
|
||||
| insert into $tableName
|
||||
| select 3 as id, 'a3' as name, 10 as price, 1000 as ts, null as dt
|
||||
""".stripMargin)
|
||||
}
|
||||
|
||||
checkAnswer(s"show partitions $tableName")(
|
||||
Seq("dt=2021-01-01"), Seq("dt=2021-01-02"), Seq("dt=%s".format(DEFAULT_PARTITION_PATH))
|
||||
)
|
||||
|
||||
@@ -55,11 +55,11 @@ class TestSpark3DDL extends HoodieSparkSqlTestBase {
|
||||
spark.sql(
|
||||
s"""
|
||||
| insert into $tableName values
|
||||
| (1,1,11,100001,101.01,1001.0001,100001.0001,'a000001','2021-12-25','2021-12-25 12:01:01',true,'a01','2021-12-25'),
|
||||
| (2,2,12,100002,102.02,1002.0002,100002.0002,'a000002','2021-12-25','2021-12-25 12:02:02',true,'a02','2021-12-25'),
|
||||
| (3,3,13,100003,103.03,1003.0003,100003.0003,'a000003','2021-12-25','2021-12-25 12:03:03',false,'a03','2021-12-25'),
|
||||
| (4,4,14,100004,104.04,1004.0004,100004.0004,'a000004','2021-12-26','2021-12-26 12:04:04',true,'a04','2021-12-26'),
|
||||
| (5,5,15,100005,105.05,1005.0005,100005.0005,'a000005','2021-12-26','2021-12-26 12:05:05',false,'a05','2021-12-26')
|
||||
| (1,1,11,100001,101.01,1001.0001,100001.0001,'a000001',DATE'2021-12-25',TIMESTAMP'2021-12-25 12:01:01',true,X'a01',TIMESTAMP'2021-12-25'),
|
||||
| (2,2,12,100002,102.02,1002.0002,100002.0002,'a000002',DATE'2021-12-25',TIMESTAMP'2021-12-25 12:02:02',true,X'a02',TIMESTAMP'2021-12-25'),
|
||||
| (3,3,13,100003,103.03,1003.0003,100003.0003,'a000003',DATE'2021-12-25',TIMESTAMP'2021-12-25 12:03:03',false,X'a03',TIMESTAMP'2021-12-25'),
|
||||
| (4,4,14,100004,104.04,1004.0004,100004.0004,'a000004',DATE'2021-12-26',TIMESTAMP'2021-12-26 12:04:04',true,X'a04',TIMESTAMP'2021-12-26'),
|
||||
| (5,5,15,100005,105.05,1005.0005,100005.0005,'a000005',DATE'2021-12-26',TIMESTAMP'2021-12-26 12:05:05',false,X'a05',TIMESTAMP'2021-12-26')
|
||||
|""".stripMargin)
|
||||
}
|
||||
|
||||
@@ -70,6 +70,9 @@ class TestSpark3DDL extends HoodieSparkSqlTestBase {
|
||||
val tablePath = s"${new Path(tmp.getCanonicalPath, tableName).toUri.toString}"
|
||||
if (HoodieSparkUtils.gteqSpark3_1) {
|
||||
spark.sql("set hoodie.schema.on.read.enable=true")
|
||||
// NOTE: This is required since these tests use type coercions which were only permitted in Spark 2.x
|
||||
// and are disallowed now by default in Spark 3.x
|
||||
spark.sql("set spark.sql.storeAssignmentPolicy=legacy")
|
||||
createAndPreparePartitionTable(spark, tableName, tablePath, tableType)
|
||||
// date -> string -> date
|
||||
spark.sql(s"alter table $tableName alter column col6 type String")
|
||||
@@ -138,6 +141,9 @@ class TestSpark3DDL extends HoodieSparkSqlTestBase {
|
||||
val tablePath = s"${new Path(tmp.getCanonicalPath, tableName).toUri.toString}"
|
||||
if (HoodieSparkUtils.gteqSpark3_1) {
|
||||
spark.sql("set hoodie.schema.on.read.enable=true")
|
||||
// NOTE: This is required since these tests use type coercions which were only permitted in Spark 2.x
|
||||
// and are disallowed now by default in Spark 3.x
|
||||
spark.sql("set spark.sql.storeAssignmentPolicy=legacy")
|
||||
createAndPreparePartitionTable(spark, tableName, tablePath, tableType)
|
||||
// float -> double -> decimal -> String
|
||||
spark.sql(s"alter table $tableName alter column col2 type double")
|
||||
@@ -172,6 +178,9 @@ class TestSpark3DDL extends HoodieSparkSqlTestBase {
|
||||
val tablePath = s"${new Path(tmp.getCanonicalPath, tableName).toUri.toString}"
|
||||
if (HoodieSparkUtils.gteqSpark3_1) {
|
||||
spark.sql("set hoodie.schema.on.read.enable=true")
|
||||
// NOTE: This is required since these tests use type coercions which were only permitted in Spark 2.x
|
||||
// and are disallowed now by default in Spark 3.x
|
||||
spark.sql("set spark.sql.storeAssignmentPolicy=legacy")
|
||||
createAndPreparePartitionTable(spark, tableName, tablePath, tableType)
|
||||
|
||||
// test set properties
|
||||
@@ -402,7 +411,7 @@ class TestSpark3DDL extends HoodieSparkSqlTestBase {
|
||||
|
||||
spark.sql(s"alter table $tableName alter column members.value.a first")
|
||||
|
||||
spark.sql(s"insert into ${tableName} values(1, 'jack', map('k1', struct('v1', 100), 'k2', struct('v2', 200)), struct('jackStruct', 29, 100), 1000)")
|
||||
spark.sql(s"insert into ${tableName} values(1, 'jack', map('k1', struct(100, 'v1'), 'k2', struct(200, 'v2')), struct('jackStruct', 29, 100), 1000)")
|
||||
|
||||
// rename column
|
||||
spark.sql(s"alter table ${tableName} rename column user to userx")
|
||||
@@ -424,7 +433,7 @@ class TestSpark3DDL extends HoodieSparkSqlTestBase {
|
||||
checkAnswer(spark.sql(s"select name, userx.name, userx.score from ${tableName}").collect())(Seq(null, null, null))
|
||||
|
||||
// insert again
|
||||
spark.sql(s"insert into ${tableName} values(2 , map('k1', struct('v1', 100), 'k2', struct('v2', 200)), struct('jackStructNew', 291 , 101), 'jacknew', 1000)")
|
||||
spark.sql(s"insert into ${tableName} values(2 , map('k1', struct(100, 'v1'), 'k2', struct(200, 'v2')), struct('jackStructNew', 291 , 101), 'jacknew', 1000)")
|
||||
|
||||
// check again
|
||||
checkAnswer(spark.sql(s"select name, userx.name as uxname, userx.score as uxs from ${tableName} order by id").collect())(
|
||||
@@ -440,9 +449,9 @@ class TestSpark3DDL extends HoodieSparkSqlTestBase {
|
||||
Seq(291, 2, "jacknew"))
|
||||
// test map value type change
|
||||
spark.sql(s"alter table ${tableName} add columns(mxp map<String, int>)")
|
||||
spark.sql(s"insert into ${tableName} values(2 , map('k1', struct('v1', 100), 'k2', struct('v2', 200)), struct('jackStructNew', 291 , 101), 'jacknew', 1000, map('t1', 9))")
|
||||
spark.sql(s"insert into ${tableName} values(2, map('k1', struct(100, 'v1'), 'k2', struct(200, 'v2')), struct('jackStructNew', 291 , 101), 'jacknew', 1000, map('t1', 9))")
|
||||
spark.sql(s"alter table ${tableName} alter column mxp.value type double")
|
||||
spark.sql(s"insert into ${tableName} values(2 , map('k1', struct('v1', 100), 'k2', struct('v2', 200)), struct('jackStructNew', 291 , 101), 'jacknew', 1000, map('t1', 10))")
|
||||
spark.sql(s"insert into ${tableName} values(2, map('k1', struct(100, 'v1'), 'k2', struct(200, 'v2')), struct('jackStructNew', 291 , 101), 'jacknew', 1000, map('t1', 10))")
|
||||
spark.sql(s"select * from $tableName").show(false)
|
||||
checkAnswer(spark.sql(s"select mxp from ${tableName} order by id").collect())(
|
||||
Seq(null),
|
||||
@@ -453,7 +462,7 @@ class TestSpark3DDL extends HoodieSparkSqlTestBase {
|
||||
spark.sql(s"alter table ${tableName} rename column userx to us")
|
||||
spark.sql(s"alter table ${tableName} rename column us.age to age1")
|
||||
|
||||
spark.sql(s"insert into ${tableName} values(2 , map('k1', struct('v1', 100), 'k2', struct('v2', 200)), struct('jackStructNew', 291 , 101), 'jacknew', 1000, map('t1', 10))")
|
||||
spark.sql(s"insert into ${tableName} values(2, map('k1', struct(100, 'v1'), 'k2', struct(200, 'v2')), struct('jackStructNew', 291 , 101), 'jacknew', 1000, map('t1', 10))")
|
||||
spark.sql(s"select mem.value.nn, us.age1 from $tableName order by id").show()
|
||||
checkAnswer(spark.sql(s"select mem.value.nn, us.age1 from $tableName order by id").collect())(
|
||||
Seq(null, 29),
|
||||
|
||||
@@ -17,6 +17,8 @@
|
||||
|
||||
package org.apache.spark.sql.hudi
|
||||
|
||||
import org.apache.hudi.HoodieSparkUtils.isSpark2
|
||||
|
||||
class TestUpdateTable extends HoodieSparkSqlTestBase {
|
||||
|
||||
test("Test Update Table") {
|
||||
@@ -84,7 +86,12 @@ class TestUpdateTable extends HoodieSparkSqlTestBase {
|
||||
""".stripMargin)
|
||||
|
||||
// insert data to table
|
||||
spark.sql(s"insert into $tableName values (1, 'a1', 10.0, 1000), (2, 'a2', 20.0, 1000)")
|
||||
if (isSpark2) {
|
||||
spark.sql(s"insert into $tableName values (1, 'a1', cast(10.0 as double), 1000), (2, 'a2', cast(20.0 as double), 1000)")
|
||||
} else {
|
||||
spark.sql(s"insert into $tableName values (1, 'a1', 10.0, 1000), (2, 'a2', 20.0, 1000)")
|
||||
}
|
||||
|
||||
checkAnswer(s"select id, name, price, ts from $tableName")(
|
||||
Seq(1, "a1", 10.0, 1000),
|
||||
Seq(2, "a2", 20.0, 1000)
|
||||
@@ -119,11 +126,20 @@ class TestUpdateTable extends HoodieSparkSqlTestBase {
|
||||
""".stripMargin)
|
||||
|
||||
// insert data to table
|
||||
spark.sql(
|
||||
s"""
|
||||
|insert into $ptTableName
|
||||
|values (1, 'a1', 10.0, 1000, "2021"), (2, 'a2', 20.0, 1000, "2021"), (3, 'a2', 30.0, 1000, "2022")
|
||||
""".stripMargin)
|
||||
if (isSpark2) {
|
||||
spark.sql(
|
||||
s"""
|
||||
|insert into $ptTableName
|
||||
|values (1, 'a1', cast(10.0 as double), 1000, "2021"), (2, 'a2', cast(20.0 as double), 1000, "2021"), (3, 'a2', cast(30.0 as double), 1000, "2022")
|
||||
|""".stripMargin)
|
||||
} else {
|
||||
spark.sql(
|
||||
s"""
|
||||
|insert into $ptTableName
|
||||
|values (1, 'a1', 10.0, 1000, "2021"), (2, 'a2', 20.0, 1000, "2021"), (3, 'a2', 30.0, 1000, "2022")
|
||||
|""".stripMargin)
|
||||
}
|
||||
|
||||
checkAnswer(s"select id, name, price, ts, pt from $ptTableName")(
|
||||
Seq(1, "a1", 10.0, 1000, "2021"),
|
||||
Seq(2, "a2", 20.0, 1000, "2021"),
|
||||
|
||||
Reference in New Issue
Block a user