1
0

[HUDI-4081][HUDI-4472] Addressing Spark SQL vs Spark DS performance gap (#6213)

This commit is contained in:
Alexey Kudinkin
2022-07-28 15:36:03 -07:00
committed by GitHub
parent 70b5cf6dab
commit cfd0c1ee34
14 changed files with 382 additions and 189 deletions

View File

@@ -145,11 +145,23 @@ class HoodieSparkSqlTestBase extends FunSuite with BeforeAndAfterAll {
assertResult(true)(hasException)
}
protected def removeQuotes(value: Any): Any = {
def dropTypeLiteralPrefix(value: Any): Any = {
value match {
case s: String => s.stripPrefix("'").stripSuffix("'")
case _=> value
case s: String =>
s.stripPrefix("DATE").stripPrefix("TIMESTAMP").stripPrefix("X")
case _ => value
}
}
protected def extractRawValue(value: Any): Any = {
value match {
case s: String =>
// We need to strip out data-type prefixes like "DATE", "TIMESTAMP"
dropTypeLiteralPrefix(s)
.asInstanceOf[String]
.stripPrefix("'")
.stripSuffix("'")
case _ => value
}
}

View File

@@ -18,6 +18,7 @@
package org.apache.spark.sql.hudi
import org.apache.hudi.DataSourceWriteOptions._
import org.apache.hudi.HoodieSparkUtils.isSpark2
import org.apache.hudi.config.HoodieWriteConfig
import org.apache.hudi.keygen.SimpleKeyGenerator
import org.apache.spark.sql.SaveMode
@@ -93,11 +94,20 @@ class TestDeleteTable extends HoodieSparkSqlTestBase {
""".stripMargin)
// insert data to table
spark.sql(
s"""
|insert into $tableName
|values (1, 'a1', 10.0, 1000), (2, 'a2', 20.0, 1000), (3, 'a2', 30.0, 1000)
""".stripMargin)
if (isSpark2) {
spark.sql(
s"""
|insert into $tableName
|values (1, 'a1', cast(10.0 as double), 1000), (2, 'a2', cast(20.0 as double), 1000), (3, 'a2', cast(30.0 as double), 1000)
|""".stripMargin)
} else {
spark.sql(
s"""
|insert into $tableName
|values (1, 'a1', 10.0, 1000), (2, 'a2', 20.0, 1000), (3, 'a2', 30.0, 1000)
|""".stripMargin)
}
checkAnswer(s"select id, name, price, ts from $tableName")(
Seq(1, "a1", 10.0, 1000),
Seq(2, "a2", 20.0, 1000),
@@ -132,11 +142,20 @@ class TestDeleteTable extends HoodieSparkSqlTestBase {
""".stripMargin)
// insert data to table
spark.sql(
s"""
|insert into $ptTableName
|values (1, 'a1', 10.0, 1000, "2021"), (2, 'a2', 20.0, 1000, "2021"), (3, 'a2', 30.0, 1000, "2022")
""".stripMargin)
if (isSpark2) {
spark.sql(
s"""
|insert into $ptTableName
|values (1, 'a1', cast(10.0 as double), 1000, "2021"), (2, 'a2', cast(20.0 as double), 1000, "2021"), (3, 'a2', cast(30.0 as double), 1000, "2022")
|""".stripMargin)
} else {
spark.sql(
s"""
|insert into $ptTableName
|values (1, 'a1', 10.0, 1000, "2021"), (2, 'a2', 20.0, 1000, "2021"), (3, 'a2', 30.0, 1000, "2022")
|""".stripMargin)
}
checkAnswer(s"select id, name, price, ts, pt from $ptTableName")(
Seq(1, "a1", 10.0, 1000, "2021"),
Seq(2, "a2", 20.0, 1000, "2021"),

View File

@@ -24,6 +24,7 @@ import org.apache.hudi.config.HoodieWriteConfig
import org.apache.hudi.exception.HoodieDuplicateKeyException
import org.apache.hudi.keygen.ComplexKeyGenerator
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.internal.SQLConf
import java.io.File
@@ -396,8 +397,8 @@ class TestInsertTable extends HoodieSparkSqlTestBase {
("string", "'1000'"),
("int", 1000),
("bigint", 10000),
("timestamp", "'2021-05-20 00:00:00'"),
("date", "'2021-05-20'")
("timestamp", "TIMESTAMP'2021-05-20 00:00:00'"),
("date", "DATE'2021-05-20'")
)
typeAndValue.foreach { case (partitionType, partitionValue) =>
val tableName = generateTableName
@@ -409,8 +410,8 @@ class TestInsertTable extends HoodieSparkSqlTestBase {
test("Test TimestampType Partition Column With Consistent Logical Timestamp Enabled") {
withTempDir { tmp =>
val typeAndValue = Seq(
("timestamp", "'2021-05-20 00:00:00'"),
("date", "'2021-05-20'")
("timestamp", "TIMESTAMP'2021-05-20 00:00:00'"),
("date", "DATE'2021-05-20'")
)
typeAndValue.foreach { case (partitionType, partitionValue) =>
val tableName = generateTableName
@@ -433,11 +434,12 @@ class TestInsertTable extends HoodieSparkSqlTestBase {
| partitioned by (dt)
| location '${tmp.getCanonicalPath}/$tableName'
""".stripMargin)
spark.sql(s"insert into $tableName partition(dt = $partitionValue) select 1, 'a1', 10")
// NOTE: We have to drop type-literal prefix since Spark doesn't parse type literals appropriately
spark.sql(s"insert into $tableName partition(dt = ${dropTypeLiteralPrefix(partitionValue)}) select 1, 'a1', 10")
spark.sql(s"insert into $tableName select 2, 'a2', 10, $partitionValue")
checkAnswer(s"select id, name, price, cast(dt as string) from $tableName order by id")(
Seq(1, "a1", 10, removeQuotes(partitionValue).toString),
Seq(2, "a2", 10, removeQuotes(partitionValue).toString)
Seq(1, "a1", 10, extractRawValue(partitionValue).toString),
Seq(2, "a2", 10, extractRawValue(partitionValue).toString)
)
}
@@ -481,14 +483,17 @@ class TestInsertTable extends HoodieSparkSqlTestBase {
| tblproperties (primaryKey = 'id')
| partitioned by (dt)
""".stripMargin)
checkException(s"insert into $tableName partition(dt = '2021-06-20')" +
s" select 1, 'a1', 10, '2021-06-20'") (
"assertion failed: Required select columns count: 4, Current select columns(including static partition column)" +
" count: 5columns: (1,a1,10,2021-06-20,dt)"
checkException(s"insert into $tableName partition(dt = '2021-06-20') select 1, 'a1', 10, '2021-06-20'") (
"Expected table's schema: " +
"[StructField(id,IntegerType,true), StructField(name,StringType,true), StructField(price,DoubleType,true), StructField(dt,StringType,true)], " +
"query's output (including static partition values): " +
"[StructField(1,IntegerType,false), StructField(a1,StringType,false), StructField(10,IntegerType,false), StructField(2021-06-20,StringType,false), StructField(dt,StringType,true)]"
)
checkException(s"insert into $tableName select 1, 'a1', 10")(
"assertion failed: Required select columns count: 4, Current select columns(including static partition column)" +
" count: 3columns: (1,a1,10)"
"Expected table's schema: " +
"[StructField(id,IntegerType,true), StructField(name,StringType,true), StructField(price,DoubleType,true), StructField(dt,StringType,true)], " +
"query's output (including static partition values): " +
"[StructField(1,IntegerType,false), StructField(a1,StringType,false), StructField(10,IntegerType,false)]"
)
spark.sql("set hoodie.sql.bulk.insert.enable = true")
spark.sql("set hoodie.sql.insert.mode = strict")

View File

@@ -908,7 +908,7 @@ class TestMergeIntoTable extends HoodieSparkSqlTestBase {
| when not matched then insert *
|""".stripMargin)
checkAnswer(s"select id, name, cast(value as string), ts from $tableName")(
Seq(1, "a1", removeQuotes(dataValue), 1000)
Seq(1, "a1", extractRawValue(dataValue), 1000)
)
}
}

View File

@@ -17,6 +17,7 @@
package org.apache.spark.sql.hudi
import org.apache.hudi.HoodieSparkUtils.isSpark2
import org.apache.hudi.common.util.PartitionPathEncodeUtils.DEFAULT_PARTITION_PATH
class TestShowPartitions extends HoodieSparkSqlTestBase {
@@ -84,11 +85,22 @@ class TestShowPartitions extends HoodieSparkSqlTestBase {
checkAnswer(s"show partitions $tableName partition(dt='2021-01-02')")(Seq("dt=2021-01-02"))
// Insert into null partition
spark.sql(
s"""
| insert into $tableName
| select 3 as id, 'a3' as name, 10 as price, 1000 as ts, null as dt
if (isSpark2) {
// Spark 2 isn't able to convert NullType to any other type w/ appropriate nullability, so
// explicit cast is required
spark.sql(
s"""
| insert into $tableName
| select 3 as id, 'a3' as name, 10 as price, 1000 as ts, cast(null as string) as dt
""".stripMargin)
} else {
spark.sql(
s"""
| insert into $tableName
| select 3 as id, 'a3' as name, 10 as price, 1000 as ts, null as dt
""".stripMargin)
}
checkAnswer(s"show partitions $tableName")(
Seq("dt=2021-01-01"), Seq("dt=2021-01-02"), Seq("dt=%s".format(DEFAULT_PARTITION_PATH))
)

View File

@@ -55,11 +55,11 @@ class TestSpark3DDL extends HoodieSparkSqlTestBase {
spark.sql(
s"""
| insert into $tableName values
| (1,1,11,100001,101.01,1001.0001,100001.0001,'a000001','2021-12-25','2021-12-25 12:01:01',true,'a01','2021-12-25'),
| (2,2,12,100002,102.02,1002.0002,100002.0002,'a000002','2021-12-25','2021-12-25 12:02:02',true,'a02','2021-12-25'),
| (3,3,13,100003,103.03,1003.0003,100003.0003,'a000003','2021-12-25','2021-12-25 12:03:03',false,'a03','2021-12-25'),
| (4,4,14,100004,104.04,1004.0004,100004.0004,'a000004','2021-12-26','2021-12-26 12:04:04',true,'a04','2021-12-26'),
| (5,5,15,100005,105.05,1005.0005,100005.0005,'a000005','2021-12-26','2021-12-26 12:05:05',false,'a05','2021-12-26')
| (1,1,11,100001,101.01,1001.0001,100001.0001,'a000001',DATE'2021-12-25',TIMESTAMP'2021-12-25 12:01:01',true,X'a01',TIMESTAMP'2021-12-25'),
| (2,2,12,100002,102.02,1002.0002,100002.0002,'a000002',DATE'2021-12-25',TIMESTAMP'2021-12-25 12:02:02',true,X'a02',TIMESTAMP'2021-12-25'),
| (3,3,13,100003,103.03,1003.0003,100003.0003,'a000003',DATE'2021-12-25',TIMESTAMP'2021-12-25 12:03:03',false,X'a03',TIMESTAMP'2021-12-25'),
| (4,4,14,100004,104.04,1004.0004,100004.0004,'a000004',DATE'2021-12-26',TIMESTAMP'2021-12-26 12:04:04',true,X'a04',TIMESTAMP'2021-12-26'),
| (5,5,15,100005,105.05,1005.0005,100005.0005,'a000005',DATE'2021-12-26',TIMESTAMP'2021-12-26 12:05:05',false,X'a05',TIMESTAMP'2021-12-26')
|""".stripMargin)
}
@@ -70,6 +70,9 @@ class TestSpark3DDL extends HoodieSparkSqlTestBase {
val tablePath = s"${new Path(tmp.getCanonicalPath, tableName).toUri.toString}"
if (HoodieSparkUtils.gteqSpark3_1) {
spark.sql("set hoodie.schema.on.read.enable=true")
// NOTE: This is required since these tests use type coercions which were only permitted in Spark 2.x
// and are now disallowed by default in Spark 3.x
spark.sql("set spark.sql.storeAssignmentPolicy=legacy")
createAndPreparePartitionTable(spark, tableName, tablePath, tableType)
// date -> string -> date
spark.sql(s"alter table $tableName alter column col6 type String")
@@ -138,6 +141,9 @@ class TestSpark3DDL extends HoodieSparkSqlTestBase {
val tablePath = s"${new Path(tmp.getCanonicalPath, tableName).toUri.toString}"
if (HoodieSparkUtils.gteqSpark3_1) {
spark.sql("set hoodie.schema.on.read.enable=true")
// NOTE: This is required since these tests use type coercions which were only permitted in Spark 2.x
// and are now disallowed by default in Spark 3.x
spark.sql("set spark.sql.storeAssignmentPolicy=legacy")
createAndPreparePartitionTable(spark, tableName, tablePath, tableType)
// float -> double -> decimal -> String
spark.sql(s"alter table $tableName alter column col2 type double")
@@ -172,6 +178,9 @@ class TestSpark3DDL extends HoodieSparkSqlTestBase {
val tablePath = s"${new Path(tmp.getCanonicalPath, tableName).toUri.toString}"
if (HoodieSparkUtils.gteqSpark3_1) {
spark.sql("set hoodie.schema.on.read.enable=true")
// NOTE: This is required since these tests use type coercions which were only permitted in Spark 2.x
// and are now disallowed by default in Spark 3.x
spark.sql("set spark.sql.storeAssignmentPolicy=legacy")
createAndPreparePartitionTable(spark, tableName, tablePath, tableType)
// test set properties
@@ -402,7 +411,7 @@ class TestSpark3DDL extends HoodieSparkSqlTestBase {
spark.sql(s"alter table $tableName alter column members.value.a first")
spark.sql(s"insert into ${tableName} values(1, 'jack', map('k1', struct('v1', 100), 'k2', struct('v2', 200)), struct('jackStruct', 29, 100), 1000)")
spark.sql(s"insert into ${tableName} values(1, 'jack', map('k1', struct(100, 'v1'), 'k2', struct(200, 'v2')), struct('jackStruct', 29, 100), 1000)")
// rename column
spark.sql(s"alter table ${tableName} rename column user to userx")
@@ -424,7 +433,7 @@ class TestSpark3DDL extends HoodieSparkSqlTestBase {
checkAnswer(spark.sql(s"select name, userx.name, userx.score from ${tableName}").collect())(Seq(null, null, null))
// insert again
spark.sql(s"insert into ${tableName} values(2 , map('k1', struct('v1', 100), 'k2', struct('v2', 200)), struct('jackStructNew', 291 , 101), 'jacknew', 1000)")
spark.sql(s"insert into ${tableName} values(2 , map('k1', struct(100, 'v1'), 'k2', struct(200, 'v2')), struct('jackStructNew', 291 , 101), 'jacknew', 1000)")
// check again
checkAnswer(spark.sql(s"select name, userx.name as uxname, userx.score as uxs from ${tableName} order by id").collect())(
@@ -440,9 +449,9 @@ class TestSpark3DDL extends HoodieSparkSqlTestBase {
Seq(291, 2, "jacknew"))
// test map value type change
spark.sql(s"alter table ${tableName} add columns(mxp map<String, int>)")
spark.sql(s"insert into ${tableName} values(2 , map('k1', struct('v1', 100), 'k2', struct('v2', 200)), struct('jackStructNew', 291 , 101), 'jacknew', 1000, map('t1', 9))")
spark.sql(s"insert into ${tableName} values(2, map('k1', struct(100, 'v1'), 'k2', struct(200, 'v2')), struct('jackStructNew', 291 , 101), 'jacknew', 1000, map('t1', 9))")
spark.sql(s"alter table ${tableName} alter column mxp.value type double")
spark.sql(s"insert into ${tableName} values(2 , map('k1', struct('v1', 100), 'k2', struct('v2', 200)), struct('jackStructNew', 291 , 101), 'jacknew', 1000, map('t1', 10))")
spark.sql(s"insert into ${tableName} values(2, map('k1', struct(100, 'v1'), 'k2', struct(200, 'v2')), struct('jackStructNew', 291 , 101), 'jacknew', 1000, map('t1', 10))")
spark.sql(s"select * from $tableName").show(false)
checkAnswer(spark.sql(s"select mxp from ${tableName} order by id").collect())(
Seq(null),
@@ -453,7 +462,7 @@ class TestSpark3DDL extends HoodieSparkSqlTestBase {
spark.sql(s"alter table ${tableName} rename column userx to us")
spark.sql(s"alter table ${tableName} rename column us.age to age1")
spark.sql(s"insert into ${tableName} values(2 , map('k1', struct('v1', 100), 'k2', struct('v2', 200)), struct('jackStructNew', 291 , 101), 'jacknew', 1000, map('t1', 10))")
spark.sql(s"insert into ${tableName} values(2, map('k1', struct(100, 'v1'), 'k2', struct(200, 'v2')), struct('jackStructNew', 291 , 101), 'jacknew', 1000, map('t1', 10))")
spark.sql(s"select mem.value.nn, us.age1 from $tableName order by id").show()
checkAnswer(spark.sql(s"select mem.value.nn, us.age1 from $tableName order by id").collect())(
Seq(null, 29),

View File

@@ -17,6 +17,8 @@
package org.apache.spark.sql.hudi
import org.apache.hudi.HoodieSparkUtils.isSpark2
class TestUpdateTable extends HoodieSparkSqlTestBase {
test("Test Update Table") {
@@ -84,7 +86,12 @@ class TestUpdateTable extends HoodieSparkSqlTestBase {
""".stripMargin)
// insert data to table
spark.sql(s"insert into $tableName values (1, 'a1', 10.0, 1000), (2, 'a2', 20.0, 1000)")
if (isSpark2) {
spark.sql(s"insert into $tableName values (1, 'a1', cast(10.0 as double), 1000), (2, 'a2', cast(20.0 as double), 1000)")
} else {
spark.sql(s"insert into $tableName values (1, 'a1', 10.0, 1000), (2, 'a2', 20.0, 1000)")
}
checkAnswer(s"select id, name, price, ts from $tableName")(
Seq(1, "a1", 10.0, 1000),
Seq(2, "a2", 20.0, 1000)
@@ -119,11 +126,20 @@ class TestUpdateTable extends HoodieSparkSqlTestBase {
""".stripMargin)
// insert data to table
spark.sql(
s"""
|insert into $ptTableName
|values (1, 'a1', 10.0, 1000, "2021"), (2, 'a2', 20.0, 1000, "2021"), (3, 'a2', 30.0, 1000, "2022")
""".stripMargin)
if (isSpark2) {
spark.sql(
s"""
|insert into $ptTableName
|values (1, 'a1', cast(10.0 as double), 1000, "2021"), (2, 'a2', cast(20.0 as double), 1000, "2021"), (3, 'a2', cast(30.0 as double), 1000, "2022")
|""".stripMargin)
} else {
spark.sql(
s"""
|insert into $ptTableName
|values (1, 'a1', 10.0, 1000, "2021"), (2, 'a2', 20.0, 1000, "2021"), (3, 'a2', 30.0, 1000, "2022")
|""".stripMargin)
}
checkAnswer(s"select id, name, price, ts, pt from $ptTableName")(
Seq(1, "a1", 10.0, 1000, "2021"),
Seq(2, "a2", 20.0, 1000, "2021"),