1
0

[HUDI-4303] Use Hive sentinel value as partition default to avoid type cast issues (#5954)

This commit is contained in:
Sagar Sumit
2022-07-23 05:44:36 +05:30
committed by GitHub
parent 39f2a06c85
commit a36762a862
10 changed files with 65 additions and 52 deletions

View File

@@ -24,6 +24,7 @@ import org.apache.hudi.common.config.TypedProperties
import org.apache.hudi.common.model._
import org.apache.hudi.common.testutils.SchemaTestUtil
import org.apache.hudi.common.util.Option
import org.apache.hudi.common.util.PartitionPathEncodeUtils.DEFAULT_PARTITION_PATH
import org.apache.hudi.config.HoodiePayloadConfig
import org.apache.hudi.exception.{HoodieException, HoodieKeyException}
import org.apache.hudi.keygen._
@@ -146,17 +147,17 @@ class TestDataSourceDefaults extends ScalaAssertionSupport {
baseRow = KeyGeneratorTestUtilities.getRow(baseRecord, schema, structType)
internalRow = KeyGeneratorTestUtilities.getInternalRow(baseRow)
assertEquals("default", keyGen.getKey(baseRecord).getPartitionPath)
assertEquals("default", keyGen.getPartitionPath(baseRow))
assertEquals(UTF8String.fromString("default"), keyGen.getPartitionPath(internalRow, structType))
assertEquals(DEFAULT_PARTITION_PATH, keyGen.getKey(baseRecord).getPartitionPath)
assertEquals(DEFAULT_PARTITION_PATH, keyGen.getPartitionPath(baseRow))
assertEquals(UTF8String.fromString(DEFAULT_PARTITION_PATH), keyGen.getPartitionPath(internalRow, structType))
baseRecord.put("name", null)
baseRow = KeyGeneratorTestUtilities.getRow(baseRecord, schema, structType)
internalRow = KeyGeneratorTestUtilities.getInternalRow(baseRow)
assertEquals("default", keyGen.getKey(baseRecord).getPartitionPath)
assertEquals("default", keyGen.getPartitionPath(baseRow))
assertEquals(UTF8String.fromString("default"), keyGen.getPartitionPath(internalRow, structType))
assertEquals(DEFAULT_PARTITION_PATH, keyGen.getKey(baseRecord).getPartitionPath)
assertEquals(DEFAULT_PARTITION_PATH, keyGen.getPartitionPath(baseRow))
assertEquals(UTF8String.fromString(DEFAULT_PARTITION_PATH), keyGen.getPartitionPath(internalRow, structType))
}
{
@@ -335,7 +336,7 @@ class TestDataSourceDefaults extends ScalaAssertionSupport {
baseRow = KeyGeneratorTestUtilities.getRow(baseRecord, schema, structType)
internalRow = KeyGeneratorTestUtilities.getInternalRow(baseRow)
val expectedKey = new HoodieKey("field1:field1,name:__empty__", "field1/default")
val expectedKey = new HoodieKey("field1:field1,name:__empty__", "field1/" + DEFAULT_PARTITION_PATH)
assertEquals(expectedKey, keyGen.getKey(baseRecord))
@@ -353,7 +354,7 @@ class TestDataSourceDefaults extends ScalaAssertionSupport {
baseRow = KeyGeneratorTestUtilities.getRow(baseRecord, schema, structType)
internalRow = KeyGeneratorTestUtilities.getInternalRow(baseRow)
val expectedKey = new HoodieKey("field1:field1,name:__null__", "field1/default")
val expectedKey = new HoodieKey("field1:field1,name:__null__", "field1/" + DEFAULT_PARTITION_PATH)
assertEquals(expectedKey, keyGen.getKey(baseRecord))

View File

@@ -26,6 +26,7 @@ import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient, T
import org.apache.hudi.common.testutils.HoodieTestDataGenerator
import org.apache.hudi.common.testutils.RawTripTestPayload.{deleteRecordsToStrings, recordsToStrings}
import org.apache.hudi.common.util
import org.apache.hudi.common.util.PartitionPathEncodeUtils.DEFAULT_PARTITION_PATH
import org.apache.hudi.config.HoodieWriteConfig
import org.apache.hudi.exception.{HoodieException, HoodieUpsertException}
import org.apache.hudi.keygen._
@@ -41,7 +42,7 @@ import org.joda.time.DateTime
import org.joda.time.format.DateTimeFormat
import org.junit.jupiter.api.Assertions.{assertEquals, assertThrows, assertTrue, fail}
import org.junit.jupiter.api.function.Executable
import org.junit.jupiter.api.{AfterEach, BeforeEach, Disabled, Test}
import org.junit.jupiter.api.{AfterEach, BeforeEach, Test}
import org.junit.jupiter.params.ParameterizedTest
import org.junit.jupiter.params.provider.{CsvSource, ValueSource}
@@ -614,13 +615,14 @@ class TestCOWDataSource extends HoodieClientTestBase {
.load(basePath)
assertTrue(recordsReadDF.filter(col("_hoodie_partition_path") =!= col("driver")).count() == 0)
// Use the `driver,rider` field as the partition key, If no such field exists, the default value `default` is used
// Use the `driver,rider` field as the partition key. If no such field exists,
// the default value [[PartitionPathEncodeUtils#DEFAULT_PARTITION_PATH]] is used
writer = getDataFrameWriter(classOf[SimpleKeyGenerator].getName)
writer.partitionBy("driver", "rider")
.save(basePath)
recordsReadDF = spark.read.format("org.apache.hudi")
.load(basePath)
assertTrue(recordsReadDF.filter(col("_hoodie_partition_path") =!= lit("default")).count() == 0)
assertTrue(recordsReadDF.filter(col("_hoodie_partition_path") =!= lit(DEFAULT_PARTITION_PATH)).count() == 0)
}
@Test def testSparkPartitionByWithComplexKeyGenerator() {

View File

@@ -17,7 +17,7 @@
package org.apache.spark.sql.hudi
import org.apache.spark.sql.Row
import org.apache.hudi.common.util.PartitionPathEncodeUtils.DEFAULT_PARTITION_PATH
class TestShowPartitions extends HoodieSparkSqlTestBase {
@@ -90,7 +90,7 @@ class TestShowPartitions extends HoodieSparkSqlTestBase {
| select 3 as id, 'a3' as name, 10 as price, 1000 as ts, null as dt
""".stripMargin)
checkAnswer(s"show partitions $tableName")(
Seq("dt=2021-01-01"), Seq("dt=2021-01-02"), Seq("dt=default")
Seq("dt=2021-01-01"), Seq("dt=2021-01-02"), Seq("dt=%s".format(DEFAULT_PARTITION_PATH))
)
}
@@ -138,12 +138,12 @@ class TestShowPartitions extends HoodieSparkSqlTestBase {
Seq("year=2021/month=01/day=01"),
Seq("year=2021/month=01/day=02"),
Seq("year=2021/month=02/day=01"),
Seq("year=2021/month=02/day=default"),
Seq("year=2021/month=default/day=01"),
Seq("year=default/month=01/day=default"),
Seq("year=default/month=01/day=02"),
Seq("year=default/month=default/day=01"),
Seq("year=2022/month=default/day=default")
Seq("year=2021/month=02/day=%s".format(DEFAULT_PARTITION_PATH)),
Seq("year=2021/month=%s/day=01".format(DEFAULT_PARTITION_PATH)),
Seq("year=%s/month=01/day=%s".format(DEFAULT_PARTITION_PATH, DEFAULT_PARTITION_PATH)),
Seq("year=%s/month=01/day=02".format(DEFAULT_PARTITION_PATH)),
Seq("year=%s/month=%s/day=01".format(DEFAULT_PARTITION_PATH, DEFAULT_PARTITION_PATH)),
Seq("year=2022/month=%s/day=%s".format(DEFAULT_PARTITION_PATH, DEFAULT_PARTITION_PATH))
)
// check partial partitions
@@ -151,14 +151,14 @@ class TestShowPartitions extends HoodieSparkSqlTestBase {
Seq("year=2021/month=01/day=01")
)
checkAnswer(s"show partitions $tableName partition(year='2021', month='02')")(
Seq("year=2021/month=02/day=default"),
Seq("year=2021/month=02/day=%s".format(DEFAULT_PARTITION_PATH)),
Seq("year=2021/month=02/day=01")
)
checkAnswer(s"show partitions $tableName partition(day='01')")(
Seq("year=2021/month=02/day=01"),
Seq("year=2021/month=default/day=01"),
Seq("year=2021/month=%s/day=01".format(DEFAULT_PARTITION_PATH)),
Seq("year=2021/month=01/day=01"),
Seq("year=default/month=default/day=01")
Seq("year=%s/month=%s/day=01".format(DEFAULT_PARTITION_PATH, DEFAULT_PARTITION_PATH))
)
}
}