[HUDI-2909] Handle logical type in TimestampBasedKeyGenerator (#4203)
* [HUDI-2909] Handle logical type in TimestampBasedKeyGenerator. The timestamp-based key generator was returning different values for the row-writer and non-row-writer paths. This patch fixes that, and the fix is guarded by a config flag (`hoodie.datasource.write.keygenerator.consistent.logical.timestamp.enabled`).
This commit is contained in:
@@ -141,16 +141,16 @@ public class TestDataSourceUtils {
|
||||
record.put("event_cost3", genericFixed);
|
||||
|
||||
assertEquals(LocalDate.ofEpochDay(18000).toString(), HoodieAvroUtils.getNestedFieldValAsString(record, "event_date1",
|
||||
true));
|
||||
true, false));
|
||||
assertEquals(LocalDate.ofEpochDay(18001).toString(), HoodieAvroUtils.getNestedFieldValAsString(record, "event_date2",
|
||||
true));
|
||||
true, false));
|
||||
assertEquals(LocalDate.ofEpochDay(18002).toString(), HoodieAvroUtils.getNestedFieldValAsString(record, "event_date3",
|
||||
true));
|
||||
assertEquals("Hudi Meetup", HoodieAvroUtils.getNestedFieldValAsString(record, "event_name", true));
|
||||
assertEquals("Hudi PMC", HoodieAvroUtils.getNestedFieldValAsString(record, "event_organizer", true));
|
||||
assertEquals(bigDecimal.toString(), HoodieAvroUtils.getNestedFieldValAsString(record, "event_cost1", true));
|
||||
assertEquals(bigDecimal.toString(), HoodieAvroUtils.getNestedFieldValAsString(record, "event_cost2", true));
|
||||
assertEquals(bigDecimal.toString(), HoodieAvroUtils.getNestedFieldValAsString(record, "event_cost3", true));
|
||||
true, false));
|
||||
assertEquals("Hudi Meetup", HoodieAvroUtils.getNestedFieldValAsString(record, "event_name", true, false));
|
||||
assertEquals("Hudi PMC", HoodieAvroUtils.getNestedFieldValAsString(record, "event_organizer", true, false));
|
||||
assertEquals(bigDecimal.toString(), HoodieAvroUtils.getNestedFieldValAsString(record, "event_cost1", true, false));
|
||||
assertEquals(bigDecimal.toString(), HoodieAvroUtils.getNestedFieldValAsString(record, "event_cost2", true, false));
|
||||
assertEquals(bigDecimal.toString(), HoodieAvroUtils.getNestedFieldValAsString(record, "event_cost3", true, false));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
||||
@@ -246,8 +246,8 @@ class TestDataSourceDefaults {
|
||||
var converterFn: Function1[Any, Any] = _
|
||||
|
||||
override def getKey(record: GenericRecord): HoodieKey = {
|
||||
new HoodieKey(HoodieAvroUtils.getNestedFieldValAsString(record, recordKeyProp, true),
|
||||
HoodieAvroUtils.getNestedFieldValAsString(record, partitionPathProp, true))
|
||||
new HoodieKey(HoodieAvroUtils.getNestedFieldValAsString(record, recordKeyProp, true, false),
|
||||
HoodieAvroUtils.getNestedFieldValAsString(record, partitionPathProp, true, false))
|
||||
}
|
||||
|
||||
override def getRecordKey(row: Row): String = {
|
||||
@@ -579,12 +579,12 @@ class TestDataSourceDefaults {
|
||||
val props = new TypedProperties()
|
||||
props.put(HoodiePayloadProps.PAYLOAD_ORDERING_FIELD_PROP_KEY, "favoriteIntNumber");
|
||||
|
||||
val basePayload = new OverwriteWithLatestAvroPayload(baseRecord, HoodieAvroUtils.convertValueForSpecificDataTypes(fieldSchema, baseOrderingVal).asInstanceOf[Comparable[_]])
|
||||
val basePayload = new OverwriteWithLatestAvroPayload(baseRecord, HoodieAvroUtils.convertValueForSpecificDataTypes(fieldSchema, baseOrderingVal, false).asInstanceOf[Comparable[_]])
|
||||
|
||||
val laterRecord = SchemaTestUtil
|
||||
.generateAvroRecordFromJson(schema, 2, "001", "f1")
|
||||
val laterOrderingVal: Object = laterRecord.get("favoriteIntNumber")
|
||||
val newerPayload = new OverwriteWithLatestAvroPayload(laterRecord, HoodieAvroUtils.convertValueForSpecificDataTypes(fieldSchema, laterOrderingVal).asInstanceOf[Comparable[_]])
|
||||
val newerPayload = new OverwriteWithLatestAvroPayload(laterRecord, HoodieAvroUtils.convertValueForSpecificDataTypes(fieldSchema, laterOrderingVal, false).asInstanceOf[Comparable[_]])
|
||||
|
||||
// it will provide the record with greatest combine value
|
||||
val preCombinedPayload = basePayload.preCombine(newerPayload)
|
||||
@@ -606,10 +606,10 @@ class TestDataSourceDefaults {
|
||||
val earlierOrderingVal: Object = earlierRecord.get("favoriteIntNumber")
|
||||
|
||||
val laterPayload = new DefaultHoodieRecordPayload(laterRecord,
|
||||
HoodieAvroUtils.convertValueForSpecificDataTypes(fieldSchema, laterOrderingVal).asInstanceOf[Comparable[_]])
|
||||
HoodieAvroUtils.convertValueForSpecificDataTypes(fieldSchema, laterOrderingVal, false).asInstanceOf[Comparable[_]])
|
||||
|
||||
val earlierPayload = new DefaultHoodieRecordPayload(earlierRecord,
|
||||
HoodieAvroUtils.convertValueForSpecificDataTypes(fieldSchema, earlierOrderingVal).asInstanceOf[Comparable[_]])
|
||||
HoodieAvroUtils.convertValueForSpecificDataTypes(fieldSchema, earlierOrderingVal, false).asInstanceOf[Comparable[_]])
|
||||
|
||||
// it will provide the record with greatest combine value
|
||||
val preCombinedPayload = laterPayload.preCombine(earlierPayload)
|
||||
|
||||
@@ -0,0 +1,118 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hudi
|
||||
|
||||
import org.apache.hudi.config.HoodieWriteConfig
|
||||
import org.apache.hudi.testutils.HoodieClientTestBase
|
||||
import org.apache.spark.sql.{DataFrame, SparkSession}
|
||||
import org.junit.jupiter.api.{AfterEach, BeforeEach, Test}
|
||||
|
||||
import java.sql.{Date, Timestamp}
|
||||
|
||||
class TestGenericRecordAndRowConsistency extends HoodieClientTestBase {
|
||||
|
||||
var spark: SparkSession = _
|
||||
val commonOpts = Map(
|
||||
HoodieWriteConfig.TBL_NAME.key -> "hoodie_type_consistency_tbl",
|
||||
"hoodie.insert.shuffle.parallelism" -> "1",
|
||||
"hoodie.upsert.shuffle.parallelism" -> "1",
|
||||
DataSourceWriteOptions.TABLE_TYPE.key -> "COPY_ON_WRITE",
|
||||
DataSourceWriteOptions.RECORDKEY_FIELD.key -> "str,eventTime",
|
||||
DataSourceWriteOptions.PRECOMBINE_FIELD.key -> "typeId",
|
||||
DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "typeId",
|
||||
DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.key -> "org.apache.hudi.keygen.ComplexKeyGenerator",
|
||||
DataSourceWriteOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.key -> "true"
|
||||
)
|
||||
|
||||
/**
|
||||
* Setup method running before each test.
|
||||
*/
|
||||
@BeforeEach override def setUp(): Unit = {
|
||||
setTableName("hoodie_type_consistency_tbl")
|
||||
initPath()
|
||||
initSparkContexts()
|
||||
spark = sqlContext.sparkSession
|
||||
}
|
||||
|
||||
@AfterEach override def tearDown(): Unit = {
|
||||
cleanupSparkContexts()
|
||||
}
|
||||
|
||||
@Test
|
||||
def testTimestampTypeConsistency(): Unit = {
|
||||
val _spark = spark
|
||||
import _spark.implicits._
|
||||
|
||||
val df = Seq(
|
||||
(1, Timestamp.valueOf("2014-01-01 23:00:01"), "abc"),
|
||||
(1, Timestamp.valueOf("2014-11-30 12:40:32"), "abc"),
|
||||
(2, Timestamp.valueOf("2016-12-29 09:54:00"), "def"),
|
||||
(2, Timestamp.valueOf("2016-05-09 10:12:43"), "def")
|
||||
).toDF("typeId", "eventTime", "str")
|
||||
|
||||
testConsistencyBetweenGenericRecordAndRow(df)
|
||||
}
|
||||
|
||||
@Test
|
||||
def testDateTypeConsistency(): Unit = {
|
||||
val _spark = spark
|
||||
import _spark.implicits._
|
||||
|
||||
val df = Seq(
|
||||
(1, Date.valueOf("2014-01-01"), "abc"),
|
||||
(1, Date.valueOf("2014-11-30"), "abc"),
|
||||
(2, Date.valueOf("2016-12-29"), "def"),
|
||||
(2, Date.valueOf("2016-05-09"), "def")
|
||||
).toDF("typeId", "eventTime", "str")
|
||||
|
||||
testConsistencyBetweenGenericRecordAndRow(df)
|
||||
}
|
||||
|
||||
private def testConsistencyBetweenGenericRecordAndRow(df: DataFrame): Unit = {
|
||||
val _spark = spark
|
||||
import _spark.implicits._
|
||||
|
||||
// upsert operation generate recordKey by GenericRecord
|
||||
val tempRecordPath = basePath + "/record_tbl/"
|
||||
df.write.format("hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION.key, "upsert")
|
||||
.mode(org.apache.spark.sql.SaveMode.Overwrite)
|
||||
.save(tempRecordPath)
|
||||
|
||||
val data1 = spark.read.format("hudi")
|
||||
.load(tempRecordPath)
|
||||
.select("_hoodie_record_key")
|
||||
.map(_.toString()).collect().sorted
|
||||
|
||||
// bulk_insert operation generate recordKey by Row
|
||||
val tempRowPath = basePath + "/row_tbl/"
|
||||
df.write.format("hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION.key, "bulk_insert")
|
||||
.mode(org.apache.spark.sql.SaveMode.Overwrite)
|
||||
.save(tempRowPath)
|
||||
|
||||
val data2 = spark.read.format("hudi")
|
||||
.load(tempRowPath)
|
||||
.select("_hoodie_record_key")
|
||||
.map(_.toString()).collect().sorted
|
||||
|
||||
assert(data1 sameElements data2)
|
||||
}
|
||||
}
|
||||
@@ -25,9 +25,8 @@ import org.apache.hudi.common.config.HoodieConfig
|
||||
import org.apache.hudi.common.model._
|
||||
import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient, TableSchemaResolver}
|
||||
import org.apache.hudi.common.testutils.HoodieTestDataGenerator
|
||||
import org.apache.hudi.common.util.PartitionPathEncodeUtils
|
||||
import org.apache.hudi.config.{HoodieBootstrapConfig, HoodieWriteConfig}
|
||||
import org.apache.hudi.exception.{ExceptionUtil, HoodieException}
|
||||
import org.apache.hudi.exception.HoodieException
|
||||
import org.apache.hudi.execution.bulkinsert.BulkInsertSortMode
|
||||
import org.apache.hudi.functional.TestBootstrap
|
||||
import org.apache.hudi.hive.HiveSyncConfig
|
||||
@@ -35,13 +34,12 @@ import org.apache.hudi.keygen.{ComplexKeyGenerator, NonpartitionedKeyGenerator,
|
||||
import org.apache.hudi.testutils.DataSourceTestUtils
|
||||
import org.apache.spark.SparkContext
|
||||
import org.apache.spark.api.java.JavaSparkContext
|
||||
import org.apache.spark.sql._
|
||||
import org.apache.spark.sql.functions.{expr, lit}
|
||||
import org.apache.spark.sql.hudi.HoodieSparkSessionExtension
|
||||
import org.apache.spark.sql.hudi.command.SqlKeyGenerator
|
||||
import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf}
|
||||
import org.apache.spark.sql.{DataFrame, Dataset, Row, SQLContext, SaveMode, SparkSession}
|
||||
import org.junit.jupiter.api.Assertions.{assertEquals, assertFalse, assertTrue, fail}
|
||||
import org.junit.jupiter.api.function.Executable
|
||||
import org.junit.jupiter.api.{AfterEach, BeforeEach, Test}
|
||||
import org.junit.jupiter.params.ParameterizedTest
|
||||
import org.junit.jupiter.params.provider.{CsvSource, EnumSource, ValueSource}
|
||||
@@ -115,6 +113,13 @@ class TestHoodieSparkSqlWriter {
|
||||
sqlContext.clearCache();
|
||||
sqlContext = null;
|
||||
}
|
||||
if (sc != null) {
|
||||
sc.stop()
|
||||
sc = null
|
||||
}
|
||||
if (spark != null) {
|
||||
spark.close()
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -20,6 +20,8 @@ package org.apache.spark.sql.hudi
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient
|
||||
import org.apache.hudi.exception.HoodieDuplicateKeyException
|
||||
|
||||
import java.io.File
|
||||
|
||||
class TestInsertTable extends TestHoodieSqlBase {
|
||||
|
||||
test("Test Insert Into") {
|
||||
@@ -238,29 +240,46 @@ class TestInsertTable extends TestHoodieSqlBase {
|
||||
)
|
||||
typeAndValue.foreach { case (partitionType, partitionValue) =>
|
||||
val tableName = generateTableName
|
||||
// Create table
|
||||
spark.sql(
|
||||
s"""
|
||||
|create table $tableName (
|
||||
| id int,
|
||||
| name string,
|
||||
| price double,
|
||||
| dt $partitionType
|
||||
|) using hudi
|
||||
| tblproperties (primaryKey = 'id')
|
||||
| partitioned by (dt)
|
||||
| location '${tmp.getCanonicalPath}/$tableName'
|
||||
""".stripMargin)
|
||||
spark.sql(s"insert into $tableName partition(dt = $partitionValue) select 1, 'a1', 10")
|
||||
spark.sql(s"insert into $tableName select 2, 'a2', 10, $partitionValue")
|
||||
checkAnswer(s"select id, name, price, cast(dt as string) from $tableName order by id")(
|
||||
Seq(1, "a1", 10, removeQuotes(partitionValue).toString),
|
||||
Seq(2, "a2", 10, removeQuotes(partitionValue).toString)
|
||||
)
|
||||
validateDifferentTypesOfPartitionColumn(tmp, partitionType, partitionValue, tableName)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test("Test TimestampType Partition Column With Consistent Logical Timestamp Enabled") {
|
||||
withTempDir { tmp =>
|
||||
val typeAndValue = Seq(
|
||||
("timestamp", "'2021-05-20 00:00:00'"),
|
||||
("date", "'2021-05-20'")
|
||||
)
|
||||
typeAndValue.foreach { case (partitionType, partitionValue) =>
|
||||
val tableName = generateTableName
|
||||
spark.sql(s"set hoodie.datasource.write.keygenerator.consistent.logical.timestamp.enabled=true")
|
||||
validateDifferentTypesOfPartitionColumn(tmp, partitionType, partitionValue, tableName)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private def validateDifferentTypesOfPartitionColumn(tmp: File, partitionType: String, partitionValue: Any, tableName: String) = {
|
||||
spark.sql(
|
||||
s"""
|
||||
|create table $tableName (
|
||||
| id int,
|
||||
| name string,
|
||||
| price double,
|
||||
| dt $partitionType
|
||||
|) using hudi
|
||||
| tblproperties (primaryKey = 'id')
|
||||
| partitioned by (dt)
|
||||
| location '${tmp.getCanonicalPath}/$tableName'
|
||||
""".stripMargin)
|
||||
spark.sql(s"insert into $tableName partition(dt = $partitionValue) select 1, 'a1', 10")
|
||||
spark.sql(s"insert into $tableName select 2, 'a2', 10, $partitionValue")
|
||||
checkAnswer(s"select id, name, price, cast(dt as string) from $tableName order by id")(
|
||||
Seq(1, "a1", 10, removeQuotes(partitionValue).toString),
|
||||
Seq(2, "a2", 10, removeQuotes(partitionValue).toString)
|
||||
)
|
||||
}
|
||||
|
||||
test("Test insert for uppercase table name") {
|
||||
withTempDir{ tmp =>
|
||||
val tableName = s"H_$generateTableName"
|
||||
|
||||
Reference in New Issue
Block a user