1
0

[HUDI-2909] Handle logical type in TimestampBasedKeyGenerator (#4203)

* [HUDI-2909] Handle logical type in TimestampBasedKeyGenerator

The timestamp-based key generator was returning different values for the row-writer and non-row-writer paths. This patch fixes that inconsistency, guarded by a config flag (`hoodie.datasource.write.keygenerator.consistent.logical.timestamp.enabled`).
This commit is contained in:
Sagar Sumit
2022-01-08 20:52:44 +05:30
committed by GitHub
parent 03a83ffeb5
commit 827549949c
36 changed files with 364 additions and 101 deletions

View File

@@ -141,16 +141,16 @@ public class TestDataSourceUtils {
record.put("event_cost3", genericFixed);
assertEquals(LocalDate.ofEpochDay(18000).toString(), HoodieAvroUtils.getNestedFieldValAsString(record, "event_date1",
true));
true, false));
assertEquals(LocalDate.ofEpochDay(18001).toString(), HoodieAvroUtils.getNestedFieldValAsString(record, "event_date2",
true));
true, false));
assertEquals(LocalDate.ofEpochDay(18002).toString(), HoodieAvroUtils.getNestedFieldValAsString(record, "event_date3",
true));
assertEquals("Hudi Meetup", HoodieAvroUtils.getNestedFieldValAsString(record, "event_name", true));
assertEquals("Hudi PMC", HoodieAvroUtils.getNestedFieldValAsString(record, "event_organizer", true));
assertEquals(bigDecimal.toString(), HoodieAvroUtils.getNestedFieldValAsString(record, "event_cost1", true));
assertEquals(bigDecimal.toString(), HoodieAvroUtils.getNestedFieldValAsString(record, "event_cost2", true));
assertEquals(bigDecimal.toString(), HoodieAvroUtils.getNestedFieldValAsString(record, "event_cost3", true));
true, false));
assertEquals("Hudi Meetup", HoodieAvroUtils.getNestedFieldValAsString(record, "event_name", true, false));
assertEquals("Hudi PMC", HoodieAvroUtils.getNestedFieldValAsString(record, "event_organizer", true, false));
assertEquals(bigDecimal.toString(), HoodieAvroUtils.getNestedFieldValAsString(record, "event_cost1", true, false));
assertEquals(bigDecimal.toString(), HoodieAvroUtils.getNestedFieldValAsString(record, "event_cost2", true, false));
assertEquals(bigDecimal.toString(), HoodieAvroUtils.getNestedFieldValAsString(record, "event_cost3", true, false));
}
@Test

View File

@@ -246,8 +246,8 @@ class TestDataSourceDefaults {
var converterFn: Function1[Any, Any] = _
override def getKey(record: GenericRecord): HoodieKey = {
new HoodieKey(HoodieAvroUtils.getNestedFieldValAsString(record, recordKeyProp, true),
HoodieAvroUtils.getNestedFieldValAsString(record, partitionPathProp, true))
new HoodieKey(HoodieAvroUtils.getNestedFieldValAsString(record, recordKeyProp, true, false),
HoodieAvroUtils.getNestedFieldValAsString(record, partitionPathProp, true, false))
}
override def getRecordKey(row: Row): String = {
@@ -579,12 +579,12 @@ class TestDataSourceDefaults {
val props = new TypedProperties()
props.put(HoodiePayloadProps.PAYLOAD_ORDERING_FIELD_PROP_KEY, "favoriteIntNumber");
val basePayload = new OverwriteWithLatestAvroPayload(baseRecord, HoodieAvroUtils.convertValueForSpecificDataTypes(fieldSchema, baseOrderingVal).asInstanceOf[Comparable[_]])
val basePayload = new OverwriteWithLatestAvroPayload(baseRecord, HoodieAvroUtils.convertValueForSpecificDataTypes(fieldSchema, baseOrderingVal, false).asInstanceOf[Comparable[_]])
val laterRecord = SchemaTestUtil
.generateAvroRecordFromJson(schema, 2, "001", "f1")
val laterOrderingVal: Object = laterRecord.get("favoriteIntNumber")
val newerPayload = new OverwriteWithLatestAvroPayload(laterRecord, HoodieAvroUtils.convertValueForSpecificDataTypes(fieldSchema, laterOrderingVal).asInstanceOf[Comparable[_]])
val newerPayload = new OverwriteWithLatestAvroPayload(laterRecord, HoodieAvroUtils.convertValueForSpecificDataTypes(fieldSchema, laterOrderingVal, false).asInstanceOf[Comparable[_]])
// it will provide the record with greatest combine value
val preCombinedPayload = basePayload.preCombine(newerPayload)
@@ -606,10 +606,10 @@ class TestDataSourceDefaults {
val earlierOrderingVal: Object = earlierRecord.get("favoriteIntNumber")
val laterPayload = new DefaultHoodieRecordPayload(laterRecord,
HoodieAvroUtils.convertValueForSpecificDataTypes(fieldSchema, laterOrderingVal).asInstanceOf[Comparable[_]])
HoodieAvroUtils.convertValueForSpecificDataTypes(fieldSchema, laterOrderingVal, false).asInstanceOf[Comparable[_]])
val earlierPayload = new DefaultHoodieRecordPayload(earlierRecord,
HoodieAvroUtils.convertValueForSpecificDataTypes(fieldSchema, earlierOrderingVal).asInstanceOf[Comparable[_]])
HoodieAvroUtils.convertValueForSpecificDataTypes(fieldSchema, earlierOrderingVal, false).asInstanceOf[Comparable[_]])
// it will provide the record with greatest combine value
val preCombinedPayload = laterPayload.preCombine(earlierPayload)

View File

@@ -0,0 +1,118 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi
import org.apache.hudi.config.HoodieWriteConfig
import org.apache.hudi.testutils.HoodieClientTestBase
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.junit.jupiter.api.{AfterEach, BeforeEach, Test}
import java.sql.{Date, Timestamp}
/**
 * Verifies that record keys derived from Avro GenericRecords (upsert path)
 * and from Spark Rows (bulk_insert path) are identical when the
 * consistent-logical-timestamp flag is enabled, for timestamp and date
 * logical types.
 */
class TestGenericRecordAndRowConsistency extends HoodieClientTestBase {

  var spark: SparkSession = _

  // Shared write options: ComplexKeyGenerator over (str, eventTime) with the
  // consistent-logical-timestamp behavior switched on.
  val commonOpts = Map(
    HoodieWriteConfig.TBL_NAME.key -> "hoodie_type_consistency_tbl",
    "hoodie.insert.shuffle.parallelism" -> "1",
    "hoodie.upsert.shuffle.parallelism" -> "1",
    DataSourceWriteOptions.TABLE_TYPE.key -> "COPY_ON_WRITE",
    DataSourceWriteOptions.RECORDKEY_FIELD.key -> "str,eventTime",
    DataSourceWriteOptions.PRECOMBINE_FIELD.key -> "typeId",
    DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "typeId",
    DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.key -> "org.apache.hudi.keygen.ComplexKeyGenerator",
    DataSourceWriteOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.key -> "true"
  )

  /**
   * Setup method running before each test.
   */
  @BeforeEach override def setUp(): Unit = {
    setTableName("hoodie_type_consistency_tbl")
    initPath()
    initSparkContexts()
    spark = sqlContext.sparkSession
  }

  @AfterEach override def tearDown(): Unit = {
    cleanupSparkContexts()
  }

  /** Checks key consistency for a TimestampType record-key column. */
  @Test
  def testTimestampTypeConsistency(): Unit = {
    val session = spark
    import session.implicits._

    val input = Seq(
      (1, "2014-01-01 23:00:01", "abc"),
      (1, "2014-11-30 12:40:32", "abc"),
      (2, "2016-12-29 09:54:00", "def"),
      (2, "2016-05-09 10:12:43", "def")
    ).map { case (id, ts, s) => (id, Timestamp.valueOf(ts), s) }
      .toDF("typeId", "eventTime", "str")

    testConsistencyBetweenGenericRecordAndRow(input)
  }

  /** Checks key consistency for a DateType record-key column. */
  @Test
  def testDateTypeConsistency(): Unit = {
    val session = spark
    import session.implicits._

    val input = Seq(
      (1, "2014-01-01", "abc"),
      (1, "2014-11-30", "abc"),
      (2, "2016-12-29", "def"),
      (2, "2016-05-09", "def")
    ).map { case (id, d, s) => (id, Date.valueOf(d), s) }
      .toDF("typeId", "eventTime", "str")

    testConsistencyBetweenGenericRecordAndRow(input)
  }

  /**
   * Writes the same DataFrame via both write paths and asserts that the
   * resulting _hoodie_record_key values match.
   */
  private def testConsistencyBetweenGenericRecordAndRow(df: DataFrame): Unit = {
    // upsert generates record keys from Avro GenericRecords.
    val keysFromRecordPath = writeAndCollectRecordKeys(df, "upsert", basePath + "/record_tbl/")
    // bulk_insert generates record keys directly from Spark Rows.
    val keysFromRowPath = writeAndCollectRecordKeys(df, "bulk_insert", basePath + "/row_tbl/")
    assert(keysFromRecordPath sameElements keysFromRowPath)
  }

  /** Writes df with the given operation, then reads back the sorted record keys. */
  private def writeAndCollectRecordKeys(df: DataFrame, operation: String, path: String): Array[String] = {
    val session = spark
    import session.implicits._

    df.write.format("hudi")
      .options(commonOpts)
      .option(DataSourceWriteOptions.OPERATION.key, operation)
      .mode(org.apache.spark.sql.SaveMode.Overwrite)
      .save(path)

    spark.read.format("hudi")
      .load(path)
      .select("_hoodie_record_key")
      .map(_.toString()).collect().sorted
  }
}

View File

@@ -25,9 +25,8 @@ import org.apache.hudi.common.config.HoodieConfig
import org.apache.hudi.common.model._
import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient, TableSchemaResolver}
import org.apache.hudi.common.testutils.HoodieTestDataGenerator
import org.apache.hudi.common.util.PartitionPathEncodeUtils
import org.apache.hudi.config.{HoodieBootstrapConfig, HoodieWriteConfig}
import org.apache.hudi.exception.{ExceptionUtil, HoodieException}
import org.apache.hudi.exception.HoodieException
import org.apache.hudi.execution.bulkinsert.BulkInsertSortMode
import org.apache.hudi.functional.TestBootstrap
import org.apache.hudi.hive.HiveSyncConfig
@@ -35,13 +34,12 @@ import org.apache.hudi.keygen.{ComplexKeyGenerator, NonpartitionedKeyGenerator,
import org.apache.hudi.testutils.DataSourceTestUtils
import org.apache.spark.SparkContext
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.functions.{expr, lit}
import org.apache.spark.sql.hudi.HoodieSparkSessionExtension
import org.apache.spark.sql.hudi.command.SqlKeyGenerator
import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf}
import org.apache.spark.sql.{DataFrame, Dataset, Row, SQLContext, SaveMode, SparkSession}
import org.junit.jupiter.api.Assertions.{assertEquals, assertFalse, assertTrue, fail}
import org.junit.jupiter.api.function.Executable
import org.junit.jupiter.api.{AfterEach, BeforeEach, Test}
import org.junit.jupiter.params.ParameterizedTest
import org.junit.jupiter.params.provider.{CsvSource, EnumSource, ValueSource}
@@ -115,6 +113,13 @@ class TestHoodieSparkSqlWriter {
sqlContext.clearCache();
sqlContext = null;
}
if (sc != null) {
sc.stop()
sc = null
}
if (spark != null) {
spark.close()
}
}
/**

View File

@@ -20,6 +20,8 @@ package org.apache.spark.sql.hudi
import org.apache.hudi.common.table.HoodieTableMetaClient
import org.apache.hudi.exception.HoodieDuplicateKeyException
import java.io.File
class TestInsertTable extends TestHoodieSqlBase {
test("Test Insert Into") {
@@ -238,29 +240,46 @@ class TestInsertTable extends TestHoodieSqlBase {
)
typeAndValue.foreach { case (partitionType, partitionValue) =>
val tableName = generateTableName
// Create table
spark.sql(
s"""
|create table $tableName (
| id int,
| name string,
| price double,
| dt $partitionType
|) using hudi
| tblproperties (primaryKey = 'id')
| partitioned by (dt)
| location '${tmp.getCanonicalPath}/$tableName'
""".stripMargin)
spark.sql(s"insert into $tableName partition(dt = $partitionValue) select 1, 'a1', 10")
spark.sql(s"insert into $tableName select 2, 'a2', 10, $partitionValue")
checkAnswer(s"select id, name, price, cast(dt as string) from $tableName order by id")(
Seq(1, "a1", 10, removeQuotes(partitionValue).toString),
Seq(2, "a2", 10, removeQuotes(partitionValue).toString)
)
validateDifferentTypesOfPartitionColumn(tmp, partitionType, partitionValue, tableName)
}
}
}
test("Test TimestampType Partition Column With Consistent Logical Timestamp Enabled") {
withTempDir { tmp =>
val typeAndValue = Seq(
("timestamp", "'2021-05-20 00:00:00'"),
("date", "'2021-05-20'")
)
typeAndValue.foreach { case (partitionType, partitionValue) =>
val tableName = generateTableName
spark.sql(s"set hoodie.datasource.write.keygenerator.consistent.logical.timestamp.enabled=true")
validateDifferentTypesOfPartitionColumn(tmp, partitionType, partitionValue, tableName)
}
}
}
private def validateDifferentTypesOfPartitionColumn(tmp: File, partitionType: String, partitionValue: Any, tableName: String) = {
spark.sql(
s"""
|create table $tableName (
| id int,
| name string,
| price double,
| dt $partitionType
|) using hudi
| tblproperties (primaryKey = 'id')
| partitioned by (dt)
| location '${tmp.getCanonicalPath}/$tableName'
""".stripMargin)
spark.sql(s"insert into $tableName partition(dt = $partitionValue) select 1, 'a1', 10")
spark.sql(s"insert into $tableName select 2, 'a2', 10, $partitionValue")
checkAnswer(s"select id, name, price, cast(dt as string) from $tableName order by id")(
Seq(1, "a1", 10, removeQuotes(partitionValue).toString),
Seq(2, "a2", 10, removeQuotes(partitionValue).toString)
)
}
test("Test insert for uppercase table name") {
withTempDir{ tmp =>
val tableName = s"H_$generateTableName"