1
0

[HUDI-2909] Handle logical type in TimestampBasedKeyGenerator (#4203)

* [HUDI-2909] Handle logical type in TimestampBasedKeyGenerator

The timestamp-based key generator was returning different values for the row-writer and non-row-writer paths. This patch fixes that inconsistency, guarded by a config flag (`hoodie.datasource.write.keygenerator.consistent.logical.timestamp.enabled`).
This commit is contained in:
Sagar Sumit
2022-01-08 20:52:44 +05:30
committed by GitHub
parent 03a83ffeb5
commit 827549949c
36 changed files with 364 additions and 101 deletions

View File

@@ -141,16 +141,16 @@ public class TestDataSourceUtils {
record.put("event_cost3", genericFixed);
assertEquals(LocalDate.ofEpochDay(18000).toString(), HoodieAvroUtils.getNestedFieldValAsString(record, "event_date1",
true));
true, false));
assertEquals(LocalDate.ofEpochDay(18001).toString(), HoodieAvroUtils.getNestedFieldValAsString(record, "event_date2",
true));
true, false));
assertEquals(LocalDate.ofEpochDay(18002).toString(), HoodieAvroUtils.getNestedFieldValAsString(record, "event_date3",
true));
assertEquals("Hudi Meetup", HoodieAvroUtils.getNestedFieldValAsString(record, "event_name", true));
assertEquals("Hudi PMC", HoodieAvroUtils.getNestedFieldValAsString(record, "event_organizer", true));
assertEquals(bigDecimal.toString(), HoodieAvroUtils.getNestedFieldValAsString(record, "event_cost1", true));
assertEquals(bigDecimal.toString(), HoodieAvroUtils.getNestedFieldValAsString(record, "event_cost2", true));
assertEquals(bigDecimal.toString(), HoodieAvroUtils.getNestedFieldValAsString(record, "event_cost3", true));
true, false));
assertEquals("Hudi Meetup", HoodieAvroUtils.getNestedFieldValAsString(record, "event_name", true, false));
assertEquals("Hudi PMC", HoodieAvroUtils.getNestedFieldValAsString(record, "event_organizer", true, false));
assertEquals(bigDecimal.toString(), HoodieAvroUtils.getNestedFieldValAsString(record, "event_cost1", true, false));
assertEquals(bigDecimal.toString(), HoodieAvroUtils.getNestedFieldValAsString(record, "event_cost2", true, false));
assertEquals(bigDecimal.toString(), HoodieAvroUtils.getNestedFieldValAsString(record, "event_cost3", true, false));
}
@Test

View File

@@ -246,8 +246,8 @@ class TestDataSourceDefaults {
var converterFn: Function1[Any, Any] = _
override def getKey(record: GenericRecord): HoodieKey = {
new HoodieKey(HoodieAvroUtils.getNestedFieldValAsString(record, recordKeyProp, true),
HoodieAvroUtils.getNestedFieldValAsString(record, partitionPathProp, true))
new HoodieKey(HoodieAvroUtils.getNestedFieldValAsString(record, recordKeyProp, true, false),
HoodieAvroUtils.getNestedFieldValAsString(record, partitionPathProp, true, false))
}
override def getRecordKey(row: Row): String = {
@@ -579,12 +579,12 @@ class TestDataSourceDefaults {
val props = new TypedProperties()
props.put(HoodiePayloadProps.PAYLOAD_ORDERING_FIELD_PROP_KEY, "favoriteIntNumber");
val basePayload = new OverwriteWithLatestAvroPayload(baseRecord, HoodieAvroUtils.convertValueForSpecificDataTypes(fieldSchema, baseOrderingVal).asInstanceOf[Comparable[_]])
val basePayload = new OverwriteWithLatestAvroPayload(baseRecord, HoodieAvroUtils.convertValueForSpecificDataTypes(fieldSchema, baseOrderingVal, false).asInstanceOf[Comparable[_]])
val laterRecord = SchemaTestUtil
.generateAvroRecordFromJson(schema, 2, "001", "f1")
val laterOrderingVal: Object = laterRecord.get("favoriteIntNumber")
val newerPayload = new OverwriteWithLatestAvroPayload(laterRecord, HoodieAvroUtils.convertValueForSpecificDataTypes(fieldSchema, laterOrderingVal).asInstanceOf[Comparable[_]])
val newerPayload = new OverwriteWithLatestAvroPayload(laterRecord, HoodieAvroUtils.convertValueForSpecificDataTypes(fieldSchema, laterOrderingVal, false).asInstanceOf[Comparable[_]])
// it will provide the record with greatest combine value
val preCombinedPayload = basePayload.preCombine(newerPayload)
@@ -606,10 +606,10 @@ class TestDataSourceDefaults {
val earlierOrderingVal: Object = earlierRecord.get("favoriteIntNumber")
val laterPayload = new DefaultHoodieRecordPayload(laterRecord,
HoodieAvroUtils.convertValueForSpecificDataTypes(fieldSchema, laterOrderingVal).asInstanceOf[Comparable[_]])
HoodieAvroUtils.convertValueForSpecificDataTypes(fieldSchema, laterOrderingVal, false).asInstanceOf[Comparable[_]])
val earlierPayload = new DefaultHoodieRecordPayload(earlierRecord,
HoodieAvroUtils.convertValueForSpecificDataTypes(fieldSchema, earlierOrderingVal).asInstanceOf[Comparable[_]])
HoodieAvroUtils.convertValueForSpecificDataTypes(fieldSchema, earlierOrderingVal, false).asInstanceOf[Comparable[_]])
// it will provide the record with greatest combine value
val preCombinedPayload = laterPayload.preCombine(earlierPayload)

View File

@@ -0,0 +1,118 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi
import org.apache.hudi.config.HoodieWriteConfig
import org.apache.hudi.testutils.HoodieClientTestBase
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.junit.jupiter.api.{AfterEach, BeforeEach, Test}
import java.sql.{Date, Timestamp}
/**
 * Verifies that record keys derived from Avro GenericRecords (upsert path)
 * and from Spark Rows (bulk_insert path) are identical when the
 * consistent-logical-timestamp flag is enabled, for timestamp and date
 * logical types.
 */
class TestGenericRecordAndRowConsistency extends HoodieClientTestBase {

  var spark: SparkSession = _

  // Shared write options: ComplexKeyGenerator over (str, eventTime) with the
  // consistent-logical-timestamp behavior switched on.
  val commonOpts = Map(
    HoodieWriteConfig.TBL_NAME.key -> "hoodie_type_consistency_tbl",
    "hoodie.insert.shuffle.parallelism" -> "1",
    "hoodie.upsert.shuffle.parallelism" -> "1",
    DataSourceWriteOptions.TABLE_TYPE.key -> "COPY_ON_WRITE",
    DataSourceWriteOptions.RECORDKEY_FIELD.key -> "str,eventTime",
    DataSourceWriteOptions.PRECOMBINE_FIELD.key -> "typeId",
    DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "typeId",
    DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.key -> "org.apache.hudi.keygen.ComplexKeyGenerator",
    DataSourceWriteOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.key -> "true"
  )

  /**
   * Setup method running before each test.
   */
  @BeforeEach override def setUp(): Unit = {
    setTableName("hoodie_type_consistency_tbl")
    initPath()
    initSparkContexts()
    spark = sqlContext.sparkSession
  }

  @AfterEach override def tearDown(): Unit = {
    cleanupSparkContexts()
  }

  /** Checks key consistency for a TimestampType record-key column. */
  @Test
  def testTimestampTypeConsistency(): Unit = {
    val session = spark
    import session.implicits._

    val input = Seq(
      (1, "2014-01-01 23:00:01", "abc"),
      (1, "2014-11-30 12:40:32", "abc"),
      (2, "2016-12-29 09:54:00", "def"),
      (2, "2016-05-09 10:12:43", "def")
    ).map { case (id, ts, s) => (id, Timestamp.valueOf(ts), s) }
      .toDF("typeId", "eventTime", "str")

    testConsistencyBetweenGenericRecordAndRow(input)
  }

  /** Checks key consistency for a DateType record-key column. */
  @Test
  def testDateTypeConsistency(): Unit = {
    val session = spark
    import session.implicits._

    val input = Seq(
      (1, "2014-01-01", "abc"),
      (1, "2014-11-30", "abc"),
      (2, "2016-12-29", "def"),
      (2, "2016-05-09", "def")
    ).map { case (id, d, s) => (id, Date.valueOf(d), s) }
      .toDF("typeId", "eventTime", "str")

    testConsistencyBetweenGenericRecordAndRow(input)
  }

  /**
   * Writes the same DataFrame via both write paths and asserts that the
   * resulting _hoodie_record_key values match.
   */
  private def testConsistencyBetweenGenericRecordAndRow(df: DataFrame): Unit = {
    // upsert generates record keys from Avro GenericRecords.
    val keysFromRecordPath = writeAndCollectRecordKeys(df, "upsert", basePath + "/record_tbl/")
    // bulk_insert generates record keys directly from Spark Rows.
    val keysFromRowPath = writeAndCollectRecordKeys(df, "bulk_insert", basePath + "/row_tbl/")
    assert(keysFromRecordPath sameElements keysFromRowPath)
  }

  /** Writes df with the given operation, then reads back the sorted record keys. */
  private def writeAndCollectRecordKeys(df: DataFrame, operation: String, path: String): Array[String] = {
    val session = spark
    import session.implicits._

    df.write.format("hudi")
      .options(commonOpts)
      .option(DataSourceWriteOptions.OPERATION.key, operation)
      .mode(org.apache.spark.sql.SaveMode.Overwrite)
      .save(path)

    spark.read.format("hudi")
      .load(path)
      .select("_hoodie_record_key")
      .map(_.toString()).collect().sorted
  }
}

View File

@@ -25,9 +25,8 @@ import org.apache.hudi.common.config.HoodieConfig
import org.apache.hudi.common.model._
import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient, TableSchemaResolver}
import org.apache.hudi.common.testutils.HoodieTestDataGenerator
import org.apache.hudi.common.util.PartitionPathEncodeUtils
import org.apache.hudi.config.{HoodieBootstrapConfig, HoodieWriteConfig}
import org.apache.hudi.exception.{ExceptionUtil, HoodieException}
import org.apache.hudi.exception.HoodieException
import org.apache.hudi.execution.bulkinsert.BulkInsertSortMode
import org.apache.hudi.functional.TestBootstrap
import org.apache.hudi.hive.HiveSyncConfig
@@ -35,13 +34,12 @@ import org.apache.hudi.keygen.{ComplexKeyGenerator, NonpartitionedKeyGenerator,
import org.apache.hudi.testutils.DataSourceTestUtils
import org.apache.spark.SparkContext
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.functions.{expr, lit}
import org.apache.spark.sql.hudi.HoodieSparkSessionExtension
import org.apache.spark.sql.hudi.command.SqlKeyGenerator
import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf}
import org.apache.spark.sql.{DataFrame, Dataset, Row, SQLContext, SaveMode, SparkSession}
import org.junit.jupiter.api.Assertions.{assertEquals, assertFalse, assertTrue, fail}
import org.junit.jupiter.api.function.Executable
import org.junit.jupiter.api.{AfterEach, BeforeEach, Test}
import org.junit.jupiter.params.ParameterizedTest
import org.junit.jupiter.params.provider.{CsvSource, EnumSource, ValueSource}
@@ -115,6 +113,13 @@ class TestHoodieSparkSqlWriter {
sqlContext.clearCache();
sqlContext = null;
}
if (sc != null) {
sc.stop()
sc = null
}
if (spark != null) {
spark.close()
}
}
/**

View File

@@ -20,6 +20,8 @@ package org.apache.spark.sql.hudi
import org.apache.hudi.common.table.HoodieTableMetaClient
import org.apache.hudi.exception.HoodieDuplicateKeyException
import java.io.File
class TestInsertTable extends TestHoodieSqlBase {
test("Test Insert Into") {
@@ -238,29 +240,46 @@ class TestInsertTable extends TestHoodieSqlBase {
)
typeAndValue.foreach { case (partitionType, partitionValue) =>
val tableName = generateTableName
// Create table
spark.sql(
s"""
|create table $tableName (
| id int,
| name string,
| price double,
| dt $partitionType
|) using hudi
| tblproperties (primaryKey = 'id')
| partitioned by (dt)
| location '${tmp.getCanonicalPath}/$tableName'
""".stripMargin)
spark.sql(s"insert into $tableName partition(dt = $partitionValue) select 1, 'a1', 10")
spark.sql(s"insert into $tableName select 2, 'a2', 10, $partitionValue")
checkAnswer(s"select id, name, price, cast(dt as string) from $tableName order by id")(
Seq(1, "a1", 10, removeQuotes(partitionValue).toString),
Seq(2, "a2", 10, removeQuotes(partitionValue).toString)
)
validateDifferentTypesOfPartitionColumn(tmp, partitionType, partitionValue, tableName)
}
}
}
test("Test TimestampType Partition Column With Consistent Logical Timestamp Enabled") {
withTempDir { tmp =>
val typeAndValue = Seq(
("timestamp", "'2021-05-20 00:00:00'"),
("date", "'2021-05-20'")
)
typeAndValue.foreach { case (partitionType, partitionValue) =>
val tableName = generateTableName
spark.sql(s"set hoodie.datasource.write.keygenerator.consistent.logical.timestamp.enabled=true")
validateDifferentTypesOfPartitionColumn(tmp, partitionType, partitionValue, tableName)
}
}
}
private def validateDifferentTypesOfPartitionColumn(tmp: File, partitionType: String, partitionValue: Any, tableName: String) = {
spark.sql(
s"""
|create table $tableName (
| id int,
| name string,
| price double,
| dt $partitionType
|) using hudi
| tblproperties (primaryKey = 'id')
| partitioned by (dt)
| location '${tmp.getCanonicalPath}/$tableName'
""".stripMargin)
spark.sql(s"insert into $tableName partition(dt = $partitionValue) select 1, 'a1', 10")
spark.sql(s"insert into $tableName select 2, 'a2', 10, $partitionValue")
checkAnswer(s"select id, name, price, cast(dt as string) from $tableName order by id")(
Seq(1, "a1", 10, removeQuotes(partitionValue).toString),
Seq(2, "a2", 10, removeQuotes(partitionValue).toString)
)
}
test("Test insert for uppercase table name") {
withTempDir{ tmp =>
val tableName = s"H_$generateTableName"