1
0

Revert "[HUDI-2495] Resolve inconsistent key generation for timestamp types by GenericRecord and Row (#3944)" (#4201)

This commit is contained in:
Yann Byron
2021-12-04 00:13:38 +08:00
committed by GitHub
parent bed7f9897a
commit 2f96f4300b
3 changed files with 1 additions and 109 deletions

View File

@@ -1,104 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi
import org.apache.hudi.config.HoodieWriteConfig
import org.apache.hudi.testutils.SparkClientFunctionalTestHarness
import org.apache.spark.sql.DataFrame
import org.junit.jupiter.api.Test
import java.sql.{Date, Timestamp}
class TestGenericRecordAndRowConsistency extends SparkClientFunctionalTestHarness {
val commonOpts = Map(
HoodieWriteConfig.TBL_NAME.key -> "hoodie_type_consistency_tbl",
"hoodie.insert.shuffle.parallelism" -> "1",
"hoodie.upsert.shuffle.parallelism" -> "1",
DataSourceWriteOptions.TABLE_TYPE.key -> "COPY_ON_WRITE",
DataSourceWriteOptions.RECORDKEY_FIELD.key -> "str,eventTime",
DataSourceWriteOptions.PRECOMBINE_FIELD.key -> "typeId",
DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "typeId",
DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.key -> "org.apache.hudi.keygen.ComplexKeyGenerator"
)
@Test
def testTimestampTypeConsistency(): Unit = {
val _spark = spark
import _spark.implicits._
val df = Seq(
(1, Timestamp.valueOf("2014-01-01 23:00:01"), "abc"),
(1, Timestamp.valueOf("2014-11-30 12:40:32"), "abc"),
(2, Timestamp.valueOf("2016-12-29 09:54:00"), "def"),
(2, Timestamp.valueOf("2016-05-09 10:12:43"), "def")
).toDF("typeId", "eventTime", "str")
testConsistencyBetweenGenericRecordAndRow(df)
}
@Test
def testDateTypeConsistency(): Unit = {
val _spark = spark
import _spark.implicits._
val df = Seq(
(1, Date.valueOf("2014-01-01"), "abc"),
(1, Date.valueOf("2014-11-30"), "abc"),
(2, Date.valueOf("2016-12-29"), "def"),
(2, Date.valueOf("2016-05-09"), "def")
).toDF("typeId", "eventTime", "str")
testConsistencyBetweenGenericRecordAndRow(df)
}
private def testConsistencyBetweenGenericRecordAndRow(df: DataFrame): Unit = {
val _spark = spark
import _spark.implicits._
// upsert operation generate recordKey by GenericRecord
val tempRecordPath = basePath + "/record_tbl/"
df.write.format("hudi")
.options(commonOpts)
.option(DataSourceWriteOptions.OPERATION.key, "upsert")
.mode(org.apache.spark.sql.SaveMode.Overwrite)
.save(tempRecordPath)
val data1 = spark.read.format("hudi")
.load(tempRecordPath)
.select("_hoodie_record_key")
.map(_.toString()).collect().sorted
// bulk_insert operation generate recordKey by Row
val tempRowPath = basePath + "/row_tbl/"
df.write.format("hudi")
.options(commonOpts)
.option(DataSourceWriteOptions.OPERATION.key, "bulk_insert")
.mode(org.apache.spark.sql.SaveMode.Overwrite)
.save(tempRowPath)
val data2 = spark.read.format("hudi")
.load(tempRowPath)
.select("_hoodie_record_key")
.map(_.toString()).collect().sorted
assert(data1 sameElements data2)
}
}