1
0

[HUDI-1550] Honor ordering field for MOR Spark datasource reader (#2497)

This commit is contained in:
pengzhiwei
2021-02-01 21:04:27 +08:00
committed by GitHub
parent f159c0c49a
commit 0d8a4d0a56
8 changed files with 138 additions and 20 deletions

View File

@@ -17,11 +17,14 @@
package org.apache.hudi.functional
import org.apache.hudi.DataSourceWriteOptions.{KEYGENERATOR_CLASS_OPT_KEY, PARTITIONPATH_FIELD_OPT_KEY, PAYLOAD_CLASS_OPT_KEY, PRECOMBINE_FIELD_OPT_KEY, RECORDKEY_FIELD_OPT_KEY}
import org.apache.hudi.common.fs.FSUtils
import org.apache.hudi.common.model.DefaultHoodieRecordPayload
import org.apache.hudi.common.testutils.HoodieTestDataGenerator
import org.apache.hudi.config.{HoodieCompactionConfig, HoodieWriteConfig}
import org.apache.hudi.config.HoodieWriteConfig
import org.apache.hudi.{DataSourceReadOptions, DataSourceWriteOptions, HoodieDataSourceHelpers}
import org.apache.hudi.common.testutils.RawTripTestPayload.recordsToStrings
import org.apache.hudi.keygen.NonpartitionedKeyGenerator
import org.apache.hudi.testutils.HoodieClientTestBase
import org.apache.log4j.LogManager
import org.apache.spark.sql._
@@ -29,7 +32,6 @@ import org.apache.spark.sql.functions._
import org.junit.jupiter.api.Assertions.{assertEquals, assertTrue}
import org.junit.jupiter.api.{AfterEach, BeforeEach, Test}
import scala.collection.JavaConversions._
/**
@@ -502,6 +504,44 @@ class TestMORDataSource extends HoodieClientTestBase {
hudiSnapshotDF2.show(1)
}
@Test
def testPreCombineFiledForReadMOR(): Unit = {
  // Each step is (row written, row expected from a subsequent snapshot read).
  // The precombine field is "version": a write only wins if its version is
  // greater than the version already stored for the same record key.
  val steps = Seq(
    ((1, "a0", 10, 100), (1, "a0", 10, 100)), // initial insert is visible as-is
    ((1, "a0", 12, 99),  (1, "a0", 10, 100)), // stale write: version 99 < 100, value unchanged
    ((1, "a0", 12, 101), (1, "a0", 12, 101))  // newer write: version 101 > 100, value updated
  )
  steps.foreach { case (written, expected) =>
    writeData(written)
    checkAnswer(expected)
  }
}
/**
 * Upserts a single (id, name, value, version) row into the MOR table at `basePath`.
 *
 * Uses DefaultHoodieRecordPayload so that merging honors the precombine field
 * ("version") rather than simple arrival order, with a non-partitioned key layout.
 */
private def writeData(data: (Int, String, Int, Int)): Unit = {
  val _spark = spark
  import _spark.implicits._
  val frame = Seq(data).toDF("id", "name", "value", "version")
  // Later keys in this map override any matching keys from commonOpts,
  // mirroring the original chained .option(...) calls.
  val writeOptions = commonOpts ++ Map(
    // use DefaultHoodieRecordPayload here
    PAYLOAD_CLASS_OPT_KEY -> classOf[DefaultHoodieRecordPayload].getCanonicalName,
    DataSourceWriteOptions.TABLE_TYPE_OPT_KEY -> DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL,
    RECORDKEY_FIELD_OPT_KEY -> "id",
    PRECOMBINE_FIELD_OPT_KEY -> "version",
    PARTITIONPATH_FIELD_OPT_KEY -> "",
    KEYGENERATOR_CLASS_OPT_KEY -> classOf[NonpartitionedKeyGenerator].getName
  )
  frame.write
    .format("org.apache.hudi")
    .options(writeOptions)
    .mode(SaveMode.Append)
    .save(basePath)
}
/**
 * Snapshot-reads the table at `basePath` and asserts that the first row of the
 * (id, name, value, version) projection equals `expect`.
 */
private def checkAnswer(expect: (Int, String, Int, Int)): Unit = {
  val snapshot = spark.read
    .format("org.apache.hudi")
    .load(basePath + "/*")
    .select("id", "name", "value", "version")
  val expectedRow = Row(expect._1, expect._2, expect._3, expect._4)
  assertEquals(expectedRow, snapshot.take(1)(0))
}
def verifySchemaAndTypes(df: DataFrame): Unit = {
assertEquals("amount,currency,tip_history,_hoodie_commit_seqno",
df.select("fare.amount", "fare.currency", "tip_history", "_hoodie_commit_seqno")