[HUDI-3268] Fix NPE while reading table with Spark datasource (#4630)
SparkHoodieTableFileIndex.scala

@@ -86,8 +86,10 @@ class SparkHoodieTableFileIndex(spark: SparkSession,
     val nameFieldMap = generateFieldMap(schema)
 
     if (partitionColumns.isPresent) {
-      if (tableConfig.getKeyGeneratorClassName.equalsIgnoreCase(classOf[TimestampBasedKeyGenerator].getName)
-        || tableConfig.getKeyGeneratorClassName.equalsIgnoreCase(classOf[TimestampBasedAvroKeyGenerator].getName)) {
+      // Note that key generator class name could be null
+      val keyGeneratorClassName = tableConfig.getKeyGeneratorClassName
+      if (classOf[TimestampBasedKeyGenerator].getName.equalsIgnoreCase(keyGeneratorClassName)
+        || classOf[TimestampBasedAvroKeyGenerator].getName.equalsIgnoreCase(keyGeneratorClassName)) {
         val partitionFields = partitionColumns.get().map(column => StructField(column, StringType))
         StructType(partitionFields)
       } else {
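The NPE fixed above comes from `tableConfig.getKeyGeneratorClassName` returning null for tables whose config never recorded a key generator class; calling `equalsIgnoreCase` on that null receiver throws. The fix flips the comparison so the known-non-null class-name constant is the receiver, and `String.equalsIgnoreCase(null)` is specified to return false. A minimal, self-contained sketch of the idiom (the object name and the hard-coded null are illustrative stand-ins for what `tableConfig` would return):

    object NullSafeKeyGenCheck extends App {
      // Stand-in for tableConfig.getKeyGeneratorClassName, which can be null
      // when the table config never recorded a key generator class.
      val keyGeneratorClassName: String = null

      // Pre-fix shape: null receiver => NullPointerException.
      // keyGeneratorClassName.equalsIgnoreCase("org.apache.hudi.keygen.TimestampBasedKeyGenerator")

      // Post-fix shape: the constant is the receiver; equalsIgnoreCase returns
      // false for a null argument, so the check degrades gracefully.
      val isTimestampBased =
        "org.apache.hudi.keygen.TimestampBasedKeyGenerator".equalsIgnoreCase(keyGeneratorClassName)

      println(isTimestampBased) // prints: false
    }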
pom.xml

@@ -466,6 +466,12 @@
       <type>test-jar</type>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>org.apache.hudi</groupId>
+      <artifactId>hudi-java-client</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
 
     <dependency>
       <groupId>org.scalatest</groupId>
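The new `hudi-java-client` test dependency supports the `testPartitionSchemaWithoutKeyGenerator` case added further down, which writes the test table through the Java write client instead of the Spark datasource.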
TestHoodieFileIndex.scala

@@ -17,29 +17,36 @@
 
 package org.apache.hudi
 
-import java.util.Properties
+import org.apache.hadoop.conf.Configuration
 import org.apache.hudi.DataSourceWriteOptions._
+import org.apache.hudi.client.HoodieJavaWriteClient
+import org.apache.hudi.client.common.HoodieJavaEngineContext
 import org.apache.hudi.common.config.HoodieMetadataConfig
+import org.apache.hudi.common.engine.EngineType
+import org.apache.hudi.common.model.{HoodieRecord, HoodieTableType}
 import org.apache.hudi.common.table.HoodieTableMetaClient
 import org.apache.hudi.common.table.view.HoodieTableFileSystemView
-import org.apache.hudi.common.testutils.HoodieTestDataGenerator
+import org.apache.hudi.common.testutils.{HoodieTestDataGenerator, HoodieTestUtils}
+import org.apache.hudi.common.testutils.HoodieTestTable.makeNewCommitTime
 import org.apache.hudi.common.testutils.RawTripTestPayload.recordsToStrings
 import org.apache.hudi.common.util.PartitionPathEncodeUtils
+import org.apache.hudi.common.util.StringUtils.isNullOrEmpty
 import org.apache.hudi.config.HoodieWriteConfig
 import org.apache.hudi.keygen.ComplexKeyGenerator
 import org.apache.hudi.keygen.TimestampBasedAvroKeyGenerator.{Config, TimestampType}
 import org.apache.hudi.testutils.HoodieClientTestBase
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.functions.{lit, struct}
 import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, EqualTo, GreaterThanOrEqual, LessThan, Literal}
 import org.apache.spark.sql.execution.datasources.PartitionDirectory
+import org.apache.spark.sql.functions.{lit, struct}
 import org.apache.spark.sql.types.StringType
-import org.apache.spark.sql.{SaveMode, SparkSession}
+import org.apache.spark.sql.{DataFrameWriter, Row, SaveMode, SparkSession}
 import org.junit.jupiter.api.Assertions.assertEquals
-import org.junit.jupiter.api.BeforeEach
+import org.junit.jupiter.api.{BeforeEach, Test}
 import org.junit.jupiter.params.ParameterizedTest
-import org.junit.jupiter.params.provider.{CsvSource, ValueSource}
+import org.junit.jupiter.params.provider.{Arguments, CsvSource, MethodSource, ValueSource}
 
+import java.util.Properties
 import scala.collection.JavaConversions._
 import scala.collection.JavaConverters._
 
@@ -92,22 +99,25 @@ class TestHoodieFileIndex extends HoodieClientTestBase {
   }
 
   @ParameterizedTest
-  @ValueSource(strings = Array(
-    "org.apache.hudi.keygen.ComplexKeyGenerator",
-    "org.apache.hudi.keygen.SimpleKeyGenerator",
-    "org.apache.hudi.keygen.TimestampBasedKeyGenerator"))
+  @MethodSource(Array("keyGeneratorParameters"))
   def testPartitionSchemaForBuildInKeyGenerator(keyGenerator: String): Unit = {
     val records1 = dataGen.generateInsertsContainsAllPartitions("000", 100)
     val inputDF1 = spark.read.json(spark.sparkContext.parallelize(recordsToStrings(records1), 2))
-    inputDF1.write.format("hudi")
+    val writer: DataFrameWriter[Row] = inputDF1.write.format("hudi")
       .options(commonOpts)
       .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
-      .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.key, keyGenerator)
       .option(Config.TIMESTAMP_TYPE_FIELD_PROP, TimestampType.DATE_STRING.name())
       .option(Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP, "yyyy/MM/dd")
       .option(Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, "yyyy-MM-dd")
       .mode(SaveMode.Overwrite)
+
+    if (isNullOrEmpty(keyGenerator)) {
+      writer.save(basePath)
+    } else {
+      writer.option(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.key, keyGenerator)
         .save(basePath)
+    }
+
     metaClient = HoodieTableMetaClient.reload(metaClient)
     val fileIndex = HoodieFileIndex(spark, metaClient, None, queryOpts)
     assertEquals("partition", fileIndex.partitionSchema.fields.map(_.name).mkString(","))
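The write is split into a reusable `writer` and a conditional save because the parameter source now includes a null entry: JUnit 5's `@ValueSource` cannot hold a null string (annotation arrays do not admit null), so the test moves to `@MethodSource` backed by `keyGeneratorParameters` (defined in the companion object at the end of the file), and the `KEYGENERATOR_CLASS_NAME` option is only set when a key generator is actually supplied.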
@@ -132,6 +142,44 @@ class TestHoodieFileIndex extends HoodieClientTestBase {
     assertEquals("partition", fileIndex.partitionSchema.fields.map(_.name).mkString(","))
   }
 
+  @Test
+  def testPartitionSchemaWithoutKeyGenerator(): Unit = {
+    val metaClient = HoodieTestUtils.init(
+      hadoopConf, basePath, HoodieTableType.COPY_ON_WRITE, HoodieTableMetaClient.withPropertyBuilder()
+        .fromMetaClient(this.metaClient)
+        .setRecordKeyFields("_row_key")
+        .setPartitionFields("partition_path")
+        .setTableName("hoodie_test").build())
+    val props = Map(
+      "hoodie.insert.shuffle.parallelism" -> "4",
+      "hoodie.upsert.shuffle.parallelism" -> "4",
+      DataSourceWriteOptions.RECORDKEY_FIELD.key -> "_row_key",
+      DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "partition_path",
+      DataSourceWriteOptions.PRECOMBINE_FIELD.key -> "timestamp",
+      HoodieWriteConfig.TBL_NAME.key -> "hoodie_test",
+      DataSourceWriteOptions.OPERATION.key -> DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL
+    )
+    val writeConfig = HoodieWriteConfig.newBuilder()
+      .withEngineType(EngineType.JAVA)
+      .withPath(basePath)
+      .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA)
+      .withProps(props)
+      .build()
+    val context = new HoodieJavaEngineContext(new Configuration())
+    val writeClient = new HoodieJavaWriteClient(context, writeConfig)
+    val instantTime = makeNewCommitTime()
+
+    val records: java.util.List[HoodieRecord[Nothing]] =
+      dataGen.generateInsertsContainsAllPartitions(instantTime, 100)
+        .asInstanceOf[java.util.List[HoodieRecord[Nothing]]]
+    writeClient.startCommitWithTime(instantTime)
+    writeClient.insert(records, instantTime)
+    metaClient.reloadActiveTimeline()
+
+    val fileIndex = HoodieFileIndex(spark, metaClient, None, queryOpts)
+    assertEquals("partition_path", fileIndex.partitionSchema.fields.map(_.name).mkString(","))
+  }
+
   @ParameterizedTest
   @ValueSource(booleans = Array(true, false))
   def testPartitionPruneWithPartitionEncode(partitionEncode: Boolean): Unit = {
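Because this new test writes through `HoodieJavaWriteClient` rather than the Spark datasource, no key generator class ends up recorded in the table's config, so the `HoodieFileIndex` assertion exercises exactly the null-class-name path that the fix above guards.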
@@ -295,3 +343,14 @@ class TestHoodieFileIndex extends HoodieClientTestBase {
     partitionPaths.map(getFileCountInPartitionPath).sum
   }
 }
+
+object TestHoodieFileIndex {
+  def keyGeneratorParameters(): java.util.stream.Stream[Arguments] = {
+    java.util.stream.Stream.of(
+      Arguments.arguments(null.asInstanceOf[String]),
+      Arguments.arguments("org.apache.hudi.keygen.ComplexKeyGenerator"),
+      Arguments.arguments("org.apache.hudi.keygen.SimpleKeyGenerator"),
+      Arguments.arguments("org.apache.hudi.keygen.TimestampBasedKeyGenerator")
+    )
+  }
+}
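Two details of this companion object are easy to miss: `@MethodSource` requires a static factory method, which appears to work here because scalac emits static forwarders for companion-object methods onto the test class; and the `null.asInstanceOf[String]` entry types the null explicitly, making it unambiguous that a single null argument is being passed to `Arguments.arguments` to drive the no-key-generator case.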