[HUDI-2217] Fix no value present in incremental query on MOR (#3340)
This commit is contained in:
@@ -17,7 +17,6 @@
|
|||||||
|
|
||||||
package org.apache.hudi
|
package org.apache.hudi
|
||||||
|
|
||||||
import org.apache.hudi.common.fs.FSUtils
|
|
||||||
import org.apache.hudi.common.model.HoodieRecord
|
import org.apache.hudi.common.model.HoodieRecord
|
||||||
import org.apache.hudi.common.table.view.HoodieTableFileSystemView
|
import org.apache.hudi.common.table.view.HoodieTableFileSystemView
|
||||||
import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver}
|
import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver}
|
||||||
@@ -52,7 +51,6 @@ class MergeOnReadIncrementalRelation(val sqlContext: SQLContext,
|
|||||||
private val log = LogManager.getLogger(classOf[MergeOnReadIncrementalRelation])
|
private val log = LogManager.getLogger(classOf[MergeOnReadIncrementalRelation])
|
||||||
private val conf = sqlContext.sparkContext.hadoopConfiguration
|
private val conf = sqlContext.sparkContext.hadoopConfiguration
|
||||||
private val jobConf = new JobConf(conf)
|
private val jobConf = new JobConf(conf)
|
||||||
private val fs = FSUtils.getFs(metaClient.getBasePath, conf)
|
|
||||||
private val commitTimeline = metaClient.getCommitsAndCompactionTimeline.filterCompletedInstants()
|
private val commitTimeline = metaClient.getCommitsAndCompactionTimeline.filterCompletedInstants()
|
||||||
if (commitTimeline.empty()) {
|
if (commitTimeline.empty()) {
|
||||||
throw new HoodieException("No instants to incrementally pull")
|
throw new HoodieException("No instants to incrementally pull")
|
||||||
@@ -76,7 +74,7 @@ class MergeOnReadIncrementalRelation(val sqlContext: SQLContext,
|
|||||||
private val tableAvroSchema = schemaUtil.getTableAvroSchema
|
private val tableAvroSchema = schemaUtil.getTableAvroSchema
|
||||||
private val tableStructSchema = AvroConversionUtils.convertAvroSchemaToStructType(tableAvroSchema)
|
private val tableStructSchema = AvroConversionUtils.convertAvroSchemaToStructType(tableAvroSchema)
|
||||||
private val maxCompactionMemoryInBytes = getMaxCompactionMemoryInBytes(jobConf)
|
private val maxCompactionMemoryInBytes = getMaxCompactionMemoryInBytes(jobConf)
|
||||||
private val fileIndex = buildFileIndex()
|
private val fileIndex = if (commitsToReturn.isEmpty) List() else buildFileIndex()
|
||||||
private val preCombineField = {
|
private val preCombineField = {
|
||||||
val preCombineFieldFromTableConfig = metaClient.getTableConfig.getPreCombineField
|
val preCombineFieldFromTableConfig = metaClient.getTableConfig.getPreCombineField
|
||||||
if (preCombineFieldFromTableConfig != null) {
|
if (preCombineFieldFromTableConfig != null) {
|
||||||
@@ -92,63 +90,71 @@ class MergeOnReadIncrementalRelation(val sqlContext: SQLContext,
|
|||||||
override def needConversion: Boolean = false
|
override def needConversion: Boolean = false
|
||||||
|
|
||||||
override def unhandledFilters(filters: Array[Filter]): Array[Filter] = {
|
override def unhandledFilters(filters: Array[Filter]): Array[Filter] = {
|
||||||
val isNotNullFilter = IsNotNull(HoodieRecord.COMMIT_TIME_METADATA_FIELD)
|
if (fileIndex.isEmpty) {
|
||||||
val largerThanFilter = GreaterThanOrEqual(HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitsToReturn.head.getTimestamp)
|
filters
|
||||||
val lessThanFilter = LessThanOrEqual(HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitsToReturn.last.getTimestamp)
|
} else {
|
||||||
filters :+isNotNullFilter :+ largerThanFilter :+ lessThanFilter
|
|
||||||
}
|
|
||||||
|
|
||||||
override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = {
|
|
||||||
log.debug(s"buildScan requiredColumns = ${requiredColumns.mkString(",")}")
|
|
||||||
log.debug(s"buildScan filters = ${filters.mkString(",")}")
|
|
||||||
// config to ensure the push down filter for parquet will be applied.
|
|
||||||
sqlContext.sparkSession.sessionState.conf.setConfString("spark.sql.parquet.filterPushdown", "true")
|
|
||||||
sqlContext.sparkSession.sessionState.conf.setConfString("spark.sql.parquet.recordLevelFilter.enabled", "true")
|
|
||||||
sqlContext.sparkSession.sessionState.conf.setConfString("spark.sql.parquet.enableVectorizedReader", "false")
|
|
||||||
val pushDownFilter = {
|
|
||||||
val isNotNullFilter = IsNotNull(HoodieRecord.COMMIT_TIME_METADATA_FIELD)
|
val isNotNullFilter = IsNotNull(HoodieRecord.COMMIT_TIME_METADATA_FIELD)
|
||||||
val largerThanFilter = GreaterThanOrEqual(HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitsToReturn.head.getTimestamp)
|
val largerThanFilter = GreaterThanOrEqual(HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitsToReturn.head.getTimestamp)
|
||||||
val lessThanFilter = LessThanOrEqual(HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitsToReturn.last.getTimestamp)
|
val lessThanFilter = LessThanOrEqual(HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitsToReturn.last.getTimestamp)
|
||||||
filters :+isNotNullFilter :+ largerThanFilter :+ lessThanFilter
|
filters :+ isNotNullFilter :+ largerThanFilter :+ lessThanFilter
|
||||||
}
|
}
|
||||||
val (requiredAvroSchema, requiredStructSchema) =
|
}
|
||||||
MergeOnReadSnapshotRelation.getRequiredSchema(tableAvroSchema, requiredColumns)
|
|
||||||
|
|
||||||
val hoodieTableState = HoodieMergeOnReadTableState(
|
override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = {
|
||||||
tableStructSchema,
|
if (fileIndex.isEmpty) {
|
||||||
requiredStructSchema,
|
sqlContext.sparkContext.emptyRDD[Row]
|
||||||
tableAvroSchema.toString,
|
} else {
|
||||||
requiredAvroSchema.toString,
|
log.debug(s"buildScan requiredColumns = ${requiredColumns.mkString(",")}")
|
||||||
fileIndex,
|
log.debug(s"buildScan filters = ${filters.mkString(",")}")
|
||||||
preCombineField
|
// config to ensure the push down filter for parquet will be applied.
|
||||||
)
|
sqlContext.sparkSession.sessionState.conf.setConfString("spark.sql.parquet.filterPushdown", "true")
|
||||||
val fullSchemaParquetReader = new ParquetFileFormat().buildReaderWithPartitionValues(
|
sqlContext.sparkSession.sessionState.conf.setConfString("spark.sql.parquet.recordLevelFilter.enabled", "true")
|
||||||
sparkSession = sqlContext.sparkSession,
|
sqlContext.sparkSession.sessionState.conf.setConfString("spark.sql.parquet.enableVectorizedReader", "false")
|
||||||
dataSchema = tableStructSchema,
|
val pushDownFilter = {
|
||||||
partitionSchema = StructType(Nil),
|
val isNotNullFilter = IsNotNull(HoodieRecord.COMMIT_TIME_METADATA_FIELD)
|
||||||
requiredSchema = tableStructSchema,
|
val largerThanFilter = GreaterThanOrEqual(HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitsToReturn.head.getTimestamp)
|
||||||
filters = pushDownFilter,
|
val lessThanFilter = LessThanOrEqual(HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitsToReturn.last.getTimestamp)
|
||||||
options = optParams,
|
filters :+ isNotNullFilter :+ largerThanFilter :+ lessThanFilter
|
||||||
hadoopConf = sqlContext.sparkSession.sessionState.newHadoopConf()
|
}
|
||||||
)
|
val (requiredAvroSchema, requiredStructSchema) =
|
||||||
val requiredSchemaParquetReader = new ParquetFileFormat().buildReaderWithPartitionValues(
|
MergeOnReadSnapshotRelation.getRequiredSchema(tableAvroSchema, requiredColumns)
|
||||||
sparkSession = sqlContext.sparkSession,
|
|
||||||
dataSchema = tableStructSchema,
|
|
||||||
partitionSchema = StructType(Nil),
|
|
||||||
requiredSchema = requiredStructSchema,
|
|
||||||
filters = pushDownFilter,
|
|
||||||
options = optParams,
|
|
||||||
hadoopConf = sqlContext.sparkSession.sessionState.newHadoopConf()
|
|
||||||
)
|
|
||||||
|
|
||||||
val rdd = new HoodieMergeOnReadRDD(
|
val hoodieTableState = HoodieMergeOnReadTableState(
|
||||||
sqlContext.sparkContext,
|
tableStructSchema,
|
||||||
jobConf,
|
requiredStructSchema,
|
||||||
fullSchemaParquetReader,
|
tableAvroSchema.toString,
|
||||||
requiredSchemaParquetReader,
|
requiredAvroSchema.toString,
|
||||||
hoodieTableState
|
fileIndex,
|
||||||
)
|
preCombineField
|
||||||
rdd.asInstanceOf[RDD[Row]]
|
)
|
||||||
|
val fullSchemaParquetReader = new ParquetFileFormat().buildReaderWithPartitionValues(
|
||||||
|
sparkSession = sqlContext.sparkSession,
|
||||||
|
dataSchema = tableStructSchema,
|
||||||
|
partitionSchema = StructType(Nil),
|
||||||
|
requiredSchema = tableStructSchema,
|
||||||
|
filters = pushDownFilter,
|
||||||
|
options = optParams,
|
||||||
|
hadoopConf = sqlContext.sparkSession.sessionState.newHadoopConf()
|
||||||
|
)
|
||||||
|
val requiredSchemaParquetReader = new ParquetFileFormat().buildReaderWithPartitionValues(
|
||||||
|
sparkSession = sqlContext.sparkSession,
|
||||||
|
dataSchema = tableStructSchema,
|
||||||
|
partitionSchema = StructType(Nil),
|
||||||
|
requiredSchema = requiredStructSchema,
|
||||||
|
filters = pushDownFilter,
|
||||||
|
options = optParams,
|
||||||
|
hadoopConf = sqlContext.sparkSession.sessionState.newHadoopConf()
|
||||||
|
)
|
||||||
|
|
||||||
|
val rdd = new HoodieMergeOnReadRDD(
|
||||||
|
sqlContext.sparkContext,
|
||||||
|
jobConf,
|
||||||
|
fullSchemaParquetReader,
|
||||||
|
requiredSchemaParquetReader,
|
||||||
|
hoodieTableState
|
||||||
|
)
|
||||||
|
rdd.asInstanceOf[RDD[Row]]
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
def buildFileIndex(): List[HoodieMergeOnReadFileSplit] = {
|
def buildFileIndex(): List[HoodieMergeOnReadFileSplit] = {
|
||||||
|
|||||||
@@ -230,6 +230,14 @@ class TestCOWDataSource extends HoodieClientTestBase {
|
|||||||
assertEquals(1, countsPerCommit.length)
|
assertEquals(1, countsPerCommit.length)
|
||||||
assertEquals(firstCommit, countsPerCommit(0).get(0))
|
assertEquals(firstCommit, countsPerCommit(0).get(0))
|
||||||
|
|
||||||
|
// Test incremental query has no instant in range
|
||||||
|
val emptyIncDF = spark.read.format("org.apache.hudi")
|
||||||
|
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||||
|
.option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY.key, "000")
|
||||||
|
.option(DataSourceReadOptions.END_INSTANTTIME_OPT_KEY.key, "001")
|
||||||
|
.load(basePath)
|
||||||
|
assertEquals(0, emptyIncDF.count())
|
||||||
|
|
||||||
// Upsert an empty dataFrame
|
// Upsert an empty dataFrame
|
||||||
val emptyRecords = recordsToStrings(dataGen.generateUpdates("002", 0)).toList
|
val emptyRecords = recordsToStrings(dataGen.generateUpdates("002", 0)).toList
|
||||||
val emptyDF = spark.read.json(spark.sparkContext.parallelize(emptyRecords, 1))
|
val emptyDF = spark.read.json(spark.sparkContext.parallelize(emptyRecords, 1))
|
||||||
|
|||||||
@@ -201,6 +201,14 @@ class TestMORDataSource extends HoodieClientTestBase {
|
|||||||
assertEquals(1, hudiIncDF3.select("_hoodie_commit_time").distinct().count())
|
assertEquals(1, hudiIncDF3.select("_hoodie_commit_time").distinct().count())
|
||||||
assertEquals(commit2Time, hudiIncDF3.select("_hoodie_commit_time").head().get(0).toString)
|
assertEquals(commit2Time, hudiIncDF3.select("_hoodie_commit_time").head().get(0).toString)
|
||||||
|
|
||||||
|
// Test incremental query has no instant in range
|
||||||
|
val emptyIncDF = spark.read.format("org.apache.hudi")
|
||||||
|
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||||
|
.option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY.key, "000")
|
||||||
|
.option(DataSourceReadOptions.END_INSTANTTIME_OPT_KEY.key, "001")
|
||||||
|
.load(basePath)
|
||||||
|
assertEquals(0, emptyIncDF.count())
|
||||||
|
|
||||||
// Unmerge
|
// Unmerge
|
||||||
val hudiSnapshotSkipMergeDF2 = spark.read.format("org.apache.hudi")
|
val hudiSnapshotSkipMergeDF2 = spark.read.format("org.apache.hudi")
|
||||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
|
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
|
||||||
|
|||||||
Reference in New Issue
Block a user