[HUDI-2177][HUDI-2200] Adding virtual keys support for MOR table (#3315)
This commit is contained in:
committed by
GitHub
parent
dde57b293c
commit
fe508376fa
@@ -50,6 +50,7 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext,
|
||||
|
||||
private val confBroadcast = sc.broadcast(new SerializableWritable(config))
|
||||
private val preCombineField = tableState.preCombineField
|
||||
private val recordKeyFieldOpt = tableState.recordKeyFieldOpt
|
||||
private val payloadProps = if (preCombineField.isDefined) {
|
||||
Some(HoodiePayloadConfig.newBuilder.withPayloadOrderingField(preCombineField.get).build.getProps)
|
||||
} else {
|
||||
@@ -209,6 +210,7 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext,
|
||||
private val logRecords = HoodieMergeOnReadRDD.scanLog(split, tableAvroSchema, config).getRecords
|
||||
private val logRecordsKeyIterator = logRecords.keySet().iterator().asScala
|
||||
private val keyToSkip = mutable.Set.empty[String]
|
||||
private val recordKeyPosition = if (recordKeyFieldOpt.isEmpty) HOODIE_RECORD_KEY_COL_POS else tableState.tableStructSchema.fieldIndex(recordKeyFieldOpt.get)
|
||||
|
||||
private var recordToLoad: InternalRow = _
|
||||
|
||||
@@ -216,7 +218,7 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext,
|
||||
override def hasNext: Boolean = {
|
||||
if (baseFileIterator.hasNext) {
|
||||
val curRow = baseFileIterator.next()
|
||||
val curKey = curRow.getString(HOODIE_RECORD_KEY_COL_POS)
|
||||
val curKey = curRow.getString(recordKeyPosition)
|
||||
if (logRecords.containsKey(curKey)) {
|
||||
// duplicate key found, merging
|
||||
keyToSkip.add(curKey)
|
||||
|
||||
@@ -120,6 +120,7 @@ object HoodieSparkSqlWriter {
|
||||
val archiveLogFolder = hoodieConfig.getStringOrDefault(HoodieTableConfig.HOODIE_ARCHIVELOG_FOLDER_PROP)
|
||||
val partitionColumns = HoodieWriterUtils.getPartitionColumns(keyGenerator)
|
||||
val recordKeyFields = hoodieConfig.getString(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY)
|
||||
val populateMetaFields = parameters.getOrElse(HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.key(), HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.defaultValue()).toBoolean
|
||||
|
||||
val tableMetaClient = HoodieTableMetaClient.withPropertyBuilder()
|
||||
.setTableType(tableType)
|
||||
@@ -130,7 +131,9 @@ object HoodieSparkSqlWriter {
|
||||
.setPayloadClassName(hoodieConfig.getString(PAYLOAD_CLASS_OPT_KEY))
|
||||
.setPreCombineField(hoodieConfig.getStringOrDefault(PRECOMBINE_FIELD_OPT_KEY, null))
|
||||
.setPartitionFields(partitionColumns)
|
||||
.setPopulateMetaFields(parameters.getOrElse(HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.key(), HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.defaultValue()).toBoolean)
|
||||
.setPopulateMetaFields(populateMetaFields)
|
||||
.setRecordKeyFields(hoodieConfig.getString(RECORDKEY_FIELD_OPT_KEY))
|
||||
.setKeyGeneratorClassProp(hoodieConfig.getString(KEYGENERATOR_CLASS_OPT_KEY))
|
||||
.initTable(sparkContext.hadoopConfiguration, path.get)
|
||||
tableConfig = tableMetaClient.getTableConfig
|
||||
}
|
||||
@@ -284,18 +287,22 @@ object HoodieSparkSqlWriter {
|
||||
val archiveLogFolder = hoodieConfig.getStringOrDefault(HoodieTableConfig.HOODIE_ARCHIVELOG_FOLDER_PROP)
|
||||
val partitionColumns = HoodieWriterUtils.getPartitionColumns(parameters)
|
||||
val recordKeyFields = hoodieConfig.getString(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY)
|
||||
val keyGenProp = hoodieConfig.getString(HoodieTableConfig.HOODIE_TABLE_KEY_GENERATOR_CLASS)
|
||||
val populateMetaFields = parameters.getOrElse(HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.key(), HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.defaultValue()).toBoolean
|
||||
|
||||
HoodieTableMetaClient.withPropertyBuilder()
|
||||
.setTableType(HoodieTableType.valueOf(tableType))
|
||||
.setTableName(tableName)
|
||||
.setRecordKeyFields(recordKeyFields)
|
||||
.setArchiveLogFolder(archiveLogFolder)
|
||||
.setPayloadClassName(hoodieConfig.getStringOrDefault(PAYLOAD_CLASS_OPT_KEY))
|
||||
.setPreCombineField(hoodieConfig.getStringOrDefault(PRECOMBINE_FIELD_OPT_KEY, null))
|
||||
.setBootstrapIndexClass(bootstrapIndexClass)
|
||||
.setBootstrapBasePath(bootstrapBasePath)
|
||||
.setPartitionFields(partitionColumns)
|
||||
.initTable(sparkContext.hadoopConfiguration, path)
|
||||
.setTableType(HoodieTableType.valueOf(tableType))
|
||||
.setTableName(tableName)
|
||||
.setRecordKeyFields(recordKeyFields)
|
||||
.setArchiveLogFolder(archiveLogFolder)
|
||||
.setPayloadClassName(hoodieConfig.getStringOrDefault(PAYLOAD_CLASS_OPT_KEY))
|
||||
.setPreCombineField(hoodieConfig.getStringOrDefault(PRECOMBINE_FIELD_OPT_KEY, null))
|
||||
.setBootstrapIndexClass(bootstrapIndexClass)
|
||||
.setBootstrapBasePath(bootstrapBasePath)
|
||||
.setPartitionFields(partitionColumns)
|
||||
.setPopulateMetaFields(populateMetaFields)
|
||||
.setKeyGeneratorClassProp(keyGenProp)
|
||||
.initTable(sparkContext.hadoopConfiguration, path)
|
||||
}
|
||||
|
||||
val jsc = new JavaSparkContext(sqlContext.sparkContext)
|
||||
|
||||
@@ -65,6 +65,9 @@ class IncrementalRelation(val sqlContext: SQLContext,
|
||||
throw new HoodieException(s"Specify the begin instant time to pull from using " +
|
||||
s"option ${DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY.key}")
|
||||
}
|
||||
if (!metaClient.getTableConfig.populateMetaFields()) {
|
||||
throw new HoodieException("Incremental queries are not supported when meta fields are disabled")
|
||||
}
|
||||
|
||||
val useEndInstantSchema = optParams.getOrElse(DataSourceReadOptions.INCREMENTAL_READ_SCHEMA_USE_END_INSTANTTIME_OPT_KEY.key,
|
||||
DataSourceReadOptions.INCREMENTAL_READ_SCHEMA_USE_END_INSTANTTIME_OPT_KEY.defaultValue).toBoolean
|
||||
|
||||
@@ -59,6 +59,9 @@ class MergeOnReadIncrementalRelation(val sqlContext: SQLContext,
|
||||
throw new HoodieException(s"Specify the begin instant time to pull from using " +
|
||||
s"option ${DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY.key}")
|
||||
}
|
||||
if (!metaClient.getTableConfig.populateMetaFields()) {
|
||||
throw new HoodieException("Incremental queries are not supported when meta fields are disabled")
|
||||
}
|
||||
|
||||
private val lastInstant = commitTimeline.lastInstant().get()
|
||||
private val mergeType = optParams.getOrElse(
|
||||
@@ -125,7 +128,8 @@ class MergeOnReadIncrementalRelation(val sqlContext: SQLContext,
|
||||
tableAvroSchema.toString,
|
||||
requiredAvroSchema.toString,
|
||||
fileIndex,
|
||||
preCombineField
|
||||
preCombineField,
|
||||
Option.empty
|
||||
)
|
||||
val fullSchemaParquetReader = new ParquetFileFormat().buildReaderWithPartitionValues(
|
||||
sparkSession = sqlContext.sparkSession,
|
||||
|
||||
@@ -49,7 +49,8 @@ case class HoodieMergeOnReadTableState(tableStructSchema: StructType,
|
||||
tableAvroSchema: String,
|
||||
requiredAvroSchema: String,
|
||||
hoodieRealtimeFileSplits: List[HoodieMergeOnReadFileSplit],
|
||||
preCombineField: Option[String])
|
||||
preCombineField: Option[String],
|
||||
recordKeyFieldOpt: Option[String])
|
||||
|
||||
class MergeOnReadSnapshotRelation(val sqlContext: SQLContext,
|
||||
val optParams: Map[String, String],
|
||||
@@ -87,6 +88,10 @@ class MergeOnReadSnapshotRelation(val sqlContext: SQLContext,
|
||||
optParams.get(DataSourceReadOptions.READ_PRE_COMBINE_FIELD.key)
|
||||
}
|
||||
}
|
||||
private var recordKeyFieldOpt = Option.empty[String]
|
||||
if (!metaClient.getTableConfig.populateMetaFields()) {
|
||||
recordKeyFieldOpt = Option(metaClient.getTableConfig.getRecordKeyFieldProp)
|
||||
}
|
||||
override def schema: StructType = tableStructSchema
|
||||
|
||||
override def needConversion: Boolean = false
|
||||
@@ -104,7 +109,8 @@ class MergeOnReadSnapshotRelation(val sqlContext: SQLContext,
|
||||
tableAvroSchema.toString,
|
||||
requiredAvroSchema.toString,
|
||||
fileIndex,
|
||||
preCombineField
|
||||
preCombineField,
|
||||
recordKeyFieldOpt
|
||||
)
|
||||
val fullSchemaParquetReader = new ParquetFileFormat().buildReaderWithPartitionValues(
|
||||
sparkSession = sqlContext.sparkSession,
|
||||
|
||||
Reference in New Issue
Block a user