1
0

[HUDI-2177][HUDI-2200] Adding virtual keys support for MOR table (#3315)

This commit is contained in:
Sivabalan Narayanan
2021-08-02 09:45:09 -04:00
committed by GitHub
parent dde57b293c
commit fe508376fa
37 changed files with 633 additions and 261 deletions

View File

@@ -50,6 +50,7 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext,
private val confBroadcast = sc.broadcast(new SerializableWritable(config))
private val preCombineField = tableState.preCombineField
private val recordKeyFieldOpt = tableState.recordKeyFieldOpt
private val payloadProps = if (preCombineField.isDefined) {
Some(HoodiePayloadConfig.newBuilder.withPayloadOrderingField(preCombineField.get).build.getProps)
} else {
@@ -209,6 +210,7 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext,
private val logRecords = HoodieMergeOnReadRDD.scanLog(split, tableAvroSchema, config).getRecords
private val logRecordsKeyIterator = logRecords.keySet().iterator().asScala
private val keyToSkip = mutable.Set.empty[String]
private val recordKeyPosition = if (recordKeyFieldOpt.isEmpty) HOODIE_RECORD_KEY_COL_POS else tableState.tableStructSchema.fieldIndex(recordKeyFieldOpt.get)
private var recordToLoad: InternalRow = _
@@ -216,7 +218,7 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext,
override def hasNext: Boolean = {
if (baseFileIterator.hasNext) {
val curRow = baseFileIterator.next()
val curKey = curRow.getString(HOODIE_RECORD_KEY_COL_POS)
val curKey = curRow.getString(recordKeyPosition)
if (logRecords.containsKey(curKey)) {
// duplicate key found, merging
keyToSkip.add(curKey)

View File

@@ -120,6 +120,7 @@ object HoodieSparkSqlWriter {
val archiveLogFolder = hoodieConfig.getStringOrDefault(HoodieTableConfig.HOODIE_ARCHIVELOG_FOLDER_PROP)
val partitionColumns = HoodieWriterUtils.getPartitionColumns(keyGenerator)
val recordKeyFields = hoodieConfig.getString(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY)
val populateMetaFields = parameters.getOrElse(HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.key(), HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.defaultValue()).toBoolean
val tableMetaClient = HoodieTableMetaClient.withPropertyBuilder()
.setTableType(tableType)
@@ -130,7 +131,9 @@ object HoodieSparkSqlWriter {
.setPayloadClassName(hoodieConfig.getString(PAYLOAD_CLASS_OPT_KEY))
.setPreCombineField(hoodieConfig.getStringOrDefault(PRECOMBINE_FIELD_OPT_KEY, null))
.setPartitionFields(partitionColumns)
.setPopulateMetaFields(parameters.getOrElse(HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.key(), HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.defaultValue()).toBoolean)
.setPopulateMetaFields(populateMetaFields)
.setRecordKeyFields(hoodieConfig.getString(RECORDKEY_FIELD_OPT_KEY))
.setKeyGeneratorClassProp(hoodieConfig.getString(KEYGENERATOR_CLASS_OPT_KEY))
.initTable(sparkContext.hadoopConfiguration, path.get)
tableConfig = tableMetaClient.getTableConfig
}
@@ -284,18 +287,22 @@ object HoodieSparkSqlWriter {
val archiveLogFolder = hoodieConfig.getStringOrDefault(HoodieTableConfig.HOODIE_ARCHIVELOG_FOLDER_PROP)
val partitionColumns = HoodieWriterUtils.getPartitionColumns(parameters)
val recordKeyFields = hoodieConfig.getString(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY)
val keyGenProp = hoodieConfig.getString(HoodieTableConfig.HOODIE_TABLE_KEY_GENERATOR_CLASS)
val populateMetaFields = parameters.getOrElse(HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.key(), HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.defaultValue()).toBoolean
HoodieTableMetaClient.withPropertyBuilder()
.setTableType(HoodieTableType.valueOf(tableType))
.setTableName(tableName)
.setRecordKeyFields(recordKeyFields)
.setArchiveLogFolder(archiveLogFolder)
.setPayloadClassName(hoodieConfig.getStringOrDefault(PAYLOAD_CLASS_OPT_KEY))
.setPreCombineField(hoodieConfig.getStringOrDefault(PRECOMBINE_FIELD_OPT_KEY, null))
.setBootstrapIndexClass(bootstrapIndexClass)
.setBootstrapBasePath(bootstrapBasePath)
.setPartitionFields(partitionColumns)
.initTable(sparkContext.hadoopConfiguration, path)
.setTableType(HoodieTableType.valueOf(tableType))
.setTableName(tableName)
.setRecordKeyFields(recordKeyFields)
.setArchiveLogFolder(archiveLogFolder)
.setPayloadClassName(hoodieConfig.getStringOrDefault(PAYLOAD_CLASS_OPT_KEY))
.setPreCombineField(hoodieConfig.getStringOrDefault(PRECOMBINE_FIELD_OPT_KEY, null))
.setBootstrapIndexClass(bootstrapIndexClass)
.setBootstrapBasePath(bootstrapBasePath)
.setPartitionFields(partitionColumns)
.setPopulateMetaFields(populateMetaFields)
.setKeyGeneratorClassProp(keyGenProp)
.initTable(sparkContext.hadoopConfiguration, path)
}
val jsc = new JavaSparkContext(sqlContext.sparkContext)

View File

@@ -65,6 +65,9 @@ class IncrementalRelation(val sqlContext: SQLContext,
throw new HoodieException(s"Specify the begin instant time to pull from using " +
s"option ${DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY.key}")
}
if (!metaClient.getTableConfig.populateMetaFields()) {
throw new HoodieException("Incremental queries are not supported when meta fields are disabled")
}
val useEndInstantSchema = optParams.getOrElse(DataSourceReadOptions.INCREMENTAL_READ_SCHEMA_USE_END_INSTANTTIME_OPT_KEY.key,
DataSourceReadOptions.INCREMENTAL_READ_SCHEMA_USE_END_INSTANTTIME_OPT_KEY.defaultValue).toBoolean

View File

@@ -59,6 +59,9 @@ class MergeOnReadIncrementalRelation(val sqlContext: SQLContext,
throw new HoodieException(s"Specify the begin instant time to pull from using " +
s"option ${DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY.key}")
}
if (!metaClient.getTableConfig.populateMetaFields()) {
throw new HoodieException("Incremental queries are not supported when meta fields are disabled")
}
private val lastInstant = commitTimeline.lastInstant().get()
private val mergeType = optParams.getOrElse(
@@ -125,7 +128,8 @@ class MergeOnReadIncrementalRelation(val sqlContext: SQLContext,
tableAvroSchema.toString,
requiredAvroSchema.toString,
fileIndex,
preCombineField
preCombineField,
Option.empty
)
val fullSchemaParquetReader = new ParquetFileFormat().buildReaderWithPartitionValues(
sparkSession = sqlContext.sparkSession,

View File

@@ -49,7 +49,8 @@ case class HoodieMergeOnReadTableState(tableStructSchema: StructType,
tableAvroSchema: String,
requiredAvroSchema: String,
hoodieRealtimeFileSplits: List[HoodieMergeOnReadFileSplit],
preCombineField: Option[String])
preCombineField: Option[String],
recordKeyFieldOpt: Option[String])
class MergeOnReadSnapshotRelation(val sqlContext: SQLContext,
val optParams: Map[String, String],
@@ -87,6 +88,10 @@ class MergeOnReadSnapshotRelation(val sqlContext: SQLContext,
optParams.get(DataSourceReadOptions.READ_PRE_COMBINE_FIELD.key)
}
}
private var recordKeyFieldOpt = Option.empty[String]
if (!metaClient.getTableConfig.populateMetaFields()) {
recordKeyFieldOpt = Option(metaClient.getTableConfig.getRecordKeyFieldProp)
}
override def schema: StructType = tableStructSchema
override def needConversion: Boolean = false
@@ -104,7 +109,8 @@ class MergeOnReadSnapshotRelation(val sqlContext: SQLContext,
tableAvroSchema.toString,
requiredAvroSchema.toString,
fileIndex,
preCombineField
preCombineField,
recordKeyFieldOpt
)
val fullSchemaParquetReader = new ParquetFileFormat().buildReaderWithPartitionValues(
sparkSession = sqlContext.sparkSession,