[HUDI-2177][HUDI-2200] Adding virtual keys support for MOR table (#3315)

Sivabalan Narayanan
2021-08-02 09:45:09 -04:00
committed by GitHub
parent dde57b293c
commit fe508376fa
37 changed files with 633 additions and 261 deletions


@@ -50,6 +50,7 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext,
   private val confBroadcast = sc.broadcast(new SerializableWritable(config))
   private val preCombineField = tableState.preCombineField
+  private val recordKeyFieldOpt = tableState.recordKeyFieldOpt
   private val payloadProps = if (preCombineField.isDefined) {
     Some(HoodiePayloadConfig.newBuilder.withPayloadOrderingField(preCombineField.get).build.getProps)
   } else {
@@ -209,6 +210,7 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext,
     private val logRecords = HoodieMergeOnReadRDD.scanLog(split, tableAvroSchema, config).getRecords
     private val logRecordsKeyIterator = logRecords.keySet().iterator().asScala
     private val keyToSkip = mutable.Set.empty[String]
+    private val recordKeyPosition = if (recordKeyFieldOpt.isEmpty) HOODIE_RECORD_KEY_COL_POS else tableState.tableStructSchema.fieldIndex(recordKeyFieldOpt.get)
     private var recordToLoad: InternalRow = _
@@ -216,7 +218,7 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext,
     override def hasNext: Boolean = {
       if (baseFileIterator.hasNext) {
         val curRow = baseFileIterator.next()
-        val curKey = curRow.getString(HOODIE_RECORD_KEY_COL_POS)
+        val curKey = curRow.getString(recordKeyPosition)
         if (logRecords.containsKey(curKey)) {
           // duplicate key found, merging
           keyToSkip.add(curKey)

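The hunks above change the MOR merge iterator so the record key is no longer assumed to sit in the _hoodie_record_key meta column: when the table state carries a record key field (virtual keys), the key position is resolved from the table's struct schema instead. A minimal sketch of that fallback, written as a hypothetical standalone helper rather than the inline logic above:

import org.apache.spark.sql.types.StructType

// Hypothetical helper mirroring the recordKeyPosition logic in the diff:
// with meta fields populated the key is read from the fixed
// _hoodie_record_key position; with virtual keys it is looked up by the
// user-declared record key field name in the table schema.
def resolveRecordKeyPosition(schema: StructType,
                             recordKeyFieldOpt: Option[String],
                             hoodieRecordKeyColPos: Int): Int =
  recordKeyFieldOpt.map(f => schema.fieldIndex(f)).getOrElse(hoodieRecordKeyColPos)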

@@ -120,6 +120,7 @@ object HoodieSparkSqlWriter {
       val archiveLogFolder = hoodieConfig.getStringOrDefault(HoodieTableConfig.HOODIE_ARCHIVELOG_FOLDER_PROP)
       val partitionColumns = HoodieWriterUtils.getPartitionColumns(keyGenerator)
       val recordKeyFields = hoodieConfig.getString(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY)
+      val populateMetaFields = parameters.getOrElse(HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.key(), HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.defaultValue()).toBoolean
       val tableMetaClient = HoodieTableMetaClient.withPropertyBuilder()
         .setTableType(tableType)
@@ -130,7 +131,9 @@ object HoodieSparkSqlWriter {
         .setPayloadClassName(hoodieConfig.getString(PAYLOAD_CLASS_OPT_KEY))
         .setPreCombineField(hoodieConfig.getStringOrDefault(PRECOMBINE_FIELD_OPT_KEY, null))
         .setPartitionFields(partitionColumns)
-        .setPopulateMetaFields(parameters.getOrElse(HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.key(), HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.defaultValue()).toBoolean)
+        .setPopulateMetaFields(populateMetaFields)
+        .setRecordKeyFields(hoodieConfig.getString(RECORDKEY_FIELD_OPT_KEY))
+        .setKeyGeneratorClassProp(hoodieConfig.getString(KEYGENERATOR_CLASS_OPT_KEY))
         .initTable(sparkContext.hadoopConfiguration, path.get)
       tableConfig = tableMetaClient.getTableConfig
     }
@@ -284,18 +287,22 @@ object HoodieSparkSqlWriter {
       val archiveLogFolder = hoodieConfig.getStringOrDefault(HoodieTableConfig.HOODIE_ARCHIVELOG_FOLDER_PROP)
       val partitionColumns = HoodieWriterUtils.getPartitionColumns(parameters)
       val recordKeyFields = hoodieConfig.getString(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY)
+      val keyGenProp = hoodieConfig.getString(HoodieTableConfig.HOODIE_TABLE_KEY_GENERATOR_CLASS)
+      val populateMetaFields = parameters.getOrElse(HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.key(), HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.defaultValue()).toBoolean
       HoodieTableMetaClient.withPropertyBuilder()
-        .setTableType(HoodieTableType.valueOf(tableType))
-        .setTableName(tableName)
-        .setRecordKeyFields(recordKeyFields)
-        .setArchiveLogFolder(archiveLogFolder)
-        .setPayloadClassName(hoodieConfig.getStringOrDefault(PAYLOAD_CLASS_OPT_KEY))
-        .setPreCombineField(hoodieConfig.getStringOrDefault(PRECOMBINE_FIELD_OPT_KEY, null))
-        .setBootstrapIndexClass(bootstrapIndexClass)
-        .setBootstrapBasePath(bootstrapBasePath)
-        .setPartitionFields(partitionColumns)
-        .initTable(sparkContext.hadoopConfiguration, path)
+        .setTableType(HoodieTableType.valueOf(tableType))
+        .setTableName(tableName)
+        .setRecordKeyFields(recordKeyFields)
+        .setArchiveLogFolder(archiveLogFolder)
+        .setPayloadClassName(hoodieConfig.getStringOrDefault(PAYLOAD_CLASS_OPT_KEY))
+        .setPreCombineField(hoodieConfig.getStringOrDefault(PRECOMBINE_FIELD_OPT_KEY, null))
+        .setBootstrapIndexClass(bootstrapIndexClass)
+        .setBootstrapBasePath(bootstrapBasePath)
+        .setPartitionFields(partitionColumns)
+        .setPopulateMetaFields(populateMetaFields)
+        .setKeyGeneratorClassProp(keyGenProp)
+        .initTable(sparkContext.hadoopConfiguration, path)
     }
     val jsc = new JavaSparkContext(sqlContext.sparkContext)

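The writer changes above persist the populate-meta-fields flag, the record key fields, and the key generator class into the table config when the table is created. A hedged usage sketch of creating such a table from Spark; the option keys and values are assumptions based on the Hudi 0.9.x datasource options, and the table name, path, and columns are placeholders:

import org.apache.spark.sql.{DataFrame, SaveMode}

// Write a MOR table with meta fields disabled (virtual keys).
// All option keys below are assumed 0.9.x spellings, not taken from this diff.
def writeVirtualKeyMorTable(df: DataFrame, basePath: String): Unit = {
  df.write.format("hudi")
    .option("hoodie.table.name", "trips_mor")
    .option("hoodie.datasource.write.table.type", "MERGE_ON_READ")
    .option("hoodie.datasource.write.recordkey.field", "_row_key")
    .option("hoodie.datasource.write.precombine.field", "timestamp")
    .option("hoodie.populate.meta.fields", "false") // enable virtual keys
    .mode(SaveMode.Overwrite)
    .save(basePath)
}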

@@ -65,6 +65,9 @@ class IncrementalRelation(val sqlContext: SQLContext,
     throw new HoodieException(s"Specify the begin instant time to pull from using " +
       s"option ${DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY.key}")
   }
+  if (!metaClient.getTableConfig.populateMetaFields()) {
+    throw new HoodieException("Incremental queries are not supported when meta fields are disabled")
+  }
   val useEndInstantSchema = optParams.getOrElse(DataSourceReadOptions.INCREMENTAL_READ_SCHEMA_USE_END_INSTANTTIME_OPT_KEY.key,
     DataSourceReadOptions.INCREMENTAL_READ_SCHEMA_USE_END_INSTANTTIME_OPT_KEY.defaultValue).toBoolean


@@ -59,6 +59,9 @@ class MergeOnReadIncrementalRelation(val sqlContext: SQLContext,
     throw new HoodieException(s"Specify the begin instant time to pull from using " +
       s"option ${DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY.key}")
   }
+  if (!metaClient.getTableConfig.populateMetaFields()) {
+    throw new HoodieException("Incremental queries are not supported when meta fields are disabled")
+  }
   private val lastInstant = commitTimeline.lastInstant().get()
   private val mergeType = optParams.getOrElse(
@@ -125,7 +128,8 @@ class MergeOnReadIncrementalRelation(val sqlContext: SQLContext,
       tableAvroSchema.toString,
       requiredAvroSchema.toString,
       fileIndex,
-      preCombineField
+      preCombineField,
+      Option.empty
     )
     val fullSchemaParquetReader = new ParquetFileFormat().buildReaderWithPartitionValues(
       sparkSession = sqlContext.sparkSession,

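Both incremental relations now fail fast when the table was written without meta fields, since incremental pulls depend on the _hoodie_commit_time meta column. A sketch of the read shape that the new guard rejects with a HoodieException on such a table (option keys and the instant time are illustrative assumptions):

import org.apache.spark.sql.{DataFrame, SparkSession}

// Incremental pull; on a table created with hoodie.populate.meta.fields=false
// this now throws, because the commit-time meta column the pull relies on is absent.
def incrementalRead(spark: SparkSession, basePath: String): DataFrame =
  spark.read.format("hudi")
    .option("hoodie.datasource.query.type", "incremental")
    .option("hoodie.datasource.read.begin.instanttime", "20210801000000")
    .load(basePath)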

@@ -49,7 +49,8 @@ case class HoodieMergeOnReadTableState(tableStructSchema: StructType,
                                        tableAvroSchema: String,
                                        requiredAvroSchema: String,
                                        hoodieRealtimeFileSplits: List[HoodieMergeOnReadFileSplit],
-                                       preCombineField: Option[String])
+                                       preCombineField: Option[String],
+                                       recordKeyFieldOpt: Option[String])
 class MergeOnReadSnapshotRelation(val sqlContext: SQLContext,
                                   val optParams: Map[String, String],
@@ -87,6 +88,10 @@ class MergeOnReadSnapshotRelation(val sqlContext: SQLContext,
       optParams.get(DataSourceReadOptions.READ_PRE_COMBINE_FIELD.key)
     }
   }
+  private var recordKeyFieldOpt = Option.empty[String]
+  if (!metaClient.getTableConfig.populateMetaFields()) {
+    recordKeyFieldOpt = Option(metaClient.getTableConfig.getRecordKeyFieldProp)
+  }
   override def schema: StructType = tableStructSchema
   override def needConversion: Boolean = false
@@ -104,7 +109,8 @@ class MergeOnReadSnapshotRelation(val sqlContext: SQLContext,
       tableAvroSchema.toString,
       requiredAvroSchema.toString,
       fileIndex,
-      preCombineField
+      preCombineField,
+      recordKeyFieldOpt
     )
     val fullSchemaParquetReader = new ParquetFileFormat().buildReaderWithPartitionValues(
       sparkSession = sqlContext.sparkSession,

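With the record key field threaded through HoodieMergeOnReadTableState, ordinary snapshot reads keep working on a virtual-key MOR table: the merge iterator looks the key up in the user's key column instead of the meta column. A minimal read sketch (query type value assumed; snapshot is also the default):

import org.apache.spark.sql.{DataFrame, SparkSession}

// Snapshot query over base + log files on the virtual-key MOR table written above.
def snapshotRead(spark: SparkSession, basePath: String): DataFrame =
  spark.read.format("hudi")
    .option("hoodie.datasource.query.type", "snapshot")
    .load(basePath)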

@@ -110,7 +110,7 @@ import static org.junit.jupiter.api.Assertions.assertTrue;
  */
 public class TestBootstrap extends HoodieClientTestBase {
-  public static final String TRIP_HIVE_COLUMN_TYPES = "bigint,string,string,string,double,double,double,double,"
+  public static final String TRIP_HIVE_COLUMN_TYPES = "bigint,string,string,string,string,double,double,double,double,"
       + "struct<amount:double,currency:string>,array<struct<amount:double,currency:string>>,boolean";
   @TempDir
@@ -576,11 +576,11 @@ public class TestBootstrap extends HoodieClientTestBase {
     if (isPartitioned) {
       df = df.withColumn("datestr", callUDF("partgen", new Column("_row_key")));
       // Order the columns to ensure generated avro schema aligns with Hive schema
-      df = df.select("timestamp", "_row_key", "rider", "driver", "begin_lat", "begin_lon",
+      df = df.select("timestamp", "_row_key", "partition_path", "rider", "driver", "begin_lat", "begin_lon",
           "end_lat", "end_lon", "fare", "tip_history", "_hoodie_is_deleted", "datestr");
     } else {
       // Order the columns to ensure generated avro schema aligns with Hive schema
-      df = df.select("timestamp", "_row_key", "rider", "driver", "begin_lat", "begin_lon",
+      df = df.select("timestamp", "_row_key", "partition_path", "rider", "driver", "begin_lat", "begin_lon",
           "end_lat", "end_lon", "fare", "tip_history", "_hoodie_is_deleted");
     }
     return df;


@@ -394,7 +394,7 @@ class HoodieSparkSqlWriterSuite extends FunSuite with Matchers {
     List((DataSourceWriteOptions.COW_TABLE_TYPE_OPT_VAL, HoodieFileFormat.PARQUET.name(), true), (DataSourceWriteOptions.COW_TABLE_TYPE_OPT_VAL, HoodieFileFormat.ORC.name(), true),
       (DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL, HoodieFileFormat.PARQUET.name(), true), (DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL, HoodieFileFormat.ORC.name(), true),
-      (DataSourceWriteOptions.COW_TABLE_TYPE_OPT_VAL, HoodieFileFormat.PARQUET.name(), false))
+      (DataSourceWriteOptions.COW_TABLE_TYPE_OPT_VAL, HoodieFileFormat.PARQUET.name(), false), (DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL, HoodieFileFormat.PARQUET.name(), false))
       .foreach(t => {
         val tableType = t._1
         val baseFileFormat = t._2