1
0

[HUDI-115] Adding DefaultHoodieRecordPayload to honor ordering with combineAndGetUpdateValue (#2311)

* Added ability to pass in `properties` to payload methods, so they can perform table/record specific merges
* Added default methods so existing payload classes are backwards compatible. 
* Adding DefaultHoodiePayload to honor ordering while merging two records
* Fixing default payload based on feedback
This commit is contained in:
Sivabalan Narayanan
2020-12-19 22:19:42 -05:00
committed by GitHub
parent 5388c7f7a3
commit 33d338f392
14 changed files with 447 additions and 27 deletions

View File

@@ -39,13 +39,14 @@ import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.common.util.TablePathUtils;
import org.apache.hudi.config.HoodieCompactionConfig;
import org.apache.hudi.config.HoodieIndexConfig;
import org.apache.hudi.config.HoodiePayloadConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieNotSupportedException;
import org.apache.hudi.exception.TableNotFoundException;
import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor;
import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.index.HoodieIndex.IndexType;
import org.apache.hudi.keygen.KeyGenerator;
import org.apache.hudi.keygen.parser.AbstractHoodieDateTimeParser;
import org.apache.hudi.table.BulkInsertPartitioner;
@@ -177,10 +178,12 @@ public class DataSourceUtils {
}
return builder.forTable(tblName)
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(IndexType.BLOOM).build())
.withCompactionConfig(HoodieCompactionConfig.newBuilder()
.withPayloadClass(parameters.get(DataSourceWriteOptions.PAYLOAD_CLASS_OPT_KEY()))
.withInlineCompaction(inlineCompact).build())
.withPayloadConfig(HoodiePayloadConfig.newBuilder().withPayloadOrderingField(parameters.get(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY()))
.build())
// override above with Hoodie configs specified as options.
.withProps(parameters).build();
}

View File

@@ -205,7 +205,6 @@ object DataSourceWriteOptions {
val PRECOMBINE_FIELD_OPT_KEY = "hoodie.datasource.write.precombine.field"
val DEFAULT_PRECOMBINE_FIELD_OPT_VAL = "ts"
/**
* Payload class used. Override this, if you like to roll your own merge logic, when upserting/inserting.
* This will render any value set for `PRECOMBINE_FIELD_OPT_VAL` in-effective

View File

@@ -17,12 +17,14 @@
package org.apache.hudi
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.hudi.avro.HoodieAvroUtils
import org.apache.hudi.common.config.TypedProperties
import org.apache.hudi.common.model.{EmptyHoodieRecordPayload, HoodieKey, OverwriteWithLatestAvroPayload}
import org.apache.hudi.common.model.{BaseAvroPayload, DefaultHoodieRecordPayload, EmptyHoodieRecordPayload, HoodieKey, HoodiePayloadProps, OverwriteWithLatestAvroPayload}
import org.apache.hudi.common.testutils.SchemaTestUtil
import org.apache.hudi.common.util.Option
import org.apache.hudi.config.HoodiePayloadConfig
import org.apache.hudi.exception.{HoodieException, HoodieKeyException}
import org.apache.hudi.keygen._
import org.apache.hudi.testutils.KeyGeneratorTestUtilities
@@ -31,6 +33,8 @@ import org.junit.jupiter.api.Assertions.assertEquals
import org.junit.jupiter.api.{BeforeEach, Test}
import org.scalatest.Assertions.fail
import scala.collection.JavaConverters.mapAsJavaMapConverter
/**
* Tests on the default key generator, payload classes.
*/
@@ -567,6 +571,62 @@ class TestDataSourceDefaults {
assertEquals("field2", combinedGR21.get("field1").toString)
}
@Test def testOverwriteWithLatestAvroPayloadCombineAndGetUpdateValue() = {
val baseOrderingVal: Object = baseRecord.get("favoriteIntNumber")
val fieldSchema: Schema = baseRecord.getSchema().getField("favoriteIntNumber").schema()
val props = new TypedProperties()
props.put(HoodiePayloadProps.PAYLOAD_ORDERING_FIELD_PROP, "favoriteIntNumber");
val basePayload = new OverwriteWithLatestAvroPayload(baseRecord, HoodieAvroUtils.convertValueForSpecificDataTypes(fieldSchema, baseOrderingVal).asInstanceOf[Comparable[_]])
val laterRecord = SchemaTestUtil
.generateAvroRecordFromJson(schema, 2, "001", "f1")
val laterOrderingVal: Object = laterRecord.get("favoriteIntNumber")
val newerPayload = new OverwriteWithLatestAvroPayload(laterRecord, HoodieAvroUtils.convertValueForSpecificDataTypes(fieldSchema, laterOrderingVal).asInstanceOf[Comparable[_]])
// it will provide the record with greatest combine value
val preCombinedPayload = basePayload.preCombine(newerPayload)
val precombinedGR = preCombinedPayload.getInsertValue(schema).get().asInstanceOf[GenericRecord]
assertEquals("field2", precombinedGR.get("field1").toString)
}
@Test def testDefaultHoodieRecordPayloadCombineAndGetUpdateValue() = {
val baseOrderingVal: Object = baseRecord.get("favoriteIntNumber")
val fieldSchema: Schema = baseRecord.getSchema().getField("favoriteIntNumber").schema()
val props = new TypedProperties()
props.put(HoodiePayloadProps.PAYLOAD_ORDERING_FIELD_PROP, "favoriteIntNumber");
val laterRecord = SchemaTestUtil
.generateAvroRecordFromJson(schema, 2, "001", "f1")
val laterOrderingVal: Object = laterRecord.get("favoriteIntNumber")
val earlierRecord = SchemaTestUtil
.generateAvroRecordFromJson(schema, 1, "000", "f1")
val earlierOrderingVal: Object = earlierRecord.get("favoriteIntNumber")
val laterPayload = new DefaultHoodieRecordPayload(laterRecord,
HoodieAvroUtils.convertValueForSpecificDataTypes(fieldSchema, laterOrderingVal).asInstanceOf[Comparable[_]])
val earlierPayload = new DefaultHoodieRecordPayload(earlierRecord,
HoodieAvroUtils.convertValueForSpecificDataTypes(fieldSchema, earlierOrderingVal).asInstanceOf[Comparable[_]])
// it will provide the record with greatest combine value
val preCombinedPayload = laterPayload.preCombine(earlierPayload)
val precombinedGR = preCombinedPayload.getInsertValue(schema).get().asInstanceOf[GenericRecord]
assertEquals("field2", precombinedGR.get("field1").toString)
assertEquals(laterOrderingVal, precombinedGR.get("favoriteIntNumber"))
val earlierWithLater = earlierPayload.combineAndGetUpdateValue(laterRecord, schema, props)
val earlierwithLaterGR = earlierWithLater.get().asInstanceOf[GenericRecord]
assertEquals("field2", earlierwithLaterGR.get("field1").toString)
assertEquals(laterOrderingVal, earlierwithLaterGR.get("favoriteIntNumber"))
val laterWithEarlier = laterPayload.combineAndGetUpdateValue(earlierRecord, schema, props)
val laterWithEarlierGR = laterWithEarlier.get().asInstanceOf[GenericRecord]
assertEquals("field2", laterWithEarlierGR.get("field1").toString)
assertEquals(laterOrderingVal, laterWithEarlierGR.get("favoriteIntNumber"))
}
@Test def testEmptyHoodieRecordPayload() = {
val emptyPayload1 = new EmptyHoodieRecordPayload(baseRecord, 1)
val laterRecord = SchemaTestUtil