[HUDI-2909] Handle logical type in TimestampBasedKeyGenerator (#4203)
* [HUDI-2909] Handle logical type in TimestampBasedKeyGenerator. The timestamp-based key generator was returning different values for the row-writer and non-row-writer paths. This patch fixes that; the fix is guarded by a config flag (`hoodie.datasource.write.keygenerator.consistent.logical.timestamp.enabled`).
This commit is contained in:
@@ -59,6 +59,7 @@ import java.io.OutputStream;
|
||||
import java.math.BigDecimal;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.sql.Timestamp;
|
||||
import java.time.LocalDate;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
@@ -444,15 +445,15 @@ public class HoodieAvroUtils {
|
||||
/**
|
||||
* Obtain value of the provided field as string, denoted by dot notation. e.g: a.b.c
|
||||
*/
|
||||
public static String getNestedFieldValAsString(GenericRecord record, String fieldName, boolean returnNullIfNotFound) {
|
||||
Object obj = getNestedFieldVal(record, fieldName, returnNullIfNotFound);
|
||||
public static String getNestedFieldValAsString(GenericRecord record, String fieldName, boolean returnNullIfNotFound, boolean consistentLogicalTimestampEnabled) {
|
||||
Object obj = getNestedFieldVal(record, fieldName, returnNullIfNotFound, consistentLogicalTimestampEnabled);
|
||||
return StringUtils.objToString(obj);
|
||||
}
|
||||
|
||||
/**
|
||||
* Obtain value of the provided field, denoted by dot notation. e.g: a.b.c
|
||||
*/
|
||||
public static Object getNestedFieldVal(GenericRecord record, String fieldName, boolean returnNullIfNotFound) {
|
||||
public static Object getNestedFieldVal(GenericRecord record, String fieldName, boolean returnNullIfNotFound, boolean consistentLogicalTimestampEnabled) {
|
||||
String[] parts = fieldName.split("\\.");
|
||||
GenericRecord valueNode = record;
|
||||
int i = 0;
|
||||
@@ -466,7 +467,7 @@ public class HoodieAvroUtils {
|
||||
// return, if last part of name
|
||||
if (i == parts.length - 1) {
|
||||
Schema fieldSchema = valueNode.getSchema().getField(part).schema();
|
||||
return convertValueForSpecificDataTypes(fieldSchema, val);
|
||||
return convertValueForSpecificDataTypes(fieldSchema, val, consistentLogicalTimestampEnabled);
|
||||
} else {
|
||||
// VC: Need a test here
|
||||
if (!(val instanceof GenericRecord)) {
|
||||
@@ -510,7 +511,7 @@ public class HoodieAvroUtils {
|
||||
* @param fieldValue avro field value
|
||||
* @return field value either converted (for certain data types) or as it is.
|
||||
*/
|
||||
public static Object convertValueForSpecificDataTypes(Schema fieldSchema, Object fieldValue) {
|
||||
public static Object convertValueForSpecificDataTypes(Schema fieldSchema, Object fieldValue, boolean consistentLogicalTimestampEnabled) {
|
||||
if (fieldSchema == null) {
|
||||
return fieldValue;
|
||||
}
|
||||
@@ -518,11 +519,11 @@ public class HoodieAvroUtils {
|
||||
if (fieldSchema.getType() == Schema.Type.UNION) {
|
||||
for (Schema schema : fieldSchema.getTypes()) {
|
||||
if (schema.getType() != Schema.Type.NULL) {
|
||||
return convertValueForAvroLogicalTypes(schema, fieldValue);
|
||||
return convertValueForAvroLogicalTypes(schema, fieldValue, consistentLogicalTimestampEnabled);
|
||||
}
|
||||
}
|
||||
}
|
||||
return convertValueForAvroLogicalTypes(fieldSchema, fieldValue);
|
||||
return convertValueForAvroLogicalTypes(fieldSchema, fieldValue, consistentLogicalTimestampEnabled);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -538,9 +539,13 @@ public class HoodieAvroUtils {
|
||||
* @param fieldValue avro field value
|
||||
* @return field value either converted (for certain data types) or as it is.
|
||||
*/
|
||||
private static Object convertValueForAvroLogicalTypes(Schema fieldSchema, Object fieldValue) {
|
||||
private static Object convertValueForAvroLogicalTypes(Schema fieldSchema, Object fieldValue, boolean consistentLogicalTimestampEnabled) {
|
||||
if (fieldSchema.getLogicalType() == LogicalTypes.date()) {
|
||||
return LocalDate.ofEpochDay(Long.parseLong(fieldValue.toString()));
|
||||
} else if (fieldSchema.getLogicalType() == LogicalTypes.timestampMillis() && consistentLogicalTimestampEnabled) {
|
||||
return new Timestamp(Long.parseLong(fieldValue.toString()));
|
||||
} else if (fieldSchema.getLogicalType() == LogicalTypes.timestampMicros() && consistentLogicalTimestampEnabled) {
|
||||
return new Timestamp(Long.parseLong(fieldValue.toString()) / 1000);
|
||||
} else if (fieldSchema.getLogicalType() instanceof LogicalTypes.Decimal) {
|
||||
Decimal dc = (Decimal) fieldSchema.getLogicalType();
|
||||
DecimalConversion decimalConversion = new DecimalConversion();
|
||||
@@ -585,15 +590,15 @@ public class HoodieAvroUtils {
|
||||
*/
|
||||
public static Object getRecordColumnValues(HoodieRecord<? extends HoodieRecordPayload> record,
|
||||
String[] columns,
|
||||
Schema schema) {
|
||||
Schema schema, boolean consistentLogicalTimestampEnabled) {
|
||||
try {
|
||||
GenericRecord genericRecord = (GenericRecord) record.getData().getInsertValue(schema).get();
|
||||
if (columns.length == 1) {
|
||||
return HoodieAvroUtils.getNestedFieldVal(genericRecord, columns[0], true);
|
||||
return HoodieAvroUtils.getNestedFieldVal(genericRecord, columns[0], true, consistentLogicalTimestampEnabled);
|
||||
} else {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (String col : columns) {
|
||||
sb.append(HoodieAvroUtils.getNestedFieldValAsString(genericRecord, col, true));
|
||||
sb.append(HoodieAvroUtils.getNestedFieldValAsString(genericRecord, col, true, consistentLogicalTimestampEnabled));
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
@@ -613,7 +618,7 @@ public class HoodieAvroUtils {
|
||||
*/
|
||||
public static Object getRecordColumnValues(HoodieRecord<? extends HoodieRecordPayload> record,
|
||||
String[] columns,
|
||||
SerializableSchema schema) {
|
||||
return getRecordColumnValues(record, columns, schema.get());
|
||||
SerializableSchema schema, boolean consistentLogicalTimestampEnabled) {
|
||||
return getRecordColumnValues(record, columns, schema.get(), consistentLogicalTimestampEnabled);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
package org.apache.hudi.common.model;
|
||||
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
@@ -87,7 +88,10 @@ public class DefaultHoodieRecordPayload extends OverwriteWithLatestAvroPayload {
|
||||
}
|
||||
|
||||
private static Option<Object> updateEventTime(GenericRecord record, Properties properties) {
|
||||
return Option.ofNullable(getNestedFieldVal(record, properties.getProperty(HoodiePayloadProps.PAYLOAD_EVENT_TIME_FIELD_PROP_KEY), true));
|
||||
boolean consistentLogicalTimestampEnabled = Boolean.parseBoolean(properties.getProperty(
|
||||
KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.key(),
|
||||
KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.defaultValue()));
|
||||
return Option.ofNullable(getNestedFieldVal(record, properties.getProperty(HoodiePayloadProps.PAYLOAD_EVENT_TIME_FIELD_PROP_KEY), true, consistentLogicalTimestampEnabled));
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -110,10 +114,13 @@ public class DefaultHoodieRecordPayload extends OverwriteWithLatestAvroPayload {
|
||||
* NOTE: Deletes sent via EmptyHoodieRecordPayload and/or Delete operation type do not hit this code path
|
||||
* and need to be dealt with separately.
|
||||
*/
|
||||
boolean consistentLogicalTimestampEnabled = Boolean.parseBoolean(properties.getProperty(
|
||||
KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.key(),
|
||||
KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.defaultValue()));
|
||||
Object persistedOrderingVal = getNestedFieldVal((GenericRecord) currentValue,
|
||||
properties.getProperty(HoodiePayloadProps.PAYLOAD_ORDERING_FIELD_PROP_KEY), true);
|
||||
properties.getProperty(HoodiePayloadProps.PAYLOAD_ORDERING_FIELD_PROP_KEY), true, consistentLogicalTimestampEnabled);
|
||||
Comparable incomingOrderingVal = (Comparable) getNestedFieldVal((GenericRecord) incomingRecord,
|
||||
properties.getProperty(HoodiePayloadProps.PAYLOAD_ORDERING_FIELD_PROP_KEY), true);
|
||||
properties.getProperty(HoodiePayloadProps.PAYLOAD_ORDERING_FIELD_PROP_KEY), true, consistentLogicalTimestampEnabled);
|
||||
return persistedOrderingVal == null || ((Comparable) persistedOrderingVal).compareTo(incomingOrderingVal) <= 0;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -32,6 +32,7 @@ public abstract class BaseKeyGenerator extends KeyGenerator {
|
||||
protected List<String> partitionPathFields;
|
||||
protected final boolean encodePartitionPath;
|
||||
protected final boolean hiveStylePartitioning;
|
||||
protected final boolean consistentLogicalTimestampEnabled;
|
||||
|
||||
protected BaseKeyGenerator(TypedProperties config) {
|
||||
super(config);
|
||||
@@ -39,6 +40,8 @@ public abstract class BaseKeyGenerator extends KeyGenerator {
|
||||
Boolean.parseBoolean(KeyGeneratorOptions.URL_ENCODE_PARTITIONING.defaultValue()));
|
||||
this.hiveStylePartitioning = config.getBoolean(KeyGeneratorOptions.HIVE_STYLE_PARTITIONING_ENABLE.key(),
|
||||
Boolean.parseBoolean(KeyGeneratorOptions.HIVE_STYLE_PARTITIONING_ENABLE.defaultValue()));
|
||||
this.consistentLogicalTimestampEnabled = config.getBoolean(KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.key(),
|
||||
Boolean.parseBoolean(KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.defaultValue()));
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -78,4 +81,8 @@ public abstract class BaseKeyGenerator extends KeyGenerator {
|
||||
public List<String> getPartitionPathFields() {
|
||||
return partitionPathFields;
|
||||
}
|
||||
|
||||
/**
 * Returns the {@code consistentLogicalTimestampEnabled} flag held by this key generator.
 *
 * <p>The constructor populates the field from the
 * {@code KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED} config key,
 * falling back to that option's default value when the key is absent.
 *
 * @return the current value of the flag
 */
public boolean isConsistentLogicalTimestampEnabled() {
|
||||
return consistentLogicalTimestampEnabled;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -56,6 +56,16 @@ public class KeyGeneratorOptions extends HoodieConfig {
|
||||
.withDocumentation("Partition path field. Value to be used at the partitionPath component of HoodieKey. "
|
||||
+ "Actual value ontained by invoking .toString()");
|
||||
|
||||
/**
 * Config option (string-valued, default {@code "false"}) that opts in to consistent handling of
 * Avro logical timestamp columns (timestamp-millis / timestamp-micros) across writer paths.
 *
 * <p>The value is stored as a string, so readers in this change parse it with
 * {@code Boolean.parseBoolean(...)} before use.
 */
public static final ConfigProperty<String> KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED = ConfigProperty
|
||||
.key("hoodie.datasource.write.keygenerator.consistent.logical.timestamp.enabled")
|
||||
.defaultValue("false")
|
||||
.withDocumentation("When set to true, consistent value will be generated for a logical timestamp type column, "
|
||||
+ "like timestamp-millis and timestamp-micros, irrespective of whether row-writer is enabled. Disabled by default so "
|
||||
+ "as not to break the pipeline that deploy either fully row-writer path or non row-writer path. For example, "
|
||||
+ "if it is kept disabled then record key of timestamp type with value `2016-12-29 09:54:00` will be written as timestamp "
|
||||
+ "`2016-12-29 09:54:00.0` in row-writer path, while it will be written as long value `1483023240000000` in non row-writer path. "
|
||||
+ "If enabled, then the timestamp value will be written in both the cases.");
|
||||
|
||||
/**
|
||||
* @deprecated Use {@link #URL_ENCODE_PARTITIONING} and its methods.
|
||||
*/
|
||||
|
||||
@@ -225,15 +225,15 @@ public class TestHoodieAvroUtils {
|
||||
rec.put("non_pii_col", "val1");
|
||||
rec.put("pii_col", "val2");
|
||||
|
||||
Object rowKey = HoodieAvroUtils.getNestedFieldVal(rec, "_row_key", true);
|
||||
Object rowKey = HoodieAvroUtils.getNestedFieldVal(rec, "_row_key", true, false);
|
||||
assertEquals("key1", rowKey);
|
||||
|
||||
Object rowKeyNotExist = HoodieAvroUtils.getNestedFieldVal(rec, "fake_key", true);
|
||||
Object rowKeyNotExist = HoodieAvroUtils.getNestedFieldVal(rec, "fake_key", true, false);
|
||||
assertNull(rowKeyNotExist);
|
||||
|
||||
// Field does not exist
|
||||
try {
|
||||
HoodieAvroUtils.getNestedFieldVal(rec, "fake_key", false);
|
||||
HoodieAvroUtils.getNestedFieldVal(rec, "fake_key", false, false);
|
||||
} catch (Exception e) {
|
||||
assertEquals("fake_key(Part -fake_key) field not found in record. Acceptable fields were :[timestamp, _row_key, non_pii_col, pii_col]",
|
||||
e.getMessage());
|
||||
@@ -241,7 +241,7 @@ public class TestHoodieAvroUtils {
|
||||
|
||||
// Field exist while value not
|
||||
try {
|
||||
HoodieAvroUtils.getNestedFieldVal(rec, "timestamp", false);
|
||||
HoodieAvroUtils.getNestedFieldVal(rec, "timestamp", false, false);
|
||||
} catch (Exception e) {
|
||||
assertEquals("The value of timestamp can not be null", e.getMessage());
|
||||
}
|
||||
@@ -255,7 +255,7 @@ public class TestHoodieAvroUtils {
|
||||
ByteBuffer byteBuffer = ByteBuffer.wrap(bigDecimal.unscaledValue().toByteArray());
|
||||
rec.put("decimal_col", byteBuffer);
|
||||
|
||||
Object decimalCol = HoodieAvroUtils.getNestedFieldVal(rec, "decimal_col", true);
|
||||
Object decimalCol = HoodieAvroUtils.getNestedFieldVal(rec, "decimal_col", true, false);
|
||||
assertEquals(bigDecimal, decimalCol);
|
||||
|
||||
Object obj = rec.get(1);
|
||||
|
||||
@@ -202,6 +202,10 @@ public final class SchemaTestUtil {
|
||||
return new Schema.Parser().parse(SchemaTestUtil.class.getResourceAsStream("/timestamp-test-evolved.avsc"));
|
||||
}
|
||||
|
||||
/**
 * Parses the Avro schema stored in the classpath resource {@code /timestamp-logical-type.avsc},
 * resolved relative to {@code SchemaTestUtil}'s class loader.
 *
 * <p>NOTE(review): the resource added with this change declares a nullable {@code createTime}
 * field of type long with the {@code timestamp-micros} logical type — confirm the resource on the
 * test classpath is that file.
 *
 * @return the parsed schema
 * @throws IOException declared by this method; presumably raised when the resource cannot be read
 */
public static Schema getTimestampWithLogicalTypeSchema() throws IOException {
|
||||
return new Schema.Parser().parse(SchemaTestUtil.class.getResourceAsStream("/timestamp-logical-type.avsc"));
|
||||
}
|
||||
|
||||
public static GenericRecord generateAvroRecordFromJson(Schema schema, int recordNumber, String instantTime,
|
||||
String fileId) throws IOException {
|
||||
return generateAvroRecordFromJson(schema, recordNumber, instantTime, fileId, true);
|
||||
|
||||
26
hudi-common/src/test/resources/timestamp-logical-type.avsc
Normal file
26
hudi-common/src/test/resources/timestamp-logical-type.avsc
Normal file
@@ -0,0 +1,26 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
{
|
||||
"namespace": "example.avro",
|
||||
"type": "record",
|
||||
"name": "User",
|
||||
"fields": [
|
||||
{"name": "field1", "type": ["null", "string"], "default": null},
|
||||
{"name": "createTime", "type": ["null", {"type" : "long", "logicalType" : "timestamp-micros"}], "default": null}
|
||||
]
|
||||
}
|
||||
Reference in New Issue
Block a user