1
0

[HUDI-1150] Fix "unable to parse input partition field :1" exception when using TimestampBasedKeyGenerator (#1920)

This commit is contained in:
Mathieu
2020-08-23 19:56:50 +08:00
committed by GitHub
parent 7291607ae3
commit 35b21855da
6 changed files with 109 additions and 33 deletions

View File

@@ -39,7 +39,7 @@ import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor;
import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.keygen.KeyGenerator;
import org.apache.hudi.keygen.parser.HoodieDateTimeParser;
import org.apache.hudi.keygen.parser.AbstractHoodieDateTimeParser;
import org.apache.hudi.table.BulkInsertPartitioner;
import org.apache.avro.LogicalTypes;
@@ -172,9 +172,9 @@ public class DataSourceUtils {
/**
* Create a date time parser class for TimestampBasedKeyGenerator, passing in any configs needed.
*/
public static HoodieDateTimeParser createDateTimeParser(TypedProperties props, String parserClass) throws IOException {
public static AbstractHoodieDateTimeParser createDateTimeParser(TypedProperties props, String parserClass) throws IOException {
try {
return (HoodieDateTimeParser) ReflectionUtils.loadClass(parserClass, props);
return (AbstractHoodieDateTimeParser) ReflectionUtils.loadClass(parserClass, props);
} catch (Throwable e) {
throw new IOException("Could not load date time parser class " + parserClass, e);
}

View File

@@ -146,7 +146,7 @@ public class RowKeyGeneratorHelper {
}
valueToProcess = (Row) valueToProcess.get(positions.get(index));
} else { // last index
if (valueToProcess.getAs(positions.get(index)).toString().isEmpty()) {
if (null != valueToProcess.getAs(positions.get(index)) && valueToProcess.getAs(positions.get(index)).toString().isEmpty()) {
toReturn = EMPTY_RECORDKEY_PLACEHOLDER;
break;
}

View File

@@ -26,7 +26,7 @@ import org.apache.hudi.common.util.Option;
import org.apache.hudi.exception.HoodieDeltaStreamerException;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieNotSupportedException;
import org.apache.hudi.keygen.parser.HoodieDateTimeParser;
import org.apache.hudi.keygen.parser.AbstractHoodieDateTimeParser;
import org.apache.hudi.keygen.parser.HoodieDateTimeParserImpl;
import org.apache.avro.generic.GenericRecord;
@@ -41,6 +41,7 @@ import java.io.Serializable;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.TimeZone;
import java.util.concurrent.TimeUnit;
import static java.util.concurrent.TimeUnit.MILLISECONDS;
@@ -63,10 +64,11 @@ public class TimestampBasedKeyGenerator extends SimpleKeyGenerator {
private final String outputDateFormat;
private transient Option<DateTimeFormatter> inputFormatter;
private transient DateTimeFormatter partitionFormatter;
private final HoodieDateTimeParser parser;
private final AbstractHoodieDateTimeParser parser;
// TimeZone detailed settings reference
// https://docs.oracle.com/javase/8/docs/api/java/util/TimeZone.html
private final DateTimeZone inputDateTimeZone;
private final DateTimeZone outputDateTimeZone;
protected final boolean encodePartitionPath;
@@ -107,6 +109,7 @@ public class TimestampBasedKeyGenerator extends SimpleKeyGenerator {
super(config, recordKeyField, partitionPathField);
String dateTimeParserClass = config.getString(Config.DATE_TIME_PARSER_PROP, HoodieDateTimeParserImpl.class.getName());
this.parser = DataSourceUtils.createDateTimeParser(config, dateTimeParserClass);
this.inputDateTimeZone = parser.getInputDateTimeZone();
this.outputDateTimeZone = parser.getOutputDateTimeZone();
this.outputDateFormat = parser.getOutputDateFormat();
this.timestampType = TimestampType.valueOf(config.getString(Config.TIMESTAMP_TYPE_FIELD_PROP));
@@ -133,7 +136,7 @@ public class TimestampBasedKeyGenerator extends SimpleKeyGenerator {
public String getPartitionPath(GenericRecord record) {
Object partitionVal = HoodieAvroUtils.getNestedFieldVal(record, getPartitionPathFields().get(0), true);
if (partitionVal == null) {
partitionVal = 1L;
partitionVal = getDefaultPartitionVal();
}
try {
return getPartitionPath(partitionVal);
@@ -142,6 +145,31 @@ public class TimestampBasedKeyGenerator extends SimpleKeyGenerator {
}
}
/**
 * Produces the stand-in partition value used when the partition path field of a record is null.
 *
 * <p>For every timestamp type other than {@code DATE_STRING} and {@code MIXED} the stand-in is the
 * numeric value {@code 1L}. For {@code DATE_STRING} and {@code MIXED} that same value is rendered as
 * text with the first pattern listed under {@code Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP}, so the
 * default has the same shape as a regular string input.
 *
 * @return {@code 1L}, or {@code 1L} formatted as a date string with the first configured input pattern
 */
private Object getDefaultPartitionVal() {
  Object result = 1L;
  if (timestampType == TimestampType.DATE_STRING || timestampType == TimestampType.MIXED) {
    // The field is null, so any of the configured input patterns can shape the default; take the first.
    // Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP is expected to be set here: it is checked while the
    // inputFormatter is initialised.
    String delimiter = parser.getConfigInputDateFormatDelimiter();
    // Trim the pattern: HoodieDateTimeParserImpl trims every pattern before building the input formatter,
    // so an untrimmed pattern taken from a list such as "fmt1 , fmt2" would yield a default string
    // with stray whitespace that the input formatter may not parse back.
    String format = config.getString(Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP, "").split(delimiter)[0].trim();
    // Zone preference: input zone, then output zone, then GMT when neither is configured.
    DateTimeZone zone;
    if (null != inputDateTimeZone) {
      zone = inputDateTimeZone;
    } else if (null != outputDateTimeZone) {
      zone = outputDateTimeZone;
    } else {
      zone = DateTimeZone.forTimeZone(TimeZone.getTimeZone("GMT"));
    }
    return new DateTime(result, zone).toString(format);
  }
  return result;
}
/**
* The function takes care of lazily initialising dateTimeFormatter variables only once.
*/
@@ -219,9 +247,9 @@ public class TimestampBasedKeyGenerator extends SimpleKeyGenerator {
buildFieldPositionMapIfNeeded(row.schema());
Object partitionPathFieldVal = RowKeyGeneratorHelper.getNestedFieldVal(row, partitionPathPositions.get(getPartitionPathFields().get(0)));
try {
if (partitionPathFieldVal.toString().contains(DEFAULT_PARTITION_PATH) || partitionPathFieldVal.toString().contains(NULL_RECORDKEY_PLACEHOLDER)
if (partitionPathFieldVal == null || partitionPathFieldVal.toString().contains(DEFAULT_PARTITION_PATH) || partitionPathFieldVal.toString().contains(NULL_RECORDKEY_PLACEHOLDER)
|| partitionPathFieldVal.toString().contains(EMPTY_RECORDKEY_PLACEHOLDER)) {
fieldVal = 1L;
fieldVal = getDefaultPartitionVal();
} else {
fieldVal = partitionPathFieldVal;
}

View File

@@ -17,35 +17,57 @@
package org.apache.hudi.keygen.parser;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.keygen.TimestampBasedKeyGenerator;
import org.joda.time.DateTimeZone;
import org.joda.time.format.DateTimeFormatter;
import java.io.Serializable;
public interface HoodieDateTimeParser extends Serializable {
public abstract class AbstractHoodieDateTimeParser implements Serializable {
protected final TypedProperties config;
protected final String configInputDateFormatDelimiter;
/**
* Stores the given configuration and derives the input date format delimiter from it.
*
* @param config properties the parser reads its settings from
*/
public AbstractHoodieDateTimeParser(TypedProperties config) {
this.config = config;
// Must come after the assignment above: initInputDateFormatDelimiter() reads this.config.
this.configInputDateFormatDelimiter = initInputDateFormatDelimiter();
}
/**
 * Resolves the delimiter used to split the configured list of input date formats.
 *
 * @return the trimmed value looked up under
 *         {@code TIMESTAMP_INPUT_DATE_FORMAT_LIST_DELIMITER_REGEX_PROP} (with "," supplied as the
 *         lookup default), or "," when that trimmed value is empty
 */
private String initInputDateFormatDelimiter() {
  final String fallback = ",";
  final String configured = config
      .getString(TimestampBasedKeyGenerator.Config.TIMESTAMP_INPUT_DATE_FORMAT_LIST_DELIMITER_REGEX_PROP, fallback)
      .trim();
  if (configured.isEmpty()) {
    return fallback;
  }
  return configured;
}
/**
* Returns the output date format in which the partition paths will be created for the hudi dataset.
* @return the value configured under {@code TIMESTAMP_OUTPUT_DATE_FORMAT_PROP}
*/
String getOutputDateFormat();
public String getOutputDateFormat() {
  // Looked up from the configuration on every call; nothing is cached here.
  final String outputDateFormat =
      config.getString(TimestampBasedKeyGenerator.Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP);
  return outputDateFormat;
}
/**
* Returns the formatter for the input formats in which date-time values may arrive in incoming records.
* @return an {@code Option} wrapping the input {@code DateTimeFormatter}
*/
Option<DateTimeFormatter> getInputFormatter();
public abstract Option<DateTimeFormatter> getInputFormatter();
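/**
* Returns the time zone in which incoming values are expected to be expressed.
* @return the input {@code DateTimeZone}; TimestampBasedKeyGenerator null-checks this value before using it
*/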
DateTimeZone getInputDateTimeZone();
public abstract DateTimeZone getInputDateTimeZone();
/**
* Returns the time zone used to create the final partition paths for the hudi dataset.
* @return the output {@code DateTimeZone}; HoodieDateTimeParserImpl returns null when the configured zone is blank
*/
DateTimeZone getOutputDateTimeZone();
public abstract DateTimeZone getOutputDateTimeZone();
/**
 * Exposes the delimiter used to split the configured list of input date formats.
 *
 * @return the delimiter resolved when this parser was constructed; a comma by default
 */
public String getConfigInputDateFormatDelimiter() {
  return configInputDateFormatDelimiter;
}
}

View File

@@ -28,32 +28,22 @@ import org.joda.time.format.DateTimeFormatter;
import org.joda.time.format.DateTimeFormatterBuilder;
import org.joda.time.format.DateTimeParser;
import java.io.Serializable;
import java.util.Arrays;
import java.util.Collections;
import java.util.TimeZone;
public class HoodieDateTimeParserImpl implements HoodieDateTimeParser, Serializable {
public class HoodieDateTimeParserImpl extends AbstractHoodieDateTimeParser {
private String configInputDateFormatList;
private final String configInputDateFormatDelimiter;
private final TypedProperties config;
// TimeZone detailed settings reference
// https://docs.oracle.com/javase/8/docs/api/java/util/TimeZone.html
private final DateTimeZone inputDateTimeZone;
public HoodieDateTimeParserImpl(TypedProperties config) {
this.config = config;
super(config);
DataSourceUtils.checkRequiredProperties(config, Arrays.asList(Config.TIMESTAMP_TYPE_FIELD_PROP, Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP));
this.inputDateTimeZone = getInputDateTimeZone();
this.configInputDateFormatDelimiter = getConfigInputDateFormatDelimiter();
}
// Resolves the delimiter for the input date format list: the trimmed configured value, or "," when
// that trimmed value is empty ("," is also the default supplied to the lookup).
private String getConfigInputDateFormatDelimiter() {
  final String fallback = ",";
  final String configured =
      config.getString(Config.TIMESTAMP_INPUT_DATE_FORMAT_LIST_DELIMITER_REGEX_PROP, fallback).trim();
  if (configured.isEmpty()) {
    return fallback;
  }
  return configured;
}
private DateTimeFormatter getInputDateFormatter() {
@@ -65,7 +55,7 @@ public class HoodieDateTimeParserImpl implements HoodieDateTimeParser, Serializa
.append(
null,
Arrays.stream(
this.configInputDateFormatList.split(this.configInputDateFormatDelimiter))
this.configInputDateFormatList.split(super.configInputDateFormatDelimiter))
.map(String::trim)
.map(DateTimeFormat::forPattern)
.map(DateTimeFormatter::getParser)
@@ -119,4 +109,5 @@ public class HoodieDateTimeParserImpl implements HoodieDateTimeParser, Serializa
}
return !outputTimeZone.trim().isEmpty() ? DateTimeZone.forTimeZone(TimeZone.getTimeZone(outputTimeZone)) : null;
}
}

View File

@@ -150,6 +150,29 @@ public class TestTimestampBasedKeyGenerator {
// test w/ Row
assertEquals("2020-01-06 12", keyGen.getPartitionPath(baseRow));
// timezone is GMT+8:00, createTime is null
baseRecord.put("createTime", null);
properties = getBaseKeyConfig("EPOCHMILLISECONDS", "yyyy-MM-dd hh", "GMT+8:00", null);
keyGen = new TimestampBasedKeyGenerator(properties);
HoodieKey hk5 = keyGen.getKey(baseRecord);
assertEquals("1970-01-01 08", hk5.getPartitionPath());
// test w/ Row
baseRow = genericRecordToRow(baseRecord);
assertEquals("1970-01-01 08", keyGen.getPartitionPath(baseRow));
// timestamp is DATE_STRING, timezone is GMT, createTime is null
baseRecord.put("createTime", null);
properties = getBaseKeyConfig("DATE_STRING", "yyyy-MM-dd hh:mm:ss", "GMT", null);
properties.setProperty("hoodie.deltastreamer.keygen.timebased.input.dateformat", "yyyy-MM-dd hh:mm:ss");
keyGen = new TimestampBasedKeyGenerator(properties);
HoodieKey hk6 = keyGen.getKey(baseRecord);
assertEquals("1970-01-01 12:00:00", hk6.getPartitionPath());
// test w/ Row
baseRow = genericRecordToRow(baseRecord);
assertEquals("1970-01-01 12:00:00", keyGen.getPartitionPath(baseRow));
}
@Test
@@ -160,12 +183,24 @@ public class TestTimestampBasedKeyGenerator {
// timezone is GMT
properties = getBaseKeyConfig("SCALAR", "yyyy-MM-dd hh", "GMT", "days");
TimestampBasedKeyGenerator keyGen = new TimestampBasedKeyGenerator(properties);
HoodieKey hk5 = keyGen.getKey(baseRecord);
assertEquals(hk5.getPartitionPath(), "2024-10-04 12");
HoodieKey hk1 = keyGen.getKey(baseRecord);
assertEquals(hk1.getPartitionPath(), "2024-10-04 12");
// test w/ Row
baseRow = genericRecordToRow(baseRecord);
assertEquals("2024-10-04 12", keyGen.getPartitionPath(baseRow));
// timezone is GMT, createTime is null
baseRecord.put("createTime", null);
properties = getBaseKeyConfig("SCALAR", "yyyy-MM-dd hh", "GMT", "days");
keyGen = new TimestampBasedKeyGenerator(properties);
HoodieKey hk2 = keyGen.getKey(baseRecord);
assertEquals("1970-01-02 12", hk2.getPartitionPath());
// test w/ Row
baseRow = genericRecordToRow(baseRecord);
assertEquals("1970-01-02 12", keyGen.getPartitionPath(baseRow));
}
@Test