1
0

[HUDI-1587] Add latency and freshness support (#2541)

Save min and max of event time in each commit and compute the latency and freshness metrics.
This commit is contained in:
Raymond Xu
2021-03-03 20:13:12 -08:00
committed by GitHub
parent f11a6c7b2d
commit 899ae70fdb
14 changed files with 283 additions and 26 deletions

View File

@@ -25,6 +25,8 @@ import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;
import static org.apache.hudi.avro.HoodieAvroUtils.bytesToAvro;
@@ -37,6 +39,9 @@ import static org.apache.hudi.avro.HoodieAvroUtils.getNestedFieldVal;
*/
public class DefaultHoodieRecordPayload extends OverwriteWithLatestAvroPayload {
// Key under which getMetadata() publishes the captured event time.
public static final String METADATA_EVENT_TIME_KEY = "metadata.event_time.key";
// Event time captured from the incoming record; starts empty and is only reported by
// getMetadata() once a value has been assigned.
private Option<Object> eventTime = Option.empty();
/**
* Creates a payload by passing the record and its ordering value to the superclass constructor.
*
* @param record the Avro record carried by this payload
* @param orderingVal the value handed to the superclass as the ordering value
*/
public DefaultHoodieRecordPayload(GenericRecord record, Comparable orderingVal) {
super(record, orderingVal);
}
@@ -71,6 +76,10 @@ public class DefaultHoodieRecordPayload extends OverwriteWithLatestAvroPayload {
/*
* We reached a point where the value on disk is older than the incoming record.
*/
eventTime = Option.ofNullable(getNestedFieldVal(incomingRecord, properties.getProperty(HoodiePayloadProps.PAYLOAD_EVENT_TIME_FIELD_PROP), true));
/*
* Now check if the incoming record is a delete record.
*/
if (isDeleteRecord(incomingRecord)) {
@@ -79,4 +88,13 @@ public class DefaultHoodieRecordPayload extends OverwriteWithLatestAvroPayload {
return Option.of(incomingRecord);
}
}
/**
 * Exposes the captured event time, if any, as payload metadata.
 *
 * @return a single-entry map keyed by {@link #METADATA_EVENT_TIME_KEY} holding the string form
 *         of the event time, or an empty Option when no event time has been captured
 */
@Override
public Option<Map<String, String>> getMetadata() {
  // Without an event time there is nothing to publish.
  if (!eventTime.isPresent()) {
    return Option.empty();
  }
  Map<String, String> eventTimeMetadata = new HashMap<>();
  eventTimeMetadata.put(METADATA_EVENT_TIME_KEY, String.valueOf(eventTime.get()));
  return Option.of(eventTimeMetadata);
}
}

View File

@@ -18,14 +18,16 @@
package org.apache.hudi.common.model;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import com.fasterxml.jackson.annotation.JsonAutoDetect;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.fasterxml.jackson.annotation.PropertyAccessor;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
@@ -323,6 +325,20 @@ public class HoodieCommitMetadata implements Serializable {
return totalUpsertTime;
}
/**
 * Aggregates the event-time bounds across every write stat of every partition in this commit.
 *
 * <p>Write stats whose min/max event time is null are skipped. {@code Long.MAX_VALUE} and
 * {@code Long.MIN_VALUE} serve as "nothing seen" sentinels for the min and max respectively.
 *
 * @return a pair of (min event time, max event time); each side is empty when the
 *         corresponding sentinel is still in place after the scan
 */
public Pair<Option<Long>, Option<Long>> getMinAndMaxEventTime() {
  long earliest = Long.MAX_VALUE;
  long latest = Long.MIN_VALUE;
  for (List<HoodieWriteStat> partitionStats : partitionToWriteStats.values()) {
    for (HoodieWriteStat stat : partitionStats) {
      Long statMin = stat.getMinEventTime();
      if (statMin != null) {
        earliest = Math.min(statMin, earliest);
      }
      Long statMax = stat.getMaxEventTime();
      if (statMax != null) {
        latest = Math.max(statMax, latest);
      }
    }
  }

  Option<Long> minOption = Option.empty();
  if (earliest != Long.MAX_VALUE) {
    minOption = Option.of(earliest);
  }
  Option<Long> maxOption = Option.empty();
  if (latest != Long.MIN_VALUE) {
    maxOption = Option.of(latest);
  }
  return Pair.of(minOption, maxOption);
}
@Override
public boolean equals(Object o) {
if (this == o) {

View File

@@ -24,9 +24,20 @@ package org.apache.hudi.common.model;
*/
public class HoodiePayloadProps {

  /**
   * Property for payload ordering field; to be used to merge incoming record with that in storage.
   * Implementations of {@link HoodieRecordPayload} can leverage if required.
   *
   * @see DefaultHoodieRecordPayload
   */
  public static final String PAYLOAD_ORDERING_FIELD_PROP = "hoodie.payload.ordering.field";

  /**
   * Default value associated with {@link #PAYLOAD_ORDERING_FIELD_PROP}.
   * Declared {@code final} so the shared default cannot be reassigned at runtime.
   */
  public static final String DEFAULT_PAYLOAD_ORDERING_FIELD_VAL = "ts";

  /**
   * Property for payload event time field; to be used to extract source event time info.
   *
   * @see DefaultHoodieRecordPayload
   */
  public static final String PAYLOAD_EVENT_TIME_FIELD_PROP = "hoodie.payload.event.time.field";

  /**
   * Default value associated with {@link #PAYLOAD_EVENT_TIME_FIELD_PROP}.
   * Declared {@code final} so the shared default cannot be reassigned at runtime.
   */
  public static final String DEFAULT_PAYLOAD_EVENT_TIME_FIELD_VAL = "ts";
}

View File

@@ -143,6 +143,18 @@ public class HoodieWriteStat implements Serializable {
*/
private long fileSizeInBytes;
/**
* The earliest of incoming records' event times (Epoch ms) for calculating latency.
* Remains null until a value is supplied through setMinEventTime.
*/
@Nullable
private Long minEventTime;
/**
* The latest of incoming records' event times (Epoch ms) for calculating freshness.
* Remains null until a value is supplied through setMaxEventTime.
*/
@Nullable
private Long maxEventTime;
@Nullable
@JsonIgnore
private RuntimeStats runtimeStats;
@@ -303,6 +315,30 @@ public class HoodieWriteStat implements Serializable {
this.fileSizeInBytes = fileSizeInBytes;
}
/**
 * @return the smallest event time recorded on this write stat so far, or null if none was recorded
 */
public Long getMinEventTime() {
  return this.minEventTime;
}
/**
 * Folds an event time into the running minimum kept by this write stat.
 *
 * <p>The first non-null value is stored as-is; later values only replace it when they are
 * smaller. A null argument is ignored, leaving the current minimum untouched.
 *
 * @param minEventTime candidate event time (Epoch ms), may be null
 */
public void setMinEventTime(Long minEventTime) {
  if (minEventTime == null) {
    // Nothing to fold in. Passing null to Math.min would auto-unbox and throw a
    // NullPointerException once a minimum has already been recorded.
    return;
  }
  if (this.minEventTime == null) {
    this.minEventTime = minEventTime;
  } else {
    this.minEventTime = Math.min(minEventTime, this.minEventTime);
  }
}
/**
 * @return the largest event time recorded on this write stat so far, or null if none was recorded
 */
public Long getMaxEventTime() {
  return this.maxEventTime;
}
/**
 * Folds an event time into the running maximum kept by this write stat.
 *
 * <p>The first non-null value is stored as-is; later values only replace it when they are
 * larger. A null argument is ignored, leaving the current maximum untouched.
 *
 * @param maxEventTime candidate event time (Epoch ms), may be null
 */
public void setMaxEventTime(Long maxEventTime) {
  if (maxEventTime == null) {
    // Nothing to fold in. Passing null to Math.max would auto-unbox and throw a
    // NullPointerException once a maximum has already been recorded.
    return;
  }
  if (this.maxEventTime == null) {
    this.maxEventTime = maxEventTime;
  } else {
    this.maxEventTime = Math.max(maxEventTime, this.maxEventTime);
  }
}
@Nullable
public RuntimeStats getRuntimeStats() {
return runtimeStats;

View File

@@ -0,0 +1,40 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hudi.common.util;
import java.time.Instant;
import java.time.format.DateTimeParseException;
import java.util.Objects;
public class DateTimeUtils {

  /**
   * Converts a textual timestamp into an {@link java.time.Instant}.
   *
   * <p>The input is first read as a count of milliseconds since the Epoch; if it is not a valid
   * {@code long}, it is parsed as an ISO-8601 instant instead.
   *
   * @param s timestamp text, either Epoch milliseconds or an ISO-8601 instant; must not be null
   * @return the instant denoted by {@code s}
   * @throws DateTimeParseException if {@code s} is neither a valid long nor a parseable ISO-8601 instant
   */
  public static Instant parseDateTime(String s) throws DateTimeParseException {
    ValidationUtils.checkArgument(Objects.nonNull(s), "Input String cannot be null.");
    final long epochMillis;
    try {
      epochMillis = Long.parseLong(s);
    } catch (NumberFormatException notEpochMillis) {
      // Not a plain number, so treat the text as an ISO-8601 instant.
      return Instant.parse(s);
    }
    return Instant.ofEpochMilli(epochMillis);
  }
}

View File

@@ -18,12 +18,16 @@
package org.apache.hudi.common.model;
import org.apache.hudi.common.util.Option;
import org.apache.avro.Schema;
import org.apache.avro.Schema.Type;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;
import java.io.IOException;
import java.util.Arrays;
@@ -31,6 +35,7 @@ import java.util.Properties;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
/**
* Unit tests {@link DefaultHoodieRecordPayload}.
@@ -50,6 +55,7 @@ public class TestDefaultHoodieRecordPayload {
));
props = new Properties();
props.setProperty(HoodiePayloadProps.PAYLOAD_ORDERING_FIELD_PROP, "ts");
props.setProperty(HoodiePayloadProps.PAYLOAD_EVENT_TIME_FIELD_PROP, "ts");
}
@Test
@@ -104,4 +110,36 @@ public class TestDefaultHoodieRecordPayload {
assertFalse(payload2.combineAndGetUpdateValue(record1, schema, props).isPresent());
}
/**
 * A payload that was only constructed (never combined with another record) reports no metadata.
 */
@Test
public void testGetEmptyMetadata() {
  GenericRecord onlyRecord = new GenericData.Record(schema);
  onlyRecord.put("id", "1");
  onlyRecord.put("partition", "partition0");
  onlyRecord.put("ts", 0L);
  onlyRecord.put("_hoodie_is_deleted", false);

  DefaultHoodieRecordPayload freshPayload = new DefaultHoodieRecordPayload(Option.of(onlyRecord));

  assertFalse(freshPayload.getMetadata().isPresent());
}
/**
 * After combining with another record, the payload's metadata carries the event time read from
 * the "ts" field of the payload's own record.
 */
@ParameterizedTest
@ValueSource(longs = {1L, 1612542030000L})
public void testGetEventTimeInMetadata(long eventTime) throws IOException {
  // Record passed to combineAndGetUpdateValue, stamped with ts = 0.
  GenericRecord baseRecord = new GenericData.Record(schema);
  baseRecord.put("id", "1");
  baseRecord.put("partition", "partition0");
  baseRecord.put("ts", 0L);
  baseRecord.put("_hoodie_is_deleted", false);

  // Record wrapped by the payload under test, stamped with the parameterized event time.
  GenericRecord payloadRecord = new GenericData.Record(schema);
  payloadRecord.put("id", "1");
  payloadRecord.put("partition", "partition0");
  payloadRecord.put("ts", eventTime);
  payloadRecord.put("_hoodie_is_deleted", false);

  DefaultHoodieRecordPayload payloadUnderTest = new DefaultHoodieRecordPayload(payloadRecord, eventTime);
  payloadUnderTest.combineAndGetUpdateValue(baseRecord, schema, props);

  assertTrue(payloadUnderTest.getMetadata().isPresent());
  String recordedEventTime =
      payloadUnderTest.getMetadata().get().get(DefaultHoodieRecordPayload.METADATA_EVENT_TIME_KEY);
  assertEquals(eventTime, Long.parseLong(recordedEventTime));
}
}

View File

@@ -0,0 +1,55 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hudi.common.util;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;
import java.time.format.DateTimeParseException;
import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
import static org.junit.jupiter.api.Assertions.assertThrows;
/**
 * Unit tests {@link DateTimeUtils}.
 */
public class TestDateTimeUtils {

  /**
   * Epoch-millisecond strings and ISO-8601 instants are both accepted.
   */
  @ParameterizedTest
  @ValueSource(strings = {"0", "1612542030000", "2020-01-01T01:01:00Z", "1970-01-01T00:00:00.123456Z"})
  public void testParseStringIntoInstant(String s) {
    assertDoesNotThrow(() -> {
      DateTimeUtils.parseDateTime(s);
    });
  }

  /**
   * Text that is neither a long nor an ISO-8601 instant is rejected with a parse exception.
   */
  @ParameterizedTest
  @ValueSource(strings = {"#", "0L", ""})
  public void testParseDateTimeThrowsException(String s) {
    assertThrows(DateTimeParseException.class, () -> DateTimeUtils.parseDateTime(s));
  }

  /**
   * A null input is rejected as an illegal argument.
   */
  @Test
  public void testParseDateTimeWithNull() {
    assertThrows(IllegalArgumentException.class, () -> DateTimeUtils.parseDateTime(null));
  }
}