[HUDI-164] Fixes incorrect averageBytesPerRecord
When number of records written is zero, averageBytesPerRecord results in a huge size (division by zero and ceiled to Long.MAX_VALUE) causing OOM. This commit fixes this issue by reverse traversing the commits until a more reasonable average record size can be computed and if that is not possible returns the default configured record size.
This commit is contained in:
committed by
vinoth chandar
parent
93bc5e2153
commit
64df98fc4a
@@ -628,7 +628,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
private void assignInserts(WorkloadProfile profile) {
|
private void assignInserts(WorkloadProfile profile) {
|
||||||
// for new inserts, compute buckets depending on how many records we have for each partition
|
// for new inserts, compute buckets depending on how many records we have for each partition
|
||||||
Set<String> partitionPaths = profile.getPartitionPaths();
|
Set<String> partitionPaths = profile.getPartitionPaths();
|
||||||
long averageRecordSize = averageBytesPerRecord();
|
long averageRecordSize = averageBytesPerRecord(metaClient.getActiveTimeline().getCommitTimeline()
|
||||||
|
.filterCompletedInstants(), config.getCopyOnWriteRecordSizeEstimate());
|
||||||
logger.info("AvgRecordSize => " + averageRecordSize);
|
logger.info("AvgRecordSize => " + averageRecordSize);
|
||||||
for (String partitionPath : partitionPaths) {
|
for (String partitionPath : partitionPaths) {
|
||||||
WorkloadStat pStat = profile.getWorkloadStat(partitionPath);
|
WorkloadStat pStat = profile.getWorkloadStat(partitionPath);
|
||||||
@@ -735,30 +736,6 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
return smallFileLocations;
|
return smallFileLocations;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Obtains the average record size based on records written during last commit. Used for
|
|
||||||
* estimating how many records pack into one file.
|
|
||||||
*/
|
|
||||||
protected long averageBytesPerRecord() {
|
|
||||||
long avgSize = 0L;
|
|
||||||
HoodieTimeline commitTimeline = metaClient.getActiveTimeline().getCommitTimeline()
|
|
||||||
.filterCompletedInstants();
|
|
||||||
try {
|
|
||||||
if (!commitTimeline.empty()) {
|
|
||||||
HoodieInstant latestCommitTime = commitTimeline.lastInstant().get();
|
|
||||||
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
|
|
||||||
.fromBytes(commitTimeline.getInstantDetails(latestCommitTime).get(), HoodieCommitMetadata.class);
|
|
||||||
avgSize = (long) Math.ceil(
|
|
||||||
(1.0 * commitMetadata.fetchTotalBytesWritten()) / commitMetadata
|
|
||||||
.fetchTotalRecordsWritten());
|
|
||||||
}
|
|
||||||
} catch (Throwable t) {
|
|
||||||
// make this fail safe.
|
|
||||||
logger.error("Error trying to compute average bytes/record ", t);
|
|
||||||
}
|
|
||||||
return avgSize <= 0L ? config.getCopyOnWriteRecordSizeEstimate() : avgSize;
|
|
||||||
}
|
|
||||||
|
|
||||||
public BucketInfo getBucketInfo(int bucketNumber) {
|
public BucketInfo getBucketInfo(int bucketNumber) {
|
||||||
return bucketInfoMap.get(bucketNumber);
|
return bucketInfoMap.get(bucketNumber);
|
||||||
}
|
}
|
||||||
@@ -803,4 +780,33 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
protected HoodieRollingStatMetadata getRollingStats() {
|
protected HoodieRollingStatMetadata getRollingStats() {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Obtains the average record size based on records written during previous commits. Used for
|
||||||
|
* estimating how many records pack into one file.
|
||||||
|
*/
|
||||||
|
protected static long averageBytesPerRecord(HoodieTimeline commitTimeline, int defaultRecordSizeEstimate) {
|
||||||
|
long avgSize = defaultRecordSizeEstimate;
|
||||||
|
try {
|
||||||
|
if (!commitTimeline.empty()) {
|
||||||
|
// Go over the reverse ordered commits to get a more recent estimate of average record size.
|
||||||
|
Iterator<HoodieInstant> instants = commitTimeline.getReverseOrderedInstants().iterator();
|
||||||
|
while (instants.hasNext()) {
|
||||||
|
HoodieInstant instant = instants.next();
|
||||||
|
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
|
||||||
|
.fromBytes(commitTimeline.getInstantDetails(instant).get(), HoodieCommitMetadata.class);
|
||||||
|
long totalBytesWritten = commitMetadata.fetchTotalBytesWritten();
|
||||||
|
long totalRecordsWritten = commitMetadata.fetchTotalRecordsWritten();
|
||||||
|
if (totalBytesWritten > 0 && totalRecordsWritten > 0) {
|
||||||
|
avgSize = (long) Math.ceil((1.0 * totalBytesWritten) / totalRecordsWritten);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (Throwable t) {
|
||||||
|
// make this fail safe.
|
||||||
|
logger.error("Error trying to compute average bytes/record ", t);
|
||||||
|
}
|
||||||
|
return avgSize;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,119 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hudi.table;
|
||||||
|
|
||||||
|
import static org.apache.hudi.common.model.HoodieTestUtils.generateFakeHoodieWriteStat;
|
||||||
|
import static org.apache.hudi.table.HoodieCopyOnWriteTable.averageBytesPerRecord;
|
||||||
|
import static org.junit.Assert.assertEquals;
|
||||||
|
import static org.mockito.Matchers.any;
|
||||||
|
import static org.mockito.Mockito.mock;
|
||||||
|
import static org.mockito.Mockito.when;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.List;
|
||||||
|
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
||||||
|
import org.apache.hudi.common.model.HoodieWriteStat;
|
||||||
|
import org.apache.hudi.common.table.HoodieTimeline;
|
||||||
|
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||||
|
import org.apache.hudi.common.util.Option;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
public class TestHoodieRecordSizing {
|
||||||
|
|
||||||
|
private static List<HoodieInstant> setupHoodieInstants() {
|
||||||
|
List<HoodieInstant> instants = new ArrayList<>();
|
||||||
|
instants.add(new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "ts1"));
|
||||||
|
instants.add(new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "ts2"));
|
||||||
|
instants.add(new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "ts3"));
|
||||||
|
instants.add(new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "ts4"));
|
||||||
|
instants.add(new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "ts5"));
|
||||||
|
Collections.reverse(instants);
|
||||||
|
return instants;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static List<HoodieWriteStat> generateCommitStatWith(int totalRecordsWritten, int totalBytesWritten) {
|
||||||
|
List<HoodieWriteStat> writeStatsList = generateFakeHoodieWriteStat(5);
|
||||||
|
// clear all record and byte stats except for last entry.
|
||||||
|
for (int i = 0; i < writeStatsList.size() - 1; i++) {
|
||||||
|
HoodieWriteStat writeStat = writeStatsList.get(i);
|
||||||
|
writeStat.setNumWrites(0);
|
||||||
|
writeStat.setTotalWriteBytes(0);
|
||||||
|
}
|
||||||
|
HoodieWriteStat lastWriteStat = writeStatsList.get(writeStatsList.size() - 1);
|
||||||
|
lastWriteStat.setTotalWriteBytes(totalBytesWritten);
|
||||||
|
lastWriteStat.setNumWrites(totalRecordsWritten);
|
||||||
|
return writeStatsList;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static HoodieCommitMetadata generateCommitMetadataWith(int totalRecordsWritten, int totalBytesWritten) {
|
||||||
|
List<HoodieWriteStat> fakeHoodieWriteStats = generateCommitStatWith(totalRecordsWritten, totalBytesWritten);
|
||||||
|
HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
|
||||||
|
fakeHoodieWriteStats.forEach(stat -> commitMetadata.addWriteStat(stat.getPartitionPath(), stat));
|
||||||
|
return commitMetadata;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This needs to be a stack so we test all cases when either/both recordsWritten ,bytesWritten is zero before a non
|
||||||
|
* zero averageRecordSize can be computed.
|
||||||
|
*/
|
||||||
|
private static LinkedList<Option<byte[]>> generateCommitMetadataList() throws IOException {
|
||||||
|
LinkedList<Option<byte[]>> commits = new LinkedList<>();
|
||||||
|
// First commit with non zero records and bytes
|
||||||
|
commits.push(Option.of(generateCommitMetadataWith(2000, 10000).toJsonString()
|
||||||
|
.getBytes(StandardCharsets.UTF_8)));
|
||||||
|
// Second commit with non zero records and bytes
|
||||||
|
commits.push(Option.of(generateCommitMetadataWith(1500, 7500).toJsonString()
|
||||||
|
.getBytes(StandardCharsets.UTF_8)));
|
||||||
|
// Third commit with both zero records and zero bytes
|
||||||
|
commits.push(Option.of(generateCommitMetadataWith(0, 0).toJsonString()
|
||||||
|
.getBytes(StandardCharsets.UTF_8)));
|
||||||
|
// Fourth commit with zero records
|
||||||
|
commits.push(Option.of(generateCommitMetadataWith(0, 1500).toJsonString()
|
||||||
|
.getBytes(StandardCharsets.UTF_8)));
|
||||||
|
// Fifth commit with zero bytes
|
||||||
|
commits.push(Option.of(generateCommitMetadataWith(2500, 0).toJsonString()
|
||||||
|
.getBytes(StandardCharsets.UTF_8)));
|
||||||
|
return commits;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testAverageBytesPerRecordForNonEmptyCommitTimeLine() throws Exception {
|
||||||
|
HoodieTimeline commitTimeLine = mock(HoodieTimeline.class);
|
||||||
|
when(commitTimeLine.empty()).thenReturn(false);
|
||||||
|
when(commitTimeLine.getReverseOrderedInstants()).thenReturn(setupHoodieInstants().stream());
|
||||||
|
LinkedList<Option<byte[]>> commits = generateCommitMetadataList();
|
||||||
|
when(commitTimeLine.getInstantDetails(any(HoodieInstant.class))).thenAnswer(invocationOnMock -> commits.pop());
|
||||||
|
long expectAvgSize = (long) Math.ceil((1.0 * 7500) / 1500);
|
||||||
|
long actualAvgSize = averageBytesPerRecord(commitTimeLine, 1234);
|
||||||
|
assertEquals(expectAvgSize, actualAvgSize);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testAverageBytesPerRecordForEmptyCommitTimeLine() {
|
||||||
|
HoodieTimeline commitTimeLine = mock(HoodieTimeline.class);
|
||||||
|
when(commitTimeLine.empty()).thenReturn(true);
|
||||||
|
long expectAvgSize = 2345;
|
||||||
|
long actualAvgSize = averageBytesPerRecord(commitTimeLine, 2345);
|
||||||
|
assertEquals(expectAvgSize, actualAvgSize);
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -187,6 +187,12 @@ public interface HoodieTimeline extends Serializable {
|
|||||||
*/
|
*/
|
||||||
Stream<HoodieInstant> getInstants();
|
Stream<HoodieInstant> getInstants();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return Get the stream of completed instants in reverse order
|
||||||
|
* TODO Change code references to getInstants() that reverse the instants later on to use this method instead.
|
||||||
|
*/
|
||||||
|
Stream<HoodieInstant> getReverseOrderedInstants();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @return true if the passed in instant is before the first completed instant in the timeline
|
* @return true if the passed in instant is before the first completed instant in the timeline
|
||||||
*/
|
*/
|
||||||
|
|||||||
@@ -18,6 +18,8 @@
|
|||||||
|
|
||||||
package org.apache.hudi.common.table.timeline;
|
package org.apache.hudi.common.table.timeline;
|
||||||
|
|
||||||
|
import static java.util.Collections.reverse;
|
||||||
|
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
import java.security.MessageDigest;
|
import java.security.MessageDigest;
|
||||||
import java.security.NoSuchAlgorithmException;
|
import java.security.NoSuchAlgorithmException;
|
||||||
@@ -195,6 +197,13 @@ public class HoodieDefaultTimeline implements HoodieTimeline {
|
|||||||
return instants.stream();
|
return instants.stream();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Stream<HoodieInstant> getReverseOrderedInstants() {
|
||||||
|
List<HoodieInstant> instants = getInstants().collect(Collectors.toList());
|
||||||
|
reverse(instants);
|
||||||
|
return instants.stream();
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean isBeforeTimelineStarts(String instant) {
|
public boolean isBeforeTimelineStarts(String instant) {
|
||||||
Option<HoodieInstant> firstCommit = firstInstant();
|
Option<HoodieInstant> firstCommit = firstInstant();
|
||||||
|
|||||||
Reference in New Issue
Block a user