1
0

HUDI-494 fix incorrect record size estimation

This commit is contained in:
garyli1019
2020-05-14 20:20:44 -07:00
committed by vinoth chandar
parent 9e07cebece
commit 22cd824d99
8 changed files with 125 additions and 155 deletions

View File

@@ -1041,10 +1041,12 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
HoodieWriteConfig.Builder builder = getConfigBuilder(useNullSchema ? NULL_SCHEMA : TRIP_EXAMPLE_SCHEMA);
return builder
.withCompactionConfig(
HoodieCompactionConfig.newBuilder().compactionSmallFileSize(HoodieTestDataGenerator.SIZE_PER_RECORD * 15)
.insertSplitSize(insertSplitSize).build()) // tolerate upto 15 records
HoodieCompactionConfig.newBuilder()
.compactionSmallFileSize(dataGen.getEstimatedFileSizeInBytes(150))
.insertSplitSize(insertSplitSize).build())
.withStorageConfig(
HoodieStorageConfig.newBuilder().limitFileSize(HoodieTestDataGenerator.SIZE_PER_RECORD * 20).build())
HoodieStorageConfig.newBuilder()
.limitFileSize(dataGen.getEstimatedFileSizeInBytes(200)).build())
.build();
}
}

View File

@@ -1,116 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.table;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieWriteStat;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.util.Option;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import static org.apache.hudi.common.testutils.HoodieTestUtils.generateFakeHoodieWriteStat;
import static org.apache.hudi.table.HoodieCopyOnWriteTable.averageBytesPerRecord;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
/**
 * Unit tests for {@code HoodieCopyOnWriteTable.averageBytesPerRecord}: verifies that the
 * average record size is computed from the most recent commit that has BOTH non-zero
 * records written and non-zero bytes written, and that the caller-supplied default is
 * returned when the commit timeline is empty.
 *
 * NOTE(review): this diff removes the whole file; equivalent coverage moves into
 * TestUpsertPartitioner in the same commit.
 */
public class TestHoodieRecordSizing {
// Five completed commits, returned newest-first to mirror
// HoodieTimeline#getReverseOrderedInstants.
private static List<HoodieInstant> setupHoodieInstants() {
List<HoodieInstant> instants = new ArrayList<>();
instants.add(new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "ts1"));
instants.add(new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "ts2"));
instants.add(new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "ts3"));
instants.add(new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "ts4"));
instants.add(new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "ts5"));
Collections.reverse(instants);
return instants;
}
// Builds five fake write stats where only the LAST entry carries the given record/byte
// counts; the rest are zeroed so the aggregate totals are exactly the arguments.
private static List<HoodieWriteStat> generateCommitStatWith(int totalRecordsWritten, int totalBytesWritten) {
List<HoodieWriteStat> writeStatsList = generateFakeHoodieWriteStat(5);
// clear all record and byte stats except for last entry.
for (int i = 0; i < writeStatsList.size() - 1; i++) {
HoodieWriteStat writeStat = writeStatsList.get(i);
writeStat.setNumWrites(0);
writeStat.setTotalWriteBytes(0);
}
HoodieWriteStat lastWriteStat = writeStatsList.get(writeStatsList.size() - 1);
lastWriteStat.setTotalWriteBytes(totalBytesWritten);
lastWriteStat.setNumWrites(totalRecordsWritten);
return writeStatsList;
}
// Wraps the fake write stats for the given totals into a HoodieCommitMetadata.
private static HoodieCommitMetadata generateCommitMetadataWith(int totalRecordsWritten, int totalBytesWritten) {
List<HoodieWriteStat> fakeHoodieWriteStats = generateCommitStatWith(totalRecordsWritten, totalBytesWritten);
HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
fakeHoodieWriteStats.forEach(stat -> commitMetadata.addWriteStat(stat.getPartitionPath(), stat));
return commitMetadata;
}
/*
 * This needs to be a stack so we test all cases when either/both recordsWritten ,bytesWritten is zero before a non
 * zero averageRecordSize can be computed.
 */
private static LinkedList<Option<byte[]>> generateCommitMetadataList() throws IOException {
LinkedList<Option<byte[]>> commits = new LinkedList<>();
// First commit with non zero records and bytes
commits.push(Option.of(generateCommitMetadataWith(2000, 10000).toJsonString().getBytes(StandardCharsets.UTF_8)));
// Second commit with non zero records and bytes
commits.push(Option.of(generateCommitMetadataWith(1500, 7500).toJsonString().getBytes(StandardCharsets.UTF_8)));
// Third commit with both zero records and zero bytes
commits.push(Option.of(generateCommitMetadataWith(0, 0).toJsonString().getBytes(StandardCharsets.UTF_8)));
// Fourth commit with zero records
commits.push(Option.of(generateCommitMetadataWith(0, 1500).toJsonString().getBytes(StandardCharsets.UTF_8)));
// Fifth commit with zero bytes
commits.push(Option.of(generateCommitMetadataWith(2500, 0).toJsonString().getBytes(StandardCharsets.UTF_8)));
return commits;
}
@Test
public void testAverageBytesPerRecordForNonEmptyCommitTimeLine() throws Exception {
HoodieTimeline commitTimeLine = mock(HoodieTimeline.class);
when(commitTimeLine.empty()).thenReturn(false);
when(commitTimeLine.getReverseOrderedInstants()).thenReturn(setupHoodieInstants().stream());
LinkedList<Option<byte[]>> commits = generateCommitMetadataList();
// Each getInstantDetails call pops the next commit off the stack, newest first; the
// zero-record/zero-byte commits must be skipped before the 1500-record commit is used.
when(commitTimeLine.getInstantDetails(any(HoodieInstant.class))).thenAnswer(invocationOnMock -> commits.pop());
// Expected average comes from the newest commit with non-zero records AND bytes: 7500 / 1500.
long expectAvgSize = (long) Math.ceil((1.0 * 7500) / 1500);
long actualAvgSize = averageBytesPerRecord(commitTimeLine, 1234);
assertEquals(expectAvgSize, actualAvgSize);
}
@Test
public void testAverageBytesPerRecordForEmptyCommitTimeLine() {
HoodieTimeline commitTimeLine = mock(HoodieTimeline.class);
when(commitTimeLine.empty()).thenReturn(true);
// With no commits to average over, the supplied default (2345) must be returned as-is.
long expectAvgSize = 2345;
long actualAvgSize = averageBytesPerRecord(commitTimeLine, 2345);
assertEquals(expectAvgSize, actualAvgSize);
}
}

View File

@@ -18,9 +18,13 @@
package org.apache.hudi.table.action.commit;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordLocation;
import org.apache.hudi.common.model.HoodieWriteStat;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.util.FileIOUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieCompactionConfig;
@@ -37,12 +41,21 @@ import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import scala.Tuple2;
import static org.apache.hudi.common.testutils.HoodieTestUtils.generateFakeHoodieWriteStat;
import static org.apache.hudi.table.action.commit.UpsertPartitioner.averageBytesPerRecord;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
public class TestUpsertPartitioner extends HoodieClientTestBase {
@@ -79,6 +92,84 @@ public class TestUpsertPartitioner extends HoodieClientTestBase {
return partitioner;
}
/**
 * Builds five completed commit instants (ts1..ts5) and returns them newest-first,
 * mirroring the ordering of {@code HoodieTimeline#getReverseOrderedInstants}.
 */
private static List<HoodieInstant> setupHoodieInstants() {
  List<HoodieInstant> instants = new ArrayList<>();
  // Counting down yields ts5..ts1 directly, equivalent to adding in order and reversing.
  for (int ts = 5; ts >= 1; ts--) {
    instants.add(new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "ts" + ts));
  }
  return instants;
}
/**
 * Produces five fake write stats where only the final entry carries the given record
 * and byte counts; all earlier entries are zeroed, so the aggregate totals are exactly
 * the supplied arguments.
 */
private static List<HoodieWriteStat> generateCommitStatWith(int totalRecordsWritten, int totalBytesWritten) {
  List<HoodieWriteStat> stats = generateFakeHoodieWriteStat(5);
  int lastIdx = stats.size() - 1;
  for (int idx = 0; idx < stats.size(); idx++) {
    HoodieWriteStat stat = stats.get(idx);
    if (idx == lastIdx) {
      // Only the last entry holds the requested totals.
      stat.setTotalWriteBytes(totalBytesWritten);
      stat.setNumWrites(totalRecordsWritten);
    } else {
      stat.setNumWrites(0);
      stat.setTotalWriteBytes(0);
    }
  }
  return stats;
}
/** Wraps the fake write stats for the given totals into a {@link HoodieCommitMetadata}. */
private static HoodieCommitMetadata generateCommitMetadataWith(int totalRecordsWritten, int totalBytesWritten) {
  HoodieCommitMetadata metadata = new HoodieCommitMetadata();
  for (HoodieWriteStat stat : generateCommitStatWith(totalRecordsWritten, totalBytesWritten)) {
    metadata.addWriteStat(stat.getPartitionPath(), stat);
  }
  return metadata;
}
/*
 * Returned as a stack (LIFO): the mocked timeline pops entries newest-first, so we
 * exercise every case where recordsWritten and/or bytesWritten is zero before a
 * non-zero averageRecordSize can be computed.
 */
private static LinkedList<Option<byte[]>> generateCommitMetadataList() throws IOException {
  // {recordsWritten, bytesWritten} per commit, oldest first; pushing reverses the order.
  int[][] commitStats = {
      {2000, 10000}, // first commit: non-zero records and bytes
      {1500, 7500},  // second commit: non-zero records and bytes
      {100, 500},    // third commit: a small file
      {0, 0},        // fourth commit: zero records and zero bytes
      {0, 1500},     // fifth commit: zero records
      {2500, 0}      // sixth commit: zero bytes
  };
  LinkedList<Option<byte[]>> commits = new LinkedList<>();
  for (int[] stat : commitStats) {
    commits.push(Option.of(
        generateCommitMetadataWith(stat[0], stat[1]).toJsonString().getBytes(StandardCharsets.UTF_8)));
  }
  return commits;
}
@Test
public void testAverageBytesPerRecordForNonEmptyCommitTimeLine() throws Exception {
HoodieTimeline commitTimeLine = mock(HoodieTimeline.class);
// Small-file threshold of 1000 bytes: commits whose files are below it (e.g. the
// 500-byte "small file" commit) should not drive the average.
HoodieWriteConfig config = makeHoodieClientConfigBuilder()
.withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1000).build())
.build();
when(commitTimeLine.empty()).thenReturn(false);
when(commitTimeLine.getReverseOrderedInstants()).thenReturn(setupHoodieInstants().stream());
LinkedList<Option<byte[]>> commits = generateCommitMetadataList();
// Each getInstantDetails call pops the next commit off the stack, newest first; the
// small-file and zero-record/zero-byte commits must all be skipped first.
when(commitTimeLine.getInstantDetails(any(HoodieInstant.class))).thenAnswer(invocationOnMock -> commits.pop());
// Expected average comes from the newest qualifying commit: 7500 bytes / 1500 records.
long expectAvgSize = (long) Math.ceil((1.0 * 7500) / 1500);
long actualAvgSize = averageBytesPerRecord(commitTimeLine, config);
assertEquals(expectAvgSize, actualAvgSize);
}
@Test
public void testAverageBytesPerRecordForEmptyCommitTimeLine() throws Exception {
  HoodieWriteConfig config = makeHoodieClientConfigBuilder().build();
  HoodieTimeline emptyTimeline = mock(HoodieTimeline.class);
  when(emptyTimeline.empty()).thenReturn(true);
  // With no commits to average over, the configured estimate must be returned unchanged.
  assertEquals(config.getCopyOnWriteRecordSizeEstimate(),
      averageBytesPerRecord(emptyTimeline, config));
}
@Test
public void testUpsertPartitioner() throws Exception {
final String testPartitionPath = "2016/09/26";

View File

@@ -70,7 +70,9 @@ import java.util.stream.Stream;
public class HoodieTestDataGenerator {
// based on examination of sample file, the schema produces the following per record size
public static final int SIZE_PER_RECORD = 50 * 1024;
public static final int BYTES_PER_RECORD = (int) (1.2 * 1024);
// with default bloom filter with 60,000 entries and 0.000000001 FPRate
public static final int BLOOM_FILTER_BYTES = 323495;
private static Logger logger = LogManager.getLogger(HoodieTestDataGenerator.class);
public static final String DEFAULT_FIRST_PARTITION_PATH = "2016/03/15";
public static final String DEFAULT_SECOND_PARTITION_PATH = "2015/03/16";
@@ -144,6 +146,10 @@ public class HoodieTestDataGenerator {
}
}
/**
 * Estimates the on-disk size of a data file holding {@code numOfRecords} records:
 * the per-record payload size plus the fixed bloom-filter footprint.
 *
 * @param numOfRecords number of records expected in the file
 * @return estimated file size in bytes
 * @throws ArithmeticException if the estimate overflows an {@code int}
 */
public int getEstimatedFileSizeInBytes(int numOfRecords) {
  // Math.*Exact surfaces overflow instead of silently wrapping to a negative "size",
  // which would poison small-file-size / file-limit configs derived from this value.
  return Math.addExact(Math.multiplyExact(numOfRecords, BYTES_PER_RECORD), BLOOM_FILTER_BYTES);
}
public TestRawTripPayload generateRandomValueAsPerSchema(String schemaStr, HoodieKey key, String commitTime, boolean isFlattened) throws IOException {
if (TRIP_EXAMPLE_SCHEMA.equals(schemaStr)) {
return generateRandomValue(key, commitTime, isFlattened);