[HUDI-1863] Add rate limiter to Flink writer to avoid OOM for bootstrap (#2891)

2021-04-29 20:32:10 +08:00
parent c9bcb5e33f
commit 6e9c5dd765
7 changed files with 177 additions and 167 deletions
--- a/hudi-flink/src/main/java/org/apache/hudi/configuration/FlinkOptions.java
+++ b/hudi-flink/src/main/java/org/apache/hudi/configuration/FlinkOptions.java
@@ -267,20 +267,20 @@ public class FlinkOptions {
      .defaultValue(4)
      .withDescription("Parallelism of tasks that do actual write, default is 4");

-  public static final ConfigOption<Double> WRITE_BUFFER_SIZE = ConfigOptions
-      .key("write.buffer.size.MB")
-      .doubleType()
-      .defaultValue(256D) // 256MB
-      .withDescription("Total buffer size in MB to flush data into the underneath filesystem, default 256MB");
-
-  public static final ConfigOption<Double> WRITE_BUCKET_SIZE = ConfigOptions
-      .key("write.bucket.size.MB")
+  public static final ConfigOption<Double> WRITE_BATCH_SIZE = ConfigOptions
+      .key("write.batch.size")
      .doubleType()
      .defaultValue(64D) // 64MB
-      .withDescription("Bucket size in MB to flush data into the underneath filesystem, default 64MB");
+      .withDescription("Batch buffer size in MB to flush data into the underneath filesystem, default 64MB");
+
+  public static final ConfigOption<Long> WRITE_RATE_LIMIT = ConfigOptions
+      .key("write.rate.limit")
+      .longType()
+      .defaultValue(-1L) // default no limit
+      .withDescription("Write records rate limit per second to reduce risk of OOM, default -1 (no limit)");

  public static final ConfigOption<Integer> WRITE_LOG_BLOCK_SIZE = ConfigOptions
-      .key("write.log_block.size.MB")
+      .key("write.log_block.size")
      .intType()
      .defaultValue(128)
      .withDescription("Max log block size in MB for log file, default 128MB");
--- a/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteFunction.java
+++ b/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteFunction.java
@@ -41,21 +41,18 @@ import org.apache.flink.runtime.state.FunctionSnapshotContext;
 import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction;
 import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
 import org.apache.flink.util.Collector;
-import org.jetbrains.annotations.NotNull;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collections;
-import java.util.Comparator;
 import java.util.HashMap;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Random;
 import java.util.function.BiFunction;
-import java.util.stream.Collectors;

 /**
 * Sink function to write the data to the underneath filesystem.
@@ -63,8 +60,7 @@ import java.util.stream.Collectors;
 * <p><h2>Work Flow</h2>
 *
 * <p>The function firstly buffers the data as a batch of {@link HoodieRecord}s,
- * It flushes(write) the records bucket when the bucket size exceeds the configured threshold {@link FlinkOptions#WRITE_BUCKET_SIZE}
- * or the whole data buffer size exceeds the configured threshold {@link FlinkOptions#WRITE_BUFFER_SIZE}
+ * It flushes (writes) the record batch when the batch exceeds the configured size {@link FlinkOptions#WRITE_BATCH_SIZE}
 * or a Flink checkpoint starts. After a batch has been written successfully,
 * the function notifies its operator coordinator {@link StreamWriteOperatorCoordinator} to mark a successful write.
 *
@@ -102,11 +98,6 @@ public class StreamWriteFunction<K, I, O>

  private static final Logger LOG = LoggerFactory.getLogger(StreamWriteFunction.class);

-  /**
-   * Write buffer size detector.
-   */
-  private transient BufferSizeDetector detector;
-
  /**
   * Write buffer as buckets for a checkpoint. The key is bucket ID.
   */
@@ -232,7 +223,6 @@ public class StreamWriteFunction<K, I, O>
  // -------------------------------------------------------------------------

  private void initBuffer() {
-    this.detector = new BufferSizeDetector(this.config.getDouble(FlinkOptions.WRITE_BUFFER_SIZE));
    this.buckets = new LinkedHashMap<>();
  }

@@ -259,49 +249,18 @@ public class StreamWriteFunction<K, I, O>
  /**
   * Data bucket.
   */
-  private static class DataBucket implements Comparable<DataBucket> {
+  private static class DataBucket {
    private final List<HoodieRecord> records;
-    private final BucketSizeTracer tracer;
+    private final BufferSizeDetector detector;

    private DataBucket(Double batchSize) {
      this.records = new ArrayList<>();
-      this.tracer = new BucketSizeTracer(batchSize);
+      this.detector = new BufferSizeDetector(batchSize);
    }

    public void reset() {
      this.records.clear();
-      this.tracer.reset();
-    }
-
-    @Override
-    public int compareTo(@NotNull DataBucket other) {
-      return Double.compare(tracer.threshold, other.tracer.threshold);
-    }
-  }
-
-  /**
-   * Tool to detect if to flush out the existing bucket.
-   */
-  private static class BucketSizeTracer {
-    private final double threshold;
-
-    private long totalSize = 0L;
-
-    BucketSizeTracer(double bucketSizeMb) {
-      this.threshold = bucketSizeMb * 1024 * 1024;
-    }
-
-    /**
-     * Trace the bucket size with given record size,
-     * returns true if the bucket size exceeds specified threshold.
-     */
-    boolean trace(long recordSize) {
-      totalSize += recordSize;
-      return totalSize > this.threshold;
-    }
-
-    void reset() {
-      this.totalSize = 0L;
+      this.detector.reset();
    }
  }

@@ -313,13 +272,13 @@ public class StreamWriteFunction<K, I, O>
    private final Random random = new Random(47);
    private static final int DENOMINATOR = 100;

-    private final double threshold;
+    private final double batchSizeBytes;

    private long lastRecordSize = -1L;
    private long totalSize = 0L;

    BufferSizeDetector(double batchSizeMb) {
-      this.threshold = batchSizeMb * 1024 * 1024;
+      this.batchSizeBytes = batchSizeMb * 1024 * 1024;
    }

    boolean detect(Object record) {
@@ -327,7 +286,7 @@ public class StreamWriteFunction<K, I, O>
        lastRecordSize = ObjectSizeCalculator.getObjectSize(record);
      }
      totalSize += lastRecordSize;
-      return totalSize > this.threshold;
+      return totalSize > this.batchSizeBytes;
    }

    boolean sampling() {
@@ -339,10 +298,6 @@ public class StreamWriteFunction<K, I, O>
      this.lastRecordSize = -1L;
      this.totalSize = 0L;
    }
-
-    public void countDown(long bucketSize) {
-      this.totalSize -= bucketSize;
-    }
  }

  /**
@@ -357,49 +312,19 @@ public class StreamWriteFunction<K, I, O>
  /**
   * Buffers the given record.
   *
-   * <p>Flush the data bucket first if one of the condition meets:
-   *
-   * <ul>
-   *   <li>The bucket size is greater than the configured value {@link FlinkOptions#WRITE_BUCKET_SIZE}.</li>
-   *   <li>Flush half of the data buckets if the whole buffer size
-   *   exceeds the configured threshold {@link FlinkOptions#WRITE_BUFFER_SIZE}.</li>
-   * </ul>
+   * <p>Flush the data bucket first if the bucket records size is greater than
+   * the configured value {@link FlinkOptions#WRITE_BATCH_SIZE}.
   *
   * @param value HoodieRecord
   */
  private void bufferRecord(I value) {
-    boolean flushBuffer = detector.detect(value);
-    if (flushBuffer) {
-      List<DataBucket> sortedBuckets = this.buckets.values().stream()
-          .filter(b -> b.records.size() > 0)
-          .sorted(Comparator.comparingLong(b -> b.tracer.totalSize))
-          .collect(Collectors.toList());
-      // flush half bytes size of buckets to avoid flushing too small buckets
-      // which cause small files.
-      long totalSize = detector.totalSize;
-      long flushedBytes = 0;
-      for (DataBucket bucket : sortedBuckets) {
-        final long bucketSize = bucket.tracer.totalSize;
-        flushBucket(bucket);
-        detector.countDown(bucketSize);
-        bucket.reset();
-
-        flushedBytes += bucketSize;
-        if (flushedBytes > detector.totalSize / 2) {
-          break;
-        }
-      }
-      LOG.info("Flush {} bytes data buckets because the total buffer size {} bytes exceeds the threshold {} bytes",
-          flushedBytes, totalSize, detector.threshold);
-    }
    final String bucketID = getBucketID(value);

    DataBucket bucket = this.buckets.computeIfAbsent(bucketID,
-        k -> new DataBucket(this.config.getDouble(FlinkOptions.WRITE_BUCKET_SIZE)));
-    boolean flushBucket = bucket.tracer.trace(detector.lastRecordSize);
-    if (flushBucket) {
+        k -> new DataBucket(this.config.getDouble(FlinkOptions.WRITE_BATCH_SIZE)));
+    boolean needFlush = bucket.detector.detect(value);
+    if (needFlush) {
      flushBucket(bucket);
-      detector.countDown(bucket.tracer.totalSize);
      bucket.reset();
    }
    bucket.records.add((HoodieRecord<?>) value);
@@ -465,7 +390,6 @@ public class StreamWriteFunction<K, I, O>
        .build();
    this.eventGateway.sendEventToCoordinator(event);
    this.buckets.clear();
-    this.detector.reset();
    this.currentInstant = "";
  }
 }
--- a/hudi-flink/src/main/java/org/apache/hudi/sink/transform/RowDataToHoodieFunction.java
+++ b/hudi-flink/src/main/java/org/apache/hudi/sink/transform/RowDataToHoodieFunction.java
@@ -44,6 +44,8 @@ import javax.annotation.Nullable;
 import java.io.IOException;
 import java.io.Serializable;
 import java.lang.reflect.Constructor;
+import java.util.Random;
+import java.util.concurrent.TimeUnit;

 /**
 * Function that transforms RowData to HoodieRecord.
@@ -80,6 +82,12 @@ public class RowDataToHoodieFunction<I extends RowData, O extends HoodieRecord<?
   */
  private final Configuration config;

+  /**
+   * Rate limit per second for this task.
+   * The task sleeps for a little while when the consuming rate exceeds the threshold.
+   */
+  private transient RateLimiter rateLimiter;
+
  public RowDataToHoodieFunction(RowType rowType, Configuration config) {
    this.rowType = rowType;
    this.config = config;
@@ -92,12 +100,30 @@ public class RowDataToHoodieFunction<I extends RowData, O extends HoodieRecord<?
    this.converter = RowDataToAvroConverters.createConverter(this.rowType);
    this.keyGenerator = StreamerUtil.createKeyGenerator(FlinkOptions.flatOptions(this.config));
    this.payloadCreation = PayloadCreation.instance(config);
+    long totalLimit = this.config.getLong(FlinkOptions.WRITE_RATE_LIMIT);
+    if (totalLimit > 0) {
+      this.rateLimiter = new RateLimiter(totalLimit / getRuntimeContext().getNumberOfParallelSubtasks());
+    }
  }

  @SuppressWarnings("unchecked")
  @Override
  public O map(I i) throws Exception {
-    return (O) toHoodieRecord(i);
+    if (rateLimiter != null) {
+      final O hoodieRecord;
+      if (rateLimiter.sampling()) {
+        long startTime = System.currentTimeMillis();
+        hoodieRecord = (O) toHoodieRecord(i);
+        long endTime = System.currentTimeMillis();
+        rateLimiter.processTime(endTime - startTime);
+      } else {
+        hoodieRecord = (O) toHoodieRecord(i);
+      }
+      rateLimiter.sleepIfNeeded();
+      return hoodieRecord;
+    } else {
+      return (O) toHoodieRecord(i);
+    }
  }

  /**
@@ -165,4 +191,43 @@ public class RowDataToHoodieFunction<I extends RowData, O extends HoodieRecord<?
      }
    }
  }
+
+  // -------------------------------------------------------------------------
+  //  Inner Class
+  // -------------------------------------------------------------------------
+
+  /**
+   * Tool to decide whether to limit the processing rate.
+   * Samples records with a probability of 0.01 (1 percent) to measure the process time.
+   */
+  private static class RateLimiter {
+    private final Random random = new Random(47);
+    private static final int DENOMINATOR = 100;
+
+    private final long maxProcessTime;
+
+    private long processTime = -1L;
+    private long timeToSleep = -1;
+
+    RateLimiter(long rate) {
+      ValidationUtils.checkArgument(rate > 0, "rate should be positive");
+      this.maxProcessTime = 1000 / rate;
+    }
+
+    void processTime(long processTime) {
+      this.processTime = processTime;
+      this.timeToSleep = maxProcessTime - processTime;
+    }
+
+    boolean sampling() {
+      // sample with probability 0.01, i.e. about 1 in 100 records
+      return processTime == -1 || random.nextInt(DENOMINATOR) == 1;
+    }
+
+    void sleepIfNeeded() throws Exception {
+      if (timeToSleep > 0) {
+        TimeUnit.MILLISECONDS.sleep(timeToSleep);
+      }
+    }
+  }
 }
--- a/hudi-flink/src/main/java/org/apache/hudi/source/StreamReadOperator.java
+++ b/hudi-flink/src/main/java/org/apache/hudi/source/StreamReadOperator.java
@@ -80,7 +80,7 @@ public class StreamReadOperator extends AbstractStreamOperator<RowData>
  // them to the executor. This state is used to ensure that only one read task is in that splits queue at a time, so that
  // read tasks do not accumulate ahead of checkpoint tasks. When there is a read task in the queue, this is set to RUNNING.
  // When there are no more files to read, this will be set to IDLE.
-  private transient SplitState currentSplitState;
+  private transient volatile SplitState currentSplitState;

  private StreamReadOperator(MergeOnReadInputFormat format, ProcessingTimeService timeService,
                             MailboxExecutor mailboxExecutor) {
@@ -141,7 +141,7 @@ public class StreamReadOperator extends AbstractStreamOperator<RowData>
  private void enqueueProcessSplits() {
    if (currentSplitState == SplitState.IDLE && !splits.isEmpty()) {
      currentSplitState = SplitState.RUNNING;
-      executor.execute(this::processSplits, this.getClass().getSimpleName());
+      executor.execute(this::processSplits, "process input split");
    }
  }

@@ -155,8 +155,8 @@ public class StreamReadOperator extends AbstractStreamOperator<RowData>
    // This log is important to indicate the consuming process, there is only one log message for one data bucket.
    LOG.info("Processing input split : {}", split);

-    format.open(split);
    try {
+      format.open(split);
      RowData nextElement = null;
      while (!format.reachedEnd()) {
        nextElement = format.nextRecord(nextElement);
--- a/hudi-flink/src/test/java/org/apache/hudi/sink/TestWriteCopyOnWrite.java
+++ b/hudi-flink/src/test/java/org/apache/hudi/sink/TestWriteCopyOnWrite.java
@@ -378,7 +378,7 @@ public class TestWriteCopyOnWrite {
  @Test
  public void testInsertWithMiniBatches() throws Exception {
    // reset the config option
-    conf.setDouble(FlinkOptions.WRITE_BUCKET_SIZE, 0.001); // 1Kb batch size
+    conf.setDouble(FlinkOptions.WRITE_BATCH_SIZE, 0.001); // 1Kb batch size
    funcWrapper = new StreamWriteFunctionWrapper<>(tempFile.getAbsolutePath(), conf);

    // open the function and ingest data
@@ -436,68 +436,6 @@ public class TestWriteCopyOnWrite {
    checkWrittenData(tempFile, expected, 1);
  }

-  @Test
-  public void testInsertWithSmallBuffer() throws Exception {
-    // reset the config option
-    conf.setDouble(FlinkOptions.WRITE_BUFFER_SIZE, 0.001); // 1Kb buffer size
-    funcWrapper = new StreamWriteFunctionWrapper<>(tempFile.getAbsolutePath(), conf);
-
-    // open the function and ingest data
-    funcWrapper.openFunction();
-    // each record is 424 bytes. so 3 records expect to trigger buffer flush:
-    // flush half of the buckets once at a time.
-    for (RowData rowData : TestData.DATA_SET_INSERT_DUPLICATES) {
-      funcWrapper.invoke(rowData);
-    }
-
-    Map<String, List<HoodieRecord>> dataBuffer = funcWrapper.getDataBuffer();
-    assertThat("Should have 1 data bucket", dataBuffer.size(), is(1));
-    assertThat("4 records expect to flush out as a mini-batch",
-        dataBuffer.values().stream().findFirst().map(List::size).orElse(-1),
-        is(1));
-
-    // this triggers the data write and event send
-    funcWrapper.checkpointFunction(1);
-    dataBuffer = funcWrapper.getDataBuffer();
-    assertThat("All data should be flushed out", dataBuffer.size(), is(0));
-
-    for (int i = 0; i < 3; i++) {
-      final OperatorEvent event = funcWrapper.getNextEvent(); // remove the first event first
-      assertThat("The operator expect to send an event", event, instanceOf(BatchWriteSuccessEvent.class));
-      funcWrapper.getCoordinator().handleEventFromOperator(0, event);
-    }
-    assertNotNull(funcWrapper.getEventBuffer()[0], "The coordinator missed the event");
-
-    String instant = funcWrapper.getWriteClient()
-        .getLastPendingInstant(getTableType());
-
-    funcWrapper.checkpointComplete(1);
-
-    Map<String, String> expected = getMiniBatchExpected();
-    checkWrittenData(tempFile, expected, 1);
-
-    // started a new instant already
-    checkInflightInstant(funcWrapper.getWriteClient());
-    checkInstantState(funcWrapper.getWriteClient(), HoodieInstant.State.COMPLETED, instant);
-
-    // insert duplicates again
-    for (RowData rowData : TestData.DATA_SET_INSERT_DUPLICATES) {
-      funcWrapper.invoke(rowData);
-    }
-
-    funcWrapper.checkpointFunction(2);
-
-    for (int i = 0; i < 3; i++) {
-      final OperatorEvent event = funcWrapper.getNextEvent(); // remove the first event first
-      funcWrapper.getCoordinator().handleEventFromOperator(0, event);
-    }
-
-    funcWrapper.checkpointComplete(2);
-
-    // Same the original base file content.
-    checkWrittenData(tempFile, expected, 1);
-  }
-
  Map<String, String> getMiniBatchExpected() {
    Map<String, String> expected = new HashMap<>();
    expected.put("par1", "[id1,par1,id1,Danny,23,1,par1, "
--- a/hudi-flink/src/test/java/org/apache/hudi/sink/transform/TestRowDataToHoodieFunction.java
+++ b/hudi-flink/src/test/java/org/apache/hudi/sink/transform/TestRowDataToHoodieFunction.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.sink.transform;
+
+import org.apache.hudi.configuration.FlinkOptions;
+import org.apache.hudi.sink.utils.MockStreamingRuntimeContext;
+import org.apache.hudi.utils.TestConfigurations;
+import org.apache.hudi.utils.TestData;
+
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.table.data.RowData;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import java.io.File;
+
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ * Test cases for {@link RowDataToHoodieFunction}.
+ */
+public class TestRowDataToHoodieFunction {
+  @TempDir
+  File tempFile;
+
+  private Configuration conf;
+
+  @BeforeEach
+  public void before() {
+    final String basePath = tempFile.getAbsolutePath();
+    conf = TestConfigurations.getDefaultConf(basePath);
+  }
+
+  @Test
+  void testRateLimit() throws Exception {
+    // at most 100 records per second
+    RowDataToHoodieFunction<RowData, ?> func1 = getFunc(100);
+    long instant1 = System.currentTimeMillis();
+    for (RowData rowData : TestData.DATA_SET_INSERT_DUPLICATES) {
+      func1.map(rowData);
+    }
+    long instant2 = System.currentTimeMillis();
+    long processTime1 = instant2 - instant1;
+
+    // at most 1 record per second
+    RowDataToHoodieFunction<RowData, ?> func2 = getFunc(1);
+    long instant3 = System.currentTimeMillis();
+    for (RowData rowData : TestData.DATA_SET_INSERT_DUPLICATES) {
+      func2.map(rowData);
+    }
+    long instant4 = System.currentTimeMillis();
+    long processTime2 = instant4 - instant3;
+
+    assertTrue(processTime2 > processTime1, "lower rate should have longer process time");
+    assertTrue(processTime2 > 5000, "should process at least 5 seconds");
+  }
+
+  private RowDataToHoodieFunction<RowData, ?> getFunc(long rate) throws Exception {
+    conf.setLong(FlinkOptions.WRITE_RATE_LIMIT, rate);
+    RowDataToHoodieFunction<RowData, ?> func =
+        new RowDataToHoodieFunction<>(TestConfigurations.ROW_TYPE, conf);
+    func.setRuntimeContext(new MockStreamingRuntimeContext(false, 1, 1));
+    func.open(conf);
+    return func;
+  }
+}
--- a/hudi-flink/src/test/java/org/apache/hudi/table/HoodieDataSourceITCase.java
+++ b/hudi-flink/src/test/java/org/apache/hudi/table/HoodieDataSourceITCase.java
@@ -330,7 +330,7 @@ public class HoodieDataSourceITCase extends AbstractTestBase {
    TableEnvironment tableEnv = execMode == ExecMode.BATCH ? batchTableEnv : streamTableEnv;
    Map<String, String> options = new HashMap<>();
    options.put(FlinkOptions.PATH.key(), tempFile.getAbsolutePath());
-    options.put(FlinkOptions.WRITE_BUCKET_SIZE.key(), "0.001");
+    options.put(FlinkOptions.WRITE_BATCH_SIZE.key(), "0.001");
    String hoodieTableDDL = TestConfigurations.getCreateHoodieTableDDL("t1", options);
    tableEnv.executeSql(hoodieTableDDL);