
[HUDI-1911] Reuse the partition path and file group id for flink write data buffer (#2961)

Reuse the partition path and file group id across buffered records to reduce the memory footprint (see the sketch below).
Danny Chan
2021-05-18 17:47:22 +08:00
committed by GitHub
parent 46a2399a45
commit 7d2971d4e2
5 changed files with 89 additions and 24 deletions
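
The gist of the change: before it, every HoodieRecord sitting in the Flink write buffer carried its own copy of the partition path and file group id strings; after it, those shared fields live once per data bucket and only the per-record fields are buffered. A minimal sketch of that buffering idea in plain Java (DataItem and DataBucket are illustrative names here, not necessarily Hudi's actual class layout in StreamWriteFunction):

import java.util.ArrayList;
import java.util.List;

// Sketch only: per-record fields are buffered in a slim item ...
final class DataItem {
  final String recordKey; // per-record key
  final Object payload;   // per-record data (RowData in the Flink sink)

  DataItem(String recordKey, Object payload) {
    this.recordKey = recordKey;
    this.payload = payload;
  }
}

// ... while the fields shared by the whole bucket are stored exactly once.
final class DataBucket {
  final String partitionPath; // shared by every record in the bucket
  final String fileId;        // shared file group id
  final List<DataItem> items = new ArrayList<>();

  DataBucket(String partitionPath, String fileId) {
    this.partitionPath = partitionPath;
    this.fileId = fileId;
  }

  // Buffering a record copies neither the partition path nor the file id,
  // which is what shrinks the per-record footprint (424 -> 208 bytes per
  // the updated test comments below).
  void add(String recordKey, Object payload) {
    items.add(new DataItem(recordKey, payload));
  }
}

At flush time the full records can be rebuilt by joining each item with its bucket's partitionPath and fileId, so nothing is lost by not storing them per record.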

TestWriteCopyOnWrite.java

@@ -378,21 +378,21 @@ public class TestWriteCopyOnWrite {
   @Test
   public void testInsertWithMiniBatches() throws Exception {
     // reset the config option
-    conf.setDouble(FlinkOptions.WRITE_BATCH_SIZE, 0.001); // 1Kb batch size
+    conf.setDouble(FlinkOptions.WRITE_BATCH_SIZE, 0.0006); // 630 bytes batch size
     funcWrapper = new StreamWriteFunctionWrapper<>(tempFile.getAbsolutePath(), conf);
     // open the function and ingest data
     funcWrapper.openFunction();
-    // Each record is 424 bytes. so 3 records expect to trigger a mini-batch write
+    // Each record is 208 bytes. so 4 records expect to trigger a mini-batch write
     for (RowData rowData : TestData.DATA_SET_INSERT_DUPLICATES) {
       funcWrapper.invoke(rowData);
     }
     Map<String, List<HoodieRecord>> dataBuffer = funcWrapper.getDataBuffer();
     assertThat("Should have 1 data bucket", dataBuffer.size(), is(1));
-    assertThat("2 records expect to flush out as a mini-batch",
+    assertThat("3 records expect to flush out as a mini-batch",
         dataBuffer.values().stream().findFirst().map(List::size).orElse(-1),
-        is(3));
+        is(2));
     // this triggers the data write and event send
     funcWrapper.checkpointFunction(1);
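
Why the numbers line up (taking the byte sizes in the code comments at face value): the new batch size is 0.0006 MB, roughly 630 bytes, and each buffered record now costs 208 bytes, so three records (624 bytes) fit under the threshold and the fourth arrival trips it; the three buffered records flush as a mini-batch and the last two of the five DATA_SET_INSERT_DUPLICATES rows stay in the buffer, hence is(2). With the old 424-byte records and the roughly 1 KB batch size the same logic flushed two and left three buffered, which is why the assertion message and the expected count swap between the old and new lines.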
@@ -439,12 +439,12 @@ public class TestWriteCopyOnWrite {
   @Test
   public void testInsertWithSmallBufferSize() throws Exception {
     // reset the config option
-    conf.setDouble(FlinkOptions.WRITE_TASK_MAX_SIZE, 200.001); // 1Kb buffer size
+    conf.setDouble(FlinkOptions.WRITE_TASK_MAX_SIZE, 200.0006); // 630 bytes buffer size
     funcWrapper = new StreamWriteFunctionWrapper<>(tempFile.getAbsolutePath(), conf);
     // open the function and ingest data
     funcWrapper.openFunction();
-    // each record is 424 bytes. so 3 records expect to trigger buffer flush:
+    // each record is 208 bytes. so 4 records expect to trigger buffer flush:
     // flush the max size bucket once at a time.
     for (RowData rowData : TestData.DATA_SET_INSERT_DUPLICATES) {
       funcWrapper.invoke(rowData);
@@ -452,9 +452,9 @@ public class TestWriteCopyOnWrite {
     Map<String, List<HoodieRecord>> dataBuffer = funcWrapper.getDataBuffer();
     assertThat("Should have 1 data bucket", dataBuffer.size(), is(1));
-    assertThat("2 records expect to flush out as a mini-batch",
+    assertThat("3 records expect to flush out as a mini-batch",
         dataBuffer.values().stream().findFirst().map(List::size).orElse(-1),
-        is(3));
+        is(2));
     // this triggers the data write and event send
     funcWrapper.checkpointFunction(1);
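
The small-buffer test exercises the same arithmetic through WRITE_TASK_MAX_SIZE: assuming the write task reserves 200 MB and treats the remainder as the buffer (which is what the "630 bytes buffer size" comment implies for the value 200.0006), the flush pattern is identical: three records flushed, two left buffered.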
@@ -500,8 +500,9 @@ public class TestWriteCopyOnWrite {
   Map<String, String> getMiniBatchExpected() {
     Map<String, String> expected = new HashMap<>();
-    // the last 3 lines are merged
+    // the last 2 lines are merged
     expected.put("par1", "["
+        + "id1,par1,id1,Danny,23,1,par1, "
         + "id1,par1,id1,Danny,23,1,par1, "
         + "id1,par1,id1,Danny,23,1,par1]");
     return expected;

HoodieDataSourceITCase.java

@@ -401,6 +401,7 @@ public class HoodieDataSourceITCase extends AbstractTestBase {
     Map<String, String> options = new HashMap<>();
     options.put(FlinkOptions.PATH.key(), tempFile.getAbsolutePath());
     options.put(FlinkOptions.INDEX_GLOBAL_ENABLED.key(), "true");
+    options.put(FlinkOptions.INSERT_DROP_DUPS.key(), "true");
     String hoodieTableDDL = TestConfigurations.getCreateHoodieTableDDL("t1", options);
     streamTableEnv.executeSql(hoodieTableDDL);
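
For context on the one added line: FlinkOptions.INSERT_DROP_DUPS presumably mirrors the datasource writer's drop-duplicates-on-insert switch, so together with the global index the test ingests duplicate keys and expects them to be deduplicated on insert; the assertions that rely on it fall outside this excerpt.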