
[HUDI-2209] Bulk insert for flink writer (#3334)

Danny Chan
2021-07-27 10:58:23 +08:00
committed by GitHub
parent 024cf01f02
commit 9d2a65a6a6
26 changed files with 2000 additions and 83 deletions
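
Before the file-by-file diff, a minimal sketch of what this change enables at the SQL layer: writing a Hudi table through Flink's batch TableEnvironment with the write operation set to bulk_insert. The table path and schema below are placeholders modeled on the new IT cases further down, not code from the commit itself.

import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.TableEnvironment;

public class BulkInsertSketch {
  public static void main(String[] args) throws Exception {
    // bulk_insert is a batch-style write path, so a batch TableEnvironment is used.
    TableEnvironment tableEnv = TableEnvironment.create(
        EnvironmentSettings.newInstance().inBatchMode().build());

    // Placeholder path; 'write.operation' = 'bulk_insert' is the option exercised by the new tests.
    tableEnv.executeSql("create table t1(\n"
        + "  uuid varchar(20),\n"
        + "  name varchar(10),\n"
        + "  age int,\n"
        + "  ts timestamp(3),\n"
        + "  `partition` varchar(20),\n"
        + "  PRIMARY KEY(uuid) NOT ENFORCED\n"
        + ")\n"
        + "PARTITIONED BY (`partition`)\n"
        + "with (\n"
        + "  'connector' = 'hudi',\n"
        + "  'path' = '/tmp/hudi_t1',\n"
        + "  'write.operation' = 'bulk_insert'\n"
        + ")");

    // Rows written under bulk_insert go through the new bulk insert write path.
    tableEnv.executeSql("insert into t1 values\n"
        + "('id1','Danny',23,TIMESTAMP '1970-01-01 00:00:01','par1')").await();
  }
}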

View File

@@ -0,0 +1,96 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.sink.bulk;

import org.apache.hudi.configuration.FlinkOptions;
import org.apache.hudi.exception.HoodieKeyException;
import org.apache.hudi.utils.TestConfigurations;

import org.apache.flink.configuration.Configuration;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.data.StringData;
import org.apache.flink.table.data.TimestampData;
import org.junit.jupiter.api.Test;

import static org.apache.hudi.utils.TestData.insertRow;
import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.junit.jupiter.api.Assertions.assertThrows;

/**
 * Test cases for {@link RowDataKeyGen}.
 */
public class TestRowDataKeyGen {
  @Test
  void testSimpleKeyAndPartition() {
    Configuration conf = TestConfigurations.getDefaultConf("path1");
    final RowData rowData1 = insertRow(StringData.fromString("id1"), StringData.fromString("Danny"), 23,
        TimestampData.fromEpochMillis(1), StringData.fromString("par1"));
    final RowDataKeyGen keyGen1 = RowDataKeyGen.instance(conf, TestConfigurations.ROW_TYPE);
    assertThat(keyGen1.getRecordKey(rowData1), is("id1"));
    assertThat(keyGen1.getPartitionPath(rowData1), is("par1"));

    // null record key and partition path
    final RowData rowData2 = insertRow(null, StringData.fromString("Danny"), 23,
        TimestampData.fromEpochMillis(1), null);
    assertThrows(HoodieKeyException.class, () -> keyGen1.getRecordKey(rowData2));
    assertThat(keyGen1.getPartitionPath(rowData2), is("default"));

    // empty record key and partition path
    final RowData rowData3 = insertRow(StringData.fromString(""), StringData.fromString("Danny"), 23,
        TimestampData.fromEpochMillis(1), StringData.fromString(""));
    assertThrows(HoodieKeyException.class, () -> keyGen1.getRecordKey(rowData3));
    assertThat(keyGen1.getPartitionPath(rowData3), is("default"));

    // hive style partitioning
    conf.set(FlinkOptions.HIVE_STYLE_PARTITIONING, true);
    final RowDataKeyGen keyGen2 = RowDataKeyGen.instance(conf, TestConfigurations.ROW_TYPE);
    assertThat(keyGen2.getPartitionPath(rowData1), is("partition=par1"));
    assertThat(keyGen2.getPartitionPath(rowData2), is("partition=default"));
    assertThat(keyGen2.getPartitionPath(rowData3), is("partition=default"));
  }

  @Test
  void testComplexKeyAndPartition() {
    Configuration conf = TestConfigurations.getDefaultConf("path1");
    conf.set(FlinkOptions.RECORD_KEY_FIELD, "uuid,name");
    conf.set(FlinkOptions.PARTITION_PATH_FIELD, "partition,ts");
    RowData rowData1 = insertRow(StringData.fromString("id1"), StringData.fromString("Danny"), 23,
        TimestampData.fromEpochMillis(1), StringData.fromString("par1"));
    RowDataKeyGen keyGen1 = RowDataKeyGen.instance(conf, TestConfigurations.ROW_TYPE);
    assertThat(keyGen1.getRecordKey(rowData1), is("uuid:id1,name:Danny"));
    assertThat(keyGen1.getPartitionPath(rowData1), is("par1/1970-01-01T00:00:00.001"));

    // null record key and partition path
    final RowData rowData2 = insertRow(null, null, 23, null, null);
    assertThrows(HoodieKeyException.class, () -> keyGen1.getRecordKey(rowData2));
    assertThat(keyGen1.getPartitionPath(rowData2), is("default/default"));

    // empty record key and partition path
    final RowData rowData3 = insertRow(StringData.fromString(""), StringData.fromString(""), 23,
        TimestampData.fromEpochMillis(1), StringData.fromString(""));
    assertThrows(HoodieKeyException.class, () -> keyGen1.getRecordKey(rowData3));
    assertThat(keyGen1.getPartitionPath(rowData3), is("default/1970-01-01T00:00:00.001"));

    // hive style partitioning
    conf.set(FlinkOptions.HIVE_STYLE_PARTITIONING, true);
    final RowDataKeyGen keyGen2 = RowDataKeyGen.instance(conf, TestConfigurations.ROW_TYPE);
    assertThat(keyGen2.getPartitionPath(rowData1), is("partition=par1/ts=1970-01-01T00:00:00.001"));
    assertThat(keyGen2.getPartitionPath(rowData2), is("partition=default/ts=default"));
    assertThat(keyGen2.getPartitionPath(rowData3), is("partition=default/ts=1970-01-01T00:00:00.001"));
  }
}
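
The assertions above pin down the key-generation semantics: record keys come from the configured key field(s), null or empty keys raise HoodieKeyException, partition values fall back to "default", and hive-style partitioning prefixes each path segment with its field name. As a rough sketch of driving the generator outside the test harness, with a hand-built RowType standing in for TestConfigurations.ROW_TYPE (the schema here is an assumption based on the test fixture, not code from the commit):

import org.apache.flink.configuration.Configuration;
import org.apache.flink.table.data.GenericRowData;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.data.StringData;
import org.apache.flink.table.data.TimestampData;
import org.apache.flink.table.types.logical.IntType;
import org.apache.flink.table.types.logical.LogicalType;
import org.apache.flink.table.types.logical.RowType;
import org.apache.flink.table.types.logical.TimestampType;
import org.apache.flink.table.types.logical.VarCharType;
import org.apache.hudi.configuration.FlinkOptions;
import org.apache.hudi.sink.bulk.RowDataKeyGen;

public class RowDataKeyGenSketch {
  public static void main(String[] args) {
    // Schema assumed to mirror the test fixture: uuid, name, age, ts, partition.
    RowType rowType = RowType.of(
        new LogicalType[] {new VarCharType(20), new VarCharType(10), new IntType(),
            new TimestampType(3), new VarCharType(20)},
        new String[] {"uuid", "name", "age", "ts", "partition"});

    Configuration conf = new Configuration();
    conf.set(FlinkOptions.RECORD_KEY_FIELD, "uuid");
    conf.set(FlinkOptions.PARTITION_PATH_FIELD, "partition");

    RowDataKeyGen keyGen = RowDataKeyGen.instance(conf, rowType);
    RowData row = GenericRowData.of(StringData.fromString("id1"), StringData.fromString("Danny"),
        23, TimestampData.fromEpochMillis(1), StringData.fromString("par1"));

    // Per the assertions above: record key "id1", partition path "par1".
    System.out.println(keyGen.getRecordKey(row));
    System.out.println(keyGen.getPartitionPath(row));
  }
}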

View File

@@ -25,6 +25,7 @@ import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.util.StreamerUtil;
import org.apache.hudi.utils.TestConfigurations;
import org.apache.hudi.utils.TestData;
import org.apache.hudi.utils.TestSQL;
import org.apache.hudi.utils.TestUtils;
import org.apache.hudi.utils.factory.CollectSinkTableFactory;
@@ -48,6 +49,7 @@ import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.EnumSource;
import org.junit.jupiter.params.provider.MethodSource;
import org.junit.jupiter.params.provider.ValueSource;
import java.io.File;
import java.util.Collection;
@@ -66,7 +68,7 @@ import static org.junit.jupiter.api.Assertions.assertTrue;
/**
* IT cases for Hoodie table source and sink.
*
* <p>
* Note: should add more SQL cases when batch write is supported.
*/
public class HoodieDataSourceITCase extends AbstractTestBase {
@@ -267,17 +269,8 @@ public class HoodieDataSourceITCase extends AbstractTestBase {
}
String hoodieTableDDL = TestConfigurations.getCreateHoodieTableDDL("t1", options);
tableEnv.executeSql(hoodieTableDDL);
String insertInto = "insert into t1 values\n"
+ "('id1','Danny',23,TIMESTAMP '1970-01-01 00:00:01','par1'),\n"
+ "('id2','Stephen',33,TIMESTAMP '1970-01-01 00:00:02','par1'),\n"
+ "('id3','Julian',53,TIMESTAMP '1970-01-01 00:00:03','par2'),\n"
+ "('id4','Fabian',31,TIMESTAMP '1970-01-01 00:00:04','par2'),\n"
+ "('id5','Sophia',18,TIMESTAMP '1970-01-01 00:00:05','par3'),\n"
+ "('id6','Emma',20,TIMESTAMP '1970-01-01 00:00:06','par3'),\n"
+ "('id7','Bob',44,TIMESTAMP '1970-01-01 00:00:07','par4'),\n"
+ "('id8','Han',56,TIMESTAMP '1970-01-01 00:00:08','par4')";
execInsertSql(tableEnv, insertInto);
execInsertSql(tableEnv, TestSQL.INSERT_T1);
List<Row> result1 = CollectionUtil.iterableToList(
() -> tableEnv.sqlQuery("select * from t1").execute().collect());
@@ -296,40 +289,40 @@ public class HoodieDataSourceITCase extends AbstractTestBase {
void testWriteAndReadParMiddle(ExecMode execMode) throws Exception {
boolean streaming = execMode == ExecMode.STREAM;
String hoodieTableDDL = "create table t1(\n"
+ " uuid varchar(20),\n"
+ " name varchar(10),\n"
+ " age int,\n"
+ " `partition` varchar(20),\n" // test streaming read with partition field in the middle
+ " ts timestamp(3),\n"
+ " PRIMARY KEY(uuid) NOT ENFORCED\n"
+ ")\n"
+ "PARTITIONED BY (`partition`)\n"
+ "with (\n"
+ " 'connector' = 'hudi',\n"
+ " 'path' = '" + tempFile.getAbsolutePath() + "',\n"
+ " 'read.streaming.enabled' = '" + streaming + "'\n"
+ ")";
+ " uuid varchar(20),\n"
+ " name varchar(10),\n"
+ " age int,\n"
+ " `partition` varchar(20),\n" // test streaming read with partition field in the middle
+ " ts timestamp(3),\n"
+ " PRIMARY KEY(uuid) NOT ENFORCED\n"
+ ")\n"
+ "PARTITIONED BY (`partition`)\n"
+ "with (\n"
+ " 'connector' = 'hudi',\n"
+ " 'path' = '" + tempFile.getAbsolutePath() + "',\n"
+ " 'read.streaming.enabled' = '" + streaming + "'\n"
+ ")";
streamTableEnv.executeSql(hoodieTableDDL);
String insertInto = "insert into t1 values\n"
+ "('id1','Danny',23,'par1',TIMESTAMP '1970-01-01 00:00:01'),\n"
+ "('id2','Stephen',33,'par1',TIMESTAMP '1970-01-01 00:00:02'),\n"
+ "('id3','Julian',53,'par2',TIMESTAMP '1970-01-01 00:00:03'),\n"
+ "('id4','Fabian',31,'par2',TIMESTAMP '1970-01-01 00:00:04'),\n"
+ "('id5','Sophia',18,'par3',TIMESTAMP '1970-01-01 00:00:05'),\n"
+ "('id6','Emma',20,'par3',TIMESTAMP '1970-01-01 00:00:06'),\n"
+ "('id7','Bob',44,'par4',TIMESTAMP '1970-01-01 00:00:07'),\n"
+ "('id8','Han',56,'par4',TIMESTAMP '1970-01-01 00:00:08')";
+ "('id1','Danny',23,'par1',TIMESTAMP '1970-01-01 00:00:01'),\n"
+ "('id2','Stephen',33,'par1',TIMESTAMP '1970-01-01 00:00:02'),\n"
+ "('id3','Julian',53,'par2',TIMESTAMP '1970-01-01 00:00:03'),\n"
+ "('id4','Fabian',31,'par2',TIMESTAMP '1970-01-01 00:00:04'),\n"
+ "('id5','Sophia',18,'par3',TIMESTAMP '1970-01-01 00:00:05'),\n"
+ "('id6','Emma',20,'par3',TIMESTAMP '1970-01-01 00:00:06'),\n"
+ "('id7','Bob',44,'par4',TIMESTAMP '1970-01-01 00:00:07'),\n"
+ "('id8','Han',56,'par4',TIMESTAMP '1970-01-01 00:00:08')";
execInsertSql(streamTableEnv, insertInto);
final String expected = "["
+ "id1,Danny,23,par1,1970-01-01T00:00:01, "
+ "id2,Stephen,33,par1,1970-01-01T00:00:02, "
+ "id3,Julian,53,par2,1970-01-01T00:00:03, "
+ "id4,Fabian,31,par2,1970-01-01T00:00:04, "
+ "id5,Sophia,18,par3,1970-01-01T00:00:05, "
+ "id6,Emma,20,par3,1970-01-01T00:00:06, "
+ "id7,Bob,44,par4,1970-01-01T00:00:07, "
+ "id8,Han,56,par4,1970-01-01T00:00:08]";
+ "id1,Danny,23,par1,1970-01-01T00:00:01, "
+ "id2,Stephen,33,par1,1970-01-01T00:00:02, "
+ "id3,Julian,53,par2,1970-01-01T00:00:03, "
+ "id4,Fabian,31,par2,1970-01-01T00:00:04, "
+ "id5,Sophia,18,par3,1970-01-01T00:00:05, "
+ "id6,Emma,20,par3,1970-01-01T00:00:06, "
+ "id7,Bob,44,par4,1970-01-01T00:00:07, "
+ "id8,Han,56,par4,1970-01-01T00:00:08]";
List<Row> result = execSelectSql(streamTableEnv, "select * from t1", execMode);
@@ -350,17 +343,7 @@ public class HoodieDataSourceITCase extends AbstractTestBase {
String hoodieTableDDL = TestConfigurations.getCreateHoodieTableDDL("t1", options);
tableEnv.executeSql(hoodieTableDDL);
final String insertInto1 = "insert into t1 values\n"
+ "('id1','Danny',23,TIMESTAMP '1970-01-01 00:00:01','par1'),\n"
+ "('id2','Stephen',33,TIMESTAMP '1970-01-01 00:00:02','par1'),\n"
+ "('id3','Julian',53,TIMESTAMP '1970-01-01 00:00:03','par2'),\n"
+ "('id4','Fabian',31,TIMESTAMP '1970-01-01 00:00:04','par2'),\n"
+ "('id5','Sophia',18,TIMESTAMP '1970-01-01 00:00:05','par3'),\n"
+ "('id6','Emma',20,TIMESTAMP '1970-01-01 00:00:06','par3'),\n"
+ "('id7','Bob',44,TIMESTAMP '1970-01-01 00:00:07','par4'),\n"
+ "('id8','Han',56,TIMESTAMP '1970-01-01 00:00:08','par4')";
execInsertSql(tableEnv, insertInto1);
execInsertSql(tableEnv, TestSQL.INSERT_T1);
// overwrite partition 'par1' and increase in age by 1
final String insertInto2 = "insert overwrite t1 partition(`partition`='par1') values\n"
@@ -519,7 +502,7 @@ public class HoodieDataSourceITCase extends AbstractTestBase {
// execute query and assert throws exception
assertThrows(HoodieException.class, () -> execSelectSql(streamTableEnv, "select * from t1", 10),
"No successful commits under path " + tempFile.getAbsolutePath());
"No successful commits under path " + tempFile.getAbsolutePath());
}
@@ -575,6 +558,80 @@ public class HoodieDataSourceITCase extends AbstractTestBase {
assertRowsEquals(result, expected);
}
@ParameterizedTest
@ValueSource(booleans = {true, false})
void testBulkInsert(boolean hiveStylePartitioning) {
TableEnvironment tableEnv = batchTableEnv;
// csv source
String csvSourceDDL = TestConfigurations.getCsvSourceDDL("csv_source", "test_source_5.data");
tableEnv.executeSql(csvSourceDDL);
Map<String, String> options = new HashMap<>();
options.put(FlinkOptions.PATH.key(), tempFile.getAbsolutePath());
options.put(FlinkOptions.OPERATION.key(), "bulk_insert");
options.put(FlinkOptions.SINK_SHUFFLE_BY_PARTITION.key(), "true");
if (hiveStylePartitioning) {
options.put(FlinkOptions.HIVE_STYLE_PARTITIONING.key(), "true");
}
String hoodieTableDDL = TestConfigurations.getCreateHoodieTableDDL("hoodie_sink", options);
tableEnv.executeSql(hoodieTableDDL);
String insertInto = "insert into hoodie_sink select * from csv_source";
execInsertSql(tableEnv, insertInto);
List<Row> result1 = CollectionUtil.iterableToList(
() -> tableEnv.sqlQuery("select * from hoodie_sink").execute().collect());
assertRowsEquals(result1, TestData.DATA_SET_SOURCE_INSERT);
// apply filters
List<Row> result2 = CollectionUtil.iterableToList(
() -> tableEnv.sqlQuery("select * from hoodie_sink where uuid > 'id5'").execute().collect());
assertRowsEquals(result2, "["
+ "id6,Emma,20,1970-01-01T00:00:06,par3, "
+ "id7,Bob,44,1970-01-01T00:00:07,par4, "
+ "id8,Han,56,1970-01-01T00:00:08,par4]");
}
@Test
void testBulkInsertNonPartitionedTable() {
TableEnvironment tableEnv = batchTableEnv;
String hoodieTableDDL = "create table t1(\n"
+ " uuid varchar(20),\n"
+ " name varchar(10),\n"
+ " age int,\n"
+ " ts timestamp(3),\n"
+ " `partition` varchar(20),\n"
+ " PRIMARY KEY(uuid) NOT ENFORCED\n"
+ ")\n"
+ "with (\n"
+ " 'connector' = 'hudi',\n"
+ " 'path' = '" + tempFile.getAbsolutePath() + "',\n"
+ " 'write.operation' = 'bulk_insert'\n"
+ ")";
tableEnv.executeSql(hoodieTableDDL);
final String insertInto1 = "insert into t1 values\n"
+ "('id1','Danny',23,TIMESTAMP '1970-01-01 00:00:01','par1')";
execInsertSql(tableEnv, insertInto1);
final String insertInto2 = "insert into t1 values\n"
+ "('id1','Stephen',33,TIMESTAMP '1970-01-01 00:00:02','par2'),\n"
+ "('id1','Julian',53,TIMESTAMP '1970-01-01 00:00:03','par3'),\n"
+ "('id1','Fabian',31,TIMESTAMP '1970-01-01 00:00:04','par4'),\n"
+ "('id1','Sophia',18,TIMESTAMP '1970-01-01 00:00:05','par5')";
execInsertSql(tableEnv, insertInto2);
List<Row> result = CollectionUtil.iterableToList(
() -> tableEnv.sqlQuery("select * from t1").execute().collect());
assertRowsEquals(result, "["
+ "id1,Danny,23,1970-01-01T00:00:01,par1, "
+ "id1,Stephen,33,1970-01-01T00:00:02,par2, "
+ "id1,Julian,53,1970-01-01T00:00:03,par3, "
+ "id1,Fabian,31,1970-01-01T00:00:04,par4, "
+ "id1,Sophia,18,1970-01-01T00:00:05,par5]", 3);
}
// -------------------------------------------------------------------------
// Utilities
// -------------------------------------------------------------------------
@@ -606,7 +663,7 @@ public class HoodieDataSourceITCase extends AbstractTestBase {
}
private List<Row> execSelectSql(TableEnvironment tEnv, String select, ExecMode execMode)
throws TableNotExistException, InterruptedException {
final String[] splits = select.split(" ");
final String tableName = splits[splits.length - 1];
switch (execMode) {
@@ -621,12 +678,12 @@ public class HoodieDataSourceITCase extends AbstractTestBase {
}
private List<Row> execSelectSql(TableEnvironment tEnv, String select, long timeout)
throws InterruptedException, TableNotExistException {
return execSelectSql(tEnv, select, timeout, null);
}
private List<Row> execSelectSql(TableEnvironment tEnv, String select, long timeout, String sourceTable)
throws InterruptedException, TableNotExistException {
final String sinkDDL;
if (sourceTable != null) {
// use the source table schema as the sink schema if the source table was specified, .

View File

@@ -137,6 +137,22 @@ public class TestConfigurations {
    return builder.toString();
  }

  public static String getCsvSourceDDL(String tableName, String fileName) {
    String sourcePath = Objects.requireNonNull(Thread.currentThread()
        .getContextClassLoader().getResource(fileName)).toString();
    return "create table " + tableName + "(\n"
        + " uuid varchar(20),\n"
        + " name varchar(10),\n"
        + " age int,\n"
        + " ts timestamp(3),\n"
        + " `partition` varchar(20)\n"
        + ") with (\n"
        + " 'connector' = 'filesystem',\n"
        + " 'path' = '" + sourcePath + "',\n"
        + " 'format' = 'csv'\n"
        + ")";
  }

  public static final RowDataSerializer SERIALIZER = new RowDataSerializer(ROW_TYPE);

  public static Configuration getDefaultConf(String tablePath) {

View File

@@ -515,7 +515,7 @@ public class TestData {
return Strings.join(fields, ",");
}
private static BinaryRowData insertRow(Object... fields) {
public static BinaryRowData insertRow(Object... fields) {
LogicalType[] types = TestConfigurations.ROW_TYPE.getFields().stream().map(RowType.RowField::getType)
.toArray(LogicalType[]::new);
assertEquals(

View File

@@ -0,0 +1,36 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.utils;

/**
 * Test sql statements.
 */
public class TestSQL {
  private TestSQL() {}

  public static final String INSERT_T1 = "insert into t1 values\n"
      + "('id1','Danny',23,TIMESTAMP '1970-01-01 00:00:01','par1'),\n"
      + "('id2','Stephen',33,TIMESTAMP '1970-01-01 00:00:02','par1'),\n"
      + "('id3','Julian',53,TIMESTAMP '1970-01-01 00:00:03','par2'),\n"
      + "('id4','Fabian',31,TIMESTAMP '1970-01-01 00:00:04','par2'),\n"
      + "('id5','Sophia',18,TIMESTAMP '1970-01-01 00:00:05','par3'),\n"
      + "('id6','Emma',20,TIMESTAMP '1970-01-01 00:00:06','par3'),\n"
      + "('id7','Bob',44,TIMESTAMP '1970-01-01 00:00:07','par4'),\n"
      + "('id8','Han',56,TIMESTAMP '1970-01-01 00:00:08','par4')";
}

View File

@@ -0,0 +1,8 @@
id1,Danny,23,1970-01-01 00:00:01,par1
id2,Stephen,33,1970-01-01 00:00:02,par1
id3,Julian,53,1970-01-01 00:00:03,par2
id4,Fabian,31,1970-01-01 00:00:04,par2
id5,Sophia,18,1970-01-01 00:00:05,par3
id6,Emma,20,1970-01-01 00:00:06,par3
id7,Bob,44,1970-01-01 00:00:07,par4
id8,Han,56,1970-01-01 00:00:08,par4
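
This data file is the fixture consumed by the new getCsvSourceDDL helper and the testBulkInsert IT case. As a hedged, self-contained sketch of the same pipeline outside the test harness: a filesystem CSV source feeding a Hudi sink in bulk_insert mode. The file and table paths are placeholders; the test resolves the CSV from the classpath and adds options such as FlinkOptions.SINK_SHUFFLE_BY_PARTITION through its DDL builder rather than inline DDL.

import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.TableEnvironment;

public class CsvToHudiBulkInsertSketch {
  public static void main(String[] args) throws Exception {
    TableEnvironment tableEnv = TableEnvironment.create(
        EnvironmentSettings.newInstance().inBatchMode().build());

    // CSV source with the same layout as the rows above; path is a placeholder.
    tableEnv.executeSql("create table csv_source(\n"
        + "  uuid varchar(20),\n"
        + "  name varchar(10),\n"
        + "  age int,\n"
        + "  ts timestamp(3),\n"
        + "  `partition` varchar(20)\n"
        + ") with (\n"
        + "  'connector' = 'filesystem',\n"
        + "  'path' = 'file:///tmp/test_source_5.data',\n"
        + "  'format' = 'csv'\n"
        + ")");

    // Hudi sink in bulk_insert mode; path is a placeholder.
    tableEnv.executeSql("create table hoodie_sink(\n"
        + "  uuid varchar(20),\n"
        + "  name varchar(10),\n"
        + "  age int,\n"
        + "  ts timestamp(3),\n"
        + "  `partition` varchar(20),\n"
        + "  PRIMARY KEY(uuid) NOT ENFORCED\n"
        + ")\n"
        + "PARTITIONED BY (`partition`)\n"
        + "with (\n"
        + "  'connector' = 'hudi',\n"
        + "  'path' = '/tmp/hoodie_sink',\n"
        + "  'write.operation' = 'bulk_insert'\n"
        + ")");

    // Equivalent of the test's "insert into hoodie_sink select * from csv_source".
    tableEnv.executeSql("insert into hoodie_sink select * from csv_source").await();
  }
}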