1
0

[HUDI-2316] Support Flink batch upsert (#3494)

This commit is contained in:
swuferhong
2021-08-19 17:15:26 +08:00
committed by GitHub
parent b7a0d76fc9
commit 1fed44af84
8 changed files with 343 additions and 133 deletions

View File

@@ -292,7 +292,7 @@ public class HoodieDataSourceITCase extends AbstractTestBase {
}
@ParameterizedTest
@MethodSource("configParams")
@MethodSource("executionModeAndPartitioningParams")
void testWriteAndRead(ExecMode execMode, boolean hiveStylePartitioning) {
TableEnvironment tableEnv = execMode == ExecMode.BATCH ? batchTableEnv : streamTableEnv;
Map<String, String> options = new HashMap<>();
@@ -317,6 +317,56 @@ public class HoodieDataSourceITCase extends AbstractTestBase {
+ "+I[id8, Han, 56, 1970-01-01T00:00:08, par4]]");
}
/**
 * Verifies batch-mode upsert on a non-partitioned Hudi table: an initial
 * INSERT followed by a second INSERT with overlapping primary keys must
 * merge rows rather than duplicate them.
 *
 * @param tableType the Hudi table type under test (COPY_ON_WRITE / MERGE_ON_READ)
 */
@ParameterizedTest
@EnumSource(value = HoodieTableType.class)
void testBatchModeUpsertWithoutPartition(HoodieTableType tableType) {
  TableEnvironment tableEnv = batchTableEnv;
  Map<String, String> options = new HashMap<>();
  options.put(FlinkOptions.PATH.key(), tempFile.getAbsolutePath());
  // Fix: apply the parameterized table type via TABLE_TYPE. The original wrote
  // tableType.name() into TABLE_NAME, which only renamed the table and left
  // every @EnumSource run on the default table type.
  options.put(FlinkOptions.TABLE_TYPE.key(), tableType.name());
  // havePartition = false -> DDL without a PARTITIONED BY clause.
  String hoodieTableDDL = TestConfigurations.getCreateHoodieTableDDL("t1", options, false);
  tableEnv.executeSql(hoodieTableDDL);

  // Initial batch insert, then read back the full table.
  execInsertSql(tableEnv, TestSQL.INSERT_T1);
  List<Row> result1 = CollectionUtil.iterableToList(
      () -> tableEnv.sqlQuery("select * from t1").execute().collect());
  assertRowsEquals(result1, TestData.DATA_SET_SOURCE_INSERT);

  // Batch-mode upsert: overlapping keys must be merged into the existing rows.
  execInsertSql(tableEnv, TestSQL.UPDATE_INSERT_T1);
  List<Row> result2 = CollectionUtil.iterableToList(
      () -> tableEnv.sqlQuery("select * from t1").execute().collect());
  assertRowsEquals(result2, TestData.DATA_SET_SOURCE_MERGED);
}
/**
 * Verifies batch-mode upsert on a partitioned Hudi table across both table
 * types and both partition-path styles: a second INSERT with overlapping
 * primary keys must merge into the rows written by the first INSERT.
 *
 * @param tableType             the Hudi table type under test
 * @param hiveStylePartitioning whether hive-style partition paths are enabled
 */
@ParameterizedTest
@MethodSource("tableTypeAndPartitioningParams")
void testBatchModeUpsert(HoodieTableType tableType, boolean hiveStylePartitioning) {
  TableEnvironment tableEnv = batchTableEnv;
  Map<String, String> options = new HashMap<>();
  options.put(FlinkOptions.PATH.key(), tempFile.getAbsolutePath());
  // Fix: apply the parameterized table type via TABLE_TYPE. The original wrote
  // tableType.name() into TABLE_NAME, which only renamed the table and left
  // every parameterized run on the default table type.
  options.put(FlinkOptions.TABLE_TYPE.key(), tableType.name());
  if (hiveStylePartitioning) {
    options.put(FlinkOptions.HIVE_STYLE_PARTITIONING.key(), "true");
  }
  String hoodieTableDDL = TestConfigurations.getCreateHoodieTableDDL("t1", options);
  tableEnv.executeSql(hoodieTableDDL);

  // Initial batch insert, then read back the full table.
  execInsertSql(tableEnv, TestSQL.INSERT_T1);
  List<Row> result1 = CollectionUtil.iterableToList(
      () -> tableEnv.sqlQuery("select * from t1").execute().collect());
  assertRowsEquals(result1, TestData.DATA_SET_SOURCE_INSERT);

  // Batch-mode upsert: overlapping keys must be merged into the existing rows.
  execInsertSql(tableEnv, TestSQL.UPDATE_INSERT_T1);
  List<Row> result2 = CollectionUtil.iterableToList(
      () -> tableEnv.sqlQuery("select * from t1").execute().collect());
  assertRowsEquals(result2, TestData.DATA_SET_SOURCE_MERGED);
}
@ParameterizedTest
@EnumSource(value = ExecMode.class)
void testWriteAndReadParMiddle(ExecMode execMode) throws Exception {
@@ -436,18 +486,9 @@ public class HoodieDataSourceITCase extends AbstractTestBase {
@EnumSource(value = ExecMode.class)
void testWriteNonPartitionedTable(ExecMode execMode) {
TableEnvironment tableEnv = execMode == ExecMode.BATCH ? batchTableEnv : streamTableEnv;
String hoodieTableDDL = "create table t1(\n"
+ " uuid varchar(20),\n"
+ " name varchar(10),\n"
+ " age int,\n"
+ " ts timestamp(3),\n"
+ " `partition` varchar(20),\n"
+ " PRIMARY KEY(uuid) NOT ENFORCED\n"
+ ")\n"
+ "with (\n"
+ " 'connector' = 'hudi',\n"
+ " 'path' = '" + tempFile.getAbsolutePath() + "'\n"
+ ")";
Map<String, String> options = new HashMap<>();
options.put(FlinkOptions.PATH.key(), tempFile.getAbsolutePath());
String hoodieTableDDL = TestConfigurations.getCreateHoodieTableDDL("t1", options, false);
tableEnv.executeSql(hoodieTableDDL);
final String insertInto1 = "insert into t1 values\n"
@@ -627,19 +668,10 @@ public class HoodieDataSourceITCase extends AbstractTestBase {
@Test
void testBulkInsertNonPartitionedTable() {
TableEnvironment tableEnv = batchTableEnv;
String hoodieTableDDL = "create table t1(\n"
+ " uuid varchar(20),\n"
+ " name varchar(10),\n"
+ " age int,\n"
+ " ts timestamp(3),\n"
+ " `partition` varchar(20),\n"
+ " PRIMARY KEY(uuid) NOT ENFORCED\n"
+ ")\n"
+ "with (\n"
+ " 'connector' = 'hudi',\n"
+ " 'path' = '" + tempFile.getAbsolutePath() + "',\n"
+ " 'write.operation' = 'bulk_insert'\n"
+ ")";
Map<String, String> options = new HashMap<>();
options.put(FlinkOptions.PATH.key(), tempFile.getAbsolutePath());
options.put(FlinkOptions.OPERATION.key(), "bulk_insert");
String hoodieTableDDL = TestConfigurations.getCreateHoodieTableDDL("t1", options, false);
tableEnv.executeSql(hoodieTableDDL);
final String insertInto1 = "insert into t1 values\n"
@@ -675,7 +707,7 @@ public class HoodieDataSourceITCase extends AbstractTestBase {
/**
* Return test params => (execution mode, hive style partitioning).
*/
private static Stream<Arguments> configParams() {
private static Stream<Arguments> executionModeAndPartitioningParams() {
Object[][] data =
new Object[][] {
{ExecMode.BATCH, false},
@@ -685,6 +717,19 @@ public class HoodieDataSourceITCase extends AbstractTestBase {
return Stream.of(data).map(Arguments::of);
}
/**
 * Return test params => (HoodieTableType, hive style partitioning).
 */
private static Stream<Arguments> tableTypeAndPartitioningParams() {
  // Cartesian product of the two table types with both partitioning styles,
  // emitted in the same order as an explicit row-by-row table would be:
  // (COW,false), (COW,true), (MOR,false), (MOR,true).
  return Stream.of(HoodieTableType.COPY_ON_WRITE, HoodieTableType.MERGE_ON_READ)
      .flatMap(tableType -> Stream.of(Boolean.FALSE, Boolean.TRUE)
          .map(hiveStyle -> Arguments.of(tableType, hiveStyle)));
}
private void execInsertSql(TableEnvironment tEnv, String insert) {
TableResult tableResult = tEnv.executeSql(insert);
// wait to finish

View File

@@ -56,18 +56,24 @@ public class TestConfigurations {
.build();
public static String getCreateHoodieTableDDL(String tableName, Map<String, String> options) {
String createTable = "create table " + tableName + "(\n"
return getCreateHoodieTableDDL(tableName, options, true);
}
public static String getCreateHoodieTableDDL(String tableName, Map<String, String> options, boolean havePartition) {
StringBuilder builder = new StringBuilder();
builder.append("create table " + tableName + "(\n"
+ " uuid varchar(20),\n"
+ " name varchar(10),\n"
+ " age int,\n"
+ " ts timestamp(3),\n"
+ " `partition` varchar(20),\n"
+ " PRIMARY KEY(uuid) NOT ENFORCED\n"
+ ")\n"
+ "PARTITIONED BY (`partition`)\n"
+ "with (\n"
+ " 'connector' = 'hudi'";
StringBuilder builder = new StringBuilder(createTable);
+ ")\n");
if (havePartition) {
builder.append("PARTITIONED BY (`partition`)\n");
}
builder.append("with (\n"
+ " 'connector' = 'hudi'");
options.forEach((k, v) -> builder.append(",\n")
.append(" '").append(k).append("' = '").append(v).append("'"));
builder.append("\n)");

View File

@@ -33,4 +33,14 @@ public class TestSQL {
+ "('id6','Emma',20,TIMESTAMP '1970-01-01 00:00:06','par3'),\n"
+ "('id7','Bob',44,TIMESTAMP '1970-01-01 00:00:07','par4'),\n"
+ "('id8','Han',56,TIMESTAMP '1970-01-01 00:00:08','par4')";
// Upsert payload for table t1: keys id1–id5 presumably overlap rows written by
// INSERT_T1 (its leading rows are outside this view — confirm), so a batch
// upsert must merge them; id9–id11 are new keys and must be appended.
// NOTE(review): the expected merged table appears to be
// TestData.DATA_SET_SOURCE_MERGED per the callers — verify against that fixture.
public static final String UPDATE_INSERT_T1 = "insert into t1 values\n"
+ "('id1','Danny',24,TIMESTAMP '1970-01-01 00:00:01','par1'),\n"
+ "('id2','Stephen',34,TIMESTAMP '1970-01-01 00:00:02','par1'),\n"
+ "('id3','Julian',54,TIMESTAMP '1970-01-01 00:00:03','par2'),\n"
+ "('id4','Fabian',32,TIMESTAMP '1970-01-01 00:00:04','par2'),\n"
+ "('id5','Sophia',18,TIMESTAMP '1970-01-01 00:00:05','par3'),\n"
+ "('id9','Jane',19,TIMESTAMP '1970-01-01 00:00:06','par3'),\n"
+ "('id10','Ella',38,TIMESTAMP '1970-01-01 00:00:07','par4'),\n"
+ "('id11','Phoebe',52,TIMESTAMP '1970-01-01 00:00:08','par4')";
}