1
0

[HUDI-3563] Make quickstart examples covered by CI tests (#5082)

This commit is contained in:
ForwardXu
2022-03-25 16:37:17 +08:00
committed by GitHub
parent f20c9867d7
commit e5c3f9089b
38 changed files with 2980 additions and 225 deletions

View File

@@ -0,0 +1,211 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.examples.quickstart;
import static org.apache.hudi.examples.quickstart.utils.QuickstartConfigurations.sql;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.execution.JobClient;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.table.api.TableResult;
import org.apache.flink.table.api.TableSchema;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.api.config.ExecutionConfigOptions;
import org.apache.flink.table.api.internal.TableEnvironmentImpl;
import org.apache.flink.table.catalog.ObjectPath;
import org.apache.flink.table.catalog.exceptions.TableNotExistException;
import org.apache.flink.types.Row;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.configuration.FlinkOptions;
import org.apache.hudi.examples.quickstart.factory.CollectSinkTableFactory;
import org.apache.hudi.examples.quickstart.utils.QuickstartConfigurations;
import org.jetbrains.annotations.NotNull;
/**
 * Hudi Flink quickstart example: creates a file-backed source table and a Hudi table,
 * then demonstrates insert, streaming query and update through the Flink SQL API.
 *
 * <p>Instances hold mutable table-environment state and are not thread-safe.
 */
public final class HoodieFlinkQuickstart {
  private EnvironmentSettings settings = null;
  private TableEnvironment streamTableEnv = null;
  // name of the Hudi table created by createHudiTable(), used by the DML helpers
  private String tableName;

  private HoodieFlinkQuickstart() {
  }

  /**
   * Returns a fresh quickstart instance.
   *
   * <p>Note: despite the name, this is not a singleton — every call creates a new instance.
   */
  public static HoodieFlinkQuickstart instance() {
    return new HoodieFlinkQuickstart();
  }

  /**
   * Entry point.
   *
   * @param args tablePath, tableName and tableType (a {@link HoodieTableType} name,
   *             e.g. COPY_ON_WRITE or MERGE_ON_READ)
   */
  public static void main(String[] args) throws TableNotExistException, InterruptedException {
    if (args.length < 3) {
      System.err.println("Usage: HoodieFlinkQuickstart <tablePath> <tableName> <tableType>");
      System.exit(1);
    }
    String tablePath = args[0];
    String tableName = args[1];
    String tableType = args[2];

    HoodieFlinkQuickstart flinkQuickstart = instance();
    flinkQuickstart.initEnv();

    // create filesystem table named source
    flinkQuickstart.createFileSource();

    // create hudi table
    flinkQuickstart.createHudiTable(tablePath, tableName, HoodieTableType.valueOf(tableType));

    // insert data
    flinkQuickstart.insertData();

    // query data
    flinkQuickstart.queryData();

    // update data
    flinkQuickstart.updateData();
  }

  /**
   * Lazily creates the streaming table environment: parallelism 1, a 2s checkpoint
   * interval, and a restart strategy with zero retries (fail fast on error).
   */
  public void initEnv() {
    if (this.streamTableEnv == null) {
      settings = EnvironmentSettings.newInstance().build();
      TableEnvironment streamTableEnv = TableEnvironmentImpl.create(settings);
      streamTableEnv.getConfig().getConfiguration()
          .setInteger(ExecutionConfigOptions.TABLE_EXEC_RESOURCE_DEFAULT_PARALLELISM, 1);
      Configuration execConf = streamTableEnv.getConfig().getConfiguration();
      execConf.setString("execution.checkpointing.interval", "2s");
      // configure not to retry after failure
      execConf.setString("restart-strategy", "fixed-delay");
      execConf.setString("restart-strategy.fixed-delay.attempts", "0");
      this.streamTableEnv = streamTableEnv;
    }
  }

  /** Returns the streaming table environment (null until {@link #initEnv()} is called). */
  public TableEnvironment getStreamTableEnv() {
    return streamTableEnv;
  }

  /**
   * Creates a fresh batch-mode table environment.
   *
   * <p>Note: unlike the streaming environment, this is NOT cached — every call builds
   * a new environment.
   */
  public TableEnvironment getBatchTableEnv() {
    Configuration conf = new Configuration();
    // for batch upsert use cases: current suggestion is to disable these 2 options,
    // from 1.14, flink runtime execution mode has switched from streaming
    // to batch for batch execution mode(before that, both streaming and batch use streaming execution mode),
    // current batch execution mode has these limitations:
    //
    //   1. the keyed stream default to always sort the inputs by key;
    //   2. the batch state-backend requires the inputs sort by state key
    //
    // For our hudi batch pipeline upsert case, we rely on the consuming sequence for index records and data records,
    // the index records must be loaded first before data records for BucketAssignFunction to keep upsert semantics correct,
    // so we suggest disabling these 2 options to use streaming state-backend for batch execution mode
    // to keep the strategy before 1.14.
    conf.setBoolean("execution.sorted-inputs.enabled", false);
    conf.setBoolean("execution.batch-state-backend.enabled", false);
    StreamExecutionEnvironment execEnv = StreamExecutionEnvironment.getExecutionEnvironment(conf);
    settings = EnvironmentSettings.newInstance().inBatchMode().build();
    TableEnvironment batchTableEnv = StreamTableEnvironment.create(execEnv, settings);
    batchTableEnv.getConfig().getConfiguration()
        .setInteger(ExecutionConfigOptions.TABLE_EXEC_RESOURCE_DEFAULT_PARALLELISM, 1);
    return batchTableEnv;
  }

  /**
   * Creates the Hudi table via DDL and remembers its name for the DML helpers.
   *
   * @param tablePath base path of the Hudi table
   * @param tableName name of the table to create
   * @param tableType COPY_ON_WRITE or MERGE_ON_READ
   */
  public void createHudiTable(String tablePath, String tableName,
                              HoodieTableType tableType) {
    this.tableName = tableName;
    // create hudi table
    String hoodieTableDDL = sql(tableName)
        .option(FlinkOptions.PATH, tablePath)
        .option(FlinkOptions.READ_AS_STREAMING, true)
        .option(FlinkOptions.TABLE_TYPE, tableType)
        .end();
    streamTableEnv.executeSql(hoodieTableDDL);
  }

  /** Creates the filesystem-backed table named {@code source} used as input by the DML helpers. */
  public void createFileSource() {
    // create filesystem table named source
    String createSource = QuickstartConfigurations.getFileSourceDDL("source");
    streamTableEnv.executeSql(createSource);
  }

  /**
   * Inserts all rows from {@code source} into the Hudi table, then queries it back.
   *
   * @return the rows read back after the insert
   */
  @NotNull List<Row> insertData() throws InterruptedException, TableNotExistException {
    // insert data
    String insertInto = String.format("insert into %s select * from source", tableName);
    execInsertSql(streamTableEnv, insertInto);
    return queryData();
  }

  /**
   * Queries the Hudi table, reading from the latest commit instance, and returns the
   * rows collected within a 10 second window.
   */
  List<Row> queryData() throws InterruptedException, TableNotExistException {
    // query data
    // reading from the latest commit instance.
    return execSelectSql(streamTableEnv, String.format("select * from %s", tableName), 10);
  }

  /**
   * Re-inserts the source rows (upsert on the record key), then queries the table back.
   *
   * @return the rows read back after the update
   */
  @NotNull List<Row> updateData() throws InterruptedException, TableNotExistException {
    // update data
    String insertInto = String.format("insert into %s select * from source", tableName);
    execInsertSql(getStreamTableEnv(), insertInto);
    return queryData();
  }

  /**
   * Executes an INSERT statement and blocks until the job finishes.
   *
   * <p>Job failures are deliberately tolerated (best effort for the example flow), but an
   * interrupt is no longer swallowed: the interrupt status is restored for the caller.
   */
  public static void execInsertSql(TableEnvironment tEnv, String insert) {
    TableResult tableResult = tEnv.executeSql(insert);
    // wait to finish
    try {
      tableResult.getJobClient().get().getJobExecutionResult().get();
    } catch (InterruptedException ex) {
      // preserve the interrupt status instead of swallowing it
      Thread.currentThread().interrupt();
    } catch (ExecutionException ignored) {
      // best effort: a failed insert job does not abort the example flow
    }
  }

  /**
   * Executes a SELECT by inserting into a collect sink and returns the collected rows.
   *
   * @param timeout seconds to let the streaming job run before cancelling it
   */
  public static List<Row> execSelectSql(TableEnvironment tEnv, String select, long timeout)
      throws InterruptedException, TableNotExistException {
    return execSelectSql(tEnv, select, timeout, null);
  }

  /**
   * Executes a SELECT by inserting into a collect sink and returns the collected rows.
   *
   * @param sourceTable if non-null, the sink schema is copied from this catalog table;
   *                    otherwise the default quickstart schema is used
   */
  public static List<Row> execSelectSql(TableEnvironment tEnv, String select, long timeout, String sourceTable)
      throws InterruptedException, TableNotExistException {
    final String sinkDDL;
    if (sourceTable != null) {
      // use the source table schema as the sink schema if the source table was specified.
      ObjectPath objectPath = new ObjectPath(tEnv.getCurrentDatabase(), sourceTable);
      TableSchema schema = tEnv.getCatalog(tEnv.getCurrentCatalog()).get().getTable(objectPath).getSchema();
      sinkDDL = QuickstartConfigurations.getCollectSinkDDL("sink", schema);
    } else {
      sinkDDL = QuickstartConfigurations.getCollectSinkDDL("sink");
    }
    return execSelectSql(tEnv, select, sinkDDL, timeout);
  }

  /**
   * Runs {@code insert into sink <select>}, waits {@code timeout} seconds, cancels the
   * job and returns everything the collect sink gathered.
   */
  public static List<Row> execSelectSql(TableEnvironment tEnv, String select, String sinkDDL, long timeout)
      throws InterruptedException {
    tEnv.executeSql("DROP TABLE IF EXISTS sink");
    tEnv.executeSql(sinkDDL);
    TableResult tableResult = tEnv.executeSql("insert into sink " + select);
    // wait for the timeout then cancels the job
    TimeUnit.SECONDS.sleep(timeout);
    tableResult.getJobClient().ifPresent(JobClient::cancel);
    tEnv.executeSql("DROP TABLE IF EXISTS sink");
    return CollectSinkTableFactory.RESULT.values().stream()
        .flatMap(Collection::stream)
        .collect(Collectors.toList());
  }
}

View File

@@ -0,0 +1,178 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.examples.quickstart.factory;
import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.api.java.typeutils.RowTypeInfo;
import org.apache.flink.configuration.ConfigOption;
import org.apache.flink.runtime.state.FunctionInitializationContext;
import org.apache.flink.runtime.state.FunctionSnapshotContext;
import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
import org.apache.flink.table.api.TableSchema;
import org.apache.flink.table.connector.ChangelogMode;
import org.apache.flink.table.connector.sink.DynamicTableSink;
import org.apache.flink.table.connector.sink.SinkFunctionProvider;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.factories.DynamicTableSinkFactory;
import org.apache.flink.table.factories.FactoryUtil;
import org.apache.flink.table.types.DataType;
import org.apache.flink.table.types.utils.TypeConversions;
import org.apache.flink.types.Row;
import org.apache.flink.types.RowKind;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
 * Factory for CollectTableSink.
 *
 * <p>Note: The CollectTableSink collects all the data of a table into a global collection
 * {@code RESULT}, so the tests should be executed in a single thread and the table name
 * should be the same.
 */
public class CollectSinkTableFactory implements DynamicTableSinkFactory {
  public static final String FACTORY_ID = "collect";

  // global results to collect and query, keyed by sink subtask index
  public static final Map<Integer, List<Row>> RESULT = new HashMap<>();

  @Override
  public DynamicTableSink createDynamicTableSink(Context context) {
    FactoryUtil.TableFactoryHelper helper = FactoryUtil.createTableFactoryHelper(this, context);
    helper.validate();

    TableSchema schema = context.getCatalogTable().getSchema();
    // each newly created sink starts from an empty global result set
    RESULT.clear();
    return new CollectTableSink(schema, context.getObjectIdentifier().getObjectName());
  }

  @Override
  public String factoryIdentifier() {
    return FACTORY_ID;
  }

  @Override
  public Set<ConfigOption<?>> requiredOptions() {
    // this sink takes no options
    return Collections.emptySet();
  }

  @Override
  public Set<ConfigOption<?>> optionalOptions() {
    return Collections.emptySet();
  }

  // --------------------------------------------------------------------------------------------
  //  Table sinks
  // --------------------------------------------------------------------------------------------

  /**
   * Values {@link DynamicTableSink} for testing.
   */
  private static class CollectTableSink implements DynamicTableSink {
    private final TableSchema schema;
    private final String tableName;

    private CollectTableSink(
        TableSchema schema,
        String tableName) {
      this.schema = schema;
      this.tableName = tableName;
    }

    @Override
    public ChangelogMode getChangelogMode(ChangelogMode requestedMode) {
      // accepts inserts, deletes and update-afters (no UPDATE_BEFORE records)
      return ChangelogMode.newBuilder()
          .addContainedKind(RowKind.INSERT)
          .addContainedKind(RowKind.DELETE)
          .addContainedKind(RowKind.UPDATE_AFTER)
          .build();
    }

    @Override
    public SinkRuntimeProvider getSinkRuntimeProvider(Context context) {
      final DataType rowType = schema.toPhysicalRowDataType();
      // legacy RowTypeInfo is needed for the checkpointed ListState descriptor below
      final RowTypeInfo rowTypeInfo = (RowTypeInfo) TypeConversions.fromDataTypeToLegacyInfo(rowType);
      DataStructureConverter converter = context.createDataStructureConverter(schema.toPhysicalRowDataType());
      return SinkFunctionProvider.of(new CollectSinkFunction(converter, rowTypeInfo));
    }

    @Override
    public DynamicTableSink copy() {
      return new CollectTableSink(schema, tableName);
    }

    @Override
    public String asSummaryString() {
      return "CollectSink";
    }
  }

  /**
   * Sink function that converts incoming {@link RowData} to external {@link Row}s and
   * appends them to the shared {@code RESULT} map; the collected rows are also written
   * to operator state so they survive checkpoint restores.
   */
  static class CollectSinkFunction extends RichSinkFunction<RowData> implements CheckpointedFunction {
    private static final long serialVersionUID = 1L;

    private final DynamicTableSink.DataStructureConverter converter;
    private final RowTypeInfo rowTypeInfo;

    // operator state backing the collected rows across checkpoints
    protected transient ListState<Row> resultState;
    // this subtask's live row buffer, registered in RESULT under taskID
    protected transient List<Row> localResult;

    // index of this subtask, used as the key into RESULT
    private int taskID;

    protected CollectSinkFunction(DynamicTableSink.DataStructureConverter converter, RowTypeInfo rowTypeInfo) {
      this.converter = converter;
      this.rowTypeInfo = rowTypeInfo;
    }

    @Override
    public void invoke(RowData value, Context context) {
      Row row = (Row) converter.toExternal(value);
      assert row != null;
      // keep the change-flag (insert/delete/update-after) of the original record
      row.setKind(value.getRowKind());
      RESULT.get(taskID).add(row);
    }

    @Override
    public void initializeState(FunctionInitializationContext context) throws Exception {
      this.resultState = context.getOperatorStateStore().getListState(
          new ListStateDescriptor<>("sink-results", rowTypeInfo));
      this.localResult = new ArrayList<>();
      if (context.isRestored()) {
        // restore rows collected before the failure
        for (Row value : resultState.get()) {
          localResult.add(value);
        }
      }
      this.taskID = getRuntimeContext().getIndexOfThisSubtask();
      // RESULT is a plain HashMap shared across subtasks, so registration is guarded
      synchronized (CollectSinkTableFactory.class) {
        RESULT.put(taskID, localResult);
      }
    }

    @Override
    public void snapshotState(FunctionSnapshotContext context) throws Exception {
      // replace the state snapshot with everything collected so far
      resultState.clear();
      resultState.addAll(RESULT.get(taskID));
    }
  }
}

View File

@@ -0,0 +1,72 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.examples.quickstart.factory;
import org.apache.flink.configuration.ConfigOption;
import org.apache.flink.configuration.ConfigOptions;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.fs.Path;
import org.apache.flink.table.api.ValidationException;
import org.apache.flink.table.connector.source.DynamicTableSource;
import org.apache.flink.table.factories.DynamicTableSourceFactory;
import org.apache.flink.table.factories.FactoryUtil;
import java.util.Collections;
import java.util.Set;
import org.apache.hudi.configuration.FlinkOptions;
import org.apache.hudi.examples.quickstart.source.ContinuousFileSource;
/**
 * Factory for ContinuousFileSource.
 */
public class ContinuousFileSourceFactory implements DynamicTableSourceFactory {
  public static final String FACTORY_ID = "continuous-file-source";

  public static final ConfigOption<Integer> CHECKPOINTS = ConfigOptions
      .key("checkpoints")
      .intType()
      .defaultValue(2)
      .withDescription("Number of checkpoints to write the data set as, default 2");

  @Override
  public DynamicTableSource createDynamicTableSource(Context context) {
    final FactoryUtil.TableFactoryHelper factoryHelper = FactoryUtil.createTableFactoryHelper(this, context);
    factoryHelper.validate();

    final Configuration options = (Configuration) factoryHelper.getOptions();
    // the source file path is mandatory; fail fast with a validation error when absent
    final String basePath = options.getOptional(FlinkOptions.PATH)
        .orElseThrow(() -> new ValidationException("Option [path] should be not empty."));
    return new ContinuousFileSource(context.getCatalogTable().getResolvedSchema(), new Path(basePath), options);
  }

  @Override
  public String factoryIdentifier() {
    return FACTORY_ID;
  }

  @Override
  public Set<ConfigOption<?>> requiredOptions() {
    // only the file path is required
    return Collections.singleton(FlinkOptions.PATH);
  }

  @Override
  public Set<ConfigOption<?>> optionalOptions() {
    return Collections.singleton(CHECKPOINTS);
  }
}

View File

@@ -0,0 +1,185 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.examples.quickstart.source;
import org.apache.flink.api.common.state.CheckpointListener;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.fs.Path;
import org.apache.flink.formats.common.TimestampFormat;
import org.apache.flink.formats.json.JsonRowDataDeserializationSchema;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.table.catalog.ResolvedSchema;
import org.apache.flink.table.connector.ChangelogMode;
import org.apache.flink.table.connector.source.DataStreamScanProvider;
import org.apache.flink.table.connector.source.DynamicTableSource;
import org.apache.flink.table.connector.source.ScanTableSource;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.runtime.typeutils.InternalTypeInfo;
import org.apache.flink.table.types.logical.RowType;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import static org.apache.hudi.examples.quickstart.factory.ContinuousFileSourceFactory.CHECKPOINTS;
/**
 * A continuous file source that can trigger checkpoints continuously.
 *
 * <p>It loads the data in the specified file and split the data into number of checkpoints batches.
 * Say, if you want 4 checkpoints and there are 8 records in the file, the emit strategy is:
 *
 * <pre>
 *   | 2 records | 2 records | 2 records | 2 records |
 *   |    cp1    |    cp2    |    cp3    |    cp4    |
 * </pre>
 *
 * <p>If all the data are flushed out, it waits for the next checkpoint to finish and tear down the source.
 */
public class ContinuousFileSource implements ScanTableSource {

  private final ResolvedSchema tableSchema;
  private final Path path;
  private final Configuration conf;

  public ContinuousFileSource(
      ResolvedSchema tableSchema,
      Path path,
      Configuration conf) {
    this.tableSchema = tableSchema;
    this.path = path;
    this.conf = conf;
  }

  @Override
  public ScanRuntimeProvider getScanRuntimeProvider(ScanContext scanContext) {
    return new DataStreamScanProvider() {
      @Override
      public boolean isBounded() {
        // declared unbounded so the streaming runtime keeps checkpointing,
        // even though the underlying file is finite
        return false;
      }

      @Override
      public DataStream<RowData> produceDataStream(StreamExecutionEnvironment execEnv) {
        final RowType rowType = (RowType) tableSchema.toSourceRowDataType().getLogicalType();
        // each line of the file is expected to be a JSON-encoded record
        JsonRowDataDeserializationSchema deserializationSchema = new JsonRowDataDeserializationSchema(
            rowType,
            InternalTypeInfo.of(rowType),
            false,
            true,
            TimestampFormat.ISO_8601);
        // parallelism 1 keeps the checkpoint-batch emit order deterministic
        return execEnv.addSource(new BoundedSourceFunction(path, conf.getInteger(CHECKPOINTS)))
            .name("continuous_file_source")
            .setParallelism(1)
            .map(record -> deserializationSchema.deserialize(record.getBytes(StandardCharsets.UTF_8)),
                InternalTypeInfo.of(rowType));
      }
    };
  }

  @Override
  public ChangelogMode getChangelogMode() {
    return ChangelogMode.insertOnly();
  }

  @Override
  public DynamicTableSource copy() {
    return new ContinuousFileSource(this.tableSchema, this.path, this.conf);
  }

  @Override
  public String asSummaryString() {
    return "ContinuousFileSource";
  }

  /**
   * Source function that partition the data into given number checkpoints batches.
   *
   * <p>The function emits one batch of lines per completed checkpoint: after emitting a
   * batch it spins until {@link #notifyCheckpointComplete(long)} bumps {@code currentCP},
   * then proceeds with the next batch.
   */
  public static class BoundedSourceFunction implements SourceFunction<String>, CheckpointedFunction {
    private final Path path;
    // file content, one element per line; loaded lazily in run()
    private List<String> dataBuffer;

    // number of checkpoint batches to split the data into
    private final int checkpoints;
    // count of completed checkpoints, incremented by notifyCheckpointComplete
    private final AtomicInteger currentCP = new AtomicInteger(0);

    private volatile boolean isRunning = true;

    public BoundedSourceFunction(Path path, int checkpoints) {
      this.path = path;
      this.checkpoints = checkpoints;
    }

    @Override
    public void run(SourceContext<String> context) throws Exception {
      if (this.dataBuffer == null) {
        loadDataBuffer();
      }
      int oldCP = this.currentCP.get();
      boolean finish = false;
      while (isRunning) {
        // emit one checkpoint-batch worth of lines under the checkpoint lock so a
        // checkpoint cannot cut a batch in half
        int batchSize = this.dataBuffer.size() / this.checkpoints;
        int start = batchSize * oldCP;
        synchronized (context.getCheckpointLock()) {
          for (int i = start; i < start + batchSize; i++) {
            if (i >= this.dataBuffer.size()) {
              finish = true;
              break;
              // wait for the next checkpoint and exit
            }
            context.collect(this.dataBuffer.get(i));
          }
        }
        oldCP++;
        // block (polling every 10ms) until the next checkpoint completes;
        // currentCP is bumped in notifyCheckpointComplete
        while (this.currentCP.get() < oldCP) {
          synchronized (context.getCheckpointLock()) {
            context.getCheckpointLock().wait(10);
          }
        }
        if (finish || !isRunning) {
          return;
        }
      }
    }

    @Override
    public void cancel() {
      this.isRunning = false;
    }

    /** Reads all lines of the backing file into {@code dataBuffer}. */
    private void loadDataBuffer() {
      try {
        this.dataBuffer = Files.readAllLines(Paths.get(this.path.toUri()));
      } catch (IOException e) {
        throw new RuntimeException("Read file " + this.path + " error", e);
      }
    }

    @Override
    public void notifyCheckpointComplete(long l) {
      // unblocks the wait loop in run() for the next batch
      this.currentCP.incrementAndGet();
    }
  }
}

View File

@@ -0,0 +1,317 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.examples.quickstart.utils;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.stream.Collectors;
import org.apache.flink.configuration.ConfigOption;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.table.api.DataTypes;
import org.apache.flink.table.api.TableSchema;
import org.apache.flink.table.catalog.ResolvedSchema;
import org.apache.flink.table.runtime.typeutils.RowDataSerializer;
import org.apache.flink.table.types.DataType;
import org.apache.flink.table.types.logical.RowType;
import org.apache.hudi.configuration.FlinkOptions;
import org.apache.hudi.examples.quickstart.factory.CollectSinkTableFactory;
import org.apache.hudi.examples.quickstart.factory.ContinuousFileSourceFactory;
import org.apache.hudi.streamer.FlinkStreamerConfig;
/**
 * Configurations for the test.
 *
 * <p>Utility holder for the quickstart schemas and DDL/configuration builders;
 * not instantiable.
 */
public class QuickstartConfigurations {
  private QuickstartConfigurations() {
  }

  /** Default schema: uuid (record key), name, age, ts (precombine field) and partition. */
  public static final DataType ROW_DATA_TYPE = DataTypes.ROW(
      DataTypes.FIELD("uuid", DataTypes.VARCHAR(20)), // record key
      DataTypes.FIELD("name", DataTypes.VARCHAR(10)),
      DataTypes.FIELD("age", DataTypes.INT()),
      DataTypes.FIELD("ts", DataTypes.TIMESTAMP(3)), // precombine field
      DataTypes.FIELD("partition", DataTypes.VARCHAR(10)))
      .notNull();

  public static final RowType ROW_TYPE = (RowType) ROW_DATA_TYPE.getLogicalType();

  public static final ResolvedSchema TABLE_SCHEMA = SchemaBuilder.instance()
      .fields(ROW_TYPE.getFieldNames(), ROW_DATA_TYPE.getChildren())
      .build();

  // "name type" field declarations derived from ROW_TYPE, used as the default DDL fields
  private static final List<String> FIELDS = ROW_TYPE.getFields().stream()
      .map(RowType.RowField::asSummaryString).collect(Collectors.toList());

  /** Wider variant of {@link #ROW_DATA_TYPE} with an extra salary column. */
  public static final DataType ROW_DATA_TYPE_WIDER = DataTypes.ROW(
      DataTypes.FIELD("uuid", DataTypes.VARCHAR(20)), // record key
      DataTypes.FIELD("name", DataTypes.VARCHAR(10)),
      DataTypes.FIELD("age", DataTypes.INT()),
      DataTypes.FIELD("salary", DataTypes.DOUBLE()),
      DataTypes.FIELD("ts", DataTypes.TIMESTAMP(3)), // precombine field
      DataTypes.FIELD("partition", DataTypes.VARCHAR(10)))
      .notNull();

  public static final RowType ROW_TYPE_WIDER = (RowType) ROW_DATA_TYPE_WIDER.getLogicalType();

  /** Builds a Hudi CREATE TABLE DDL with the default fields, pk {@code uuid} and partition field {@code partition}. */
  public static String getCreateHoodieTableDDL(String tableName, Map<String, String> options) {
    return getCreateHoodieTableDDL(tableName, options, true, "partition");
  }

  /** Builds a Hudi CREATE TABLE DDL with the default fields and pk {@code uuid}. */
  public static String getCreateHoodieTableDDL(
      String tableName,
      Map<String, String> options,
      boolean havePartition,
      String partitionField) {
    return getCreateHoodieTableDDL(tableName, FIELDS, options, havePartition, "uuid", partitionField);
  }

  /**
   * Builds a Hudi CREATE TABLE DDL.
   *
   * @param tableName      table name
   * @param fields         field declarations ("name type")
   * @param options        options for the WITH clause; a missing 'connector' defaults to 'hudi'
   * @param havePartition  whether to emit a PARTITIONED BY clause
   * @param pkField        primary-key field
   * @param partitionField partition field (ignored unless havePartition)
   * @return the DDL statement
   */
  public static String getCreateHoodieTableDDL(
      String tableName,
      List<String> fields,
      Map<String, String> options,
      boolean havePartition,
      String pkField,
      String partitionField) {
    StringBuilder builder = new StringBuilder();
    builder.append("create table ").append(tableName).append("(\n");
    for (String field : fields) {
      builder.append(" ").append(field).append(",\n");
    }
    builder.append(" PRIMARY KEY(").append(pkField).append(") NOT ENFORCED\n")
        .append(")\n");
    if (havePartition) {
      builder.append("PARTITIONED BY (`").append(partitionField).append("`)\n");
    }
    // read the connector without mutating the caller's map (computeIfAbsent would
    // insert the default and then emit a duplicate 'connector' key below)
    final String connector = options.getOrDefault("connector", "hudi");
    builder.append("with (\n"
        + " 'connector' = '").append(connector).append("'");
    options.forEach((k, v) -> {
      // 'connector' is already emitted above; skip it to avoid a duplicate option key
      if (!"connector".equals(k)) {
        builder.append(",\n")
            .append(" '").append(k).append("' = '").append(v).append("'");
      }
    });
    builder.append("\n)");
    return builder.toString();
  }

  /** Builds a CREATE CATALOG DDL for a Hudi catalog rooted at the given path. */
  public static String getCreateHudiCatalogDDL(final String catalogName, final String catalogPath) {
    StringBuilder builder = new StringBuilder();
    builder.append("create catalog ").append(catalogName).append(" with (\n");
    builder.append(" 'type' = 'hudi',\n"
        + " 'catalog.path' = '").append(catalogPath).append("'");
    builder.append("\n)");
    return builder.toString();
  }

  /** Builds a continuous-file-source DDL reading the classpath resource {@code source-file.json}. */
  public static String getFileSourceDDL(String tableName) {
    return getFileSourceDDL(tableName, "source-file.json");
  }

  /** Builds a continuous-file-source DDL with the given number of checkpoint batches. */
  public static String getFileSourceDDL(String tableName, int checkpoints) {
    return getFileSourceDDL(tableName, "source-file.json", checkpoints);
  }

  /** Builds a continuous-file-source DDL reading the given classpath resource (2 checkpoint batches). */
  public static String getFileSourceDDL(String tableName, String fileName) {
    return getFileSourceDDL(tableName, fileName, 2);
  }

  /**
   * Builds a continuous-file-source DDL.
   *
   * @param tableName   table name
   * @param fileName    classpath resource holding the JSON source data (must exist)
   * @param checkpoints number of checkpoint batches the data is split into
   */
  public static String getFileSourceDDL(String tableName, String fileName, int checkpoints) {
    String sourcePath = Objects.requireNonNull(Thread.currentThread()
        .getContextClassLoader().getResource(fileName)).toString();
    return "create table " + tableName + "(\n"
        + " uuid varchar(20),\n"
        + " name varchar(10),\n"
        + " age int,\n"
        + " ts timestamp(3),\n"
        + " `partition` varchar(20)\n"
        + ") with (\n"
        + " 'connector' = '" + ContinuousFileSourceFactory.FACTORY_ID + "',\n"
        + " 'path' = '" + sourcePath + "',\n"
        + " 'checkpoints' = '" + checkpoints + "'\n"
        + ")";
  }

  /** Builds a collect-sink DDL with the default quickstart schema. */
  public static String getCollectSinkDDL(String tableName) {
    return "create table " + tableName + "(\n"
        + " uuid varchar(20),\n"
        + " name varchar(10),\n"
        + " age int,\n"
        + " ts timestamp(3),\n"
        + " `partition` varchar(20)\n"
        + ") with (\n"
        + " 'connector' = '" + CollectSinkTableFactory.FACTORY_ID + "'"
        + ")";
  }

  /** Builds a collect-sink DDL whose columns mirror the given table schema. */
  public static String getCollectSinkDDL(String tableName, TableSchema tableSchema) {
    final StringBuilder builder = new StringBuilder("create table " + tableName + "(\n");
    String[] fieldNames = tableSchema.getFieldNames();
    DataType[] fieldTypes = tableSchema.getFieldDataTypes();
    for (int i = 0; i < fieldNames.length; i++) {
      builder.append(" `")
          .append(fieldNames[i])
          .append("` ")
          .append(fieldTypes[i].toString());
      if (i != fieldNames.length - 1) {
        builder.append(",");
      }
      builder.append("\n");
    }
    final String withProps = ""
        + ") with (\n"
        + " 'connector' = '" + CollectSinkTableFactory.FACTORY_ID + "'\n"
        + ")";
    builder.append(withProps);
    return builder.toString();
  }

  /** Builds a filesystem CSV source DDL reading the given classpath resource. */
  public static String getCsvSourceDDL(String tableName, String fileName) {
    String sourcePath = Objects.requireNonNull(Thread.currentThread()
        .getContextClassLoader().getResource(fileName)).toString();
    return "create table " + tableName + "(\n"
        + " uuid varchar(20),\n"
        + " name varchar(10),\n"
        + " age int,\n"
        + " ts timestamp(3),\n"
        + " `partition` varchar(20)\n"
        + ") with (\n"
        + " 'connector' = 'filesystem',\n"
        + " 'path' = '" + sourcePath + "',\n"
        + " 'format' = 'csv'\n"
        + ")";
  }

  /** Serializer for rows of {@link #ROW_TYPE}. */
  public static final RowDataSerializer SERIALIZER = new RowDataSerializer(ROW_TYPE);

  /** Creates a default Flink write configuration for a Hudi table at the given path. */
  public static Configuration getDefaultConf(String tablePath) {
    Configuration conf = new Configuration();
    conf.setString(FlinkOptions.PATH, tablePath);
    conf.setString(FlinkOptions.SOURCE_AVRO_SCHEMA_PATH,
        Objects.requireNonNull(Thread.currentThread()
            .getContextClassLoader().getResource("test_read_schema.avsc")).toString());
    conf.setString(FlinkOptions.TABLE_NAME, "TestHoodieTable");
    conf.setString(FlinkOptions.PARTITION_PATH_FIELD, "partition");
    return conf;
  }

  /** Creates a default streamer configuration targeting a COPY_ON_WRITE table at the given path. */
  public static FlinkStreamerConfig getDefaultStreamerConf(String tablePath) {
    FlinkStreamerConfig streamerConf = new FlinkStreamerConfig();
    streamerConf.targetBasePath = tablePath;
    streamerConf.sourceAvroSchemaPath = Objects.requireNonNull(Thread.currentThread()
        .getContextClassLoader().getResource("test_read_schema.avsc")).toString();
    streamerConf.targetTableName = "TestHoodieTable";
    streamerConf.partitionPathField = "partition";
    streamerConf.tableType = "COPY_ON_WRITE";
    streamerConf.checkpointInterval = 4000L;
    return streamerConf;
  }

  /**
   * Creates the tool to build hoodie table DDL.
   */
  public static Sql sql(String tableName) {
    return new Sql(tableName);
  }

  /** Creates the tool to build a Hudi catalog DDL. */
  public static Catalog catalog(String catalogName) {
    return new Catalog(catalogName);
  }

  // -------------------------------------------------------------------------
  //  Utilities
  // -------------------------------------------------------------------------

  /**
   * Tool to build hoodie table DDL with schema {@link #TABLE_SCHEMA}.
   */
  public static class Sql {
    private final Map<String, String> options;
    private final String tableName;
    private List<String> fields = new ArrayList<>();
    private boolean withPartition = true;
    private String pkField = "uuid";
    private String partitionField = "partition";

    public Sql(String tableName) {
      options = new HashMap<>();
      this.tableName = tableName;
    }

    /** Adds a WITH-clause option from a typed config option. */
    public Sql option(ConfigOption<?> option, Object val) {
      this.options.put(option.key(), val.toString());
      return this;
    }

    /** Adds a WITH-clause option from a raw key. */
    public Sql option(String key, Object val) {
      this.options.put(key, val.toString());
      return this;
    }

    /** Adds all given WITH-clause options. */
    public Sql options(Map<String, String> options) {
      this.options.putAll(options);
      return this;
    }

    /** Disables the PARTITIONED BY clause. */
    public Sql noPartition() {
      this.withPartition = false;
      return this;
    }

    /** Overrides the primary-key field (default {@code uuid}). */
    public Sql pkField(String pkField) {
      this.pkField = pkField;
      return this;
    }

    /** Overrides the partition field (default {@code partition}). */
    public Sql partitionField(String partitionField) {
      this.partitionField = partitionField;
      return this;
    }

    /** Appends a custom field declaration ("name type"); overrides the default schema. */
    public Sql field(String fieldSchema) {
      fields.add(fieldSchema);
      return this;
    }

    /** Renders the CREATE TABLE statement. */
    public String end() {
      if (this.fields.isEmpty()) {
        // defensive copy so later field() calls cannot mutate the shared default list
        this.fields = new ArrayList<>(FIELDS);
      }
      return QuickstartConfigurations.getCreateHoodieTableDDL(this.tableName, this.fields, options,
          this.withPartition, this.pkField, this.partitionField);
    }
  }

  /**
   * Tool to build a Hudi catalog DDL.
   */
  public static class Catalog {
    private final String catalogName;
    private String catalogPath = ".";

    public Catalog(String catalogName) {
      this.catalogName = catalogName;
    }

    /** Overrides the catalog base path (default {@code .}). */
    public Catalog catalogPath(String catalogPath) {
      this.catalogPath = catalogPath;
      return this;
    }

    /** Renders the CREATE CATALOG statement. */
    public String end() {
      return QuickstartConfigurations.getCreateHudiCatalogDDL(catalogName, catalogPath);
    }
  }
}

View File

@@ -0,0 +1,71 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.examples.quickstart.utils;
import org.apache.flink.table.catalog.Column;
import org.apache.flink.table.catalog.ResolvedSchema;
import org.apache.flink.table.catalog.UniqueConstraint;
import org.apache.flink.table.catalog.WatermarkSpec;
import org.apache.flink.table.types.DataType;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
/**
 * Builder for {@link ResolvedSchema}.
 *
 * <p>Accumulates physical columns and an optional primary-key constraint,
 * then materializes them via {@link #build()}. Not thread-safe.
 */
public class SchemaBuilder {
  // Both lists are only assigned in the constructor, so they can be final.
  private final List<Column> columns;
  private final List<WatermarkSpec> watermarkSpecs;
  private UniqueConstraint constraint;

  /** Creates a fresh builder with no columns and no constraint. */
  public static SchemaBuilder instance() {
    return new SchemaBuilder();
  }

  private SchemaBuilder() {
    this.columns = new ArrayList<>();
    this.watermarkSpecs = new ArrayList<>();
  }

  /** Appends a single physical column with the given name and type. */
  public SchemaBuilder field(String name, DataType type) {
    this.columns.add(Column.physical(name, type));
    return this;
  }

  /**
   * Appends physical columns pairing {@code names} with {@code types} by index.
   * The two lists are expected to have the same size.
   */
  public SchemaBuilder fields(List<String> names, List<DataType> types) {
    // Renamed the local so it no longer shadows the 'columns' field.
    List<Column> newColumns = IntStream.range(0, names.size())
        .mapToObj(idx -> Column.physical(names.get(idx), types.get(idx)))
        .collect(Collectors.toList());
    this.columns.addAll(newColumns);
    return this;
  }

  /** Declares the primary-key constraint (named {@code pk}) over the given columns. */
  public SchemaBuilder primaryKey(String... columns) {
    this.constraint = UniqueConstraint.primaryKey("pk", Arrays.asList(columns));
    return this;
  }

  /**
   * Builds the {@link ResolvedSchema}. The constraint is null when
   * {@link #primaryKey(String...)} was never called.
   */
  public ResolvedSchema build() {
    return new ResolvedSchema(columns, watermarkSpecs, constraint);
  }
}

View File

@@ -0,0 +1,18 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
org.apache.hudi.examples.quickstart.factory.ContinuousFileSourceFactory
org.apache.hudi.examples.quickstart.factory.CollectSinkTableFactory

View File

@@ -0,0 +1,8 @@
{"uuid": "id1", "name": "Danny", "age": 23, "ts": "1970-01-01T00:00:01", "partition": "par1"}
{"uuid": "id2", "name": "Stephen", "age": 33, "ts": "1970-01-01T00:00:02", "partition": "par1"}
{"uuid": "id3", "name": "Julian", "age": 53, "ts": "1970-01-01T00:00:03", "partition": "par2"}
{"uuid": "id4", "name": "Fabian", "age": 31, "ts": "1970-01-01T00:00:04", "partition": "par2"}
{"uuid": "id5", "name": "Sophia", "age": 18, "ts": "1970-01-01T00:00:05", "partition": "par3"}
{"uuid": "id6", "name": "Emma", "age": 20, "ts": "1970-01-01T00:00:06", "partition": "par3"}
{"uuid": "id7", "name": "Bob", "age": 44, "ts": "1970-01-01T00:00:07", "partition": "par4"}
{"uuid": "id8", "name": "Han", "age": 56, "ts": "1970-01-01T00:00:08", "partition": "par4"}