[HUDI-1692] Bounded source for stream writer (#2674)
Supports bounded sources such as VALUES for the stream mode writer.
This commit is contained in:
@@ -182,7 +182,7 @@ public class StreamWriteFunction<K, I, O>
|
||||
// it would check the validity.
|
||||
this.onCheckpointing = true;
|
||||
// wait for the buffer data flush out and request a new instant
|
||||
flushBuffer(true);
|
||||
flushBuffer(true, false);
|
||||
// signal the task thread to start buffering
|
||||
addToBufferCondition.signal();
|
||||
} finally {
|
||||
@@ -221,7 +221,7 @@ public class StreamWriteFunction<K, I, O>
|
||||
* End input action for batch source.
|
||||
*/
|
||||
public void endInput() {
|
||||
flushBuffer(true);
|
||||
flushBuffer(true, true);
|
||||
this.writeClient.cleanHandles();
|
||||
}
|
||||
|
||||
@@ -333,13 +333,13 @@ public class StreamWriteFunction<K, I, O>
|
||||
private void flushBufferOnCondition(I value) {
|
||||
boolean needFlush = this.detector.detect(value);
|
||||
if (needFlush) {
|
||||
flushBuffer(false);
|
||||
flushBuffer(false, false);
|
||||
this.detector.reset();
|
||||
}
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked, rawtypes")
|
||||
private void flushBuffer(boolean isFinalBatch) {
|
||||
private void flushBuffer(boolean isLastBatch, boolean isEndInput) {
|
||||
this.currentInstant = this.writeClient.getInflightAndRequestedInstant(this.config.get(FlinkOptions.TABLE_TYPE));
|
||||
if (this.currentInstant == null) {
|
||||
// in case there are empty checkpoints that have no input data
|
||||
@@ -364,8 +364,14 @@ public class StreamWriteFunction<K, I, O>
|
||||
LOG.info("No data to write in subtask [{}] for instant [{}]", taskID, currentInstant);
|
||||
writeStatus = Collections.emptyList();
|
||||
}
|
||||
this.eventGateway.sendEventToCoordinator(
|
||||
new BatchWriteSuccessEvent(this.taskID, currentInstant, writeStatus, isFinalBatch));
|
||||
final BatchWriteSuccessEvent event = BatchWriteSuccessEvent.builder()
|
||||
.taskID(taskID)
|
||||
.instantTime(currentInstant)
|
||||
.writeStatus(writeStatus)
|
||||
.isLastBatch(isLastBatch)
|
||||
.isEndInput(isEndInput)
|
||||
.build();
|
||||
this.eventGateway.sendEventToCoordinator(event);
|
||||
this.buffer.clear();
|
||||
this.currentInstant = "";
|
||||
}
|
||||
|
||||
@@ -102,11 +102,6 @@ public class StreamWriteOperatorCoordinator
|
||||
*/
|
||||
private final int parallelism;
|
||||
|
||||
/**
|
||||
* Whether the coordinator executes with the bounded data set.
|
||||
*/
|
||||
private final boolean isBounded;
|
||||
|
||||
/**
|
||||
* Whether to schedule a compaction task on finished checkpoints.
|
||||
*/
|
||||
@@ -117,16 +112,13 @@ public class StreamWriteOperatorCoordinator
|
||||
*
|
||||
* @param conf The config options
|
||||
* @param parallelism The operator task number
|
||||
* @param isBounded Whether the input data source is bounded
|
||||
*/
|
||||
public StreamWriteOperatorCoordinator(
|
||||
Configuration conf,
|
||||
int parallelism,
|
||||
boolean isBounded) {
|
||||
int parallelism) {
|
||||
this.conf = conf;
|
||||
this.parallelism = parallelism;
|
||||
this.needsScheduleCompaction = StreamerUtil.needsScheduleCompaction(conf);
|
||||
this.isBounded = isBounded;
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -143,11 +135,6 @@ public class StreamWriteOperatorCoordinator
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
if (isBounded) {
|
||||
// start to commit the instant.
|
||||
checkAndCommitWithRetry();
|
||||
// no compaction scheduling for batch mode
|
||||
}
|
||||
// teardown the resource
|
||||
if (writeClient != null) {
|
||||
writeClient.close();
|
||||
@@ -216,6 +203,11 @@ public class StreamWriteOperatorCoordinator
|
||||
} else {
|
||||
this.eventBuffer[event.getTaskID()] = event;
|
||||
}
|
||||
if (event.isEndInput() && checkReady()) {
|
||||
// start to commit the instant.
|
||||
doCommit();
|
||||
// no compaction scheduling for batch mode
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -424,12 +416,10 @@ public class StreamWriteOperatorCoordinator
|
||||
public static class Provider implements OperatorCoordinator.Provider {
|
||||
private final OperatorID operatorId;
|
||||
private final Configuration conf;
|
||||
private final boolean isBounded;
|
||||
|
||||
public Provider(OperatorID operatorId, Configuration conf, boolean isBounded) {
|
||||
public Provider(OperatorID operatorId, Configuration conf) {
|
||||
this.operatorId = operatorId;
|
||||
this.conf = conf;
|
||||
this.isBounded = isBounded;
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -439,7 +429,7 @@ public class StreamWriteOperatorCoordinator
|
||||
|
||||
@Override
|
||||
public OperatorCoordinator create(Context context) {
|
||||
return new StreamWriteOperatorCoordinator(this.conf, context.currentParallelism(), isBounded);
|
||||
return new StreamWriteOperatorCoordinator(this.conf, context.currentParallelism());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -39,20 +39,11 @@ public class StreamWriteOperatorFactory<I>
|
||||
|
||||
private final StreamWriteOperator<I> operator;
|
||||
private final Configuration conf;
|
||||
private final boolean isBounded;
|
||||
|
||||
public StreamWriteOperatorFactory(
|
||||
Configuration conf) {
|
||||
this(conf, false);
|
||||
}
|
||||
|
||||
public StreamWriteOperatorFactory(
|
||||
Configuration conf,
|
||||
boolean isBounded) {
|
||||
public StreamWriteOperatorFactory(Configuration conf) {
|
||||
super(new StreamWriteOperator<>(conf));
|
||||
this.operator = (StreamWriteOperator<I>) getOperator();
|
||||
this.conf = conf;
|
||||
this.isBounded = isBounded;
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -70,7 +61,7 @@ public class StreamWriteOperatorFactory<I>
|
||||
|
||||
@Override
|
||||
public OperatorCoordinator.Provider getCoordinatorProvider(String s, OperatorID operatorID) {
|
||||
return new StreamWriteOperatorCoordinator.Provider(operatorID, this.conf, isBounded);
|
||||
return new StreamWriteOperatorCoordinator.Provider(operatorID, this.conf);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
||||
@@ -25,6 +25,7 @@ import org.apache.flink.runtime.operators.coordination.OperatorEvent;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
|
||||
/**
|
||||
* An operator event to mark successful checkpoint batch write.
|
||||
@@ -36,13 +37,13 @@ public class BatchWriteSuccessEvent implements OperatorEvent {
|
||||
private final int taskID;
|
||||
private final String instantTime;
|
||||
private boolean isLastBatch;
|
||||
|
||||
public BatchWriteSuccessEvent(
|
||||
int taskID,
|
||||
String instantTime,
|
||||
List<WriteStatus> writeStatuses) {
|
||||
this(taskID, instantTime, writeStatuses, false);
|
||||
}
|
||||
/**
|
||||
* Flag saying whether the event comes from the end of input, e.g. the source
|
||||
* is bounded. There are two cases in which this flag should be set to true:
|
||||
* 1. batch execution mode
|
||||
* 2. bounded stream source such as VALUES
|
||||
*/
|
||||
private final boolean isEndInput;
|
||||
|
||||
/**
|
||||
* Creates an event.
|
||||
@@ -55,15 +56,24 @@ public class BatchWriteSuccessEvent implements OperatorEvent {
|
||||
* if true, the whole data set of the checkpoint
|
||||
* has been flushed successfully
|
||||
*/
|
||||
public BatchWriteSuccessEvent(
|
||||
private BatchWriteSuccessEvent(
|
||||
int taskID,
|
||||
String instantTime,
|
||||
List<WriteStatus> writeStatuses,
|
||||
boolean isLastBatch) {
|
||||
boolean isLastBatch,
|
||||
boolean isEndInput) {
|
||||
this.taskID = taskID;
|
||||
this.instantTime = instantTime;
|
||||
this.writeStatuses = new ArrayList<>(writeStatuses);
|
||||
this.isLastBatch = isLastBatch;
|
||||
this.isEndInput = isEndInput;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the builder for {@link BatchWriteSuccessEvent}.
|
||||
*/
|
||||
public static Builder builder() {
|
||||
return new Builder();
|
||||
}
|
||||
|
||||
public List<WriteStatus> getWriteStatuses() {
|
||||
@@ -82,6 +92,10 @@ public class BatchWriteSuccessEvent implements OperatorEvent {
|
||||
return isLastBatch;
|
||||
}
|
||||
|
||||
public boolean isEndInput() {
|
||||
return isEndInput;
|
||||
}
|
||||
|
||||
/**
|
||||
* Merges this event with given {@link BatchWriteSuccessEvent} {@code other}.
|
||||
*
|
||||
@@ -101,4 +115,51 @@ public class BatchWriteSuccessEvent implements OperatorEvent {
|
||||
public boolean isReady(String currentInstant) {
|
||||
return isLastBatch && this.instantTime.equals(currentInstant);
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Builder
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Builder for {@link BatchWriteSuccessEvent}.
|
||||
*/
|
||||
public static class Builder {
|
||||
private List<WriteStatus> writeStatus;
|
||||
private Integer taskID;
|
||||
private String instantTime;
|
||||
private boolean isLastBatch = false;
|
||||
private boolean isEndInput = false;
|
||||
|
||||
public BatchWriteSuccessEvent build() {
|
||||
Objects.requireNonNull(taskID);
|
||||
Objects.requireNonNull(instantTime);
|
||||
Objects.requireNonNull(writeStatus);
|
||||
return new BatchWriteSuccessEvent(taskID, instantTime, writeStatus, isLastBatch, isEndInput);
|
||||
}
|
||||
|
||||
public Builder taskID(int taskID) {
|
||||
this.taskID = taskID;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder instantTime(String instantTime) {
|
||||
this.instantTime = instantTime;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder writeStatus(List<WriteStatus> writeStatus) {
|
||||
this.writeStatus = writeStatus;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder isLastBatch(boolean isLastBatch) {
|
||||
this.isLastBatch = isLastBatch;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder isEndInput(boolean isEndInput) {
|
||||
this.isEndInput = isEndInput;
|
||||
return this;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -69,7 +69,7 @@ public class HoodieTableFactory implements TableSourceFactory<RowData>, TableSin
|
||||
conf.setString(FlinkOptions.PARTITION_PATH_FIELD, String.join(",", context.getTable().getPartitionKeys()));
|
||||
TableSchema tableSchema = TableSchemaUtils.getPhysicalSchema(context.getTable().getSchema());
|
||||
inferAvroSchema(conf, tableSchema.toRowDataType().notNull().getLogicalType());
|
||||
return new HoodieTableSink(conf, tableSchema, context.isBounded());
|
||||
return new HoodieTableSink(conf, tableSchema);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
||||
@@ -54,12 +54,10 @@ public class HoodieTableSink implements AppendStreamTableSink<RowData>, Partitio
|
||||
|
||||
private final Configuration conf;
|
||||
private final TableSchema schema;
|
||||
private final boolean isBounded;
|
||||
|
||||
public HoodieTableSink(Configuration conf, TableSchema schema, boolean isBounded) {
|
||||
public HoodieTableSink(Configuration conf, TableSchema schema) {
|
||||
this.conf = conf;
|
||||
this.schema = schema;
|
||||
this.isBounded = isBounded;
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -67,7 +65,7 @@ public class HoodieTableSink implements AppendStreamTableSink<RowData>, Partitio
|
||||
// Read from kafka source
|
||||
RowType rowType = (RowType) this.schema.toRowDataType().notNull().getLogicalType();
|
||||
int numWriteTasks = this.conf.getInteger(FlinkOptions.WRITE_TASKS);
|
||||
StreamWriteOperatorFactory<HoodieRecord> operatorFactory = new StreamWriteOperatorFactory<>(conf, isBounded);
|
||||
StreamWriteOperatorFactory<HoodieRecord> operatorFactory = new StreamWriteOperatorFactory<>(conf);
|
||||
|
||||
DataStream<Object> pipeline = dataStream
|
||||
.map(new RowDataToHoodieFunction<>(rowType, conf), TypeInformation.of(HoodieRecord.class))
|
||||
|
||||
Reference in New Issue
Block a user