1
0

[HUDI-1692] Bounded source for stream writer (#2674)

Supports bounded sources such as VALUES for the stream-mode writer.
This commit is contained in:
Danny Chan
2021-03-15 19:42:36 +08:00
committed by GitHub
parent fc6c5f4285
commit 76bf2cc790
9 changed files with 135 additions and 62 deletions

View File

@@ -182,7 +182,7 @@ public class StreamWriteFunction<K, I, O>
// it would check the validity.
this.onCheckpointing = true;
// wait for the buffer data flush out and request a new instant
flushBuffer(true);
flushBuffer(true, false);
// signal the task thread to start buffering
addToBufferCondition.signal();
} finally {
@@ -221,7 +221,7 @@ public class StreamWriteFunction<K, I, O>
* End input action for batch source.
*/
public void endInput() {
flushBuffer(true);
flushBuffer(true, true);
this.writeClient.cleanHandles();
}
@@ -333,13 +333,13 @@ public class StreamWriteFunction<K, I, O>
private void flushBufferOnCondition(I value) {
boolean needFlush = this.detector.detect(value);
if (needFlush) {
flushBuffer(false);
flushBuffer(false, false);
this.detector.reset();
}
}
@SuppressWarnings("unchecked, rawtypes")
private void flushBuffer(boolean isFinalBatch) {
private void flushBuffer(boolean isLastBatch, boolean isEndInput) {
this.currentInstant = this.writeClient.getInflightAndRequestedInstant(this.config.get(FlinkOptions.TABLE_TYPE));
if (this.currentInstant == null) {
// in case there are empty checkpoints that have no input data
@@ -364,8 +364,14 @@ public class StreamWriteFunction<K, I, O>
LOG.info("No data to write in subtask [{}] for instant [{}]", taskID, currentInstant);
writeStatus = Collections.emptyList();
}
this.eventGateway.sendEventToCoordinator(
new BatchWriteSuccessEvent(this.taskID, currentInstant, writeStatus, isFinalBatch));
final BatchWriteSuccessEvent event = BatchWriteSuccessEvent.builder()
.taskID(taskID)
.instantTime(currentInstant)
.writeStatus(writeStatus)
.isLastBatch(isLastBatch)
.isEndInput(isEndInput)
.build();
this.eventGateway.sendEventToCoordinator(event);
this.buffer.clear();
this.currentInstant = "";
}

View File

@@ -102,11 +102,6 @@ public class StreamWriteOperatorCoordinator
*/
private final int parallelism;
/**
* Whether the coordinator executes with the bounded data set.
*/
private final boolean isBounded;
/**
* Whether needs to schedule compaction task on finished checkpoints.
*/
@@ -117,16 +112,13 @@ public class StreamWriteOperatorCoordinator
*
* @param conf The config options
* @param parallelism The operator task number
* @param isBounded Whether the input data source is bounded
*/
public StreamWriteOperatorCoordinator(
Configuration conf,
int parallelism,
boolean isBounded) {
int parallelism) {
this.conf = conf;
this.parallelism = parallelism;
this.needsScheduleCompaction = StreamerUtil.needsScheduleCompaction(conf);
this.isBounded = isBounded;
}
@Override
@@ -143,11 +135,6 @@ public class StreamWriteOperatorCoordinator
@Override
public void close() {
if (isBounded) {
// start to commit the instant.
checkAndCommitWithRetry();
// no compaction scheduling for batch mode
}
// teardown the resource
if (writeClient != null) {
writeClient.close();
@@ -216,6 +203,11 @@ public class StreamWriteOperatorCoordinator
} else {
this.eventBuffer[event.getTaskID()] = event;
}
if (event.isEndInput() && checkReady()) {
// start to commit the instant.
doCommit();
// no compaction scheduling for batch mode
}
}
@Override
@@ -424,12 +416,10 @@ public class StreamWriteOperatorCoordinator
public static class Provider implements OperatorCoordinator.Provider {
private final OperatorID operatorId;
private final Configuration conf;
private final boolean isBounded;
public Provider(OperatorID operatorId, Configuration conf, boolean isBounded) {
public Provider(OperatorID operatorId, Configuration conf) {
this.operatorId = operatorId;
this.conf = conf;
this.isBounded = isBounded;
}
@Override
@@ -439,7 +429,7 @@ public class StreamWriteOperatorCoordinator
@Override
public OperatorCoordinator create(Context context) {
return new StreamWriteOperatorCoordinator(this.conf, context.currentParallelism(), isBounded);
return new StreamWriteOperatorCoordinator(this.conf, context.currentParallelism());
}
}
}

View File

@@ -39,20 +39,11 @@ public class StreamWriteOperatorFactory<I>
private final StreamWriteOperator<I> operator;
private final Configuration conf;
private final boolean isBounded;
public StreamWriteOperatorFactory(
Configuration conf) {
this(conf, false);
}
public StreamWriteOperatorFactory(
Configuration conf,
boolean isBounded) {
public StreamWriteOperatorFactory(Configuration conf) {
super(new StreamWriteOperator<>(conf));
this.operator = (StreamWriteOperator<I>) getOperator();
this.conf = conf;
this.isBounded = isBounded;
}
@Override
@@ -70,7 +61,7 @@ public class StreamWriteOperatorFactory<I>
@Override
public OperatorCoordinator.Provider getCoordinatorProvider(String s, OperatorID operatorID) {
return new StreamWriteOperatorCoordinator.Provider(operatorID, this.conf, isBounded);
return new StreamWriteOperatorCoordinator.Provider(operatorID, this.conf);
}
@Override

View File

@@ -25,6 +25,7 @@ import org.apache.flink.runtime.operators.coordination.OperatorEvent;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
/**
* An operator event to mark successful checkpoint batch write.
@@ -36,13 +37,13 @@ public class BatchWriteSuccessEvent implements OperatorEvent {
private final int taskID;
private final String instantTime;
private boolean isLastBatch;
public BatchWriteSuccessEvent(
int taskID,
String instantTime,
List<WriteStatus> writeStatuses) {
this(taskID, instantTime, writeStatuses, false);
}
/**
* Flag saying whether the event comes from the end of input (i.e. the source
* is bounded). There are two cases in which this flag should be set to true:
* 1. batch execution mode
* 2. bounded stream source such as VALUES
*/
private final boolean isEndInput;
/**
* Creates an event.
@@ -55,15 +56,24 @@ public class BatchWriteSuccessEvent implements OperatorEvent {
* if true, the whole data set of the checkpoint
* has been flushed successfully
*/
public BatchWriteSuccessEvent(
private BatchWriteSuccessEvent(
int taskID,
String instantTime,
List<WriteStatus> writeStatuses,
boolean isLastBatch) {
boolean isLastBatch,
boolean isEndInput) {
this.taskID = taskID;
this.instantTime = instantTime;
this.writeStatuses = new ArrayList<>(writeStatuses);
this.isLastBatch = isLastBatch;
this.isEndInput = isEndInput;
}
/**
* Returns the builder for {@link BatchWriteSuccessEvent}.
*/
public static Builder builder() {
return new Builder();
}
public List<WriteStatus> getWriteStatuses() {
@@ -82,6 +92,10 @@ public class BatchWriteSuccessEvent implements OperatorEvent {
return isLastBatch;
}
public boolean isEndInput() {
return isEndInput;
}
/**
* Merges this event with given {@link BatchWriteSuccessEvent} {@code other}.
*
@@ -101,4 +115,51 @@ public class BatchWriteSuccessEvent implements OperatorEvent {
public boolean isReady(String currentInstant) {
return isLastBatch && this.instantTime.equals(currentInstant);
}
// -------------------------------------------------------------------------
// Builder
// -------------------------------------------------------------------------
/**
* Builder for {@link BatchWriteSuccessEvent}.
*/
public static class Builder {
private List<WriteStatus> writeStatus;
private Integer taskID;
private String instantTime;
private boolean isLastBatch = false;
private boolean isEndInput = false;
public BatchWriteSuccessEvent build() {
Objects.requireNonNull(taskID);
Objects.requireNonNull(instantTime);
Objects.requireNonNull(writeStatus);
return new BatchWriteSuccessEvent(taskID, instantTime, writeStatus, isLastBatch, isEndInput);
}
public Builder taskID(int taskID) {
this.taskID = taskID;
return this;
}
public Builder instantTime(String instantTime) {
this.instantTime = instantTime;
return this;
}
public Builder writeStatus(List<WriteStatus> writeStatus) {
this.writeStatus = writeStatus;
return this;
}
public Builder isLastBatch(boolean isLastBatch) {
this.isLastBatch = isLastBatch;
return this;
}
public Builder isEndInput(boolean isEndInput) {
this.isEndInput = isEndInput;
return this;
}
}
}

View File

@@ -69,7 +69,7 @@ public class HoodieTableFactory implements TableSourceFactory<RowData>, TableSin
conf.setString(FlinkOptions.PARTITION_PATH_FIELD, String.join(",", context.getTable().getPartitionKeys()));
TableSchema tableSchema = TableSchemaUtils.getPhysicalSchema(context.getTable().getSchema());
inferAvroSchema(conf, tableSchema.toRowDataType().notNull().getLogicalType());
return new HoodieTableSink(conf, tableSchema, context.isBounded());
return new HoodieTableSink(conf, tableSchema);
}
@Override

View File

@@ -54,12 +54,10 @@ public class HoodieTableSink implements AppendStreamTableSink<RowData>, Partitio
private final Configuration conf;
private final TableSchema schema;
private final boolean isBounded;
public HoodieTableSink(Configuration conf, TableSchema schema, boolean isBounded) {
public HoodieTableSink(Configuration conf, TableSchema schema) {
this.conf = conf;
this.schema = schema;
this.isBounded = isBounded;
}
@Override
@@ -67,7 +65,7 @@ public class HoodieTableSink implements AppendStreamTableSink<RowData>, Partitio
// Read from kafka source
RowType rowType = (RowType) this.schema.toRowDataType().notNull().getLogicalType();
int numWriteTasks = this.conf.getInteger(FlinkOptions.WRITE_TASKS);
StreamWriteOperatorFactory<HoodieRecord> operatorFactory = new StreamWriteOperatorFactory<>(conf, isBounded);
StreamWriteOperatorFactory<HoodieRecord> operatorFactory = new StreamWriteOperatorFactory<>(conf);
DataStream<Object> pipeline = dataStream
.map(new RowDataToHoodieFunction<>(rowType, conf), TypeInformation.of(HoodieRecord.class))