1
0

[MINOR] Some cosmetic changes for Flink (#3503)

This commit is contained in:
Danny Chan
2021-08-19 23:21:20 +08:00
committed by GitHub
parent 7dddd54406
commit 9762e4c08c
6 changed files with 184 additions and 108 deletions

View File

@@ -148,7 +148,7 @@ public class BootstrapFunction<I, O extends HoodieRecord>
}
/**
* Load all the indices of give partition path into the backup state.
* Loads all the indices of the given partition path into the backup state.
*
* @param partitionPath The partition path
*/

View File

@@ -29,7 +29,15 @@ import java.util.HashSet;
import java.util.Set;
/**
* The function to load specify partition index from existing hoodieTable.
* The function to load the index from an existing hoodieTable.
*
* <p>This function should only be used for a bounded source.
*
* <p>When a record comes in, the function first checks whether the partition path of the record is already loaded.
* If the partition is not loaded yet, it loads the entire partition and sends the index records to downstream operators
* before it sends the input record; if the partition is already loaded, it sends the input record directly.
*
* <p>The input records should be shuffled by the partition path to avoid repeated loading.
*/
public class BatchBootstrapFunction<I, O extends HoodieRecord>
extends BootstrapFunction<I, O> {
@@ -61,5 +69,4 @@ public class BatchBootstrapFunction<I, O extends HoodieRecord>
// send the trigger record
out.collect((O) value);
}
}

View File

@@ -85,7 +85,22 @@ public class Pipelines {
.name("clean_commits");
}
public static DataStream<HoodieRecord> bootstrap(Configuration conf, RowType rowType, int defaultParallelism, DataStream<RowData> dataStream) {
/**
 * Builds the bootstrap stream of {@link HoodieRecord}s from the input row data stream.
 *
 * <p>This is a dispatcher only: it picks {@code boundedBootstrap} when {@code bounded}
 * is {@code true} and {@code streamBootstrap} otherwise, forwarding all the other
 * arguments unchanged.
 *
 * @param conf               The configuration, passed through to the selected implementation
 * @param rowType            The row type of the input stream, passed through
 * @param defaultParallelism The default parallelism, passed through
 * @param dataStream         The input stream of row data
 * @param bounded            Whether to use the bounded variant of the bootstrap
 * @return the stream of hoodie records produced by the selected implementation
 */
public static DataStream<HoodieRecord> bootstrap(
    Configuration conf,
    RowType rowType,
    int defaultParallelism,
    DataStream<RowData> dataStream,
    boolean bounded) {
  if (bounded) {
    return boundedBootstrap(conf, rowType, defaultParallelism, dataStream);
  }
  return streamBootstrap(conf, rowType, defaultParallelism, dataStream);
}
private static DataStream<HoodieRecord> streamBootstrap(
Configuration conf,
RowType rowType,
int defaultParallelism,
DataStream<RowData> dataStream) {
DataStream<HoodieRecord> dataStream1 = rowDataToHoodieRecord(conf, rowType, dataStream);
if (conf.getBoolean(FlinkOptions.INDEX_BOOTSTRAP_ENABLED)) {
@@ -101,8 +116,11 @@ public class Pipelines {
return dataStream1;
}
public static DataStream<HoodieRecord> batchBootstrap(Configuration conf, RowType rowType, int defaultParallelism, DataStream<RowData> dataStream) {
// shuffle and sort by partition keys
private static DataStream<HoodieRecord> boundedBootstrap(
Configuration conf,
RowType rowType,
int defaultParallelism,
DataStream<RowData> dataStream) {
final String[] partitionFields = FilePathUtils.extractPartitionKeys(conf);
if (partitionFields.length > 0) {
RowDataKeyGen rowDataKeyGen = RowDataKeyGen.instance(conf, rowType);

View File

@@ -77,9 +77,9 @@ public class HoodieTableSink implements DynamicTableSink, SupportsPartitioning,
// default parallelism
int parallelism = dataStream.getExecutionConfig().getParallelism();
final DataStream<HoodieRecord> dataStream1 = context.isBounded()
? Pipelines.batchBootstrap(conf, rowType, parallelism, dataStream)
: Pipelines.bootstrap(conf, rowType, parallelism, dataStream);
// bootstrap
final DataStream<HoodieRecord> dataStream1 = Pipelines.bootstrap(conf, rowType, parallelism, dataStream, context.isBounded());
// write pipeline
DataStream<Object> pipeline = Pipelines.hoodieStreamWrite(conf, parallelism, dataStream1);