[HUDI-2342] Optimize Bootstrap operator (#3516)
Co-authored-by: 喻兆靖 <yuzhaojing@bilibili.com>
This commit is contained in:
@@ -19,6 +19,7 @@
|
|||||||
package org.apache.hudi.sink.bootstrap;
|
package org.apache.hudi.sink.bootstrap;
|
||||||
|
|
||||||
import org.apache.hudi.client.FlinkTaskContextSupplier;
|
import org.apache.hudi.client.FlinkTaskContextSupplier;
|
||||||
|
import org.apache.hudi.client.HoodieFlinkWriteClient;
|
||||||
import org.apache.hudi.client.common.HoodieFlinkEngineContext;
|
import org.apache.hudi.client.common.HoodieFlinkEngineContext;
|
||||||
import org.apache.hudi.common.config.SerializableConfiguration;
|
import org.apache.hudi.common.config.SerializableConfiguration;
|
||||||
import org.apache.hudi.common.fs.FSUtils;
|
import org.apache.hudi.common.fs.FSUtils;
|
||||||
@@ -26,15 +27,19 @@ import org.apache.hudi.common.model.FileSlice;
|
|||||||
import org.apache.hudi.common.model.HoodieKey;
|
import org.apache.hudi.common.model.HoodieKey;
|
||||||
import org.apache.hudi.common.model.HoodieRecord;
|
import org.apache.hudi.common.model.HoodieRecord;
|
||||||
import org.apache.hudi.common.model.HoodieRecordGlobalLocation;
|
import org.apache.hudi.common.model.HoodieRecordGlobalLocation;
|
||||||
|
import org.apache.hudi.common.model.HoodieTableType;
|
||||||
|
import org.apache.hudi.common.model.WriteOperationType;
|
||||||
import org.apache.hudi.common.table.TableSchemaResolver;
|
import org.apache.hudi.common.table.TableSchemaResolver;
|
||||||
import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner;
|
import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner;
|
||||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||||
|
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||||
import org.apache.hudi.common.util.BaseFileUtils;
|
import org.apache.hudi.common.util.BaseFileUtils;
|
||||||
|
import org.apache.hudi.common.util.CommitUtils;
|
||||||
import org.apache.hudi.common.util.Option;
|
import org.apache.hudi.common.util.Option;
|
||||||
|
import org.apache.hudi.common.util.StringUtils;
|
||||||
import org.apache.hudi.config.HoodieWriteConfig;
|
import org.apache.hudi.config.HoodieWriteConfig;
|
||||||
import org.apache.hudi.configuration.FlinkOptions;
|
import org.apache.hudi.configuration.FlinkOptions;
|
||||||
import org.apache.hudi.exception.HoodieException;
|
import org.apache.hudi.exception.HoodieException;
|
||||||
import org.apache.hudi.sink.bootstrap.aggregate.BootstrapAggFunction;
|
|
||||||
import org.apache.hudi.table.HoodieFlinkTable;
|
import org.apache.hudi.table.HoodieFlinkTable;
|
||||||
import org.apache.hudi.table.HoodieTable;
|
import org.apache.hudi.table.HoodieTable;
|
||||||
import org.apache.hudi.table.format.FormatUtils;
|
import org.apache.hudi.table.format.FormatUtils;
|
||||||
@@ -42,35 +47,40 @@ import org.apache.hudi.util.StreamerUtil;
|
|||||||
|
|
||||||
import org.apache.avro.Schema;
|
import org.apache.avro.Schema;
|
||||||
import org.apache.flink.annotation.VisibleForTesting;
|
import org.apache.flink.annotation.VisibleForTesting;
|
||||||
|
import org.apache.flink.api.common.state.ListState;
|
||||||
|
import org.apache.flink.api.common.state.ListStateDescriptor;
|
||||||
|
import org.apache.flink.api.common.typeinfo.Types;
|
||||||
import org.apache.flink.configuration.Configuration;
|
import org.apache.flink.configuration.Configuration;
|
||||||
import org.apache.flink.runtime.state.KeyGroupRangeAssignment;
|
import org.apache.flink.runtime.state.KeyGroupRangeAssignment;
|
||||||
import org.apache.flink.runtime.taskexecutor.GlobalAggregateManager;
|
import org.apache.flink.runtime.state.StateInitializationContext;
|
||||||
import org.apache.flink.streaming.api.functions.ProcessFunction;
|
import org.apache.flink.runtime.state.StateSnapshotContext;
|
||||||
import org.apache.flink.streaming.api.operators.StreamingRuntimeContext;
|
import org.apache.flink.streaming.api.operators.AbstractStreamOperator;
|
||||||
import org.apache.flink.util.Collector;
|
import org.apache.flink.streaming.api.operators.OneInputStreamOperator;
|
||||||
|
import org.apache.flink.streaming.runtime.streamrecord.StreamRecord;
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.Iterator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.concurrent.TimeUnit;
|
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
import static java.util.stream.Collectors.toList;
|
import static java.util.stream.Collectors.toList;
|
||||||
import static org.apache.hudi.util.StreamerUtil.isValidFile;
|
import static org.apache.hudi.util.StreamerUtil.isValidFile;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The function to load index from existing hoodieTable.
|
* The operator to load index from existing hoodieTable.
|
||||||
*
|
*
|
||||||
* <p>Each subtask of the function triggers the index bootstrap when the first element came in,
|
* <p>Each subtask of the function triggers the index bootstrap when the first element came in,
|
||||||
* the record cannot be sent until all the index records have been sent.
|
* the record cannot be sent until all the index records have been sent.
|
||||||
*
|
*
|
||||||
* <p>The output records should then shuffle by the recordKey and thus do scalable write.
|
* <p>The output records should then shuffle by the recordKey and thus do scalable write.
|
||||||
*/
|
*/
|
||||||
public class BootstrapFunction<I, O extends HoodieRecord>
|
public class BootstrapOperator<I, O extends HoodieRecord>
|
||||||
extends ProcessFunction<I, O> {
|
extends AbstractStreamOperator<O> implements OneInputStreamOperator<I, O> {
|
||||||
|
|
||||||
private static final Logger LOG = LoggerFactory.getLogger(BootstrapFunction.class);
|
private static final Logger LOG = LoggerFactory.getLogger(BootstrapOperator.class);
|
||||||
|
|
||||||
protected HoodieTable<?, ?, ?, ?> hoodieTable;
|
protected HoodieTable<?, ?, ?, ?> hoodieTable;
|
||||||
|
|
||||||
@@ -79,65 +89,62 @@ public class BootstrapFunction<I, O extends HoodieRecord>
|
|||||||
protected transient org.apache.hadoop.conf.Configuration hadoopConf;
|
protected transient org.apache.hadoop.conf.Configuration hadoopConf;
|
||||||
protected transient HoodieWriteConfig writeConfig;
|
protected transient HoodieWriteConfig writeConfig;
|
||||||
|
|
||||||
private GlobalAggregateManager aggregateManager;
|
private transient ListState<String> instantState;
|
||||||
|
|
||||||
private final Pattern pattern;
|
private final Pattern pattern;
|
||||||
private boolean alreadyBootstrap;
|
private String lastInstantTime;
|
||||||
|
private HoodieFlinkWriteClient writeClient;
|
||||||
|
private String actionType;
|
||||||
|
|
||||||
public BootstrapFunction(Configuration conf) {
|
public BootstrapOperator(Configuration conf) {
|
||||||
this.conf = conf;
|
this.conf = conf;
|
||||||
this.pattern = Pattern.compile(conf.getString(FlinkOptions.INDEX_PARTITION_REGEX));
|
this.pattern = Pattern.compile(conf.getString(FlinkOptions.INDEX_PARTITION_REGEX));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void open(Configuration parameters) throws Exception {
|
public void snapshotState(StateSnapshotContext context) throws Exception {
|
||||||
super.open(parameters);
|
lastInstantTime = this.writeClient.getLastPendingInstant(this.actionType);
|
||||||
|
instantState.update(Collections.singletonList(lastInstantTime));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void initializeState(StateInitializationContext context) throws Exception {
|
||||||
|
ListStateDescriptor<String> instantStateDescriptor = new ListStateDescriptor<>(
|
||||||
|
"instantStateDescriptor",
|
||||||
|
Types.STRING
|
||||||
|
);
|
||||||
|
instantState = context.getOperatorStateStore().getListState(instantStateDescriptor);
|
||||||
|
|
||||||
|
if (context.isRestored()) {
|
||||||
|
Iterator<String> instantIterator = instantState.get().iterator();
|
||||||
|
if (instantIterator.hasNext()) {
|
||||||
|
lastInstantTime = instantIterator.next();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
this.hadoopConf = StreamerUtil.getHadoopConf();
|
this.hadoopConf = StreamerUtil.getHadoopConf();
|
||||||
this.writeConfig = StreamerUtil.getHoodieClientConfig(this.conf);
|
this.writeConfig = StreamerUtil.getHoodieClientConfig(this.conf);
|
||||||
this.hoodieTable = getTable();
|
this.hoodieTable = getTable();
|
||||||
this.aggregateManager = ((StreamingRuntimeContext) getRuntimeContext()).getGlobalAggregateManager();
|
this.writeClient = StreamerUtil.createWriteClient(this.conf, getRuntimeContext());
|
||||||
|
this.actionType = CommitUtils.getCommitActionType(
|
||||||
|
WriteOperationType.fromValue(conf.getString(FlinkOptions.OPERATION)),
|
||||||
|
HoodieTableType.valueOf(conf.getString(FlinkOptions.TABLE_TYPE)));
|
||||||
|
|
||||||
|
String basePath = hoodieTable.getMetaClient().getBasePath();
|
||||||
|
int taskID = getRuntimeContext().getIndexOfThisSubtask();
|
||||||
|
LOG.info("Start loading records in table {} into the index state, taskId = {}", basePath, taskID);
|
||||||
|
for (String partitionPath : FSUtils.getAllFoldersWithPartitionMetaFile(FSUtils.getFs(basePath, hadoopConf), basePath)) {
|
||||||
|
if (pattern.matcher(partitionPath).matches()) {
|
||||||
|
loadRecords(partitionPath);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
LOG.info("Finish sending index records, taskId = {}.", getRuntimeContext().getIndexOfThisSubtask());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@SuppressWarnings("unchecked")
|
@SuppressWarnings("unchecked")
|
||||||
public void processElement(I value, Context ctx, Collector<O> out) throws Exception {
|
public void processElement(StreamRecord<I> element) throws Exception {
|
||||||
if (!alreadyBootstrap) {
|
output.collect((StreamRecord<O>) element);
|
||||||
String basePath = hoodieTable.getMetaClient().getBasePath();
|
|
||||||
int taskID = getRuntimeContext().getIndexOfThisSubtask();
|
|
||||||
LOG.info("Start loading records in table {} into the index state, taskId = {}", basePath, taskID);
|
|
||||||
for (String partitionPath : FSUtils.getAllFoldersWithPartitionMetaFile(FSUtils.getFs(basePath, hadoopConf), basePath)) {
|
|
||||||
if (pattern.matcher(partitionPath).matches()) {
|
|
||||||
loadRecords(partitionPath, out);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// wait for others bootstrap task send bootstrap complete.
|
|
||||||
waitForBootstrapReady(taskID);
|
|
||||||
|
|
||||||
alreadyBootstrap = true;
|
|
||||||
LOG.info("Finish sending index records, taskId = {}.", getRuntimeContext().getIndexOfThisSubtask());
|
|
||||||
}
|
|
||||||
|
|
||||||
// send the trigger record
|
|
||||||
out.collect((O) value);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Wait for other bootstrap tasks to finish the index bootstrap.
|
|
||||||
*/
|
|
||||||
private void waitForBootstrapReady(int taskID) {
|
|
||||||
int taskNum = getRuntimeContext().getNumberOfParallelSubtasks();
|
|
||||||
int readyTaskNum = 1;
|
|
||||||
while (taskNum != readyTaskNum) {
|
|
||||||
try {
|
|
||||||
readyTaskNum = aggregateManager.updateGlobalAggregate(BootstrapAggFunction.NAME, taskID, new BootstrapAggFunction());
|
|
||||||
LOG.info("Waiting for other bootstrap tasks to complete, taskId = {}.", taskID);
|
|
||||||
|
|
||||||
TimeUnit.SECONDS.sleep(5);
|
|
||||||
} catch (Exception e) {
|
|
||||||
LOG.warn("Update global task bootstrap summary error", e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private HoodieFlinkTable getTable() {
|
private HoodieFlinkTable getTable() {
|
||||||
@@ -153,7 +160,7 @@ public class BootstrapFunction<I, O extends HoodieRecord>
|
|||||||
* @param partitionPath The partition path
|
* @param partitionPath The partition path
|
||||||
*/
|
*/
|
||||||
@SuppressWarnings("unchecked")
|
@SuppressWarnings("unchecked")
|
||||||
protected void loadRecords(String partitionPath, Collector<O> out) throws Exception {
|
protected void loadRecords(String partitionPath) throws Exception {
|
||||||
long start = System.currentTimeMillis();
|
long start = System.currentTimeMillis();
|
||||||
|
|
||||||
BaseFileUtils fileUtils = BaseFileUtils.getInstance(this.hoodieTable.getBaseFileFormat());
|
BaseFileUtils fileUtils = BaseFileUtils.getInstance(this.hoodieTable.getBaseFileFormat());
|
||||||
@@ -163,8 +170,11 @@ public class BootstrapFunction<I, O extends HoodieRecord>
|
|||||||
final int maxParallelism = getRuntimeContext().getMaxNumberOfParallelSubtasks();
|
final int maxParallelism = getRuntimeContext().getMaxNumberOfParallelSubtasks();
|
||||||
final int taskID = getRuntimeContext().getIndexOfThisSubtask();
|
final int taskID = getRuntimeContext().getIndexOfThisSubtask();
|
||||||
|
|
||||||
Option<HoodieInstant> latestCommitTime = this.hoodieTable.getMetaClient().getCommitsTimeline()
|
HoodieTimeline commitsTimeline = this.hoodieTable.getMetaClient().getCommitsTimeline();
|
||||||
.filterCompletedInstants().lastInstant();
|
if (!StringUtils.isNullOrEmpty(lastInstantTime)) {
|
||||||
|
commitsTimeline = commitsTimeline.findInstantsAfter(lastInstantTime);
|
||||||
|
}
|
||||||
|
Option<HoodieInstant> latestCommitTime = commitsTimeline.filterCompletedInstants().lastInstant();
|
||||||
|
|
||||||
if (latestCommitTime.isPresent()) {
|
if (latestCommitTime.isPresent()) {
|
||||||
List<FileSlice> fileSlices = this.hoodieTable.getSliceView()
|
List<FileSlice> fileSlices = this.hoodieTable.getSliceView()
|
||||||
@@ -193,22 +203,22 @@ public class BootstrapFunction<I, O extends HoodieRecord>
|
|||||||
}
|
}
|
||||||
|
|
||||||
for (HoodieKey hoodieKey : hoodieKeys) {
|
for (HoodieKey hoodieKey : hoodieKeys) {
|
||||||
out.collect((O) new IndexRecord(generateHoodieRecord(hoodieKey, fileSlice)));
|
output.collect(new StreamRecord(new IndexRecord(generateHoodieRecord(hoodieKey, fileSlice))));
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
// load avro log records
|
// load avro log records
|
||||||
List<String> logPaths = fileSlice.getLogFiles()
|
List<String> logPaths = fileSlice.getLogFiles()
|
||||||
// filter out crushed files
|
// filter out crushed files
|
||||||
.filter(logFile -> isValidFile(logFile.getFileStatus()))
|
.filter(logFile -> isValidFile(logFile.getFileStatus()))
|
||||||
.map(logFile -> logFile.getPath().toString())
|
.map(logFile -> logFile.getPath().toString())
|
||||||
.collect(toList());
|
.collect(toList());
|
||||||
HoodieMergedLogRecordScanner scanner = FormatUtils.logScanner(logPaths, schema, latestCommitTime.get().getTimestamp(),
|
HoodieMergedLogRecordScanner scanner = FormatUtils.logScanner(logPaths, schema, latestCommitTime.get().getTimestamp(),
|
||||||
writeConfig, hadoopConf);
|
writeConfig, hadoopConf);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
for (String recordKey : scanner.getRecords().keySet()) {
|
for (String recordKey : scanner.getRecords().keySet()) {
|
||||||
out.collect((O) new IndexRecord(generateHoodieRecord(new HoodieKey(recordKey, partitionPath), fileSlice)));
|
output.collect(new StreamRecord(new IndexRecord(generateHoodieRecord(new HoodieKey(recordKey, partitionPath), fileSlice))));
|
||||||
}
|
}
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
throw new HoodieException(String.format("Error when loading record keys from files: %s", logPaths), e);
|
throw new HoodieException(String.format("Error when loading record keys from files: %s", logPaths), e);
|
||||||
@@ -241,7 +251,7 @@ public class BootstrapFunction<I, O extends HoodieRecord>
|
|||||||
}
|
}
|
||||||
|
|
||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
public boolean isAlreadyBootstrap() {
|
public boolean isAlreadyBootstrap() throws Exception {
|
||||||
return alreadyBootstrap;
|
return instantState.get().iterator().hasNext();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1,53 +0,0 @@
|
|||||||
/*
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one
|
|
||||||
* or more contributor license agreements. See the NOTICE file
|
|
||||||
* distributed with this work for additional information
|
|
||||||
* regarding copyright ownership. The ASF licenses this file
|
|
||||||
* to you under the Apache License, Version 2.0 (the
|
|
||||||
* "License"); you may not use this file except in compliance
|
|
||||||
* with the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.apache.hudi.sink.bootstrap.aggregate;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Bootstrap ready task id accumulator.
|
|
||||||
*/
|
|
||||||
public class BootstrapAccumulator implements Serializable {
|
|
||||||
private static final long serialVersionUID = 1L;
|
|
||||||
|
|
||||||
private final Set<Integer> readyTaskSet;
|
|
||||||
|
|
||||||
public BootstrapAccumulator() {
|
|
||||||
this.readyTaskSet = new HashSet<>();
|
|
||||||
}
|
|
||||||
|
|
||||||
public void update(int taskId) {
|
|
||||||
readyTaskSet.add(taskId);
|
|
||||||
}
|
|
||||||
|
|
||||||
public int readyTaskNum() {
|
|
||||||
return readyTaskSet.size();
|
|
||||||
}
|
|
||||||
|
|
||||||
public BootstrapAccumulator merge(BootstrapAccumulator acc) {
|
|
||||||
if (acc == null) {
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
readyTaskSet.addAll(acc.readyTaskSet);
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,50 +0,0 @@
|
|||||||
/*
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one
|
|
||||||
* or more contributor license agreements. See the NOTICE file
|
|
||||||
* distributed with this work for additional information
|
|
||||||
* regarding copyright ownership. The ASF licenses this file
|
|
||||||
* to you under the Apache License, Version 2.0 (the
|
|
||||||
* "License"); you may not use this file except in compliance
|
|
||||||
* with the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.apache.hudi.sink.bootstrap.aggregate;
|
|
||||||
|
|
||||||
import org.apache.flink.api.common.functions.AggregateFunction;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Aggregate Function that accumulates the loaded task number of
|
|
||||||
* function {@link org.apache.hudi.sink.bootstrap.BootstrapFunction}.
|
|
||||||
*/
|
|
||||||
public class BootstrapAggFunction implements AggregateFunction<Integer, BootstrapAccumulator, Integer> {
|
|
||||||
public static final String NAME = BootstrapAggFunction.class.getSimpleName();
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public BootstrapAccumulator createAccumulator() {
|
|
||||||
return new BootstrapAccumulator();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public BootstrapAccumulator add(Integer taskId, BootstrapAccumulator bootstrapAccumulator) {
|
|
||||||
bootstrapAccumulator.update(taskId);
|
|
||||||
return bootstrapAccumulator;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Integer getResult(BootstrapAccumulator bootstrapAccumulator) {
|
|
||||||
return bootstrapAccumulator.readyTaskNum();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public BootstrapAccumulator merge(BootstrapAccumulator bootstrapAccumulator, BootstrapAccumulator acc) {
|
|
||||||
return bootstrapAccumulator.merge(acc);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -19,17 +19,17 @@
|
|||||||
package org.apache.hudi.sink.bootstrap.batch;
|
package org.apache.hudi.sink.bootstrap.batch;
|
||||||
|
|
||||||
import org.apache.hudi.common.model.HoodieRecord;
|
import org.apache.hudi.common.model.HoodieRecord;
|
||||||
import org.apache.hudi.sink.bootstrap.BootstrapFunction;
|
import org.apache.hudi.sink.bootstrap.BootstrapOperator;
|
||||||
import org.apache.hudi.util.StreamerUtil;
|
import org.apache.hudi.util.StreamerUtil;
|
||||||
|
|
||||||
import org.apache.flink.configuration.Configuration;
|
import org.apache.flink.configuration.Configuration;
|
||||||
import org.apache.flink.util.Collector;
|
import org.apache.flink.streaming.runtime.streamrecord.StreamRecord;
|
||||||
|
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The function to load index from existing hoodieTable.
|
* The operator to load index from existing hoodieTable.
|
||||||
*
|
*
|
||||||
* <p>This function should only be used for bounded source.
|
* <p>This function should only be used for bounded source.
|
||||||
*
|
*
|
||||||
@@ -39,34 +39,30 @@ import java.util.Set;
|
|||||||
*
|
*
|
||||||
* <p>The input records should shuffle by the partition path to avoid repeated loading.
|
* <p>The input records should shuffle by the partition path to avoid repeated loading.
|
||||||
*/
|
*/
|
||||||
public class BatchBootstrapFunction<I, O extends HoodieRecord>
|
public class BatchBootstrapOperator<I, O extends HoodieRecord>
|
||||||
extends BootstrapFunction<I, O> {
|
extends BootstrapOperator<I, O> {
|
||||||
|
|
||||||
private Set<String> partitionPathSet;
|
private final Set<String> partitionPathSet;
|
||||||
private boolean haveSuccessfulCommits;
|
private final boolean haveSuccessfulCommits;
|
||||||
|
|
||||||
public BatchBootstrapFunction(Configuration conf) {
|
public BatchBootstrapOperator(Configuration conf) {
|
||||||
super(conf);
|
super(conf);
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void open(Configuration parameters) throws Exception {
|
|
||||||
super.open(parameters);
|
|
||||||
this.partitionPathSet = new HashSet<>();
|
this.partitionPathSet = new HashSet<>();
|
||||||
this.haveSuccessfulCommits = StreamerUtil.haveSuccessfulCommits(hoodieTable.getMetaClient());
|
this.haveSuccessfulCommits = StreamerUtil.haveSuccessfulCommits(hoodieTable.getMetaClient());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void processElement(I value, Context context, Collector<O> out) throws Exception {
|
@SuppressWarnings("unchecked")
|
||||||
final HoodieRecord record = (HoodieRecord<?>) value;
|
public void processElement(StreamRecord<I> element) throws Exception {
|
||||||
|
final HoodieRecord record = (HoodieRecord<?>) element.getValue();
|
||||||
final String partitionPath = record.getKey().getPartitionPath();
|
final String partitionPath = record.getKey().getPartitionPath();
|
||||||
|
|
||||||
if (haveSuccessfulCommits && !partitionPathSet.contains(partitionPath)) {
|
if (haveSuccessfulCommits && !partitionPathSet.contains(partitionPath)) {
|
||||||
loadRecords(partitionPath, out);
|
loadRecords(partitionPath);
|
||||||
partitionPathSet.add(partitionPath);
|
partitionPathSet.add(partitionPath);
|
||||||
}
|
}
|
||||||
|
|
||||||
// send the trigger record
|
// send the trigger record
|
||||||
out.collect((O) value);
|
output.collect((StreamRecord<O>) element);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -22,8 +22,8 @@ import org.apache.hudi.common.model.HoodieRecord;
|
|||||||
import org.apache.hudi.configuration.FlinkOptions;
|
import org.apache.hudi.configuration.FlinkOptions;
|
||||||
import org.apache.hudi.sink.CleanFunction;
|
import org.apache.hudi.sink.CleanFunction;
|
||||||
import org.apache.hudi.sink.StreamWriteOperatorFactory;
|
import org.apache.hudi.sink.StreamWriteOperatorFactory;
|
||||||
import org.apache.hudi.sink.bootstrap.BootstrapFunction;
|
import org.apache.hudi.sink.bootstrap.BootstrapOperator;
|
||||||
import org.apache.hudi.sink.bootstrap.batch.BatchBootstrapFunction;
|
import org.apache.hudi.sink.bootstrap.batch.BatchBootstrapOperator;
|
||||||
import org.apache.hudi.sink.bulk.BulkInsertWriteOperator;
|
import org.apache.hudi.sink.bulk.BulkInsertWriteOperator;
|
||||||
import org.apache.hudi.sink.bulk.RowDataKeyGen;
|
import org.apache.hudi.sink.bulk.RowDataKeyGen;
|
||||||
import org.apache.hudi.sink.bulk.sort.SortOperatorGen;
|
import org.apache.hudi.sink.bulk.sort.SortOperatorGen;
|
||||||
@@ -104,11 +104,11 @@ public class Pipelines {
|
|||||||
DataStream<HoodieRecord> dataStream1 = rowDataToHoodieRecord(conf, rowType, dataStream);
|
DataStream<HoodieRecord> dataStream1 = rowDataToHoodieRecord(conf, rowType, dataStream);
|
||||||
|
|
||||||
if (conf.getBoolean(FlinkOptions.INDEX_BOOTSTRAP_ENABLED)) {
|
if (conf.getBoolean(FlinkOptions.INDEX_BOOTSTRAP_ENABLED)) {
|
||||||
dataStream1 = dataStream1.rebalance()
|
dataStream1 = dataStream1
|
||||||
.transform(
|
.transform(
|
||||||
"index_bootstrap",
|
"index_bootstrap",
|
||||||
TypeInformation.of(HoodieRecord.class),
|
TypeInformation.of(HoodieRecord.class),
|
||||||
new ProcessOperator<>(new BootstrapFunction<>(conf)))
|
new BootstrapOperator<>(conf))
|
||||||
.setParallelism(conf.getOptional(FlinkOptions.INDEX_BOOTSTRAP_TASKS).orElse(defaultParallelism))
|
.setParallelism(conf.getOptional(FlinkOptions.INDEX_BOOTSTRAP_TASKS).orElse(defaultParallelism))
|
||||||
.uid("uid_index_bootstrap_" + conf.getString(FlinkOptions.TABLE_NAME));
|
.uid("uid_index_bootstrap_" + conf.getString(FlinkOptions.TABLE_NAME));
|
||||||
}
|
}
|
||||||
@@ -133,7 +133,7 @@ public class Pipelines {
|
|||||||
.transform(
|
.transform(
|
||||||
"batch_index_bootstrap",
|
"batch_index_bootstrap",
|
||||||
TypeInformation.of(HoodieRecord.class),
|
TypeInformation.of(HoodieRecord.class),
|
||||||
new ProcessOperator<>(new BatchBootstrapFunction<>(conf)))
|
new BatchBootstrapOperator<>(conf))
|
||||||
.setParallelism(conf.getOptional(FlinkOptions.INDEX_BOOTSTRAP_TASKS).orElse(defaultParallelism))
|
.setParallelism(conf.getOptional(FlinkOptions.INDEX_BOOTSTRAP_TASKS).orElse(defaultParallelism))
|
||||||
.uid("uid_batch_index_bootstrap_" + conf.getString(FlinkOptions.TABLE_NAME));
|
.uid("uid_batch_index_bootstrap_" + conf.getString(FlinkOptions.TABLE_NAME));
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ import org.apache.hudi.common.util.Option;
|
|||||||
import org.apache.hudi.configuration.FlinkOptions;
|
import org.apache.hudi.configuration.FlinkOptions;
|
||||||
import org.apache.hudi.sink.CleanFunction;
|
import org.apache.hudi.sink.CleanFunction;
|
||||||
import org.apache.hudi.sink.StreamWriteOperatorFactory;
|
import org.apache.hudi.sink.StreamWriteOperatorFactory;
|
||||||
import org.apache.hudi.sink.bootstrap.BootstrapFunction;
|
import org.apache.hudi.sink.bootstrap.BootstrapOperator;
|
||||||
import org.apache.hudi.sink.compact.CompactFunction;
|
import org.apache.hudi.sink.compact.CompactFunction;
|
||||||
import org.apache.hudi.sink.compact.CompactionCommitEvent;
|
import org.apache.hudi.sink.compact.CompactionCommitEvent;
|
||||||
import org.apache.hudi.sink.compact.CompactionCommitSink;
|
import org.apache.hudi.sink.compact.CompactionCommitSink;
|
||||||
@@ -114,11 +114,11 @@ public class HoodieFlinkStreamer {
|
|||||||
.map(RowDataToHoodieFunctions.create(rowType, conf), TypeInformation.of(HoodieRecord.class));
|
.map(RowDataToHoodieFunctions.create(rowType, conf), TypeInformation.of(HoodieRecord.class));
|
||||||
|
|
||||||
if (conf.getBoolean(FlinkOptions.INDEX_BOOTSTRAP_ENABLED)) {
|
if (conf.getBoolean(FlinkOptions.INDEX_BOOTSTRAP_ENABLED)) {
|
||||||
dataStream2 = dataStream2.rebalance()
|
dataStream2 = dataStream2
|
||||||
.transform(
|
.transform(
|
||||||
"index_bootstrap",
|
"index_bootstrap",
|
||||||
TypeInformation.of(HoodieRecord.class),
|
TypeInformation.of(HoodieRecord.class),
|
||||||
new ProcessOperator<>(new BootstrapFunction<>(conf)))
|
new BootstrapOperator<>(conf))
|
||||||
.setParallelism(conf.getOptional(FlinkOptions.INDEX_BOOTSTRAP_TASKS).orElse(parallelism))
|
.setParallelism(conf.getOptional(FlinkOptions.INDEX_BOOTSTRAP_TASKS).orElse(parallelism))
|
||||||
.uid("uid_index_bootstrap_" + conf.getString(FlinkOptions.TABLE_NAME));
|
.uid("uid_index_bootstrap_" + conf.getString(FlinkOptions.TABLE_NAME));
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
|||||||
import org.apache.hudi.common.util.CompactionUtils;
|
import org.apache.hudi.common.util.CompactionUtils;
|
||||||
import org.apache.hudi.common.util.Option;
|
import org.apache.hudi.common.util.Option;
|
||||||
import org.apache.hudi.configuration.FlinkOptions;
|
import org.apache.hudi.configuration.FlinkOptions;
|
||||||
import org.apache.hudi.sink.bootstrap.BootstrapFunction;
|
import org.apache.hudi.sink.bootstrap.BootstrapOperator;
|
||||||
import org.apache.hudi.sink.compact.CompactFunction;
|
import org.apache.hudi.sink.compact.CompactFunction;
|
||||||
import org.apache.hudi.sink.compact.CompactionCommitEvent;
|
import org.apache.hudi.sink.compact.CompactionCommitEvent;
|
||||||
import org.apache.hudi.sink.compact.CompactionCommitSink;
|
import org.apache.hudi.sink.compact.CompactionCommitSink;
|
||||||
@@ -277,7 +277,7 @@ public class StreamWriteITCase extends TestLogger {
|
|||||||
if (conf.getBoolean(FlinkOptions.INDEX_BOOTSTRAP_ENABLED)) {
|
if (conf.getBoolean(FlinkOptions.INDEX_BOOTSTRAP_ENABLED)) {
|
||||||
hoodieDataStream = hoodieDataStream.transform("index_bootstrap",
|
hoodieDataStream = hoodieDataStream.transform("index_bootstrap",
|
||||||
TypeInformation.of(HoodieRecord.class),
|
TypeInformation.of(HoodieRecord.class),
|
||||||
new ProcessOperator<>(new BootstrapFunction<>(conf)));
|
new BootstrapOperator<>(conf));
|
||||||
}
|
}
|
||||||
|
|
||||||
DataStream<Object> pipeline = hoodieDataStream
|
DataStream<Object> pipeline = hoodieDataStream
|
||||||
@@ -370,7 +370,7 @@ public class StreamWriteITCase extends TestLogger {
|
|||||||
if (conf.getBoolean(FlinkOptions.INDEX_BOOTSTRAP_ENABLED)) {
|
if (conf.getBoolean(FlinkOptions.INDEX_BOOTSTRAP_ENABLED)) {
|
||||||
hoodieDataStream = hoodieDataStream.transform("index_bootstrap",
|
hoodieDataStream = hoodieDataStream.transform("index_bootstrap",
|
||||||
TypeInformation.of(HoodieRecord.class),
|
TypeInformation.of(HoodieRecord.class),
|
||||||
new ProcessOperator<>(new BootstrapFunction<>(conf)));
|
new BootstrapOperator<>(conf));
|
||||||
}
|
}
|
||||||
|
|
||||||
DataStream<Object> pipeline = hoodieDataStream
|
DataStream<Object> pipeline = hoodieDataStream
|
||||||
|
|||||||
@@ -725,8 +725,6 @@ public class TestWriteCopyOnWrite {
|
|||||||
funcWrapper.invoke(rowData);
|
funcWrapper.invoke(rowData);
|
||||||
}
|
}
|
||||||
|
|
||||||
assertTrue(funcWrapper.isAlreadyBootstrap());
|
|
||||||
|
|
||||||
checkIndexLoaded(
|
checkIndexLoaded(
|
||||||
new HoodieKey("id1", "par1"),
|
new HoodieKey("id1", "par1"),
|
||||||
new HoodieKey("id2", "par1"),
|
new HoodieKey("id2", "par1"),
|
||||||
@@ -743,6 +741,8 @@ public class TestWriteCopyOnWrite {
|
|||||||
// this triggers the data write and event send
|
// this triggers the data write and event send
|
||||||
funcWrapper.checkpointFunction(1);
|
funcWrapper.checkpointFunction(1);
|
||||||
|
|
||||||
|
assertTrue(funcWrapper.isAlreadyBootstrap());
|
||||||
|
|
||||||
String instant = funcWrapper.getWriteClient()
|
String instant = funcWrapper.getWriteClient()
|
||||||
.getLastPendingInstant(getTableType());
|
.getLastPendingInstant(getTableType());
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,78 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hudi.sink.utils;
|
||||||
|
|
||||||
|
import org.apache.flink.streaming.api.operators.Output;
|
||||||
|
import org.apache.flink.streaming.api.watermark.Watermark;
|
||||||
|
import org.apache.flink.streaming.runtime.streamrecord.LatencyMarker;
|
||||||
|
import org.apache.flink.streaming.runtime.streamrecord.StreamElement;
|
||||||
|
import org.apache.flink.streaming.runtime.streamrecord.StreamRecord;
|
||||||
|
import org.apache.flink.util.InstantiationUtil;
|
||||||
|
import org.apache.flink.util.OutputTag;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Collecting {@link Output} for {@link StreamRecord}.
|
||||||
|
*/
|
||||||
|
public class CollectorOutput<T> implements Output<StreamRecord<T>> {
|
||||||
|
|
||||||
|
private final List<StreamElement> list;
|
||||||
|
|
||||||
|
public CollectorOutput(List<StreamElement> list) {
|
||||||
|
this.list = list;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<StreamElement> getList() {
|
||||||
|
return list;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void emitWatermark(Watermark mark) {
|
||||||
|
list.add(mark);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void emitLatencyMarker(LatencyMarker latencyMarker) {
|
||||||
|
list.add(latencyMarker);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void collect(StreamRecord<T> record) {
|
||||||
|
try {
|
||||||
|
ClassLoader cl = record.getClass().getClassLoader();
|
||||||
|
T copied =
|
||||||
|
InstantiationUtil.deserializeObject(
|
||||||
|
InstantiationUtil.serializeObject(record.getValue()), cl);
|
||||||
|
list.add(record.copy(copied));
|
||||||
|
} catch (IOException | ClassNotFoundException ex) {
|
||||||
|
throw new RuntimeException("Unable to deserialize record: " + record, ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public <X> void collect(OutputTag<X> outputTag, StreamRecord<X> record) {
|
||||||
|
throw new UnsupportedOperationException("Side output not supported for CollectorOutput");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void close() {
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -52,7 +52,6 @@ public class CompactFunctionWrapper {
|
|||||||
|
|
||||||
private final IOManager ioManager;
|
private final IOManager ioManager;
|
||||||
private final StreamingRuntimeContext runtimeContext;
|
private final StreamingRuntimeContext runtimeContext;
|
||||||
private final MockFunctionInitializationContext functionInitializationContext;
|
|
||||||
|
|
||||||
/** Function that generates the {@link HoodieCompactionPlan}. */
|
/** Function that generates the {@link HoodieCompactionPlan}. */
|
||||||
private CompactionPlanOperator compactionPlanOperator;
|
private CompactionPlanOperator compactionPlanOperator;
|
||||||
@@ -70,7 +69,6 @@ public class CompactFunctionWrapper {
|
|||||||
.build();
|
.build();
|
||||||
this.runtimeContext = new MockStreamingRuntimeContext(false, 1, 0, environment);
|
this.runtimeContext = new MockStreamingRuntimeContext(false, 1, 0, environment);
|
||||||
this.conf = conf;
|
this.conf = conf;
|
||||||
this.functionInitializationContext = new MockFunctionInitializationContext();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void openFunction() throws Exception {
|
public void openFunction() throws Exception {
|
||||||
|
|||||||
@@ -19,15 +19,18 @@ package org.apache.hudi.sink.utils;
|
|||||||
|
|
||||||
import org.apache.flink.api.common.state.KeyedStateStore;
|
import org.apache.flink.api.common.state.KeyedStateStore;
|
||||||
import org.apache.flink.runtime.state.FunctionInitializationContext;
|
import org.apache.flink.runtime.state.FunctionInitializationContext;
|
||||||
|
import org.apache.flink.runtime.state.KeyGroupStatePartitionStreamProvider;
|
||||||
|
import org.apache.flink.runtime.state.StateInitializationContext;
|
||||||
|
import org.apache.flink.runtime.state.StatePartitionStreamProvider;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A {@link FunctionInitializationContext} for testing purpose.
|
* A {@link FunctionInitializationContext} for testing purpose.
|
||||||
*/
|
*/
|
||||||
public class MockFunctionInitializationContext implements FunctionInitializationContext {
|
public class MockStateInitializationContext implements StateInitializationContext {
|
||||||
|
|
||||||
private final MockOperatorStateStore operatorStateStore;
|
private final MockOperatorStateStore operatorStateStore;
|
||||||
|
|
||||||
public MockFunctionInitializationContext() {
|
public MockStateInitializationContext() {
|
||||||
operatorStateStore = new MockOperatorStateStore();
|
operatorStateStore = new MockOperatorStateStore();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -45,4 +48,14 @@ public class MockFunctionInitializationContext implements FunctionInitialization
|
|||||||
public KeyedStateStore getKeyedStateStore() {
|
public KeyedStateStore getKeyedStateStore() {
|
||||||
return operatorStateStore;
|
return operatorStateStore;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Iterable<StatePartitionStreamProvider> getRawOperatorStateInputs() {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Iterable<KeyGroupStatePartitionStreamProvider> getRawKeyedStateInputs() {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
@@ -25,8 +25,7 @@ import org.apache.hudi.configuration.FlinkOptions;
|
|||||||
import org.apache.hudi.exception.HoodieException;
|
import org.apache.hudi.exception.HoodieException;
|
||||||
import org.apache.hudi.sink.StreamWriteFunction;
|
import org.apache.hudi.sink.StreamWriteFunction;
|
||||||
import org.apache.hudi.sink.StreamWriteOperatorCoordinator;
|
import org.apache.hudi.sink.StreamWriteOperatorCoordinator;
|
||||||
import org.apache.hudi.sink.bootstrap.BootstrapFunction;
|
import org.apache.hudi.sink.bootstrap.BootstrapOperator;
|
||||||
import org.apache.hudi.sink.bootstrap.IndexRecord;
|
|
||||||
import org.apache.hudi.sink.event.WriteMetadataEvent;
|
import org.apache.hudi.sink.event.WriteMetadataEvent;
|
||||||
import org.apache.hudi.sink.partitioner.BucketAssignFunction;
|
import org.apache.hudi.sink.partitioner.BucketAssignFunction;
|
||||||
import org.apache.hudi.sink.partitioner.BucketAssignOperator;
|
import org.apache.hudi.sink.partitioner.BucketAssignOperator;
|
||||||
@@ -34,6 +33,7 @@ import org.apache.hudi.sink.transform.RowDataToHoodieFunction;
|
|||||||
import org.apache.hudi.util.StreamerUtil;
|
import org.apache.hudi.util.StreamerUtil;
|
||||||
import org.apache.hudi.utils.TestConfigurations;
|
import org.apache.hudi.utils.TestConfigurations;
|
||||||
|
|
||||||
|
import org.apache.flink.api.common.ExecutionConfig;
|
||||||
import org.apache.flink.configuration.Configuration;
|
import org.apache.flink.configuration.Configuration;
|
||||||
import org.apache.flink.runtime.io.disk.iomanager.IOManager;
|
import org.apache.flink.runtime.io.disk.iomanager.IOManager;
|
||||||
import org.apache.flink.runtime.io.disk.iomanager.IOManagerAsync;
|
import org.apache.flink.runtime.io.disk.iomanager.IOManagerAsync;
|
||||||
@@ -43,8 +43,14 @@ import org.apache.flink.runtime.operators.coordination.MockOperatorCoordinatorCo
|
|||||||
import org.apache.flink.runtime.operators.coordination.OperatorEvent;
|
import org.apache.flink.runtime.operators.coordination.OperatorEvent;
|
||||||
import org.apache.flink.runtime.operators.testutils.MockEnvironment;
|
import org.apache.flink.runtime.operators.testutils.MockEnvironment;
|
||||||
import org.apache.flink.runtime.operators.testutils.MockEnvironmentBuilder;
|
import org.apache.flink.runtime.operators.testutils.MockEnvironmentBuilder;
|
||||||
|
import org.apache.flink.streaming.api.graph.StreamConfig;
|
||||||
|
import org.apache.flink.streaming.api.operators.Output;
|
||||||
import org.apache.flink.streaming.api.operators.StreamingRuntimeContext;
|
import org.apache.flink.streaming.api.operators.StreamingRuntimeContext;
|
||||||
import org.apache.flink.streaming.api.operators.collect.utils.MockOperatorEventGateway;
|
import org.apache.flink.streaming.api.operators.collect.utils.MockOperatorEventGateway;
|
||||||
|
import org.apache.flink.streaming.runtime.streamrecord.StreamElement;
|
||||||
|
import org.apache.flink.streaming.runtime.streamrecord.StreamRecord;
|
||||||
|
import org.apache.flink.streaming.util.MockStreamTask;
|
||||||
|
import org.apache.flink.streaming.util.MockStreamTaskBuilder;
|
||||||
import org.apache.flink.table.data.RowData;
|
import org.apache.flink.table.data.RowData;
|
||||||
import org.apache.flink.util.Collector;
|
import org.apache.flink.util.Collector;
|
||||||
|
|
||||||
@@ -68,7 +74,7 @@ public class StreamWriteFunctionWrapper<I> {
|
|||||||
private final MockOperatorEventGateway gateway;
|
private final MockOperatorEventGateway gateway;
|
||||||
private final MockOperatorCoordinatorContext coordinatorContext;
|
private final MockOperatorCoordinatorContext coordinatorContext;
|
||||||
private final StreamWriteOperatorCoordinator coordinator;
|
private final StreamWriteOperatorCoordinator coordinator;
|
||||||
private final MockFunctionInitializationContext functionInitializationContext;
|
private final MockStateInitializationContext stateInitializationContext;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Function that converts row data to HoodieRecord.
|
* Function that converts row data to HoodieRecord.
|
||||||
@@ -77,7 +83,7 @@ public class StreamWriteFunctionWrapper<I> {
|
|||||||
/**
|
/**
|
||||||
* Function that load index in state.
|
* Function that load index in state.
|
||||||
*/
|
*/
|
||||||
private BootstrapFunction<HoodieRecord<?>, HoodieRecord<?>> bootstrapFunction;
|
private BootstrapOperator<HoodieRecord<?>, HoodieRecord<?>> bootstrapOperator;
|
||||||
/**
|
/**
|
||||||
* Function that assigns bucket ID.
|
* Function that assigns bucket ID.
|
||||||
*/
|
*/
|
||||||
@@ -93,6 +99,12 @@ public class StreamWriteFunctionWrapper<I> {
|
|||||||
|
|
||||||
private CompactFunctionWrapper compactFunctionWrapper;
|
private CompactFunctionWrapper compactFunctionWrapper;
|
||||||
|
|
||||||
|
private final Output<StreamRecord<HoodieRecord<?>>> output;
|
||||||
|
|
||||||
|
private final MockStreamTask streamTask;
|
||||||
|
|
||||||
|
private final StreamConfig streamConfig;
|
||||||
|
|
||||||
private final boolean asyncCompaction;
|
private final boolean asyncCompaction;
|
||||||
|
|
||||||
public StreamWriteFunctionWrapper(String tablePath) throws Exception {
|
public StreamWriteFunctionWrapper(String tablePath) throws Exception {
|
||||||
@@ -114,10 +126,17 @@ public class StreamWriteFunctionWrapper<I> {
|
|||||||
this.coordinator = new StreamWriteOperatorCoordinator(conf, this.coordinatorContext);
|
this.coordinator = new StreamWriteOperatorCoordinator(conf, this.coordinatorContext);
|
||||||
this.compactFunctionWrapper = new CompactFunctionWrapper(this.conf);
|
this.compactFunctionWrapper = new CompactFunctionWrapper(this.conf);
|
||||||
this.bucketAssignOperatorContext = new MockBucketAssignOperatorContext();
|
this.bucketAssignOperatorContext = new MockBucketAssignOperatorContext();
|
||||||
this.functionInitializationContext = new MockFunctionInitializationContext();
|
this.stateInitializationContext = new MockStateInitializationContext();
|
||||||
this.compactFunctionWrapper = new CompactFunctionWrapper(this.conf);
|
this.compactFunctionWrapper = new CompactFunctionWrapper(this.conf);
|
||||||
this.asyncCompaction = StreamerUtil.needsAsyncCompaction(conf);
|
this.asyncCompaction = StreamerUtil.needsAsyncCompaction(conf);
|
||||||
this.bucketAssignOperatorContext = new MockBucketAssignOperatorContext();
|
this.bucketAssignOperatorContext = new MockBucketAssignOperatorContext();
|
||||||
|
this.output = new CollectorOutput<>(new ArrayList<>());
|
||||||
|
this.streamConfig = new StreamConfig(conf);
|
||||||
|
streamConfig.setOperatorID(new OperatorID());
|
||||||
|
this.streamTask = new MockStreamTaskBuilder(environment)
|
||||||
|
.setConfig(new StreamConfig(conf))
|
||||||
|
.setExecutionConfig(new ExecutionConfig().enableObjectReuse())
|
||||||
|
.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
public void openFunction() throws Exception {
|
public void openFunction() throws Exception {
|
||||||
@@ -128,16 +147,16 @@ public class StreamWriteFunctionWrapper<I> {
|
|||||||
toHoodieFunction.open(conf);
|
toHoodieFunction.open(conf);
|
||||||
|
|
||||||
if (conf.getBoolean(FlinkOptions.INDEX_BOOTSTRAP_ENABLED)) {
|
if (conf.getBoolean(FlinkOptions.INDEX_BOOTSTRAP_ENABLED)) {
|
||||||
bootstrapFunction = new BootstrapFunction<>(conf);
|
bootstrapOperator = new BootstrapOperator<>(conf);
|
||||||
bootstrapFunction.setRuntimeContext(runtimeContext);
|
bootstrapOperator.setup(streamTask, streamConfig, output);
|
||||||
bootstrapFunction.open(conf);
|
bootstrapOperator.initializeState(this.stateInitializationContext);
|
||||||
}
|
}
|
||||||
|
|
||||||
bucketAssignerFunction = new BucketAssignFunction<>(conf);
|
bucketAssignerFunction = new BucketAssignFunction<>(conf);
|
||||||
bucketAssignerFunction.setRuntimeContext(runtimeContext);
|
bucketAssignerFunction.setRuntimeContext(runtimeContext);
|
||||||
bucketAssignerFunction.open(conf);
|
bucketAssignerFunction.open(conf);
|
||||||
bucketAssignerFunction.setContext(bucketAssignOperatorContext);
|
bucketAssignerFunction.setContext(bucketAssignOperatorContext);
|
||||||
bucketAssignerFunction.initializeState(this.functionInitializationContext);
|
bucketAssignerFunction.initializeState(this.stateInitializationContext);
|
||||||
|
|
||||||
setupWriteFunction();
|
setupWriteFunction();
|
||||||
|
|
||||||
@@ -146,6 +165,7 @@ public class StreamWriteFunctionWrapper<I> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
public void invoke(I record) throws Exception {
|
public void invoke(I record) throws Exception {
|
||||||
HoodieRecord<?> hoodieRecord = toHoodieFunction.map((RowData) record);
|
HoodieRecord<?> hoodieRecord = toHoodieFunction.map((RowData) record);
|
||||||
HoodieRecord<?>[] hoodieRecords = new HoodieRecord[1];
|
HoodieRecord<?>[] hoodieRecords = new HoodieRecord[1];
|
||||||
@@ -162,27 +182,16 @@ public class StreamWriteFunctionWrapper<I> {
|
|||||||
};
|
};
|
||||||
|
|
||||||
if (conf.getBoolean(FlinkOptions.INDEX_BOOTSTRAP_ENABLED)) {
|
if (conf.getBoolean(FlinkOptions.INDEX_BOOTSTRAP_ENABLED)) {
|
||||||
List<HoodieRecord<?>> bootstrapRecords = new ArrayList<>();
|
List<StreamElement> list = ((CollectorOutput) output).getList();
|
||||||
|
for (StreamElement streamElement : list) {
|
||||||
Collector<HoodieRecord<?>> bootstrapCollector = new Collector<HoodieRecord<?>>() {
|
if (streamElement.isRecord()) {
|
||||||
@Override
|
HoodieRecord<?> bootstrapRecord = (HoodieRecord<?>) streamElement.asRecord().getValue();
|
||||||
public void collect(HoodieRecord<?> record) {
|
bucketAssignerFunction.processElement(bootstrapRecord, null, collector);
|
||||||
if (record instanceof IndexRecord) {
|
|
||||||
bootstrapRecords.add(record);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public void close() {
|
|
||||||
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
bootstrapFunction.processElement(hoodieRecord, null, bootstrapCollector);
|
|
||||||
for (HoodieRecord bootstrapRecord : bootstrapRecords) {
|
|
||||||
bucketAssignerFunction.processElement(bootstrapRecord, null, collector);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bootstrapOperator.processElement(new StreamRecord<>(hoodieRecord));
|
||||||
|
list.clear();
|
||||||
this.bucketAssignOperatorContext.setCurrentKey(hoodieRecord.getRecordKey());
|
this.bucketAssignOperatorContext.setCurrentKey(hoodieRecord.getRecordKey());
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -210,14 +219,17 @@ public class StreamWriteFunctionWrapper<I> {
|
|||||||
public void checkpointFunction(long checkpointId) throws Exception {
|
public void checkpointFunction(long checkpointId) throws Exception {
|
||||||
// checkpoint the coordinator first
|
// checkpoint the coordinator first
|
||||||
this.coordinator.checkpointCoordinator(checkpointId, new CompletableFuture<>());
|
this.coordinator.checkpointCoordinator(checkpointId, new CompletableFuture<>());
|
||||||
|
if (conf.getBoolean(FlinkOptions.INDEX_BOOTSTRAP_ENABLED)) {
|
||||||
|
bootstrapOperator.snapshotState(null);
|
||||||
|
}
|
||||||
bucketAssignerFunction.snapshotState(null);
|
bucketAssignerFunction.snapshotState(null);
|
||||||
|
|
||||||
writeFunction.snapshotState(null);
|
writeFunction.snapshotState(null);
|
||||||
functionInitializationContext.getOperatorStateStore().checkpointBegin(checkpointId);
|
stateInitializationContext.getOperatorStateStore().checkpointBegin(checkpointId);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void checkpointComplete(long checkpointId) {
|
public void checkpointComplete(long checkpointId) {
|
||||||
functionInitializationContext.getOperatorStateStore().checkpointSuccess(checkpointId);
|
stateInitializationContext.getOperatorStateStore().checkpointSuccess(checkpointId);
|
||||||
coordinator.notifyCheckpointComplete(checkpointId);
|
coordinator.notifyCheckpointComplete(checkpointId);
|
||||||
this.bucketAssignerFunction.notifyCheckpointComplete(checkpointId);
|
this.bucketAssignerFunction.notifyCheckpointComplete(checkpointId);
|
||||||
if (asyncCompaction) {
|
if (asyncCompaction) {
|
||||||
@@ -264,8 +276,8 @@ public class StreamWriteFunctionWrapper<I> {
|
|||||||
return this.writeFunction.isConfirming();
|
return this.writeFunction.isConfirming();
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean isAlreadyBootstrap() {
|
public boolean isAlreadyBootstrap() throws Exception {
|
||||||
return this.bootstrapFunction.isAlreadyBootstrap();
|
return this.bootstrapOperator.isAlreadyBootstrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
@@ -276,7 +288,7 @@ public class StreamWriteFunctionWrapper<I> {
|
|||||||
writeFunction = new StreamWriteFunction<>(conf);
|
writeFunction = new StreamWriteFunction<>(conf);
|
||||||
writeFunction.setRuntimeContext(runtimeContext);
|
writeFunction.setRuntimeContext(runtimeContext);
|
||||||
writeFunction.setOperatorEventGateway(gateway);
|
writeFunction.setOperatorEventGateway(gateway);
|
||||||
writeFunction.initializeState(this.functionInitializationContext);
|
writeFunction.initializeState(this.stateInitializationContext);
|
||||||
writeFunction.open(conf);
|
writeFunction.open(conf);
|
||||||
|
|
||||||
// handle the bootstrap event
|
// handle the bootstrap event
|
||||||
|
|||||||
Reference in New Issue
Block a user