@@ -18,22 +18,6 @@
 package org.apache.hudi.sink.bootstrap;

-import org.apache.flink.annotation.VisibleForTesting;
-import org.apache.flink.api.common.state.CheckpointListener;
-import org.apache.flink.api.common.state.ListState;
-import org.apache.flink.api.common.state.ListStateDescriptor;
-import org.apache.flink.api.common.typeinfo.TypeHint;
-import org.apache.flink.api.common.typeinfo.TypeInformation;
-import org.apache.flink.configuration.Configuration;
-import org.apache.flink.runtime.state.FunctionInitializationContext;
-import org.apache.flink.runtime.state.FunctionSnapshotContext;
-import org.apache.flink.runtime.state.KeyGroupRangeAssignment;
-import org.apache.flink.runtime.taskexecutor.GlobalAggregateManager;
-import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction;
-import org.apache.flink.streaming.api.functions.ProcessFunction;
-import org.apache.flink.streaming.api.operators.StreamingRuntimeContext;
-import org.apache.flink.util.Collector;
-import org.apache.hadoop.fs.Path;
 import org.apache.hudi.client.FlinkTaskContextSupplier;
 import org.apache.hudi.client.common.HoodieFlinkEngineContext;
 import org.apache.hudi.common.config.SerializableConfiguration;
@@ -47,11 +31,26 @@ import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.hudi.configuration.FlinkOptions;
 import org.apache.hudi.exception.HoodieException;
 import org.apache.hudi.index.HoodieIndexUtils;
-import org.apache.hudi.sink.bootstrap.aggregate.BootstrapAggFunc;
+import org.apache.hudi.sink.bootstrap.aggregate.BootstrapAggFunction;
 import org.apache.hudi.table.HoodieFlinkTable;
 import org.apache.hudi.table.HoodieTable;
 import org.apache.hudi.util.StreamerUtil;

+import org.apache.flink.annotation.VisibleForTesting;
+import org.apache.flink.api.common.state.CheckpointListener;
+import org.apache.flink.api.common.state.ListState;
+import org.apache.flink.api.common.state.ListStateDescriptor;
+import org.apache.flink.api.common.typeinfo.Types;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.runtime.state.FunctionInitializationContext;
+import org.apache.flink.runtime.state.FunctionSnapshotContext;
+import org.apache.flink.runtime.state.KeyGroupRangeAssignment;
+import org.apache.flink.runtime.taskexecutor.GlobalAggregateManager;
+import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction;
+import org.apache.flink.streaming.api.functions.ProcessFunction;
+import org.apache.flink.streaming.api.operators.StreamingRuntimeContext;
+import org.apache.flink.util.Collector;
+import org.apache.hadoop.fs.Path;

 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -61,18 +60,16 @@ import java.util.concurrent.TimeUnit;
 import java.util.regex.Pattern;

 /**
- * The function to load index from exists hoodieTable.
+ * The function to load the index from the existing hoodieTable.
  *
- * <p>Each subtask in bootstrapFunction triggers the bootstrap index with the first element,
- * Received record cannot be sent until the index is all sent.
+ * <p>Each subtask of the function triggers the index bootstrap when the first element comes in;
+ * the record itself is not sent until all the index records have been sent.
  *
  * <p>The output records should then be shuffled by the record key to achieve scalable writes.
  *
  * @see BootstrapFunction
  */
 public class BootstrapFunction<I, O extends HoodieRecord>
-        extends ProcessFunction<I, O>
-        implements CheckpointedFunction, CheckpointListener {
+    extends ProcessFunction<I, O>
+    implements CheckpointedFunction, CheckpointListener {

   private static final Logger LOG = LoggerFactory.getLogger(BootstrapFunction.class);
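For orientation, a minimal sketch of how a function like this is typically wired into a Flink pipeline, assuming an upstream `DataStream<HoodieRecord>` and a `Configuration` carrying the Hudi options; the operator name and the constructor-from-Configuration are illustrative assumptions, not code shown by this patch:

```java
// Editor's sketch (not part of the patch). Assumes an upstream
// DataStream<HoodieRecord> named `records` and a Flink Configuration `conf`
// with the Hudi options set; operator names are illustrative.
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.sink.bootstrap.BootstrapFunction;

public class BootstrapWiringSketch {
  public static DataStream<HoodieRecord> wire(DataStream<HoodieRecord> records, Configuration conf) {
    return records
        // emits all index records of the existing table first, then the data
        .process(new BootstrapFunction<>(conf))
        .returns(TypeInformation.of(HoodieRecord.class))
        .name("index_bootstrap")
        // shuffle by record key so the downstream write scales
        .keyBy(HoodieRecord::getRecordKey);
  }
}
```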
@@ -95,12 +92,8 @@ public class BootstrapFunction<I, O extends HoodieRecord>

   @Override
   public void initializeState(FunctionInitializationContext context) throws Exception {
-    this.bootstrapState = context.getOperatorStateStore().getListState(
-        new ListStateDescriptor<>(
-            "bootstrap-state",
-            TypeInformation.of(new TypeHint<Boolean>() {})
-        )
-    );
+    this.bootstrapState = context.getOperatorStateStore()
+        .getListState(new ListStateDescriptor<>("bootstrap-state", Types.BOOLEAN));

     if (context.isRestored()) {
       LOG.info("Restoring state for the {}.", getClass().getSimpleName());
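The rewritten descriptor is behavior-preserving: `Types.BOOLEAN` from `org.apache.flink.api.common.typeinfo.Types` is the predefined `TypeInformation<Boolean>` instance, so both forms declare the same state. A small sketch of the equivalence:

```java
// Both descriptors declare the same Boolean list state; Types.BOOLEAN is the
// predefined TypeInformation<Boolean> constant.
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeinfo.Types;

public class StateDescriptorSketch {
  static final ListStateDescriptor<Boolean> VERBOSE =
      new ListStateDescriptor<>("bootstrap-state",
          TypeInformation.of(new TypeHint<Boolean>() {}));
  static final ListStateDescriptor<Boolean> SHORTHAND =
      new ListStateDescriptor<>("bootstrap-state", Types.BOOLEAN);
}
```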
@@ -123,8 +116,9 @@ public class BootstrapFunction<I, O extends HoodieRecord>
   @SuppressWarnings("unchecked")
   public void processElement(I value, Context ctx, Collector<O> out) throws IOException {
     if (!alreadyBootstrap) {
-      LOG.info("Start loading records in table {} into the index state, taskId = {}", conf.getString(FlinkOptions.PATH), getRuntimeContext().getIndexOfThisSubtask());
       String basePath = hoodieTable.getMetaClient().getBasePath();
+      int taskID = getRuntimeContext().getIndexOfThisSubtask();
+      LOG.info("Start loading records in table {} into the index state, taskId = {}", basePath, taskID);
       for (String partitionPath : FSUtils.getAllFoldersWithPartitionMetaFile(FSUtils.getFs(basePath, hadoopConf), basePath)) {
         if (pattern.matcher(partitionPath).matches()) {
           loadRecords(partitionPath, out);
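The `pattern` checked above restricts which partitions get bootstrapped. A sketch of how such a filter is plausibly initialized; the option name `FlinkOptions.INDEX_PARTITION_REGEX` is an assumption based on the surrounding codebase, not something this hunk shows:

```java
// Editor's sketch: compile the partition filter once, e.g. in open().
// The exact config option is an assumption; the patch elides this code.
private transient Pattern pattern;

@Override
public void open(Configuration parameters) {
  this.pattern = Pattern.compile(this.conf.getString(FlinkOptions.INDEX_PARTITION_REGEX));
}
```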
@@ -132,30 +126,30 @@ public class BootstrapFunction<I, O extends HoodieRecord>
         }
       }

       // wait for the other bootstrap tasks to finish the index bootstrap.
-      updateAndWaiting();
+      waitForBootstrapReady(taskID);

       alreadyBootstrap = true;
-      LOG.info("Finish send index to BucketAssign, taskId = {}.", getRuntimeContext().getIndexOfThisSubtask());
+      LOG.info("Finish sending index records, taskId = {}.", getRuntimeContext().getIndexOfThisSubtask());
     }

-    // send data to next operator
+    // send the trigger record
     out.collect((O) value);
   }

   /**
-   * Wait for other bootstrap task send bootstrap complete.
+   * Wait for the other bootstrap tasks to finish the index bootstrap.
    */
-  private void updateAndWaiting() {
+  private void waitForBootstrapReady(int taskID) {
     int taskNum = getRuntimeContext().getNumberOfParallelSubtasks();
     int readyTaskNum = 1;
     while (taskNum != readyTaskNum) {
       try {
-        readyTaskNum = aggregateManager.updateGlobalAggregate(BootstrapAggFunc.NAME, getRuntimeContext().getIndexOfThisSubtask(), new BootstrapAggFunc());
-        LOG.info("Waiting for others bootstrap task complete, taskId = {}.", getRuntimeContext().getIndexOfThisSubtask());
+        readyTaskNum = aggregateManager.updateGlobalAggregate(BootstrapAggFunction.NAME, taskID, new BootstrapAggFunction());
+        LOG.info("Waiting for other bootstrap tasks to complete, taskId = {}.", taskID);

         TimeUnit.SECONDS.sleep(5);
       } catch (Exception e) {
-        LOG.warn("update global aggregate error", e);
+        LOG.warn("Update global task bootstrap summary error", e);
       }
     }
   }
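The loop above converges once every subtask has reported in through Flink's job-wide aggregate. A plausible reconstruction of what the renamed `BootstrapAggFunction` does: accumulate the distinct ready task ids and report the count. This is an editor's sketch, not the class from the patch:

```java
// Editor's sketch of the coordination primitive used above: a global
// aggregate that collects the subtask ids that have finished bootstrapping
// and returns how many are ready. A plausible reconstruction of
// BootstrapAggFunction, not the class shipped with the patch.
import java.util.HashSet;
import java.util.Set;
import org.apache.flink.api.common.functions.AggregateFunction;

public class BootstrapAggFunctionSketch
    implements AggregateFunction<Integer, Set<Integer>, Integer> {

  public static final String NAME = "bootstrapAggFunction";

  @Override
  public Set<Integer> createAccumulator() {
    return new HashSet<>();
  }

  @Override
  public Set<Integer> add(Integer taskId, Set<Integer> readyTasks) {
    readyTasks.add(taskId);
    return readyTasks;
  }

  @Override
  public Integer getResult(Set<Integer> readyTasks) {
    return readyTasks.size();
  }

  @Override
  public Set<Integer> merge(Set<Integer> a, Set<Integer> b) {
    a.addAll(b);
    return a;
  }
}
```

Polling with a fixed sleep keeps the protocol simple: every subtask keeps re-submitting its own id until the reported count reaches the parallelism, so all subtasks unblock within one polling interval of the last arrival.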
@@ -178,7 +172,7 @@ public class BootstrapFunction<I, O extends HoodieRecord>
     long start = System.currentTimeMillis();
     BaseFileUtils fileUtils = BaseFileUtils.getInstance(this.hoodieTable.getBaseFileFormat());
     List<HoodieBaseFile> latestBaseFiles =
-            HoodieIndexUtils.getLatestBaseFilesForPartition(partitionPath, this.hoodieTable);
+        HoodieIndexUtils.getLatestBaseFilesForPartition(partitionPath, this.hoodieTable);
     LOG.info("All base files in partition {}, size = {}", partitionPath, latestBaseFiles.size());

     final int parallelism = getRuntimeContext().getNumberOfParallelSubtasks();
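With `parallelism` in hand, each subtask loads only the base files that hash to it, so the index is spread across subtasks instead of duplicated. The hunk elides that check; a sketch of one plausible implementation using the imported `KeyGroupRangeAssignment` (the helper name and the use of the file id are assumptions):

```java
// Editor's sketch: route a base file to exactly one subtask by hashing its
// file id the same way Flink assigns keys to parallel operators. Helper name
// is illustrative; the patch elides the corresponding code.
private static boolean shouldLoadFile(String fileId, int maxParallelism,
                                      int parallelism, int taskID) {
  return KeyGroupRangeAssignment.assignKeyToParallelOperator(
      fileId, maxParallelism, parallelism) == taskID;
}
```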
@@ -193,20 +187,20 @@ public class BootstrapFunction<I, O extends HoodieRecord>
         final List<HoodieKey> hoodieKeys;
         try {
           hoodieKeys =
-                  fileUtils.fetchRecordKeyPartitionPath(this.hadoopConf, new Path(baseFile.getPath()));
+              fileUtils.fetchRecordKeyPartitionPath(this.hadoopConf, new Path(baseFile.getPath()));
         } catch (Exception e) {
           throw new HoodieException(String.format("Error when loading record keys from file: %s", baseFile), e);
         }

         for (HoodieKey hoodieKey : hoodieKeys) {
-          out.collect((O) new BootstrapRecord(generateHoodieRecord(hoodieKey, baseFile)));
+          out.collect((O) new IndexRecord(generateHoodieRecord(hoodieKey, baseFile)));
         }
       }
     }

     long cost = System.currentTimeMillis() - start;
     LOG.info("Task [{}:{}] finished loading the index under partition {} and sending it downstream, time cost: {} milliseconds.",
-            this.getClass().getSimpleName(), taskID, partitionPath, cost);
+        this.getClass().getSimpleName(), taskID, partitionPath, cost);
   }

   @SuppressWarnings("unchecked")
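For reference, the key-extraction call above can be exercised on its own; a sketch reading the record keys of a single Parquet base file (the file path is illustrative):

```java
// Editor's sketch: fetch the HoodieKeys of one Parquet base file directly,
// using the same BaseFileUtils call as the patch. Path is illustrative.
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.util.BaseFileUtils;

public class FetchKeysSketch {
  public static void main(String[] args) {
    Configuration hadoopConf = new Configuration();
    List<HoodieKey> keys = BaseFileUtils.getInstance(HoodieFileFormat.PARQUET)
        .fetchRecordKeyPartitionPath(hadoopConf, new Path("/tmp/hoodie/2021/03/01/f1_0-1-0_001.parquet"));
    keys.forEach(k -> System.out.println(k.getRecordKey() + " / " + k.getPartitionPath()));
  }
}
```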