1
0

[HUDI-2686] Proccess record after all bootstrap operator ready (#3925)

Co-authored-by: yuzhaojing <yuzhaojing@bytedance.com>
This commit is contained in:
yuzhaojing
2021-11-05 14:36:22 +08:00
committed by GitHub
parent 2c1e259329
commit f67da0c7d0
4 changed files with 133 additions and 3 deletions

View File

@@ -34,6 +34,7 @@ import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.configuration.FlinkOptions;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.sink.bootstrap.aggregate.BootstrapAggFunction;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.format.FormatUtils;
import org.apache.hudi.util.FlinkTables;
@@ -48,6 +49,7 @@ import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.state.KeyGroupRangeAssignment;
import org.apache.flink.runtime.state.StateInitializationContext;
import org.apache.flink.runtime.state.StateSnapshotContext;
import org.apache.flink.runtime.taskexecutor.GlobalAggregateManager;
import org.apache.flink.streaming.api.operators.AbstractStreamOperator;
import org.apache.flink.streaming.api.operators.OneInputStreamOperator;
import org.apache.flink.streaming.runtime.streamrecord.StreamRecord;
@@ -58,6 +60,7 @@ import org.slf4j.LoggerFactory;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;
import static java.util.stream.Collectors.toList;
@@ -71,7 +74,7 @@ import static org.apache.hudi.util.StreamerUtil.isValidFile;
*
* <p>The output records should then shuffle by the recordKey and thus do scalable write.
*/
public class BootstrapOperator<I, O extends HoodieRecord>
public class BootstrapOperator<I, O extends HoodieRecord<?>>
extends AbstractStreamOperator<O> implements OneInputStreamOperator<I, O> {
private static final Logger LOG = LoggerFactory.getLogger(BootstrapOperator.class);
@@ -83,6 +86,8 @@ public class BootstrapOperator<I, O extends HoodieRecord>
protected transient org.apache.hadoop.conf.Configuration hadoopConf;
protected transient HoodieWriteConfig writeConfig;
private transient GlobalAggregateManager aggregateManager;
private transient ListState<String> instantState;
private final Pattern pattern;
private String lastInstantTime;
@@ -117,6 +122,7 @@ public class BootstrapOperator<I, O extends HoodieRecord>
this.hadoopConf = StreamerUtil.getHadoopConf();
this.writeConfig = StreamerUtil.getHoodieClientConfig(this.conf, true);
this.hoodieTable = FlinkTables.createTable(writeConfig, hadoopConf, getRuntimeContext());
this.aggregateManager = getRuntimeContext().getGlobalAggregateManager();
preLoadIndexRecords();
}
@@ -135,6 +141,27 @@ public class BootstrapOperator<I, O extends HoodieRecord>
}
LOG.info("Finish sending index records, taskId = {}.", getRuntimeContext().getIndexOfThisSubtask());
// wait for the other bootstrap tasks finish bootstrapping.
waitForBootstrapReady(getRuntimeContext().getIndexOfThisSubtask());
}
/**
* Wait for other bootstrap tasks to finish the index bootstrap.
*/
private void waitForBootstrapReady(int taskID) {
int taskNum = getRuntimeContext().getNumberOfParallelSubtasks();
int readyTaskNum = 1;
while (taskNum != readyTaskNum) {
try {
readyTaskNum = aggregateManager.updateGlobalAggregate(BootstrapAggFunction.NAME, taskID, new BootstrapAggFunction());
LOG.info("Waiting for other bootstrap tasks to complete, taskId = {}.", taskID);
TimeUnit.SECONDS.sleep(5);
} catch (Exception e) {
LOG.warn("Update global task bootstrap summary error", e);
}
}
}
@Override

View File

@@ -0,0 +1,53 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.sink.bootstrap.aggregate;
import java.io.Serializable;
import java.util.HashSet;
import java.util.Set;
/**
* Bootstrap ready task id accumulator.
*/
public class BootstrapAccumulator implements Serializable {
private static final long serialVersionUID = 1L;
private final Set<Integer> readyTaskSet;
public BootstrapAccumulator() {
this.readyTaskSet = new HashSet<>();
}
public void update(int taskId) {
readyTaskSet.add(taskId);
}
public int readyTaskNum() {
return readyTaskSet.size();
}
public BootstrapAccumulator merge(BootstrapAccumulator acc) {
if (acc == null) {
return this;
}
readyTaskSet.addAll(acc.readyTaskSet);
return this;
}
}

View File

@@ -0,0 +1,50 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.sink.bootstrap.aggregate;
import org.apache.flink.api.common.functions.AggregateFunction;
/**
* Aggregate function that accumulates the loaded task number of
* function {@link org.apache.hudi.sink.bootstrap.BootstrapOperator}.
*/
public class BootstrapAggFunction implements AggregateFunction<Integer, BootstrapAccumulator, Integer> {
public static final String NAME = BootstrapAggFunction.class.getSimpleName();
@Override
public BootstrapAccumulator createAccumulator() {
return new BootstrapAccumulator();
}
@Override
public BootstrapAccumulator add(Integer taskId, BootstrapAccumulator bootstrapAccumulator) {
bootstrapAccumulator.update(taskId);
return bootstrapAccumulator;
}
@Override
public Integer getResult(BootstrapAccumulator bootstrapAccumulator) {
return bootstrapAccumulator.readyTaskNum();
}
@Override
public BootstrapAccumulator merge(BootstrapAccumulator bootstrapAccumulator, BootstrapAccumulator acc) {
return bootstrapAccumulator.merge(acc);
}
}

View File

@@ -39,7 +39,7 @@ import java.util.Set;
*
* <p>The input records should shuffle by the partition path to avoid repeated loading.
*/
public class BatchBootstrapOperator<I, O extends HoodieRecord>
public class BatchBootstrapOperator<I, O extends HoodieRecord<?>>
extends BootstrapOperator<I, O> {
private Set<String> partitionPathSet;
@@ -64,7 +64,7 @@ public class BatchBootstrapOperator<I, O extends HoodieRecord>
@Override
@SuppressWarnings("unchecked")
public void processElement(StreamRecord<I> element) throws Exception {
final HoodieRecord record = (HoodieRecord<?>) element.getValue();
final HoodieRecord<?> record = (HoodieRecord<?>) element.getValue();
final String partitionPath = record.getKey().getPartitionPath();
if (haveSuccessfulCommits && !partitionPathSet.contains(partitionPath)) {