1
0

[HUDI-3559] Flink bucket index with COW table throws NoSuchElementException

The method FlinkWriteHelper#deduplicateRecords does not actually guarantee the record sequence, but there is an
implicit constraint: all the records in one bucket should have the same bucket type (instant time here).
The BucketStreamWriteFunction breaks this rule and therefore fails to comply with the constraint.

close apache/hudi#5018
This commit is contained in:
wxp4532
2022-03-11 14:07:52 +08:00
committed by yuzhao.cyz
parent a118d56b07
commit 26e5d2e6fc
3 changed files with 114 additions and 89 deletions

View File

@@ -27,7 +27,6 @@ import org.apache.hudi.common.model.HoodieOperation;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.model.WriteOperationType;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.exception.HoodieUpsertException;
import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.table.HoodieTable;
@@ -91,13 +90,11 @@ public class FlinkWriteHelper<T extends HoodieRecordPayload, R> extends BaseWrit
@Override
public List<HoodieRecord<T>> deduplicateRecords(
List<HoodieRecord<T>> records, HoodieIndex<?, ?> index, int parallelism) {
Map<Object, List<Pair<Object, HoodieRecord<T>>>> keyedRecords = records.stream().map(record -> {
// If index used is global, then records are expected to differ in their partitionPath
final Object key = record.getKey().getRecordKey();
return Pair.of(key, record);
}).collect(Collectors.groupingBy(Pair::getLeft));
// If index used is global, then records are expected to differ in their partitionPath
Map<Object, List<HoodieRecord<T>>> keyedRecords = records.stream()
.collect(Collectors.groupingBy(record -> record.getKey().getRecordKey()));
return keyedRecords.values().stream().map(x -> x.stream().map(Pair::getRight).reduce((rec1, rec2) -> {
return keyedRecords.values().stream().map(x -> x.stream().reduce((rec1, rec2) -> {
final T data1 = rec1.getData();
final T data2 = rec2.getData();