[HUDI-1801] FlinkMergeHandle rolling over may fail to rename the latest file handle (#2831)
The FlinkMergeHandle may rename the (N-1)-th file handle instead of the latest one, thus causing data duplication.
This commit is contained in:
@@ -21,7 +21,6 @@ package org.apache.hudi.io;
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.common.engine.TaskContextSupplier;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.HoodieBaseFile;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.util.HoodieTimer;
|
||||
@@ -37,7 +36,6 @@ import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* A {@link HoodieMergeHandle} that supports merge write incrementally(small data buffers).
|
||||
@@ -63,14 +61,16 @@ public class FlinkMergeHandle<T extends HoodieRecordPayload, I, K, O>
|
||||
* Records the current file handles number that rolls over.
|
||||
*/
|
||||
private int rollNumber = 0;
|
||||
|
||||
/**
|
||||
* Whether the handle should roll over to new, E.G. the handle has written some intermediate files already.
|
||||
*/
|
||||
private volatile boolean shouldRollover = false;
|
||||
|
||||
/**
|
||||
* Records the rolled over file paths.
|
||||
*/
|
||||
private List<Path> rolloverPaths;
|
||||
/**
|
||||
* Whether it is the first time to generate file handle, E.G. the handle has not rolled over yet.
|
||||
*/
|
||||
private boolean needBootStrap = true;
|
||||
private final List<Path> rolloverPaths;
|
||||
|
||||
public FlinkMergeHandle(HoodieWriteConfig config, String instantTime, HoodieTable<T, I, K, O> hoodieTable,
|
||||
Iterator<HoodieRecord<T>> recordItr, String partitionPath, String fileId,
|
||||
@@ -79,32 +79,22 @@ public class FlinkMergeHandle<T extends HoodieRecordPayload, I, K, O>
|
||||
rolloverPaths = new ArrayList<>();
|
||||
}
|
||||
|
||||
/**
|
||||
* Called by compactor code path.
|
||||
*/
|
||||
public FlinkMergeHandle(HoodieWriteConfig config, String instantTime, HoodieTable<T, I, K, O> hoodieTable,
|
||||
Map<String, HoodieRecord<T>> keyToNewRecords, String partitionPath, String fileId,
|
||||
HoodieBaseFile dataFileToBeMerged, TaskContextSupplier taskContextSupplier) {
|
||||
super(config, instantTime, hoodieTable, keyToNewRecords, partitionPath, fileId,
|
||||
dataFileToBeMerged, taskContextSupplier);
|
||||
}
|
||||
|
||||
/**
|
||||
* Use the fileId + "-" + rollNumber as the new fileId of a mini-batch write.
|
||||
*/
|
||||
protected String generatesDataFileName() {
|
||||
final String fileID = this.needBootStrap ? fileId : fileId + "-" + rollNumber;
|
||||
protected String generatesDataFileNameWithRollover() {
|
||||
final String fileID = this.fileId + "-" + rollNumber;
|
||||
return FSUtils.makeDataFileName(instantTime, writeToken, fileID, hoodieTable.getBaseFileExtension());
|
||||
}
|
||||
|
||||
public boolean isNeedBootStrap() {
|
||||
return needBootStrap;
|
||||
public boolean shouldRollover() {
|
||||
return shouldRollover;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<WriteStatus> close() {
|
||||
List<WriteStatus> writeStatus = super.close();
|
||||
this.needBootStrap = false;
|
||||
this.shouldRollover = true;
|
||||
return writeStatus;
|
||||
}
|
||||
|
||||
@@ -138,12 +128,12 @@ public class FlinkMergeHandle<T extends HoodieRecordPayload, I, K, O>
|
||||
this.writeStatus.setTotalErrorRecords(0);
|
||||
this.timer = new HoodieTimer().startTimer();
|
||||
|
||||
rollNumber++;
|
||||
rollNumber += 1;
|
||||
|
||||
rolloverPaths.add(newFilePath);
|
||||
oldFilePath = newFilePath;
|
||||
// Use the fileId + "-" + rollNumber as the new fileId of a mini-batch write.
|
||||
String newFileName = generatesDataFileName();
|
||||
String newFileName = generatesDataFileNameWithRollover();
|
||||
String relativePath = new Path((partitionPath.isEmpty() ? "" : partitionPath + "/")
|
||||
+ newFileName).toString();
|
||||
newFilePath = new Path(config.getBasePath(), relativePath);
|
||||
@@ -160,6 +150,7 @@ public class FlinkMergeHandle<T extends HoodieRecordPayload, I, K, O>
|
||||
|
||||
public void finishWrite() {
|
||||
// The file visibility should be kept by the configured ConsistencyGuard instance.
|
||||
rolloverPaths.add(newFilePath);
|
||||
if (rolloverPaths.size() == 1) {
|
||||
// only one flush action, no need to roll over
|
||||
return;
|
||||
@@ -176,7 +167,7 @@ public class FlinkMergeHandle<T extends HoodieRecordPayload, I, K, O>
|
||||
Path lastPath = rolloverPaths.size() > 0
|
||||
? rolloverPaths.get(rolloverPaths.size() - 1)
|
||||
: newFilePath;
|
||||
String newFileName = FSUtils.makeDataFileName(instantTime, writeToken, fileId, hoodieTable.getBaseFileExtension());
|
||||
String newFileName = generatesDataFileName();
|
||||
String relativePath = new Path((partitionPath.isEmpty() ? "" : partitionPath + "/")
|
||||
+ newFileName).toString();
|
||||
final Path desiredPath = new Path(config.getBasePath(), relativePath);
|
||||
|
||||
@@ -240,7 +240,7 @@ public abstract class BaseFlinkCommitActionExecutor<T extends HoodieRecordPayloa
|
||||
} else {
|
||||
FlinkMergeHandle writeHandle = (FlinkMergeHandle) this.writeHandle;
|
||||
// add the incremental records.
|
||||
if (!writeHandle.isNeedBootStrap()) {
|
||||
if (writeHandle.shouldRollover()) {
|
||||
writeHandle.rollOver(recordItr);
|
||||
}
|
||||
return writeHandle;
|
||||
|
||||
@@ -71,7 +71,7 @@ public class FlinkMergeHelper<T extends HoodieRecordPayload> extends AbstractMer
|
||||
final GenericDatumWriter<GenericRecord> gWriter;
|
||||
final GenericDatumReader<GenericRecord> gReader;
|
||||
Schema readSchema;
|
||||
if (isNeedBootStrap(mergeHandle)
|
||||
if (isFirstTimeMerge(mergeHandle)
|
||||
&& (externalSchemaTransformation || baseFile.getBootstrapBaseFile().isPresent())) {
|
||||
readSchema = HoodieFileReaderFactory.getFileReader(table.getHadoopConf(), mergeHandle.getOldFilePath()).getSchema();
|
||||
gWriter = new GenericDatumWriter<>(readSchema);
|
||||
@@ -86,7 +86,7 @@ public class FlinkMergeHelper<T extends HoodieRecordPayload> extends AbstractMer
|
||||
HoodieFileReader<GenericRecord> reader = HoodieFileReaderFactory.<GenericRecord>getFileReader(cfgForHoodieFile, mergeHandle.getOldFilePath());
|
||||
try {
|
||||
final Iterator<GenericRecord> readerIterator;
|
||||
if (isNeedBootStrap(mergeHandle) && baseFile.getBootstrapBaseFile().isPresent()) {
|
||||
if (isFirstTimeMerge(mergeHandle) && baseFile.getBootstrapBaseFile().isPresent()) {
|
||||
readerIterator = getMergingIterator(table, mergeHandle, baseFile, reader, readSchema, externalSchemaTransformation);
|
||||
} else {
|
||||
readerIterator = reader.getRecordIterator(readSchema);
|
||||
@@ -115,8 +115,8 @@ public class FlinkMergeHelper<T extends HoodieRecordPayload> extends AbstractMer
|
||||
}
|
||||
}
|
||||
|
||||
private static boolean isNeedBootStrap(HoodieMergeHandle<?, ?, ?, ?> mergeHandle) {
|
||||
return mergeHandle instanceof FlinkMergeHandle && ((FlinkMergeHandle) mergeHandle).isNeedBootStrap();
|
||||
private static boolean isFirstTimeMerge(HoodieMergeHandle<?, ?, ?, ?> mergeHandle) {
|
||||
return mergeHandle instanceof FlinkMergeHandle && !((FlinkMergeHandle) mergeHandle).shouldRollover();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -324,6 +324,34 @@ public class HoodieDataSourceITCase extends AbstractTestBase {
|
||||
assertRowsEquals(result2, expected);
|
||||
}
|
||||
|
||||
@ParameterizedTest
|
||||
@EnumSource(value = ExecMode.class)
|
||||
void testUpsertWithMiniBatches(ExecMode execMode) {
|
||||
TableEnvironment tableEnv = execMode == ExecMode.BATCH ? batchTableEnv : streamTableEnv;
|
||||
Map<String, String> options = new HashMap<>();
|
||||
options.put(FlinkOptions.PATH.key(), tempFile.getAbsolutePath());
|
||||
options.put(FlinkOptions.WRITE_BATCH_SIZE.key(), "0.001");
|
||||
String hoodieTableDDL = TestConfigurations.getCreateHoodieTableDDL("t1", options);
|
||||
tableEnv.executeSql(hoodieTableDDL);
|
||||
|
||||
final String insertInto1 = "insert into t1 values\n"
|
||||
+ "('id1','Danny',23,TIMESTAMP '1970-01-01 00:00:01','par1')";
|
||||
|
||||
execInsertSql(tableEnv, insertInto1);
|
||||
|
||||
final String insertInto2 = "insert into t1 values\n"
|
||||
+ "('id1','Stephen',33,TIMESTAMP '1970-01-01 00:00:02','par1'),\n"
|
||||
+ "('id1','Julian',53,TIMESTAMP '1970-01-01 00:00:03','par1'),\n"
|
||||
+ "('id1','Fabian',31,TIMESTAMP '1970-01-01 00:00:04','par1'),\n"
|
||||
+ "('id1','Sophia',18,TIMESTAMP '1970-01-01 00:00:05','par1')";
|
||||
|
||||
execInsertSql(tableEnv, insertInto2);
|
||||
|
||||
List<Row> result = CollectionUtil.iterableToList(
|
||||
() -> tableEnv.sqlQuery("select * from t1").execute().collect());
|
||||
assertRowsEquals(result, "[id1,Sophia,18,1970-01-01T00:00:05,par1]");
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Utilities
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
Reference in New Issue
Block a user