1
0

[HUDI-4393] Add marker file for target file when flink merge handle rolls over (#6103)

This commit is contained in:
Danny Chan
2022-07-14 16:00:08 +08:00
committed by GitHub
parent aaccc63ad5
commit 05606708fa
3 changed files with 8 additions and 24 deletions

View File

@@ -655,7 +655,7 @@ public abstract class HoodieTable<T extends HoodieRecordPayload, I, K, O> implem
invalidDataPaths.removeAll(validDataPaths);
if (!invalidDataPaths.isEmpty()) {
-LOG.info("Removing duplicate data files created due to spark retries before committing. Paths=" + invalidDataPaths);
+LOG.info("Removing duplicate data files created due to task retries before committing. Paths=" + invalidDataPaths);
Map<String, List<Pair<String, String>>> invalidPathsByPartition = invalidDataPaths.stream()
.map(dp -> Pair.of(new Path(basePath, dp).getParent().toString(), new Path(basePath, dp).toString()))
.collect(Collectors.groupingBy(Pair::getKey));

View File

@@ -167,7 +167,7 @@ public class FlinkMergeAndReplaceHandle<T extends HoodieRecordPayload, I, K, O>
try {
fs.rename(newFilePath, oldFilePath);
} catch (IOException e) {
-throw new HoodieIOException("Error while renaming the temporary roll file: "
+throw new HoodieIOException("Error while renaming the temporary rollover file: "
+ newFilePath + " to old base file name: " + oldFilePath, e);
}
}

View File

@@ -43,7 +43,7 @@ import java.util.List;
/**
* A {@link HoodieMergeHandle} that supports MERGE write incrementally(small data buffers).
*
-* <p>For a new data buffer, it initialize and set up the next file path to write,
+* <p>For a new data buffer, it initializes and set up the next file path to write,
* and closes the file path when the data buffer write finish. When next data buffer
* write starts, it rolls over to another new file. If all the data buffers write finish
* for a checkpoint round, it renames the last new file path as the desired file name
@@ -143,8 +143,7 @@ public class FlinkMergeHandle<T extends HoodieRecordPayload, I, K, O>
break;
}
oldFilePath = newFilePath; // override the old file name
-rolloverPaths.add(oldFilePath);
+rolloverPaths.add(newFilePath);
newFileName = newFileNameWithRollover(rollNumber++);
newFilePath = makeNewFilePath(partitionPath, newFileName);
LOG.warn("Duplicate write for MERGE bucket with path: " + oldFilePath + ", rolls over to new path: " + newFilePath);
@@ -162,13 +161,6 @@ public class FlinkMergeHandle<T extends HoodieRecordPayload, I, K, O>
this.fileId, hoodieTable.getBaseFileExtension());
}
-@Override
-protected void setWriteStatusPath() {
-  // if there was rollover, should set up the path as the initial new file path.
-  Path path = rolloverPaths.size() > 0 ? rolloverPaths.get(0) : newFilePath;
-  writeStatus.getStat().setPath(new Path(config.getBasePath()), path);
-}
@Override
public List<WriteStatus> close() {
try {
@@ -188,27 +180,19 @@ public class FlinkMergeHandle<T extends HoodieRecordPayload, I, K, O>
public void finalizeWrite() {
// The file visibility should be kept by the configured ConsistencyGuard instance.
rolloverPaths.add(newFilePath);
-if (rolloverPaths.size() == 1) {
+if (rolloverPaths.size() == 0) {
// only one flush action, no need to roll over
return;
}
-for (int i = 0; i < rolloverPaths.size() - 1; i++) {
-  Path path = rolloverPaths.get(i);
+for (Path path : rolloverPaths) {
try {
fs.delete(path, false);
LOG.info("Delete the rollover data file: " + path + " success!");
} catch (IOException e) {
-throw new HoodieIOException("Error when clean the temporary roll file: " + path, e);
+throw new HoodieIOException("Error when clean the temporary rollover data file: " + path, e);
}
}
-final Path lastPath = rolloverPaths.get(rolloverPaths.size() - 1);
-final Path desiredPath = rolloverPaths.get(0);
-try {
-  fs.rename(lastPath, desiredPath);
-} catch (IOException e) {
-  throw new HoodieIOException("Error when rename the temporary roll file: " + lastPath + " to: " + desiredPath, e);
-}
}
@Override