1
0

[HUDI -409] Match header and footer block length to improve corrupted block detection (#1332)

This commit is contained in:
Ramachandran M S
2020-03-03 13:26:54 -08:00
committed by GitHub
parent 8306205d7a
commit 9d46ce380a
3 changed files with 36 additions and 4 deletions

View File

@@ -231,6 +231,7 @@ class HoodieLogFileReader implements HoodieLogFormat.Reader {
inputStream.seek(currentPos + blocksize);
}
} catch (EOFException e) {
LOG.info("Found corrupted block in file " + logFile + " with block size(" + blocksize + ") running past EOF");
// this is corrupt
// This seek is required because contract of seek() is different for naked DFSInputStream vs BufferedFSInputStream
// release-3.1.0-RC1/DFSInputStream.java#L1455
@@ -239,12 +240,26 @@ class HoodieLogFileReader implements HoodieLogFormat.Reader {
return true;
}
// check if the blocksize mentioned in the footer is the same as the header; by seeking back the length of a long
// the backward seek does not incur additional IO as {@link org.apache.hadoop.hdfs.DFSInputStream#seek()}
// only moves the index. actual IO happens on the next read operation
inputStream.seek(inputStream.getPos() - Long.BYTES);
// Block size in the footer includes the magic header, which the header does not include.
// So we have to shorten the footer block size by the size of magic hash
long blockSizeFromFooter = inputStream.readLong() - MAGIC_BUFFER.length;
if (blocksize != blockSizeFromFooter) {
LOG.info("Found corrupted block in file " + logFile + ". Header block size(" + blocksize
+ ") did not match the footer block size(" + blockSizeFromFooter + ")");
inputStream.seek(currentPos);
return true;
}
try {
readMagic();
// all good - either we found the sync marker or EOF. Reset position and continue
return false;
} catch (CorruptedLogFileException e) {
// This is a corrupted block
LOG.info("Found corrupted block in file " + logFile + ". No magic hash found right after footer block size entry");
return true;
} finally {
inputStream.seek(currentPos);
@@ -310,7 +325,6 @@ class HoodieLogFileReader implements HoodieLogFormat.Reader {
}
private boolean hasNextMagic() throws IOException {
long pos = inputStream.getPos();
// 1. Read magic header from the start of the block
inputStream.readFully(MAGIC_BUFFER, 0, 6);
return Arrays.equals(MAGIC_BUFFER, HoodieLogFormat.MAGIC);

View File

@@ -158,6 +158,9 @@ public class HoodieLogFormatWriter implements HoodieLogFormat.Writer {
this.output.write(footerBytes);
// 9. Write the total size of the log block (including magic) which is everything written
// until now (for reverse pointer)
// Update: this information is now used in determining if a block is corrupt by comparing to the
// block size in header. This change assumes that the block size will be the last data written
// to a block. Read will break if any data is written past this point for a block.
this.output.writeLong(this.output.size() - currentSize);
// Flush every block to disk
flush();