[HUDI -409] Match header and footer block length to improve corrupted block detection (#1332)
This commit is contained in:
@@ -231,6 +231,7 @@ class HoodieLogFileReader implements HoodieLogFormat.Reader {
|
||||
inputStream.seek(currentPos + blocksize);
|
||||
}
|
||||
} catch (EOFException e) {
|
||||
LOG.info("Found corrupted block in file " + logFile + " with block size(" + blocksize + ") running past EOF");
|
||||
// this is corrupt
|
||||
// This seek is required because contract of seek() is different for naked DFSInputStream vs BufferedFSInputStream
|
||||
// release-3.1.0-RC1/DFSInputStream.java#L1455
|
||||
@@ -239,12 +240,26 @@ class HoodieLogFileReader implements HoodieLogFormat.Reader {
|
||||
return true;
|
||||
}
|
||||
|
||||
// check if the blocksize mentioned in the footer is the same as the header; by seeking back the length of a long
|
||||
// the backward seek does not incur additional IO as {@link org.apache.hadoop.hdfs.DFSInputStream#seek()}
|
||||
// only moves the index. actual IO happens on the next read operation
|
||||
inputStream.seek(inputStream.getPos() - Long.BYTES);
|
||||
// Block size in the footer includes the magic header, which the header does not include.
|
||||
// So we have to shorten the footer block size by the size of magic hash
|
||||
long blockSizeFromFooter = inputStream.readLong() - MAGIC_BUFFER.length;
|
||||
if (blocksize != blockSizeFromFooter) {
|
||||
LOG.info("Found corrupted block in file " + logFile + ". Header block size(" + blocksize
|
||||
+ ") did not match the footer block size(" + blockSizeFromFooter + ")");
|
||||
inputStream.seek(currentPos);
|
||||
return true;
|
||||
}
|
||||
try {
|
||||
readMagic();
|
||||
// all good - either we found the sync marker or EOF. Reset position and continue
|
||||
return false;
|
||||
} catch (CorruptedLogFileException e) {
|
||||
// This is a corrupted block
|
||||
LOG.info("Found corrupted block in file " + logFile + ". No magic hash found right after footer block size entry");
|
||||
return true;
|
||||
} finally {
|
||||
inputStream.seek(currentPos);
|
||||
@@ -310,7 +325,6 @@ class HoodieLogFileReader implements HoodieLogFormat.Reader {
|
||||
}
|
||||
|
||||
private boolean hasNextMagic() throws IOException {
|
||||
long pos = inputStream.getPos();
|
||||
// 1. Read magic header from the start of the block
|
||||
inputStream.readFully(MAGIC_BUFFER, 0, 6);
|
||||
return Arrays.equals(MAGIC_BUFFER, HoodieLogFormat.MAGIC);
|
||||
|
||||
@@ -158,6 +158,9 @@ public class HoodieLogFormatWriter implements HoodieLogFormat.Writer {
|
||||
this.output.write(footerBytes);
|
||||
// 9. Write the total size of the log block (including magic) which is everything written
|
||||
// until now (for reverse pointer)
|
||||
// Update: this information is now used in determining if a block is corrupt by comparing to the
|
||||
// block size in header. This change assumes that the block size will be the last data written
|
||||
// to a block. Read will break if any data is written past this point for a block.
|
||||
this.output.writeLong(this.output.size() - currentSize);
|
||||
// Flush every block to disk
|
||||
flush();
|
||||
|
||||
Reference in New Issue
Block a user