HUDI-140 : GCS: Log File Reading not working due to difference in seek() behavior for EOF
This commit is contained in:
committed by
Balaji Varadarajan
parent
9857c4b21c
commit
0b451b3a58
@@ -27,6 +27,7 @@ import com.uber.hoodie.common.table.log.block.HoodieDeleteBlock;
|
||||
import com.uber.hoodie.common.table.log.block.HoodieLogBlock;
|
||||
import com.uber.hoodie.common.table.log.block.HoodieLogBlock.HeaderMetadataType;
|
||||
import com.uber.hoodie.common.table.log.block.HoodieLogBlock.HoodieLogBlockType;
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
import com.uber.hoodie.exception.CorruptedLogFileException;
|
||||
import com.uber.hoodie.exception.HoodieIOException;
|
||||
import com.uber.hoodie.exception.HoodieNotSupportedException;
|
||||
@@ -234,7 +235,11 @@ class HoodieLogFileReader implements HoodieLogFormat.Reader {
|
||||
private boolean isBlockCorrupt(int blocksize) throws IOException {
|
||||
long currentPos = inputStream.getPos();
|
||||
try {
|
||||
inputStream.seek(currentPos + blocksize);
|
||||
if (FSUtils.isGCSInputStream(inputStream)) {
|
||||
inputStream.seek(currentPos + blocksize - 1);
|
||||
} else {
|
||||
inputStream.seek(currentPos + blocksize);
|
||||
}
|
||||
} catch (EOFException e) {
|
||||
// this is corrupt
|
||||
// This seek is required because contract of seek() is different for naked DFSInputStream vs BufferedFSInputStream
|
||||
|
||||
@@ -21,6 +21,7 @@ package com.uber.hoodie.common.table.log.block;
|
||||
import com.google.common.collect.Maps;
|
||||
import com.uber.hoodie.common.model.HoodieLogFile;
|
||||
import com.uber.hoodie.common.table.log.HoodieMergedLogRecordScanner;
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
import com.uber.hoodie.exception.HoodieException;
|
||||
import com.uber.hoodie.exception.HoodieIOException;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
@@ -233,7 +234,7 @@ public abstract class HoodieLogBlock {
|
||||
inputStream.readFully(content, 0, contentLength);
|
||||
} else {
|
||||
// Seek to the end of the content block
|
||||
inputStream.seek(inputStream.getPos() + contentLength);
|
||||
safeSeek(inputStream, inputStream.getPos() + contentLength);
|
||||
}
|
||||
return content;
|
||||
}
|
||||
@@ -245,9 +246,9 @@ public abstract class HoodieLogBlock {
|
||||
|
||||
try {
|
||||
content = Optional.of(new byte[(int) this.getBlockContentLocation().get().getBlockSize()]);
|
||||
inputStream.seek(this.getBlockContentLocation().get().getContentPositionInLogFile());
|
||||
safeSeek(inputStream, this.getBlockContentLocation().get().getContentPositionInLogFile());
|
||||
inputStream.readFully(content.get(), 0, content.get().length);
|
||||
inputStream.seek(this.getBlockContentLocation().get().getBlockEndPos());
|
||||
safeSeek(inputStream, this.getBlockContentLocation().get().getBlockEndPos());
|
||||
} catch (IOException e) {
|
||||
try {
|
||||
// TODO : fs.open() and return inputstream again, need to pass FS configuration
|
||||
@@ -268,4 +269,21 @@ public abstract class HoodieLogBlock {
|
||||
content = Optional.empty();
|
||||
}
|
||||
|
||||
/**
|
||||
* Handles difference in seek behavior for GCS and non-GCS input stream
|
||||
* @param inputStream Input Stream
|
||||
* @param pos Position to seek
|
||||
* @throws IOException
|
||||
*/
|
||||
private static void safeSeek(FSDataInputStream inputStream, long pos) throws IOException {
|
||||
try {
|
||||
inputStream.seek(pos);
|
||||
} catch (EOFException e) {
|
||||
if (FSUtils.isGCSInputStream(inputStream)) {
|
||||
inputStream.seek(pos - 1);
|
||||
} else {
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -45,6 +45,7 @@ import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Stream;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.LocatedFileStatus;
|
||||
@@ -552,4 +553,15 @@ public class FSUtils {
|
||||
return ((partitionPath == null) || (partitionPath.isEmpty())) ? basePath :
|
||||
new Path(basePath, partitionPath);
|
||||
}
|
||||
|
||||
/**
|
||||
* This is due to HUDI-140 GCS has a different behavior for detecting EOF during seek().
|
||||
* @param inputStream FSDataInputStream
|
||||
* @return true if the inputstream or the wrapped one is of type GoogleHadoopFSInputStream
|
||||
*/
|
||||
public static boolean isGCSInputStream(FSDataInputStream inputStream) {
|
||||
return inputStream.getClass().getCanonicalName().equals("com.google.cloud.hadoop.fs.gcs.GoogleHadoopFSInputStream")
|
||||
|| inputStream.getWrappedStream().getClass().getCanonicalName()
|
||||
.equals("com.google.cloud.hadoop.fs.gcs.GoogleHadoopFSInputStream");
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user