[HUDI-2005] Removing direct fs call in HoodieLogFileReader (#3865)
parent 6f5d8d04cd
commit 8340ccb503
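
Note on the change below: HoodieLogFileReader previously issued a fs.getFileStatus() call just to learn a log file's length, which costs one round trip to the NameNode or object store per reader. This commit threads HoodieLogFile handles, which already carry the size, through the read path instead. A minimal sketch of the pattern, assuming the caller already holds a FileStatus from a directory listing (class name LogSizeSketch is hypothetical):

import org.apache.hadoop.fs.FileStatus;
import org.apache.hudi.common.model.HoodieLogFile;

class LogSizeSketch {
  // Carry the length on the HoodieLogFile handle instead of re-fetching it,
  // so opening a reverse reader costs no extra filesystem call.
  static long reverseStartPosition(FileStatus status) {
    HoodieLogFile logFile = new HoodieLogFile(status.getPath(), status.getLen());
    return logFile.getFileSize(); // replaces fs.getFileStatus(path).getLen()
  }
}
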
@@ -76,6 +76,11 @@ public class HoodieLogFileReader implements HoodieLogFormat.Reader {
   private boolean closed = false;
   private transient Thread shutdownThread = null;
 
+  public HoodieLogFileReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema, int bufferSize,
+                             boolean readBlockLazily) throws IOException {
+    this(fs, logFile, readerSchema, bufferSize, readBlockLazily, false);
+  }
+
   public HoodieLogFileReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema, int bufferSize,
                              boolean readBlockLazily, boolean reverseReader) throws IOException {
     this(fs, logFile, readerSchema, bufferSize, readBlockLazily, reverseReader, false,
@@ -94,16 +99,11 @@ public class HoodieLogFileReader implements HoodieLogFormat.Reader {
     this.enableInlineReading = enableInlineReading;
     this.keyField = keyField;
     if (this.reverseReader) {
-      this.reverseLogFilePosition = this.lastReverseLogFilePosition = fs.getFileStatus(logFile.getPath()).getLen();
+      this.reverseLogFilePosition = this.lastReverseLogFilePosition = logFile.getFileSize();
     }
     addShutDownHook();
   }
 
-  public HoodieLogFileReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema, boolean readBlockLazily,
-                             boolean reverseReader) throws IOException {
-    this(fs, logFile, readerSchema, DEFAULT_BUFFER_SIZE, readBlockLazily, reverseReader);
-  }
-
   public HoodieLogFileReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema) throws IOException {
     this(fs, logFile, readerSchema, DEFAULT_BUFFER_SIZE, false, false);
   }

@@ -274,7 +274,7 @@ public interface HoodieLogFormat {
 
   static HoodieLogFormat.Reader newReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema)
       throws IOException {
-    return new HoodieLogFileReader(fs, logFile, readerSchema, HoodieLogFileReader.DEFAULT_BUFFER_SIZE, false, false);
+    return new HoodieLogFileReader(fs, logFile, readerSchema, HoodieLogFileReader.DEFAULT_BUFFER_SIZE, false);
   }
 
   static HoodieLogFormat.Reader newReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema,

@@ -104,9 +104,8 @@ public class HoodieLogFormatReader implements HoodieLogFormat.Reader {
       } else {
         this.prevReadersInOpenState.add(currentReader);
       }
-      this.currentReader =
-          new HoodieLogFileReader(fs, nextLogFile, readerSchema, bufferSize, readBlocksLazily, false,
-              enableInlineReading, recordKeyField);
+      this.currentReader = new HoodieLogFileReader(fs, nextLogFile, readerSchema, bufferSize, readBlocksLazily, false,
+          enableInlineReading, recordKeyField);
     } catch (IOException io) {
       throw new HoodieIOException("unable to initialize read with log file ", io);
     }

@@ -27,14 +27,15 @@ import org.apache.hudi.common.table.log.block.HoodieLogBlock;
 import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType;
 import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
 import org.apache.hudi.common.table.timeline.HoodieTimeline;
+import org.apache.hudi.common.util.collection.Pair;
 
 import org.apache.avro.Schema;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 
 import java.io.IOException;
 import java.util.List;
 import java.util.Map;
 import java.util.stream.Collectors;
 
 /**
@@ -42,9 +43,10 @@ import java.util.stream.Collectors;
  */
 public class LogReaderUtils {
 
-  private static Schema readSchemaFromLogFileInReverse(FileSystem fs, HoodieActiveTimeline activeTimeline, Path path)
+  private static Schema readSchemaFromLogFileInReverse(FileSystem fs, HoodieActiveTimeline activeTimeline, HoodieLogFile hoodieLogFile)
       throws IOException {
-    Reader reader = HoodieLogFormat.newReader(fs, new HoodieLogFile(path), null, true, true);
+    // set length for the HoodieLogFile as it will be leveraged by HoodieLogFormat.Reader with reverseReading enabled
+    Reader reader = HoodieLogFormat.newReader(fs, hoodieLogFile, null, true, true);
     Schema writerSchema = null;
     HoodieTimeline completedTimeline = activeTimeline.getCommitsTimeline().filterCompletedInstants();
     while (reader.hasPrev()) {
@@ -62,17 +64,17 @@ public class LogReaderUtils {
     return writerSchema;
   }
 
-  public static Schema readLatestSchemaFromLogFiles(String basePath, List<String> deltaFilePaths, Configuration config)
+  public static Schema readLatestSchemaFromLogFiles(String basePath, List<HoodieLogFile> logFiles, Configuration config)
       throws IOException {
     HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(config).setBasePath(basePath).build();
-    List<String> deltaPaths = deltaFilePaths.stream().map(s -> new HoodieLogFile(new Path(s)))
-        .sorted(HoodieLogFile.getReverseLogFileComparator()).map(s -> s.getPath().toString())
+    List<String> deltaPaths = logFiles.stream().sorted(HoodieLogFile.getReverseLogFileComparator()).map(s -> s.getPath().toString())
         .collect(Collectors.toList());
     if (deltaPaths.size() > 0) {
+      Map<String, HoodieLogFile> deltaFilePathToFileStatus = logFiles.stream().map(entry -> Pair.of(entry.getPath().toString(), entry))
+          .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
       for (String logPath : deltaPaths) {
         FileSystem fs = FSUtils.getFs(logPath, config);
-        Schema schemaFromLogFile =
-            readSchemaFromLogFileInReverse(fs, metaClient.getActiveTimeline(), new Path(logPath));
+        Schema schemaFromLogFile = readSchemaFromLogFileInReverse(fs, metaClient.getActiveTimeline(), deltaFilePathToFileStatus.get(logPath));
         if (schemaFromLogFile != null) {
           return schemaFromLogFile;
         }
@@ -80,5 +82,4 @@ public class LogReaderUtils {
     }
     return null;
   }
-
 }
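
Reviewer note: with the new signature above, callers must pass HoodieLogFile objects whose length is already populated, since readSchemaFromLogFileInReverse opens the log with reverseReading enabled and never stats the file itself. A hedged usage sketch (the path and size literals are illustrative only, and LogReaderUtils is assumed to be imported from its Hudi package):

import java.util.Collections;
import java.util.List;

import org.apache.avro.Schema;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.model.HoodieLogFile;

class SchemaLookupSketch {
  static Schema latestSchema(String basePath, Configuration conf) throws java.io.IOException {
    // The file length (512L here, hypothetical) must be known up front;
    // LogReaderUtils no longer calls fs.getFileStatus() to discover it.
    List<HoodieLogFile> logFiles =
        Collections.singletonList(new HoodieLogFile(new Path(basePath + "/1.log"), 512L));
    return LogReaderUtils.readLatestSchemaFromLogFiles(basePath, logFiles, conf);
  }
}
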
@@ -1482,7 +1482,8 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 
     FileCreateUtils.createDeltaCommit(basePath, "100", fs);
 
-    HoodieLogFileReader reader = new HoodieLogFileReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema(),
+    HoodieLogFileReader reader = new HoodieLogFileReader(fs, new HoodieLogFile(writer.getLogFile().getPath(),
+        fs.getFileStatus(writer.getLogFile().getPath()).getLen()), SchemaTestUtil.getSimpleSchema(),
         bufferSize, readBlocksLazily, true);
 
     assertTrue(reader.hasPrev(), "Last block should be available");
@@ -1560,7 +1561,8 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 
     // First round of reads - we should be able to read the first block and then EOF
     HoodieLogFileReader reader =
-        new HoodieLogFileReader(fs, writer.getLogFile(), schema, bufferSize, readBlocksLazily, true);
+        new HoodieLogFileReader(fs, new HoodieLogFile(writer.getLogFile().getPath(),
+            fs.getFileStatus(writer.getLogFile().getPath()).getLen()), schema, bufferSize, readBlocksLazily, true);
 
     assertTrue(reader.hasPrev(), "Last block should be available");
     HoodieLogBlock block = reader.prev();
@@ -1610,7 +1612,8 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 
     FileCreateUtils.createDeltaCommit(basePath, "100", fs);
 
-    HoodieLogFileReader reader = new HoodieLogFileReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema(),
+    HoodieLogFileReader reader = new HoodieLogFileReader(fs, new HoodieLogFile(writer.getLogFile().getPath(),
+        fs.getFileStatus(writer.getLogFile().getPath()).getLen()), SchemaTestUtil.getSimpleSchema(),
         bufferSize, readBlocksLazily, true);
 
     assertTrue(reader.hasPrev(), "Third block should be available");

@@ -18,6 +18,8 @@
 
 package org.apache.hudi.hadoop;
 
+import org.apache.hudi.common.model.HoodieLogFile;
+
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapred.FileSplit;
@@ -36,7 +38,7 @@ public class BaseFileWithLogsSplit extends FileSplit {
   // a flag to mark this split is produced by incremental query or not.
   private boolean belongToIncrementalSplit = false;
   // the log file paths of this split.
-  private List<String> deltaLogPaths = new ArrayList<>();
+  private List<HoodieLogFile> deltaLogFiles = new ArrayList<>();
   // max commit time of current split.
   private String maxCommitTime = "";
   // the basePath of current hoodie table.
@@ -55,9 +57,10 @@ public class BaseFileWithLogsSplit extends FileSplit {
     Text.writeString(out, maxCommitTime);
     Text.writeString(out, basePath);
     Text.writeString(out, baseFilePath);
-    out.writeInt(deltaLogPaths.size());
-    for (String logPath : deltaLogPaths) {
-      Text.writeString(out, logPath);
+    out.writeInt(deltaLogFiles.size());
+    for (HoodieLogFile logFile : deltaLogFiles) {
+      Text.writeString(out, logFile.getPath().toString());
+      out.writeLong(logFile.getFileSize());
     }
   }
 
@@ -69,11 +72,13 @@ public class BaseFileWithLogsSplit extends FileSplit {
     basePath = Text.readString(in);
     baseFilePath = Text.readString(in);
     int deltaLogSize = in.readInt();
-    List<String> tempDeltaLogs = new ArrayList<>();
+    List<HoodieLogFile> tempDeltaLogs = new ArrayList<>();
     for (int i = 0; i < deltaLogSize; i++) {
-      tempDeltaLogs.add(Text.readString(in));
+      String logPath = Text.readString(in);
+      long logFileSize = in.readLong();
+      tempDeltaLogs.add(new HoodieLogFile(new Path(logPath), logFileSize));
     }
-    deltaLogPaths = tempDeltaLogs;
+    deltaLogFiles = tempDeltaLogs;
   }
 
@@ -84,12 +89,12 @@ public class BaseFileWithLogsSplit extends FileSplit {
     this.belongToIncrementalSplit = belongToIncrementalSplit;
   }
 
-  public List<String> getDeltaLogPaths() {
-    return deltaLogPaths;
+  public List<HoodieLogFile> getDeltaLogFiles() {
+    return deltaLogFiles;
   }
 
-  public void setDeltaLogPaths(List<String> deltaLogPaths) {
-    this.deltaLogPaths = deltaLogPaths;
+  public void setDeltaLogFiles(List<HoodieLogFile> deltaLogFiles) {
+    this.deltaLogFiles = deltaLogFiles;
   }
 
   public String getMaxCommitTime() {
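
Reviewer note: the write()/readFields() hunks above change the split's serialized layout from a count followed by path strings to a count followed by (path, length) pairs. A standalone round-trip sketch of just that framing, using plain java.io in place of Hadoop's Text helpers (writeUTF stands in for Text.writeString here; the path and size are hypothetical):

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

class SplitFramingSketch {
  public static void main(String[] args) throws IOException {
    ByteArrayOutputStream buf = new ByteArrayOutputStream();
    try (DataOutputStream out = new DataOutputStream(buf)) {
      out.writeInt(1);               // number of delta log files
      out.writeUTF("/tbl/p1/1.log"); // hypothetical path
      out.writeLong(512L);           // hypothetical size, new in this layout
    }
    try (DataInputStream in = new DataInputStream(new ByteArrayInputStream(buf.toByteArray()))) {
      int n = in.readInt();          // the reader must mirror the writer exactly
      for (int i = 0; i < n; i++) {
        System.out.println(in.readUTF() + " -> " + in.readLong() + " bytes");
      }
    }
  }
}
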
@@ -18,6 +18,8 @@
 
 package org.apache.hudi.hadoop;
 
+import org.apache.hudi.common.model.HoodieLogFile;
+
 import org.apache.hadoop.fs.Path;
 
 import java.util.ArrayList;
@@ -31,7 +33,7 @@ public class PathWithLogFilePath extends Path {
   // a flag to mark this split is produced by incremental query or not.
   private boolean belongToIncrementalPath = false;
   // the log files belong this path.
-  private List<String> deltaLogPaths = new ArrayList<>();
+  private List<HoodieLogFile> deltaLogFiles = new ArrayList<>();
   // max commit time of current path.
   private String maxCommitTime = "";
   // the basePath of current hoodie table.
@@ -50,12 +52,12 @@ public class PathWithLogFilePath extends Path {
     this.belongToIncrementalPath = belongToIncrementalPath;
   }
 
-  public List<String> getDeltaLogPaths() {
-    return deltaLogPaths;
+  public List<HoodieLogFile> getDeltaLogFiles() {
+    return deltaLogFiles;
   }
 
-  public void setDeltaLogPaths(List<String> deltaLogPaths) {
-    this.deltaLogPaths = deltaLogPaths;
+  public void setDeltaLogFiles(List<HoodieLogFile> deltaLogFiles) {
+    this.deltaLogFiles = deltaLogFiles;
   }
 
   public String getMaxCommitTime() {
@@ -97,7 +99,7 @@ public class PathWithLogFilePath extends Path {
   public BaseFileWithLogsSplit buildSplit(Path file, long start, long length, String[] hosts) {
     BaseFileWithLogsSplit bs = new BaseFileWithLogsSplit(file, start, length, hosts);
     bs.setBelongToIncrementalSplit(belongToIncrementalPath);
-    bs.setDeltaLogPaths(deltaLogPaths);
+    bs.setDeltaLogFiles(deltaLogFiles);
     bs.setMaxCommitTime(maxCommitTime);
     bs.setBasePath(basePath);
     bs.setBaseFilePath(baseFilePath);

@@ -18,6 +18,8 @@
 
 package org.apache.hudi.hadoop;
 
+import org.apache.hudi.common.model.HoodieLogFile;
+
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.Path;
 
@@ -35,7 +37,7 @@ public class RealtimeFileStatus extends FileStatus {
   // a flag to mark this split is produced by incremental query or not.
   private boolean belongToIncrementalFileStatus = false;
   // the log files belong this fileStatus.
-  private List<String> deltaLogPaths = new ArrayList<>();
+  private List<HoodieLogFile> deltaLogFiles = new ArrayList<>();
   // max commit time of current fileStatus.
   private String maxCommitTime = "";
   // the basePath of current hoodie table.
@@ -55,7 +57,7 @@ public class RealtimeFileStatus extends FileStatus {
     Path path = super.getPath();
     PathWithLogFilePath pathWithLogFilePath = new PathWithLogFilePath(path.getParent(), path.getName());
     pathWithLogFilePath.setBelongToIncrementalPath(belongToIncrementalFileStatus);
-    pathWithLogFilePath.setDeltaLogPaths(deltaLogPaths);
+    pathWithLogFilePath.setDeltaLogFiles(deltaLogFiles);
     pathWithLogFilePath.setMaxCommitTime(maxCommitTime);
     pathWithLogFilePath.setBasePath(basePath);
     pathWithLogFilePath.setBaseFilePath(baseFilePath);
@@ -69,12 +71,12 @@ public class RealtimeFileStatus extends FileStatus {
     this.belongToIncrementalFileStatus = belongToIncrementalFileStatus;
   }
 
-  public List<String> getDeltaLogPaths() {
-    return deltaLogPaths;
+  public List<HoodieLogFile> getDeltaLogFiles() {
+    return deltaLogFiles;
   }
 
-  public void setDeltaLogPaths(List<String> deltaLogPaths) {
-    this.deltaLogPaths = deltaLogPaths;
+  public void setDeltaLogFiles(List<HoodieLogFile> deltaLogFiles) {
+    this.deltaLogFiles = deltaLogFiles;
   }
 
   public String getMaxCommitTime() {

@@ -82,7 +82,7 @@ public abstract class AbstractRealtimeRecordReader {
    * job conf.
    */
   private void init() throws IOException {
-    Schema schemaFromLogFile = LogReaderUtils.readLatestSchemaFromLogFiles(split.getBasePath(), split.getDeltaLogPaths(), jobConf);
+    Schema schemaFromLogFile = LogReaderUtils.readLatestSchemaFromLogFiles(split.getBasePath(), split.getDeltaLogFiles(), jobConf);
     if (schemaFromLogFile == null) {
       writerSchema = InputSplitUtils.getBaseFileSchema((FileSplit)split, jobConf);
       LOG.info("Writer Schema From Parquet => " + writerSchema.getFields());

@@ -23,6 +23,7 @@ import org.apache.hudi.common.fs.FSUtils;
 import org.apache.hudi.common.model.FileSlice;
 import org.apache.hudi.common.model.HoodieCommitMetadata;
 import org.apache.hudi.common.model.HoodieFileGroup;
+import org.apache.hudi.common.model.HoodieLogFile;
 import org.apache.hudi.common.table.HoodieTableMetaClient;
 import org.apache.hudi.common.table.timeline.HoodieDefaultTimeline;
 import org.apache.hudi.common.table.timeline.HoodieInstant;
@@ -189,7 +190,7 @@ public class HoodieParquetRealtimeInputFormat extends HoodieParquetInputFormat i
       fileStatus.setBelongToIncrementalFileStatus(true);
       fileStatus.setBasePath(basePath);
       fileStatus.setBaseFilePath(baseFilePath);
-      fileStatus.setDeltaLogPaths(f.getLatestFileSlice().get().getLogFiles().map(l -> l.getPath().toString()).collect(Collectors.toList()));
+      fileStatus.setDeltaLogFiles(f.getLatestFileSlice().get().getLogFiles().collect(Collectors.toList()));
       // try to set bootstrapfileStatus
       if (baseFileStatus instanceof LocatedFileStatusWithBootstrapBaseFile || baseFileStatus instanceof FileStatusWithBootstrapBaseFile) {
         fileStatus.setBootStrapFileStatus(baseFileStatus);
@@ -202,7 +203,7 @@ public class HoodieParquetRealtimeInputFormat extends HoodieParquetInputFormat i
       if (logFileStatus.size() > 0) {
         RealtimeFileStatus fileStatus = new RealtimeFileStatus(logFileStatus.get(0));
         fileStatus.setBelongToIncrementalFileStatus(true);
-        fileStatus.setDeltaLogPaths(logFileStatus.stream().map(l -> l.getPath().toString()).collect(Collectors.toList()));
+        fileStatus.setDeltaLogFiles(logFileStatus.stream().map(l -> new HoodieLogFile(l.getPath(), l.getLen())).collect(Collectors.toList()));
         fileStatus.setMaxCommitTime(maxCommitTime);
         fileStatus.setBasePath(basePath);
         result.add(fileStatus);
@@ -256,7 +257,7 @@ public class HoodieParquetRealtimeInputFormat extends HoodieParquetInputFormat i
           ? super.makeSplit(path.getPathWithBootstrapFileStatus(), start, length, hosts)
           : super.makeSplit(path.getPathWithBootstrapFileStatus(), start, length, hosts, inMemoryHosts);
       return HoodieRealtimeInputFormatUtils
-          .createRealtimeBoostrapBaseFileSplit((BootstrapBaseFileSplit) bf, path.getBasePath(), path.getDeltaLogPaths(), path.getMaxCommitTime());
+          .createRealtimeBoostrapBaseFileSplit((BootstrapBaseFileSplit) bf, path.getBasePath(), path.getDeltaLogFiles(), path.getMaxCommitTime());
     }
   }
 
@@ -18,6 +18,7 @@
 
 package org.apache.hudi.hadoop.realtime;
 
+import org.apache.hudi.common.model.HoodieLogFile;
 import org.apache.hudi.common.util.Option;
 
 import org.apache.hadoop.mapred.FileSplit;
@@ -25,7 +26,9 @@ import org.apache.hadoop.mapred.FileSplit;
 import java.io.DataInput;
 import java.io.DataOutput;
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.List;
+import java.util.stream.Collectors;
 
 /**
  * Filesplit that wraps the base split and a list of log files to merge deltas from.
@@ -33,6 +36,7 @@ import java.util.List;
 public class HoodieRealtimeFileSplit extends FileSplit implements RealtimeSplit {
 
   private List<String> deltaLogPaths;
+  private List<HoodieLogFile> deltaLogFiles = new ArrayList<>();
 
   private String maxCommitTime;
 
@@ -44,11 +48,12 @@ public class HoodieRealtimeFileSplit extends FileSplit implements RealtimeSplit
     super();
   }
 
-  public HoodieRealtimeFileSplit(FileSplit baseSplit, String basePath, List<String> deltaLogPaths, String maxCommitTime,
+  public HoodieRealtimeFileSplit(FileSplit baseSplit, String basePath, List<HoodieLogFile> deltaLogFiles, String maxCommitTime,
                                  Option<HoodieVirtualKeyInfo> hoodieVirtualKeyInfo)
       throws IOException {
     super(baseSplit.getPath(), baseSplit.getStart(), baseSplit.getLength(), baseSplit.getLocations());
-    this.deltaLogPaths = deltaLogPaths;
+    this.deltaLogFiles = deltaLogFiles;
+    this.deltaLogPaths = deltaLogFiles.stream().map(entry -> entry.getPath().toString()).collect(Collectors.toList());
     this.maxCommitTime = maxCommitTime;
     this.basePath = basePath;
     this.hoodieVirtualKeyInfo = hoodieVirtualKeyInfo;
@@ -58,6 +63,10 @@ public class HoodieRealtimeFileSplit extends FileSplit implements RealtimeSplit
     return deltaLogPaths;
   }
 
+  public List<HoodieLogFile> getDeltaLogFiles() {
+    return deltaLogFiles;
+  }
+
   public String getMaxCommitTime() {
     return maxCommitTime;
   }

@@ -18,6 +18,7 @@
 
 package org.apache.hudi.hadoop.realtime;
 
+import org.apache.hudi.common.model.HoodieLogFile;
 import org.apache.hudi.common.util.Option;
 import org.apache.hudi.hadoop.BootstrapBaseFileSplit;
 
@@ -26,7 +27,9 @@ import org.apache.hadoop.mapred.FileSplit;
 import java.io.DataInput;
 import java.io.DataOutput;
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.List;
+import java.util.stream.Collectors;
 
 /**
  * Realtime File Split with external base file.
@@ -34,6 +37,7 @@ import java.util.List;
 public class RealtimeBootstrapBaseFileSplit extends BootstrapBaseFileSplit implements RealtimeSplit {
 
   private List<String> deltaLogPaths;
+  private List<HoodieLogFile> deltaLogFiles = new ArrayList<>();
 
   private String maxInstantTime;
 
@@ -43,11 +47,12 @@ public class RealtimeBootstrapBaseFileSplit extends BootstrapBaseFileSplit imple
     super();
   }
 
-  public RealtimeBootstrapBaseFileSplit(FileSplit baseSplit, String basePath, List<String> deltaLogPaths,
+  public RealtimeBootstrapBaseFileSplit(FileSplit baseSplit, String basePath, List<HoodieLogFile> deltaLogFiles,
                                         String maxInstantTime, FileSplit externalFileSplit) throws IOException {
     super(baseSplit, externalFileSplit);
     this.maxInstantTime = maxInstantTime;
-    this.deltaLogPaths = deltaLogPaths;
+    this.deltaLogFiles = deltaLogFiles;
+    this.deltaLogPaths = deltaLogFiles.stream().map(entry -> entry.getPath().toString()).collect(Collectors.toList());
     this.basePath = basePath;
   }
 
@@ -68,6 +73,11 @@ public class RealtimeBootstrapBaseFileSplit extends BootstrapBaseFileSplit imple
     return deltaLogPaths;
   }
 
+  @Override
+  public List<HoodieLogFile> getDeltaLogFiles() {
+    return deltaLogFiles;
+  }
+
   @Override
   public String getMaxCommitTime() {
     return maxInstantTime;

@@ -18,6 +18,7 @@
 
 package org.apache.hudi.hadoop.realtime;
 
+import org.apache.hudi.common.model.HoodieLogFile;
 import org.apache.hudi.common.util.Option;
 import org.apache.hudi.hadoop.InputSplitUtils;
 
@@ -41,6 +42,8 @@ public interface RealtimeSplit extends InputSplitWithLocationInfo {
   */
  List<String> getDeltaLogPaths();
 
+  List<HoodieLogFile> getDeltaLogFiles();
+
  /**
   * Return Max Instant Time.
   * @return
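
Reviewer note: RealtimeSplit now exposes getDeltaLogFiles() next to the existing getDeltaLogPaths(). In the split implementations in this commit, the path list is derived from the file list in the constructor, keeping the two accessors consistent. A condensed sketch of that design choice (class name SplitStateSketch is hypothetical):

import java.util.List;
import java.util.stream.Collectors;

import org.apache.hudi.common.model.HoodieLogFile;

class SplitStateSketch {
  private final List<HoodieLogFile> deltaLogFiles;
  private final List<String> deltaLogPaths; // retained for callers of the older accessor

  SplitStateSketch(List<HoodieLogFile> deltaLogFiles) {
    this.deltaLogFiles = deltaLogFiles;
    // Single source of truth: paths are always a projection of the files.
    this.deltaLogPaths = deltaLogFiles.stream()
        .map(f -> f.getPath().toString())
        .collect(Collectors.toList());
  }

  List<HoodieLogFile> getDeltaLogFiles() {
    return deltaLogFiles;
  }

  List<String> getDeltaLogPaths() {
    return deltaLogPaths;
  }
}
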
@@ -466,7 +466,7 @@ public class HoodieInputFormatUtils {
       HoodieTableFileSystemView fsView = fsViewCache.computeIfAbsent(metaClient, tableMetaClient ->
           FileSystemViewManager.createInMemoryFileSystemViewWithTimeline(engineContext, tableMetaClient, buildMetadataConfig(job), timeline));
       List<HoodieBaseFile> filteredBaseFiles = new ArrayList<>();
-      Map<FileStatus, List<String>> filteredLogs = new HashMap<>();
+      Map<FileStatus, List<HoodieLogFile>> filteredLogs = new HashMap<>();
       for (Path p : entry.getValue()) {
         String relativePartitionPath = FSUtils.getRelativePartitionPath(new Path(metaClient.getBasePath()), p);
         List<HoodieBaseFile> matched = fsView.getLatestBaseFiles(relativePartitionPath).collect(Collectors.toList());
@@ -476,9 +476,8 @@ public class HoodieInputFormatUtils {
           .filter(f -> !f.getBaseFile().isPresent() && f.getLatestLogFile().isPresent())
           .collect(Collectors.toList());
       logMatched.forEach(f -> {
-        List<String> logPaths = f.getLogFiles().sorted(HoodieLogFile.getLogFileComparator())
-            .map(log -> log.getPath().toString()).collect(Collectors.toList());
-        filteredLogs.put(f.getLatestLogFile().get().getFileStatus(), logPaths);
+        List<HoodieLogFile> logPathSizePairs = f.getLogFiles().sorted(HoodieLogFile.getLogFileComparator()).collect(Collectors.toList());
+        filteredLogs.put(f.getLatestLogFile().get().getFileStatus(), logPathSizePairs);
       });
     }
   }
@@ -492,9 +491,9 @@ public class HoodieInputFormatUtils {
       returns.add(getFileStatus(filteredFile));
     }
 
-    for (Map.Entry<FileStatus, List<String>> filterLogEntry : filteredLogs.entrySet()) {
+    for (Map.Entry<FileStatus, List<HoodieLogFile>> filterLogEntry : filteredLogs.entrySet()) {
      RealtimeFileStatus rs = new RealtimeFileStatus(filterLogEntry.getKey());
-      rs.setDeltaLogPaths(filterLogEntry.getValue());
+      rs.setDeltaLogFiles(filterLogEntry.getValue());
      returns.add(rs);
    }
  }

@@ -120,13 +120,13 @@ public class HoodieRealtimeInputFormatUtils extends HoodieInputFormatUtils {
           List<FileSplit> dataFileSplits = groupedInputSplits.getOrDefault(fileSlice.getFileId(), new ArrayList<>());
           dataFileSplits.forEach(split -> {
             try {
-              List<String> logFilePaths = fileSlice.getLogFiles().sorted(HoodieLogFile.getLogFileComparator())
-                  .map(logFile -> logFile.getPath().toString()).collect(Collectors.toList());
+              List<HoodieLogFile> logFiles = fileSlice.getLogFiles().sorted(HoodieLogFile.getLogFileComparator())
+                  .collect(Collectors.toList());
               if (split instanceof BootstrapBaseFileSplit) {
                 BootstrapBaseFileSplit eSplit = (BootstrapBaseFileSplit) split;
-                rtSplits.add(createRealtimeBoostrapBaseFileSplit(eSplit, metaClient.getBasePath(), logFilePaths, maxCommitTime));
+                rtSplits.add(createRealtimeBoostrapBaseFileSplit(eSplit, metaClient.getBasePath(), logFiles, maxCommitTime));
               } else {
-                rtSplits.add(new HoodieRealtimeFileSplit(split, metaClient.getBasePath(), logFilePaths, maxCommitTime, finalHoodieVirtualKeyInfo));
+                rtSplits.add(new HoodieRealtimeFileSplit(split, metaClient.getBasePath(), logFiles, maxCommitTime, finalHoodieVirtualKeyInfo));
               }
             } catch (IOException e) {
               throw new HoodieIOException("Error creating hoodie real time split ", e);
@@ -162,7 +162,7 @@ public class HoodieRealtimeInputFormatUtils extends HoodieInputFormatUtils {
       if (s instanceof BaseFileWithLogsSplit) {
         BaseFileWithLogsSplit bs = (BaseFileWithLogsSplit)s;
         if (bs.getBelongToIncrementalSplit()) {
-          rtSplits.add(new HoodieRealtimeFileSplit(bs, bs.getBasePath(), bs.getDeltaLogPaths(), bs.getMaxCommitTime(), finalHoodieVirtualKeyInfo));
+          rtSplits.add(new HoodieRealtimeFileSplit(bs, bs.getBasePath(), bs.getDeltaLogFiles(), bs.getMaxCommitTime(), finalHoodieVirtualKeyInfo));
         }
       } else if (s instanceof RealtimeBootstrapBaseFileSplit) {
         rtSplits.add(s);
@@ -206,7 +206,7 @@ public class HoodieRealtimeInputFormatUtils extends HoodieInputFormatUtils {
   }
 
   public static RealtimeBootstrapBaseFileSplit createRealtimeBoostrapBaseFileSplit(
-      BootstrapBaseFileSplit split, String basePath, List<String> deltaLogPaths, String maxInstantTime) {
+      BootstrapBaseFileSplit split, String basePath, List<HoodieLogFile> logFiles, String maxInstantTime) {
     try {
       String[] hosts = split.getLocationInfo() != null ? Arrays.stream(split.getLocationInfo())
           .filter(x -> !x.isInMemory()).toArray(String[]::new) : new String[0];
@@ -214,7 +214,7 @@ public class HoodieRealtimeInputFormatUtils extends HoodieInputFormatUtils {
           .filter(SplitLocationInfo::isInMemory).toArray(String[]::new) : new String[0];
       FileSplit baseSplit = new FileSplit(split.getPath(), split.getStart(), split.getLength(),
           hosts, inMemoryHosts);
-      return new RealtimeBootstrapBaseFileSplit(baseSplit, basePath, deltaLogPaths, maxInstantTime, split.getBootstrapFileSplit());
+      return new RealtimeBootstrapBaseFileSplit(baseSplit, basePath, logFiles, maxInstantTime, split.getBootstrapFileSplit());
     } catch (IOException e) {
       throw new HoodieIOException("Error creating hoodie real time split ", e);
     }

@@ -18,6 +18,7 @@
 
 package org.apache.hudi.hadoop.realtime;
 
+import org.apache.hudi.common.model.HoodieLogFile;
 import org.apache.hudi.common.util.Option;
 
 import org.apache.hadoop.fs.Path;
@@ -56,6 +57,7 @@ public class TestHoodieRealtimeFileSplit {
 
   private HoodieRealtimeFileSplit split;
   private String basePath;
+  private List<HoodieLogFile> deltaLogFiles;
   private List<String> deltaLogPaths;
   private String fileSplitName;
   private FileSplit baseFileSplit;
@@ -64,12 +66,13 @@ public class TestHoodieRealtimeFileSplit {
   @BeforeEach
   public void setUp(@TempDir java.nio.file.Path tempDir) throws Exception {
     basePath = tempDir.toAbsolutePath().toString();
+    deltaLogFiles = Collections.singletonList(new HoodieLogFile(new Path(basePath + "/1.log"), 0L));
     deltaLogPaths = Collections.singletonList(basePath + "/1.log");
     fileSplitName = basePath + "/test.file";
     baseFileSplit = new FileSplit(new Path(fileSplitName), 0, 100, new String[] {});
     maxCommitTime = "10001";
 
-    split = new HoodieRealtimeFileSplit(baseFileSplit, basePath, deltaLogPaths, maxCommitTime, Option.empty());
+    split = new HoodieRealtimeFileSplit(baseFileSplit, basePath, deltaLogFiles, maxCommitTime, Option.empty());
   }
 
   @Test

@@ -221,7 +221,7 @@ public class TestHoodieRealtimeRecordReader {
     HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
         new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + baseInstant + ".parquet"), 0, 1, baseJobConf),
         basePath.toUri().toString(), fileSlice.getLogFiles().sorted(HoodieLogFile.getLogFileComparator())
-            .map(h -> h.getPath().toString()).collect(Collectors.toList()),
+            .collect(Collectors.toList()),
         instantTime, Option.empty());
 
     // create a RecordReader to be used by HoodieRealtimeRecordReader
@@ -290,10 +290,9 @@ public class TestHoodieRealtimeRecordReader {
     FileCreateUtils.createDeltaCommit(basePath.toString(), newCommitTime);
 
     // create a split with baseFile (parquet file written earlier) and new log file(s)
-    String logFilePath = writer.getLogFile().getPath().toString();
     HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
         new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + instantTime + ".parquet"), 0, 1, baseJobConf),
-        basePath.toUri().toString(), Collections.singletonList(logFilePath), newCommitTime, Option.empty());
+        basePath.toUri().toString(), Collections.singletonList(writer.getLogFile()), newCommitTime, Option.empty());
 
     // create a RecordReader to be used by HoodieRealtimeRecordReader
     RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
@@ -370,10 +369,9 @@ public class TestHoodieRealtimeRecordReader {
     InputFormatTestUtil.deltaCommit(basePath, newCommitTime);
 
     // create a split with baseFile (parquet file written earlier) and new log file(s)
-    String logFilePath = writer.getLogFile().getPath().toString();
     HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
         new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + instantTime + ".parquet"), 0, 1, baseJobConf),
-        basePath.toUri().toString(), Collections.singletonList(logFilePath), newCommitTime, Option.empty());
+        basePath.toUri().toString(), Collections.singletonList(writer.getLogFile()), newCommitTime, Option.empty());
 
     // create a RecordReader to be used by HoodieRealtimeRecordReader
     RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
@@ -483,7 +481,7 @@ public class TestHoodieRealtimeRecordReader {
   public void testSchemaEvolutionAndRollbackBlockInLastLogFile(ExternalSpillableMap.DiskMapType diskMapType,
                                                                boolean isCompressionEnabled) throws Exception {
     // initial commit
-    List<String> logFilePaths = new ArrayList<>();
+    List<HoodieLogFile> logFiles = new ArrayList<>();
     Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema());
     HoodieTestUtils.init(hadoopConf, basePath.toString(), HoodieTableType.MERGE_ON_READ);
     String instantTime = "100";
@@ -504,7 +502,7 @@ public class TestHoodieRealtimeRecordReader {
     InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid0", instantTime, newCommitTime,
         numberOfLogRecords, 0, 1);
     long size = writer.getCurrentSize();
-    logFilePaths.add(writer.getLogFile().getPath().toString());
+    logFiles.add(writer.getLogFile());
     writer.close();
     assertTrue(size > 0, "block - size should be > 0");
 
@@ -512,14 +510,14 @@ public class TestHoodieRealtimeRecordReader {
     newCommitTime = "102";
     writer = InputFormatTestUtil.writeRollbackBlockToLogFile(partitionDir, fs, schema, "fileid0", instantTime,
         newCommitTime, "101", 1);
-    logFilePaths.add(writer.getLogFile().getPath().toString());
+    logFiles.add(writer.getLogFile());
     writer.close();
     InputFormatTestUtil.deltaCommit(basePath, newCommitTime);
 
     // create a split with baseFile (parquet file written earlier) and new log file(s)
     HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
         new FileSplit(new Path(partitionDir + "/fileid0_1_" + instantTime + ".parquet"), 0, 1, baseJobConf),
-        basePath.toUri().toString(), logFilePaths, newCommitTime, Option.empty());
+        basePath.toUri().toString(), logFiles, newCommitTime, Option.empty());
 
     // create a RecordReader to be used by HoodieRealtimeRecordReader
     RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
@@ -695,6 +693,7 @@ public class TestHoodieRealtimeRecordReader {
     writeStat.setNumUpdateWrites(100);
     writeStat.setNumWrites(100);
     writeStat.setPath(filePath);
+    writeStat.setFileSizeInBytes(new File(new Path(basePath.toString(), filePath).toString()).length());
     writeStat.setPartitionPath(partitionPath);
     writeStat.setTotalLogFilesCompacted(100L);
     HoodieWriteStat.RuntimeStats runtimeStats = new HoodieWriteStat.RuntimeStats();
@@ -750,14 +749,14 @@ public class TestHoodieRealtimeRecordReader {
     assertTrue(size > 0, "block - size should be > 0");
     FileCreateUtils.createDeltaCommit(basePath.toString(), instantTime);
     // create a split with new log file(s)
-    fileSlice.addLogFile(writer.getLogFile());
-    RealtimeFileStatus realtimeFileStatus = new RealtimeFileStatus(new FileStatus(0, false, 1, 1, 0, writer.getLogFile().getPath()));
+    fileSlice.addLogFile(new HoodieLogFile(writer.getLogFile().getPath(), size));
+    RealtimeFileStatus realtimeFileStatus = new RealtimeFileStatus(new FileStatus(writer.getLogFile().getFileSize(), false, 1, 1, 0, writer.getLogFile().getPath()));
     realtimeFileStatus.setMaxCommitTime(instantTime);
     realtimeFileStatus.setBasePath(basePath.toString());
-    realtimeFileStatus.setDeltaLogPaths(fileSlice.getLogFiles().map(l -> l.getPath().toString()).collect(Collectors.toList()));
+    realtimeFileStatus.setDeltaLogFiles(fileSlice.getLogFiles().collect(Collectors.toList()));
     PathWithLogFilePath pathWithLogFileStatus = (PathWithLogFilePath) realtimeFileStatus.getPath();
     BaseFileWithLogsSplit bs = pathWithLogFileStatus.buildSplit(pathWithLogFileStatus, 0, 0, new String[] {""});
-    HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(bs, bs.getBasePath(), bs.getDeltaLogPaths(), bs.getMaxCommitTime(), Option.empty());
+    HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(bs, bs.getBasePath(), bs.getDeltaLogFiles(), bs.getMaxCommitTime(), Option.empty());
 
     JobConf newJobConf = new JobConf(baseJobConf);
     List<Schema.Field> fields = schema.getFields();