1
0

[HUDI-2005] Removing direct fs call in HoodieLogFileReader (#3865)

This commit is contained in:
Sivabalan Narayanan
2021-11-25 18:51:38 -05:00
committed by GitHub
parent 6f5d8d04cd
commit 8340ccb503
17 changed files with 116 additions and 80 deletions

View File

@@ -76,6 +76,11 @@ public class HoodieLogFileReader implements HoodieLogFormat.Reader {
private boolean closed = false;
private transient Thread shutdownThread = null;
public HoodieLogFileReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema, int bufferSize,
boolean readBlockLazily) throws IOException {
this(fs, logFile, readerSchema, bufferSize, readBlockLazily, false);
}
public HoodieLogFileReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema, int bufferSize,
boolean readBlockLazily, boolean reverseReader) throws IOException {
this(fs, logFile, readerSchema, bufferSize, readBlockLazily, reverseReader, false,
@@ -94,16 +99,11 @@ public class HoodieLogFileReader implements HoodieLogFormat.Reader {
this.enableInlineReading = enableInlineReading;
this.keyField = keyField;
if (this.reverseReader) {
this.reverseLogFilePosition = this.lastReverseLogFilePosition = fs.getFileStatus(logFile.getPath()).getLen();
this.reverseLogFilePosition = this.lastReverseLogFilePosition = logFile.getFileSize();
}
addShutDownHook();
}
public HoodieLogFileReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema, boolean readBlockLazily,
boolean reverseReader) throws IOException {
this(fs, logFile, readerSchema, DEFAULT_BUFFER_SIZE, readBlockLazily, reverseReader);
}
public HoodieLogFileReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema) throws IOException {
this(fs, logFile, readerSchema, DEFAULT_BUFFER_SIZE, false, false);
}

View File

@@ -274,7 +274,7 @@ public interface HoodieLogFormat {
static HoodieLogFormat.Reader newReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema)
throws IOException {
return new HoodieLogFileReader(fs, logFile, readerSchema, HoodieLogFileReader.DEFAULT_BUFFER_SIZE, false, false);
return new HoodieLogFileReader(fs, logFile, readerSchema, HoodieLogFileReader.DEFAULT_BUFFER_SIZE, false);
}
static HoodieLogFormat.Reader newReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema,

View File

@@ -104,9 +104,8 @@ public class HoodieLogFormatReader implements HoodieLogFormat.Reader {
} else {
this.prevReadersInOpenState.add(currentReader);
}
this.currentReader =
new HoodieLogFileReader(fs, nextLogFile, readerSchema, bufferSize, readBlocksLazily, false,
enableInlineReading, recordKeyField);
this.currentReader = new HoodieLogFileReader(fs, nextLogFile, readerSchema, bufferSize, readBlocksLazily, false,
enableInlineReading, recordKeyField);
} catch (IOException io) {
throw new HoodieIOException("unable to initialize read with log file ", io);
}

View File

@@ -27,14 +27,15 @@ import org.apache.hudi.common.table.log.block.HoodieLogBlock;
import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.avro.Schema;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
/**
@@ -42,9 +43,10 @@ import java.util.stream.Collectors;
*/
public class LogReaderUtils {
private static Schema readSchemaFromLogFileInReverse(FileSystem fs, HoodieActiveTimeline activeTimeline, Path path)
private static Schema readSchemaFromLogFileInReverse(FileSystem fs, HoodieActiveTimeline activeTimeline, HoodieLogFile hoodieLogFile)
throws IOException {
Reader reader = HoodieLogFormat.newReader(fs, new HoodieLogFile(path), null, true, true);
// set length for the HoodieLogFile as it will be leveraged by HoodieLogFormat.Reader with reverseReading enabled
Reader reader = HoodieLogFormat.newReader(fs, hoodieLogFile, null, true, true);
Schema writerSchema = null;
HoodieTimeline completedTimeline = activeTimeline.getCommitsTimeline().filterCompletedInstants();
while (reader.hasPrev()) {
@@ -62,17 +64,17 @@ public class LogReaderUtils {
return writerSchema;
}
public static Schema readLatestSchemaFromLogFiles(String basePath, List<String> deltaFilePaths, Configuration config)
public static Schema readLatestSchemaFromLogFiles(String basePath, List<HoodieLogFile> logFiles, Configuration config)
throws IOException {
HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(config).setBasePath(basePath).build();
List<String> deltaPaths = deltaFilePaths.stream().map(s -> new HoodieLogFile(new Path(s)))
.sorted(HoodieLogFile.getReverseLogFileComparator()).map(s -> s.getPath().toString())
List<String> deltaPaths = logFiles.stream().sorted(HoodieLogFile.getReverseLogFileComparator()).map(s -> s.getPath().toString())
.collect(Collectors.toList());
if (deltaPaths.size() > 0) {
Map<String, HoodieLogFile> deltaFilePathToFileStatus = logFiles.stream().map(entry -> Pair.of(entry.getPath().toString(), entry))
.collect(Collectors.toMap(Pair::getKey, Pair::getValue));
for (String logPath : deltaPaths) {
FileSystem fs = FSUtils.getFs(logPath, config);
Schema schemaFromLogFile =
readSchemaFromLogFileInReverse(fs, metaClient.getActiveTimeline(), new Path(logPath));
Schema schemaFromLogFile = readSchemaFromLogFileInReverse(fs, metaClient.getActiveTimeline(), deltaFilePathToFileStatus.get(logPath));
if (schemaFromLogFile != null) {
return schemaFromLogFile;
}
@@ -80,5 +82,4 @@ public class LogReaderUtils {
}
return null;
}
}

View File

@@ -1482,7 +1482,8 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
FileCreateUtils.createDeltaCommit(basePath, "100", fs);
HoodieLogFileReader reader = new HoodieLogFileReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema(),
HoodieLogFileReader reader = new HoodieLogFileReader(fs, new HoodieLogFile(writer.getLogFile().getPath(),
fs.getFileStatus(writer.getLogFile().getPath()).getLen()), SchemaTestUtil.getSimpleSchema(),
bufferSize, readBlocksLazily, true);
assertTrue(reader.hasPrev(), "Last block should be available");
@@ -1560,7 +1561,8 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
// First round of reads - we should be able to read the first block and then EOF
HoodieLogFileReader reader =
new HoodieLogFileReader(fs, writer.getLogFile(), schema, bufferSize, readBlocksLazily, true);
new HoodieLogFileReader(fs, new HoodieLogFile(writer.getLogFile().getPath(),
fs.getFileStatus(writer.getLogFile().getPath()).getLen()), schema, bufferSize, readBlocksLazily, true);
assertTrue(reader.hasPrev(), "Last block should be available");
HoodieLogBlock block = reader.prev();
@@ -1610,7 +1612,8 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
FileCreateUtils.createDeltaCommit(basePath, "100", fs);
HoodieLogFileReader reader = new HoodieLogFileReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema(),
HoodieLogFileReader reader = new HoodieLogFileReader(fs, new HoodieLogFile(writer.getLogFile().getPath(),
fs.getFileStatus(writer.getLogFile().getPath()).getLen()), SchemaTestUtil.getSimpleSchema(),
bufferSize, readBlocksLazily, true);
assertTrue(reader.hasPrev(), "Third block should be available");

View File

@@ -18,6 +18,8 @@
package org.apache.hudi.hadoop;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
@@ -36,7 +38,7 @@ public class BaseFileWithLogsSplit extends FileSplit {
// a flag to mark this split is produced by incremental query or not.
private boolean belongToIncrementalSplit = false;
// the log file paths of this split.
private List<String> deltaLogPaths = new ArrayList<>();
private List<HoodieLogFile> deltaLogFiles = new ArrayList<>();
// max commit time of current split.
private String maxCommitTime = "";
// the basePath of current hoodie table.
@@ -55,9 +57,10 @@ public class BaseFileWithLogsSplit extends FileSplit {
Text.writeString(out, maxCommitTime);
Text.writeString(out, basePath);
Text.writeString(out, baseFilePath);
out.writeInt(deltaLogPaths.size());
for (String logPath : deltaLogPaths) {
Text.writeString(out, logPath);
out.writeInt(deltaLogFiles.size());
for (HoodieLogFile logFile : deltaLogFiles) {
Text.writeString(out, logFile.getPath().toString());
out.writeLong(logFile.getFileSize());
}
}
@@ -69,11 +72,13 @@ public class BaseFileWithLogsSplit extends FileSplit {
basePath = Text.readString(in);
baseFilePath = Text.readString(in);
int deltaLogSize = in.readInt();
List<String> tempDeltaLogs = new ArrayList<>();
List<HoodieLogFile> tempDeltaLogs = new ArrayList<>();
for (int i = 0; i < deltaLogSize; i++) {
tempDeltaLogs.add(Text.readString(in));
String logPath = Text.readString(in);
long logFileSize = in.readLong();
tempDeltaLogs.add(new HoodieLogFile(new Path(logPath), logFileSize));
}
deltaLogPaths = tempDeltaLogs;
deltaLogFiles = tempDeltaLogs;
}
public boolean getBelongToIncrementalSplit() {
@@ -84,12 +89,12 @@ public class BaseFileWithLogsSplit extends FileSplit {
this.belongToIncrementalSplit = belongToIncrementalSplit;
}
public List<String> getDeltaLogPaths() {
return deltaLogPaths;
public List<HoodieLogFile> getDeltaLogFiles() {
return deltaLogFiles;
}
public void setDeltaLogPaths(List<String> deltaLogPaths) {
this.deltaLogPaths = deltaLogPaths;
public void setDeltaLogFiles(List<HoodieLogFile> deltaLogFiles) {
this.deltaLogFiles = deltaLogFiles;
}
public String getMaxCommitTime() {

View File

@@ -18,6 +18,8 @@
package org.apache.hudi.hadoop;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hadoop.fs.Path;
import java.util.ArrayList;
@@ -31,7 +33,7 @@ public class PathWithLogFilePath extends Path {
// a flag to mark this split is produced by incremental query or not.
private boolean belongToIncrementalPath = false;
// the log files belong this path.
private List<String> deltaLogPaths = new ArrayList<>();
private List<HoodieLogFile> deltaLogFiles = new ArrayList<>();
// max commit time of current path.
private String maxCommitTime = "";
// the basePath of current hoodie table.
@@ -50,12 +52,12 @@ public class PathWithLogFilePath extends Path {
this.belongToIncrementalPath = belongToIncrementalPath;
}
public List<String> getDeltaLogPaths() {
return deltaLogPaths;
public List<HoodieLogFile> getDeltaLogFiles() {
return deltaLogFiles;
}
public void setDeltaLogPaths(List<String> deltaLogPaths) {
this.deltaLogPaths = deltaLogPaths;
public void setDeltaLogFiles(List<HoodieLogFile> deltaLogFiles) {
this.deltaLogFiles = deltaLogFiles;
}
public String getMaxCommitTime() {
@@ -97,7 +99,7 @@ public class PathWithLogFilePath extends Path {
public BaseFileWithLogsSplit buildSplit(Path file, long start, long length, String[] hosts) {
BaseFileWithLogsSplit bs = new BaseFileWithLogsSplit(file, start, length, hosts);
bs.setBelongToIncrementalSplit(belongToIncrementalPath);
bs.setDeltaLogPaths(deltaLogPaths);
bs.setDeltaLogFiles(deltaLogFiles);
bs.setMaxCommitTime(maxCommitTime);
bs.setBasePath(basePath);
bs.setBaseFilePath(baseFilePath);

View File

@@ -18,6 +18,8 @@
package org.apache.hudi.hadoop;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
@@ -35,7 +37,7 @@ public class RealtimeFileStatus extends FileStatus {
// a flag to mark this split is produced by incremental query or not.
private boolean belongToIncrementalFileStatus = false;
// the log files belong this fileStatus.
private List<String> deltaLogPaths = new ArrayList<>();
private List<HoodieLogFile> deltaLogFiles = new ArrayList<>();
// max commit time of current fileStatus.
private String maxCommitTime = "";
// the basePath of current hoodie table.
@@ -55,7 +57,7 @@ public class RealtimeFileStatus extends FileStatus {
Path path = super.getPath();
PathWithLogFilePath pathWithLogFilePath = new PathWithLogFilePath(path.getParent(), path.getName());
pathWithLogFilePath.setBelongToIncrementalPath(belongToIncrementalFileStatus);
pathWithLogFilePath.setDeltaLogPaths(deltaLogPaths);
pathWithLogFilePath.setDeltaLogFiles(deltaLogFiles);
pathWithLogFilePath.setMaxCommitTime(maxCommitTime);
pathWithLogFilePath.setBasePath(basePath);
pathWithLogFilePath.setBaseFilePath(baseFilePath);
@@ -69,12 +71,12 @@ public class RealtimeFileStatus extends FileStatus {
this.belongToIncrementalFileStatus = belongToIncrementalFileStatus;
}
public List<String> getDeltaLogPaths() {
return deltaLogPaths;
public List<HoodieLogFile> getDeltaLogFiles() {
return deltaLogFiles;
}
public void setDeltaLogPaths(List<String> deltaLogPaths) {
this.deltaLogPaths = deltaLogPaths;
public void setDeltaLogFiles(List<HoodieLogFile> deltaLogFiles) {
this.deltaLogFiles = deltaLogFiles;
}
public String getMaxCommitTime() {

View File

@@ -82,7 +82,7 @@ public abstract class AbstractRealtimeRecordReader {
* job conf.
*/
private void init() throws IOException {
Schema schemaFromLogFile = LogReaderUtils.readLatestSchemaFromLogFiles(split.getBasePath(), split.getDeltaLogPaths(), jobConf);
Schema schemaFromLogFile = LogReaderUtils.readLatestSchemaFromLogFiles(split.getBasePath(), split.getDeltaLogFiles(), jobConf);
if (schemaFromLogFile == null) {
writerSchema = InputSplitUtils.getBaseFileSchema((FileSplit)split, jobConf);
LOG.info("Writer Schema From Parquet => " + writerSchema.getFields());

View File

@@ -23,6 +23,7 @@ import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieFileGroup;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieDefaultTimeline;
import org.apache.hudi.common.table.timeline.HoodieInstant;
@@ -189,7 +190,7 @@ public class HoodieParquetRealtimeInputFormat extends HoodieParquetInputFormat i
fileStatus.setBelongToIncrementalFileStatus(true);
fileStatus.setBasePath(basePath);
fileStatus.setBaseFilePath(baseFilePath);
fileStatus.setDeltaLogPaths(f.getLatestFileSlice().get().getLogFiles().map(l -> l.getPath().toString()).collect(Collectors.toList()));
fileStatus.setDeltaLogFiles(f.getLatestFileSlice().get().getLogFiles().collect(Collectors.toList()));
// try to set bootstrapfileStatus
if (baseFileStatus instanceof LocatedFileStatusWithBootstrapBaseFile || baseFileStatus instanceof FileStatusWithBootstrapBaseFile) {
fileStatus.setBootStrapFileStatus(baseFileStatus);
@@ -202,7 +203,7 @@ public class HoodieParquetRealtimeInputFormat extends HoodieParquetInputFormat i
if (logFileStatus.size() > 0) {
RealtimeFileStatus fileStatus = new RealtimeFileStatus(logFileStatus.get(0));
fileStatus.setBelongToIncrementalFileStatus(true);
fileStatus.setDeltaLogPaths(logFileStatus.stream().map(l -> l.getPath().toString()).collect(Collectors.toList()));
fileStatus.setDeltaLogFiles(logFileStatus.stream().map(l -> new HoodieLogFile(l.getPath(), l.getLen())).collect(Collectors.toList()));
fileStatus.setMaxCommitTime(maxCommitTime);
fileStatus.setBasePath(basePath);
result.add(fileStatus);
@@ -256,7 +257,7 @@ public class HoodieParquetRealtimeInputFormat extends HoodieParquetInputFormat i
? super.makeSplit(path.getPathWithBootstrapFileStatus(), start, length, hosts)
: super.makeSplit(path.getPathWithBootstrapFileStatus(), start, length, hosts, inMemoryHosts);
return HoodieRealtimeInputFormatUtils
.createRealtimeBoostrapBaseFileSplit((BootstrapBaseFileSplit) bf, path.getBasePath(), path.getDeltaLogPaths(), path.getMaxCommitTime());
.createRealtimeBoostrapBaseFileSplit((BootstrapBaseFileSplit) bf, path.getBasePath(), path.getDeltaLogFiles(), path.getMaxCommitTime());
}
}

View File

@@ -18,6 +18,7 @@
package org.apache.hudi.hadoop.realtime;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.util.Option;
import org.apache.hadoop.mapred.FileSplit;
@@ -25,7 +26,9 @@ import org.apache.hadoop.mapred.FileSplit;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
/**
* Filesplit that wraps the base split and a list of log files to merge deltas from.
@@ -33,6 +36,7 @@ import java.util.List;
public class HoodieRealtimeFileSplit extends FileSplit implements RealtimeSplit {
private List<String> deltaLogPaths;
private List<HoodieLogFile> deltaLogFiles = new ArrayList<>();
private String maxCommitTime;
@@ -44,11 +48,12 @@ public class HoodieRealtimeFileSplit extends FileSplit implements RealtimeSplit
super();
}
public HoodieRealtimeFileSplit(FileSplit baseSplit, String basePath, List<String> deltaLogPaths, String maxCommitTime,
public HoodieRealtimeFileSplit(FileSplit baseSplit, String basePath, List<HoodieLogFile> deltaLogFiles, String maxCommitTime,
Option<HoodieVirtualKeyInfo> hoodieVirtualKeyInfo)
throws IOException {
super(baseSplit.getPath(), baseSplit.getStart(), baseSplit.getLength(), baseSplit.getLocations());
this.deltaLogPaths = deltaLogPaths;
this.deltaLogFiles = deltaLogFiles;
this.deltaLogPaths = deltaLogFiles.stream().map(entry -> entry.getPath().toString()).collect(Collectors.toList());
this.maxCommitTime = maxCommitTime;
this.basePath = basePath;
this.hoodieVirtualKeyInfo = hoodieVirtualKeyInfo;
@@ -58,6 +63,10 @@ public class HoodieRealtimeFileSplit extends FileSplit implements RealtimeSplit
return deltaLogPaths;
}
public List<HoodieLogFile> getDeltaLogFiles() {
return deltaLogFiles;
}
public String getMaxCommitTime() {
return maxCommitTime;
}

View File

@@ -18,6 +18,7 @@
package org.apache.hudi.hadoop.realtime;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.hadoop.BootstrapBaseFileSplit;
@@ -26,7 +27,9 @@ import org.apache.hadoop.mapred.FileSplit;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
/**
* Realtime File Split with external base file.
@@ -34,6 +37,7 @@ import java.util.List;
public class RealtimeBootstrapBaseFileSplit extends BootstrapBaseFileSplit implements RealtimeSplit {
private List<String> deltaLogPaths;
private List<HoodieLogFile> deltaLogFiles = new ArrayList<>();
private String maxInstantTime;
@@ -43,11 +47,12 @@ public class RealtimeBootstrapBaseFileSplit extends BootstrapBaseFileSplit imple
super();
}
public RealtimeBootstrapBaseFileSplit(FileSplit baseSplit, String basePath, List<String> deltaLogPaths,
public RealtimeBootstrapBaseFileSplit(FileSplit baseSplit, String basePath, List<HoodieLogFile> deltaLogFiles,
String maxInstantTime, FileSplit externalFileSplit) throws IOException {
super(baseSplit, externalFileSplit);
this.maxInstantTime = maxInstantTime;
this.deltaLogPaths = deltaLogPaths;
this.deltaLogFiles = deltaLogFiles;
this.deltaLogPaths = deltaLogFiles.stream().map(entry -> entry.getPath().toString()).collect(Collectors.toList());
this.basePath = basePath;
}
@@ -68,6 +73,11 @@ public class RealtimeBootstrapBaseFileSplit extends BootstrapBaseFileSplit imple
return deltaLogPaths;
}
@Override
public List<HoodieLogFile> getDeltaLogFiles() {
return deltaLogFiles;
}
@Override
public String getMaxCommitTime() {
return maxInstantTime;

View File

@@ -18,6 +18,7 @@
package org.apache.hudi.hadoop.realtime;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.hadoop.InputSplitUtils;
@@ -41,6 +42,8 @@ public interface RealtimeSplit extends InputSplitWithLocationInfo {
*/
List<String> getDeltaLogPaths();
List<HoodieLogFile> getDeltaLogFiles();
/**
* Return Max Instant Time.
* @return

View File

@@ -466,7 +466,7 @@ public class HoodieInputFormatUtils {
HoodieTableFileSystemView fsView = fsViewCache.computeIfAbsent(metaClient, tableMetaClient ->
FileSystemViewManager.createInMemoryFileSystemViewWithTimeline(engineContext, tableMetaClient, buildMetadataConfig(job), timeline));
List<HoodieBaseFile> filteredBaseFiles = new ArrayList<>();
Map<FileStatus, List<String>> filteredLogs = new HashMap<>();
Map<FileStatus, List<HoodieLogFile>> filteredLogs = new HashMap<>();
for (Path p : entry.getValue()) {
String relativePartitionPath = FSUtils.getRelativePartitionPath(new Path(metaClient.getBasePath()), p);
List<HoodieBaseFile> matched = fsView.getLatestBaseFiles(relativePartitionPath).collect(Collectors.toList());
@@ -476,9 +476,8 @@ public class HoodieInputFormatUtils {
.filter(f -> !f.getBaseFile().isPresent() && f.getLatestLogFile().isPresent())
.collect(Collectors.toList());
logMatched.forEach(f -> {
List<String> logPaths = f.getLogFiles().sorted(HoodieLogFile.getLogFileComparator())
.map(log -> log.getPath().toString()).collect(Collectors.toList());
filteredLogs.put(f.getLatestLogFile().get().getFileStatus(), logPaths);
List<HoodieLogFile> logPathSizePairs = f.getLogFiles().sorted(HoodieLogFile.getLogFileComparator()).collect(Collectors.toList());
filteredLogs.put(f.getLatestLogFile().get().getFileStatus(), logPathSizePairs);
});
}
}
@@ -492,9 +491,9 @@ public class HoodieInputFormatUtils {
returns.add(getFileStatus(filteredFile));
}
for (Map.Entry<FileStatus, List<String>> filterLogEntry : filteredLogs.entrySet()) {
for (Map.Entry<FileStatus, List<HoodieLogFile>> filterLogEntry : filteredLogs.entrySet()) {
RealtimeFileStatus rs = new RealtimeFileStatus(filterLogEntry.getKey());
rs.setDeltaLogPaths(filterLogEntry.getValue());
rs.setDeltaLogFiles(filterLogEntry.getValue());
returns.add(rs);
}
}

View File

@@ -120,13 +120,13 @@ public class HoodieRealtimeInputFormatUtils extends HoodieInputFormatUtils {
List<FileSplit> dataFileSplits = groupedInputSplits.getOrDefault(fileSlice.getFileId(), new ArrayList<>());
dataFileSplits.forEach(split -> {
try {
List<String> logFilePaths = fileSlice.getLogFiles().sorted(HoodieLogFile.getLogFileComparator())
.map(logFile -> logFile.getPath().toString()).collect(Collectors.toList());
List<HoodieLogFile> logFiles = fileSlice.getLogFiles().sorted(HoodieLogFile.getLogFileComparator())
.collect(Collectors.toList());
if (split instanceof BootstrapBaseFileSplit) {
BootstrapBaseFileSplit eSplit = (BootstrapBaseFileSplit) split;
rtSplits.add(createRealtimeBoostrapBaseFileSplit(eSplit, metaClient.getBasePath(), logFilePaths, maxCommitTime));
rtSplits.add(createRealtimeBoostrapBaseFileSplit(eSplit, metaClient.getBasePath(), logFiles, maxCommitTime));
} else {
rtSplits.add(new HoodieRealtimeFileSplit(split, metaClient.getBasePath(), logFilePaths, maxCommitTime, finalHoodieVirtualKeyInfo));
rtSplits.add(new HoodieRealtimeFileSplit(split, metaClient.getBasePath(), logFiles, maxCommitTime, finalHoodieVirtualKeyInfo));
}
} catch (IOException e) {
throw new HoodieIOException("Error creating hoodie real time split ", e);
@@ -162,7 +162,7 @@ public class HoodieRealtimeInputFormatUtils extends HoodieInputFormatUtils {
if (s instanceof BaseFileWithLogsSplit) {
BaseFileWithLogsSplit bs = (BaseFileWithLogsSplit)s;
if (bs.getBelongToIncrementalSplit()) {
rtSplits.add(new HoodieRealtimeFileSplit(bs, bs.getBasePath(), bs.getDeltaLogPaths(), bs.getMaxCommitTime(), finalHoodieVirtualKeyInfo));
rtSplits.add(new HoodieRealtimeFileSplit(bs, bs.getBasePath(), bs.getDeltaLogFiles(), bs.getMaxCommitTime(), finalHoodieVirtualKeyInfo));
}
} else if (s instanceof RealtimeBootstrapBaseFileSplit) {
rtSplits.add(s);
@@ -206,7 +206,7 @@ public class HoodieRealtimeInputFormatUtils extends HoodieInputFormatUtils {
}
public static RealtimeBootstrapBaseFileSplit createRealtimeBoostrapBaseFileSplit(
BootstrapBaseFileSplit split, String basePath, List<String> deltaLogPaths, String maxInstantTime) {
BootstrapBaseFileSplit split, String basePath, List<HoodieLogFile> logFiles, String maxInstantTime) {
try {
String[] hosts = split.getLocationInfo() != null ? Arrays.stream(split.getLocationInfo())
.filter(x -> !x.isInMemory()).toArray(String[]::new) : new String[0];
@@ -214,7 +214,7 @@ public class HoodieRealtimeInputFormatUtils extends HoodieInputFormatUtils {
.filter(SplitLocationInfo::isInMemory).toArray(String[]::new) : new String[0];
FileSplit baseSplit = new FileSplit(split.getPath(), split.getStart(), split.getLength(),
hosts, inMemoryHosts);
return new RealtimeBootstrapBaseFileSplit(baseSplit, basePath, deltaLogPaths, maxInstantTime, split.getBootstrapFileSplit());
return new RealtimeBootstrapBaseFileSplit(baseSplit, basePath, logFiles, maxInstantTime, split.getBootstrapFileSplit());
} catch (IOException e) {
throw new HoodieIOException("Error creating hoodie real time split ", e);
}

View File

@@ -18,6 +18,7 @@
package org.apache.hudi.hadoop.realtime;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.util.Option;
import org.apache.hadoop.fs.Path;
@@ -56,6 +57,7 @@ public class TestHoodieRealtimeFileSplit {
private HoodieRealtimeFileSplit split;
private String basePath;
private List<HoodieLogFile> deltaLogFiles;
private List<String> deltaLogPaths;
private String fileSplitName;
private FileSplit baseFileSplit;
@@ -64,12 +66,13 @@ public class TestHoodieRealtimeFileSplit {
@BeforeEach
public void setUp(@TempDir java.nio.file.Path tempDir) throws Exception {
basePath = tempDir.toAbsolutePath().toString();
deltaLogFiles = Collections.singletonList(new HoodieLogFile(new Path(basePath + "/1.log"), 0L));
deltaLogPaths = Collections.singletonList(basePath + "/1.log");
fileSplitName = basePath + "/test.file";
baseFileSplit = new FileSplit(new Path(fileSplitName), 0, 100, new String[] {});
maxCommitTime = "10001";
split = new HoodieRealtimeFileSplit(baseFileSplit, basePath, deltaLogPaths, maxCommitTime, Option.empty());
split = new HoodieRealtimeFileSplit(baseFileSplit, basePath, deltaLogFiles, maxCommitTime, Option.empty());
}
@Test

View File

@@ -221,7 +221,7 @@ public class TestHoodieRealtimeRecordReader {
HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + baseInstant + ".parquet"), 0, 1, baseJobConf),
basePath.toUri().toString(), fileSlice.getLogFiles().sorted(HoodieLogFile.getLogFileComparator())
.map(h -> h.getPath().toString()).collect(Collectors.toList()),
.collect(Collectors.toList()),
instantTime, Option.empty());
// create a RecordReader to be used by HoodieRealtimeRecordReader
@@ -290,10 +290,9 @@ public class TestHoodieRealtimeRecordReader {
FileCreateUtils.createDeltaCommit(basePath.toString(), newCommitTime);
// create a split with baseFile (parquet file written earlier) and new log file(s)
String logFilePath = writer.getLogFile().getPath().toString();
HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + instantTime + ".parquet"), 0, 1, baseJobConf),
basePath.toUri().toString(), Collections.singletonList(logFilePath), newCommitTime, Option.empty());
basePath.toUri().toString(), Collections.singletonList(writer.getLogFile()), newCommitTime, Option.empty());
// create a RecordReader to be used by HoodieRealtimeRecordReader
RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
@@ -370,10 +369,9 @@ public class TestHoodieRealtimeRecordReader {
InputFormatTestUtil.deltaCommit(basePath, newCommitTime);
// create a split with baseFile (parquet file written earlier) and new log file(s)
String logFilePath = writer.getLogFile().getPath().toString();
HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + instantTime + ".parquet"), 0, 1, baseJobConf),
basePath.toUri().toString(), Collections.singletonList(logFilePath), newCommitTime, Option.empty());
basePath.toUri().toString(), Collections.singletonList(writer.getLogFile()), newCommitTime, Option.empty());
// create a RecordReader to be used by HoodieRealtimeRecordReader
RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
@@ -483,7 +481,7 @@ public class TestHoodieRealtimeRecordReader {
public void testSchemaEvolutionAndRollbackBlockInLastLogFile(ExternalSpillableMap.DiskMapType diskMapType,
boolean isCompressionEnabled) throws Exception {
// initial commit
List<String> logFilePaths = new ArrayList<>();
List<HoodieLogFile> logFiles = new ArrayList<>();
Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema());
HoodieTestUtils.init(hadoopConf, basePath.toString(), HoodieTableType.MERGE_ON_READ);
String instantTime = "100";
@@ -504,7 +502,7 @@ public class TestHoodieRealtimeRecordReader {
InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid0", instantTime, newCommitTime,
numberOfLogRecords, 0, 1);
long size = writer.getCurrentSize();
logFilePaths.add(writer.getLogFile().getPath().toString());
logFiles.add(writer.getLogFile());
writer.close();
assertTrue(size > 0, "block - size should be > 0");
@@ -512,14 +510,14 @@ public class TestHoodieRealtimeRecordReader {
newCommitTime = "102";
writer = InputFormatTestUtil.writeRollbackBlockToLogFile(partitionDir, fs, schema, "fileid0", instantTime,
newCommitTime, "101", 1);
logFilePaths.add(writer.getLogFile().getPath().toString());
logFiles.add(writer.getLogFile());
writer.close();
InputFormatTestUtil.deltaCommit(basePath, newCommitTime);
// create a split with baseFile (parquet file written earlier) and new log file(s)
HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
new FileSplit(new Path(partitionDir + "/fileid0_1_" + instantTime + ".parquet"), 0, 1, baseJobConf),
basePath.toUri().toString(), logFilePaths, newCommitTime, Option.empty());
basePath.toUri().toString(), logFiles, newCommitTime, Option.empty());
// create a RecordReader to be used by HoodieRealtimeRecordReader
RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
@@ -695,6 +693,7 @@ public class TestHoodieRealtimeRecordReader {
writeStat.setNumUpdateWrites(100);
writeStat.setNumWrites(100);
writeStat.setPath(filePath);
writeStat.setFileSizeInBytes(new File(new Path(basePath.toString(), filePath).toString()).length());
writeStat.setPartitionPath(partitionPath);
writeStat.setTotalLogFilesCompacted(100L);
HoodieWriteStat.RuntimeStats runtimeStats = new HoodieWriteStat.RuntimeStats();
@@ -750,14 +749,14 @@ public class TestHoodieRealtimeRecordReader {
assertTrue(size > 0, "block - size should be > 0");
FileCreateUtils.createDeltaCommit(basePath.toString(), instantTime);
// create a split with new log file(s)
fileSlice.addLogFile(writer.getLogFile());
RealtimeFileStatus realtimeFileStatus = new RealtimeFileStatus(new FileStatus(0, false, 1, 1, 0, writer.getLogFile().getPath()));
fileSlice.addLogFile(new HoodieLogFile(writer.getLogFile().getPath(), size));
RealtimeFileStatus realtimeFileStatus = new RealtimeFileStatus(new FileStatus(writer.getLogFile().getFileSize(), false, 1, 1, 0, writer.getLogFile().getPath()));
realtimeFileStatus.setMaxCommitTime(instantTime);
realtimeFileStatus.setBasePath(basePath.toString());
realtimeFileStatus.setDeltaLogPaths(fileSlice.getLogFiles().map(l -> l.getPath().toString()).collect(Collectors.toList()));
realtimeFileStatus.setDeltaLogFiles(fileSlice.getLogFiles().collect(Collectors.toList()));
PathWithLogFilePath pathWithLogFileStatus = (PathWithLogFilePath) realtimeFileStatus.getPath();
BaseFileWithLogsSplit bs = pathWithLogFileStatus.buildSplit(pathWithLogFileStatus, 0, 0, new String[] {""});
HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(bs, bs.getBasePath(), bs.getDeltaLogPaths(), bs.getMaxCommitTime(), Option.empty());
HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(bs, bs.getBasePath(), bs.getDeltaLogFiles(), bs.getMaxCommitTime(), Option.empty());
JobConf newJobConf = new JobConf(baseJobConf);
List<Schema.Field> fields = schema.getFields();