[HUDI-2005] Removing direct fs call in HoodieLogFileReader (#3865)
This commit is contained in:
committed by
GitHub
parent
6f5d8d04cd
commit
8340ccb503
@@ -76,6 +76,11 @@ public class HoodieLogFileReader implements HoodieLogFormat.Reader {
|
|||||||
private boolean closed = false;
|
private boolean closed = false;
|
||||||
private transient Thread shutdownThread = null;
|
private transient Thread shutdownThread = null;
|
||||||
|
|
||||||
|
public HoodieLogFileReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema, int bufferSize,
|
||||||
|
boolean readBlockLazily) throws IOException {
|
||||||
|
this(fs, logFile, readerSchema, bufferSize, readBlockLazily, false);
|
||||||
|
}
|
||||||
|
|
||||||
public HoodieLogFileReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema, int bufferSize,
|
public HoodieLogFileReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema, int bufferSize,
|
||||||
boolean readBlockLazily, boolean reverseReader) throws IOException {
|
boolean readBlockLazily, boolean reverseReader) throws IOException {
|
||||||
this(fs, logFile, readerSchema, bufferSize, readBlockLazily, reverseReader, false,
|
this(fs, logFile, readerSchema, bufferSize, readBlockLazily, reverseReader, false,
|
||||||
@@ -94,16 +99,11 @@ public class HoodieLogFileReader implements HoodieLogFormat.Reader {
|
|||||||
this.enableInlineReading = enableInlineReading;
|
this.enableInlineReading = enableInlineReading;
|
||||||
this.keyField = keyField;
|
this.keyField = keyField;
|
||||||
if (this.reverseReader) {
|
if (this.reverseReader) {
|
||||||
this.reverseLogFilePosition = this.lastReverseLogFilePosition = fs.getFileStatus(logFile.getPath()).getLen();
|
this.reverseLogFilePosition = this.lastReverseLogFilePosition = logFile.getFileSize();
|
||||||
}
|
}
|
||||||
addShutDownHook();
|
addShutDownHook();
|
||||||
}
|
}
|
||||||
|
|
||||||
public HoodieLogFileReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema, boolean readBlockLazily,
|
|
||||||
boolean reverseReader) throws IOException {
|
|
||||||
this(fs, logFile, readerSchema, DEFAULT_BUFFER_SIZE, readBlockLazily, reverseReader);
|
|
||||||
}
|
|
||||||
|
|
||||||
public HoodieLogFileReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema) throws IOException {
|
public HoodieLogFileReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema) throws IOException {
|
||||||
this(fs, logFile, readerSchema, DEFAULT_BUFFER_SIZE, false, false);
|
this(fs, logFile, readerSchema, DEFAULT_BUFFER_SIZE, false, false);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -274,7 +274,7 @@ public interface HoodieLogFormat {
|
|||||||
|
|
||||||
static HoodieLogFormat.Reader newReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema)
|
static HoodieLogFormat.Reader newReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
return new HoodieLogFileReader(fs, logFile, readerSchema, HoodieLogFileReader.DEFAULT_BUFFER_SIZE, false, false);
|
return new HoodieLogFileReader(fs, logFile, readerSchema, HoodieLogFileReader.DEFAULT_BUFFER_SIZE, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
static HoodieLogFormat.Reader newReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema,
|
static HoodieLogFormat.Reader newReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema,
|
||||||
|
|||||||
@@ -104,8 +104,7 @@ public class HoodieLogFormatReader implements HoodieLogFormat.Reader {
|
|||||||
} else {
|
} else {
|
||||||
this.prevReadersInOpenState.add(currentReader);
|
this.prevReadersInOpenState.add(currentReader);
|
||||||
}
|
}
|
||||||
this.currentReader =
|
this.currentReader = new HoodieLogFileReader(fs, nextLogFile, readerSchema, bufferSize, readBlocksLazily, false,
|
||||||
new HoodieLogFileReader(fs, nextLogFile, readerSchema, bufferSize, readBlocksLazily, false,
|
|
||||||
enableInlineReading, recordKeyField);
|
enableInlineReading, recordKeyField);
|
||||||
} catch (IOException io) {
|
} catch (IOException io) {
|
||||||
throw new HoodieIOException("unable to initialize read with log file ", io);
|
throw new HoodieIOException("unable to initialize read with log file ", io);
|
||||||
|
|||||||
@@ -27,14 +27,15 @@ import org.apache.hudi.common.table.log.block.HoodieLogBlock;
|
|||||||
import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType;
|
import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType;
|
||||||
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
|
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
|
||||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||||
|
import org.apache.hudi.common.util.collection.Pair;
|
||||||
|
|
||||||
import org.apache.avro.Schema;
|
import org.apache.avro.Schema;
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
import org.apache.hadoop.fs.Path;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -42,9 +43,10 @@ import java.util.stream.Collectors;
|
|||||||
*/
|
*/
|
||||||
public class LogReaderUtils {
|
public class LogReaderUtils {
|
||||||
|
|
||||||
private static Schema readSchemaFromLogFileInReverse(FileSystem fs, HoodieActiveTimeline activeTimeline, Path path)
|
private static Schema readSchemaFromLogFileInReverse(FileSystem fs, HoodieActiveTimeline activeTimeline, HoodieLogFile hoodieLogFile)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
Reader reader = HoodieLogFormat.newReader(fs, new HoodieLogFile(path), null, true, true);
|
// set length for the HoodieLogFile as it will be leveraged by HoodieLogFormat.Reader with reverseReading enabled
|
||||||
|
Reader reader = HoodieLogFormat.newReader(fs, hoodieLogFile, null, true, true);
|
||||||
Schema writerSchema = null;
|
Schema writerSchema = null;
|
||||||
HoodieTimeline completedTimeline = activeTimeline.getCommitsTimeline().filterCompletedInstants();
|
HoodieTimeline completedTimeline = activeTimeline.getCommitsTimeline().filterCompletedInstants();
|
||||||
while (reader.hasPrev()) {
|
while (reader.hasPrev()) {
|
||||||
@@ -62,17 +64,17 @@ public class LogReaderUtils {
|
|||||||
return writerSchema;
|
return writerSchema;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Schema readLatestSchemaFromLogFiles(String basePath, List<String> deltaFilePaths, Configuration config)
|
public static Schema readLatestSchemaFromLogFiles(String basePath, List<HoodieLogFile> logFiles, Configuration config)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(config).setBasePath(basePath).build();
|
HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(config).setBasePath(basePath).build();
|
||||||
List<String> deltaPaths = deltaFilePaths.stream().map(s -> new HoodieLogFile(new Path(s)))
|
List<String> deltaPaths = logFiles.stream().sorted(HoodieLogFile.getReverseLogFileComparator()).map(s -> s.getPath().toString())
|
||||||
.sorted(HoodieLogFile.getReverseLogFileComparator()).map(s -> s.getPath().toString())
|
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
if (deltaPaths.size() > 0) {
|
if (deltaPaths.size() > 0) {
|
||||||
|
Map<String, HoodieLogFile> deltaFilePathToFileStatus = logFiles.stream().map(entry -> Pair.of(entry.getPath().toString(), entry))
|
||||||
|
.collect(Collectors.toMap(Pair::getKey, Pair::getValue));
|
||||||
for (String logPath : deltaPaths) {
|
for (String logPath : deltaPaths) {
|
||||||
FileSystem fs = FSUtils.getFs(logPath, config);
|
FileSystem fs = FSUtils.getFs(logPath, config);
|
||||||
Schema schemaFromLogFile =
|
Schema schemaFromLogFile = readSchemaFromLogFileInReverse(fs, metaClient.getActiveTimeline(), deltaFilePathToFileStatus.get(logPath));
|
||||||
readSchemaFromLogFileInReverse(fs, metaClient.getActiveTimeline(), new Path(logPath));
|
|
||||||
if (schemaFromLogFile != null) {
|
if (schemaFromLogFile != null) {
|
||||||
return schemaFromLogFile;
|
return schemaFromLogFile;
|
||||||
}
|
}
|
||||||
@@ -80,5 +82,4 @@ public class LogReaderUtils {
|
|||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1482,7 +1482,8 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
|||||||
|
|
||||||
FileCreateUtils.createDeltaCommit(basePath, "100", fs);
|
FileCreateUtils.createDeltaCommit(basePath, "100", fs);
|
||||||
|
|
||||||
HoodieLogFileReader reader = new HoodieLogFileReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema(),
|
HoodieLogFileReader reader = new HoodieLogFileReader(fs, new HoodieLogFile(writer.getLogFile().getPath(),
|
||||||
|
fs.getFileStatus(writer.getLogFile().getPath()).getLen()), SchemaTestUtil.getSimpleSchema(),
|
||||||
bufferSize, readBlocksLazily, true);
|
bufferSize, readBlocksLazily, true);
|
||||||
|
|
||||||
assertTrue(reader.hasPrev(), "Last block should be available");
|
assertTrue(reader.hasPrev(), "Last block should be available");
|
||||||
@@ -1560,7 +1561,8 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
|||||||
|
|
||||||
// First round of reads - we should be able to read the first block and then EOF
|
// First round of reads - we should be able to read the first block and then EOF
|
||||||
HoodieLogFileReader reader =
|
HoodieLogFileReader reader =
|
||||||
new HoodieLogFileReader(fs, writer.getLogFile(), schema, bufferSize, readBlocksLazily, true);
|
new HoodieLogFileReader(fs, new HoodieLogFile(writer.getLogFile().getPath(),
|
||||||
|
fs.getFileStatus(writer.getLogFile().getPath()).getLen()), schema, bufferSize, readBlocksLazily, true);
|
||||||
|
|
||||||
assertTrue(reader.hasPrev(), "Last block should be available");
|
assertTrue(reader.hasPrev(), "Last block should be available");
|
||||||
HoodieLogBlock block = reader.prev();
|
HoodieLogBlock block = reader.prev();
|
||||||
@@ -1610,7 +1612,8 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
|||||||
|
|
||||||
FileCreateUtils.createDeltaCommit(basePath, "100", fs);
|
FileCreateUtils.createDeltaCommit(basePath, "100", fs);
|
||||||
|
|
||||||
HoodieLogFileReader reader = new HoodieLogFileReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema(),
|
HoodieLogFileReader reader = new HoodieLogFileReader(fs, new HoodieLogFile(writer.getLogFile().getPath(),
|
||||||
|
fs.getFileStatus(writer.getLogFile().getPath()).getLen()), SchemaTestUtil.getSimpleSchema(),
|
||||||
bufferSize, readBlocksLazily, true);
|
bufferSize, readBlocksLazily, true);
|
||||||
|
|
||||||
assertTrue(reader.hasPrev(), "Third block should be available");
|
assertTrue(reader.hasPrev(), "Third block should be available");
|
||||||
|
|||||||
@@ -18,6 +18,8 @@
|
|||||||
|
|
||||||
package org.apache.hudi.hadoop;
|
package org.apache.hudi.hadoop;
|
||||||
|
|
||||||
|
import org.apache.hudi.common.model.HoodieLogFile;
|
||||||
|
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
import org.apache.hadoop.io.Text;
|
import org.apache.hadoop.io.Text;
|
||||||
import org.apache.hadoop.mapred.FileSplit;
|
import org.apache.hadoop.mapred.FileSplit;
|
||||||
@@ -36,7 +38,7 @@ public class BaseFileWithLogsSplit extends FileSplit {
|
|||||||
// a flag to mark this split is produced by incremental query or not.
|
// a flag to mark this split is produced by incremental query or not.
|
||||||
private boolean belongToIncrementalSplit = false;
|
private boolean belongToIncrementalSplit = false;
|
||||||
// the log file paths of this split.
|
// the log file paths of this split.
|
||||||
private List<String> deltaLogPaths = new ArrayList<>();
|
private List<HoodieLogFile> deltaLogFiles = new ArrayList<>();
|
||||||
// max commit time of current split.
|
// max commit time of current split.
|
||||||
private String maxCommitTime = "";
|
private String maxCommitTime = "";
|
||||||
// the basePath of current hoodie table.
|
// the basePath of current hoodie table.
|
||||||
@@ -55,9 +57,10 @@ public class BaseFileWithLogsSplit extends FileSplit {
|
|||||||
Text.writeString(out, maxCommitTime);
|
Text.writeString(out, maxCommitTime);
|
||||||
Text.writeString(out, basePath);
|
Text.writeString(out, basePath);
|
||||||
Text.writeString(out, baseFilePath);
|
Text.writeString(out, baseFilePath);
|
||||||
out.writeInt(deltaLogPaths.size());
|
out.writeInt(deltaLogFiles.size());
|
||||||
for (String logPath : deltaLogPaths) {
|
for (HoodieLogFile logFile : deltaLogFiles) {
|
||||||
Text.writeString(out, logPath);
|
Text.writeString(out, logFile.getPath().toString());
|
||||||
|
out.writeLong(logFile.getFileSize());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -69,11 +72,13 @@ public class BaseFileWithLogsSplit extends FileSplit {
|
|||||||
basePath = Text.readString(in);
|
basePath = Text.readString(in);
|
||||||
baseFilePath = Text.readString(in);
|
baseFilePath = Text.readString(in);
|
||||||
int deltaLogSize = in.readInt();
|
int deltaLogSize = in.readInt();
|
||||||
List<String> tempDeltaLogs = new ArrayList<>();
|
List<HoodieLogFile> tempDeltaLogs = new ArrayList<>();
|
||||||
for (int i = 0; i < deltaLogSize; i++) {
|
for (int i = 0; i < deltaLogSize; i++) {
|
||||||
tempDeltaLogs.add(Text.readString(in));
|
String logPath = Text.readString(in);
|
||||||
|
long logFileSize = in.readLong();
|
||||||
|
tempDeltaLogs.add(new HoodieLogFile(new Path(logPath), logFileSize));
|
||||||
}
|
}
|
||||||
deltaLogPaths = tempDeltaLogs;
|
deltaLogFiles = tempDeltaLogs;
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean getBelongToIncrementalSplit() {
|
public boolean getBelongToIncrementalSplit() {
|
||||||
@@ -84,12 +89,12 @@ public class BaseFileWithLogsSplit extends FileSplit {
|
|||||||
this.belongToIncrementalSplit = belongToIncrementalSplit;
|
this.belongToIncrementalSplit = belongToIncrementalSplit;
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<String> getDeltaLogPaths() {
|
public List<HoodieLogFile> getDeltaLogFiles() {
|
||||||
return deltaLogPaths;
|
return deltaLogFiles;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setDeltaLogPaths(List<String> deltaLogPaths) {
|
public void setDeltaLogFiles(List<HoodieLogFile> deltaLogFiles) {
|
||||||
this.deltaLogPaths = deltaLogPaths;
|
this.deltaLogFiles = deltaLogFiles;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getMaxCommitTime() {
|
public String getMaxCommitTime() {
|
||||||
|
|||||||
@@ -18,6 +18,8 @@
|
|||||||
|
|
||||||
package org.apache.hudi.hadoop;
|
package org.apache.hudi.hadoop;
|
||||||
|
|
||||||
|
import org.apache.hudi.common.model.HoodieLogFile;
|
||||||
|
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
@@ -31,7 +33,7 @@ public class PathWithLogFilePath extends Path {
|
|||||||
// a flag to mark this split is produced by incremental query or not.
|
// a flag to mark this split is produced by incremental query or not.
|
||||||
private boolean belongToIncrementalPath = false;
|
private boolean belongToIncrementalPath = false;
|
||||||
// the log files belong this path.
|
// the log files belong this path.
|
||||||
private List<String> deltaLogPaths = new ArrayList<>();
|
private List<HoodieLogFile> deltaLogFiles = new ArrayList<>();
|
||||||
// max commit time of current path.
|
// max commit time of current path.
|
||||||
private String maxCommitTime = "";
|
private String maxCommitTime = "";
|
||||||
// the basePath of current hoodie table.
|
// the basePath of current hoodie table.
|
||||||
@@ -50,12 +52,12 @@ public class PathWithLogFilePath extends Path {
|
|||||||
this.belongToIncrementalPath = belongToIncrementalPath;
|
this.belongToIncrementalPath = belongToIncrementalPath;
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<String> getDeltaLogPaths() {
|
public List<HoodieLogFile> getDeltaLogFiles() {
|
||||||
return deltaLogPaths;
|
return deltaLogFiles;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setDeltaLogPaths(List<String> deltaLogPaths) {
|
public void setDeltaLogFiles(List<HoodieLogFile> deltaLogFiles) {
|
||||||
this.deltaLogPaths = deltaLogPaths;
|
this.deltaLogFiles = deltaLogFiles;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getMaxCommitTime() {
|
public String getMaxCommitTime() {
|
||||||
@@ -97,7 +99,7 @@ public class PathWithLogFilePath extends Path {
|
|||||||
public BaseFileWithLogsSplit buildSplit(Path file, long start, long length, String[] hosts) {
|
public BaseFileWithLogsSplit buildSplit(Path file, long start, long length, String[] hosts) {
|
||||||
BaseFileWithLogsSplit bs = new BaseFileWithLogsSplit(file, start, length, hosts);
|
BaseFileWithLogsSplit bs = new BaseFileWithLogsSplit(file, start, length, hosts);
|
||||||
bs.setBelongToIncrementalSplit(belongToIncrementalPath);
|
bs.setBelongToIncrementalSplit(belongToIncrementalPath);
|
||||||
bs.setDeltaLogPaths(deltaLogPaths);
|
bs.setDeltaLogFiles(deltaLogFiles);
|
||||||
bs.setMaxCommitTime(maxCommitTime);
|
bs.setMaxCommitTime(maxCommitTime);
|
||||||
bs.setBasePath(basePath);
|
bs.setBasePath(basePath);
|
||||||
bs.setBaseFilePath(baseFilePath);
|
bs.setBaseFilePath(baseFilePath);
|
||||||
|
|||||||
@@ -18,6 +18,8 @@
|
|||||||
|
|
||||||
package org.apache.hudi.hadoop;
|
package org.apache.hudi.hadoop;
|
||||||
|
|
||||||
|
import org.apache.hudi.common.model.HoodieLogFile;
|
||||||
|
|
||||||
import org.apache.hadoop.fs.FileStatus;
|
import org.apache.hadoop.fs.FileStatus;
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
|
|
||||||
@@ -35,7 +37,7 @@ public class RealtimeFileStatus extends FileStatus {
|
|||||||
// a flag to mark this split is produced by incremental query or not.
|
// a flag to mark this split is produced by incremental query or not.
|
||||||
private boolean belongToIncrementalFileStatus = false;
|
private boolean belongToIncrementalFileStatus = false;
|
||||||
// the log files belong this fileStatus.
|
// the log files belong this fileStatus.
|
||||||
private List<String> deltaLogPaths = new ArrayList<>();
|
private List<HoodieLogFile> deltaLogFiles = new ArrayList<>();
|
||||||
// max commit time of current fileStatus.
|
// max commit time of current fileStatus.
|
||||||
private String maxCommitTime = "";
|
private String maxCommitTime = "";
|
||||||
// the basePath of current hoodie table.
|
// the basePath of current hoodie table.
|
||||||
@@ -55,7 +57,7 @@ public class RealtimeFileStatus extends FileStatus {
|
|||||||
Path path = super.getPath();
|
Path path = super.getPath();
|
||||||
PathWithLogFilePath pathWithLogFilePath = new PathWithLogFilePath(path.getParent(), path.getName());
|
PathWithLogFilePath pathWithLogFilePath = new PathWithLogFilePath(path.getParent(), path.getName());
|
||||||
pathWithLogFilePath.setBelongToIncrementalPath(belongToIncrementalFileStatus);
|
pathWithLogFilePath.setBelongToIncrementalPath(belongToIncrementalFileStatus);
|
||||||
pathWithLogFilePath.setDeltaLogPaths(deltaLogPaths);
|
pathWithLogFilePath.setDeltaLogFiles(deltaLogFiles);
|
||||||
pathWithLogFilePath.setMaxCommitTime(maxCommitTime);
|
pathWithLogFilePath.setMaxCommitTime(maxCommitTime);
|
||||||
pathWithLogFilePath.setBasePath(basePath);
|
pathWithLogFilePath.setBasePath(basePath);
|
||||||
pathWithLogFilePath.setBaseFilePath(baseFilePath);
|
pathWithLogFilePath.setBaseFilePath(baseFilePath);
|
||||||
@@ -69,12 +71,12 @@ public class RealtimeFileStatus extends FileStatus {
|
|||||||
this.belongToIncrementalFileStatus = belongToIncrementalFileStatus;
|
this.belongToIncrementalFileStatus = belongToIncrementalFileStatus;
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<String> getDeltaLogPaths() {
|
public List<HoodieLogFile> getDeltaLogFiles() {
|
||||||
return deltaLogPaths;
|
return deltaLogFiles;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setDeltaLogPaths(List<String> deltaLogPaths) {
|
public void setDeltaLogFiles(List<HoodieLogFile> deltaLogFiles) {
|
||||||
this.deltaLogPaths = deltaLogPaths;
|
this.deltaLogFiles = deltaLogFiles;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getMaxCommitTime() {
|
public String getMaxCommitTime() {
|
||||||
|
|||||||
@@ -82,7 +82,7 @@ public abstract class AbstractRealtimeRecordReader {
|
|||||||
* job conf.
|
* job conf.
|
||||||
*/
|
*/
|
||||||
private void init() throws IOException {
|
private void init() throws IOException {
|
||||||
Schema schemaFromLogFile = LogReaderUtils.readLatestSchemaFromLogFiles(split.getBasePath(), split.getDeltaLogPaths(), jobConf);
|
Schema schemaFromLogFile = LogReaderUtils.readLatestSchemaFromLogFiles(split.getBasePath(), split.getDeltaLogFiles(), jobConf);
|
||||||
if (schemaFromLogFile == null) {
|
if (schemaFromLogFile == null) {
|
||||||
writerSchema = InputSplitUtils.getBaseFileSchema((FileSplit)split, jobConf);
|
writerSchema = InputSplitUtils.getBaseFileSchema((FileSplit)split, jobConf);
|
||||||
LOG.info("Writer Schema From Parquet => " + writerSchema.getFields());
|
LOG.info("Writer Schema From Parquet => " + writerSchema.getFields());
|
||||||
|
|||||||
@@ -23,6 +23,7 @@ import org.apache.hudi.common.fs.FSUtils;
|
|||||||
import org.apache.hudi.common.model.FileSlice;
|
import org.apache.hudi.common.model.FileSlice;
|
||||||
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
||||||
import org.apache.hudi.common.model.HoodieFileGroup;
|
import org.apache.hudi.common.model.HoodieFileGroup;
|
||||||
|
import org.apache.hudi.common.model.HoodieLogFile;
|
||||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||||
import org.apache.hudi.common.table.timeline.HoodieDefaultTimeline;
|
import org.apache.hudi.common.table.timeline.HoodieDefaultTimeline;
|
||||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||||
@@ -189,7 +190,7 @@ public class HoodieParquetRealtimeInputFormat extends HoodieParquetInputFormat i
|
|||||||
fileStatus.setBelongToIncrementalFileStatus(true);
|
fileStatus.setBelongToIncrementalFileStatus(true);
|
||||||
fileStatus.setBasePath(basePath);
|
fileStatus.setBasePath(basePath);
|
||||||
fileStatus.setBaseFilePath(baseFilePath);
|
fileStatus.setBaseFilePath(baseFilePath);
|
||||||
fileStatus.setDeltaLogPaths(f.getLatestFileSlice().get().getLogFiles().map(l -> l.getPath().toString()).collect(Collectors.toList()));
|
fileStatus.setDeltaLogFiles(f.getLatestFileSlice().get().getLogFiles().collect(Collectors.toList()));
|
||||||
// try to set bootstrapfileStatus
|
// try to set bootstrapfileStatus
|
||||||
if (baseFileStatus instanceof LocatedFileStatusWithBootstrapBaseFile || baseFileStatus instanceof FileStatusWithBootstrapBaseFile) {
|
if (baseFileStatus instanceof LocatedFileStatusWithBootstrapBaseFile || baseFileStatus instanceof FileStatusWithBootstrapBaseFile) {
|
||||||
fileStatus.setBootStrapFileStatus(baseFileStatus);
|
fileStatus.setBootStrapFileStatus(baseFileStatus);
|
||||||
@@ -202,7 +203,7 @@ public class HoodieParquetRealtimeInputFormat extends HoodieParquetInputFormat i
|
|||||||
if (logFileStatus.size() > 0) {
|
if (logFileStatus.size() > 0) {
|
||||||
RealtimeFileStatus fileStatus = new RealtimeFileStatus(logFileStatus.get(0));
|
RealtimeFileStatus fileStatus = new RealtimeFileStatus(logFileStatus.get(0));
|
||||||
fileStatus.setBelongToIncrementalFileStatus(true);
|
fileStatus.setBelongToIncrementalFileStatus(true);
|
||||||
fileStatus.setDeltaLogPaths(logFileStatus.stream().map(l -> l.getPath().toString()).collect(Collectors.toList()));
|
fileStatus.setDeltaLogFiles(logFileStatus.stream().map(l -> new HoodieLogFile(l.getPath(), l.getLen())).collect(Collectors.toList()));
|
||||||
fileStatus.setMaxCommitTime(maxCommitTime);
|
fileStatus.setMaxCommitTime(maxCommitTime);
|
||||||
fileStatus.setBasePath(basePath);
|
fileStatus.setBasePath(basePath);
|
||||||
result.add(fileStatus);
|
result.add(fileStatus);
|
||||||
@@ -256,7 +257,7 @@ public class HoodieParquetRealtimeInputFormat extends HoodieParquetInputFormat i
|
|||||||
? super.makeSplit(path.getPathWithBootstrapFileStatus(), start, length, hosts)
|
? super.makeSplit(path.getPathWithBootstrapFileStatus(), start, length, hosts)
|
||||||
: super.makeSplit(path.getPathWithBootstrapFileStatus(), start, length, hosts, inMemoryHosts);
|
: super.makeSplit(path.getPathWithBootstrapFileStatus(), start, length, hosts, inMemoryHosts);
|
||||||
return HoodieRealtimeInputFormatUtils
|
return HoodieRealtimeInputFormatUtils
|
||||||
.createRealtimeBoostrapBaseFileSplit((BootstrapBaseFileSplit) bf, path.getBasePath(), path.getDeltaLogPaths(), path.getMaxCommitTime());
|
.createRealtimeBoostrapBaseFileSplit((BootstrapBaseFileSplit) bf, path.getBasePath(), path.getDeltaLogFiles(), path.getMaxCommitTime());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -18,6 +18,7 @@
|
|||||||
|
|
||||||
package org.apache.hudi.hadoop.realtime;
|
package org.apache.hudi.hadoop.realtime;
|
||||||
|
|
||||||
|
import org.apache.hudi.common.model.HoodieLogFile;
|
||||||
import org.apache.hudi.common.util.Option;
|
import org.apache.hudi.common.util.Option;
|
||||||
|
|
||||||
import org.apache.hadoop.mapred.FileSplit;
|
import org.apache.hadoop.mapred.FileSplit;
|
||||||
@@ -25,7 +26,9 @@ import org.apache.hadoop.mapred.FileSplit;
|
|||||||
import java.io.DataInput;
|
import java.io.DataInput;
|
||||||
import java.io.DataOutput;
|
import java.io.DataOutput;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Filesplit that wraps the base split and a list of log files to merge deltas from.
|
* Filesplit that wraps the base split and a list of log files to merge deltas from.
|
||||||
@@ -33,6 +36,7 @@ import java.util.List;
|
|||||||
public class HoodieRealtimeFileSplit extends FileSplit implements RealtimeSplit {
|
public class HoodieRealtimeFileSplit extends FileSplit implements RealtimeSplit {
|
||||||
|
|
||||||
private List<String> deltaLogPaths;
|
private List<String> deltaLogPaths;
|
||||||
|
private List<HoodieLogFile> deltaLogFiles = new ArrayList<>();
|
||||||
|
|
||||||
private String maxCommitTime;
|
private String maxCommitTime;
|
||||||
|
|
||||||
@@ -44,11 +48,12 @@ public class HoodieRealtimeFileSplit extends FileSplit implements RealtimeSplit
|
|||||||
super();
|
super();
|
||||||
}
|
}
|
||||||
|
|
||||||
public HoodieRealtimeFileSplit(FileSplit baseSplit, String basePath, List<String> deltaLogPaths, String maxCommitTime,
|
public HoodieRealtimeFileSplit(FileSplit baseSplit, String basePath, List<HoodieLogFile> deltaLogFiles, String maxCommitTime,
|
||||||
Option<HoodieVirtualKeyInfo> hoodieVirtualKeyInfo)
|
Option<HoodieVirtualKeyInfo> hoodieVirtualKeyInfo)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
super(baseSplit.getPath(), baseSplit.getStart(), baseSplit.getLength(), baseSplit.getLocations());
|
super(baseSplit.getPath(), baseSplit.getStart(), baseSplit.getLength(), baseSplit.getLocations());
|
||||||
this.deltaLogPaths = deltaLogPaths;
|
this.deltaLogFiles = deltaLogFiles;
|
||||||
|
this.deltaLogPaths = deltaLogFiles.stream().map(entry -> entry.getPath().toString()).collect(Collectors.toList());
|
||||||
this.maxCommitTime = maxCommitTime;
|
this.maxCommitTime = maxCommitTime;
|
||||||
this.basePath = basePath;
|
this.basePath = basePath;
|
||||||
this.hoodieVirtualKeyInfo = hoodieVirtualKeyInfo;
|
this.hoodieVirtualKeyInfo = hoodieVirtualKeyInfo;
|
||||||
@@ -58,6 +63,10 @@ public class HoodieRealtimeFileSplit extends FileSplit implements RealtimeSplit
|
|||||||
return deltaLogPaths;
|
return deltaLogPaths;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public List<HoodieLogFile> getDeltaLogFiles() {
|
||||||
|
return deltaLogFiles;
|
||||||
|
}
|
||||||
|
|
||||||
public String getMaxCommitTime() {
|
public String getMaxCommitTime() {
|
||||||
return maxCommitTime;
|
return maxCommitTime;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -18,6 +18,7 @@
|
|||||||
|
|
||||||
package org.apache.hudi.hadoop.realtime;
|
package org.apache.hudi.hadoop.realtime;
|
||||||
|
|
||||||
|
import org.apache.hudi.common.model.HoodieLogFile;
|
||||||
import org.apache.hudi.common.util.Option;
|
import org.apache.hudi.common.util.Option;
|
||||||
import org.apache.hudi.hadoop.BootstrapBaseFileSplit;
|
import org.apache.hudi.hadoop.BootstrapBaseFileSplit;
|
||||||
|
|
||||||
@@ -26,7 +27,9 @@ import org.apache.hadoop.mapred.FileSplit;
|
|||||||
import java.io.DataInput;
|
import java.io.DataInput;
|
||||||
import java.io.DataOutput;
|
import java.io.DataOutput;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Realtime File Split with external base file.
|
* Realtime File Split with external base file.
|
||||||
@@ -34,6 +37,7 @@ import java.util.List;
|
|||||||
public class RealtimeBootstrapBaseFileSplit extends BootstrapBaseFileSplit implements RealtimeSplit {
|
public class RealtimeBootstrapBaseFileSplit extends BootstrapBaseFileSplit implements RealtimeSplit {
|
||||||
|
|
||||||
private List<String> deltaLogPaths;
|
private List<String> deltaLogPaths;
|
||||||
|
private List<HoodieLogFile> deltaLogFiles = new ArrayList<>();
|
||||||
|
|
||||||
private String maxInstantTime;
|
private String maxInstantTime;
|
||||||
|
|
||||||
@@ -43,11 +47,12 @@ public class RealtimeBootstrapBaseFileSplit extends BootstrapBaseFileSplit imple
|
|||||||
super();
|
super();
|
||||||
}
|
}
|
||||||
|
|
||||||
public RealtimeBootstrapBaseFileSplit(FileSplit baseSplit, String basePath, List<String> deltaLogPaths,
|
public RealtimeBootstrapBaseFileSplit(FileSplit baseSplit, String basePath, List<HoodieLogFile> deltaLogFiles,
|
||||||
String maxInstantTime, FileSplit externalFileSplit) throws IOException {
|
String maxInstantTime, FileSplit externalFileSplit) throws IOException {
|
||||||
super(baseSplit, externalFileSplit);
|
super(baseSplit, externalFileSplit);
|
||||||
this.maxInstantTime = maxInstantTime;
|
this.maxInstantTime = maxInstantTime;
|
||||||
this.deltaLogPaths = deltaLogPaths;
|
this.deltaLogFiles = deltaLogFiles;
|
||||||
|
this.deltaLogPaths = deltaLogFiles.stream().map(entry -> entry.getPath().toString()).collect(Collectors.toList());
|
||||||
this.basePath = basePath;
|
this.basePath = basePath;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -68,6 +73,11 @@ public class RealtimeBootstrapBaseFileSplit extends BootstrapBaseFileSplit imple
|
|||||||
return deltaLogPaths;
|
return deltaLogPaths;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<HoodieLogFile> getDeltaLogFiles() {
|
||||||
|
return deltaLogFiles;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String getMaxCommitTime() {
|
public String getMaxCommitTime() {
|
||||||
return maxInstantTime;
|
return maxInstantTime;
|
||||||
|
|||||||
@@ -18,6 +18,7 @@
|
|||||||
|
|
||||||
package org.apache.hudi.hadoop.realtime;
|
package org.apache.hudi.hadoop.realtime;
|
||||||
|
|
||||||
|
import org.apache.hudi.common.model.HoodieLogFile;
|
||||||
import org.apache.hudi.common.util.Option;
|
import org.apache.hudi.common.util.Option;
|
||||||
import org.apache.hudi.hadoop.InputSplitUtils;
|
import org.apache.hudi.hadoop.InputSplitUtils;
|
||||||
|
|
||||||
@@ -41,6 +42,8 @@ public interface RealtimeSplit extends InputSplitWithLocationInfo {
|
|||||||
*/
|
*/
|
||||||
List<String> getDeltaLogPaths();
|
List<String> getDeltaLogPaths();
|
||||||
|
|
||||||
|
List<HoodieLogFile> getDeltaLogFiles();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Return Max Instant Time.
|
* Return Max Instant Time.
|
||||||
* @return
|
* @return
|
||||||
|
|||||||
@@ -466,7 +466,7 @@ public class HoodieInputFormatUtils {
|
|||||||
HoodieTableFileSystemView fsView = fsViewCache.computeIfAbsent(metaClient, tableMetaClient ->
|
HoodieTableFileSystemView fsView = fsViewCache.computeIfAbsent(metaClient, tableMetaClient ->
|
||||||
FileSystemViewManager.createInMemoryFileSystemViewWithTimeline(engineContext, tableMetaClient, buildMetadataConfig(job), timeline));
|
FileSystemViewManager.createInMemoryFileSystemViewWithTimeline(engineContext, tableMetaClient, buildMetadataConfig(job), timeline));
|
||||||
List<HoodieBaseFile> filteredBaseFiles = new ArrayList<>();
|
List<HoodieBaseFile> filteredBaseFiles = new ArrayList<>();
|
||||||
Map<FileStatus, List<String>> filteredLogs = new HashMap<>();
|
Map<FileStatus, List<HoodieLogFile>> filteredLogs = new HashMap<>();
|
||||||
for (Path p : entry.getValue()) {
|
for (Path p : entry.getValue()) {
|
||||||
String relativePartitionPath = FSUtils.getRelativePartitionPath(new Path(metaClient.getBasePath()), p);
|
String relativePartitionPath = FSUtils.getRelativePartitionPath(new Path(metaClient.getBasePath()), p);
|
||||||
List<HoodieBaseFile> matched = fsView.getLatestBaseFiles(relativePartitionPath).collect(Collectors.toList());
|
List<HoodieBaseFile> matched = fsView.getLatestBaseFiles(relativePartitionPath).collect(Collectors.toList());
|
||||||
@@ -476,9 +476,8 @@ public class HoodieInputFormatUtils {
|
|||||||
.filter(f -> !f.getBaseFile().isPresent() && f.getLatestLogFile().isPresent())
|
.filter(f -> !f.getBaseFile().isPresent() && f.getLatestLogFile().isPresent())
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
logMatched.forEach(f -> {
|
logMatched.forEach(f -> {
|
||||||
List<String> logPaths = f.getLogFiles().sorted(HoodieLogFile.getLogFileComparator())
|
List<HoodieLogFile> logPathSizePairs = f.getLogFiles().sorted(HoodieLogFile.getLogFileComparator()).collect(Collectors.toList());
|
||||||
.map(log -> log.getPath().toString()).collect(Collectors.toList());
|
filteredLogs.put(f.getLatestLogFile().get().getFileStatus(), logPathSizePairs);
|
||||||
filteredLogs.put(f.getLatestLogFile().get().getFileStatus(), logPaths);
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -492,9 +491,9 @@ public class HoodieInputFormatUtils {
|
|||||||
returns.add(getFileStatus(filteredFile));
|
returns.add(getFileStatus(filteredFile));
|
||||||
}
|
}
|
||||||
|
|
||||||
for (Map.Entry<FileStatus, List<String>> filterLogEntry : filteredLogs.entrySet()) {
|
for (Map.Entry<FileStatus, List<HoodieLogFile>> filterLogEntry : filteredLogs.entrySet()) {
|
||||||
RealtimeFileStatus rs = new RealtimeFileStatus(filterLogEntry.getKey());
|
RealtimeFileStatus rs = new RealtimeFileStatus(filterLogEntry.getKey());
|
||||||
rs.setDeltaLogPaths(filterLogEntry.getValue());
|
rs.setDeltaLogFiles(filterLogEntry.getValue());
|
||||||
returns.add(rs);
|
returns.add(rs);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -120,13 +120,13 @@ public class HoodieRealtimeInputFormatUtils extends HoodieInputFormatUtils {
|
|||||||
List<FileSplit> dataFileSplits = groupedInputSplits.getOrDefault(fileSlice.getFileId(), new ArrayList<>());
|
List<FileSplit> dataFileSplits = groupedInputSplits.getOrDefault(fileSlice.getFileId(), new ArrayList<>());
|
||||||
dataFileSplits.forEach(split -> {
|
dataFileSplits.forEach(split -> {
|
||||||
try {
|
try {
|
||||||
List<String> logFilePaths = fileSlice.getLogFiles().sorted(HoodieLogFile.getLogFileComparator())
|
List<HoodieLogFile> logFiles = fileSlice.getLogFiles().sorted(HoodieLogFile.getLogFileComparator())
|
||||||
.map(logFile -> logFile.getPath().toString()).collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
if (split instanceof BootstrapBaseFileSplit) {
|
if (split instanceof BootstrapBaseFileSplit) {
|
||||||
BootstrapBaseFileSplit eSplit = (BootstrapBaseFileSplit) split;
|
BootstrapBaseFileSplit eSplit = (BootstrapBaseFileSplit) split;
|
||||||
rtSplits.add(createRealtimeBoostrapBaseFileSplit(eSplit, metaClient.getBasePath(), logFilePaths, maxCommitTime));
|
rtSplits.add(createRealtimeBoostrapBaseFileSplit(eSplit, metaClient.getBasePath(), logFiles, maxCommitTime));
|
||||||
} else {
|
} else {
|
||||||
rtSplits.add(new HoodieRealtimeFileSplit(split, metaClient.getBasePath(), logFilePaths, maxCommitTime, finalHoodieVirtualKeyInfo));
|
rtSplits.add(new HoodieRealtimeFileSplit(split, metaClient.getBasePath(), logFiles, maxCommitTime, finalHoodieVirtualKeyInfo));
|
||||||
}
|
}
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
throw new HoodieIOException("Error creating hoodie real time split ", e);
|
throw new HoodieIOException("Error creating hoodie real time split ", e);
|
||||||
@@ -162,7 +162,7 @@ public class HoodieRealtimeInputFormatUtils extends HoodieInputFormatUtils {
|
|||||||
if (s instanceof BaseFileWithLogsSplit) {
|
if (s instanceof BaseFileWithLogsSplit) {
|
||||||
BaseFileWithLogsSplit bs = (BaseFileWithLogsSplit)s;
|
BaseFileWithLogsSplit bs = (BaseFileWithLogsSplit)s;
|
||||||
if (bs.getBelongToIncrementalSplit()) {
|
if (bs.getBelongToIncrementalSplit()) {
|
||||||
rtSplits.add(new HoodieRealtimeFileSplit(bs, bs.getBasePath(), bs.getDeltaLogPaths(), bs.getMaxCommitTime(), finalHoodieVirtualKeyInfo));
|
rtSplits.add(new HoodieRealtimeFileSplit(bs, bs.getBasePath(), bs.getDeltaLogFiles(), bs.getMaxCommitTime(), finalHoodieVirtualKeyInfo));
|
||||||
}
|
}
|
||||||
} else if (s instanceof RealtimeBootstrapBaseFileSplit) {
|
} else if (s instanceof RealtimeBootstrapBaseFileSplit) {
|
||||||
rtSplits.add(s);
|
rtSplits.add(s);
|
||||||
@@ -206,7 +206,7 @@ public class HoodieRealtimeInputFormatUtils extends HoodieInputFormatUtils {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public static RealtimeBootstrapBaseFileSplit createRealtimeBoostrapBaseFileSplit(
|
public static RealtimeBootstrapBaseFileSplit createRealtimeBoostrapBaseFileSplit(
|
||||||
BootstrapBaseFileSplit split, String basePath, List<String> deltaLogPaths, String maxInstantTime) {
|
BootstrapBaseFileSplit split, String basePath, List<HoodieLogFile> logFiles, String maxInstantTime) {
|
||||||
try {
|
try {
|
||||||
String[] hosts = split.getLocationInfo() != null ? Arrays.stream(split.getLocationInfo())
|
String[] hosts = split.getLocationInfo() != null ? Arrays.stream(split.getLocationInfo())
|
||||||
.filter(x -> !x.isInMemory()).toArray(String[]::new) : new String[0];
|
.filter(x -> !x.isInMemory()).toArray(String[]::new) : new String[0];
|
||||||
@@ -214,7 +214,7 @@ public class HoodieRealtimeInputFormatUtils extends HoodieInputFormatUtils {
|
|||||||
.filter(SplitLocationInfo::isInMemory).toArray(String[]::new) : new String[0];
|
.filter(SplitLocationInfo::isInMemory).toArray(String[]::new) : new String[0];
|
||||||
FileSplit baseSplit = new FileSplit(split.getPath(), split.getStart(), split.getLength(),
|
FileSplit baseSplit = new FileSplit(split.getPath(), split.getStart(), split.getLength(),
|
||||||
hosts, inMemoryHosts);
|
hosts, inMemoryHosts);
|
||||||
return new RealtimeBootstrapBaseFileSplit(baseSplit, basePath, deltaLogPaths, maxInstantTime, split.getBootstrapFileSplit());
|
return new RealtimeBootstrapBaseFileSplit(baseSplit, basePath, logFiles, maxInstantTime, split.getBootstrapFileSplit());
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
throw new HoodieIOException("Error creating hoodie real time split ", e);
|
throw new HoodieIOException("Error creating hoodie real time split ", e);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -18,6 +18,7 @@
|
|||||||
|
|
||||||
package org.apache.hudi.hadoop.realtime;
|
package org.apache.hudi.hadoop.realtime;
|
||||||
|
|
||||||
|
import org.apache.hudi.common.model.HoodieLogFile;
|
||||||
import org.apache.hudi.common.util.Option;
|
import org.apache.hudi.common.util.Option;
|
||||||
|
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
@@ -56,6 +57,7 @@ public class TestHoodieRealtimeFileSplit {
|
|||||||
|
|
||||||
private HoodieRealtimeFileSplit split;
|
private HoodieRealtimeFileSplit split;
|
||||||
private String basePath;
|
private String basePath;
|
||||||
|
private List<HoodieLogFile> deltaLogFiles;
|
||||||
private List<String> deltaLogPaths;
|
private List<String> deltaLogPaths;
|
||||||
private String fileSplitName;
|
private String fileSplitName;
|
||||||
private FileSplit baseFileSplit;
|
private FileSplit baseFileSplit;
|
||||||
@@ -64,12 +66,13 @@ public class TestHoodieRealtimeFileSplit {
|
|||||||
@BeforeEach
|
@BeforeEach
|
||||||
public void setUp(@TempDir java.nio.file.Path tempDir) throws Exception {
|
public void setUp(@TempDir java.nio.file.Path tempDir) throws Exception {
|
||||||
basePath = tempDir.toAbsolutePath().toString();
|
basePath = tempDir.toAbsolutePath().toString();
|
||||||
|
deltaLogFiles = Collections.singletonList(new HoodieLogFile(new Path(basePath + "/1.log"), 0L));
|
||||||
deltaLogPaths = Collections.singletonList(basePath + "/1.log");
|
deltaLogPaths = Collections.singletonList(basePath + "/1.log");
|
||||||
fileSplitName = basePath + "/test.file";
|
fileSplitName = basePath + "/test.file";
|
||||||
baseFileSplit = new FileSplit(new Path(fileSplitName), 0, 100, new String[] {});
|
baseFileSplit = new FileSplit(new Path(fileSplitName), 0, 100, new String[] {});
|
||||||
maxCommitTime = "10001";
|
maxCommitTime = "10001";
|
||||||
|
|
||||||
split = new HoodieRealtimeFileSplit(baseFileSplit, basePath, deltaLogPaths, maxCommitTime, Option.empty());
|
split = new HoodieRealtimeFileSplit(baseFileSplit, basePath, deltaLogFiles, maxCommitTime, Option.empty());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|||||||
@@ -221,7 +221,7 @@ public class TestHoodieRealtimeRecordReader {
|
|||||||
HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
|
HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
|
||||||
new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + baseInstant + ".parquet"), 0, 1, baseJobConf),
|
new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + baseInstant + ".parquet"), 0, 1, baseJobConf),
|
||||||
basePath.toUri().toString(), fileSlice.getLogFiles().sorted(HoodieLogFile.getLogFileComparator())
|
basePath.toUri().toString(), fileSlice.getLogFiles().sorted(HoodieLogFile.getLogFileComparator())
|
||||||
.map(h -> h.getPath().toString()).collect(Collectors.toList()),
|
.collect(Collectors.toList()),
|
||||||
instantTime, Option.empty());
|
instantTime, Option.empty());
|
||||||
|
|
||||||
// create a RecordReader to be used by HoodieRealtimeRecordReader
|
// create a RecordReader to be used by HoodieRealtimeRecordReader
|
||||||
@@ -290,10 +290,9 @@ public class TestHoodieRealtimeRecordReader {
|
|||||||
FileCreateUtils.createDeltaCommit(basePath.toString(), newCommitTime);
|
FileCreateUtils.createDeltaCommit(basePath.toString(), newCommitTime);
|
||||||
|
|
||||||
// create a split with baseFile (parquet file written earlier) and new log file(s)
|
// create a split with baseFile (parquet file written earlier) and new log file(s)
|
||||||
String logFilePath = writer.getLogFile().getPath().toString();
|
|
||||||
HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
|
HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
|
||||||
new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + instantTime + ".parquet"), 0, 1, baseJobConf),
|
new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + instantTime + ".parquet"), 0, 1, baseJobConf),
|
||||||
basePath.toUri().toString(), Collections.singletonList(logFilePath), newCommitTime, Option.empty());
|
basePath.toUri().toString(), Collections.singletonList(writer.getLogFile()), newCommitTime, Option.empty());
|
||||||
|
|
||||||
// create a RecordReader to be used by HoodieRealtimeRecordReader
|
// create a RecordReader to be used by HoodieRealtimeRecordReader
|
||||||
RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
|
RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
|
||||||
@@ -370,10 +369,9 @@ public class TestHoodieRealtimeRecordReader {
|
|||||||
InputFormatTestUtil.deltaCommit(basePath, newCommitTime);
|
InputFormatTestUtil.deltaCommit(basePath, newCommitTime);
|
||||||
|
|
||||||
// create a split with baseFile (parquet file written earlier) and new log file(s)
|
// create a split with baseFile (parquet file written earlier) and new log file(s)
|
||||||
String logFilePath = writer.getLogFile().getPath().toString();
|
|
||||||
HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
|
HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
|
||||||
new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + instantTime + ".parquet"), 0, 1, baseJobConf),
|
new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + instantTime + ".parquet"), 0, 1, baseJobConf),
|
||||||
basePath.toUri().toString(), Collections.singletonList(logFilePath), newCommitTime, Option.empty());
|
basePath.toUri().toString(), Collections.singletonList(writer.getLogFile()), newCommitTime, Option.empty());
|
||||||
|
|
||||||
// create a RecordReader to be used by HoodieRealtimeRecordReader
|
// create a RecordReader to be used by HoodieRealtimeRecordReader
|
||||||
RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
|
RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
|
||||||
@@ -483,7 +481,7 @@ public class TestHoodieRealtimeRecordReader {
|
|||||||
public void testSchemaEvolutionAndRollbackBlockInLastLogFile(ExternalSpillableMap.DiskMapType diskMapType,
|
public void testSchemaEvolutionAndRollbackBlockInLastLogFile(ExternalSpillableMap.DiskMapType diskMapType,
|
||||||
boolean isCompressionEnabled) throws Exception {
|
boolean isCompressionEnabled) throws Exception {
|
||||||
// initial commit
|
// initial commit
|
||||||
List<String> logFilePaths = new ArrayList<>();
|
List<HoodieLogFile> logFiles = new ArrayList<>();
|
||||||
Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema());
|
Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema());
|
||||||
HoodieTestUtils.init(hadoopConf, basePath.toString(), HoodieTableType.MERGE_ON_READ);
|
HoodieTestUtils.init(hadoopConf, basePath.toString(), HoodieTableType.MERGE_ON_READ);
|
||||||
String instantTime = "100";
|
String instantTime = "100";
|
||||||
@@ -504,7 +502,7 @@ public class TestHoodieRealtimeRecordReader {
|
|||||||
InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid0", instantTime, newCommitTime,
|
InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid0", instantTime, newCommitTime,
|
||||||
numberOfLogRecords, 0, 1);
|
numberOfLogRecords, 0, 1);
|
||||||
long size = writer.getCurrentSize();
|
long size = writer.getCurrentSize();
|
||||||
logFilePaths.add(writer.getLogFile().getPath().toString());
|
logFiles.add(writer.getLogFile());
|
||||||
writer.close();
|
writer.close();
|
||||||
assertTrue(size > 0, "block - size should be > 0");
|
assertTrue(size > 0, "block - size should be > 0");
|
||||||
|
|
||||||
@@ -512,14 +510,14 @@ public class TestHoodieRealtimeRecordReader {
|
|||||||
newCommitTime = "102";
|
newCommitTime = "102";
|
||||||
writer = InputFormatTestUtil.writeRollbackBlockToLogFile(partitionDir, fs, schema, "fileid0", instantTime,
|
writer = InputFormatTestUtil.writeRollbackBlockToLogFile(partitionDir, fs, schema, "fileid0", instantTime,
|
||||||
newCommitTime, "101", 1);
|
newCommitTime, "101", 1);
|
||||||
logFilePaths.add(writer.getLogFile().getPath().toString());
|
logFiles.add(writer.getLogFile());
|
||||||
writer.close();
|
writer.close();
|
||||||
InputFormatTestUtil.deltaCommit(basePath, newCommitTime);
|
InputFormatTestUtil.deltaCommit(basePath, newCommitTime);
|
||||||
|
|
||||||
// create a split with baseFile (parquet file written earlier) and new log file(s)
|
// create a split with baseFile (parquet file written earlier) and new log file(s)
|
||||||
HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
|
HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
|
||||||
new FileSplit(new Path(partitionDir + "/fileid0_1_" + instantTime + ".parquet"), 0, 1, baseJobConf),
|
new FileSplit(new Path(partitionDir + "/fileid0_1_" + instantTime + ".parquet"), 0, 1, baseJobConf),
|
||||||
basePath.toUri().toString(), logFilePaths, newCommitTime, Option.empty());
|
basePath.toUri().toString(), logFiles, newCommitTime, Option.empty());
|
||||||
|
|
||||||
// create a RecordReader to be used by HoodieRealtimeRecordReader
|
// create a RecordReader to be used by HoodieRealtimeRecordReader
|
||||||
RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
|
RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
|
||||||
@@ -695,6 +693,7 @@ public class TestHoodieRealtimeRecordReader {
|
|||||||
writeStat.setNumUpdateWrites(100);
|
writeStat.setNumUpdateWrites(100);
|
||||||
writeStat.setNumWrites(100);
|
writeStat.setNumWrites(100);
|
||||||
writeStat.setPath(filePath);
|
writeStat.setPath(filePath);
|
||||||
|
writeStat.setFileSizeInBytes(new File(new Path(basePath.toString(), filePath).toString()).length());
|
||||||
writeStat.setPartitionPath(partitionPath);
|
writeStat.setPartitionPath(partitionPath);
|
||||||
writeStat.setTotalLogFilesCompacted(100L);
|
writeStat.setTotalLogFilesCompacted(100L);
|
||||||
HoodieWriteStat.RuntimeStats runtimeStats = new HoodieWriteStat.RuntimeStats();
|
HoodieWriteStat.RuntimeStats runtimeStats = new HoodieWriteStat.RuntimeStats();
|
||||||
@@ -750,14 +749,14 @@ public class TestHoodieRealtimeRecordReader {
|
|||||||
assertTrue(size > 0, "block - size should be > 0");
|
assertTrue(size > 0, "block - size should be > 0");
|
||||||
FileCreateUtils.createDeltaCommit(basePath.toString(), instantTime);
|
FileCreateUtils.createDeltaCommit(basePath.toString(), instantTime);
|
||||||
// create a split with new log file(s)
|
// create a split with new log file(s)
|
||||||
fileSlice.addLogFile(writer.getLogFile());
|
fileSlice.addLogFile(new HoodieLogFile(writer.getLogFile().getPath(), size));
|
||||||
RealtimeFileStatus realtimeFileStatus = new RealtimeFileStatus(new FileStatus(0, false, 1, 1, 0, writer.getLogFile().getPath()));
|
RealtimeFileStatus realtimeFileStatus = new RealtimeFileStatus(new FileStatus(writer.getLogFile().getFileSize(), false, 1, 1, 0, writer.getLogFile().getPath()));
|
||||||
realtimeFileStatus.setMaxCommitTime(instantTime);
|
realtimeFileStatus.setMaxCommitTime(instantTime);
|
||||||
realtimeFileStatus.setBasePath(basePath.toString());
|
realtimeFileStatus.setBasePath(basePath.toString());
|
||||||
realtimeFileStatus.setDeltaLogPaths(fileSlice.getLogFiles().map(l -> l.getPath().toString()).collect(Collectors.toList()));
|
realtimeFileStatus.setDeltaLogFiles(fileSlice.getLogFiles().collect(Collectors.toList()));
|
||||||
PathWithLogFilePath pathWithLogFileStatus = (PathWithLogFilePath) realtimeFileStatus.getPath();
|
PathWithLogFilePath pathWithLogFileStatus = (PathWithLogFilePath) realtimeFileStatus.getPath();
|
||||||
BaseFileWithLogsSplit bs = pathWithLogFileStatus.buildSplit(pathWithLogFileStatus, 0, 0, new String[] {""});
|
BaseFileWithLogsSplit bs = pathWithLogFileStatus.buildSplit(pathWithLogFileStatus, 0, 0, new String[] {""});
|
||||||
HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(bs, bs.getBasePath(), bs.getDeltaLogPaths(), bs.getMaxCommitTime(), Option.empty());
|
HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(bs, bs.getBasePath(), bs.getDeltaLogFiles(), bs.getMaxCommitTime(), Option.empty());
|
||||||
|
|
||||||
JobConf newJobConf = new JobConf(baseJobConf);
|
JobConf newJobConf = new JobConf(baseJobConf);
|
||||||
List<Schema.Field> fields = schema.getFields();
|
List<Schema.Field> fields = schema.getFields();
|
||||||
|
|||||||
Reference in New Issue
Block a user