- Fixing memory leak due to HoodieLogFileReader holding on to a logblock
- Removed inMemory HashMap usage in merge(..) code in LogScanner
This commit is contained in:
committed by
vinoth chandar
parent
d3df32fa03
commit
123da020e2
@@ -240,14 +240,12 @@ public class HoodieCompactedLogRecordScanner implements
|
||||
|
||||
/**
|
||||
* Iterate over the GenericRecord in the block, read the hoodie key and partition path and merge
|
||||
* with the application specific payload if the same key was found before Sufficient to just merge
|
||||
* the log records since the base data is merged on previous compaction
|
||||
* with the application specific payload if the same key was found before. Sufficient to just merge
|
||||
* the log records since the base data is merged on previous compaction.
|
||||
* Finally, merge this log block with the accumulated records
|
||||
*/
|
||||
private Map<String, HoodieRecord<? extends HoodieRecordPayload>> loadRecordsFromBlock(
|
||||
private Map<String, HoodieRecord<? extends HoodieRecordPayload>> merge(
|
||||
HoodieAvroDataBlock dataBlock) throws IOException {
|
||||
// TODO (NA) - Instead of creating a new HashMap use the spillable map
|
||||
Map<String, HoodieRecord<? extends HoodieRecordPayload>> recordsFromLastBlock = Maps
|
||||
.newHashMap();
|
||||
// TODO (NA) - Implement getRecordItr() in HoodieAvroDataBlock and use that here
|
||||
List<IndexedRecord> recs = dataBlock.getRecords();
|
||||
totalLogRecords.addAndGet(recs.size());
|
||||
@@ -256,19 +254,19 @@ public class HoodieCompactedLogRecordScanner implements
|
||||
.toString();
|
||||
HoodieRecord<? extends HoodieRecordPayload> hoodieRecord =
|
||||
SpillableMapUtils.convertToHoodieRecordPayload((GenericRecord) rec, this.payloadClassFQN);
|
||||
if (recordsFromLastBlock.containsKey(key)) {
|
||||
if (records.containsKey(key)) {
|
||||
// Merge and store the merged record
|
||||
HoodieRecordPayload combinedValue = recordsFromLastBlock.get(key).getData()
|
||||
HoodieRecordPayload combinedValue = records.get(key).getData()
|
||||
.preCombine(hoodieRecord.getData());
|
||||
recordsFromLastBlock
|
||||
records
|
||||
.put(key, new HoodieRecord<>(new HoodieKey(key, hoodieRecord.getPartitionPath()),
|
||||
combinedValue));
|
||||
} else {
|
||||
// Put the record as is
|
||||
recordsFromLastBlock.put(key, hoodieRecord);
|
||||
records.put(key, hoodieRecord);
|
||||
}
|
||||
});
|
||||
return recordsFromLastBlock;
|
||||
return records;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -277,11 +275,12 @@ public class HoodieCompactedLogRecordScanner implements
|
||||
private void merge(Map<String, HoodieRecord<? extends HoodieRecordPayload>> records,
|
||||
Deque<HoodieLogBlock> lastBlocks) throws IOException {
|
||||
while (!lastBlocks.isEmpty()) {
|
||||
log.info("Number of remaining logblocks to merge " + lastBlocks.size());
|
||||
// poll the element at the bottom of the stack since that's the order it was inserted
|
||||
HoodieLogBlock lastBlock = lastBlocks.pollLast();
|
||||
switch (lastBlock.getBlockType()) {
|
||||
case AVRO_DATA_BLOCK:
|
||||
merge(records, loadRecordsFromBlock((HoodieAvroDataBlock) lastBlock));
|
||||
merge((HoodieAvroDataBlock) lastBlock);
|
||||
break;
|
||||
case DELETE_BLOCK:
|
||||
// TODO : If delete is the only block written and/or records are present in parquet file
|
||||
@@ -295,25 +294,6 @@ public class HoodieCompactedLogRecordScanner implements
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Merge the records read from a single data block with the accumulated records
|
||||
*/
|
||||
private void merge(Map<String, HoodieRecord<? extends HoodieRecordPayload>> records,
|
||||
Map<String, HoodieRecord<? extends HoodieRecordPayload>> recordsFromLastBlock) {
|
||||
recordsFromLastBlock.forEach((key, hoodieRecord) -> {
|
||||
if (records.containsKey(key)) {
|
||||
// Merge and store the merged record
|
||||
HoodieRecordPayload combinedValue = records.get(key).getData()
|
||||
.preCombine(hoodieRecord.getData());
|
||||
records.put(key, new HoodieRecord<>(new HoodieKey(key, hoodieRecord.getPartitionPath()),
|
||||
combinedValue));
|
||||
} else {
|
||||
// Put the record as is
|
||||
records.put(key, hoodieRecord);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<HoodieRecord<? extends HoodieRecordPayload>> iterator() {
|
||||
return records.iterator();
|
||||
|
||||
@@ -56,7 +56,6 @@ class HoodieLogFileReader implements HoodieLogFormat.Reader {
|
||||
private static final byte[] oldMagicBuffer = new byte[4];
|
||||
private static final byte[] magicBuffer = new byte[6];
|
||||
private final Schema readerSchema;
|
||||
private HoodieLogBlock nextBlock = null;
|
||||
private LogFormatVersion nextBlockVersion;
|
||||
private boolean readBlockLazily;
|
||||
private long reverseLogFilePosition;
|
||||
@@ -271,8 +270,8 @@ class HoodieLogFileReader implements HoodieLogFormat.Reader {
|
||||
if (isEOF) {
|
||||
return false;
|
||||
}
|
||||
this.nextBlock = readBlock();
|
||||
return nextBlock != null;
|
||||
// If not hasNext(), we either reach EOF or throw an exception on invalid magic header
|
||||
return true;
|
||||
} catch (IOException e) {
|
||||
throw new HoodieIOException("IOException when reading logfile " + logFile, e);
|
||||
}
|
||||
@@ -322,11 +321,12 @@ class HoodieLogFileReader implements HoodieLogFormat.Reader {
|
||||
|
||||
@Override
|
||||
public HoodieLogBlock next() {
|
||||
if (nextBlock == null) {
|
||||
// maybe hasNext() was not called
|
||||
hasNext();
|
||||
try {
|
||||
// hasNext() must be called before next()
|
||||
return readBlock();
|
||||
} catch(IOException io) {
|
||||
throw new HoodieIOException("IOException when reading logblock from log file " + logFile, io);
|
||||
}
|
||||
return nextBlock;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -378,7 +378,7 @@ class HoodieLogFileReader implements HoodieLogFormat.Reader {
|
||||
boolean hasNext = hasNext();
|
||||
reverseLogFilePosition -= blockSize;
|
||||
lastReverseLogFilePosition = reverseLogFilePosition;
|
||||
return this.nextBlock;
|
||||
return next();
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -24,6 +24,8 @@ import org.apache.hadoop.fs.FileSystem;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
public class HoodieLogFormatReader implements HoodieLogFormat.Reader {
|
||||
|
||||
@@ -34,6 +36,8 @@ public class HoodieLogFormatReader implements HoodieLogFormat.Reader {
|
||||
private final boolean readBlocksLazily;
|
||||
private final boolean reverseLogReader;
|
||||
|
||||
private final static Logger log = LogManager.getLogger(HoodieLogFormatReader.class);
|
||||
|
||||
HoodieLogFormatReader(FileSystem fs, List<HoodieLogFile> logFiles,
|
||||
Schema readerSchema, boolean readBlocksLazily, boolean reverseLogReader) throws IOException {
|
||||
this.logFiles = logFiles;
|
||||
@@ -77,6 +81,7 @@ public class HoodieLogFormatReader implements HoodieLogFormat.Reader {
|
||||
} catch (IOException io) {
|
||||
throw new HoodieIOException("unable to initialize read with log file ", io);
|
||||
}
|
||||
log.info("Moving to the next reader for logfile " + currentReader.getLogFile());
|
||||
return this.currentReader.hasNext();
|
||||
}
|
||||
return false;
|
||||
@@ -84,8 +89,7 @@ public class HoodieLogFormatReader implements HoodieLogFormat.Reader {
|
||||
|
||||
@Override
|
||||
public HoodieLogBlock next() {
|
||||
HoodieLogBlock block = currentReader.next();
|
||||
return block;
|
||||
return currentReader.next();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
||||
@@ -374,6 +374,7 @@ public class HoodieLogFormatTest {
|
||||
assertEquals("Both records lists should be the same. (ordering guaranteed)", copyOfRecords1,
|
||||
dataBlockRead.getRecords());
|
||||
|
||||
reader.hasNext();
|
||||
nextBlock = reader.next();
|
||||
dataBlockRead = (HoodieAvroDataBlock) nextBlock;
|
||||
assertEquals("Read records size should be equal to the written records size",
|
||||
@@ -381,6 +382,7 @@ public class HoodieLogFormatTest {
|
||||
assertEquals("Both records lists should be the same. (ordering guaranteed)", copyOfRecords2,
|
||||
dataBlockRead.getRecords());
|
||||
|
||||
reader.hasNext();
|
||||
nextBlock = reader.next();
|
||||
dataBlockRead = (HoodieAvroDataBlock) nextBlock;
|
||||
assertEquals("Read records size should be equal to the written records size",
|
||||
|
||||
Reference in New Issue
Block a user