Enabling hard deletes for MergeOnRead table type
This commit is contained in:
committed by
vinoth chandar
parent
345aaa31aa
commit
110df7190b
@@ -36,7 +36,11 @@ public class HoodieAvroPayload implements HoodieRecordPayload<HoodieAvroPayload>
|
||||
|
||||
public HoodieAvroPayload(Optional<GenericRecord> record) {
|
||||
try {
|
||||
this.recordBytes = HoodieAvroUtils.avroToBytes(record.get());
|
||||
if (record.isPresent()) {
|
||||
this.recordBytes = HoodieAvroUtils.avroToBytes(record.get());
|
||||
} else {
|
||||
this.recordBytes = new byte[0];
|
||||
}
|
||||
} catch (IOException io) {
|
||||
throw new HoodieIOException("Cannot convert record to bytes", io);
|
||||
}
|
||||
@@ -55,6 +59,9 @@ public class HoodieAvroPayload implements HoodieRecordPayload<HoodieAvroPayload>
|
||||
|
||||
/**
 * Decodes the stored record bytes into an Avro record for the given schema.
 *
 * An empty byte array is the marker this class's constructor stores when it is handed an empty
 * Optional; in that case nothing is decoded and an empty Optional is returned.
 *
 * @param schema schema passed both to the byte decoding and to the subsequent record rewrite
 * @return an empty Optional when no record bytes are stored, otherwise the decoded record after
 *         it has been passed through HoodieAvroUtils.rewriteRecord
 * @throws IOException declared by the signature; presumably raised by the Avro decoding - confirm
 */
@Override
|
||||
public Optional<IndexedRecord> getInsertValue(Schema schema) throws IOException {
|
||||
// Zero-length bytes: no record was supplied at construction time, so there is nothing to decode.
if (recordBytes.length == 0) {
|
||||
return Optional.empty();
|
||||
}
|
||||
// Decode with the caller's schema, then rewrite the decoded record against that same schema.
Optional<GenericRecord> record = Optional.of(HoodieAvroUtils.bytesToAvro(recordBytes, schema));
|
||||
return record.map(r -> HoodieAvroUtils.rewriteRecord(r, schema));
|
||||
}
|
||||
|
||||
@@ -19,6 +19,7 @@ package com.uber.hoodie.common.table.log;
|
||||
import static com.uber.hoodie.common.table.log.block.HoodieLogBlock.HeaderMetadataType.INSTANT_TIME;
|
||||
import static com.uber.hoodie.common.table.log.block.HoodieLogBlock.HoodieLogBlockType.CORRUPT_BLOCK;
|
||||
|
||||
import com.uber.hoodie.common.model.HoodieKey;
|
||||
import com.uber.hoodie.common.model.HoodieLogFile;
|
||||
import com.uber.hoodie.common.model.HoodieRecord;
|
||||
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
||||
@@ -63,7 +64,7 @@ public abstract class AbstractHoodieLogRecordScanner {
|
||||
private static final Logger log = LogManager.getLogger(AbstractHoodieLogRecordScanner.class);
|
||||
|
||||
// Reader schema for the records
|
||||
private final Schema readerSchema;
|
||||
protected final Schema readerSchema;
|
||||
// Latest valid instant time
|
||||
// Log-Blocks belonging to inflight delta-instants are filtered-out using this high-watermark.
|
||||
private final String latestInstantTime;
|
||||
@@ -291,7 +292,7 @@ public abstract class AbstractHoodieLogRecordScanner {
|
||||
*
|
||||
* @param key Deleted record key
|
||||
*/
|
||||
protected abstract void processNextDeletedKey(String key);
|
||||
protected abstract void processNextDeletedKey(HoodieKey key);
|
||||
|
||||
/**
|
||||
* Process the set of log blocks belonging to the last instant which is read fully.
|
||||
|
||||
@@ -22,6 +22,7 @@ import com.uber.hoodie.common.model.HoodieRecordPayload;
|
||||
import com.uber.hoodie.common.util.DefaultSizeEstimator;
|
||||
import com.uber.hoodie.common.util.HoodieRecordSizeEstimator;
|
||||
import com.uber.hoodie.common.util.HoodieTimer;
|
||||
import com.uber.hoodie.common.util.SpillableMapUtils;
|
||||
import com.uber.hoodie.common.util.collection.ExternalSpillableMap;
|
||||
import com.uber.hoodie.exception.HoodieIOException;
|
||||
import java.io.IOException;
|
||||
@@ -102,10 +103,11 @@ public class HoodieMergedLogRecordScanner extends AbstractHoodieLogRecordScanner
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void processNextRecord(HoodieRecord<? extends HoodieRecordPayload> hoodieRecord) {
|
||||
protected void processNextRecord(HoodieRecord<? extends HoodieRecordPayload> hoodieRecord) throws IOException {
|
||||
String key = hoodieRecord.getRecordKey();
|
||||
if (records.containsKey(key)) {
|
||||
// Merge and store the merged record
|
||||
// Merge and store the merged record. The HoodieRecordPayload implementation is free to decide what should be
|
||||
// done when a delete (empty payload) is encountered before or after an insert/update.
|
||||
HoodieRecordPayload combinedValue = records.get(key).getData().preCombine(hoodieRecord.getData());
|
||||
records.put(key, new HoodieRecord<>(new HoodieKey(key, hoodieRecord.getPartitionPath()), combinedValue));
|
||||
} else {
|
||||
@@ -115,10 +117,9 @@ public class HoodieMergedLogRecordScanner extends AbstractHoodieLogRecordScanner
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void processNextDeletedKey(String key) {
|
||||
// TODO : If delete is the only block written and/or records are present in parquet file
|
||||
// TODO : Mark as tombstone (optional.empty()) for data instead of deleting the entry
|
||||
records.remove(key);
|
||||
protected void processNextDeletedKey(HoodieKey hoodieKey) {
|
||||
records.put(hoodieKey.getRecordKey(), SpillableMapUtils.generateEmptyPayload(hoodieKey.getRecordKey(),
|
||||
hoodieKey.getPartitionPath(), getPayloadClassFQN()));
|
||||
}
|
||||
|
||||
public long getTotalTimeTakenToReadAndMergeBlocks() {
|
||||
|
||||
@@ -18,6 +18,7 @@
|
||||
|
||||
package com.uber.hoodie.common.table.log;
|
||||
|
||||
import com.uber.hoodie.common.model.HoodieKey;
|
||||
import com.uber.hoodie.common.model.HoodieRecord;
|
||||
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
||||
import java.util.List;
|
||||
@@ -43,7 +44,7 @@ public class HoodieUnMergedLogRecordScanner extends AbstractHoodieLogRecordScann
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void processNextDeletedKey(String key) {
|
||||
protected void processNextDeletedKey(HoodieKey key) {
|
||||
throw new IllegalStateException("Not expected to see delete records in this log-scan mode. Check Job Config");
|
||||
}
|
||||
|
||||
|
||||
@@ -16,16 +16,16 @@
|
||||
|
||||
package com.uber.hoodie.common.table.log.block;
|
||||
|
||||
import com.uber.hoodie.common.model.HoodieKey;
|
||||
import com.uber.hoodie.common.model.HoodieLogFile;
|
||||
import com.uber.hoodie.common.storage.SizeAwareDataInputStream;
|
||||
import com.uber.hoodie.common.util.StringUtils;
|
||||
import com.uber.hoodie.common.util.SerializationUtils;
|
||||
import com.uber.hoodie.exception.HoodieIOException;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.DataInputStream;
|
||||
import java.io.DataOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
@@ -36,9 +36,9 @@ import org.apache.hadoop.fs.FSDataInputStream;
|
||||
*/
|
||||
public class HoodieDeleteBlock extends HoodieLogBlock {
|
||||
|
||||
private String[] keysToDelete;
|
||||
private HoodieKey[] keysToDelete;
|
||||
|
||||
public HoodieDeleteBlock(String[] keysToDelete,
|
||||
public HoodieDeleteBlock(HoodieKey[] keysToDelete,
|
||||
Map<HeaderMetadataType, String> header) {
|
||||
this(Optional.empty(), null, false, Optional.empty(), header, new HashMap<>());
|
||||
this.keysToDelete = keysToDelete;
|
||||
@@ -64,15 +64,14 @@ public class HoodieDeleteBlock extends HoodieLogBlock {
|
||||
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||
DataOutputStream output = new DataOutputStream(baos);
|
||||
byte[] bytesToWrite = StringUtils.join(getKeysToDelete(), ",")
|
||||
.getBytes(Charset.forName("utf-8"));
|
||||
byte[] bytesToWrite = SerializationUtils.serialize(getKeysToDelete());
|
||||
output.writeInt(HoodieLogBlock.version);
|
||||
output.writeInt(bytesToWrite.length);
|
||||
output.write(bytesToWrite);
|
||||
return baos.toByteArray();
|
||||
}
|
||||
|
||||
public String[] getKeysToDelete() {
|
||||
public HoodieKey[] getKeysToDelete() {
|
||||
try {
|
||||
if (keysToDelete == null) {
|
||||
if (!getContent().isPresent() && readBlockLazily) {
|
||||
@@ -86,7 +85,7 @@ public class HoodieDeleteBlock extends HoodieLogBlock {
|
||||
int dataLength = dis.readInt();
|
||||
byte[] data = new byte[dataLength];
|
||||
dis.readFully(data);
|
||||
this.keysToDelete = new String(data).split(",");
|
||||
this.keysToDelete = SerializationUtils.deserialize(data);
|
||||
deflate();
|
||||
}
|
||||
return keysToDelete;
|
||||
|
||||
@@ -117,4 +117,15 @@ public class SpillableMapUtils {
|
||||
.loadPayload(payloadClazz, new Object[]{Optional.of(rec)}, Optional.class));
|
||||
return (R) hoodieRecord;
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility method to build a HoodieRecord for the given record key and partition path whose payload is
* created from an empty Optional (no record data). No bytes or schema are involved.
*
* The payload object is obtained from ReflectionUtils.loadPayload with the payload class name, a single
* Optional.empty() argument and Optional.class as the argument type.
* NOTE(review): presumably this requires the payload class to expose a constructor taking an Optional - confirm.
*
* @param recKey record key placed in the record's HoodieKey
* @param partitionPath partition path placed in the record's HoodieKey
* @param payloadClazz name of the payload class handed to ReflectionUtils.loadPayload
* @return the new HoodieRecord, returned through an unchecked cast to the caller's type R
|
||||
*/
|
||||
public static <R> R generateEmptyPayload(String recKey, String partitionPath, String payloadClazz) {
|
||||
HoodieRecord<? extends HoodieRecordPayload> hoodieRecord = new HoodieRecord<>(
|
||||
new HoodieKey(recKey, partitionPath),
|
||||
ReflectionUtils
|
||||
.loadPayload(payloadClazz, new Object[]{Optional.empty()}, Optional.class));
|
||||
// Unchecked cast: the caller chooses R; nothing here verifies it matches HoodieRecord.
return (R) hoodieRecord;
|
||||
}
|
||||
}
|
||||
@@ -25,6 +25,7 @@ import static org.junit.Assert.fail;
|
||||
import com.google.common.collect.Maps;
|
||||
import com.uber.hoodie.common.minicluster.MiniClusterUtil;
|
||||
import com.uber.hoodie.common.model.HoodieArchivedLogFile;
|
||||
import com.uber.hoodie.common.model.HoodieKey;
|
||||
import com.uber.hoodie.common.model.HoodieLogFile;
|
||||
import com.uber.hoodie.common.model.HoodieRecord;
|
||||
import com.uber.hoodie.common.model.HoodieTableType;
|
||||
@@ -43,6 +44,7 @@ import com.uber.hoodie.common.util.HoodieAvroUtils;
|
||||
import com.uber.hoodie.common.util.SchemaTestUtil;
|
||||
import com.uber.hoodie.exception.CorruptedLogFileException;
|
||||
import java.io.IOException;
|
||||
import java.io.UncheckedIOException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
@@ -714,10 +716,13 @@ public class HoodieLogFormatTest {
|
||||
s -> ((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString()).collect(Collectors.toList());
|
||||
|
||||
// Delete 50 keys
|
||||
List<String> deletedKeys = originalKeys.subList(0, 50);
|
||||
List<HoodieKey> deletedKeys = copyOfRecords1.stream().map(
|
||||
s -> (new HoodieKey(((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(),
|
||||
((GenericRecord) s).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString())))
|
||||
.collect(Collectors.toList()).subList(0, 50);
|
||||
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "102");
|
||||
HoodieDeleteBlock deleteBlock = new HoodieDeleteBlock(deletedKeys.toArray(new String[50]), header);
|
||||
HoodieDeleteBlock deleteBlock = new HoodieDeleteBlock(deletedKeys.toArray(new HoodieKey[50]), header);
|
||||
writer = writer.appendBlock(deleteBlock);
|
||||
|
||||
List<String> allLogFiles = FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION,
|
||||
@@ -727,8 +732,19 @@ public class HoodieLogFormatTest {
|
||||
"102", 10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH);
|
||||
assertEquals("We still would read 200 records", 200, scanner.getTotalLogRecords());
|
||||
final List<String> readKeys = new ArrayList<>(200);
|
||||
final List<Boolean> emptyPayloads = new ArrayList<>();
|
||||
scanner.forEach(s -> readKeys.add(s.getKey().getRecordKey()));
|
||||
assertEquals("Stream collect should return all 150 records", 150, readKeys.size());
|
||||
scanner.forEach(s -> {
|
||||
try {
|
||||
if (!s.getData().getInsertValue(schema).isPresent()) {
|
||||
emptyPayloads.add(true);
|
||||
}
|
||||
} catch (IOException io) {
|
||||
throw new UncheckedIOException(io);
|
||||
}
|
||||
});
|
||||
assertEquals("Stream collect should return all 200 records", 200, readKeys.size());
|
||||
assertEquals("Stream collect should return all 50 records with empty payloads", 50, emptyPayloads.size());
|
||||
originalKeys.removeAll(deletedKeys);
|
||||
Collections.sort(originalKeys);
|
||||
Collections.sort(readKeys);
|
||||
@@ -782,8 +798,13 @@ public class HoodieLogFormatTest {
|
||||
s -> ((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString()).collect(Collectors.toList());
|
||||
|
||||
// Delete 50 keys
|
||||
List<String> deletedKeys = originalKeys.subList(0, 50);
|
||||
HoodieDeleteBlock deleteBlock = new HoodieDeleteBlock(deletedKeys.toArray(new String[50]), header);
|
||||
// Delete 50 keys
|
||||
List<HoodieKey> deletedKeys = copyOfRecords1.stream().map(
|
||||
s -> (new HoodieKey(((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(),
|
||||
((GenericRecord) s).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString())))
|
||||
.collect(Collectors.toList()).subList(0, 50);
|
||||
|
||||
HoodieDeleteBlock deleteBlock = new HoodieDeleteBlock(deletedKeys.toArray(new HoodieKey[50]), header);
|
||||
writer = writer.appendBlock(deleteBlock);
|
||||
|
||||
// Attempt 1 : Write rollback block for a failed write
|
||||
@@ -839,8 +860,12 @@ public class HoodieLogFormatTest {
|
||||
s -> ((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString()).collect(Collectors.toList());
|
||||
|
||||
// Delete 50 keys
|
||||
List<String> deletedKeys = originalKeys.subList(0, 50);
|
||||
HoodieDeleteBlock deleteBlock = new HoodieDeleteBlock(deletedKeys.toArray(new String[50]), header);
|
||||
// Delete 50 keys
|
||||
List<HoodieKey> deletedKeys = copyOfRecords1.stream().map(
|
||||
s -> (new HoodieKey(((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(),
|
||||
((GenericRecord) s).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString())))
|
||||
.collect(Collectors.toList()).subList(0, 50);
|
||||
HoodieDeleteBlock deleteBlock = new HoodieDeleteBlock(deletedKeys.toArray(new HoodieKey[50]), header);
|
||||
writer = writer.appendBlock(deleteBlock);
|
||||
|
||||
// Write 2 rollback blocks (1 data block + 1 delete block) for a failed write
|
||||
@@ -921,8 +946,12 @@ public class HoodieLogFormatTest {
|
||||
s -> ((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString()).collect(Collectors.toList());
|
||||
|
||||
// Delete 50 keys
|
||||
List<String> deletedKeys = originalKeys.subList(0, 50);
|
||||
HoodieDeleteBlock deleteBlock = new HoodieDeleteBlock(deletedKeys.toArray(new String[50]), header);
|
||||
// Delete 50 keys
|
||||
List<HoodieKey> deletedKeys = copyOfRecords1.stream().map(
|
||||
s -> (new HoodieKey(((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(),
|
||||
((GenericRecord) s).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString())))
|
||||
.collect(Collectors.toList()).subList(0, 50);
|
||||
HoodieDeleteBlock deleteBlock = new HoodieDeleteBlock(deletedKeys.toArray(new HoodieKey[50]), header);
|
||||
writer = writer.appendBlock(deleteBlock);
|
||||
|
||||
// Write 1 rollback block for a failed write
|
||||
|
||||
Reference in New Issue
Block a user