1
0

[HUDI-2496] Insert duplicate records when precombined is deactivated for "insert" operation (#3740)

This commit is contained in:
Ilias Antoniou
2021-10-11 04:33:16 +03:00
committed by GitHub
parent ad63938890
commit ceace1c653
4 changed files with 111 additions and 31 deletions

View File

@@ -16,15 +16,16 @@
* limitations under the License. * limitations under the License.
*/ */
package org.apache.hudi.io.storage; package org.apache.hudi.io;
import org.apache.hudi.common.engine.TaskContextSupplier; import org.apache.hudi.common.engine.TaskContextSupplier;
import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordLocation;
import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieUpsertException; import org.apache.hudi.exception.HoodieUpsertException;
import org.apache.hudi.io.HoodieMergeHandle;
import org.apache.hudi.keygen.BaseKeyGenerator; import org.apache.hudi.keygen.BaseKeyGenerator;
import org.apache.hudi.keygen.KeyGenUtils; import org.apache.hudi.keygen.KeyGenUtils;
import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.HoodieTable;
@@ -34,6 +35,7 @@ import org.apache.log4j.LogManager;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import java.io.IOException; import java.io.IOException;
import java.util.Collections;
import java.util.Iterator; import java.util.Iterator;
import java.util.Map; import java.util.Map;
@@ -44,21 +46,21 @@ import java.util.Map;
* Simplified Logic: * Simplified Logic:
* For every existing record * For every existing record
* Write the record as is * Write the record as is
* For all incoming records, write to file as is. * For all incoming records, write to file as is, without de-duplicating based on the record key.
* *
* Illustration with simple data. * Illustration with simple data.
* Incoming data: * Incoming data:
* rec1_2, rec4_2, rec5_1, rec6_1 * rec1_2, rec1_3, rec4_2, rec5_1, rec6_1
* Existing data: * Existing data:
* rec1_1, rec2_1, rec3_1, rec4_1 * rec1_1, rec2_1, rec3_1, rec4_1
* *
* For every existing record, write to storage as is. * For every existing record, write to storage as is.
* => rec1_1, rec2_1, rec3_1 and rec4_1 is written to storage * => rec1_1, rec2_1, rec3_1 and rec4_1 is written to storage
* Write all records from incoming set to storage * Write all records from incoming set to storage
* => rec1_2, rec4_2, rec5_1 and rec6_1 * => rec1_2, rec1_3, rec4_2, rec5_1 and rec6_1
* *
* Final snapshot in storage * Final snapshot in storage
* rec1_1, rec2_1, rec3_1, rec4_1, rec1_2, rec4_2, rec5_1, rec6_1 * rec1_1, rec2_1, rec3_1, rec4_1, rec1_2, rec1_3, rec4_2, rec5_1, rec6_1
* *
* Users should ensure there are no duplicates when "insert" operation is used and if the respective config is enabled. So, above scenario should not * Users should ensure there are no duplicates when "insert" operation is used and if the respective config is enabled. So, above scenario should not
* happen and every batch should have new records to be inserted. Above example is for illustration purposes only. * happen and every batch should have new records to be inserted. Above example is for illustration purposes only.
@@ -66,16 +68,22 @@ import java.util.Map;
public class HoodieConcatHandle<T extends HoodieRecordPayload, I, K, O> extends HoodieMergeHandle<T, I, K, O> { public class HoodieConcatHandle<T extends HoodieRecordPayload, I, K, O> extends HoodieMergeHandle<T, I, K, O> {
private static final Logger LOG = LogManager.getLogger(HoodieConcatHandle.class); private static final Logger LOG = LogManager.getLogger(HoodieConcatHandle.class);
// a representation of incoming records that tolerates duplicate keys
private final Iterator<HoodieRecord<T>> recordItr;
public HoodieConcatHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, Iterator recordItr, public HoodieConcatHandle(HoodieWriteConfig config, String instantTime, HoodieTable<T, I, K, O> hoodieTable,
String partitionPath, String fileId, TaskContextSupplier taskContextSupplier, Option<BaseKeyGenerator> keyGeneratorOpt) { Iterator<HoodieRecord<T>> recordItr, String partitionPath, String fileId,
super(config, instantTime, hoodieTable, recordItr, partitionPath, fileId, taskContextSupplier, keyGeneratorOpt); TaskContextSupplier taskContextSupplier, Option<BaseKeyGenerator> keyGeneratorOpt) {
super(config, instantTime, hoodieTable, Collections.emptyIterator(), partitionPath, fileId, taskContextSupplier, keyGeneratorOpt);
this.recordItr = recordItr;
} }
public HoodieConcatHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, Map keyToNewRecords, String partitionPath, String fileId, public HoodieConcatHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable,
HoodieBaseFile dataFileToBeMerged, TaskContextSupplier taskContextSupplier) { Map<String, HoodieRecord<T>> keyToNewRecords, String partitionPath, String fileId,
super(config, instantTime, hoodieTable, keyToNewRecords, partitionPath, fileId, dataFileToBeMerged, taskContextSupplier, HoodieBaseFile dataFileToBeMerged, TaskContextSupplier taskContextSupplier) {
super(config, instantTime, hoodieTable, Collections.emptyMap(), partitionPath, fileId, dataFileToBeMerged, taskContextSupplier,
Option.empty()); Option.empty());
this.recordItr = keyToNewRecords.values().iterator();
} }
/** /**
@@ -94,4 +102,17 @@ public class HoodieConcatHandle<T extends HoodieRecordPayload, I, K, O> extends
} }
recordsWritten++; recordsWritten++;
} }
/**
 * Writes every incoming record held by {@code recordItr} to the new file.
 *
 * <p>Unlike the parent implementation, no record-key lookup is performed here, so
 * incoming records that share a key are all written out. When
 * {@code needsUpdateLocation()} reports true, each record is first stamped with a
 * new location built from the current instant time and file id.
 *
 * @throws IOException propagated from {@code writeInsertRecord}
 */
@Override
protected void writeIncomingRecords() throws IOException {
  while (recordItr.hasNext()) {
    final HoodieRecord<T> incoming = recordItr.next();
    if (!needsUpdateLocation()) {
      // no location bookkeeping required; write the record straight through
      writeInsertRecord(incoming);
      continue;
    }
    // the record has to be unsealed before its location can be changed
    incoming.unseal();
    incoming.setNewLocation(new HoodieRecordLocation(instantTime, fileId));
    incoming.seal();
    writeInsertRecord(incoming);
  }
}
} }

View File

@@ -257,6 +257,18 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload, I, K, O> extends H
return writeRecord(hoodieRecord, indexedRecord); return writeRecord(hoodieRecord, indexedRecord);
} }
/**
 * Writes a single incoming record as an insert.
 *
 * <p>The insert value is resolved against the table schema, with meta fields when
 * {@code useWriterSchema} is set. A record whose insert value equals
 * {@code IGNORE_RECORD} is skipped. {@code insertRecordsWritten} is incremented
 * only when {@code writeRecord} returns true.
 *
 * @param hoodieRecord the incoming record to write
 * @throws IOException if the insert value cannot be obtained from the payload
 */
protected void writeInsertRecord(HoodieRecord<T> hoodieRecord) throws IOException {
  final Schema insertSchema;
  if (useWriterSchema) {
    insertSchema = tableSchemaWithMetaFields;
  } else {
    insertSchema = tableSchema;
  }
  Option<IndexedRecord> insertValue = hoodieRecord.getData().getInsertValue(insertSchema, config.getProps());
  // records explicitly marked as ignored are never written
  boolean ignored = insertValue.isPresent() && insertValue.get().equals(IGNORE_RECORD);
  if (!ignored && writeRecord(hoodieRecord, insertValue)) {
    insertRecordsWritten++;
  }
}
protected boolean writeRecord(HoodieRecord<T> hoodieRecord, Option<IndexedRecord> indexedRecord) { protected boolean writeRecord(HoodieRecord<T> hoodieRecord, Option<IndexedRecord> indexedRecord) {
Option recordMetadata = hoodieRecord.getData().getMetadata(); Option recordMetadata = hoodieRecord.getData().getMetadata();
if (!partitionPath.equals(hoodieRecord.getPartitionPath())) { if (!partitionPath.equals(hoodieRecord.getPartitionPath())) {
@@ -340,28 +352,28 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload, I, K, O> extends H
} }
} }
/**
 * Writes out any pending incoming records (this can happen when inserts are turned
 * into updates).
 *
 * <p>Walks {@code keyToNewRecords} and hands every record whose key is not in
 * {@code writtenRecordKeys} to {@code writeInsertRecord}. Subclasses may override
 * this to change how incoming records are written.
 *
 * @throws IOException propagated from {@code writeInsertRecord}
 */
protected void writeIncomingRecords() throws IOException {
  final Iterator<HoodieRecord<T>> pending;
  if (keyToNewRecords instanceof ExternalSpillableMap) {
    // use the spillable map's own iterator rather than going through values()
    pending = ((ExternalSpillableMap) keyToNewRecords).iterator();
  } else {
    pending = keyToNewRecords.values().iterator();
  }
  while (pending.hasNext()) {
    HoodieRecord<T> candidate = pending.next();
    if (writtenRecordKeys.contains(candidate.getRecordKey())) {
      // a record with this key has already been written; nothing left to do
      continue;
    }
    writeInsertRecord(candidate);
  }
}
@Override @Override
public List<WriteStatus> close() { public List<WriteStatus> close() {
try { try {
// write out any pending records (this can happen when inserts are turned into updates) writeIncomingRecords();
Iterator<HoodieRecord<T>> newRecordsItr = (keyToNewRecords instanceof ExternalSpillableMap)
? ((ExternalSpillableMap)keyToNewRecords).iterator() : keyToNewRecords.values().iterator();
while (newRecordsItr.hasNext()) {
HoodieRecord<T> hoodieRecord = newRecordsItr.next();
if (!writtenRecordKeys.contains(hoodieRecord.getRecordKey())) {
Schema schema = useWriterSchema ? tableSchemaWithMetaFields : tableSchema;
Option<IndexedRecord> insertRecord =
hoodieRecord.getData().getInsertValue(schema, config.getProps());
// just skip the ignore record
if (insertRecord.isPresent() && insertRecord.get().equals(IGNORE_RECORD)) {
continue;
}
writeRecord(hoodieRecord, insertRecord);
insertRecordsWritten++;
}
}
((ExternalSpillableMap) keyToNewRecords).close(); if (keyToNewRecords instanceof ExternalSpillableMap) {
((ExternalSpillableMap) keyToNewRecords).close();
} else {
keyToNewRecords.clear();
}
writtenRecordKeys.clear(); writtenRecordKeys.clear();
if (fileWriter != null) { if (fileWriter != null) {

View File

@@ -45,7 +45,7 @@ import org.apache.hudi.execution.SparkLazyInsertIterable;
import org.apache.hudi.io.CreateHandleFactory; import org.apache.hudi.io.CreateHandleFactory;
import org.apache.hudi.io.HoodieMergeHandle; import org.apache.hudi.io.HoodieMergeHandle;
import org.apache.hudi.io.HoodieSortedMergeHandle; import org.apache.hudi.io.HoodieSortedMergeHandle;
import org.apache.hudi.io.storage.HoodieConcatHandle; import org.apache.hudi.io.HoodieConcatHandle;
import org.apache.hudi.keygen.BaseKeyGenerator; import org.apache.hudi.keygen.BaseKeyGenerator;
import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory; import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory;
import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.table.HoodieSparkTable;

View File

@@ -710,6 +710,53 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
2, false, config.populateMetaFields()); 2, false, config.populateMetaFields());
} }
/**
 * Exercises the Insert API through HoodieConcatHandle when the incoming batch
 * contains duplicate record keys (non-prepped variant).
 */
@Test
public void testInsertsWithHoodieConcatHandleOnDuplicateIncomingKeys() throws Exception {
  testHoodieConcatHandleOnDupInserts(getConfigBuilder().build(), false);
}
/**
 * Exercises the InsertPrepped API through HoodieConcatHandle when the incoming
 * batch contains duplicate record keys (prepped variant).
 */
@Test
public void testInsertsPreppedWithHoodieConcatHandleOnDuplicateIncomingKeys() throws Exception {
  testHoodieConcatHandleOnDupInserts(getConfigBuilder().build(), true);
}
/**
 * Shared driver for the duplicate-incoming-key tests.
 *
 * <p>Builds a write config from the given one with "merge allow duplicate on
 * inserts" switched on, inserts a first batch, then inserts a second, larger batch
 * produced by {@code dataGen::generateUpdates}. The total passed to
 * {@code writeBatch} for the second write is the sum of both batch sizes.
 *
 * @param config    base write config whose properties are carried over
 * @param isPrepped whether the prepped variant of the write path is exercised
 */
private void testHoodieConcatHandleOnDupInserts(HoodieWriteConfig config, boolean isPrepped) throws Exception {
  HoodieWriteConfig.Builder dupTolerantBuilder = getConfigBuilder()
      .withProps(config.getProps())
      .withMergeAllowDuplicateOnInserts(true);
  HoodieWriteConfig hoodieWriteConfig = dupTolerantBuilder.build();
  SparkRDDWriteClient<RawTripTestPayload> client = getHoodieWriteClient(hoodieWriteConfig);

  // Write 1 (only inserts)
  final String initCommitTime = "000";
  final String firstCommitTime = "001";
  final int firstInsertRecords = 50;
  insertFirstBatch(hoodieWriteConfig, client, firstCommitTime, initCommitTime, firstInsertRecords,
      SparkRDDWriteClient::insert, isPrepped, true, firstInsertRecords, config.populateMetaFields());

  // Write 2 (updates with duplicates)
  final String secondCommitTime = "004";
  // needs to be larger than firstInsertRecords to guarantee duplicate keys
  final int secondInsertRecords = 100;
  final int expectedTotalRecords = firstInsertRecords + secondInsertRecords;
  List<String> commitTimesBetweenPrevAndNew = Arrays.asList("002", "003");
  final Function2<List<HoodieRecord>, String, Integer> recordGenFunction =
      generateWrapRecordsFn(isPrepped, hoodieWriteConfig, dataGen::generateUpdates);
  writeBatch(client, secondCommitTime, firstCommitTime, Option.of(commitTimesBetweenPrevAndNew), initCommitTime,
      secondInsertRecords, recordGenFunction, SparkRDDWriteClient::insert, true, secondInsertRecords,
      expectedTotalRecords, 2, false, config.populateMetaFields());
}
/** /**
* Tests deletion of records. * Tests deletion of records.
*/ */