[HUDI-2496] Insert duplicate records when precombined is deactivated for "insert" operation (#3740)
This commit is contained in:
@@ -16,15 +16,16 @@
|
|||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
package org.apache.hudi.io.storage;
|
package org.apache.hudi.io;
|
||||||
|
|
||||||
import org.apache.hudi.common.engine.TaskContextSupplier;
|
import org.apache.hudi.common.engine.TaskContextSupplier;
|
||||||
import org.apache.hudi.common.model.HoodieBaseFile;
|
import org.apache.hudi.common.model.HoodieBaseFile;
|
||||||
|
import org.apache.hudi.common.model.HoodieRecord;
|
||||||
|
import org.apache.hudi.common.model.HoodieRecordLocation;
|
||||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||||
import org.apache.hudi.common.util.Option;
|
import org.apache.hudi.common.util.Option;
|
||||||
import org.apache.hudi.config.HoodieWriteConfig;
|
import org.apache.hudi.config.HoodieWriteConfig;
|
||||||
import org.apache.hudi.exception.HoodieUpsertException;
|
import org.apache.hudi.exception.HoodieUpsertException;
|
||||||
import org.apache.hudi.io.HoodieMergeHandle;
|
|
||||||
import org.apache.hudi.keygen.BaseKeyGenerator;
|
import org.apache.hudi.keygen.BaseKeyGenerator;
|
||||||
import org.apache.hudi.keygen.KeyGenUtils;
|
import org.apache.hudi.keygen.KeyGenUtils;
|
||||||
import org.apache.hudi.table.HoodieTable;
|
import org.apache.hudi.table.HoodieTable;
|
||||||
@@ -34,6 +35,7 @@ import org.apache.log4j.LogManager;
|
|||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.util.Collections;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
@@ -44,21 +46,21 @@ import java.util.Map;
|
|||||||
* Simplified Logic:
|
* Simplified Logic:
|
||||||
* For every existing record
|
* For every existing record
|
||||||
* Write the record as is
|
* Write the record as is
|
||||||
* For all incoming records, write to file as is.
|
* For all incoming records, write to file as is, without de-duplicating based on the record key.
|
||||||
*
|
*
|
||||||
* Illustration with simple data.
|
* Illustration with simple data.
|
||||||
* Incoming data:
|
* Incoming data:
|
||||||
* rec1_2, rec4_2, rec5_1, rec6_1
|
* rec1_2, rec1_3, rec4_2, rec5_1, rec6_1
|
||||||
* Existing data:
|
* Existing data:
|
||||||
* rec1_1, rec2_1, rec3_1, rec4_1
|
* rec1_1, rec2_1, rec3_1, rec4_1
|
||||||
*
|
*
|
||||||
* For every existing record, write to storage as is.
|
* For every existing record, write to storage as is.
|
||||||
* => rec1_1, rec2_1, rec3_1 and rec4_1 are written to storage
|
* => rec1_1, rec2_1, rec3_1 and rec4_1 are written to storage
|
||||||
* Write all records from incoming set to storage
|
* Write all records from incoming set to storage
|
||||||
* => rec1_2, rec4_2, rec5_1 and rec6_1
|
* => rec1_2, rec1_3, rec4_2, rec5_1 and rec6_1
|
||||||
*
|
*
|
||||||
* Final snapshot in storage
|
* Final snapshot in storage
|
||||||
* rec1_1, rec2_1, rec3_1, rec4_1, rec1_2, rec4_2, rec5_1, rec6_1
|
* rec1_1, rec2_1, rec3_1, rec4_1, rec1_2, rec1_3, rec4_2, rec5_1, rec6_1
|
||||||
*
|
*
|
||||||
* Users should ensure there are no duplicates when the "insert" operation is used and the respective config is enabled. So, the above scenario should not
|
* Users should ensure there are no duplicates when the "insert" operation is used and the respective config is enabled. So, the above scenario should not
|
||||||
* happen, and every batch should contain only new records to be inserted. The above example is for illustration purposes only.
|
* happen, and every batch should contain only new records to be inserted. The above example is for illustration purposes only.
|
||||||
@@ -66,16 +68,22 @@ import java.util.Map;
|
|||||||
public class HoodieConcatHandle<T extends HoodieRecordPayload, I, K, O> extends HoodieMergeHandle<T, I, K, O> {
|
public class HoodieConcatHandle<T extends HoodieRecordPayload, I, K, O> extends HoodieMergeHandle<T, I, K, O> {
|
||||||
|
|
||||||
private static final Logger LOG = LogManager.getLogger(HoodieConcatHandle.class);
|
private static final Logger LOG = LogManager.getLogger(HoodieConcatHandle.class);
|
||||||
|
// a representation of incoming records that tolerates duplicate keys
|
||||||
|
private final Iterator<HoodieRecord<T>> recordItr;
|
||||||
|
|
||||||
public HoodieConcatHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, Iterator recordItr,
|
public HoodieConcatHandle(HoodieWriteConfig config, String instantTime, HoodieTable<T, I, K, O> hoodieTable,
|
||||||
String partitionPath, String fileId, TaskContextSupplier taskContextSupplier, Option<BaseKeyGenerator> keyGeneratorOpt) {
|
Iterator<HoodieRecord<T>> recordItr, String partitionPath, String fileId,
|
||||||
super(config, instantTime, hoodieTable, recordItr, partitionPath, fileId, taskContextSupplier, keyGeneratorOpt);
|
TaskContextSupplier taskContextSupplier, Option<BaseKeyGenerator> keyGeneratorOpt) {
|
||||||
|
super(config, instantTime, hoodieTable, Collections.emptyIterator(), partitionPath, fileId, taskContextSupplier, keyGeneratorOpt);
|
||||||
|
this.recordItr = recordItr;
|
||||||
}
|
}
|
||||||
|
|
||||||
public HoodieConcatHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, Map keyToNewRecords, String partitionPath, String fileId,
|
public HoodieConcatHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable,
|
||||||
HoodieBaseFile dataFileToBeMerged, TaskContextSupplier taskContextSupplier) {
|
Map<String, HoodieRecord<T>> keyToNewRecords, String partitionPath, String fileId,
|
||||||
super(config, instantTime, hoodieTable, keyToNewRecords, partitionPath, fileId, dataFileToBeMerged, taskContextSupplier,
|
HoodieBaseFile dataFileToBeMerged, TaskContextSupplier taskContextSupplier) {
|
||||||
|
super(config, instantTime, hoodieTable, Collections.emptyMap(), partitionPath, fileId, dataFileToBeMerged, taskContextSupplier,
|
||||||
Option.empty());
|
Option.empty());
|
||||||
|
this.recordItr = keyToNewRecords.values().iterator();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -94,4 +102,17 @@ public class HoodieConcatHandle<T extends HoodieRecordPayload, I, K, O> extends
|
|||||||
}
|
}
|
||||||
recordsWritten++;
|
recordsWritten++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void writeIncomingRecords() throws IOException {
|
||||||
|
while (recordItr.hasNext()) {
|
||||||
|
HoodieRecord<T> record = recordItr.next();
|
||||||
|
if (needsUpdateLocation()) {
|
||||||
|
record.unseal();
|
||||||
|
record.setNewLocation(new HoodieRecordLocation(instantTime, fileId));
|
||||||
|
record.seal();
|
||||||
|
}
|
||||||
|
writeInsertRecord(record);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
@@ -257,6 +257,18 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload, I, K, O> extends H
|
|||||||
return writeRecord(hoodieRecord, indexedRecord);
|
return writeRecord(hoodieRecord, indexedRecord);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected void writeInsertRecord(HoodieRecord<T> hoodieRecord) throws IOException {
|
||||||
|
Schema schema = useWriterSchema ? tableSchemaWithMetaFields : tableSchema;
|
||||||
|
Option<IndexedRecord> insertRecord = hoodieRecord.getData().getInsertValue(schema, config.getProps());
|
||||||
|
// just skip the ignored record
|
||||||
|
if (insertRecord.isPresent() && insertRecord.get().equals(IGNORE_RECORD)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (writeRecord(hoodieRecord, insertRecord)) {
|
||||||
|
insertRecordsWritten++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
protected boolean writeRecord(HoodieRecord<T> hoodieRecord, Option<IndexedRecord> indexedRecord) {
|
protected boolean writeRecord(HoodieRecord<T> hoodieRecord, Option<IndexedRecord> indexedRecord) {
|
||||||
Option recordMetadata = hoodieRecord.getData().getMetadata();
|
Option recordMetadata = hoodieRecord.getData().getMetadata();
|
||||||
if (!partitionPath.equals(hoodieRecord.getPartitionPath())) {
|
if (!partitionPath.equals(hoodieRecord.getPartitionPath())) {
|
||||||
@@ -340,28 +352,28 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload, I, K, O> extends H
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected void writeIncomingRecords() throws IOException {
|
||||||
|
// write out any pending records (this can happen when inserts are turned into updates)
|
||||||
|
Iterator<HoodieRecord<T>> newRecordsItr = (keyToNewRecords instanceof ExternalSpillableMap)
|
||||||
|
? ((ExternalSpillableMap)keyToNewRecords).iterator() : keyToNewRecords.values().iterator();
|
||||||
|
while (newRecordsItr.hasNext()) {
|
||||||
|
HoodieRecord<T> hoodieRecord = newRecordsItr.next();
|
||||||
|
if (!writtenRecordKeys.contains(hoodieRecord.getRecordKey())) {
|
||||||
|
writeInsertRecord(hoodieRecord);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<WriteStatus> close() {
|
public List<WriteStatus> close() {
|
||||||
try {
|
try {
|
||||||
// write out any pending records (this can happen when inserts are turned into updates)
|
writeIncomingRecords();
|
||||||
Iterator<HoodieRecord<T>> newRecordsItr = (keyToNewRecords instanceof ExternalSpillableMap)
|
|
||||||
? ((ExternalSpillableMap)keyToNewRecords).iterator() : keyToNewRecords.values().iterator();
|
|
||||||
while (newRecordsItr.hasNext()) {
|
|
||||||
HoodieRecord<T> hoodieRecord = newRecordsItr.next();
|
|
||||||
if (!writtenRecordKeys.contains(hoodieRecord.getRecordKey())) {
|
|
||||||
Schema schema = useWriterSchema ? tableSchemaWithMetaFields : tableSchema;
|
|
||||||
Option<IndexedRecord> insertRecord =
|
|
||||||
hoodieRecord.getData().getInsertValue(schema, config.getProps());
|
|
||||||
// just skip the ignore record
|
|
||||||
if (insertRecord.isPresent() && insertRecord.get().equals(IGNORE_RECORD)) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
writeRecord(hoodieRecord, insertRecord);
|
|
||||||
insertRecordsWritten++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
((ExternalSpillableMap) keyToNewRecords).close();
|
if (keyToNewRecords instanceof ExternalSpillableMap) {
|
||||||
|
((ExternalSpillableMap) keyToNewRecords).close();
|
||||||
|
} else {
|
||||||
|
keyToNewRecords.clear();
|
||||||
|
}
|
||||||
writtenRecordKeys.clear();
|
writtenRecordKeys.clear();
|
||||||
|
|
||||||
if (fileWriter != null) {
|
if (fileWriter != null) {
|
||||||
|
|||||||
@@ -45,7 +45,7 @@ import org.apache.hudi.execution.SparkLazyInsertIterable;
|
|||||||
import org.apache.hudi.io.CreateHandleFactory;
|
import org.apache.hudi.io.CreateHandleFactory;
|
||||||
import org.apache.hudi.io.HoodieMergeHandle;
|
import org.apache.hudi.io.HoodieMergeHandle;
|
||||||
import org.apache.hudi.io.HoodieSortedMergeHandle;
|
import org.apache.hudi.io.HoodieSortedMergeHandle;
|
||||||
import org.apache.hudi.io.storage.HoodieConcatHandle;
|
import org.apache.hudi.io.HoodieConcatHandle;
|
||||||
import org.apache.hudi.keygen.BaseKeyGenerator;
|
import org.apache.hudi.keygen.BaseKeyGenerator;
|
||||||
import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory;
|
import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory;
|
||||||
import org.apache.hudi.table.HoodieSparkTable;
|
import org.apache.hudi.table.HoodieSparkTable;
|
||||||
|
|||||||
@@ -710,6 +710,53 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
|||||||
2, false, config.populateMetaFields());
|
2, false, config.populateMetaFields());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test Insert API for HoodieConcatHandle when incoming entries contain duplicate keys.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testInsertsWithHoodieConcatHandleOnDuplicateIncomingKeys() throws Exception {
|
||||||
|
HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder();
|
||||||
|
testHoodieConcatHandleOnDupInserts(cfgBuilder.build(), false);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test InsertPrepped API for HoodieConcatHandle when incoming entries contain duplicate keys.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testInsertsPreppedWithHoodieConcatHandleOnDuplicateIncomingKeys() throws Exception {
|
||||||
|
HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder();
|
||||||
|
testHoodieConcatHandleOnDupInserts(cfgBuilder.build(), true);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void testHoodieConcatHandleOnDupInserts(HoodieWriteConfig config, boolean isPrepped) throws Exception {
|
||||||
|
HoodieWriteConfig hoodieWriteConfig = getConfigBuilder()
|
||||||
|
.withProps(config.getProps())
|
||||||
|
.withMergeAllowDuplicateOnInserts(true)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
SparkRDDWriteClient<RawTripTestPayload> client = getHoodieWriteClient(hoodieWriteConfig);
|
||||||
|
|
||||||
|
// Write 1 (only inserts)
|
||||||
|
String initCommitTime = "000";
|
||||||
|
String newCommitTime = "001";
|
||||||
|
int firstInsertRecords = 50;
|
||||||
|
insertFirstBatch(hoodieWriteConfig, client, newCommitTime, initCommitTime, firstInsertRecords, SparkRDDWriteClient::insert,
|
||||||
|
isPrepped, true, firstInsertRecords, config.populateMetaFields());
|
||||||
|
|
||||||
|
// Write 2 (updates with duplicates)
|
||||||
|
String prevCommitTime = newCommitTime;
|
||||||
|
newCommitTime = "004";
|
||||||
|
int secondInsertRecords = 100; // needs to be larger than firstInsertRecords to guarantee duplicate keys
|
||||||
|
List<String> commitTimesBetweenPrevAndNew = Arrays.asList("002", "003");
|
||||||
|
|
||||||
|
final Function2<List<HoodieRecord>, String, Integer> recordGenFunction =
|
||||||
|
generateWrapRecordsFn(isPrepped, hoodieWriteConfig, dataGen::generateUpdates);
|
||||||
|
|
||||||
|
writeBatch(client, newCommitTime, prevCommitTime, Option.of(commitTimesBetweenPrevAndNew), initCommitTime,
|
||||||
|
secondInsertRecords, recordGenFunction, SparkRDDWriteClient::insert, true, secondInsertRecords,
|
||||||
|
firstInsertRecords + secondInsertRecords, 2, false, config.populateMetaFields());
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Tests deletion of records.
|
* Tests deletion of records.
|
||||||
*/
|
*/
|
||||||
|
|||||||
Reference in New Issue
Block a user