1
0

[HUDI-1196] Update HoodieKey when deduplicating records with global index (#2248)

- Works only for the overwrite payload (the default)
- Does not alter current semantics otherwise

Co-authored-by: Ryan Pifer <ryanpife@amazon.com>
This commit is contained in:
rmpifer
2020-12-01 13:50:46 -08:00
committed by GitHub
parent ac23d2587f
commit 78fd122594
2 changed files with 4 additions and 4 deletions

View File

@@ -59,10 +59,9 @@ public class SparkWriteHelper<T extends HoodieRecordPayload,R> extends AbstractW
}).reduceByKey((rec1, rec2) -> {
@SuppressWarnings("unchecked")
T reducedData = (T) rec1.getData().preCombine(rec2.getData());
// we cannot allow the user to change the key or partitionPath, since that will affect
// everything
// so pick it from one of the records.
return new HoodieRecord<T>(rec1.getKey(), reducedData);
HoodieKey reducedKey = rec1.getData().equals(reducedData) ? rec1.getKey() : rec2.getKey();
return new HoodieRecord<T>(reducedKey, reducedData);
}, parallelism).map(Tuple2::_2);
}

View File

@@ -243,6 +243,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
when(index.isGlobal()).thenReturn(true);
List<HoodieRecord<RawTripTestPayload>> dedupedRecs = SparkWriteHelper.newInstance().deduplicateRecords(records, index, 1).collect();
assertEquals(1, dedupedRecs.size());
assertEquals(dedupedRecs.get(0).getPartitionPath(), recordThree.getPartitionPath());
assertNodupesWithinPartition(dedupedRecs);
// non-Global dedup should be done based on both recordKey and partitionPath