DeduplicateRecords based on recordKey if global index is used
This commit is contained in:
committed by
vinoth chandar
parent
123da020e2
commit
291a88ba94
@@ -17,6 +17,7 @@
|
|||||||
package com.uber.hoodie;
|
package com.uber.hoodie;
|
||||||
|
|
||||||
import com.codahale.metrics.Timer;
|
import com.codahale.metrics.Timer;
|
||||||
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
import com.google.common.base.Preconditions;
|
import com.google.common.base.Preconditions;
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
import com.uber.hoodie.avro.model.HoodieCleanMetadata;
|
import com.uber.hoodie.avro.model.HoodieCleanMetadata;
|
||||||
@@ -111,10 +112,16 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
*/
|
*/
|
||||||
public HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig,
|
public HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig,
|
||||||
boolean rollbackInFlight) {
|
boolean rollbackInFlight) {
|
||||||
|
this(jsc, clientConfig, rollbackInFlight, HoodieIndex.createIndex(clientConfig, jsc));
|
||||||
|
}
|
||||||
|
|
||||||
|
@VisibleForTesting
|
||||||
|
HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig,
|
||||||
|
boolean rollbackInFlight, HoodieIndex index) {
|
||||||
this.fs = FSUtils.getFs(clientConfig.getBasePath(), jsc.hadoopConfiguration());
|
this.fs = FSUtils.getFs(clientConfig.getBasePath(), jsc.hadoopConfiguration());
|
||||||
this.jsc = jsc;
|
this.jsc = jsc;
|
||||||
this.config = clientConfig;
|
this.config = clientConfig;
|
||||||
this.index = HoodieIndex.createIndex(config, jsc);
|
this.index = index;
|
||||||
this.metrics = new HoodieMetrics(config, config.getTableName());
|
this.metrics = new HoodieMetrics(config, config.getTableName());
|
||||||
|
|
||||||
if (rollbackInFlight) {
|
if (rollbackInFlight) {
|
||||||
@@ -1051,10 +1058,16 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
/**
|
/**
|
||||||
* Deduplicate Hoodie records, using the given deduplication funciton.
|
* Deduplicate Hoodie records, using the given deduplication funciton.
|
||||||
*/
|
*/
|
||||||
private JavaRDD<HoodieRecord<T>> deduplicateRecords(JavaRDD<HoodieRecord<T>> records,
|
JavaRDD<HoodieRecord<T>> deduplicateRecords(JavaRDD<HoodieRecord<T>> records,
|
||||||
int parallelism) {
|
int parallelism) {
|
||||||
|
boolean isIndexingGlobal = index.isGlobal();
|
||||||
return records
|
return records
|
||||||
.mapToPair(record -> new Tuple2<>(record.getKey(), record))
|
.mapToPair(record -> {
|
||||||
|
HoodieKey hoodieKey = record.getKey();
|
||||||
|
// If index used is global, then records are expected to differ in their partitionPath
|
||||||
|
Object key = isIndexingGlobal ? hoodieKey.getRecordKey() : hoodieKey;
|
||||||
|
return new Tuple2<>(key, record);
|
||||||
|
})
|
||||||
.reduceByKey((rec1, rec2) -> {
|
.reduceByKey((rec1, rec2) -> {
|
||||||
@SuppressWarnings("unchecked")
|
@SuppressWarnings("unchecked")
|
||||||
T reducedData = (T) rec1.getData().preCombine(rec2.getData());
|
T reducedData = (T) rec1.getData().preCombine(rec2.getData());
|
||||||
|
|||||||
@@ -20,6 +20,8 @@ import static org.junit.Assert.assertEquals;
|
|||||||
import static org.junit.Assert.assertFalse;
|
import static org.junit.Assert.assertFalse;
|
||||||
import static org.junit.Assert.assertTrue;
|
import static org.junit.Assert.assertTrue;
|
||||||
import static org.junit.Assert.fail;
|
import static org.junit.Assert.fail;
|
||||||
|
import static org.mockito.Mockito.mock;
|
||||||
|
import static org.mockito.Mockito.when;
|
||||||
|
|
||||||
import com.google.common.collect.Iterables;
|
import com.google.common.collect.Iterables;
|
||||||
import com.uber.hoodie.common.HoodieCleanStat;
|
import com.uber.hoodie.common.HoodieCleanStat;
|
||||||
@@ -48,13 +50,13 @@ import com.uber.hoodie.config.HoodieStorageConfig;
|
|||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
import com.uber.hoodie.exception.HoodieRollbackException;
|
import com.uber.hoodie.exception.HoodieRollbackException;
|
||||||
import com.uber.hoodie.index.HoodieIndex;
|
import com.uber.hoodie.index.HoodieIndex;
|
||||||
import com.uber.hoodie.table.HoodieCopyOnWriteTable;
|
|
||||||
import com.uber.hoodie.table.HoodieTable;
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileInputStream;
|
import java.io.FileInputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
@@ -65,6 +67,7 @@ import java.util.Map;
|
|||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.TreeSet;
|
import java.util.TreeSet;
|
||||||
|
import java.util.UUID;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import org.apache.avro.generic.GenericRecord;
|
import org.apache.avro.generic.GenericRecord;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
@@ -218,6 +221,36 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
testUpsertsInternal(getConfig());
|
testUpsertsInternal(getConfig());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testDeduplication() throws Exception {
|
||||||
|
String newCommitTime = "001";
|
||||||
|
|
||||||
|
String recordKey = UUID.randomUUID().toString();
|
||||||
|
HoodieKey keyOne = new HoodieKey(recordKey, "2018-01-01");
|
||||||
|
HoodieRecord recordOne = new HoodieRecord(keyOne,
|
||||||
|
HoodieTestDataGenerator.generateRandomValue(keyOne, newCommitTime));
|
||||||
|
|
||||||
|
HoodieKey keyTwo = new HoodieKey(recordKey, "2018-02-01");
|
||||||
|
HoodieRecord recordTwo = new HoodieRecord(keyTwo,
|
||||||
|
HoodieTestDataGenerator.generateRandomValue(keyTwo, newCommitTime));
|
||||||
|
|
||||||
|
JavaRDD<HoodieRecord> records = jsc.parallelize(Arrays.asList(recordOne, recordTwo), 1);
|
||||||
|
|
||||||
|
// dedup should be done based on recordKey only
|
||||||
|
HoodieWriteClient clientWithDummyGlobalIndex = getWriteClientWithDummyIndex(true);
|
||||||
|
assertEquals(1, clientWithDummyGlobalIndex.deduplicateRecords(records, 1).collect().size());
|
||||||
|
|
||||||
|
// dedup should be done based on both recordKey and partitionPath
|
||||||
|
HoodieWriteClient clientWithDummyNonGlobalIndex = getWriteClientWithDummyIndex(false);
|
||||||
|
assertEquals(2, clientWithDummyNonGlobalIndex.deduplicateRecords(records, 1).collect().size());
|
||||||
|
}
|
||||||
|
|
||||||
|
private HoodieWriteClient getWriteClientWithDummyIndex(final boolean isGlobal) throws Exception {
|
||||||
|
HoodieIndex index = mock(HoodieIndex.class);
|
||||||
|
when(index.isGlobal()).thenReturn(isGlobal);
|
||||||
|
return new HoodieWriteClient(jsc, getConfigBuilder().build(), false, index);
|
||||||
|
}
|
||||||
|
|
||||||
private void testUpsertsInternal(HoodieWriteConfig hoodieWriteConfig) throws Exception {
|
private void testUpsertsInternal(HoodieWriteConfig hoodieWriteConfig) throws Exception {
|
||||||
HoodieWriteClient client = new HoodieWriteClient(jsc, hoodieWriteConfig);
|
HoodieWriteClient client = new HoodieWriteClient(jsc, hoodieWriteConfig);
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user