1
0

[HUDI-2656] Generalize HoodieIndex for flexible record data type (#3893)

Co-authored-by: Raymond Xu <2701446+xushiyan@users.noreply.github.com>
This commit is contained in:
Y Ethan Guo
2022-02-03 20:24:04 -08:00
committed by GitHub
parent 69dfcda116
commit b8601a9f58
105 changed files with 564 additions and 504 deletions

View File

@@ -25,7 +25,6 @@ import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.data.HoodieData;
import org.apache.hudi.common.data.HoodieList;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.config.HoodieWriteConfig;
@@ -33,12 +32,12 @@ import org.apache.hudi.exception.HoodieIndexException;
import org.apache.hudi.table.HoodieTable;
import java.util.List;
import java.util.stream.Collectors;
/**
* Base flink implementation of {@link HoodieIndex}.
* @param <T> payload type
*/
public abstract class FlinkHoodieIndex<T extends HoodieRecordPayload> extends HoodieIndex<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> {
public abstract class FlinkHoodieIndex<T extends HoodieRecordPayload> extends HoodieIndex<List<HoodieRecord<T>>, List<WriteStatus>> {
protected FlinkHoodieIndex(HoodieWriteConfig config) {
super(config);
}
@@ -48,21 +47,22 @@ public abstract class FlinkHoodieIndex<T extends HoodieRecordPayload> extends Ho
@PublicAPIMethod(maturity = ApiMaturityLevel.DEPRECATED)
public abstract List<WriteStatus> updateLocation(List<WriteStatus> writeStatuses,
HoodieEngineContext context,
HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> hoodieTable) throws HoodieIndexException;
HoodieTable hoodieTable) throws HoodieIndexException;
@Override
@Deprecated
@PublicAPIMethod(maturity = ApiMaturityLevel.DEPRECATED)
public abstract List<HoodieRecord<T>> tagLocation(List<HoodieRecord<T>> records,
HoodieEngineContext context,
HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> hoodieTable) throws HoodieIndexException;
HoodieTable hoodieTable) throws HoodieIndexException;
@Override
@PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING)
public HoodieData<HoodieRecord<T>> tagLocation(
HoodieData<HoodieRecord<T>> records, HoodieEngineContext context,
public <R> HoodieData<HoodieRecord<R>> tagLocation(
HoodieData<HoodieRecord<R>> records, HoodieEngineContext context,
HoodieTable hoodieTable) throws HoodieIndexException {
return HoodieList.of(tagLocation(HoodieList.getList(records), context, hoodieTable));
List<HoodieRecord<T>> hoodieRecords = tagLocation(HoodieList.getList(records.map(record -> (HoodieRecord<T>) record)), context, hoodieTable);
return HoodieList.of(hoodieRecords.stream().map(r -> (HoodieRecord<R>) r).collect(Collectors.toList()));
}
@Override

View File

@@ -46,11 +46,11 @@ public final class FlinkHoodieIndexFactory {
// TODO more indexes to be added
switch (config.getIndexType()) {
case INMEMORY:
return new FlinkInMemoryStateIndex<>(context, config);
return new FlinkInMemoryStateIndex(context, config);
case BLOOM:
return new HoodieBloomIndex<>(config, ListBasedHoodieBloomIndexHelper.getInstance());
return new HoodieBloomIndex(config, ListBasedHoodieBloomIndexHelper.getInstance());
case SIMPLE:
return new HoodieSimpleIndex<>(config, Option.empty());
return new HoodieSimpleIndex(config, Option.empty());
default:
throw new HoodieIndexException("Unsupported index type " + config.getIndexType());
}

View File

@@ -22,9 +22,7 @@ import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.client.common.HoodieFlinkEngineContext;
import org.apache.hudi.common.data.HoodieData;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieIndexException;
import org.apache.hudi.index.HoodieIndex;
@@ -37,11 +35,8 @@ import java.util.List;
/**
* Hoodie index implementation backed by flink state.
*
* @param <T> type of payload
*/
public class FlinkInMemoryStateIndex<T extends HoodieRecordPayload<T>>
extends HoodieIndex<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> {
public class FlinkInMemoryStateIndex extends HoodieIndex<List<HoodieRecord>, List<WriteStatus>> {
private static final Logger LOG = LogManager.getLogger(FlinkInMemoryStateIndex.class);
@@ -50,8 +45,8 @@ public class FlinkInMemoryStateIndex<T extends HoodieRecordPayload<T>>
}
@Override
public HoodieData<HoodieRecord<T>> tagLocation(
HoodieData<HoodieRecord<T>> records, HoodieEngineContext context,
public <R> HoodieData<HoodieRecord<R>> tagLocation(
HoodieData<HoodieRecord<R>> records, HoodieEngineContext context,
HoodieTable hoodieTable) throws HoodieIndexException {
throw new UnsupportedOperationException("No need to tag location for FlinkInMemoryStateIndex");
}
@@ -88,4 +83,4 @@ public class FlinkInMemoryStateIndex<T extends HoodieRecordPayload<T>>
public boolean isImplicitWithStorage() {
return true;
}
}
}

View File

@@ -22,6 +22,7 @@ import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.data.HoodieList;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.EmptyHoodieRecordPayload;
import org.apache.hudi.common.model.HoodieAvroRecord;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.util.collection.Pair;
@@ -93,7 +94,7 @@ public class FlinkDeleteHelper<R> extends
}
List<HoodieRecord<EmptyHoodieRecordPayload>> dedupedRecords =
dedupedKeys.stream().map(key -> new HoodieRecord<>(key, new EmptyHoodieRecordPayload())).collect(Collectors.toList());
dedupedKeys.stream().map(key -> new HoodieAvroRecord<>(key, new EmptyHoodieRecordPayload())).collect(Collectors.toList());
Instant beginTag = Instant.now();
// perform index look up to get existing location of records
List<HoodieRecord<EmptyHoodieRecordPayload>> taggedRecords = HoodieList.getList(

View File

@@ -21,6 +21,7 @@ package org.apache.hudi.table.action.commit;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.data.HoodieList;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.HoodieAvroRecord;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieOperation;
import org.apache.hudi.common.model.HoodieRecord;
@@ -89,7 +90,7 @@ public class FlinkWriteHelper<T extends HoodieRecordPayload, R> extends BaseWrit
@Override
public List<HoodieRecord<T>> deduplicateRecords(
List<HoodieRecord<T>> records, HoodieIndex<T, ?, ?, ?> index, int parallelism) {
List<HoodieRecord<T>> records, HoodieIndex<?, ?> index, int parallelism) {
Map<Object, List<Pair<Object, HoodieRecord<T>>>> keyedRecords = records.stream().map(record -> {
// If index used is global, then records are expected to differ in their partitionPath
final Object key = record.getKey().getRecordKey();
@@ -107,7 +108,7 @@ public class FlinkWriteHelper<T extends HoodieRecordPayload, R> extends BaseWrit
boolean choosePrev = data1.equals(reducedData);
HoodieKey reducedKey = choosePrev ? rec1.getKey() : rec2.getKey();
HoodieOperation operation = choosePrev ? rec1.getOperation() : rec2.getOperation();
HoodieRecord<T> hoodieRecord = new HoodieRecord<>(reducedKey, reducedData, operation);
HoodieRecord<T> hoodieRecord = new HoodieAvroRecord<>(reducedKey, reducedData, operation);
// reuse the location from the first record.
hoodieRecord.setCurrentLocation(rec1.getCurrentLocation());
return hoodieRecord;

View File

@@ -23,6 +23,7 @@ import org.apache.hudi.common.bloom.BloomFilterFactory;
import org.apache.hudi.common.bloom.BloomFilterTypeCode;
import org.apache.hudi.common.data.HoodieList;
import org.apache.hudi.common.data.HoodieMapPair;
import org.apache.hudi.common.model.HoodieAvroRecord;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.table.HoodieTableMetaClient;
@@ -115,19 +116,19 @@ public class TestFlinkHoodieBloomIndex extends HoodieFlinkClientTestHarness {
RawTripTestPayload rowChange1 =
new RawTripTestPayload("{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
HoodieRecord record1 =
new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
RawTripTestPayload rowChange2 =
new RawTripTestPayload("{\"_row_key\":\"001\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
HoodieRecord record2 =
new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
RawTripTestPayload rowChange3 =
new RawTripTestPayload("{\"_row_key\":\"002\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
HoodieRecord record3 =
new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
RawTripTestPayload rowChange4 =
new RawTripTestPayload("{\"_row_key\":\"003\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
HoodieRecord record4 =
new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
List<String> partitions = asList("2016/01/21", "2016/04/01", "2015/03/12");
List<Pair<String, BloomIndexFileInfo>> filesList = index.loadColumnRangesFromFiles(partitions, context, hoodieTable);
@@ -212,16 +213,16 @@ public class TestFlinkHoodieBloomIndex extends HoodieFlinkClientTestHarness {
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":32}";
RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
HoodieRecord record1 =
new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
HoodieRecord record2 =
new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3);
HoodieRecord record3 =
new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4);
HoodieRecord record4 =
new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
// We write record1, record2 to a base file, but the bloom filter contains (record1,
// record2, record3).
@@ -286,16 +287,16 @@ public class TestFlinkHoodieBloomIndex extends HoodieFlinkClientTestHarness {
String recordStr4 = "{\"_row_key\":\"" + rowKey1 + "\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
HoodieRecord record1 =
new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
HoodieRecord record2 =
new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3);
HoodieRecord record3 =
new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4);
HoodieRecord record4 =
new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
List<HoodieRecord> records = asList(record1, record2, record3, record4);
// Also create the metadata and config
@@ -354,15 +355,15 @@ public class TestFlinkHoodieBloomIndex extends HoodieFlinkClientTestHarness {
+ "\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
HoodieKey key1 = new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath());
HoodieRecord record1 = new HoodieRecord(key1, rowChange1);
HoodieRecord record1 = new HoodieAvroRecord(key1, rowChange1);
RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
HoodieKey key2 = new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath());
HoodieRecord record2 = new HoodieRecord(key2, rowChange2);
HoodieRecord record2 = new HoodieAvroRecord(key2, rowChange2);
RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3);
HoodieKey key3 = new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath());
RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4);
HoodieKey key4 = new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath());
HoodieRecord record4 = new HoodieRecord(key4, rowChange4);
HoodieRecord record4 = new HoodieAvroRecord(key4, rowChange4);
List<HoodieKey> keys = asList(key1, key2, key3, key4);
// Also create the metadata and config
@@ -373,7 +374,7 @@ public class TestFlinkHoodieBloomIndex extends HoodieFlinkClientTestHarness {
// Let's tag
HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, ListBasedHoodieBloomIndexHelper.getInstance());
List<HoodieRecord> toTagRecords = new ArrayList<>();
toTagRecords.add(new HoodieRecord(record4.getKey(), null));
toTagRecords.add(new HoodieAvroRecord(record4.getKey(), null));
List<HoodieRecord> taggedRecords = tagLocation(bloomIndex, toTagRecords, hoodieTable);
Map<HoodieKey, Option<Pair<String, String>>> recordLocations = new HashMap<>();
for (HoodieRecord taggedRecord : taggedRecords) {
@@ -396,7 +397,7 @@ public class TestFlinkHoodieBloomIndex extends HoodieFlinkClientTestHarness {
hoodieTable = HoodieFlinkTable.create(config, context, metaClient);
List<HoodieRecord> toTagRecords1 = new ArrayList<>();
for (HoodieKey key : keys) {
taggedRecords.add(new HoodieRecord(key, null));
taggedRecords.add(new HoodieAvroRecord(key, null));
}
taggedRecords = tagLocation(bloomIndex, toTagRecords1, hoodieTable);
@@ -436,9 +437,9 @@ public class TestFlinkHoodieBloomIndex extends HoodieFlinkClientTestHarness {
// We write record1 to a base file, using a bloom filter having both records
RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
HoodieRecord record1 = new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
HoodieRecord record2 = new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
BloomFilter filter = BloomFilterFactory.createBloomFilter(10000, 0.0000001, -1, BloomFilterTypeCode.SIMPLE.name());
filter.add(record2.getRecordKey());

View File

@@ -27,6 +27,7 @@ import org.apache.hudi.common.bloom.BloomFilterTypeCode;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordLocation;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.log.HoodieLogFormat;
import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock;
@@ -132,7 +133,7 @@ public class HoodieFlinkWriteableTestTable extends HoodieWriteableTestTable {
header.put(HeaderMetadataType.SCHEMA, schema.toString());
logWriter.appendBlock(new HoodieAvroDataBlock(groupedRecords.stream().map(r -> {
try {
GenericRecord val = (GenericRecord) r.getData().getInsertValue(schema).get();
GenericRecord val = (GenericRecord) ((HoodieRecordPayload) r.getData()).getInsertValue(schema).get();
HoodieAvroUtils.addHoodieKeyToRecord(val, r.getRecordKey(), r.getPartitionPath(), "");
return (IndexedRecord) val;
} catch (IOException e) {