1
0

[HUDI-2656] Generalize HoodieIndex for flexible record data type (#3893)

Co-authored-by: Raymond Xu <2701446+xushiyan@users.noreply.github.com>
This commit is contained in:
Y Ethan Guo
2022-02-03 20:24:04 -08:00
committed by GitHub
parent 69dfcda116
commit b8601a9f58
105 changed files with 564 additions and 504 deletions

View File

@@ -19,6 +19,7 @@
package org.apache.hudi.common.fs.inline;
import org.apache.hudi.common.model.HoodieAvroRecord;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.testutils.FileSystemTestUtils;
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
@@ -144,7 +145,8 @@ public class TestParquetInLining {
List<HoodieRecord> hoodieRecords = dataGenerator.generateInsertsWithHoodieAvroPayload(commitTime, 10);
List<GenericRecord> toReturn = new ArrayList<>();
for (HoodieRecord record : hoodieRecords) {
toReturn.add((GenericRecord) record.getData().getInsertValue(HoodieTestDataGenerator.AVRO_SCHEMA).get());
toReturn.add((GenericRecord) ((HoodieAvroRecord) record).getData()
.getInsertValue(HoodieTestDataGenerator.AVRO_SCHEMA).get());
}
return toReturn;
}

View File

@@ -18,18 +18,10 @@
package org.apache.hudi.common.functional;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.io.compress.Compression;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieArchivedLogFile;
import org.apache.hudi.common.model.HoodieAvroRecord;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.model.HoodieRecord;
@@ -59,6 +51,16 @@ import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.ExternalSpillableMap;
import org.apache.hudi.exception.CorruptedLogFileException;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.io.compress.Compression;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.hadoop.util.counters.BenchmarkCounter;
import org.junit.jupiter.api.AfterAll;
@@ -585,7 +587,8 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
List<IndexedRecord> scannedRecords = new ArrayList<>();
for (HoodieRecord record : scanner) {
scannedRecords.add((IndexedRecord) record.getData().getInsertValue(schema).get());
scannedRecords.add((IndexedRecord)
((HoodieAvroRecord) record).getData().getInsertValue(schema).get());
}
assertEquals(scannedRecords.size(), allRecords.stream().mapToLong(Collection::size).sum(),

View File

@@ -44,7 +44,7 @@ public class TestHoodieRecord {
public void setUp() throws Exception {
final List<IndexedRecord> indexedRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1);
final List<HoodieRecord> hoodieRecords =
indexedRecords.stream().map(r -> new HoodieRecord(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"),
indexedRecords.stream().map(r -> new HoodieAvroRecord(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"),
new AvroBinaryTestPayload(Option.of((GenericRecord) r)))).collect(Collectors.toList());
hoodieRecord = hoodieRecords.get(0);
}

View File

@@ -23,6 +23,7 @@ import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.avro.model.HoodieCompactionPlan;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieAvroPayload;
import org.apache.hudi.common.model.HoodieAvroRecord;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodiePartitionMetadata;
@@ -510,7 +511,7 @@ public class HoodieTestDataGenerator implements AutoCloseable {
populateKeysBySchema(schemaStr, currSize + i, kp);
incrementNumExistingKeysBySchema(schemaStr);
try {
return new HoodieRecord(key, generateRandomValueAsPerSchema(schemaStr, key, instantTime, isFlattened));
return new HoodieAvroRecord(key, generateRandomValueAsPerSchema(schemaStr, key, instantTime, isFlattened));
} catch (IOException e) {
throw new HoodieIOException(e.getMessage(), e);
}
@@ -541,7 +542,7 @@ public class HoodieTestDataGenerator implements AutoCloseable {
List<HoodieRecord> copy = new ArrayList<>();
for (HoodieRecord r : origin) {
HoodieKey key = r.getKey();
HoodieRecord record = new HoodieRecord(key, generateRandomValue(key, instantTime));
HoodieRecord record = new HoodieAvroRecord(key, generateRandomValue(key, instantTime));
copy.add(record);
}
return copy;
@@ -553,7 +554,7 @@ public class HoodieTestDataGenerator implements AutoCloseable {
for (int i = 0; i < limit; i++) {
String partitionPath = partitionPaths[RAND.nextInt(partitionPaths.length)];
HoodieKey key = new HoodieKey(UUID.randomUUID().toString(), partitionPath);
HoodieRecord record = new HoodieRecord(key, generateAvroPayload(key, instantTime));
HoodieRecord record = new HoodieAvroRecord(key, generateAvroPayload(key, instantTime));
inserts.add(record);
KeyPartition kp = new KeyPartition();
@@ -568,7 +569,7 @@ public class HoodieTestDataGenerator implements AutoCloseable {
public List<HoodieRecord> generateUpdatesWithHoodieAvroPayload(String instantTime, List<HoodieRecord> baseRecords) {
List<HoodieRecord> updates = new ArrayList<>();
for (HoodieRecord baseRecord : baseRecords) {
HoodieRecord record = new HoodieRecord(baseRecord.getKey(), generateAvroPayload(baseRecord.getKey(), instantTime));
HoodieRecord record = new HoodieAvroRecord(baseRecord.getKey(), generateAvroPayload(baseRecord.getKey(), instantTime));
updates.add(record);
}
return updates;
@@ -596,11 +597,11 @@ public class HoodieTestDataGenerator implements AutoCloseable {
public HoodieRecord generateDeleteRecord(HoodieKey key) throws IOException {
RawTripTestPayload payload =
new RawTripTestPayload(Option.empty(), key.getRecordKey(), key.getPartitionPath(), null, true, 0L);
return new HoodieRecord(key, payload);
return new HoodieAvroRecord(key, payload);
}
public HoodieRecord generateUpdateRecord(HoodieKey key, String instantTime) throws IOException {
return new HoodieRecord(key, generateRandomValue(key, instantTime));
return new HoodieAvroRecord(key, generateRandomValue(key, instantTime));
}
public List<HoodieRecord> generateUpdates(String instantTime, List<HoodieRecord> baseRecords) throws IOException {
@@ -615,7 +616,7 @@ public class HoodieTestDataGenerator implements AutoCloseable {
public List<HoodieRecord> generateUpdatesWithTS(String instantTime, List<HoodieRecord> baseRecords, int ts) throws IOException {
List<HoodieRecord> updates = new ArrayList<>();
for (HoodieRecord baseRecord : baseRecords) {
HoodieRecord record = new HoodieRecord(baseRecord.getKey(),
HoodieRecord record = new HoodieAvroRecord(baseRecord.getKey(),
generateRandomValue(baseRecord.getKey(), instantTime, false, ts));
updates.add(record);
}
@@ -735,7 +736,7 @@ public class HoodieTestDataGenerator implements AutoCloseable {
logger.debug("key getting updated: " + kp.key.getRecordKey());
used.add(kp);
try {
return new HoodieRecord(kp.key, generateRandomValueAsPerSchema(schemaStr, kp.key, instantTime, false));
return new HoodieAvroRecord(kp.key, generateRandomValueAsPerSchema(schemaStr, kp.key, instantTime, false));
} catch (IOException e) {
throw new HoodieIOException(e.getMessage(), e);
}
@@ -801,7 +802,7 @@ public class HoodieTestDataGenerator implements AutoCloseable {
numExistingKeys--;
used.add(kp);
try {
result.add(new HoodieRecord(kp.key, generateRandomDeleteValue(kp.key, instantTime)));
result.add(new HoodieAvroRecord(kp.key, generateRandomDeleteValue(kp.key, instantTime)));
} catch (IOException e) {
throw new HoodieIOException(e.getMessage(), e);
}

View File

@@ -21,6 +21,7 @@ package org.apache.hudi.common.testutils;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.avro.MercifulJsonConverter;
import org.apache.hudi.common.model.HoodieAvroPayload;
import org.apache.hudi.common.model.HoodieAvroRecord;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
@@ -152,7 +153,7 @@ public final class SchemaTestUtil {
}
private static HoodieRecord convertToHoodieRecords(IndexedRecord iRecord, String key, String partitionPath) {
return new HoodieRecord<>(new HoodieKey(key, partitionPath),
return new HoodieAvroRecord<>(new HoodieKey(key, partitionPath),
new HoodieAvroPayload(Option.of((GenericRecord) iRecord)));
}
@@ -172,7 +173,7 @@ public final class SchemaTestUtil {
throws IOException, URISyntaxException {
List<IndexedRecord> iRecords = generateTestRecords(from, limit);
return iRecords.stream().map(r -> new HoodieRecord<>(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"),
return iRecords.stream().map(r -> new HoodieAvroRecord<>(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"),
new HoodieAvroPayload(Option.of((GenericRecord) r)))).collect(Collectors.toList());
}
@@ -180,9 +181,9 @@ public final class SchemaTestUtil {
Schema schema, String fieldNameToUpdate, String newValue) {
return oldRecords.stream().map(r -> {
try {
GenericRecord rec = (GenericRecord) r.getData().getInsertValue(schema).get();
GenericRecord rec = (GenericRecord) ((HoodieAvroRecord) r).getData().getInsertValue(schema).get();
rec.put(fieldNameToUpdate, newValue);
return new HoodieRecord<>(r.getKey(), new HoodieAvroPayload(Option.of(rec)));
return new HoodieAvroRecord<>(r.getKey(), new HoodieAvroPayload(Option.of(rec)));
} catch (IOException io) {
throw new HoodieIOException("unable to get data from hoodie record", io);
}

View File

@@ -19,6 +19,7 @@
package org.apache.hudi.common.testutils;
import org.apache.hudi.common.model.HoodieAvroPayload;
import org.apache.hudi.common.model.HoodieAvroRecord;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordLocation;
@@ -48,7 +49,7 @@ public class SpillableMapTestUtils {
String partitionPath = ((GenericRecord) r).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString();
recordKeys.add(key);
HoodieRecord record =
new HoodieRecord<>(new HoodieKey(key, partitionPath), new HoodieAvroPayload(Option.of((GenericRecord) r)));
new HoodieAvroRecord<>(new HoodieKey(key, partitionPath), new HoodieAvroPayload(Option.of((GenericRecord) r)));
record.unseal();
record.setCurrentLocation(new HoodieRecordLocation("DUMMY_COMMIT_TIME", "DUMMY_FILE_ID"));
record.seal();

View File

@@ -20,6 +20,7 @@ package org.apache.hudi.common.util.collection;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.common.model.HoodieAvroPayload;
import org.apache.hudi.common.model.HoodieAvroRecord;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
@@ -186,7 +187,7 @@ public class TestBitCaskDiskMap extends HoodieCommonTestHarness {
schema = SchemaTestUtil.getSimpleSchema();
List<IndexedRecord> indexedRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1);
hoodieRecords =
indexedRecords.stream().map(r -> new HoodieRecord<>(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"),
indexedRecords.stream().map(r -> new HoodieAvroRecord<>(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"),
new AvroBinaryTestPayload(Option.of((GenericRecord) r)))).collect(Collectors.toList());
payloadSize = SpillableMapUtils.computePayloadSize(hoodieRecords.remove(0), new HoodieRecordSizeEstimator(schema));
assertTrue(payloadSize > 0);
@@ -195,7 +196,7 @@ public class TestBitCaskDiskMap extends HoodieCommonTestHarness {
final Schema simpleSchemaWithMetadata = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema());
indexedRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1);
hoodieRecords = indexedRecords.stream()
.map(r -> new HoodieRecord<>(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"),
.map(r -> new HoodieAvroRecord<>(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"),
new AvroBinaryTestPayload(
Option.of(HoodieAvroUtils.rewriteRecord((GenericRecord) r, simpleSchemaWithMetadata)))))
.collect(Collectors.toList());
@@ -212,7 +213,7 @@ public class TestBitCaskDiskMap extends HoodieCommonTestHarness {
iRecords.forEach(r -> {
String key = ((GenericRecord) r).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
String partitionPath = ((GenericRecord) r).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString();
HoodieRecord value = new HoodieRecord<>(new HoodieKey(key, partitionPath), new HoodieAvroPayload(Option.of((GenericRecord) r)));
HoodieRecord value = new HoodieAvroRecord<>(new HoodieKey(key, partitionPath), new HoodieAvroPayload(Option.of((GenericRecord) r)));
recordMap.put(key, value);
});

View File

@@ -20,6 +20,7 @@ package org.apache.hudi.common.util.collection;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.common.model.HoodieAvroPayload;
import org.apache.hudi.common.model.HoodieAvroRecord;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
@@ -135,7 +136,7 @@ public class TestExternalSpillableMap extends HoodieCommonTestHarness {
updatedRecords.forEach(record -> {
HoodieRecord rec = records.get(((GenericRecord) record).get(HoodieRecord.RECORD_KEY_METADATA_FIELD));
try {
assertEquals(rec.getData().getInsertValue(schema).get(), record);
assertEquals(((HoodieAvroRecord) rec).getData().getInsertValue(schema).get(), record);
} catch (IOException io) {
throw new UncheckedIOException(io);
}
@@ -159,13 +160,13 @@ public class TestExternalSpillableMap extends HoodieCommonTestHarness {
IndexedRecord inMemoryRecord = iRecords.get(0);
String ikey = ((GenericRecord) inMemoryRecord).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
String iPartitionPath = ((GenericRecord) inMemoryRecord).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString();
HoodieRecord inMemoryHoodieRecord = new HoodieRecord<>(new HoodieKey(ikey, iPartitionPath),
HoodieRecord inMemoryHoodieRecord = new HoodieAvroRecord<>(new HoodieKey(ikey, iPartitionPath),
new HoodieAvroPayload(Option.of((GenericRecord) inMemoryRecord)));
IndexedRecord onDiskRecord = iRecords.get(99);
String dkey = ((GenericRecord) onDiskRecord).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
String dPartitionPath = ((GenericRecord) onDiskRecord).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString();
HoodieRecord onDiskHoodieRecord = new HoodieRecord<>(new HoodieKey(dkey, dPartitionPath),
HoodieRecord onDiskHoodieRecord = new HoodieAvroRecord<>(new HoodieKey(dkey, dPartitionPath),
new HoodieAvroPayload(Option.of((GenericRecord) onDiskRecord)));
// assert size
assert records.size() == 100;
@@ -241,7 +242,7 @@ public class TestExternalSpillableMap extends HoodieCommonTestHarness {
// Get a record from the in-Memory map
String key = recordKeys.get(0);
HoodieRecord record = records.get(key);
HoodieAvroRecord record = (HoodieAvroRecord) records.get(key);
List<IndexedRecord> recordsToUpdate = new ArrayList<>();
recordsToUpdate.add((IndexedRecord) record.getData().getInsertValue(schema).get());
@@ -259,7 +260,7 @@ public class TestExternalSpillableMap extends HoodieCommonTestHarness {
// Get a record from the disk based map
key = recordKeys.get(recordKeys.size() - 1);
record = records.get(key);
record = (HoodieAvroRecord) records.get(key);
recordsToUpdate = new ArrayList<>();
recordsToUpdate.add((IndexedRecord) record.getData().getInsertValue(schema).get());

View File

@@ -18,12 +18,9 @@
package org.apache.hudi.common.util.collection;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.common.model.HoodieAvroPayload;
import org.apache.hudi.common.model.HoodieAvroRecord;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
@@ -33,6 +30,9 @@ import org.apache.hudi.common.testutils.SchemaTestUtil;
import org.apache.hudi.common.testutils.SpillableMapTestUtils;
import org.apache.hudi.common.util.Option;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@@ -166,7 +166,7 @@ public class TestRocksDbDiskMap extends HoodieCommonTestHarness {
iRecords.forEach(r -> {
String key = ((GenericRecord) r).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
String partitionPath = ((GenericRecord) r).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString();
HoodieRecord value = new HoodieRecord<>(new HoodieKey(key, partitionPath), new HoodieAvroPayload(Option.of((GenericRecord) r)));
HoodieRecord value = new HoodieAvroRecord<>(new HoodieKey(key, partitionPath), new HoodieAvroPayload(Option.of((GenericRecord) r)));
recordMap.put(key, value);
});