1
0

[HUDI-2763] Metadata table records - support for key deduplication based on hardcoded key field (#4449)

* [HUDI-2763] Metadata table records - support for key deduplication and virtual keys
- The backing log format for the metadata table is HFile, a key-value format.
Since the key field in the metadata record payload duplicates the
key in the Cell, the redundant key field in the record can be emptied
to save on storage cost.

- HoodieHFileWriter and HoodieHFileDataBlock now serialize records
with the key field emptied by default. The HFile writer checks whether
the record schema has the metadata payload field 'key' and, if so,
trims the key from the record payload before serializing it.

- HoodieHFileReader, when reading the serialized records back from disk,
materializes the missing key fields, if any. The HFile reader checks
whether the record schema has the metadata payload field 'key' and, if so,
materializes the key in the record payload.

- Tests have been added to verify the default virtual keys and key
   deduplication support for the metadata table records.

Co-authored-by: Vinoth Chandar <vinoth@apache.org>
This commit is contained in:
Manoj Govindassamy
2022-01-26 10:34:04 -08:00
committed by GitHub
parent dd4ce1bdfd
commit f87c47352a
17 changed files with 745 additions and 138 deletions

View File

@@ -87,7 +87,8 @@ public class HoodieFileWriterFactory {
BloomFilter filter = createBloomFilter(config);
HoodieHFileConfig hfileConfig = new HoodieHFileConfig(hoodieTable.getHadoopConf(),
config.getHFileCompressionAlgorithm(), config.getHFileBlockSize(), config.getHFileMaxFileSize(),
PREFETCH_ON_OPEN, CACHE_DATA_IN_L1, DROP_BEHIND_CACHE_COMPACTION, filter, HFILE_COMPARATOR);
HoodieHFileReader.KEY_FIELD_NAME, PREFETCH_ON_OPEN, CACHE_DATA_IN_L1, DROP_BEHIND_CACHE_COMPACTION,
filter, HFILE_COMPARATOR);
return new HoodieHFileWriter<>(instantTime, path, hfileConfig, schema, taskContextSupplier, config.populateMetaFields());
}

View File

@@ -43,9 +43,10 @@ public class HoodieHFileConfig {
private final Configuration hadoopConf;
private final BloomFilter bloomFilter;
private final KeyValue.KVComparator hfileComparator;
private final String keyFieldName;
public HoodieHFileConfig(Configuration hadoopConf, Compression.Algorithm compressionAlgorithm, int blockSize,
long maxFileSize, boolean prefetchBlocksOnOpen, boolean cacheDataInL1,
long maxFileSize, String keyFieldName, boolean prefetchBlocksOnOpen, boolean cacheDataInL1,
boolean dropBehindCacheCompaction, BloomFilter bloomFilter, KeyValue.KVComparator hfileComparator) {
this.hadoopConf = hadoopConf;
this.compressionAlgorithm = compressionAlgorithm;
@@ -56,6 +57,7 @@ public class HoodieHFileConfig {
this.dropBehindCacheCompaction = dropBehindCacheCompaction;
this.bloomFilter = bloomFilter;
this.hfileComparator = hfileComparator;
this.keyFieldName = keyFieldName;
}
public Configuration getHadoopConf() {
@@ -97,4 +99,8 @@ public class HoodieHFileConfig {
public KeyValue.KVComparator getHfileComparator() {
return hfileComparator;
}
public String getKeyFieldName() {
return keyFieldName;
}
}

View File

@@ -38,6 +38,8 @@ import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.io.hfile.HFileContext;
import org.apache.hadoop.hbase.io.hfile.HFileContextBuilder;
import org.apache.hadoop.io.Writable;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.StringUtils;
import java.io.DataInput;
import java.io.DataOutput;
@@ -63,6 +65,8 @@ public class HoodieHFileWriter<T extends HoodieRecordPayload, R extends IndexedR
private final String instantTime;
private final TaskContextSupplier taskContextSupplier;
private final boolean populateMetaFields;
private final Schema schema;
private final Option<Schema.Field> keyFieldSchema;
private HFile.Writer writer;
private String minRecordKey;
private String maxRecordKey;
@@ -77,6 +81,8 @@ public class HoodieHFileWriter<T extends HoodieRecordPayload, R extends IndexedR
this.file = HoodieWrapperFileSystem.convertToHoodiePath(file, conf);
this.fs = (HoodieWrapperFileSystem) this.file.getFileSystem(conf);
this.hfileConfig = hfileConfig;
this.schema = schema;
this.keyFieldSchema = Option.ofNullable(schema.getField(hfileConfig.getKeyFieldName()));
// TODO - compute this compression ratio dynamically by looking at the bytes written to the
// stream and the actual file size reported by HDFS
@@ -121,8 +127,25 @@ public class HoodieHFileWriter<T extends HoodieRecordPayload, R extends IndexedR
}
@Override
public void writeAvro(String recordKey, IndexedRecord object) throws IOException {
byte[] value = HoodieAvroUtils.avroToBytes((GenericRecord)object);
public void writeAvro(String recordKey, IndexedRecord record) throws IOException {
byte[] value = null;
boolean isRecordSerialized = false;
if (keyFieldSchema.isPresent()) {
GenericRecord keyExcludedRecord = (GenericRecord) record;
int keyFieldPos = this.keyFieldSchema.get().pos();
boolean isKeyAvailable = (record.get(keyFieldPos) != null && !(record.get(keyFieldPos).toString().isEmpty()));
if (isKeyAvailable) {
Object originalKey = keyExcludedRecord.get(keyFieldPos);
keyExcludedRecord.put(keyFieldPos, StringUtils.EMPTY_STRING);
value = HoodieAvroUtils.avroToBytes(keyExcludedRecord);
keyExcludedRecord.put(keyFieldPos, originalKey);
isRecordSerialized = true;
}
}
if (!isRecordSerialized) {
value = HoodieAvroUtils.avroToBytes((GenericRecord) record);
}
KeyValue kv = new KeyValue(recordKey.getBytes(), null, null, value);
writer.append(kv);

View File

@@ -96,7 +96,7 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta
// Virtual keys support for metadata table. This Field is
// from the metadata payload schema.
private static final String RECORD_KEY_FIELD = HoodieMetadataPayload.SCHEMA_FIELD_ID_KEY;
private static final String RECORD_KEY_FIELD_NAME = HoodieMetadataPayload.KEY_FIELD_NAME;
protected HoodieWriteConfig metadataWriteConfig;
protected HoodieWriteConfig dataWriteConfig;
@@ -217,8 +217,8 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta
// RecordKey properties are needed for the metadata table records
final Properties properties = new Properties();
properties.put(HoodieTableConfig.RECORDKEY_FIELDS.key(), RECORD_KEY_FIELD);
properties.put("hoodie.datasource.write.recordkey.field", RECORD_KEY_FIELD);
properties.put(HoodieTableConfig.RECORDKEY_FIELDS.key(), RECORD_KEY_FIELD_NAME);
properties.put("hoodie.datasource.write.recordkey.field", RECORD_KEY_FIELD_NAME);
builder.withProperties(properties);
if (writeConfig.isMetricsOn()) {
@@ -454,7 +454,7 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta
.setArchiveLogFolder(ARCHIVELOG_FOLDER.defaultValue())
.setPayloadClassName(HoodieMetadataPayload.class.getName())
.setBaseFileFormat(HoodieFileFormat.HFILE.toString())
.setRecordKeyFields(RECORD_KEY_FIELD)
.setRecordKeyFields(RECORD_KEY_FIELD_NAME)
.setPopulateMetaFields(dataWriteConfig.getMetadataConfig().populateMetaFields())
.setKeyGeneratorClassProp(HoodieTableMetadataKeyGenerator.class.getCanonicalName())
.initTable(hadoopConf.get(), metadataWriteConfig.getBasePath());

View File

@@ -42,7 +42,7 @@ public class HoodieTableMetadataKeyGenerator extends BaseKeyGenerator {
@Override
public String getRecordKey(GenericRecord record) {
return KeyGenUtils.getRecordKey(record, HoodieMetadataPayload.SCHEMA_FIELD_ID_KEY, isConsistentLogicalTimestampEnabled());
return KeyGenUtils.getRecordKey(record, HoodieMetadataPayload.KEY_FIELD_NAME, isConsistentLogicalTimestampEnabled());
}
@Override

View File

@@ -103,7 +103,7 @@ public class TestHoodieHFileReaderWriter {
String instantTime = "000";
HoodieHFileConfig hoodieHFileConfig = new HoodieHFileConfig(conf, Compression.Algorithm.GZ, 1024 * 1024, 120 * 1024 * 1024,
PREFETCH_ON_OPEN, CACHE_DATA_IN_L1, DROP_BEHIND_CACHE_COMPACTION, filter, HFILE_COMPARATOR);
HoodieHFileReader.KEY_FIELD_NAME, PREFETCH_ON_OPEN, CACHE_DATA_IN_L1, DROP_BEHIND_CACHE_COMPACTION, filter, HFILE_COMPARATOR);
return new HoodieHFileWriter(instantTime, filePath, hoodieHFileConfig, avroSchema, mockTaskContextSupplier, populateMetaFields);
}