1
0

[HUDI-2593] Virtual keys support for metadata table (#3968)

- Metadata table today has virtual keys disabled, thereby populating the metafields
  for each record written out and increasing the overall storage space used. Hereby
  adding virtual keys support for metadata table so that metafields are disabled
  for metadata table records.

- Adding a custom KeyGenerator for Metadata table so as to not rely on the
  default Base/SimpleKeyGenerators which currently look for record key
  and partition field set in the table config.

- AbstractHoodieLogRecordReader's version of processing next data block and
  createHoodieRecord() will be a generic version and making the derived class
  HoodieMetadataMergedLogRecordReader take care of the special creation of
  records from explicitly passed-in partition names.
This commit is contained in:
Manoj Govindassamy
2021-11-19 15:11:29 -08:00
committed by GitHub
parent eba354e922
commit 459b34240b
28 changed files with 423 additions and 123 deletions

View File

@@ -360,7 +360,12 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload, I, K, O> extends
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, writeSchemaWithMetaFields.toString());
List<HoodieLogBlock> blocks = new ArrayList<>(2);
if (recordList.size() > 0) {
blocks.add(HoodieDataBlock.getBlock(hoodieTable.getLogDataBlockFormat(), recordList, header));
if (config.populateMetaFields()) {
blocks.add(HoodieDataBlock.getBlock(hoodieTable.getLogDataBlockFormat(), recordList, header));
} else {
final String keyField = hoodieTable.getMetaClient().getTableConfig().getRecordKeyFieldProp();
blocks.add(HoodieDataBlock.getBlock(hoodieTable.getLogDataBlockFormat(), recordList, header, keyField));
}
}
if (keysToDelete.size() > 0) {
blocks.add(new HoodieDeleteBlock(keysToDelete.toArray(new HoodieKey[keysToDelete.size()]), header));

View File

@@ -41,6 +41,7 @@ import org.apache.hudi.common.model.HoodiePartitionMetadata;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.model.WriteConcurrencyMode;
import org.apache.hudi.common.table.HoodieTableConfig;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.log.HoodieLogFormat;
import org.apache.hudi.common.table.log.block.HoodieDeleteBlock;
@@ -76,6 +77,7 @@ import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.stream.Collectors;
import static org.apache.hudi.common.table.HoodieTableConfig.ARCHIVELOG_FOLDER;
@@ -91,6 +93,10 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta
private static final Logger LOG = LogManager.getLogger(HoodieBackedTableMetadataWriter.class);
// Virtual keys support for metadata table. This Field is
// from the metadata payload schema.
private static final String RECORD_KEY_FIELD = HoodieMetadataPayload.SCHEMA_FIELD_ID_KEY;
protected HoodieWriteConfig metadataWriteConfig;
protected HoodieWriteConfig dataWriteConfig;
protected String tableName;
@@ -202,7 +208,15 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta
.withDeleteParallelism(parallelism)
.withRollbackParallelism(parallelism)
.withFinalizeWriteParallelism(parallelism)
.withAllowMultiWriteOnSameInstant(true);
.withAllowMultiWriteOnSameInstant(true)
.withKeyGenerator(HoodieTableMetadataKeyGenerator.class.getCanonicalName())
.withPopulateMetaFields(dataWriteConfig.getMetadataConfig().populateMetaFields());
// RecordKey properties are needed for the metadata table records
final Properties properties = new Properties();
properties.put(HoodieTableConfig.RECORDKEY_FIELDS.key(), RECORD_KEY_FIELD);
properties.put("hoodie.datasource.write.recordkey.field", RECORD_KEY_FIELD);
builder.withProperties(properties);
if (writeConfig.isMetricsOn()) {
builder.withMetricsConfig(HoodieMetricsConfig.newBuilder()
@@ -395,9 +409,12 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta
.setTableType(HoodieTableType.MERGE_ON_READ)
.setTableName(tableName)
.setArchiveLogFolder(ARCHIVELOG_FOLDER.defaultValue())
.setPayloadClassName(HoodieMetadataPayload.class.getName())
.setBaseFileFormat(HoodieFileFormat.HFILE.toString())
.initTable(hadoopConf.get(), metadataWriteConfig.getBasePath());
.setPayloadClassName(HoodieMetadataPayload.class.getName())
.setBaseFileFormat(HoodieFileFormat.HFILE.toString())
.setRecordKeyFields(RECORD_KEY_FIELD)
.setPopulateMetaFields(dataWriteConfig.getMetadataConfig().populateMetaFields())
.setKeyGeneratorClassProp(HoodieTableMetadataKeyGenerator.class.getCanonicalName())
.initTable(hadoopConf.get(), metadataWriteConfig.getBasePath());
initTableMetadata();
initializeFileGroups(dataMetaClient, MetadataPartitionType.FILES, createInstantTime, 1);

View File

@@ -0,0 +1,52 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hudi.metadata;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.keygen.BaseKeyGenerator;
import org.apache.hudi.keygen.KeyGenUtils;
/**
* Custom key generator for the Hoodie table metadata. The metadata table record payload
* has an internal schema with a known key field HoodieMetadataPayload.SCHEMA_FIELD_ID_KEY.
* With or without the virtual keys, getting the key from the metadata table record is always
* via the above field and there is no real need for a key generator. But, when a write
* client is instantiated for the metadata table, when virtual keys are enabled, and when
* key generator class is not configured, the default SimpleKeyGenerator will be used.
* To avoid using any other key generators for the metadata table which rely on certain
* config properties, we need this custom key generator exclusively for the metadata table.
*/
public class HoodieTableMetadataKeyGenerator extends BaseKeyGenerator {

  public HoodieTableMetadataKeyGenerator(TypedProperties props) {
    super(props);
  }

  /**
   * Extracts the record key from the well-known metadata payload key field
   * ({@code HoodieMetadataPayload.SCHEMA_FIELD_ID_KEY}), regardless of any
   * key-field configuration supplied to the writer.
   */
  @Override
  public String getRecordKey(GenericRecord genericRecord) {
    return KeyGenUtils.getRecordKey(genericRecord, HoodieMetadataPayload.SCHEMA_FIELD_ID_KEY);
  }

  /**
   * Metadata table records carry no payload-derived partition path; the
   * partition path is always the empty string.
   */
  @Override
  public String getPartitionPath(GenericRecord genericRecord) {
    return "";
  }
}

View File

@@ -346,7 +346,8 @@ public class HoodieTimelineArchiveLog<T extends HoodieAvroPayload, I, K, O> {
if (records.size() > 0) {
Map<HeaderMetadataType, String> header = new HashMap<>();
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, wrapperSchema.toString());
HoodieAvroDataBlock block = new HoodieAvroDataBlock(records, header);
final String keyField = table.getMetaClient().getTableConfig().getRecordKeyFieldProp();
HoodieAvroDataBlock block = new HoodieAvroDataBlock(records, header, keyField);
writer.appendBlock(block);
records.clear();
}

View File

@@ -180,6 +180,7 @@ public abstract class HoodieCompactor<T extends HoodieRecordPayload, I, K, O> im
.withDiskMapType(config.getCommonConfig().getSpillableDiskMapType())
.withBitCaskDiskMapCompressionEnabled(config.getCommonConfig().isBitCaskDiskMapCompressionEnabled())
.withOperationField(config.allowOperationMetadataField())
.withPartition(operation.getPartitionPath())
.build();
if (!scanner.iterator().hasNext()) {
scanner.close();