1
0

[HUDI-2176, 2178, 2179] Adding virtual key support to COW table (#3306)

This commit is contained in:
Sivabalan Narayanan
2021-07-26 17:21:04 -04:00
committed by GitHub
parent 5353243449
commit 61148c1c43
57 changed files with 969 additions and 413 deletions

View File

@@ -244,6 +244,22 @@ public class HoodieAvroUtils {
return recordSchema;
}
/**
 * Projects the given file schema down to only the requested fields.
 *
 * @param fileSchema schema of the data file to project from
 * @param fields     names of the fields to retain (e.g. record key and partition path fields)
 * @return a new record schema named "HoodieRecordKey" containing only the matching fields,
 *         in the order they appear in {@code fileSchema}
 */
public static Schema getSchemaForFields(Schema fileSchema, List<String> fields) {
  Schema projectedSchema = Schema.createRecord("HoodieRecordKey", "", "", false);
  List<Schema.Field> projectedFields = new ArrayList<>();
  for (Schema.Field field : fileSchema.getFields()) {
    if (!fields.contains(field.name())) {
      continue;
    }
    // Schema.Field instances cannot be reused across schemas, so copy each matching field.
    projectedFields.add(new Schema.Field(field.name(), field.schema(), field.doc(), field.defaultValue()));
  }
  projectedSchema.setFields(projectedFields);
  return projectedSchema;
}
public static GenericRecord addHoodieKeyToRecord(GenericRecord record, String recordKey, String partitionPath,
String fileName) {
record.put(HoodieRecord.FILENAME_METADATA_FIELD, fileName);

View File

@@ -20,6 +20,7 @@ package org.apache.hudi.avro;
import org.apache.hudi.common.bloom.BloomFilter;
import org.apache.hudi.common.bloom.HoodieDynamicBoundedBloomFilter;
import org.apache.hudi.common.util.Option;
import org.apache.avro.Schema;
import org.apache.parquet.avro.AvroWriteSupport;
@@ -33,7 +34,7 @@ import java.util.HashMap;
*/
public class HoodieAvroWriteSupport extends AvroWriteSupport {
private BloomFilter bloomFilter;
private Option<BloomFilter> bloomFilterOpt;
private String minRecordKey;
private String maxRecordKey;
@@ -44,39 +45,41 @@ public class HoodieAvroWriteSupport extends AvroWriteSupport {
public static final String HOODIE_BLOOM_FILTER_TYPE_CODE = "hoodie_bloom_filter_type_code";
public static final String HOODIE_AVRO_SCHEMA_METADATA_KEY = "orc.avro.schema";
public HoodieAvroWriteSupport(MessageType schema, Schema avroSchema, BloomFilter bloomFilter) {
public HoodieAvroWriteSupport(MessageType schema, Schema avroSchema, Option<BloomFilter> bloomFilterOpt) {
super(schema, avroSchema);
this.bloomFilter = bloomFilter;
this.bloomFilterOpt = bloomFilterOpt;
}
@Override
public WriteSupport.FinalizedWriteContext finalizeWrite() {
HashMap<String, String> extraMetaData = new HashMap<>();
if (bloomFilter != null) {
extraMetaData.put(HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY, bloomFilter.serializeToString());
if (bloomFilterOpt.isPresent()) {
extraMetaData.put(HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY, bloomFilterOpt.get().serializeToString());
if (minRecordKey != null && maxRecordKey != null) {
extraMetaData.put(HOODIE_MIN_RECORD_KEY_FOOTER, minRecordKey);
extraMetaData.put(HOODIE_MAX_RECORD_KEY_FOOTER, maxRecordKey);
}
if (bloomFilter.getBloomFilterTypeCode().name().contains(HoodieDynamicBoundedBloomFilter.TYPE_CODE_PREFIX)) {
extraMetaData.put(HOODIE_BLOOM_FILTER_TYPE_CODE, bloomFilter.getBloomFilterTypeCode().name());
if (bloomFilterOpt.get().getBloomFilterTypeCode().name().contains(HoodieDynamicBoundedBloomFilter.TYPE_CODE_PREFIX)) {
extraMetaData.put(HOODIE_BLOOM_FILTER_TYPE_CODE, bloomFilterOpt.get().getBloomFilterTypeCode().name());
}
}
return new WriteSupport.FinalizedWriteContext(extraMetaData);
}
public void add(String recordKey) {
this.bloomFilter.add(recordKey);
if (minRecordKey != null) {
minRecordKey = minRecordKey.compareTo(recordKey) <= 0 ? minRecordKey : recordKey;
} else {
minRecordKey = recordKey;
}
if (bloomFilterOpt.isPresent()) {
this.bloomFilterOpt.get().add(recordKey);
if (minRecordKey != null) {
minRecordKey = minRecordKey.compareTo(recordKey) <= 0 ? minRecordKey : recordKey;
} else {
minRecordKey = recordKey;
}
if (maxRecordKey != null) {
maxRecordKey = maxRecordKey.compareTo(recordKey) >= 0 ? maxRecordKey : recordKey;
} else {
maxRecordKey = recordKey;
if (maxRecordKey != null) {
maxRecordKey = maxRecordKey.compareTo(recordKey) >= 0 ? maxRecordKey : recordKey;
} else {
maxRecordKey = recordKey;
}
}
}
}

View File

@@ -319,12 +319,6 @@ public class HoodieTableMetaClient implements Serializable {
* @param operationType operation type to be executed.
*/
public void validateTableProperties(Properties properties, WriteOperationType operationType) {
// disabling meta fields are allowed only for bulk_insert operation
if (!Boolean.parseBoolean((String) properties.getOrDefault(HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.key(), HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.defaultValue()))
&& operationType != WriteOperationType.BULK_INSERT) {
throw new HoodieException(HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.key() + " can only be disabled for " + WriteOperationType.BULK_INSERT
+ " operation");
}
// once meta fields are disabled, it cant be re-enabled for a given table.
if (!getTableConfig().populateMetaFields()
&& Boolean.parseBoolean((String) properties.getOrDefault(HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.key(), HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.defaultValue()))) {

View File

@@ -34,6 +34,7 @@ import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.keygen.BaseKeyGenerator;
public abstract class BaseFileUtils {
@@ -170,6 +171,15 @@ public abstract class BaseFileUtils {
*/
public abstract List<HoodieKey> fetchRecordKeyPartitionPath(Configuration configuration, Path filePath);
/**
 * Fetch {@link HoodieKey}s from the given data file.
 *
 * @param configuration configuration to build fs object
 * @param filePath The data file path
 * @param keyGeneratorOpt optional {@link BaseKeyGenerator}; when present, record key and
 *                        partition path are derived via the generator instead of being read
 *                        from the Hudi meta-field columns (virtual-key support)
 * @return {@link List} of {@link HoodieKey}s fetched from the data file
 */
public abstract List<HoodieKey> fetchRecordKeyPartitionPath(Configuration configuration, Path filePath, Option<BaseKeyGenerator> keyGeneratorOpt);
/**
* Read the Avro schema of the data file.
* @param configuration Configuration

View File

@@ -41,6 +41,8 @@ import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.exception.MetadataNotFoundException;
import org.apache.hudi.keygen.BaseKeyGenerator;
import org.apache.orc.OrcFile;
import org.apache.orc.OrcProto.UserMetadataItem;
import org.apache.orc.Reader;
@@ -109,6 +111,11 @@ public class OrcUtils extends BaseFileUtils {
return hoodieKeys;
}
/**
 * Key extraction via a {@link BaseKeyGenerator} (i.e. with Hudi meta fields disabled)
 * is not implemented for ORC yet; this override always throws.
 *
 * @throws HoodieIOException always
 */
@Override
public List<HoodieKey> fetchRecordKeyPartitionPath(Configuration configuration, Path filePath, Option<BaseKeyGenerator> keyGeneratorOpt) {
throw new HoodieIOException("UnsupportedOperation : Disabling meta fields not yet supported for Orc");
}
/**
* NOTE: This literally reads the entire file contents, thus should be used with caution.
*/

View File

@@ -24,6 +24,7 @@ import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.exception.MetadataNotFoundException;
import org.apache.hudi.keygen.BaseKeyGenerator;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
@@ -115,23 +116,36 @@ public class ParquetUtils extends BaseFileUtils {
*/
@Override
public List<HoodieKey> fetchRecordKeyPartitionPath(Configuration configuration, Path filePath) {
return fetchRecordKeyPartitionPathInternal(configuration, filePath, Option.empty());
}
private List<HoodieKey> fetchRecordKeyPartitionPathInternal(Configuration configuration, Path filePath, Option<BaseKeyGenerator> keyGeneratorOpt) {
List<HoodieKey> hoodieKeys = new ArrayList<>();
try {
if (!filePath.getFileSystem(configuration).exists(filePath)) {
return new ArrayList<>();
}
Configuration conf = new Configuration(configuration);
conf.addResource(FSUtils.getFs(filePath.toString(), conf).getConf());
Schema readSchema = HoodieAvroUtils.getRecordKeyPartitionPathSchema();
Schema readSchema = keyGeneratorOpt.map(keyGenerator -> {
List<String> fields = new ArrayList<>();
fields.addAll(keyGenerator.getRecordKeyFields());
fields.addAll(keyGenerator.getPartitionPathFields());
return HoodieAvroUtils.getSchemaForFields(readAvroSchema(conf, filePath), fields);
})
.orElse(HoodieAvroUtils.getRecordKeyPartitionPathSchema());
AvroReadSupport.setAvroReadSchema(conf, readSchema);
AvroReadSupport.setRequestedProjection(conf, readSchema);
ParquetReader reader = AvroParquetReader.builder(filePath).withConf(conf).build();
Object obj = reader.read();
while (obj != null) {
if (obj instanceof GenericRecord) {
String recordKey = ((GenericRecord) obj).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
String partitionPath = ((GenericRecord) obj).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString();
String recordKey = null;
String partitionPath = null;
if (keyGeneratorOpt.isPresent()) {
recordKey = keyGeneratorOpt.get().getRecordKey((GenericRecord) obj);
partitionPath = keyGeneratorOpt.get().getPartitionPath((GenericRecord) obj);
} else {
recordKey = ((GenericRecord) obj).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
partitionPath = ((GenericRecord) obj).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString();
}
hoodieKeys.add(new HoodieKey(recordKey, partitionPath));
obj = reader.read();
}
@@ -142,6 +156,19 @@ public class ParquetUtils extends BaseFileUtils {
return hoodieKeys;
}
/**
 * Fetch {@link HoodieKey}s from the given parquet file.
 *
 * @param configuration configuration to build fs object
 * @param filePath The parquet file path.
 * @param keyGeneratorOpt optional {@link BaseKeyGenerator}; when present, record key and
 *                        partition path are derived from the data columns via the generator,
 *                        otherwise the Hudi meta-field columns are read directly
 * @return {@link List} of {@link HoodieKey}s fetched from the parquet file
 */
@Override
public List<HoodieKey> fetchRecordKeyPartitionPath(Configuration configuration, Path filePath, Option<BaseKeyGenerator> keyGeneratorOpt) {
return fetchRecordKeyPartitionPathInternal(configuration, filePath, keyGeneratorOpt);
}
public ParquetMetadata readMetadata(Configuration conf, Path parquetFilePath) {
ParquetMetadata footer;
try {
@@ -223,7 +250,7 @@ public class ParquetUtils extends BaseFileUtils {
/**
* Returns the number of records in the parquet file.
*
* @param conf Configuration
* @param conf Configuration
* @param parquetFilePath path of the file
*/
@Override

View File

@@ -0,0 +1,81 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.keygen;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.exception.HoodieKeyException;
import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
import java.util.List;
import java.util.stream.Collectors;
/**
 * Base class for key generators: holds the configured record key and partition path
 * field names, plus the partition-path encoding flags, and assembles {@link HoodieKey}s
 * from the subclass-provided key/path extraction.
 */
public abstract class BaseKeyGenerator extends KeyGenerator {

  protected List<String> recordKeyFields;
  protected List<String> partitionPathFields;
  // Whether the partition path value should be URL-encoded before use.
  protected final boolean encodePartitionPath;
  // Whether partition folders use the Hive-style <column>=<value> format.
  protected final boolean hiveStylePartitioning;

  protected BaseKeyGenerator(TypedProperties config) {
    super(config);
    this.encodePartitionPath = config.getBoolean(KeyGeneratorOptions.URL_ENCODE_PARTITIONING_OPT_KEY.key(),
        Boolean.parseBoolean(KeyGeneratorOptions.URL_ENCODE_PARTITIONING_OPT_KEY.defaultValue()));
    this.hiveStylePartitioning = config.getBoolean(KeyGeneratorOptions.HIVE_STYLE_PARTITIONING_OPT_KEY.key(),
        Boolean.parseBoolean(KeyGeneratorOptions.HIVE_STYLE_PARTITIONING_OPT_KEY.defaultValue()));
  }

  /**
   * Generate a record key out of the provided generic record.
   */
  public abstract String getRecordKey(GenericRecord record);

  /**
   * Generate a partition path out of the provided generic record.
   */
  public abstract String getPartitionPath(GenericRecord record);

  /**
   * Generate a Hoodie key out of the provided generic record.
   *
   * @throws HoodieKeyException if either field-name list was never configured
   */
  @Override
  public final HoodieKey getKey(GenericRecord record) {
    if (getRecordKeyFields() == null || getPartitionPathFields() == null) {
      throw new HoodieKeyException("Unable to find field names for record key or partition path in cfg");
    }
    return new HoodieKey(getRecordKey(record), getPartitionPath(record));
  }

  @Override
  public final List<String> getRecordKeyFieldNames() {
    // For nested columns (e.g. "a.b.c"), only the top-level column name ("a") is returned.
    return getRecordKeyFields().stream()
        .map(fieldName -> {
          int dotIndex = fieldName.indexOf('.');
          return dotIndex > 0 ? fieldName.substring(0, dotIndex) : fieldName;
        })
        .collect(Collectors.toList());
  }

  public List<String> getRecordKeyFields() {
    return recordKeyFields;
  }

  public List<String> getPartitionPathFields() {
    return partitionPathFields;
  }
}

View File

@@ -0,0 +1,59 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.keygen;
import org.apache.hudi.ApiMaturityLevel;
import org.apache.hudi.PublicAPIClass;
import org.apache.hudi.PublicAPIMethod;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.avro.generic.GenericRecord;
import java.util.List;
/**
 * Abstract class to extend for plugging in extraction of {@link HoodieKey} from an Avro record.
 */
@PublicAPIClass(maturity = ApiMaturityLevel.STABLE)
public abstract class KeyGenerator implements KeyGeneratorInterface {

  // Write configs supplied at construction time, available to subclasses.
  protected TypedProperties config;

  protected KeyGenerator(TypedProperties config) {
    this.config = config;
  }

  /**
   * Generate a Hoodie Key out of provided generic record.
   */
  @PublicAPIMethod(maturity = ApiMaturityLevel.STABLE)
  public abstract HoodieKey getKey(GenericRecord record);

  /**
   * Used during bootstrap, to project out only the record key fields from bootstrap source dataset.
   *
   * Default implementation throws; key generators that support bootstrap must override it.
   *
   * @return list of field names, when concatenated make up the record key.
   */
  @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING)
  public List<String> getRecordKeyFieldNames() {
    throw new UnsupportedOperationException("Bootstrap not supported for key generator. "
        + "Please override this method in your custom key generator.");
  }
}

View File

@@ -0,0 +1,36 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.keygen;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.model.HoodieKey;
import java.io.Serializable;
import java.util.List;
/**
 * Represents the interface key generators need to adhere to.
 */
public interface KeyGeneratorInterface extends Serializable {

  /**
   * Generate a {@link HoodieKey} (record key + partition path) for the given record.
   */
  HoodieKey getKey(GenericRecord record);

  /**
   * Return the field names that, when concatenated, make up the record key.
   */
  List<String> getRecordKeyFieldNames();
}

View File

@@ -0,0 +1,59 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.keygen.constant;
import org.apache.hudi.common.config.ConfigClassProperty;
import org.apache.hudi.common.config.ConfigGroups;
import org.apache.hudi.common.config.ConfigProperty;
import org.apache.hudi.common.config.HoodieConfig;
@ConfigClassProperty(name = "Key Generator Options",
    groupName = ConfigGroups.Names.WRITE_CLIENT,
    description = "Hudi maintains keys (record key + partition path) "
        + "for uniquely identifying a particular record. "
        + "This config allows developers to setup the Key generator class that "
        + "will extract these out of incoming records.")
public class KeyGeneratorOptions extends HoodieConfig {

  public static final ConfigProperty<String> URL_ENCODE_PARTITIONING_OPT_KEY = ConfigProperty
      .key("hoodie.datasource.write.partitionpath.urlencode")
      .defaultValue("false")
      .withDocumentation("Should we url encode the partition path value, before creating the folder structure.");

  public static final ConfigProperty<String> HIVE_STYLE_PARTITIONING_OPT_KEY = ConfigProperty
      .key("hoodie.datasource.write.hive_style_partitioning")
      .defaultValue("false")
      .withDocumentation("Flag to indicate whether to use Hive style partitioning.\n"
          + "If set true, the names of partition folders follow <partition_column_name>=<partition_value> format.\n"
          + "By default false (the names of partition folders are only partition values)");

  public static final ConfigProperty<String> RECORDKEY_FIELD_OPT_KEY = ConfigProperty
      .key("hoodie.datasource.write.recordkey.field")
      .defaultValue("uuid")
      .withDocumentation("Record key field. Value to be used as the `recordKey` component of `HoodieKey`.\n"
          + "Actual value will be obtained by invoking .toString() on the field value. Nested fields can be specified using\n"
          + "the dot notation eg: `a.b.c`");

  public static final ConfigProperty<String> PARTITIONPATH_FIELD_OPT_KEY = ConfigProperty
      .key("hoodie.datasource.write.partitionpath.field")
      .defaultValue("partitionpath")
      .withDocumentation("Partition path field. Value to be used at the partitionPath component of HoodieKey. "
          // Fixed typo in user-facing documentation: "ontained" -> "obtained".
          + "Actual value obtained by invoking .toString()");
}

View File

@@ -0,0 +1,70 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.keygen.constant;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
 * Types of {@link org.apache.hudi.keygen.KeyGenerator}.
 */
public enum KeyGeneratorType {

  /**
   * Simple key generator, which takes names of fields to be used for recordKey and partitionPath as configs.
   */
  SIMPLE,

  /**
   * Complex key generator, which takes names of fields to be used for recordKey and partitionPath as configs.
   */
  COMPLEX,

  /**
   * Key generator, that relies on timestamps for partitioning field. Still picks record key by name.
   */
  TIMESTAMP,

  /**
   * This is a generic implementation type of KeyGenerator where users can configure record key as a single field or
   * a combination of fields. Similarly partition path can be configured to have multiple fields or only one field.
   * <p>
   * This KeyGenerator expects value for prop "hoodie.datasource.write.partitionpath.field" in a specific format.
   * For example:
   * properties.put("hoodie.datasource.write.partitionpath.field", "field1:PartitionKeyType1,field2:PartitionKeyType2").
   */
  CUSTOM,

  /**
   * Simple Key generator for unpartitioned Hive Tables.
   */
  NON_PARTITION,

  /**
   * Key generator for deletes using global indices.
   */
  GLOBAL_DELETE;

  /**
   * Returns the names of all key generator types, in declaration order.
   */
  public static List<String> getNames() {
    // Plain loop instead of stream().forEach() mutating an external list (side-effect anti-pattern).
    List<String> names = new ArrayList<>(values().length);
    for (KeyGeneratorType type : values()) {
      names.add(type.name());
    }
    return names;
  }
}