[HUDI-2902] Fixing populate meta fields with Hfile writers and Disabling virtual keys by default for metadata table (#4194)
This commit is contained in:
committed by
GitHub
parent
ca427240c0
commit
e483f7c776
@@ -27,6 +27,7 @@ import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieUpsertException;
|
||||
import org.apache.hudi.keygen.BaseKeyGenerator;
|
||||
import org.apache.hudi.keygen.KeyGenUtils;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
@@ -72,7 +73,7 @@ public class HoodieSortedMergeHandle<T extends HoodieRecordPayload, I, K, O> ext
|
||||
*/
|
||||
@Override
|
||||
public void write(GenericRecord oldRecord) {
|
||||
String key = oldRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
|
||||
String key = KeyGenUtils.getRecordKeyFromGenericRecord(oldRecord, keyGeneratorOpt);
|
||||
|
||||
// To maintain overall sorted order across updates and inserts, write any new inserts whose keys are less than
|
||||
// the oldRecord's key.
|
||||
|
||||
@@ -89,7 +89,7 @@ public class HoodieFileWriterFactory {
|
||||
config.getHFileCompressionAlgorithm(), config.getHFileBlockSize(), config.getHFileMaxFileSize(),
|
||||
PREFETCH_ON_OPEN, CACHE_DATA_IN_L1, DROP_BEHIND_CACHE_COMPACTION, filter, HFILE_COMPARATOR);
|
||||
|
||||
return new HoodieHFileWriter<>(instantTime, path, hfileConfig, schema, taskContextSupplier);
|
||||
return new HoodieHFileWriter<>(instantTime, path, hfileConfig, schema, taskContextSupplier, config.populateMetaFields());
|
||||
}
|
||||
|
||||
private static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieFileWriter<R> newOrcFileWriter(
|
||||
|
||||
@@ -62,6 +62,7 @@ public class HoodieHFileWriter<T extends HoodieRecordPayload, R extends IndexedR
|
||||
private final long maxFileSize;
|
||||
private final String instantTime;
|
||||
private final TaskContextSupplier taskContextSupplier;
|
||||
private final boolean populateMetaFields;
|
||||
private HFile.Writer writer;
|
||||
private String minRecordKey;
|
||||
private String maxRecordKey;
|
||||
@@ -70,7 +71,7 @@ public class HoodieHFileWriter<T extends HoodieRecordPayload, R extends IndexedR
|
||||
private static String DROP_BEHIND_CACHE_COMPACTION_KEY = "hbase.hfile.drop.behind.compaction";
|
||||
|
||||
public HoodieHFileWriter(String instantTime, Path file, HoodieHFileConfig hfileConfig, Schema schema,
|
||||
TaskContextSupplier taskContextSupplier) throws IOException {
|
||||
TaskContextSupplier taskContextSupplier, boolean populateMetaFields) throws IOException {
|
||||
|
||||
Configuration conf = FSUtils.registerFileSystem(file, hfileConfig.getHadoopConf());
|
||||
this.file = HoodieWrapperFileSystem.convertToHoodiePath(file, conf);
|
||||
@@ -84,6 +85,7 @@ public class HoodieHFileWriter<T extends HoodieRecordPayload, R extends IndexedR
|
||||
this.maxFileSize = hfileConfig.getMaxFileSize();
|
||||
this.instantTime = instantTime;
|
||||
this.taskContextSupplier = taskContextSupplier;
|
||||
this.populateMetaFields = populateMetaFields;
|
||||
|
||||
HFileContext context = new HFileContextBuilder().withBlockSize(hfileConfig.getBlockSize())
|
||||
.withCompression(hfileConfig.getCompressionAlgorithm())
|
||||
@@ -104,9 +106,13 @@ public class HoodieHFileWriter<T extends HoodieRecordPayload, R extends IndexedR
|
||||
|
||||
@Override
|
||||
public void writeAvroWithMetadata(R avroRecord, HoodieRecord record) throws IOException {
|
||||
prepRecordWithMetadata(avroRecord, record, instantTime,
|
||||
taskContextSupplier.getPartitionIdSupplier().get(), recordIndex, file.getName());
|
||||
writeAvro(record.getRecordKey(), (IndexedRecord) avroRecord);
|
||||
if (populateMetaFields) {
|
||||
prepRecordWithMetadata(avroRecord, record, instantTime,
|
||||
taskContextSupplier.getPartitionIdSupplier().get(), recordIndex, file.getName());
|
||||
writeAvro(record.getRecordKey(), (IndexedRecord) avroRecord);
|
||||
} else {
|
||||
writeAvro(record.getRecordKey(), (IndexedRecord) avroRecord);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
||||
@@ -22,6 +22,9 @@ import org.apache.hudi.common.bloom.BloomFilter;
|
||||
import org.apache.hudi.common.bloom.BloomFilterFactory;
|
||||
import org.apache.hudi.common.bloom.BloomFilterTypeCode;
|
||||
import org.apache.hudi.common.engine.TaskContextSupplier;
|
||||
import org.apache.hudi.common.model.EmptyHoodieRecordPayload;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.generic.GenericData;
|
||||
@@ -34,19 +37,24 @@ import org.apache.hadoop.hbase.io.hfile.CacheConfig;
|
||||
import org.apache.hadoop.hbase.util.Pair;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.io.TempDir;
|
||||
import org.junit.jupiter.params.ParameterizedTest;
|
||||
import org.junit.jupiter.params.provider.Arguments;
|
||||
import org.junit.jupiter.params.provider.MethodSource;
|
||||
import org.mockito.Mockito;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.function.Supplier;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import static org.apache.hudi.common.testutils.FileSystemTestUtils.RANDOM;
|
||||
import static org.apache.hudi.common.testutils.SchemaTestUtil.getSchemaFromResource;
|
||||
@@ -55,6 +63,9 @@ import static org.apache.hudi.io.storage.HoodieHFileConfig.DROP_BEHIND_CACHE_COM
|
||||
import static org.apache.hudi.io.storage.HoodieHFileConfig.HFILE_COMPARATOR;
|
||||
import static org.apache.hudi.io.storage.HoodieHFileConfig.PREFETCH_ON_OPEN;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
import static org.junit.jupiter.api.Assertions.assertNull;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
public class TestHoodieHFileReaderWriter {
|
||||
@TempDir File tempDir;
|
||||
@@ -73,21 +84,34 @@ public class TestHoodieHFileReaderWriter {
|
||||
}
|
||||
}
|
||||
|
||||
private HoodieHFileWriter createHFileWriter(Schema avroSchema) throws Exception {
|
||||
private static Stream<Arguments> populateMetaFieldsAndTestAvroWithMeta() {
|
||||
return Arrays.stream(new Boolean[][] {
|
||||
{true, true},
|
||||
{false, true},
|
||||
{true, false},
|
||||
{false, false}
|
||||
}).map(Arguments::of);
|
||||
}
|
||||
|
||||
private HoodieHFileWriter createHFileWriter(Schema avroSchema, boolean populateMetaFields) throws Exception {
|
||||
BloomFilter filter = BloomFilterFactory.createBloomFilter(1000, 0.00001, -1, BloomFilterTypeCode.SIMPLE.name());
|
||||
Configuration conf = new Configuration();
|
||||
TaskContextSupplier mockTaskContextSupplier = Mockito.mock(TaskContextSupplier.class);
|
||||
Supplier<Integer> partitionSupplier = Mockito.mock(Supplier.class);
|
||||
when(mockTaskContextSupplier.getPartitionIdSupplier()).thenReturn(partitionSupplier);
|
||||
when(partitionSupplier.get()).thenReturn(10);
|
||||
String instantTime = "000";
|
||||
|
||||
HoodieHFileConfig hoodieHFileConfig = new HoodieHFileConfig(conf, Compression.Algorithm.GZ, 1024 * 1024, 120 * 1024 * 1024,
|
||||
PREFETCH_ON_OPEN, CACHE_DATA_IN_L1, DROP_BEHIND_CACHE_COMPACTION, filter, HFILE_COMPARATOR);
|
||||
return new HoodieHFileWriter(instantTime, filePath, hoodieHFileConfig, avroSchema, mockTaskContextSupplier);
|
||||
return new HoodieHFileWriter(instantTime, filePath, hoodieHFileConfig, avroSchema, mockTaskContextSupplier, populateMetaFields);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testWriteReadHFile() throws Exception {
|
||||
Schema avroSchema = getSchemaFromResource(TestHoodieOrcReaderWriter.class, "/exampleSchema.avsc");
|
||||
HoodieHFileWriter writer = createHFileWriter(avroSchema);
|
||||
@ParameterizedTest
|
||||
@MethodSource("populateMetaFieldsAndTestAvroWithMeta")
|
||||
public void testWriteReadHFile(boolean populateMetaFields, boolean testAvroWithMeta) throws Exception {
|
||||
Schema avroSchema = getSchemaFromResource(TestHoodieOrcReaderWriter.class, "/exampleSchemaWithMetaFields.avsc");
|
||||
HoodieHFileWriter writer = createHFileWriter(avroSchema, populateMetaFields);
|
||||
List<String> keys = new ArrayList<>();
|
||||
Map<String, GenericRecord> recordMap = new HashMap<>();
|
||||
for (int i = 0; i < 100; i++) {
|
||||
@@ -97,7 +121,13 @@ public class TestHoodieHFileReaderWriter {
|
||||
keys.add(key);
|
||||
record.put("time", Integer.toString(RANDOM.nextInt()));
|
||||
record.put("number", i);
|
||||
writer.writeAvro(key, record);
|
||||
if (testAvroWithMeta) {
|
||||
writer.writeAvroWithMetadata(record, new HoodieRecord(new HoodieKey((String) record.get("_row_key"),
|
||||
Integer.toString((Integer) record.get("number"))), new EmptyHoodieRecordPayload())); // payload does not matter. GenericRecord passed in is what matters
|
||||
// only HoodieKey will be looked up from the 2nd arg(HoodieRecord).
|
||||
} else {
|
||||
writer.writeAvro(key, record);
|
||||
}
|
||||
recordMap.put(key, record);
|
||||
}
|
||||
writer.close();
|
||||
@@ -109,8 +139,8 @@ public class TestHoodieHFileReaderWriter {
|
||||
records.forEach(entry -> assertEquals(entry.getSecond(), recordMap.get(entry.getFirst())));
|
||||
hoodieHFileReader.close();
|
||||
|
||||
for (int i = 0; i < 20; i++) {
|
||||
int randomRowstoFetch = 5 + RANDOM.nextInt(50);
|
||||
for (int i = 0; i < 2; i++) {
|
||||
int randomRowstoFetch = 5 + RANDOM.nextInt(10);
|
||||
Set<String> rowsToFetch = getRandomKeys(randomRowstoFetch, keys);
|
||||
List<String> rowsList = new ArrayList<>(rowsToFetch);
|
||||
Collections.sort(rowsList);
|
||||
@@ -119,6 +149,11 @@ public class TestHoodieHFileReaderWriter {
|
||||
assertEquals(result.size(), randomRowstoFetch);
|
||||
result.forEach(entry -> {
|
||||
assertEquals(entry.getSecond(), recordMap.get(entry.getFirst()));
|
||||
if (populateMetaFields && testAvroWithMeta) {
|
||||
assertNotNull(entry.getSecond().get(HoodieRecord.RECORD_KEY_METADATA_FIELD));
|
||||
} else {
|
||||
assertNull(entry.getSecond().get(HoodieRecord.RECORD_KEY_METADATA_FIELD));
|
||||
}
|
||||
});
|
||||
hoodieHFileReader.close();
|
||||
}
|
||||
|
||||
@@ -0,0 +1,66 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
{
|
||||
"namespace": "example.schema",
|
||||
"type": "record",
|
||||
"name": "trip",
|
||||
"fields": [
|
||||
{
|
||||
"name": "_hoodie_commit_time",
|
||||
"type": ["null","string"],
|
||||
"default":null
|
||||
},
|
||||
{
|
||||
"name": "_hoodie_commit_seqno",
|
||||
"type": ["null","string"],
|
||||
"default":null
|
||||
},
|
||||
{
|
||||
"name": "_hoodie_record_key",
|
||||
"type": ["null","string"],
|
||||
"default":null
|
||||
},
|
||||
{
|
||||
"name": "_hoodie_partition_path",
|
||||
"type": ["null","string"],
|
||||
"default":null
|
||||
},
|
||||
{
|
||||
"name": "_hoodie_file_name",
|
||||
"type": ["null","string"],
|
||||
"default":null
|
||||
},
|
||||
{
|
||||
"name": "_hoodie_operation",
|
||||
"type": ["null","string"],
|
||||
"default":null
|
||||
},
|
||||
{
|
||||
"name": "_row_key",
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"name": "time",
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"name": "number",
|
||||
"type": ["int", "null"]
|
||||
}
|
||||
]
|
||||
}
|
||||
Reference in New Issue
Block a user