1
0

[HUDI-2176, 2178, 2179] Adding virtual key support to COW table (#3306)

This commit is contained in:
Sivabalan Narayanan
2021-07-26 17:21:04 -04:00
committed by GitHub
parent 5353243449
commit 61148c1c43
57 changed files with 969 additions and 413 deletions

View File

@@ -1594,6 +1594,11 @@ public class HoodieWriteConfig extends HoodieConfig {
return this;
}
public Builder withPopulateMetaFields(boolean populateMetaFields) {
writeConfig.setValue(HoodieTableConfig.HOODIE_POPULATE_META_FIELDS, Boolean.toString(populateMetaFields));
return this;
}
public Builder withProperties(Properties properties) {
this.writeConfig.getProps().putAll(properties);
return this;

View File

@@ -207,9 +207,11 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload, I, K, O> extends
avroRecord = Option.of(rewriteRecord((GenericRecord) avroRecord.get()));
String seqId =
HoodieRecord.generateSequenceId(instantTime, getPartitionId(), RECORD_COUNTER.getAndIncrement());
HoodieAvroUtils.addHoodieKeyToRecord((GenericRecord) avroRecord.get(), hoodieRecord.getRecordKey(),
hoodieRecord.getPartitionPath(), fileId);
HoodieAvroUtils.addCommitMetadataToRecord((GenericRecord) avroRecord.get(), instantTime, seqId);
if (config.populateMetaFields()) {
HoodieAvroUtils.addHoodieKeyToRecord((GenericRecord) avroRecord.get(), hoodieRecord.getRecordKey(),
hoodieRecord.getPartitionPath(), fileId);
HoodieAvroUtils.addCommitMetadataToRecord((GenericRecord) avroRecord.get(), instantTime, seqId);
}
if (isUpdateRecord(hoodieRecord)) {
updatedRecordsWritten++;
} else {

View File

@@ -23,12 +23,16 @@ import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecordLocation;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.util.BaseFileUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.keygen.BaseKeyGenerator;
import org.apache.hudi.table.HoodieTable;
import org.apache.hadoop.fs.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Stream;
/**
@@ -39,17 +43,25 @@ import java.util.stream.Stream;
public class HoodieKeyLocationFetchHandle<T extends HoodieRecordPayload, I, K, O> extends HoodieReadHandle<T, I, K, O> {
private final Pair<String, HoodieBaseFile> partitionPathBaseFilePair;
private final Option<BaseKeyGenerator> keyGeneratorOpt;
public HoodieKeyLocationFetchHandle(HoodieWriteConfig config, HoodieTable<T, I, K, O> hoodieTable,
Pair<String, HoodieBaseFile> partitionPathBaseFilePair) {
Pair<String, HoodieBaseFile> partitionPathBaseFilePair, Option<BaseKeyGenerator> keyGeneratorOpt) {
super(config, null, hoodieTable, Pair.of(partitionPathBaseFilePair.getLeft(), partitionPathBaseFilePair.getRight().getFileId()));
this.partitionPathBaseFilePair = partitionPathBaseFilePair;
this.keyGeneratorOpt = keyGeneratorOpt;
}
public Stream<Pair<HoodieKey, HoodieRecordLocation>> locations() {
HoodieBaseFile baseFile = partitionPathBaseFilePair.getRight();
return BaseFileUtils.getInstance(baseFile.getPath()).fetchRecordKeyPartitionPath(
hoodieTable.getHadoopConf(), new Path(baseFile.getPath())).stream()
BaseFileUtils baseFileUtils = BaseFileUtils.getInstance(baseFile.getPath());
List<HoodieKey> hoodieKeyList = new ArrayList<>();
if (keyGeneratorOpt.isPresent()) {
hoodieKeyList = baseFileUtils.fetchRecordKeyPartitionPath(hoodieTable.getHadoopConf(), new Path(baseFile.getPath()), keyGeneratorOpt);
} else {
hoodieKeyList = baseFileUtils.fetchRecordKeyPartitionPath(hoodieTable.getHadoopConf(), new Path(baseFile.getPath()));
}
return hoodieKeyList.stream()
.map(entry -> Pair.of(entry,
new HoodieRecordLocation(baseFile.getCommitTime(), baseFile.getFileId())));
}

View File

@@ -32,6 +32,7 @@ import org.apache.hudi.common.model.IOType;
import org.apache.hudi.common.util.DefaultSizeEstimator;
import org.apache.hudi.common.util.HoodieRecordSizeEstimator;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.common.util.collection.ExternalSpillableMap;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieCorruptedDataException;
@@ -40,6 +41,8 @@ import org.apache.hudi.exception.HoodieUpsertException;
import org.apache.hudi.io.storage.HoodieFileReader;
import org.apache.hudi.io.storage.HoodieFileReaderFactory;
import org.apache.hudi.io.storage.HoodieFileWriter;
import org.apache.hudi.keygen.BaseKeyGenerator;
import org.apache.hudi.keygen.KeyGenUtils;
import org.apache.hudi.table.HoodieTable;
import org.apache.avro.Schema;
@@ -101,21 +104,23 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload, I, K, O> extends H
protected long updatedRecordsWritten = 0;
protected long insertRecordsWritten = 0;
protected boolean useWriterSchema;
protected Option<BaseKeyGenerator> keyGeneratorOpt;
private HoodieBaseFile baseFileToMerge;
public HoodieMergeHandle(HoodieWriteConfig config, String instantTime, HoodieTable<T, I, K, O> hoodieTable,
Iterator<HoodieRecord<T>> recordItr, String partitionPath, String fileId,
TaskContextSupplier taskContextSupplier) {
TaskContextSupplier taskContextSupplier, Option<BaseKeyGenerator> keyGeneratorOpt) {
this(config, instantTime, hoodieTable, recordItr, partitionPath, fileId, taskContextSupplier,
hoodieTable.getBaseFileOnlyView().getLatestBaseFile(partitionPath, fileId).get());
hoodieTable.getBaseFileOnlyView().getLatestBaseFile(partitionPath, fileId).get(), keyGeneratorOpt);
}
public HoodieMergeHandle(HoodieWriteConfig config, String instantTime, HoodieTable<T, I, K, O> hoodieTable,
Iterator<HoodieRecord<T>> recordItr, String partitionPath, String fileId,
TaskContextSupplier taskContextSupplier, HoodieBaseFile baseFile) {
TaskContextSupplier taskContextSupplier, HoodieBaseFile baseFile, Option<BaseKeyGenerator> keyGeneratorOpt) {
super(config, instantTime, partitionPath, fileId, hoodieTable, taskContextSupplier);
init(fileId, recordItr);
init(fileId, partitionPath, baseFile);
validateAndSetAndKeyGenProps(keyGeneratorOpt, config.populateMetaFields());
}
/**
@@ -123,11 +128,17 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload, I, K, O> extends H
*/
public HoodieMergeHandle(HoodieWriteConfig config, String instantTime, HoodieTable<T, I, K, O> hoodieTable,
Map<String, HoodieRecord<T>> keyToNewRecords, String partitionPath, String fileId,
HoodieBaseFile dataFileToBeMerged, TaskContextSupplier taskContextSupplier) {
HoodieBaseFile dataFileToBeMerged, TaskContextSupplier taskContextSupplier, Option<BaseKeyGenerator> keyGeneratorOpt) {
super(config, instantTime, partitionPath, fileId, hoodieTable, taskContextSupplier);
this.keyToNewRecords = keyToNewRecords;
this.useWriterSchema = true;
init(fileId, this.partitionPath, dataFileToBeMerged);
validateAndSetAndKeyGenProps(keyGeneratorOpt, config.populateMetaFields());
}
private void validateAndSetAndKeyGenProps(Option<BaseKeyGenerator> keyGeneratorOpt, boolean populateMetaFields) {
ValidationUtils.checkArgument(populateMetaFields == !keyGeneratorOpt.isPresent());
this.keyGeneratorOpt = keyGeneratorOpt;
}
@Override
@@ -278,7 +289,7 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload, I, K, O> extends H
* Go through an old record. Here if we detect a newer version shows up, we write the new one to the file.
*/
public void write(GenericRecord oldRecord) {
String key = oldRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
String key = KeyGenUtils.getRecordKeyFromGenericRecord(oldRecord, keyGeneratorOpt);
boolean copyOldRecord = true;
if (keyToNewRecords.containsKey(key)) {
// If we have duplicate records that we are updating, then the hoodie record will be deflated after

View File

@@ -23,8 +23,10 @@ import org.apache.hudi.common.engine.TaskContextSupplier;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieUpsertException;
import org.apache.hudi.keygen.BaseKeyGenerator;
import org.apache.hudi.table.HoodieTable;
import org.apache.avro.generic.GenericRecord;
@@ -47,8 +49,9 @@ public class HoodieSortedMergeHandle<T extends HoodieRecordPayload, I, K, O> ext
private Queue<String> newRecordKeysSorted = new PriorityQueue<>();
public HoodieSortedMergeHandle(HoodieWriteConfig config, String instantTime, HoodieTable<T, I, K, O> hoodieTable,
Iterator<HoodieRecord<T>> recordItr, String partitionPath, String fileId, TaskContextSupplier taskContextSupplier) {
super(config, instantTime, hoodieTable, recordItr, partitionPath, fileId, taskContextSupplier);
Iterator<HoodieRecord<T>> recordItr, String partitionPath, String fileId, TaskContextSupplier taskContextSupplier,
Option<BaseKeyGenerator> keyGeneratorOpt) {
super(config, instantTime, hoodieTable, recordItr, partitionPath, fileId, taskContextSupplier, keyGeneratorOpt);
newRecordKeysSorted.addAll(keyToNewRecords.keySet());
}
@@ -57,9 +60,9 @@ public class HoodieSortedMergeHandle<T extends HoodieRecordPayload, I, K, O> ext
*/
public HoodieSortedMergeHandle(HoodieWriteConfig config, String instantTime, HoodieTable<T, I, K, O> hoodieTable,
Map<String, HoodieRecord<T>> keyToNewRecordsOrig, String partitionPath, String fileId,
HoodieBaseFile dataFileToBeMerged, TaskContextSupplier taskContextSupplier) {
HoodieBaseFile dataFileToBeMerged, TaskContextSupplier taskContextSupplier, Option<BaseKeyGenerator> keyGeneratorOpt) {
super(config, instantTime, hoodieTable, keyToNewRecordsOrig, partitionPath, fileId, dataFileToBeMerged,
taskContextSupplier);
taskContextSupplier, keyGeneratorOpt);
newRecordKeysSorted.addAll(keyToNewRecords.keySet());
}

View File

@@ -20,11 +20,13 @@ package org.apache.hudi.io.storage;
import org.apache.hudi.common.engine.TaskContextSupplier;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieUpsertException;
import org.apache.hudi.io.HoodieMergeHandle;
import org.apache.hudi.keygen.BaseKeyGenerator;
import org.apache.hudi.keygen.KeyGenUtils;
import org.apache.hudi.table.HoodieTable;
import org.apache.avro.generic.GenericRecord;
@@ -66,13 +68,14 @@ public class HoodieConcatHandle<T extends HoodieRecordPayload, I, K, O> extends
private static final Logger LOG = LogManager.getLogger(HoodieConcatHandle.class);
public HoodieConcatHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, Iterator recordItr,
String partitionPath, String fileId, TaskContextSupplier taskContextSupplier) {
super(config, instantTime, hoodieTable, recordItr, partitionPath, fileId, taskContextSupplier);
String partitionPath, String fileId, TaskContextSupplier taskContextSupplier, Option<BaseKeyGenerator> keyGeneratorOpt) {
super(config, instantTime, hoodieTable, recordItr, partitionPath, fileId, taskContextSupplier, keyGeneratorOpt);
}
public HoodieConcatHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, Map keyToNewRecords, String partitionPath, String fileId,
HoodieBaseFile dataFileToBeMerged, TaskContextSupplier taskContextSupplier) {
super(config, instantTime, hoodieTable, keyToNewRecords, partitionPath, fileId, dataFileToBeMerged, taskContextSupplier);
super(config, instantTime, hoodieTable, keyToNewRecords, partitionPath, fileId, dataFileToBeMerged, taskContextSupplier,
Option.empty());
}
/**
@@ -80,7 +83,7 @@ public class HoodieConcatHandle<T extends HoodieRecordPayload, I, K, O> extends
*/
@Override
public void write(GenericRecord oldRecord) {
String key = oldRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
String key = KeyGenUtils.getRecordKeyFromGenericRecord(oldRecord, keyGeneratorOpt);
try {
fileWriter.writeAvro(key, oldRecord);
} catch (IOException | RuntimeException e) {

View File

@@ -24,6 +24,7 @@ import org.apache.hudi.common.bloom.BloomFilterFactory;
import org.apache.hudi.common.engine.TaskContextSupplier;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.table.HoodieTable;
@@ -34,9 +35,9 @@ import org.apache.parquet.avro.AvroSchemaConverter;
import java.io.IOException;
import static org.apache.hudi.common.model.HoodieFileFormat.HFILE;
import static org.apache.hudi.common.model.HoodieFileFormat.ORC;
import static org.apache.hudi.common.model.HoodieFileFormat.PARQUET;
import static org.apache.hudi.common.model.HoodieFileFormat.HFILE;
public class HoodieFileWriterFactory {
@@ -45,7 +46,7 @@ public class HoodieFileWriterFactory {
TaskContextSupplier taskContextSupplier) throws IOException {
final String extension = FSUtils.getFileExtension(path.getName());
if (PARQUET.getFileExtension().equals(extension)) {
return newParquetFileWriter(instantTime, path, config, schema, hoodieTable, taskContextSupplier);
return newParquetFileWriter(instantTime, path, config, schema, hoodieTable, taskContextSupplier, config.populateMetaFields());
}
if (HFILE.getFileExtension().equals(extension)) {
return newHFileFileWriter(instantTime, path, config, schema, hoodieTable, taskContextSupplier);
@@ -58,16 +59,21 @@ public class HoodieFileWriterFactory {
private static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieFileWriter<R> newParquetFileWriter(
String instantTime, Path path, HoodieWriteConfig config, Schema schema, HoodieTable hoodieTable,
TaskContextSupplier taskContextSupplier) throws IOException {
BloomFilter filter = createBloomFilter(config);
HoodieAvroWriteSupport writeSupport =
new HoodieAvroWriteSupport(new AvroSchemaConverter(hoodieTable.getHadoopConf()).convert(schema), schema, filter);
TaskContextSupplier taskContextSupplier, boolean populateMetaFields) throws IOException {
return newParquetFileWriter(instantTime, path, config, schema, hoodieTable, taskContextSupplier, populateMetaFields, populateMetaFields);
}
private static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieFileWriter<R> newParquetFileWriter(
String instantTime, Path path, HoodieWriteConfig config, Schema schema, HoodieTable hoodieTable,
TaskContextSupplier taskContextSupplier, boolean populateMetaFields, boolean enableBloomFilter) throws IOException {
Option<BloomFilter> filter = enableBloomFilter ? Option.of(createBloomFilter(config)) : Option.empty();
HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter(hoodieTable.getHadoopConf()).convert(schema), schema, filter);
HoodieAvroParquetConfig parquetConfig = new HoodieAvroParquetConfig(writeSupport, config.getParquetCompressionCodec(),
config.getParquetBlockSize(), config.getParquetPageSize(), config.getParquetMaxFileSize(),
hoodieTable.getHadoopConf(), config.getParquetCompressionRatio());
return new HoodieParquetWriter<>(instantTime, path, parquetConfig, schema, taskContextSupplier);
return new HoodieParquetWriter<>(instantTime, path, parquetConfig, schema, taskContextSupplier, populateMetaFields);
}
private static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieFileWriter<R> newHFileFileWriter(

View File

@@ -49,9 +49,10 @@ public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends Indexe
private final HoodieAvroWriteSupport writeSupport;
private final String instantTime;
private final TaskContextSupplier taskContextSupplier;
private final boolean populateMetaFields;
public HoodieParquetWriter(String instantTime, Path file, HoodieAvroParquetConfig parquetConfig,
Schema schema, TaskContextSupplier taskContextSupplier) throws IOException {
Schema schema, TaskContextSupplier taskContextSupplier, boolean populateMetaFields) throws IOException {
super(HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()),
ParquetFileWriter.Mode.CREATE, parquetConfig.getWriteSupport(), parquetConfig.getCompressionCodecName(),
parquetConfig.getBlockSize(), parquetConfig.getPageSize(), parquetConfig.getPageSize(),
@@ -69,14 +70,19 @@ public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends Indexe
this.writeSupport = parquetConfig.getWriteSupport();
this.instantTime = instantTime;
this.taskContextSupplier = taskContextSupplier;
this.populateMetaFields = populateMetaFields;
}
@Override
public void writeAvroWithMetadata(R avroRecord, HoodieRecord record) throws IOException {
prepRecordWithMetadata(avroRecord, record, instantTime,
taskContextSupplier.getPartitionIdSupplier().get(), recordIndex, file.getName());
super.write(avroRecord);
writeSupport.add(record.getRecordKey());
if (populateMetaFields) {
prepRecordWithMetadata(avroRecord, record, instantTime,
taskContextSupplier.getPartitionIdSupplier().get(), recordIndex, file.getName());
super.write(avroRecord);
writeSupport.add(record.getRecordKey());
} else {
super.write(avroRecord);
}
}
@Override
@@ -87,7 +93,9 @@ public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends Indexe
@Override
public void writeAvro(String key, IndexedRecord object) throws IOException {
super.write(object);
writeSupport.add(key);
if (populateMetaFields) {
writeSupport.add(key);
}
}
@Override

View File

@@ -1,81 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.keygen;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.exception.HoodieKeyException;
import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
import java.util.List;
import java.util.stream.Collectors;
public abstract class BaseKeyGenerator extends KeyGenerator {
protected List<String> recordKeyFields;
protected List<String> partitionPathFields;
protected final boolean encodePartitionPath;
protected final boolean hiveStylePartitioning;
protected BaseKeyGenerator(TypedProperties config) {
super(config);
this.encodePartitionPath = config.getBoolean(KeyGeneratorOptions.URL_ENCODE_PARTITIONING_OPT_KEY.key(),
Boolean.parseBoolean(KeyGeneratorOptions.URL_ENCODE_PARTITIONING_OPT_KEY.defaultValue()));
this.hiveStylePartitioning = config.getBoolean(KeyGeneratorOptions.HIVE_STYLE_PARTITIONING_OPT_KEY.key(),
Boolean.parseBoolean(KeyGeneratorOptions.HIVE_STYLE_PARTITIONING_OPT_KEY.defaultValue()));
}
/**
* Generate a record Key out of provided generic record.
*/
public abstract String getRecordKey(GenericRecord record);
/**
* Generate a partition path out of provided generic record.
*/
public abstract String getPartitionPath(GenericRecord record);
/**
* Generate a Hoodie Key out of provided generic record.
*/
@Override
public final HoodieKey getKey(GenericRecord record) {
if (getRecordKeyFields() == null || getPartitionPathFields() == null) {
throw new HoodieKeyException("Unable to find field names for record key or partition path in cfg");
}
return new HoodieKey(getRecordKey(record), getPartitionPath(record));
}
@Override
public final List<String> getRecordKeyFieldNames() {
// For nested columns, pick top level column name
return getRecordKeyFields().stream().map(k -> {
int idx = k.indexOf('.');
return idx > 0 ? k.substring(0, idx) : k;
}).collect(Collectors.toList());
}
public List<String> getRecordKeyFields() {
return recordKeyFields;
}
public List<String> getPartitionPathFields() {
return partitionPathFields;
}
}

View File

@@ -21,6 +21,8 @@ package org.apache.hudi.keygen;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.PartitionPathEncodeUtils;
import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.common.util.StringUtils;
@@ -41,6 +43,26 @@ public class KeyGenUtils {
protected static final String DEFAULT_PARTITION_PATH = "default";
public static final String DEFAULT_PARTITION_PATH_SEPARATOR = "/";
/**
* Fetches record key from the GenericRecord.
* @param genericRecord generic record of interest.
* @param keyGeneratorOpt Optional BaseKeyGenerator. If not, meta field will be used.
* @return the record key for the passed in generic record.
*/
public static String getRecordKeyFromGenericRecord(GenericRecord genericRecord, Option<BaseKeyGenerator> keyGeneratorOpt) {
return keyGeneratorOpt.isPresent() ? keyGeneratorOpt.get().getRecordKey(genericRecord) : genericRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
}
/**
* Fetches partition path from the GenericRecord.
* @param genericRecord generic record of interest.
* @param keyGeneratorOpt Optional BaseKeyGenerator. If not, meta field will be used.
* @return the partition path for the passed in generic record.
*/
public static String getPartitionPathFromGenericRecord(GenericRecord genericRecord, Option<BaseKeyGenerator> keyGeneratorOpt) {
return keyGeneratorOpt.isPresent() ? keyGeneratorOpt.get().getRecordKey(genericRecord) : genericRecord.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString();
}
/**
* Extracts the record key fields in strings out of the given record key,
* this is the reverse operation of {@link #getRecordKey(GenericRecord, String)}.

View File

@@ -1,59 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.keygen;
import org.apache.hudi.ApiMaturityLevel;
import org.apache.hudi.PublicAPIClass;
import org.apache.hudi.PublicAPIMethod;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.avro.generic.GenericRecord;
import java.util.List;
/**
* Abstract class to extend for plugging in extraction of {@link HoodieKey} from an Avro record.
*/
@PublicAPIClass(maturity = ApiMaturityLevel.STABLE)
public abstract class KeyGenerator implements KeyGeneratorInterface {
protected TypedProperties config;
protected KeyGenerator(TypedProperties config) {
this.config = config;
}
/**
* Generate a Hoodie Key out of provided generic record.
*/
@PublicAPIMethod(maturity = ApiMaturityLevel.STABLE)
public abstract HoodieKey getKey(GenericRecord record);
/**
* Used during bootstrap, to project out only the record key fields from bootstrap source dataset.
*
* @return list of field names, when concatenated make up the record key.
*/
@PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING)
public List<String> getRecordKeyFieldNames() {
throw new UnsupportedOperationException("Bootstrap not supported for key generator. "
+ "Please override this method in your custom key generator.");
}
}

View File

@@ -1,36 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.keygen;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.model.HoodieKey;
import java.io.Serializable;
import java.util.List;
/**
* Represents the interface key generators need to adhere to.
*/
public interface KeyGeneratorInterface extends Serializable {
HoodieKey getKey(GenericRecord record);
List<String> getRecordKeyFieldNames();
}

View File

@@ -1,59 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.keygen.constant;
import org.apache.hudi.common.config.ConfigClassProperty;
import org.apache.hudi.common.config.ConfigGroups;
import org.apache.hudi.common.config.ConfigProperty;
import org.apache.hudi.common.config.HoodieConfig;
@ConfigClassProperty(name = "Key Generator Options",
groupName = ConfigGroups.Names.WRITE_CLIENT,
description = "Hudi maintains keys (record key + partition path) "
+ "for uniquely identifying a particular record. "
+ "This config allows developers to setup the Key generator class that "
+ "will extract these out of incoming records.")
public class KeyGeneratorOptions extends HoodieConfig {
public static final ConfigProperty<String> URL_ENCODE_PARTITIONING_OPT_KEY = ConfigProperty
.key("hoodie.datasource.write.partitionpath.urlencode")
.defaultValue("false")
.withDocumentation("Should we url encode the partition path value, before creating the folder structure.");
public static final ConfigProperty<String> HIVE_STYLE_PARTITIONING_OPT_KEY = ConfigProperty
.key("hoodie.datasource.write.hive_style_partitioning")
.defaultValue("false")
.withDocumentation("Flag to indicate whether to use Hive style partitioning.\n"
+ "If set true, the names of partition folders follow <partition_column_name>=<partition_value> format.\n"
+ "By default false (the names of partition folders are only partition values)");
public static final ConfigProperty<String> RECORDKEY_FIELD_OPT_KEY = ConfigProperty
.key("hoodie.datasource.write.recordkey.field")
.defaultValue("uuid")
.withDocumentation("Record key field. Value to be used as the `recordKey` component of `HoodieKey`.\n"
+ "Actual value will be obtained by invoking .toString() on the field value. Nested fields can be specified using\n"
+ "the dot notation eg: `a.b.c`");
public static final ConfigProperty<String> PARTITIONPATH_FIELD_OPT_KEY = ConfigProperty
.key("hoodie.datasource.write.partitionpath.field")
.defaultValue("partitionpath")
.withDocumentation("Partition path field. Value to be used at the partitionPath component of HoodieKey. "
+ "Actual value ontained by invoking .toString()");
}

View File

@@ -1,70 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.keygen.constant;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* Types of {@link org.apache.hudi.keygen.KeyGenerator}.
*/
public enum KeyGeneratorType {
/**
* Simple key generator, which takes names of fields to be used for recordKey and partitionPath as configs.
*/
SIMPLE,
/**
* Complex key generator, which takes names of fields to be used for recordKey and partitionPath as configs.
*/
COMPLEX,
/**
* Key generator, that relies on timestamps for partitioning field. Still picks record key by name.
*/
TIMESTAMP,
/**
* This is a generic implementation type of KeyGenerator where users can configure record key as a single field or
* a combination of fields. Similarly partition path can be configured to have multiple fields or only one field.
* <p>
* This KeyGenerator expects value for prop "hoodie.datasource.write.partitionpath.field" in a specific format.
* For example:
* properties.put("hoodie.datasource.write.partitionpath.field", "field1:PartitionKeyType1,field2:PartitionKeyType2").
*/
CUSTOM,
/**
* Simple Key generator for unpartitioned Hive Tables.
*/
NON_PARTITION,
/**
* Key generator for deletes using global indices.
*/
GLOBAL_DELETE;
public static List<String> getNames() {
List<String> names = new ArrayList<>(KeyGeneratorType.values().length);
Arrays.stream(KeyGeneratorType.values())
.forEach(x -> names.add(x.name()));
return names;
}
}

View File

@@ -34,6 +34,7 @@ import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock;
import org.apache.hudi.common.table.log.block.HoodieLogBlock;
import org.apache.hudi.common.testutils.FileCreateUtils;
import org.apache.hudi.common.testutils.HoodieTestTable;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieStorageConfig;
import org.apache.hudi.io.storage.HoodieAvroParquetConfig;
import org.apache.hudi.io.storage.HoodieOrcConfig;
@@ -68,11 +69,13 @@ public class HoodieWriteableTestTable extends HoodieTestTable {
protected final Schema schema;
protected final BloomFilter filter;
protected final boolean populateMetaFields;
protected HoodieWriteableTestTable(String basePath, FileSystem fs, HoodieTableMetaClient metaClient, Schema schema, BloomFilter filter) {
super(basePath, fs, metaClient);
this.schema = schema;
this.filter = filter;
this.populateMetaFields = metaClient.getTableConfig().populateMetaFields();
}
@Override
@@ -91,21 +94,25 @@ public class HoodieWriteableTestTable extends HoodieTestTable {
if (HoodieTableConfig.HOODIE_BASE_FILE_FORMAT_PROP.defaultValue().equals(HoodieFileFormat.PARQUET)) {
HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(
new AvroSchemaConverter().convert(schema), schema, filter);
new AvroSchemaConverter().convert(schema), schema, Option.of(filter));
HoodieAvroParquetConfig config = new HoodieAvroParquetConfig(writeSupport, CompressionCodecName.GZIP,
ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, 120 * 1024 * 1024,
new Configuration(), Double.parseDouble(HoodieStorageConfig.PARQUET_COMPRESSION_RATIO.defaultValue()));
try (HoodieParquetWriter writer = new HoodieParquetWriter(
currentInstantTime,
new Path(Paths.get(basePath, partition, fileName).toString()),
config, schema, contextSupplier)) {
config, schema, contextSupplier, populateMetaFields)) {
int seqId = 1;
for (HoodieRecord record : records) {
GenericRecord avroRecord = (GenericRecord) record.getData().getInsertValue(schema).get();
HoodieAvroUtils.addCommitMetadataToRecord(avroRecord, currentInstantTime, String.valueOf(seqId++));
HoodieAvroUtils.addHoodieKeyToRecord(avroRecord, record.getRecordKey(), record.getPartitionPath(), fileName);
writer.writeAvro(record.getRecordKey(), avroRecord);
filter.add(record.getRecordKey());
if (populateMetaFields) {
HoodieAvroUtils.addCommitMetadataToRecord(avroRecord, currentInstantTime, String.valueOf(seqId++));
HoodieAvroUtils.addHoodieKeyToRecord(avroRecord, record.getRecordKey(), record.getPartitionPath(), fileName);
writer.writeAvro(record.getRecordKey(), avroRecord);
filter.add(record.getRecordKey());
} else {
writer.writeAvro(record.getRecordKey(), avroRecord);
}
}
}
} else if (HoodieTableConfig.HOODIE_BASE_FILE_FORMAT_PROP.defaultValue().equals(HoodieFileFormat.ORC)) {

View File

@@ -130,7 +130,7 @@ public class FlinkHoodieSimpleIndex<T extends HoodieRecordPayload> extends Flink
List<Pair<String, HoodieBaseFile>> latestBaseFiles) {
List<HoodieKeyLocationFetchHandle<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>>> hoodieKeyLocationFetchHandles =
context.map(latestBaseFiles, partitionPathBaseFile -> new HoodieKeyLocationFetchHandle<>(config, hoodieTable, partitionPathBaseFile), parallelism);
context.map(latestBaseFiles, partitionPathBaseFile -> new HoodieKeyLocationFetchHandle<>(config, hoodieTable, partitionPathBaseFile, Option.empty()), parallelism);
Map<HoodieKey, HoodieRecordLocation> recordLocations = new HashMap<>();
hoodieKeyLocationFetchHandles.stream()
.flatMap(handle -> handle.locations())

View File

@@ -24,6 +24,7 @@ import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieIOException;
@@ -65,7 +66,7 @@ public class FlinkMergeAndReplaceHandle<T extends HoodieRecordPayload, I, K, O>
Iterator<HoodieRecord<T>> recordItr, String partitionPath, String fileId,
TaskContextSupplier taskContextSupplier, Path basePath) {
super(config, instantTime, hoodieTable, recordItr, partitionPath, fileId, taskContextSupplier,
new HoodieBaseFile(basePath.toString()));
new HoodieBaseFile(basePath.toString()), Option.empty());
// delete invalid data files generated by task retry.
if (getAttemptId() > 0) {
deleteInvalidDataFile(getAttemptId() - 1);

View File

@@ -23,6 +23,7 @@ import org.apache.hudi.common.engine.TaskContextSupplier;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieIOException;
@@ -65,7 +66,7 @@ public class FlinkMergeHandle<T extends HoodieRecordPayload, I, K, O>
public FlinkMergeHandle(HoodieWriteConfig config, String instantTime, HoodieTable<T, I, K, O> hoodieTable,
Iterator<HoodieRecord<T>> recordItr, String partitionPath, String fileId,
TaskContextSupplier taskContextSupplier) {
super(config, instantTime, hoodieTable, recordItr, partitionPath, fileId, taskContextSupplier);
super(config, instantTime, hoodieTable, recordItr, partitionPath, fileId, taskContextSupplier, Option.empty());
if (rolloverPaths == null) {
// #makeOldAndNewFilePaths may already initialize it already
rolloverPaths = new ArrayList<>();

View File

@@ -351,10 +351,10 @@ public class HoodieFlinkCopyOnWriteTable<T extends HoodieRecordPayload> extends
Map<String, HoodieRecord<T>> keyToNewRecords, HoodieBaseFile dataFileToBeMerged) {
if (requireSortedRecords()) {
return new HoodieSortedMergeHandle<>(config, instantTime, this, keyToNewRecords, partitionPath, fileId,
dataFileToBeMerged, taskContextSupplier);
dataFileToBeMerged, taskContextSupplier, Option.empty());
} else {
return new HoodieMergeHandle<>(config, instantTime, this, keyToNewRecords, partitionPath, fileId,
dataFileToBeMerged,taskContextSupplier);
dataFileToBeMerged,taskContextSupplier, Option.empty());
}
}

View File

@@ -283,9 +283,9 @@ public abstract class BaseJavaCommitActionExecutor<T extends HoodieRecordPayload
protected HoodieMergeHandle getUpdateHandle(String partitionPath, String fileId, Iterator<HoodieRecord<T>> recordItr) {
if (table.requireSortedRecords()) {
return new HoodieSortedMergeHandle<>(config, instantTime, table, recordItr, partitionPath, fileId, taskContextSupplier);
return new HoodieSortedMergeHandle<>(config, instantTime, table, recordItr, partitionPath, fileId, taskContextSupplier, Option.empty());
} else {
return new HoodieMergeHandle<>(config, instantTime, table, recordItr, partitionPath, fileId, taskContextSupplier);
return new HoodieMergeHandle<>(config, instantTime, table, recordItr, partitionPath, fileId, taskContextSupplier, Option.empty());
}
}
@@ -293,7 +293,7 @@ public abstract class BaseJavaCommitActionExecutor<T extends HoodieRecordPayload
Map<String, HoodieRecord<T>> keyToNewRecords,
HoodieBaseFile dataFileToBeMerged) {
return new HoodieMergeHandle<>(config, instantTime, table, keyToNewRecords,
partitionPath, fileId, dataFileToBeMerged, taskContextSupplier);
partitionPath, fileId, dataFileToBeMerged, taskContextSupplier, Option.empty());
}
@Override

View File

@@ -25,6 +25,9 @@
}, {
"name" : "_row_key",
"type" : "string"
}, {
"name" : "partition_path",
"type" : "string"
}, {
"name" : "rider",
"type" : "string"

View File

@@ -21,6 +21,7 @@ package org.apache.hudi.index.simple;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.client.utils.SparkMemoryUtils;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.model.HoodieKey;
@@ -30,15 +31,19 @@ import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.index.HoodieIndexUtils;
import org.apache.hudi.index.SparkHoodieIndex;
import org.apache.hudi.io.HoodieKeyLocationFetchHandle;
import org.apache.hudi.keygen.BaseKeyGenerator;
import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory;
import org.apache.hudi.table.HoodieTable;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import java.io.IOException;
import java.util.List;
import scala.Tuple2;
@@ -146,8 +151,15 @@ public class SparkHoodieSimpleIndex<T extends HoodieRecordPayload> extends Spark
List<Pair<String, HoodieBaseFile>> baseFiles) {
JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context);
int fetchParallelism = Math.max(1, Math.max(baseFiles.size(), parallelism));
return jsc.parallelize(baseFiles, fetchParallelism)
.flatMapToPair(partitionPathBaseFile -> new HoodieKeyLocationFetchHandle(config, hoodieTable, partitionPathBaseFile)
try {
Option<BaseKeyGenerator> keyGeneratorOpt = config.populateMetaFields() ? Option.empty()
: Option.of((BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(config.getProps())));
return jsc.parallelize(baseFiles, fetchParallelism)
.flatMapToPair(partitionPathBaseFile -> new HoodieKeyLocationFetchHandle(config, hoodieTable, partitionPathBaseFile, keyGeneratorOpt)
.locations().map(x -> Tuple2.apply(((Pair)x).getLeft(), ((Pair)x).getRight())).iterator());
} catch (IOException e) {
throw new HoodieIOException("KeyGenerator instantiation failed ", e);
}
}
}

View File

@@ -97,6 +97,8 @@ public abstract class BuiltinKeyGenerator extends BaseKeyGenerator implements Sp
* @param structType schema of the internalRow.
* @return the partition path.
*/
@Override
@PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING)
public String getPartitionPath(InternalRow internalRow, StructType structType) {
try {
initDeserializer(structType);

View File

@@ -19,6 +19,8 @@
package org.apache.hudi.keygen;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.types.StructType;
/**
* Spark key generator interface.
@@ -28,4 +30,6 @@ public interface SparkKeyGeneratorInterface extends KeyGeneratorInterface {
String getRecordKey(Row row);
String getPartitionPath(Row row);
String getPartitionPath(InternalRow internalRow, StructType structType);
}

View File

@@ -27,6 +27,7 @@ import org.apache.hudi.avro.model.HoodieRollbackMetadata;
import org.apache.hudi.avro.model.HoodieSavepointMetadata;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.model.HoodieKey;
@@ -37,11 +38,14 @@ import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.exception.HoodieNotSupportedException;
import org.apache.hudi.exception.HoodieUpsertException;
import org.apache.hudi.io.HoodieCreateHandle;
import org.apache.hudi.io.HoodieMergeHandle;
import org.apache.hudi.io.HoodieSortedMergeHandle;
import org.apache.hudi.keygen.BaseKeyGenerator;
import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory;
import org.apache.hudi.table.action.HoodieWriteMetadata;
import org.apache.hudi.table.action.bootstrap.HoodieBootstrapWriteMetadata;
import org.apache.hudi.table.action.bootstrap.SparkBootstrapCommitActionExecutor;
@@ -210,12 +214,21 @@ public class HoodieSparkCopyOnWriteTable<T extends HoodieRecordPayload> extends
protected HoodieMergeHandle getUpdateHandle(String instantTime, String partitionPath, String fileId,
Map<String, HoodieRecord<T>> keyToNewRecords, HoodieBaseFile dataFileToBeMerged) {
Option<BaseKeyGenerator> keyGeneratorOpt = Option.empty();
if (!config.populateMetaFields()) {
try {
keyGeneratorOpt = Option.of((BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(config.getProps())));
} catch (IOException e) {
throw new HoodieIOException("Only BaseKeyGenerator (or any key generator that extends from BaseKeyGenerator) are supported when meta "
+ "columns are disabled. Please choose the right key generator if you wish to disable meta fields.", e);
}
}
if (requireSortedRecords()) {
return new HoodieSortedMergeHandle<>(config, instantTime, this, keyToNewRecords, partitionPath, fileId,
dataFileToBeMerged, taskContextSupplier);
dataFileToBeMerged, taskContextSupplier, keyGeneratorOpt);
} else {
return new HoodieMergeHandle<>(config, instantTime, this, keyToNewRecords, partitionPath, fileId,
dataFileToBeMerged,taskContextSupplier);
return new HoodieMergeHandle(config, instantTime, this, keyToNewRecords, partitionPath, fileId,
dataFileToBeMerged, taskContextSupplier, keyGeneratorOpt);
}
}

View File

@@ -50,6 +50,7 @@ import org.apache.hudi.exception.HoodieClusteringException;
import org.apache.hudi.io.IOUtils;
import org.apache.hudi.io.storage.HoodieFileReader;
import org.apache.hudi.io.storage.HoodieFileReaderFactory;
import org.apache.hudi.keygen.KeyGenUtils;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.action.HoodieWriteMetadata;
import org.apache.hudi.table.action.cluster.strategy.ClusteringExecutionStrategy;
@@ -247,8 +248,8 @@ public class SparkExecuteClusteringCommitActionExecutor<T extends HoodieRecordPa
*/
private HoodieRecord<? extends HoodieRecordPayload> transform(IndexedRecord indexedRecord) {
GenericRecord record = (GenericRecord) indexedRecord;
String key = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
String partition = record.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString();
String key = KeyGenUtils.getRecordKeyFromGenericRecord(record, keyGeneratorOpt);
String partition = KeyGenUtils.getPartitionPathFromGenericRecord(record, keyGeneratorOpt);
HoodieKey hoodieKey = new HoodieKey(key, partition);
HoodieRecordPayload avroPayload = ReflectionUtils.loadPayload(table.getMetaClient().getTableConfig().getPayloadClass(),

View File

@@ -20,6 +20,7 @@ package org.apache.hudi.table.action.commit;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.client.utils.SparkMemoryUtils;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieKey;
@@ -37,6 +38,7 @@ import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieCommitException;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.exception.HoodieMetadataException;
import org.apache.hudi.exception.HoodieUpsertException;
import org.apache.hudi.execution.SparkLazyInsertIterable;
@@ -44,6 +46,8 @@ import org.apache.hudi.io.CreateHandleFactory;
import org.apache.hudi.io.HoodieMergeHandle;
import org.apache.hudi.io.HoodieSortedMergeHandle;
import org.apache.hudi.io.storage.HoodieConcatHandle;
import org.apache.hudi.keygen.BaseKeyGenerator;
import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory;
import org.apache.hudi.metadata.HoodieTableMetadataWriter;
import org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter;
import org.apache.hudi.table.HoodieSparkTable;
@@ -78,6 +82,7 @@ public abstract class BaseSparkCommitActionExecutor<T extends HoodieRecordPayloa
BaseCommitActionExecutor<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>, HoodieWriteMetadata> {
private static final Logger LOG = LogManager.getLogger(BaseSparkCommitActionExecutor.class);
protected Option<BaseKeyGenerator> keyGeneratorOpt = Option.empty();
public BaseSparkCommitActionExecutor(HoodieEngineContext context,
HoodieWriteConfig config,
@@ -85,6 +90,7 @@ public abstract class BaseSparkCommitActionExecutor<T extends HoodieRecordPayloa
String instantTime,
WriteOperationType operationType) {
super(context, config, table, instantTime, operationType, Option.empty());
initKeyGenIfNeeded(config.populateMetaFields());
}
public BaseSparkCommitActionExecutor(HoodieEngineContext context,
@@ -94,6 +100,17 @@ public abstract class BaseSparkCommitActionExecutor<T extends HoodieRecordPayloa
WriteOperationType operationType,
Option extraMetadata) {
super(context, config, table, instantTime, operationType, extraMetadata);
initKeyGenIfNeeded(config.populateMetaFields());
}
private void initKeyGenIfNeeded(boolean populateMetaFields) {
if (!populateMetaFields) {
try {
keyGeneratorOpt = Option.of((BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(config.getProps())));
} catch (IOException e) {
throw new HoodieIOException("Only BaseKeyGenerators are supported when meta columns are disabled ", e);
}
}
}
private JavaRDD<HoodieRecord<T>> clusteringHandleUpdate(JavaRDD<HoodieRecord<T>> inputRecordsRDD) {
@@ -327,11 +344,12 @@ public abstract class BaseSparkCommitActionExecutor<T extends HoodieRecordPayloa
protected HoodieMergeHandle getUpdateHandle(String partitionPath, String fileId, Iterator<HoodieRecord<T>> recordItr) {
if (table.requireSortedRecords()) {
return new HoodieSortedMergeHandle<>(config, instantTime, (HoodieSparkTable) table, recordItr, partitionPath, fileId, taskContextSupplier);
return new HoodieSortedMergeHandle<>(config, instantTime, (HoodieSparkTable) table, recordItr, partitionPath, fileId, taskContextSupplier,
keyGeneratorOpt);
} else if (!WriteOperationType.isChangingRecords(operationType) && config.allowDuplicateInserts()) {
return new HoodieConcatHandle<>(config, instantTime, table, recordItr, partitionPath, fileId, taskContextSupplier);
return new HoodieConcatHandle<>(config, instantTime, table, recordItr, partitionPath, fileId, taskContextSupplier, keyGeneratorOpt);
} else {
return new HoodieMergeHandle<>(config, instantTime, table, recordItr, partitionPath, fileId, taskContextSupplier);
return new HoodieMergeHandle<>(config, instantTime, table, recordItr, partitionPath, fileId, taskContextSupplier, keyGeneratorOpt);
}
}

View File

@@ -24,6 +24,7 @@ import org.apache.hadoop.fs.Path;
import org.apache.hudi.avro.model.HoodieClusteringPlan;
import org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata;
import org.apache.hudi.avro.model.HoodieCleanMetadata;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.fs.ConsistencyGuardConfig;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.FileSlice;
@@ -66,6 +67,9 @@ import org.apache.hudi.exception.HoodieUpsertException;
import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.index.HoodieIndex.IndexType;
import org.apache.hudi.io.HoodieMergeHandle;
import org.apache.hudi.keygen.BaseKeyGenerator;
import org.apache.hudi.keygen.KeyGenerator;
import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory;
import org.apache.hudi.table.HoodieSparkCopyOnWriteTable;
import org.apache.hudi.table.HoodieSparkTable;
import org.apache.hudi.table.HoodieTable;
@@ -120,6 +124,7 @@ import static org.apache.hudi.config.HoodieClusteringConfig.CLUSTERING_EXECUTION
import static org.apache.hudi.testutils.Assertions.assertNoWriteErrors;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;
@@ -136,8 +141,30 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
}
};
private static Stream<Arguments> configParams() {
return Arrays.stream(new Boolean[][] {{true},{false}}).map(Arguments::of);
private static Stream<Arguments> smallInsertHandlingParams() {
return Arrays.stream(new Boolean[][] {{true}, {false}}).map(Arguments::of);
}
private static Stream<Arguments> populateMetaFieldsParams() {
return Arrays.stream(new Boolean[][] {{true}, {false}}).map(Arguments::of);
}
private static Stream<Arguments> rollbackFailedCommitsParams() {
return Stream.of(
Arguments.of(HoodieFailedWritesCleaningPolicy.LAZY, true),
Arguments.of(HoodieFailedWritesCleaningPolicy.LAZY, false),
Arguments.of(HoodieFailedWritesCleaningPolicy.NEVER, true),
Arguments.of(HoodieFailedWritesCleaningPolicy.NEVER, false)
);
}
private static Stream<Arguments> rollbackAfterConsistencyCheckFailureParams() {
return Stream.of(
Arguments.of(true, true),
Arguments.of(true, false),
Arguments.of(false, true),
Arguments.of(false, false)
);
}
private HoodieTestTable testTable;
@@ -150,50 +177,56 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
/**
* Test Auto Commit behavior for HoodieWriteClient insert API.
*/
@Test
public void testAutoCommitOnInsert() throws Exception {
testAutoCommit(SparkRDDWriteClient::insert, false);
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testAutoCommitOnInsert(boolean populateMetaFields) throws Exception {
testAutoCommit(SparkRDDWriteClient::insert, false, populateMetaFields);
}
/**
* Test Auto Commit behavior for HoodieWriteClient insertPrepped API.
*/
@Test
public void testAutoCommitOnInsertPrepped() throws Exception {
testAutoCommit(SparkRDDWriteClient::insertPreppedRecords, true);
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testAutoCommitOnInsertPrepped(boolean populateMetaFields) throws Exception {
testAutoCommit(SparkRDDWriteClient::insertPreppedRecords, true, populateMetaFields);
}
/**
* Test Auto Commit behavior for HoodieWriteClient upsert API.
*/
@Test
public void testAutoCommitOnUpsert() throws Exception {
testAutoCommit(SparkRDDWriteClient::upsert, false);
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testAutoCommitOnUpsert(boolean populateMetaFields) throws Exception {
testAutoCommit(SparkRDDWriteClient::upsert, false, populateMetaFields);
}
/**
* Test Auto Commit behavior for HoodieWriteClient upsert Prepped API.
*/
@Test
public void testAutoCommitOnUpsertPrepped() throws Exception {
testAutoCommit(SparkRDDWriteClient::upsertPreppedRecords, true);
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testAutoCommitOnUpsertPrepped(boolean populateMetaFields) throws Exception {
testAutoCommit(SparkRDDWriteClient::upsertPreppedRecords, true, populateMetaFields);
}
/**
* Test Auto Commit behavior for HoodieWriteClient bulk-insert API.
*/
@Test
public void testAutoCommitOnBulkInsert() throws Exception {
testAutoCommit(SparkRDDWriteClient::bulkInsert, false);
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testAutoCommitOnBulkInsert(boolean populateMetaFields) throws Exception {
testAutoCommit(SparkRDDWriteClient::bulkInsert, false, populateMetaFields);
}
/**
* Test Auto Commit behavior for HoodieWriteClient bulk-insert prepped API.
*/
@Test
public void testAutoCommitOnBulkInsertPrepped() throws Exception {
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testAutoCommitOnBulkInsertPrepped(boolean populateMetaFields) throws Exception {
testAutoCommit((writeClient, recordRDD, instantTime) -> writeClient.bulkInsertPreppedRecords(recordRDD, instantTime,
Option.empty()), true);
Option.empty()), true, populateMetaFields);
}
/**
@@ -203,15 +236,16 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
* @throws Exception in case of failure
*/
private void testAutoCommit(Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn,
boolean isPrepped) throws Exception {
boolean isPrepped, boolean populateMetaFields) throws Exception {
// Set autoCommit false
HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).build();
try (SparkRDDWriteClient client = getHoodieWriteClient(cfg);) {
HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder().withAutoCommit(false);
addAppropriatePropsForPopulateMetaFields(cfgBuilder, populateMetaFields);
try (SparkRDDWriteClient client = getHoodieWriteClient(cfgBuilder.build());) {
String prevCommitTime = "000";
String newCommitTime = "001";
int numRecords = 200;
JavaRDD<WriteStatus> result = insertFirstBatch(cfg, client, newCommitTime, prevCommitTime, numRecords, writeFn,
JavaRDD<WriteStatus> result = insertFirstBatch(cfgBuilder.build(), client, newCommitTime, prevCommitTime, numRecords, writeFn,
isPrepped, false, numRecords);
assertFalse(testTable.commitExists(newCommitTime),
@@ -225,25 +259,28 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
/**
* Test De-duplication behavior for HoodieWriteClient insert API.
*/
@Test
public void testDeduplicationOnInsert() throws Exception {
testDeduplication(SparkRDDWriteClient::insert);
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testDeduplicationOnInsert(boolean populateMetaFields) throws Exception {
testDeduplication(SparkRDDWriteClient::insert, populateMetaFields);
}
/**
* Test De-duplication behavior for HoodieWriteClient bulk-insert API.
*/
@Test
public void testDeduplicationOnBulkInsert() throws Exception {
testDeduplication(SparkRDDWriteClient::bulkInsert);
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testDeduplicationOnBulkInsert(boolean populateMetaFields) throws Exception {
testDeduplication(SparkRDDWriteClient::bulkInsert, populateMetaFields);
}
/**
* Test De-duplication behavior for HoodieWriteClient upsert API.
*/
@Test
public void testDeduplicationOnUpsert() throws Exception {
testDeduplication(SparkRDDWriteClient::upsert);
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testDeduplicationOnUpsert(boolean populateMetaFields) throws Exception {
testDeduplication(SparkRDDWriteClient::upsert, populateMetaFields);
}
/**
@@ -253,7 +290,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
* @throws Exception in case of failure
*/
private void testDeduplication(
Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn) throws Exception {
Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn, boolean populateMetaFields) throws Exception {
String newCommitTime = "001";
String recordKey = UUID.randomUUID().toString();
@@ -289,8 +326,11 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
// Perform write-action and check
JavaRDD<HoodieRecord> recordList = jsc.parallelize(Arrays.asList(recordOne, recordTwo, recordThree), 1);
try (SparkRDDWriteClient client = getHoodieWriteClient(getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY)
.combineInput(true, true).build());) {
HoodieWriteConfig.Builder configBuilder = getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY)
.combineInput(true, true);
addAppropriatePropsForPopulateMetaFields(configBuilder, populateMetaFields);
try (SparkRDDWriteClient client = getHoodieWriteClient(configBuilder.build());) {
client.startCommitWithTime(newCommitTime);
List<WriteStatus> statuses = writeFn.apply(client, recordList, newCommitTime).collect();
assertNoWriteErrors(statuses);
@@ -321,17 +361,23 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
/**
* Test Upsert API.
*/
@Test
public void testUpserts() throws Exception {
testUpsertsInternal(getConfig(), SparkRDDWriteClient::upsert, false);
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testUpserts(boolean populateMetaFields) throws Exception {
HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder();
addAppropriatePropsForPopulateMetaFields(cfgBuilder, populateMetaFields);
testUpsertsInternal(cfgBuilder.build(), SparkRDDWriteClient::upsert, false);
}
/**
* Test UpsertPrepped API.
*/
@Test
public void testUpsertsPrepped() throws Exception {
testUpsertsInternal(getConfig(), SparkRDDWriteClient::upsertPreppedRecords, true);
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testUpsertsPrepped(boolean populateMetaFields) throws Exception {
HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder();
addAppropriatePropsForPopulateMetaFields(cfgBuilder, populateMetaFields);
testUpsertsInternal(cfgBuilder.build(), SparkRDDWriteClient::upsertPreppedRecords, true);
}
/**
@@ -348,10 +394,12 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
HoodieWriteConfig hoodieWriteConfig = getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY)
.withProps(config.getProps()).withTimelineLayoutVersion(
VERSION_0).build();
HoodieTableMetaClient.withPropertyBuilder()
.fromMetaClient(metaClient)
.setTimelineLayoutVersion(VERSION_0)
.initTable(metaClient.getHadoopConf(), metaClient.getBasePath());
.setPopulateMetaFields(config.populateMetaFields())
.initTable(metaClient.getHadoopConf(), metaClient.getBasePath());
SparkRDDWriteClient client = getHoodieWriteClient(hoodieWriteConfig);
@@ -360,7 +408,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
String initCommitTime = "000";
int numRecords = 200;
insertFirstBatch(hoodieWriteConfig, client, newCommitTime, initCommitTime, numRecords, SparkRDDWriteClient::insert,
isPrepped, true, numRecords);
isPrepped, true, numRecords, config.populateMetaFields());
// Write 2 (updates)
String prevCommitTime = newCommitTime;
@@ -369,7 +417,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
String commitTimeBetweenPrevAndNew = "002";
updateBatch(hoodieWriteConfig, client, newCommitTime, prevCommitTime,
Option.of(Arrays.asList(commitTimeBetweenPrevAndNew)), initCommitTime, numRecords, writeFn, isPrepped, true,
numRecords, 200, 2);
numRecords, 200, 2, config.populateMetaFields());
// Delete 1
prevCommitTime = newCommitTime;
@@ -378,7 +426,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
deleteBatch(hoodieWriteConfig, client, newCommitTime, prevCommitTime,
initCommitTime, numRecords, SparkRDDWriteClient::delete, isPrepped, true,
0, 150);
0, 150, config.populateMetaFields());
// Now simulate an upgrade and perform a restore operation
HoodieWriteConfig newConfig = getConfigBuilder().withProps(config.getProps()).withTimelineLayoutVersion(
@@ -440,7 +488,9 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
try {
HoodieMergeHandle handle = new HoodieMergeHandle(cfg, instantTime, table, new HashMap<>(),
partitionPath, FSUtils.getFileId(baseFilePath.getName()), baseFile, new SparkTaskContextSupplier());
partitionPath, FSUtils.getFileId(baseFilePath.getName()), baseFile, new SparkTaskContextSupplier(),
config.populateMetaFields() ? Option.empty() :
Option.of((BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(config.getProps()))));
WriteStatus writeStatus = new WriteStatus(false, 0.0);
writeStatus.setStat(new HoodieWriteStat());
writeStatus.getStat().setNumWrites(0);
@@ -454,7 +504,9 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
cfg.getProps().setProperty("hoodie.merge.data.validation.enabled", "true");
HoodieWriteConfig cfg2 = HoodieWriteConfig.newBuilder().withProps(cfg.getProps()).build();
HoodieMergeHandle handle = new HoodieMergeHandle(cfg2, newInstantTime, table, new HashMap<>(),
partitionPath, FSUtils.getFileId(baseFilePath.getName()), baseFile, new SparkTaskContextSupplier());
partitionPath, FSUtils.getFileId(baseFilePath.getName()), baseFile, new SparkTaskContextSupplier(),
config.populateMetaFields() ? Option.empty() :
Option.of((BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(config.getProps()))));
WriteStatus writeStatus = new WriteStatus(false, 0.0);
writeStatus.setStat(new HoodieWriteStat());
writeStatus.getStat().setNumWrites(0);
@@ -470,17 +522,23 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
/**
* Test Insert API for HoodieConcatHandle.
*/
@Test
public void testInsertsWithHoodieConcatHandle() throws Exception {
testHoodieConcatHandle(getConfig(), false);
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testInsertsWithHoodieConcatHandle(boolean populateMetaFields) throws Exception {
HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder();
addAppropriatePropsForPopulateMetaFields(cfgBuilder, populateMetaFields);
testHoodieConcatHandle(cfgBuilder.build(), false);
}
/**
* Test InsertPrepped API for HoodieConcatHandle.
*/
@Test
public void testInsertsPreppedWithHoodieConcatHandle() throws Exception {
testHoodieConcatHandle(getConfig(), true);
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testInsertsPreppedWithHoodieConcatHandle(boolean populateMetaFields) throws Exception {
HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder();
addAppropriatePropsForPopulateMetaFields(cfgBuilder, populateMetaFields);
testHoodieConcatHandle(cfgBuilder.build(), true);
}
/**
@@ -507,7 +565,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
String initCommitTime = "000";
int numRecords = 200;
insertFirstBatch(hoodieWriteConfig, client, newCommitTime, initCommitTime, numRecords, SparkRDDWriteClient::insert,
isPrepped, true, numRecords);
isPrepped, true, numRecords, config.populateMetaFields());
// Write 2 (updates)
String prevCommitTime = newCommitTime;
@@ -520,15 +578,18 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
writeBatch(client, newCommitTime, prevCommitTime, Option.of(Arrays.asList(commitTimeBetweenPrevAndNew)), initCommitTime,
numRecords, recordGenFunction, SparkRDDWriteClient::insert, true, numRecords, 300,
2, false);
2, false, config.populateMetaFields());
}
/**
* Tests deletion of records.
*/
@Test
public void testDeletes() throws Exception {
SparkRDDWriteClient client = getHoodieWriteClient(getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY).build());
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testDeletes(boolean populateMetaFields) throws Exception {
HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY);
addAppropriatePropsForPopulateMetaFields(cfgBuilder, populateMetaFields);
SparkRDDWriteClient client = getHoodieWriteClient(cfgBuilder.build());
/**
* Write 1 (inserts and deletes) Write actual 200 insert records and ignore 100 delete records
*/
@@ -547,7 +608,8 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
};
writeBatch(client, newCommitTime, initCommitTime, Option.empty(), initCommitTime,
// unused as genFn uses hard-coded number of inserts/updates/deletes
-1, recordGenFunction, SparkRDDWriteClient::upsert, true, 200, 200, 1, false);
-1, recordGenFunction, SparkRDDWriteClient::upsert, true, 200, 200, 1, false,
populateMetaFields);
/**
* Write 2 (deletes+writes).
@@ -564,7 +626,8 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
return recordsInSecondBatch;
};
writeBatch(client, newCommitTime, prevCommitTime, Option.empty(), initCommitTime, 100, recordGenFunction,
SparkRDDWriteClient::upsert, true, 50, 150, 2, false);
SparkRDDWriteClient::upsert, true, 50, 150, 2, false,
populateMetaFields);
}
/**
@@ -572,9 +635,12 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
* not be available in read path.
* @throws Exception
*/
@Test
public void testDeletesForInsertsInSameBatch() throws Exception {
SparkRDDWriteClient client = getHoodieWriteClient(getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY).build());
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testDeletesForInsertsInSameBatch(boolean populateMetaFields) throws Exception {
HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY);
addAppropriatePropsForPopulateMetaFields(cfgBuilder, populateMetaFields);
SparkRDDWriteClient client = getHoodieWriteClient(cfgBuilder.build());
/**
* Write 200 inserts and issue deletes to a subset(50) of inserts.
*/
@@ -593,7 +659,8 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
};
writeBatch(client, newCommitTime, initCommitTime, Option.empty(), initCommitTime,
-1, recordGenFunction, SparkRDDWriteClient::upsert, true, 150, 150, 1, false);
-1, recordGenFunction, SparkRDDWriteClient::upsert, true, 150, 150, 1, false,
populateMetaFields);
}
/**
@@ -793,7 +860,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
Properties props = new Properties();
props.setProperty(ASYNC_CLUSTERING_ENABLE_OPT_KEY.key(), "true");
HoodieWriteConfig config = getSmallInsertWriteConfig(100,
TRIP_EXAMPLE_SCHEMA, dataGen.getEstimatedFileSizeInBytes(150), props);
TRIP_EXAMPLE_SCHEMA, dataGen.getEstimatedFileSizeInBytes(150), true, props);
SparkRDDWriteClient client = getHoodieWriteClient(config);
HoodieSparkCopyOnWriteTable table = (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, metaClient);
@@ -847,7 +914,10 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
final String testPartitionPath = "2016/09/26";
final int insertSplitLimit = 100;
// setup the small file handling params
HoodieWriteConfig config = getSmallInsertWriteConfig(insertSplitLimit); // hold upto 200 records max
// hold upto 200 records max
HoodieWriteConfig config = getSmallInsertWriteConfig(insertSplitLimit,
TRIP_EXAMPLE_SCHEMA, dataGen.getEstimatedFileSizeInBytes(150));
dataGen = new HoodieTestDataGenerator(new String[] {testPartitionPath});
SparkRDDWriteClient client = getHoodieWriteClient(config);
BaseFileUtils fileUtils = BaseFileUtils.getInstance(metaClient);
@@ -954,11 +1024,12 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
* Test scenario of new file-group getting added during insert().
*/
@ParameterizedTest
@MethodSource("configParams")
@MethodSource("smallInsertHandlingParams")
public void testSmallInsertHandlingForInserts(boolean mergeAllowDuplicateInserts) throws Exception {
final String testPartitionPath = "2016/09/26";
final int insertSplitLimit = 100;
// setup the small file handling params
HoodieWriteConfig config = getSmallInsertWriteConfig(insertSplitLimit, false, mergeAllowDuplicateInserts); // hold upto 200 records max
dataGen = new HoodieTestDataGenerator(new String[] {testPartitionPath});
SparkRDDWriteClient client = getHoodieWriteClient(config);
@@ -1039,7 +1110,9 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
final String testPartitionPath = "2016/09/26";
final int insertSplitLimit = 100;
// setup the small file handling params
HoodieWriteConfig config = getSmallInsertWriteConfig(insertSplitLimit); // hold upto 200 records max
// hold upto 200 records max
HoodieWriteConfig config = getSmallInsertWriteConfig(insertSplitLimit,
TRIP_EXAMPLE_SCHEMA, dataGen.getEstimatedFileSizeInBytes(150));
dataGen = new HoodieTestDataGenerator(new String[] {testPartitionPath});
SparkRDDWriteClient client = getHoodieWriteClient(config);
@@ -1100,31 +1173,34 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
testDeletes(client, updateBatch3.getRight(), 10, file1, "007", 140, keysSoFar);
}
@Test
public void testSimpleClustering() throws Exception {
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testSimpleClustering(boolean populateMetaFields) throws Exception {
// setup clustering config
HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder().withClusteringMaxNumGroups(10)
.withClusteringTargetPartitions(0).withInlineClusteringNumCommits(1).build();
testClustering(clusteringConfig);
testClustering(clusteringConfig, populateMetaFields);
}
@Test
public void testClusteringWithSortColumns() throws Exception {
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testClusteringWithSortColumns(boolean populateMetaFields) throws Exception {
// setup clustering config
HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder().withClusteringMaxNumGroups(10)
.withClusteringSortColumns("_hoodie_record_key")
.withClusteringSortColumns(populateMetaFields ? "_hoodie_record_key" : "_row_key")
.withClusteringTargetPartitions(0).withInlineClusteringNumCommits(1).build();
testClustering(clusteringConfig);
testClustering(clusteringConfig, populateMetaFields);
}
@Test
public void testPendingClusteringRollback() throws Exception {
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testPendingClusteringRollback(boolean populateMetaFields) throws Exception {
// setup clustering config
HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder().withClusteringMaxNumGroups(10)
.withClusteringTargetPartitions(0).withInlineClusteringNumCommits(1).build();
// start clustering, but dont commit
List<HoodieRecord> allRecords = testClustering(clusteringConfig, false);
// start clustering, but don't commit
List<HoodieRecord> allRecords = testClustering(clusteringConfig, populateMetaFields);
HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build();
List<Pair<HoodieInstant, HoodieClusteringPlan>> pendingClusteringPlans =
ClusteringUtils.getAllPendingClusteringPlans(metaClient).collect(Collectors.toList());
@@ -1132,7 +1208,9 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
HoodieInstant pendingClusteringInstant = pendingClusteringPlans.get(0).getLeft();
// complete another commit after pending clustering
HoodieWriteConfig config = getConfigBuilder(HoodieFailedWritesCleaningPolicy.EAGER).build();
HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(HoodieFailedWritesCleaningPolicy.EAGER);
addAppropriatePropsForPopulateMetaFields(cfgBuilder, populateMetaFields);
HoodieWriteConfig config = cfgBuilder.build();
SparkRDDWriteClient client = getHoodieWriteClient(config);
dataGen = new HoodieTestDataGenerator();
String commitTime = HoodieActiveTimeline.createNewInstantTime();
@@ -1146,13 +1224,14 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
assertEquals(0, ClusteringUtils.getAllPendingClusteringPlans(metaClient).count());
}
private List<HoodieRecord> testClustering(HoodieClusteringConfig clusteringConfig) throws Exception {
return testClustering(clusteringConfig, false);
private List<HoodieRecord> testClustering(HoodieClusteringConfig clusteringConfig, boolean populateMetaFields) throws Exception {
return testClustering(clusteringConfig, false, populateMetaFields);
}
private List<HoodieRecord> testClustering(HoodieClusteringConfig clusteringConfig, boolean completeClustering) throws Exception {
private List<HoodieRecord> testClustering(HoodieClusteringConfig clusteringConfig, boolean completeClustering, boolean populateMetaFields) throws Exception {
// create config to not update small files.
HoodieWriteConfig config = getSmallInsertWriteConfig(2000, false, 10);
HoodieWriteConfig config = getSmallInsertWriteConfig(2000, TRIP_EXAMPLE_SCHEMA, 10, false, populateMetaFields,
populateMetaFields ? new Properties() : getPropertiesForKeyGen());
SparkRDDWriteClient client = getHoodieWriteClient(config);
dataGen = new HoodieTestDataGenerator();
String commitTime = HoodieActiveTimeline.createNewInstantTime();
@@ -1170,14 +1249,15 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
assertEquals(0, fileIdIntersection.size());
config = getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY).withAutoCommit(completeClustering)
.withClusteringConfig(clusteringConfig).build();
.withClusteringConfig(clusteringConfig)
.withProps(populateMetaFields ? new Properties() : getPropertiesForKeyGen()).build();
// create client with new config.
client = getHoodieWriteClient(config);
String clusteringCommitTime = client.scheduleClustering(Option.empty()).get().toString();
HoodieWriteMetadata<JavaRDD<WriteStatus>> clusterMetadata = client.cluster(clusteringCommitTime, completeClustering);
List<HoodieRecord> allRecords = Stream.concat(records1.stream(), records2.stream()).collect(Collectors.toList());
verifyRecordsWritten(clusteringCommitTime, allRecords, clusterMetadata.getWriteStatuses().collect());
verifyRecordsWritten(clusteringCommitTime, allRecords, clusterMetadata.getWriteStatuses().collect(), config);
Set<HoodieFileGroupId> insertedFileIds = new HashSet<>();
insertedFileIds.addAll(fileIds1);
insertedFileIds.addAll(fileIds2);
@@ -1197,25 +1277,28 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
/**
* Test scenario of writing more file groups than existing number of file groups in partition.
*/
@Test
public void testInsertOverwritePartitionHandlingWithMoreRecords() throws Exception {
verifyInsertOverwritePartitionHandling(1000, 3000);
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testInsertOverwritePartitionHandlingWithMoreRecords(boolean populateMetaFields) throws Exception {
verifyInsertOverwritePartitionHandling(1000, 3000, populateMetaFields);
}
/**
* Test scenario of writing fewer file groups than existing number of file groups in partition.
*/
@Test
public void testInsertOverwritePartitionHandlingWithFewerRecords() throws Exception {
verifyInsertOverwritePartitionHandling(3000, 1000);
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testInsertOverwritePartitionHandlingWithFewerRecords(boolean populateMetaFields) throws Exception {
verifyInsertOverwritePartitionHandling(3000, 1000, populateMetaFields);
}
/**
* Test scenario of writing similar number file groups in partition.
*/
@Test
public void testInsertOverwritePartitionHandlingWithSimilarNumberOfRecords() throws Exception {
verifyInsertOverwritePartitionHandling(3000, 3000);
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testInsertOverwritePartitionHandlingWithSimilarNumberOfRecords(boolean populateMetaFields) throws Exception {
verifyInsertOverwritePartitionHandling(3000, 3000, populateMetaFields);
}
/**
@@ -1224,9 +1307,11 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
*
* Verify that all records in step1 are overwritten
*/
private void verifyInsertOverwritePartitionHandling(int batch1RecordsCount, int batch2RecordsCount) throws Exception {
private void verifyInsertOverwritePartitionHandling(int batch1RecordsCount, int batch2RecordsCount, boolean populateMetaFields) throws Exception {
final String testPartitionPath = "americas";
HoodieWriteConfig config = getSmallInsertWriteConfig(2000, false);
HoodieWriteConfig config = getSmallInsertWriteConfig(2000,
TRIP_EXAMPLE_SCHEMA, dataGen.getEstimatedFileSizeInBytes(150), populateMetaFields, populateMetaFields
? new Properties() : getPropertiesForKeyGen());
SparkRDDWriteClient client = getHoodieWriteClient(config);
dataGen = new HoodieTestDataGenerator(new String[] {testPartitionPath});
@@ -1247,7 +1332,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
assertNoWriteErrors(statuses);
assertEquals(batch1Buckets, new HashSet<>(writeResult.getPartitionToReplaceFileIds().get(testPartitionPath)));
verifyRecordsWritten(commitTime2, inserts2, statuses);
verifyRecordsWritten(commitTime2, inserts2, statuses, config);
}
private Set<String> getFileIdsFromWriteStatus(List<WriteStatus> statuses) {
@@ -1257,35 +1342,38 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
/**
* Test scenario of writing fewer file groups for first partition than second an third partition.
*/
@Test
public void verifyDeletePartitionsHandlingWithFewerRecordsFirstPartition() throws Exception {
verifyDeletePartitionsHandling(1000, 3000, 3000);
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void verifyDeletePartitionsHandlingWithFewerRecordsFirstPartition(boolean populateMetaFields) throws Exception {
verifyDeletePartitionsHandling(1000, 3000, 3000, populateMetaFields);
}
/**
* Test scenario of writing similar number file groups in partition.
*/
@Test
public void verifyDeletePartitionsHandlingWithSimilarNumberOfRecords() throws Exception {
verifyDeletePartitionsHandling(3000, 3000, 3000);
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void verifyDeletePartitionsHandlingWithSimilarNumberOfRecords(boolean populateMetaFields) throws Exception {
verifyDeletePartitionsHandling(3000, 3000, 3000, populateMetaFields);
}
/**
* Test scenario of writing more file groups for first partition than second an third partition.
*/
@Test
public void verifyDeletePartitionsHandlingHandlingWithFewerRecordsSecondThirdPartition() throws Exception {
verifyDeletePartitionsHandling(3000, 1000, 1000);
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void verifyDeletePartitionsHandlingHandlingWithFewerRecordsSecondThirdPartition(boolean populateMetaFields) throws Exception {
verifyDeletePartitionsHandling(3000, 1000, 1000, populateMetaFields);
}
private Set<String> insertPartitionRecordsWithCommit(SparkRDDWriteClient client, int recordsCount, String commitTime1, String partitionPath) {
private Set<String> insertPartitionRecordsWithCommit(SparkRDDWriteClient client, int recordsCount, String commitTime1, String partitionPath) throws IOException {
client.startCommitWithTime(commitTime1);
List<HoodieRecord> inserts1 = dataGen.generateInsertsForPartition(commitTime1, recordsCount, partitionPath);
JavaRDD<HoodieRecord> insertRecordsRDD1 = jsc.parallelize(inserts1, 2);
List<WriteStatus> statuses = client.upsert(insertRecordsRDD1, commitTime1).collect();
assertNoWriteErrors(statuses);
Set<String> batchBuckets = statuses.stream().map(s -> s.getFileId()).collect(Collectors.toSet());
verifyRecordsWritten(commitTime1, inserts1, statuses);
verifyRecordsWritten(commitTime1, inserts1, statuses, client.config);
return batchBuckets;
}
@@ -1306,8 +1394,11 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
* 5) delete second and third partition and check result.
*
*/
private void verifyDeletePartitionsHandling(int batch1RecordsCount, int batch2RecordsCount, int batch3RecordsCount) throws Exception {
HoodieWriteConfig config = getSmallInsertWriteConfig(2000, false);
private void verifyDeletePartitionsHandling(int batch1RecordsCount, int batch2RecordsCount, int batch3RecordsCount,
boolean populateMetaFields) throws Exception {
HoodieWriteConfig config = getSmallInsertWriteConfig(2000,
TRIP_EXAMPLE_SCHEMA, dataGen.getEstimatedFileSizeInBytes(150), populateMetaFields, populateMetaFields
? new Properties() : getPropertiesForKeyGen());
SparkRDDWriteClient client = getHoodieWriteClient(config);
dataGen = new HoodieTestDataGenerator();
@@ -1360,7 +1451,8 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
/**
* Verify data in base files matches expected records and commit time.
*/
private void verifyRecordsWritten(String commitTime, List<HoodieRecord> expectedRecords, List<WriteStatus> allStatus) {
private void verifyRecordsWritten(String commitTime, List<HoodieRecord> expectedRecords, List<WriteStatus> allStatus,
HoodieWriteConfig config) throws IOException {
List<GenericRecord> records = new ArrayList<>();
for (WriteStatus status : allStatus) {
Path filePath = new Path(basePath, status.getStat().getPath());
@@ -1369,20 +1461,29 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
Set<String> expectedKeys = recordsToRecordKeySet(expectedRecords);
assertEquals(records.size(), expectedKeys.size());
for (GenericRecord record : records) {
String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
assertEquals(commitTime,
record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString());
assertTrue(expectedKeys.contains(recordKey));
if (config.populateMetaFields()) {
for (GenericRecord record : records) {
String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
assertEquals(commitTime,
record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString());
assertTrue(expectedKeys.contains(recordKey));
}
} else {
KeyGenerator keyGenerator = HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(config.getProps()));
for (GenericRecord record : records) {
String recordKey = keyGenerator.getKey(record).getRecordKey();
assertNull(record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD));
assertTrue(expectedKeys.contains(recordKey));
}
}
}
private List<WriteStatus> writeAndVerifyBatch(SparkRDDWriteClient client, List<HoodieRecord> inserts, String commitTime) {
private List<WriteStatus> writeAndVerifyBatch(SparkRDDWriteClient client, List<HoodieRecord> inserts, String commitTime) throws IOException {
client.startCommitWithTime(commitTime);
JavaRDD<HoodieRecord> insertRecordsRDD1 = jsc.parallelize(inserts, 2);
List<WriteStatus> statuses = client.upsert(insertRecordsRDD1, commitTime).collect();
assertNoWriteErrors(statuses);
verifyRecordsWritten(commitTime, inserts, statuses);
verifyRecordsWritten(commitTime, inserts, statuses, client.config);
return statuses;
}
@@ -1449,12 +1550,15 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
/**
* Test delete with delete api.
*/
@Test
public void testDeletesWithoutInserts() {
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testDeletesWithoutInserts(boolean populateMetaFields) {
final String testPartitionPath = "2016/09/26";
final int insertSplitLimit = 100;
// setup the small file handling params
HoodieWriteConfig config = getSmallInsertWriteConfig(insertSplitLimit, true); // hold upto 200 records max
HoodieWriteConfig config = getSmallInsertWriteConfig(insertSplitLimit,
TRIP_EXAMPLE_SCHEMA, dataGen.getEstimatedFileSizeInBytes(150), populateMetaFields, populateMetaFields
? new Properties() : getPropertiesForKeyGen());
dataGen = new HoodieTestDataGenerator(new String[] {testPartitionPath});
SparkRDDWriteClient client = getHoodieWriteClient(config);
@@ -1473,13 +1577,15 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
/**
* Test to ensure commit metadata points to valid files.
*/
@Test
public void testCommitWritesRelativePaths() throws Exception {
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testCommitWritesRelativePaths(boolean populateMetaFields) throws Exception {
HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).build();
try (SparkRDDWriteClient client = getHoodieWriteClient(cfg);) {
HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder().withAutoCommit(false);
addAppropriatePropsForPopulateMetaFields(cfgBuilder, populateMetaFields);
try (SparkRDDWriteClient client = getHoodieWriteClient(cfgBuilder.build());) {
HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build();
HoodieSparkTable table = HoodieSparkTable.create(cfg, context, metaClient);
HoodieSparkTable table = HoodieSparkTable.create(cfgBuilder.build(), context, metaClient);
String instantTime = "000";
client.startCommitWithTime(instantTime);
@@ -1518,9 +1624,12 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
/**
* Test to ensure commit metadata points to valid files.10.
*/
@Test
public void testMetadataStatsOnCommit() throws Exception {
HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).build();
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testMetadataStatsOnCommit(boolean populateMetaFields) throws Exception {
HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder().withAutoCommit(false);
addAppropriatePropsForPopulateMetaFields(cfgBuilder, populateMetaFields);
HoodieWriteConfig cfg = cfgBuilder.build();
SparkRDDWriteClient client = getHoodieWriteClient(cfg);
String instantTime0 = "000";
@@ -1607,16 +1716,24 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
}
}
private void testRollbackAfterConsistencyCheckFailureUsingFileList(boolean rollbackUsingMarkers, boolean enableOptimisticConsistencyGuard) throws Exception {
private void testRollbackAfterConsistencyCheckFailureUsingFileList(boolean rollbackUsingMarkers, boolean enableOptimisticConsistencyGuard,
boolean populateMetaFields) throws Exception {
String instantTime = "000";
HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build();
Properties properties = new Properties();
if (!populateMetaFields) {
properties = getPropertiesForKeyGen();
}
HoodieWriteConfig cfg = !enableOptimisticConsistencyGuard ? getConfigBuilder().withRollbackUsingMarkers(rollbackUsingMarkers).withAutoCommit(false)
.withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true)
.withMaxConsistencyCheckIntervalMs(1).withInitialConsistencyCheckIntervalMs(1).withEnableOptimisticConsistencyGuard(enableOptimisticConsistencyGuard).build()).build() :
getConfigBuilder().withRollbackUsingMarkers(rollbackUsingMarkers).withAutoCommit(false)
.withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder()
.withConsistencyCheckEnabled(true)
.withOptimisticConsistencyGuardSleepTimeMs(1).build()).build();
.withOptimisticConsistencyGuardSleepTimeMs(1).build())
.withProperties(properties).build();
SparkRDDWriteClient client = getHoodieWriteClient(cfg);
testConsistencyCheck(metaClient, instantTime, enableOptimisticConsistencyGuard);
@@ -1651,28 +1768,28 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
}
@ParameterizedTest
@ValueSource(booleans = {true, false})
public void testRollbackAfterConsistencyCheckFailureUsingFileList(boolean enableOptimisticConsistencyGuard) throws Exception {
testRollbackAfterConsistencyCheckFailureUsingFileList(false, enableOptimisticConsistencyGuard);
@MethodSource("rollbackAfterConsistencyCheckFailureParams")
public void testRollbackAfterConsistencyCheckFailureUsingFileList(boolean enableOptimisticConsistencyGuard, boolean populateMetCols) throws Exception {
testRollbackAfterConsistencyCheckFailureUsingFileList(false, enableOptimisticConsistencyGuard, populateMetCols);
}
@ParameterizedTest
@ValueSource(booleans = {true, false})
public void testRollbackAfterConsistencyCheckFailureUsingMarkers(boolean enableOptimisticConsistencyGuard) throws Exception {
testRollbackAfterConsistencyCheckFailureUsingFileList(true, enableOptimisticConsistencyGuard);
@MethodSource("rollbackAfterConsistencyCheckFailureParams")
public void testRollbackAfterConsistencyCheckFailureUsingMarkers(boolean enableOptimisticConsistencyGuard, boolean populateMetCols) throws Exception {
testRollbackAfterConsistencyCheckFailureUsingFileList(true, enableOptimisticConsistencyGuard, populateMetCols);
}
@ParameterizedTest
@EnumSource(value = HoodieFailedWritesCleaningPolicy.class, names = {"LAZY", "NEVER"})
public void testRollbackFailedCommits(HoodieFailedWritesCleaningPolicy cleaningPolicy) throws Exception {
@MethodSource("rollbackFailedCommitsParams")
public void testRollbackFailedCommits(HoodieFailedWritesCleaningPolicy cleaningPolicy, boolean populateMetaFields) throws Exception {
HoodieTestUtils.init(hadoopConf, basePath);
// Perform 2 failed writes to table
SparkRDDWriteClient client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy));
SparkRDDWriteClient client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
writeBatch(client, "100", "100", Option.of(Arrays.asList("100")), "100",
100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300,
0, false);
client.close();
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy));
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
writeBatch(client, "200", "100", Option.of(Arrays.asList("200")), "200",
100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300,
0, false);
@@ -1680,7 +1797,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
// refresh data generator to delete records generated from failed commits
dataGen = new HoodieTestDataGenerator();
// Perform 1 successful write
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy));
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
writeBatch(client, "300", "200", Option.of(Arrays.asList("300")), "300",
100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300,
0, true);
@@ -1696,7 +1813,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
conditionMet = client.getHeartbeatClient().isHeartbeatExpired("200");
Thread.sleep(2000);
}
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy));
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
// Perform 1 successful write
writeBatch(client, "400", "300", Option.of(Arrays.asList("400")), "400",
100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300,
@@ -1732,11 +1849,12 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
}
}
@Test
public void testRollbackFailedCommitsToggleCleaningPolicy() throws Exception {
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testRollbackFailedCommitsToggleCleaningPolicy(boolean populateMetaFields) throws Exception {
HoodieTestUtils.init(hadoopConf, basePath);
HoodieFailedWritesCleaningPolicy cleaningPolicy = HoodieFailedWritesCleaningPolicy.EAGER;
SparkRDDWriteClient client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy));
SparkRDDWriteClient client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
// Perform 1 failed writes to table
writeBatch(client, "100", "100", Option.of(Arrays.asList("100")), "100",
100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300,
@@ -1745,12 +1863,12 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
// Toggle cleaning policy to LAZY
cleaningPolicy = HoodieFailedWritesCleaningPolicy.LAZY;
// Perform 2 failed writes to table
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy));
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
writeBatch(client, "200", "100", Option.of(Arrays.asList("200")), "200",
100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300,
0, false);
client.close();
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy));
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
writeBatch(client, "300", "200", Option.of(Arrays.asList("300")), "300",
100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300,
0, false);
@@ -1766,19 +1884,19 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
assertTrue(timeline.getTimelineOfActions(
CollectionUtils.createSet(HoodieTimeline.ROLLBACK_ACTION)).countInstants() == 3);
// Perform 2 failed commits
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy));
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
writeBatch(client, "400", "300", Option.of(Arrays.asList("400")), "400",
100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300,
0, false);
client.close();
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy));
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
writeBatch(client, "500", "400", Option.of(Arrays.asList("500")), "500",
100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300,
0, false);
client.close();
// Toggle cleaning policy to EAGER
cleaningPolicy = HoodieFailedWritesCleaningPolicy.EAGER;
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy));
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
client.startCommit();
timeline = metaClient.getActiveTimeline().reload();
assertTrue(timeline.getTimelineOfActions(
@@ -1786,18 +1904,19 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
assertTrue(timeline.getCommitsTimeline().filterCompletedInstants().countInstants() == 0);
}
@Test
public void testParallelInsertAndCleanPreviousFailedCommits() throws Exception {
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testParallelInsertAndCleanPreviousFailedCommits(boolean populateMetaFields) throws Exception {
HoodieFailedWritesCleaningPolicy cleaningPolicy = HoodieFailedWritesCleaningPolicy.LAZY;
ExecutorService service = Executors.newFixedThreadPool(2);
HoodieTestUtils.init(hadoopConf, basePath);
// Perform 2 failed writes to table
SparkRDDWriteClient client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy));
SparkRDDWriteClient client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
writeBatch(client, "100", "100", Option.of(Arrays.asList("100")), "100",
100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 100,
0, false);
client.close();
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy));
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
writeBatch(client, "200", "200", Option.of(Arrays.asList("200")), "200",
100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 100,
0, false);
@@ -1805,7 +1924,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
// refresh data generator to delete records generated from failed commits
dataGen = new HoodieTestDataGenerator();
// Create a succesful commit
Future<JavaRDD<WriteStatus>> commit3 = service.submit(() -> writeBatch(new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy)),
Future<JavaRDD<WriteStatus>> commit3 = service.submit(() -> writeBatch(new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields)),
"300", "200", Option.of(Arrays.asList("300")), "200", 100, dataGen::generateInserts,
SparkRDDWriteClient::bulkInsert, false, 100, 100, 0, true));
commit3.get();
@@ -1815,17 +1934,17 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
CollectionUtils.createSet(HoodieTimeline.ROLLBACK_ACTION)).countInstants() == 0);
assertTrue(metaClient.getActiveTimeline().filterInflights().countInstants() == 2);
assertTrue(metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().countInstants() == 1);
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy));
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
// Await till enough time passes such that the first 2 failed commits heartbeats are expired
boolean conditionMet = false;
while (!conditionMet) {
conditionMet = client.getHeartbeatClient().isHeartbeatExpired("200");
Thread.sleep(2000);
}
Future<JavaRDD<WriteStatus>> commit4 = service.submit(() -> writeBatch(new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy)),
Future<JavaRDD<WriteStatus>> commit4 = service.submit(() -> writeBatch(new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields)),
"400", "300", Option.of(Arrays.asList("400")), "400", 100, dataGen::generateInserts,
SparkRDDWriteClient::bulkInsert, false, 100, 100, 0, true));
Future<HoodieCleanMetadata> clean1 = service.submit(() -> new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy)).clean());
Future<HoodieCleanMetadata> clean1 = service.submit(() -> new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields)).clean());
commit4.get();
clean1.get();
HoodieActiveTimeline timeline = metaClient.getActiveTimeline().reload();
@@ -1878,11 +1997,13 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
return Pair.of(markerFilePath, result);
}
@Test
public void testMultiOperationsPerCommit() throws IOException {
HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false)
.withAllowMultiWriteOnSameInstant(true)
.build();
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testMultiOperationsPerCommit(boolean populateMetaFields) throws IOException {
HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder().withAutoCommit(false)
.withAllowMultiWriteOnSameInstant(true);
addAppropriatePropsForPopulateMetaFields(cfgBuilder, populateMetaFields);
HoodieWriteConfig cfg = cfgBuilder.build();
SparkRDDWriteClient client = getHoodieWriteClient(cfg);
String firstInstantTime = "0000";
client.startCommitWithTime(firstInstantTime);
@@ -1957,16 +2078,19 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
}
private HoodieWriteConfig getSmallInsertWriteConfig(int insertSplitSize, String schemaStr, long smallFileSize, boolean mergeAllowDuplicateInserts) {
return getSmallInsertWriteConfig(insertSplitSize, schemaStr, smallFileSize, mergeAllowDuplicateInserts, new Properties());
return getSmallInsertWriteConfig(insertSplitSize, schemaStr, smallFileSize, mergeAllowDuplicateInserts, true, new Properties());
}
private HoodieWriteConfig getSmallInsertWriteConfig(int insertSplitSize, String schemaStr, long smallFileSize, Properties props) {
return getSmallInsertWriteConfig(insertSplitSize, schemaStr, smallFileSize, false, props);
private HoodieWriteConfig getSmallInsertWriteConfig(int insertSplitSize, String schemaStr, long smallFileSize, boolean populateMetaFields, Properties props) {
return getSmallInsertWriteConfig(insertSplitSize, schemaStr, smallFileSize, false, populateMetaFields, props);
}
private HoodieWriteConfig getSmallInsertWriteConfig(int insertSplitSize, String schemaStr, long smallFileSize, boolean mergeAllowDuplicateInserts,
Properties props) {
boolean populateMetaFields, Properties props) {
HoodieWriteConfig.Builder builder = getConfigBuilder(schemaStr);
if (!populateMetaFields) {
builder.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(IndexType.SIMPLE).build());
}
return builder
.withCompactionConfig(
HoodieCompactionConfig.newBuilder()
@@ -1994,7 +2118,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
return clusteringInstant;
}
private HoodieWriteConfig getParallelWritingWriteConfig(HoodieFailedWritesCleaningPolicy cleaningPolicy) {
private HoodieWriteConfig getParallelWritingWriteConfig(HoodieFailedWritesCleaningPolicy cleaningPolicy, boolean populateMetaFields) {
return getConfigBuilder()
.withEmbeddedTimelineServerEnabled(false)
.withCompactionConfig(HoodieCompactionConfig.newBuilder()
@@ -2002,7 +2126,8 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
.withAutoClean(false).build())
.withTimelineLayoutVersion(1)
.withHeartbeatIntervalInMs(3 * 1000)
.withAutoCommit(false).build();
.withAutoCommit(false)
.withProperties(populateMetaFields ? new Properties() : getPropertiesForKeyGen()).build();
}
}

View File

@@ -26,6 +26,7 @@ import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.testutils.HoodieTestUtils;
import org.apache.hudi.common.testutils.RawTripTestPayload;
import org.apache.hudi.common.util.BaseFileUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieUpsertException;
import org.apache.hudi.io.HoodieCreateHandle;
@@ -121,7 +122,7 @@ public class TestUpdateSchemaEvolution extends HoodieClientTestHarness {
jsc.parallelize(Arrays.asList(1)).map(x -> {
Executable executable = () -> {
HoodieMergeHandle mergeHandle = new HoodieMergeHandle(updateTable.getConfig(), "101", updateTable,
updateRecords.iterator(), updateRecords.get(0).getPartitionPath(), insertResult.getFileId(), supplier);
updateRecords.iterator(), updateRecords.get(0).getPartitionPath(), insertResult.getFileId(), supplier, Option.empty());
List<GenericRecord> oldRecords = BaseFileUtils.getInstance(updateTable.getBaseFileFormat())
.readAvroRecords(updateTable.getHadoopConf(),
new Path(updateTable.getConfig().getBasePath() + "/" + insertResult.getStat().getPath()),

View File

@@ -23,10 +23,12 @@ import org.apache.hudi.common.fs.ConsistencyGuardConfig;
import org.apache.hudi.common.model.EmptyHoodieRecordPayload;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.view.FileSystemViewStorageConfig;
import org.apache.hudi.common.table.view.FileSystemViewStorageType;
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
import org.apache.hudi.common.testutils.HoodieTestUtils;
import org.apache.hudi.common.testutils.RawTripTestPayload;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
@@ -47,8 +49,10 @@ import org.apache.hadoop.fs.Path;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.EnumSource;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;
import java.io.IOException;
import java.util.Arrays;
@@ -56,8 +60,10 @@ import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Random;
import java.util.UUID;
import java.util.stream.Stream;
import scala.Tuple2;
@@ -69,16 +75,34 @@ import static org.junit.jupiter.api.Assertions.fail;
public class TestHoodieIndex extends HoodieClientTestHarness {
private static Stream<Arguments> indexTypeParams() {
Object[][] data = new Object[][] {
{IndexType.BLOOM, true},
{IndexType.GLOBAL_BLOOM, true},
{IndexType.SIMPLE, true},
{IndexType.GLOBAL_SIMPLE, true},
{IndexType.SIMPLE, false},
{IndexType.GLOBAL_SIMPLE, false}
};
return Stream.of(data).map(Arguments::of);
}
private static final Schema SCHEMA = getSchemaFromResource(TestHoodieIndex.class, "/exampleSchema.avsc", true);
private final Random random = new Random();
private IndexType indexType;
private HoodieIndex index;
private HoodieWriteConfig config;
private void setUp(IndexType indexType) throws Exception {
private void setUp(IndexType indexType, boolean populateMetaFields) throws Exception {
this.indexType = indexType;
initResources();
initPath();
initSparkContexts();
initTestDataGenerator();
initFileSystem();
metaClient = HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.COPY_ON_WRITE, populateMetaFields ? new Properties()
: getPropertiesForKeyGen());
config = getConfigBuilder()
.withProperties(populateMetaFields ? new Properties() : getPropertiesForKeyGen())
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(indexType)
.build()).withAutoCommit(false).build();
writeClient = getHoodieWriteClient(config);
@@ -91,9 +115,9 @@ public class TestHoodieIndex extends HoodieClientTestHarness {
}
@ParameterizedTest
@EnumSource(value = IndexType.class, names = {"BLOOM", "GLOBAL_BLOOM", "SIMPLE", "GLOBAL_SIMPLE"})
public void testSimpleTagLocationAndUpdate(IndexType indexType) throws Exception {
setUp(indexType);
@MethodSource("indexTypeParams")
public void testSimpleTagLocationAndUpdate(IndexType indexType, boolean populateMetaFields) throws Exception {
setUp(indexType, populateMetaFields);
String newCommitTime = "001";
int totalRecords = 10 + random.nextInt(20);
List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, totalRecords);
@@ -141,9 +165,9 @@ public class TestHoodieIndex extends HoodieClientTestHarness {
}
@ParameterizedTest
@EnumSource(value = IndexType.class, names = {"BLOOM", "GLOBAL_BLOOM", "SIMPLE", "GLOBAL_SIMPLE"})
public void testTagLocationAndDuplicateUpdate(IndexType indexType) throws Exception {
setUp(indexType);
@MethodSource("indexTypeParams")
public void testTagLocationAndDuplicateUpdate(IndexType indexType, boolean populateMetaFields) throws Exception {
setUp(indexType, populateMetaFields);
String newCommitTime = "001";
int totalRecords = 10 + random.nextInt(20);
List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, totalRecords);
@@ -191,9 +215,9 @@ public class TestHoodieIndex extends HoodieClientTestHarness {
}
@ParameterizedTest
@EnumSource(value = IndexType.class, names = {"BLOOM", "GLOBAL_BLOOM", "SIMPLE", "GLOBAL_SIMPLE"})
public void testSimpleTagLocationAndUpdateWithRollback(IndexType indexType) throws Exception {
setUp(indexType);
@MethodSource("indexTypeParams")
public void testSimpleTagLocationAndUpdateWithRollback(IndexType indexType, boolean populateMetaFields) throws Exception {
setUp(indexType, populateMetaFields);
String newCommitTime = writeClient.startCommit();
int totalRecords = 20 + random.nextInt(20);
List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, totalRecords);
@@ -242,10 +266,18 @@ public class TestHoodieIndex extends HoodieClientTestHarness {
assert (javaRDD.filter(record -> record.getCurrentLocation() != null).collect().size() == 0);
}
private static Stream<Arguments> regularIndexTypeParams() {
Object[][] data = new Object[][] {
{IndexType.BLOOM, true},
{IndexType.SIMPLE, true}
};
return Stream.of(data).map(Arguments::of);
}
@ParameterizedTest
@EnumSource(value = IndexType.class, names = {"BLOOM", "SIMPLE",})
public void testTagLocationAndFetchRecordLocations(IndexType indexType) throws Exception {
setUp(indexType);
@MethodSource("regularIndexTypeParams")
public void testTagLocationAndFetchRecordLocations(IndexType indexType, boolean populateMetaFields) throws Exception {
setUp(indexType, populateMetaFields);
String p1 = "2016/01/31";
String p2 = "2015/01/31";
String rowKey1 = UUID.randomUUID().toString();
@@ -325,10 +357,9 @@ public class TestHoodieIndex extends HoodieClientTestHarness {
}
}
@ParameterizedTest
@EnumSource(value = IndexType.class, names = {"GLOBAL_SIMPLE"})
public void testSimpleGlobalIndexTagLocationWhenShouldUpdatePartitionPath(IndexType indexType) throws Exception {
setUp(indexType);
@Test
public void testSimpleGlobalIndexTagLocationWhenShouldUpdatePartitionPath() throws Exception {
setUp(IndexType.GLOBAL_SIMPLE, true);
config = getConfigBuilder()
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(indexType)
.withGlobalSimpleIndexUpdatePartitionPath(true)

View File

@@ -18,20 +18,26 @@
package org.apache.hudi.io;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.fs.ConsistencyGuardConfig;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordLocation;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.table.view.FileSystemViewStorageConfig;
import org.apache.hudi.common.table.view.FileSystemViewStorageType;
import org.apache.hudi.common.testutils.HoodieTestUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieCompactionConfig;
import org.apache.hudi.config.HoodieIndexConfig;
import org.apache.hudi.config.HoodieStorageConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.index.HoodieIndexUtils;
import org.apache.hudi.keygen.BaseKeyGenerator;
import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory;
import org.apache.hudi.table.HoodieSparkTable;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.testutils.HoodieClientTestHarness;
@@ -40,7 +46,8 @@ import org.apache.hudi.testutils.MetadataMergeWriteStatus;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;
import java.io.IOException;
import java.util.ArrayList;
@@ -48,6 +55,7 @@ import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import scala.Tuple2;
@@ -71,10 +79,6 @@ public class TestHoodieKeyLocationFetchHandle extends HoodieClientTestHarness {
initPath();
initTestDataGenerator();
initFileSystem();
initMetaClient();
config = getConfigBuilder()
.withIndexConfig(HoodieIndexConfig.newBuilder()
.build()).build();
}
@AfterEach
@@ -82,8 +86,15 @@ public class TestHoodieKeyLocationFetchHandle extends HoodieClientTestHarness {
cleanupResources();
}
@Test
public void testFetchHandle() throws Exception {
@ParameterizedTest
@ValueSource(booleans = {true, false})
public void testFetchHandle(boolean populateMetaFields) throws Exception {
metaClient = HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.COPY_ON_WRITE, populateMetaFields ? new Properties() : getPropertiesForKeyGen());
config = getConfigBuilder()
.withProperties(getPropertiesForKeyGen())
.withIndexConfig(HoodieIndexConfig.newBuilder()
.build()).build();
List<HoodieRecord> records = dataGen.generateInserts(makeNewCommitTime(), 100);
Map<String, List<HoodieRecord>> partitionRecordsMap = recordsToPartitionRecordsMap(records);
HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
@@ -93,8 +104,11 @@ public class TestHoodieKeyLocationFetchHandle extends HoodieClientTestHarness {
List<Tuple2<String, HoodieBaseFile>> partitionPathFileIdPairs = loadAllFilesForPartitions(new ArrayList<>(partitionRecordsMap.keySet()), context, hoodieTable);
BaseKeyGenerator keyGenerator = (BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(getPropertiesForKeyGen()));
for (Tuple2<String, HoodieBaseFile> entry : partitionPathFileIdPairs) {
HoodieKeyLocationFetchHandle fetcherHandle = new HoodieKeyLocationFetchHandle(config, hoodieTable, Pair.of(entry._1, entry._2));
HoodieKeyLocationFetchHandle fetcherHandle = new HoodieKeyLocationFetchHandle(config, hoodieTable, Pair.of(entry._1, entry._2),
populateMetaFields ? Option.empty() : Option.of(keyGenerator));
Iterator<Pair<HoodieKey, HoodieRecordLocation>> result = fetcherHandle.locations().iterator();
List<Tuple2<HoodieKey, HoodieRecordLocation>> actualList = new ArrayList<>();
result.forEachRemaining(x -> actualList.add(new Tuple2<>(x.getLeft(), x.getRight())));

View File

@@ -43,6 +43,7 @@ import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
import org.apache.hudi.common.table.view.TableFileSystemView;
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
import org.apache.hudi.common.testutils.HoodieTestTable;
import org.apache.hudi.common.testutils.HoodieTestUtils;
import org.apache.hudi.common.util.HoodieTimer;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieCompactionConfig;
@@ -60,10 +61,10 @@ import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaRDD;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.EnumSource;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;
import java.io.IOException;
import java.nio.file.Files;
@@ -71,7 +72,9 @@ import java.nio.file.Paths;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Properties;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA;
import static org.junit.jupiter.api.Assertions.assertEquals;
@@ -91,16 +94,15 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
private HoodieTableType tableType;
public void init(HoodieTableType tableType) throws IOException {
public void init(HoodieTableType tableType, boolean populateMetaFields) throws IOException {
this.tableType = tableType;
initPath();
initSparkContexts("TestHoodieMetadata");
initFileSystem();
fs.mkdirs(new Path(basePath));
initMetaClient(tableType);
metaClient = HoodieTestUtils.init(hadoopConf, basePath, tableType, populateMetaFields ? new Properties() : getPropertiesForKeyGen());
initTestDataGenerator();
metadataTableBasePath = HoodieTableMetadata.getMetadataTableBasePath(basePath);
}
@AfterEach
@@ -108,12 +110,25 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
cleanupResources();
}
private static Stream<Arguments> populateMetaFieldsParams() {
return Arrays.stream(new Boolean[][] {{true}, {false}}).map(Arguments::of);
}
private static Stream<Arguments> tableTypePopulateMetaFieldsParams() {
return Stream.of(
Arguments.of(HoodieTableType.COPY_ON_WRITE, true),
Arguments.of(HoodieTableType.COPY_ON_WRITE, false),
Arguments.of(HoodieTableType.MERGE_ON_READ, true)
);
}
/**
* Metadata Table bootstrap scenarios.
*/
@Test
public void testMetadataTableBootstrap() throws Exception {
init(HoodieTableType.COPY_ON_WRITE);
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testMetadataTableBootstrap(boolean populateMetaFields) throws Exception {
init(HoodieTableType.COPY_ON_WRITE, populateMetaFields);
HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
// Metadata table should not exist until created for the first time
@@ -122,7 +137,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
// Metadata table is not created if disabled by config
String firstCommitTime = HoodieActiveTimeline.createNewInstantTime();
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false))) {
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false, populateMetaFields))) {
client.startCommitWithTime(firstCommitTime);
client.insert(jsc.parallelize(dataGen.generateInserts(firstCommitTime, 5)), firstCommitTime);
assertFalse(fs.exists(new Path(metadataTableBasePath)), "Metadata table should not be created");
@@ -131,7 +146,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
// Metadata table should not be created if any non-complete instants are present
String secondCommitTime = HoodieActiveTimeline.createNewInstantTime();
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(false, true), true)) {
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(false, true, populateMetaFields), true)) {
client.startCommitWithTime(secondCommitTime);
client.insert(jsc.parallelize(dataGen.generateUpdates(secondCommitTime, 2)), secondCommitTime);
// AutoCommit is false so no bootstrap
@@ -144,7 +159,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
// Metadata table created when enabled by config & sync is called
secondCommitTime = HoodieActiveTimeline.createNewInstantTime();
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true), true)) {
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true, populateMetaFields), true)) {
client.startCommitWithTime(secondCommitTime);
client.insert(jsc.parallelize(dataGen.generateUpdates(secondCommitTime, 2)), secondCommitTime);
client.syncTableMetadata();
@@ -167,7 +182,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
});
String thirdCommitTime = HoodieActiveTimeline.createNewInstantTime();
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true), true)) {
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true, populateMetaFields), true)) {
client.startCommitWithTime(thirdCommitTime);
client.insert(jsc.parallelize(dataGen.generateUpdates(thirdCommitTime, 2)), thirdCommitTime);
client.syncTableMetadata();
@@ -184,10 +199,11 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
/**
* Only valid partition directories are added to the metadata.
*/
@Test
public void testOnlyValidPartitionsAdded() throws Exception {
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testOnlyValidPartitionsAdded(boolean populateMetaFields) throws Exception {
// This test requires local file system
init(HoodieTableType.COPY_ON_WRITE);
init(HoodieTableType.COPY_ON_WRITE, populateMetaFields);
HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
// Create an empty directory which is not a partition directory (lacks partition metadata)
@@ -207,7 +223,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
.addCommit("002").withBaseFilesInPartition("p1", 10).withBaseFilesInPartition("p2", 10, 10, 10);
final HoodieWriteConfig writeConfig =
getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy.NEVER, true, true, false)
getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy.NEVER, true, true, false, populateMetaFields)
.withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).withDirectoryFilterRegex(filterDirRegex).build()).build();
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, writeConfig)) {
client.startCommitWithTime("005");
@@ -237,12 +253,12 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
* Test various table operations sync to Metadata Table correctly.
*/
@ParameterizedTest
@EnumSource(HoodieTableType.class)
public void testTableOperations(HoodieTableType tableType) throws Exception {
init(tableType);
@MethodSource("tableTypePopulateMetaFieldsParams")
public void testTableOperations(HoodieTableType tableType, boolean populateMetaFields) throws Exception {
init(tableType, populateMetaFields);
HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) {
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true, populateMetaFields))) {
// Write 1 (Bulk insert)
String newCommitTime = "001";
@@ -325,12 +341,12 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
* Test rollback of various table operations sync to Metadata Table correctly.
*/
@ParameterizedTest
@EnumSource(HoodieTableType.class)
public void testRollbackOperations(HoodieTableType tableType) throws Exception {
init(tableType);
@MethodSource("tableTypePopulateMetaFieldsParams")
public void testRollbackOperations(HoodieTableType tableType, boolean populateMetaFields) throws Exception {
init(tableType, populateMetaFields);
HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) {
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true, populateMetaFields))) {
// Write 1 (Bulk insert)
String newCommitTime = HoodieActiveTimeline.createNewInstantTime();
List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 20);
@@ -403,7 +419,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
// Rollback of partial commits
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext,
getWriteConfigBuilder(false, true, false).withRollbackUsingMarkers(false).build())) {
getWriteConfigBuilder(false, true, false, populateMetaFields).withRollbackUsingMarkers(false).build())) {
// Write updates and inserts
String newCommitTime = HoodieActiveTimeline.createNewInstantTime();
client.startCommitWithTime(newCommitTime);
@@ -417,7 +433,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
// Marker based rollback of partial commits
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext,
getWriteConfigBuilder(false, true, false).withRollbackUsingMarkers(true).build())) {
getWriteConfigBuilder(false, true, false, populateMetaFields).withRollbackUsingMarkers(true).build())) {
// Write updates and inserts
String newCommitTime = HoodieActiveTimeline.createNewInstantTime();
client.startCommitWithTime(newCommitTime);
@@ -435,12 +451,12 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
* Once explicit sync is called, metadata should match.
*/
@ParameterizedTest
@EnumSource(HoodieTableType.class)
public void testRollbackUnsyncedCommit(HoodieTableType tableType) throws Exception {
init(tableType);
@MethodSource("tableTypePopulateMetaFieldsParams")
public void testRollbackUnsyncedCommit(HoodieTableType tableType, boolean populateMetaFields) throws Exception {
init(tableType, populateMetaFields);
HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) {
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true, populateMetaFields))) {
// Initialize table with metadata
String newCommitTime = HoodieActiveTimeline.createNewInstantTime();
List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 20);
@@ -450,7 +466,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
validateMetadata(client);
}
String newCommitTime = HoodieActiveTimeline.createNewInstantTime();
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false))) {
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false, populateMetaFields))) {
// Commit with metadata disabled
client.startCommitWithTime(newCommitTime);
List<HoodieRecord> records = dataGen.generateUpdates(newCommitTime, 10);
@@ -459,7 +475,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
client.rollback(newCommitTime);
}
try (SparkRDDWriteClient client = new SparkRDDWriteClient<>(engineContext, getWriteConfig(true, true))) {
try (SparkRDDWriteClient client = new SparkRDDWriteClient<>(engineContext, getWriteConfig(true, true, populateMetaFields))) {
assertFalse(metadata(client).isInSync());
client.syncTableMetadata();
validateMetadata(client);
@@ -470,10 +486,10 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
* Test sync of table operations.
*/
@ParameterizedTest
@EnumSource(HoodieTableType.class)
@MethodSource("tableTypePopulateMetaFieldsParams")
@Disabled
public void testSync(HoodieTableType tableType) throws Exception {
init(tableType);
public void testSync(HoodieTableType tableType, boolean populateMetaFields) throws Exception {
init(tableType, populateMetaFields);
HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
String newCommitTime;
@@ -481,7 +497,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
List<WriteStatus> writeStatuses;
// Initial commits without metadata table enabled
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false))) {
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false, populateMetaFields))) {
newCommitTime = HoodieActiveTimeline.createNewInstantTime();
records = dataGen.generateInserts(newCommitTime, 5);
client.startCommitWithTime(newCommitTime);
@@ -496,7 +512,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
}
// Enable metadata table so it initialized by listing from file system
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) {
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true, populateMetaFields))) {
// inserts
newCommitTime = HoodieActiveTimeline.createNewInstantTime();
client.startCommitWithTime(newCommitTime);
@@ -512,7 +528,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
String restoreToInstant;
String inflightActionTimestamp;
String beforeInflightActionTimestamp;
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false))) {
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false, populateMetaFields))) {
// updates
newCommitTime = HoodieActiveTimeline.createNewInstantTime();
client.startCommitWithTime(newCommitTime);
@@ -584,7 +600,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
Path inflightCleanPath = new Path(metaClient.getMetaPath(), HoodieTimeline.makeInflightCleanerFileName(inflightActionTimestamp));
fs.create(inflightCleanPath).close();
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) {
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true, populateMetaFields))) {
// Restore cannot be done until the metadata table is in sync. See HUDI-1502 for details
client.syncTableMetadata();
@@ -613,7 +629,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
}
// Enable metadata table and ensure it is synced
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) {
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true, populateMetaFields))) {
client.restoreToInstant(restoreToInstant);
assertFalse(metadata(client).isInSync());
@@ -629,13 +645,14 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
/**
* Instants on Metadata Table should be archived as per config. Metadata Table should be automatically compacted as per config.
*/
@Test
public void testCleaningArchivingAndCompaction() throws Exception {
init(HoodieTableType.COPY_ON_WRITE);
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testCleaningArchivingAndCompaction(boolean populateMetaFields) throws Exception {
init(HoodieTableType.COPY_ON_WRITE, populateMetaFields);
HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
final int maxDeltaCommitsBeforeCompaction = 4;
HoodieWriteConfig config = getWriteConfigBuilder(true, true, false)
HoodieWriteConfig config = getWriteConfigBuilder(true, true, false, populateMetaFields)
.withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true)
.archiveCommitsWith(6, 8).retainCommits(1)
.withMaxNumDeltaCommitsBeforeCompaction(maxDeltaCommitsBeforeCompaction).build())
@@ -676,14 +693,15 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
/**
* Test various error scenarios.
*/
@Test
public void testErrorCases() throws Exception {
init(HoodieTableType.COPY_ON_WRITE);
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testErrorCases(boolean populateMetaFields) throws Exception {
init(HoodieTableType.COPY_ON_WRITE, populateMetaFields);
HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
// TESTCASE: If commit on the metadata table succeeds but fails on the dataset, then on next init the metadata table
// should be rolled back to last valid commit.
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true), true)) {
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true, populateMetaFields), true)) {
String newCommitTime = HoodieActiveTimeline.createNewInstantTime();
client.startCommitWithTime(newCommitTime);
List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 10);
@@ -704,7 +722,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
commitInstantFileName), false));
}
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true), true)) {
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true, populateMetaFields), true)) {
String newCommitTime = client.startCommit();
// Next insert
List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 5);
@@ -721,11 +739,11 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
*/
//@Test
public void testNonPartitioned() throws Exception {
init(HoodieTableType.COPY_ON_WRITE);
init(HoodieTableType.COPY_ON_WRITE, true);
HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
HoodieTestDataGenerator nonPartitionedGenerator = new HoodieTestDataGenerator(new String[] {""});
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) {
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true, true))) {
// Write 1 (Bulk insert)
String newCommitTime = "001";
List<HoodieRecord> records = nonPartitionedGenerator.generateInserts(newCommitTime, 10);
@@ -741,12 +759,13 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
/**
* Test various metrics published by metadata table.
*/
@Test
public void testMetadataMetrics() throws Exception {
init(HoodieTableType.COPY_ON_WRITE);
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testMetadataMetrics(boolean populateMetaFields) throws Exception {
init(HoodieTableType.COPY_ON_WRITE, populateMetaFields);
HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfigBuilder(true, true, true).build())) {
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfigBuilder(true, true, true, populateMetaFields).build())) {
// Write
String newCommitTime = HoodieActiveTimeline.createNewInstantTime();
List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 20);
@@ -769,15 +788,16 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
/**
* Test when reading from metadata table which is out of sync with dataset that results are still consistent.
*/
@Test
public void testMetadataOutOfSync() throws Exception {
init(HoodieTableType.COPY_ON_WRITE);
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testMetadataOutOfSync(boolean populateMetaFields) throws Exception {
init(HoodieTableType.COPY_ON_WRITE, populateMetaFields);
HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
SparkRDDWriteClient unsyncedClient = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true));
SparkRDDWriteClient unsyncedClient = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true, populateMetaFields));
// Enable metadata so table is initialized
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) {
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true, populateMetaFields))) {
// Perform Bulk Insert
String newCommitTime = "001";
client.startCommitWithTime(newCommitTime);
@@ -786,7 +806,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
}
// Perform commit operations with metadata disabled
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false))) {
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false, populateMetaFields))) {
// Perform Insert
String newCommitTime = "002";
client.startCommitWithTime(newCommitTime);
@@ -811,7 +831,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
validateMetadata(unsyncedClient);
// Perform clean operation with metadata disabled
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false))) {
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false, populateMetaFields))) {
// One more commit needed to trigger clean so upsert and compact
String newCommitTime = "005";
client.startCommitWithTime(newCommitTime);
@@ -833,7 +853,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
validateMetadata(unsyncedClient);
// Perform restore with metadata disabled
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false))) {
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false, populateMetaFields))) {
client.restoreToInstant("004");
}
@@ -1008,18 +1028,20 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
}
}
private HoodieWriteConfig getWriteConfig(boolean autoCommit, boolean useFileListingMetadata) {
return getWriteConfigBuilder(autoCommit, useFileListingMetadata, false).build();
private HoodieWriteConfig getWriteConfig(boolean autoCommit, boolean useFileListingMetadata, boolean populateMetaFields) {
return getWriteConfigBuilder(autoCommit, useFileListingMetadata, false, populateMetaFields).build();
}
private HoodieWriteConfig.Builder getWriteConfigBuilder(boolean autoCommit, boolean useFileListingMetadata, boolean enableMetrics) {
return getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy.EAGER, autoCommit, useFileListingMetadata, enableMetrics);
private HoodieWriteConfig.Builder getWriteConfigBuilder(boolean autoCommit, boolean useFileListingMetadata, boolean enableMetrics, boolean populateMetaFields) {
return getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy.EAGER, autoCommit, useFileListingMetadata, enableMetrics, populateMetaFields);
}
private HoodieWriteConfig.Builder getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy policy, boolean autoCommit, boolean useFileListingMetadata, boolean enableMetrics) {
private HoodieWriteConfig.Builder getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy policy, boolean autoCommit, boolean useFileListingMetadata,
boolean enableMetrics, boolean populateMetaFields) {
return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(TRIP_EXAMPLE_SCHEMA)
.withParallelism(2, 2).withDeleteParallelism(2).withRollbackParallelism(2).withFinalizeWriteParallelism(2)
.withAutoCommit(autoCommit)
.withProperties(populateMetaFields ? new Properties() : getPropertiesForKeyGen())
.withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024 * 1024)
.withInlineCompaction(false).withMaxNumDeltaCommitsBeforeCompaction(1)
.withFailedWritesCleaningPolicy(policy)
@@ -1028,7 +1050,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
.withEmbeddedTimelineServerEnabled(true).forTable("test-trip-table")
.withFileSystemViewConfig(new FileSystemViewStorageConfig.Builder()
.withEnableBackupForRemoteFileSystemView(false).build())
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(populateMetaFields ? HoodieIndex.IndexType.BLOOM : HoodieIndex.IndexType.SIMPLE).build())
.withMetadataConfig(HoodieMetadataConfig.newBuilder()
.enable(useFileListingMetadata)
.enableMetrics(enableMetrics).build())

View File

@@ -301,6 +301,13 @@ public class HoodieClientTestBase extends HoodieClientTestHarness {
}
}
public JavaRDD<WriteStatus> insertFirstBatch(HoodieWriteConfig writeConfig, SparkRDDWriteClient client, String newCommitTime,
String initCommitTime, int numRecordsInThisCommit,
Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn, boolean isPreppedAPI,
boolean assertForCommit, int expRecordsInThisCommit) throws Exception {
return insertFirstBatch(writeConfig, client, newCommitTime, initCommitTime, numRecordsInThisCommit, writeFn, isPreppedAPI, assertForCommit, expRecordsInThisCommit, true);
}
/**
* Helper to insert first batch of records and do regular assertions on the state after successful completion.
*
@@ -319,12 +326,12 @@ public class HoodieClientTestBase extends HoodieClientTestHarness {
public JavaRDD<WriteStatus> insertFirstBatch(HoodieWriteConfig writeConfig, SparkRDDWriteClient client, String newCommitTime,
String initCommitTime, int numRecordsInThisCommit,
Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn, boolean isPreppedAPI,
boolean assertForCommit, int expRecordsInThisCommit) throws Exception {
boolean assertForCommit, int expRecordsInThisCommit, boolean filterForCommitTimeWithAssert) throws Exception {
final Function2<List<HoodieRecord>, String, Integer> recordGenFunction =
generateWrapRecordsFn(isPreppedAPI, writeConfig, dataGen::generateInserts);
return writeBatch(client, newCommitTime, initCommitTime, Option.empty(), initCommitTime, numRecordsInThisCommit,
recordGenFunction, writeFn, assertForCommit, expRecordsInThisCommit, expRecordsInThisCommit, 1, false);
recordGenFunction, writeFn, assertForCommit, expRecordsInThisCommit, expRecordsInThisCommit, 1, false, filterForCommitTimeWithAssert);
}
/**
@@ -355,6 +362,15 @@ public class HoodieClientTestBase extends HoodieClientTestHarness {
recordGenFunction, writeFn, assertForCommit, expRecordsInThisCommit, expTotalRecords, expTotalCommits, false);
}
public JavaRDD<WriteStatus> updateBatch(HoodieWriteConfig writeConfig, SparkRDDWriteClient client, String newCommitTime,
String prevCommitTime, Option<List<String>> commitTimesBetweenPrevAndNew, String initCommitTime,
int numRecordsInThisCommit,
Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn, boolean isPreppedAPI,
boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, int expTotalCommits) throws Exception {
return updateBatch(writeConfig, client, newCommitTime, prevCommitTime, commitTimesBetweenPrevAndNew, initCommitTime, numRecordsInThisCommit, writeFn,
isPreppedAPI, assertForCommit, expRecordsInThisCommit, expTotalRecords, expTotalCommits, true);
}
/**
* Helper to upsert batch of records and do regular assertions on the state after successful completion.
*
@@ -378,13 +394,23 @@ public class HoodieClientTestBase extends HoodieClientTestHarness {
String prevCommitTime, Option<List<String>> commitTimesBetweenPrevAndNew, String initCommitTime,
int numRecordsInThisCommit,
Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn, boolean isPreppedAPI,
boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, int expTotalCommits) throws Exception {
boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, int expTotalCommits,
boolean filterForCommitTimeWithAssert) throws Exception {
final Function2<List<HoodieRecord>, String, Integer> recordGenFunction =
generateWrapRecordsFn(isPreppedAPI, writeConfig, dataGen::generateUniqueUpdates);
return writeBatch(client, newCommitTime, prevCommitTime, commitTimesBetweenPrevAndNew, initCommitTime,
numRecordsInThisCommit, recordGenFunction, writeFn, assertForCommit, expRecordsInThisCommit, expTotalRecords,
expTotalCommits, false);
expTotalCommits, false, filterForCommitTimeWithAssert);
}
public JavaRDD<WriteStatus> deleteBatch(HoodieWriteConfig writeConfig, SparkRDDWriteClient client, String newCommitTime,
String prevCommitTime, String initCommitTime,
int numRecordsInThisCommit,
Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieKey>, String> deleteFn, boolean isPreppedAPI,
boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords) throws Exception {
return deleteBatch(writeConfig, client, newCommitTime, prevCommitTime, initCommitTime, numRecordsInThisCommit, deleteFn, isPreppedAPI,
assertForCommit, expRecordsInThisCommit, expTotalRecords, true);
}
/**
@@ -408,13 +434,22 @@ public class HoodieClientTestBase extends HoodieClientTestHarness {
String prevCommitTime, String initCommitTime,
int numRecordsInThisCommit,
Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieKey>, String> deleteFn, boolean isPreppedAPI,
boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords) throws Exception {
boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, boolean filterForCommitTimeWithAssert) throws Exception {
final Function<Integer, List<HoodieKey>> keyGenFunction =
generateWrapDeleteKeysFn(isPreppedAPI, writeConfig, dataGen::generateUniqueDeletes);
return deleteBatch(client, newCommitTime, prevCommitTime, initCommitTime, numRecordsInThisCommit,
keyGenFunction,
deleteFn, assertForCommit, expRecordsInThisCommit, expTotalRecords);
deleteFn, assertForCommit, expRecordsInThisCommit, expTotalRecords, filterForCommitTimeWithAssert);
}
public JavaRDD<WriteStatus> writeBatch(SparkRDDWriteClient client, String newCommitTime, String prevCommitTime,
Option<List<String>> commitTimesBetweenPrevAndNew, String initCommitTime, int numRecordsInThisCommit,
Function2<List<HoodieRecord>, String, Integer> recordGenFunction,
Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn,
boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, int expTotalCommits, boolean doCommit) throws Exception {
return writeBatch(client, newCommitTime, prevCommitTime, commitTimesBetweenPrevAndNew, initCommitTime, numRecordsInThisCommit, recordGenFunction,
writeFn, assertForCommit, expRecordsInThisCommit, expTotalRecords, expTotalCommits, doCommit, true);
}
/**
@@ -439,7 +474,8 @@ public class HoodieClientTestBase extends HoodieClientTestHarness {
Option<List<String>> commitTimesBetweenPrevAndNew, String initCommitTime, int numRecordsInThisCommit,
Function2<List<HoodieRecord>, String, Integer> recordGenFunction,
Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn,
boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, int expTotalCommits, boolean doCommit) throws Exception {
boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, int expTotalCommits, boolean doCommit,
boolean filterForCommitTimeWithAssert) throws Exception {
// Write 1 (only inserts)
client.startCommitWithTime(newCommitTime);
@@ -466,8 +502,10 @@ public class HoodieClientTestBase extends HoodieClientTestHarness {
"Expecting " + expTotalCommits + " commits.");
assertEquals(newCommitTime, timeline.lastInstant().get().getTimestamp(),
"Latest commit should be " + newCommitTime);
assertEquals(expRecordsInThisCommit, HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
"Must contain " + expRecordsInThisCommit + " records");
if (filterForCommitTimeWithAssert) { // when meta cols are disabled, we can't really do per commit assertion.
assertEquals(expRecordsInThisCommit, HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
"Must contain " + expRecordsInThisCommit + " records");
}
// Check the entire dataset has all records still
String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length];
@@ -477,16 +515,18 @@ public class HoodieClientTestBase extends HoodieClientTestHarness {
assertEquals(expTotalRecords, HoodieClientTestUtils.read(jsc, basePath, sqlContext, fs, fullPartitionPaths).count(),
"Must contain " + expTotalRecords + " records");
// Check that the incremental consumption from prevCommitTime
assertEquals(HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
HoodieClientTestUtils.countRecordsSince(jsc, basePath, sqlContext, timeline, prevCommitTime),
"Incremental consumption from " + prevCommitTime + " should give all records in latest commit");
if (commitTimesBetweenPrevAndNew.isPresent()) {
commitTimesBetweenPrevAndNew.get().forEach(ct -> {
assertEquals(HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
HoodieClientTestUtils.countRecordsSince(jsc, basePath, sqlContext, timeline, ct),
"Incremental consumption from " + ct + " should give all records in latest commit");
});
if (filterForCommitTimeWithAssert) {
// Check that the incremental consumption from prevCommitTime
assertEquals(HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
HoodieClientTestUtils.countRecordsSince(jsc, basePath, sqlContext, timeline, prevCommitTime),
"Incremental consumption from " + prevCommitTime + " should give all records in latest commit");
if (commitTimesBetweenPrevAndNew.isPresent()) {
commitTimesBetweenPrevAndNew.get().forEach(ct -> {
assertEquals(HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
HoodieClientTestUtils.countRecordsSince(jsc, basePath, sqlContext, timeline, ct),
"Incremental consumption from " + ct + " should give all records in latest commit");
});
}
}
}
return result;
@@ -510,7 +550,7 @@ public class HoodieClientTestBase extends HoodieClientTestHarness {
String initCommitTime, int numRecordsInThisCommit,
Function<Integer, List<HoodieKey>> keyGenFunction,
Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieKey>, String> deleteFn,
boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords) throws Exception {
boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, boolean filerForCommitTimeWithAssert) throws Exception {
// Delete 1 (only deletes)
client.startCommitWithTime(newCommitTime);
@@ -534,8 +574,10 @@ public class HoodieClientTestBase extends HoodieClientTestHarness {
"Expecting 3 commits.");
assertEquals(newCommitTime, timeline.lastInstant().get().getTimestamp(),
"Latest commit should be " + newCommitTime);
assertEquals(expRecordsInThisCommit, HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
"Must contain " + expRecordsInThisCommit + " records");
if (filerForCommitTimeWithAssert) { // if meta cols are disabled, we can't do assertion based on assertion time
assertEquals(expRecordsInThisCommit, HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
"Must contain " + expRecordsInThisCommit + " records");
}
// Check the entire dataset has all records still
String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length];
@@ -545,11 +587,13 @@ public class HoodieClientTestBase extends HoodieClientTestHarness {
assertEquals(expTotalRecords, HoodieClientTestUtils.read(jsc, basePath, sqlContext, fs, fullPartitionPaths).count(),
"Must contain " + expTotalRecords + " records");
// Check that the incremental consumption from prevCommitTime
assertEquals(HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
HoodieClientTestUtils.countRecordsSince(jsc, basePath, sqlContext, timeline, prevCommitTime),
"Incremental consumption from " + prevCommitTime + " should give no records in latest commit,"
+ " since it is a delete operation");
if (filerForCommitTimeWithAssert) {
// Check that the incremental consumption from prevCommitTime
assertEquals(HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
HoodieClientTestUtils.countRecordsSince(jsc, basePath, sqlContext, timeline, prevCommitTime),
"Incremental consumption from " + prevCommitTime + " should give no records in latest commit,"
+ " since it is a delete operation");
}
}
return result;
}

View File

@@ -32,6 +32,7 @@ import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordLocation;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.table.HoodieTableConfig;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
@@ -40,7 +41,9 @@ import org.apache.hudi.common.testutils.HoodieTestUtils;
import org.apache.hudi.common.testutils.minicluster.HdfsTestService;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieIndexConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.table.WorkloadStat;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
@@ -56,6 +59,7 @@ import java.io.IOException;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
@@ -225,6 +229,21 @@ public abstract class HoodieClientTestHarness extends HoodieCommonTestHarness im
metaClient = HoodieTestUtils.init(hadoopConf, basePath, tableType);
}
protected Properties getPropertiesForKeyGen() {
Properties properties = new Properties();
properties.put(HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.key(), "false");
properties.put("hoodie.datasource.write.recordkey.field","_row_key");
properties.put("hoodie.datasource.write.partitionpath.field","partition_path");
return properties;
}
protected void addAppropriatePropsForPopulateMetaFields(HoodieWriteConfig.Builder configBuilder, boolean populateMetaFields) {
if (!populateMetaFields) {
configBuilder.withProperties(getPropertiesForKeyGen())
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.SIMPLE).build());
}
}
/**
* Cleanups hoodie clients.
*/

View File

@@ -25,6 +25,9 @@
}, {
"name" : "_row_key",
"type" : "string"
}, {
"name" : "partition_path",
"type" : "string"
}, {
"name" : "rider",
"type" : "string"