[HUDI-2176, 2178, 2179] Adding virtual key support to COW table (#3306)
commit 61148c1c43 (parent 5353243449), committed via GitHub
@@ -45,13 +45,14 @@ object SparkHelpers {
     val schema: Schema = sourceRecords.get(0).getSchema
     val filter: BloomFilter = BloomFilterFactory.createBloomFilter(HoodieIndexConfig.BLOOM_FILTER_NUM_ENTRIES.defaultValue.toInt, HoodieIndexConfig.BLOOM_FILTER_FPP.defaultValue.toDouble,
       HoodieIndexConfig.HOODIE_BLOOM_INDEX_FILTER_DYNAMIC_MAX_ENTRIES.defaultValue.toInt, HoodieIndexConfig.BLOOM_INDEX_FILTER_TYPE.defaultValue);
-    val writeSupport: HoodieAvroWriteSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter(fs.getConf).convert(schema), schema, filter)
+    val writeSupport: HoodieAvroWriteSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter(fs.getConf).convert(schema), schema, org.apache.hudi.common.util.Option.of(filter))
     val parquetConfig: HoodieAvroParquetConfig = new HoodieAvroParquetConfig(writeSupport, CompressionCodecName.GZIP, HoodieStorageConfig.PARQUET_BLOCK_SIZE_BYTES.defaultValue.toInt, HoodieStorageConfig.PARQUET_PAGE_SIZE_BYTES.defaultValue.toInt, HoodieStorageConfig.PARQUET_FILE_MAX_BYTES.defaultValue.toInt, fs.getConf, HoodieStorageConfig.PARQUET_COMPRESSION_RATIO.defaultValue.toDouble)

     // Add current classLoad for config, if not will throw classNotFound of 'HoodieWrapperFileSystem'.
     parquetConfig.getHadoopConf().setClassLoader(Thread.currentThread.getContextClassLoader)

-    val writer = new HoodieParquetWriter[HoodieJsonPayload, IndexedRecord](instantTime, destinationFile, parquetConfig, schema, new SparkTaskContextSupplier())
+    val writer = new HoodieParquetWriter[HoodieJsonPayload, IndexedRecord](instantTime, destinationFile, parquetConfig, schema, new SparkTaskContextSupplier(),
+      true)
     for (rec <- sourceRecords) {
       val key: String = rec.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString
       if (!keysToSkip.contains(key)) {
@@ -1594,6 +1594,11 @@ public class HoodieWriteConfig extends HoodieConfig {
       return this;
     }

+    public Builder withPopulateMetaFields(boolean populateMetaFields) {
+      writeConfig.setValue(HoodieTableConfig.HOODIE_POPULATE_META_FIELDS, Boolean.toString(populateMetaFields));
+      return this;
+    }
+
     public Builder withProperties(Properties properties) {
       this.writeConfig.getProps().putAll(properties);
       return this;
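Illustrative note (not part of the commit): a minimal sketch of how the new withPopulateMetaFields builder option could be wired into a write config for a virtual-key table. Only withPopulateMetaFields(boolean) comes from this diff; the surrounding builder calls, class name, and values are assumptions.

import org.apache.hudi.config.HoodieWriteConfig;

class VirtualKeyConfigSketch {
  // Assumed inputs: a table base path and an Avro schema string.
  static HoodieWriteConfig build(String basePath, String avroSchema) {
    return HoodieWriteConfig.newBuilder()
        .withPath(basePath)              // assumed: pre-existing builder method
        .withSchema(avroSchema)          // assumed: pre-existing builder method
        .withPopulateMetaFields(false)   // new in this commit: disable _hoodie_* meta columns
        .build();
  }
}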
@@ -207,9 +207,11 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload, I, K, O> extends
         avroRecord = Option.of(rewriteRecord((GenericRecord) avroRecord.get()));
         String seqId =
             HoodieRecord.generateSequenceId(instantTime, getPartitionId(), RECORD_COUNTER.getAndIncrement());
+        if (config.populateMetaFields()) {
           HoodieAvroUtils.addHoodieKeyToRecord((GenericRecord) avroRecord.get(), hoodieRecord.getRecordKey(),
               hoodieRecord.getPartitionPath(), fileId);
           HoodieAvroUtils.addCommitMetadataToRecord((GenericRecord) avroRecord.get(), instantTime, seqId);
+        }
         if (isUpdateRecord(hoodieRecord)) {
           updatedRecordsWritten++;
         } else {
@@ -23,12 +23,16 @@ import org.apache.hudi.common.model.HoodieKey;
 import org.apache.hudi.common.model.HoodieRecordLocation;
 import org.apache.hudi.common.model.HoodieRecordPayload;
 import org.apache.hudi.common.util.BaseFileUtils;
+import org.apache.hudi.common.util.Option;
 import org.apache.hudi.common.util.collection.Pair;
 import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.hudi.keygen.BaseKeyGenerator;
 import org.apache.hudi.table.HoodieTable;

 import org.apache.hadoop.fs.Path;

+import java.util.ArrayList;
+import java.util.List;
 import java.util.stream.Stream;

 /**
@@ -39,17 +43,25 @@ import java.util.stream.Stream;
 public class HoodieKeyLocationFetchHandle<T extends HoodieRecordPayload, I, K, O> extends HoodieReadHandle<T, I, K, O> {

   private final Pair<String, HoodieBaseFile> partitionPathBaseFilePair;
+  private final Option<BaseKeyGenerator> keyGeneratorOpt;

   public HoodieKeyLocationFetchHandle(HoodieWriteConfig config, HoodieTable<T, I, K, O> hoodieTable,
-                                      Pair<String, HoodieBaseFile> partitionPathBaseFilePair) {
+                                      Pair<String, HoodieBaseFile> partitionPathBaseFilePair, Option<BaseKeyGenerator> keyGeneratorOpt) {
     super(config, null, hoodieTable, Pair.of(partitionPathBaseFilePair.getLeft(), partitionPathBaseFilePair.getRight().getFileId()));
     this.partitionPathBaseFilePair = partitionPathBaseFilePair;
+    this.keyGeneratorOpt = keyGeneratorOpt;
   }

   public Stream<Pair<HoodieKey, HoodieRecordLocation>> locations() {
     HoodieBaseFile baseFile = partitionPathBaseFilePair.getRight();
-    return BaseFileUtils.getInstance(baseFile.getPath()).fetchRecordKeyPartitionPath(
-        hoodieTable.getHadoopConf(), new Path(baseFile.getPath())).stream()
+    BaseFileUtils baseFileUtils = BaseFileUtils.getInstance(baseFile.getPath());
+    List<HoodieKey> hoodieKeyList = new ArrayList<>();
+    if (keyGeneratorOpt.isPresent()) {
+      hoodieKeyList = baseFileUtils.fetchRecordKeyPartitionPath(hoodieTable.getHadoopConf(), new Path(baseFile.getPath()), keyGeneratorOpt);
+    } else {
+      hoodieKeyList = baseFileUtils.fetchRecordKeyPartitionPath(hoodieTable.getHadoopConf(), new Path(baseFile.getPath()));
+    }
+    return hoodieKeyList.stream()
         .map(entry -> Pair.of(entry,
             new HoodieRecordLocation(baseFile.getCommitTime(), baseFile.getFileId())));
   }
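Illustrative note (not part of the commit): locations() now has two paths. With an empty key generator option, keys are read back from the _hoodie_* meta columns of the base file; with a key generator present (virtual keys), they are re-computed from the data columns. A hedged sketch of the two call shapes; the wrapper class, method name, and raw generics below are assumptions.

import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecordLocation;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.io.HoodieKeyLocationFetchHandle;
import org.apache.hudi.keygen.BaseKeyGenerator;
import org.apache.hudi.table.HoodieTable;

import java.util.stream.Stream;

class FetchHandleSketch {
  // keyGeneratorOpt = Option.empty()    -> keys come from the meta columns
  // keyGeneratorOpt = Option.of(keyGen) -> keys re-derived by the key generator
  @SuppressWarnings({"rawtypes", "unchecked"})
  static Stream<Pair<HoodieKey, HoodieRecordLocation>> fetch(HoodieWriteConfig config, HoodieTable table,
      Pair<String, HoodieBaseFile> partitionPathBaseFile, Option<BaseKeyGenerator> keyGeneratorOpt) {
    return new HoodieKeyLocationFetchHandle(config, table, partitionPathBaseFile, keyGeneratorOpt).locations();
  }
}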
@@ -32,6 +32,7 @@ import org.apache.hudi.common.model.IOType;
 import org.apache.hudi.common.util.DefaultSizeEstimator;
 import org.apache.hudi.common.util.HoodieRecordSizeEstimator;
 import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.ValidationUtils;
 import org.apache.hudi.common.util.collection.ExternalSpillableMap;
 import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.hudi.exception.HoodieCorruptedDataException;
@@ -40,6 +41,8 @@ import org.apache.hudi.exception.HoodieUpsertException;
 import org.apache.hudi.io.storage.HoodieFileReader;
 import org.apache.hudi.io.storage.HoodieFileReaderFactory;
 import org.apache.hudi.io.storage.HoodieFileWriter;
+import org.apache.hudi.keygen.BaseKeyGenerator;
+import org.apache.hudi.keygen.KeyGenUtils;
 import org.apache.hudi.table.HoodieTable;

 import org.apache.avro.Schema;
@@ -101,21 +104,23 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload, I, K, O> extends H
   protected long updatedRecordsWritten = 0;
   protected long insertRecordsWritten = 0;
   protected boolean useWriterSchema;
+  protected Option<BaseKeyGenerator> keyGeneratorOpt;
   private HoodieBaseFile baseFileToMerge;

   public HoodieMergeHandle(HoodieWriteConfig config, String instantTime, HoodieTable<T, I, K, O> hoodieTable,
                            Iterator<HoodieRecord<T>> recordItr, String partitionPath, String fileId,
-                           TaskContextSupplier taskContextSupplier) {
+                           TaskContextSupplier taskContextSupplier, Option<BaseKeyGenerator> keyGeneratorOpt) {
     this(config, instantTime, hoodieTable, recordItr, partitionPath, fileId, taskContextSupplier,
-        hoodieTable.getBaseFileOnlyView().getLatestBaseFile(partitionPath, fileId).get());
+        hoodieTable.getBaseFileOnlyView().getLatestBaseFile(partitionPath, fileId).get(), keyGeneratorOpt);
   }

   public HoodieMergeHandle(HoodieWriteConfig config, String instantTime, HoodieTable<T, I, K, O> hoodieTable,
                            Iterator<HoodieRecord<T>> recordItr, String partitionPath, String fileId,
-                           TaskContextSupplier taskContextSupplier, HoodieBaseFile baseFile) {
+                           TaskContextSupplier taskContextSupplier, HoodieBaseFile baseFile, Option<BaseKeyGenerator> keyGeneratorOpt) {
     super(config, instantTime, partitionPath, fileId, hoodieTable, taskContextSupplier);
     init(fileId, recordItr);
     init(fileId, partitionPath, baseFile);
+    validateAndSetAndKeyGenProps(keyGeneratorOpt, config.populateMetaFields());
   }

   /**
@@ -123,11 +128,17 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload, I, K, O> extends H
    */
   public HoodieMergeHandle(HoodieWriteConfig config, String instantTime, HoodieTable<T, I, K, O> hoodieTable,
                            Map<String, HoodieRecord<T>> keyToNewRecords, String partitionPath, String fileId,
-                           HoodieBaseFile dataFileToBeMerged, TaskContextSupplier taskContextSupplier) {
+                           HoodieBaseFile dataFileToBeMerged, TaskContextSupplier taskContextSupplier, Option<BaseKeyGenerator> keyGeneratorOpt) {
     super(config, instantTime, partitionPath, fileId, hoodieTable, taskContextSupplier);
     this.keyToNewRecords = keyToNewRecords;
     this.useWriterSchema = true;
     init(fileId, this.partitionPath, dataFileToBeMerged);
+    validateAndSetAndKeyGenProps(keyGeneratorOpt, config.populateMetaFields());
+  }
+
+  private void validateAndSetAndKeyGenProps(Option<BaseKeyGenerator> keyGeneratorOpt, boolean populateMetaFields) {
+    ValidationUtils.checkArgument(populateMetaFields == !keyGeneratorOpt.isPresent());
+    this.keyGeneratorOpt = keyGeneratorOpt;
   }

   @Override
@@ -278,7 +289,7 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload, I, K, O> extends H
    * Go through an old record. Here if we detect a newer version shows up, we write the new one to the file.
    */
   public void write(GenericRecord oldRecord) {
-    String key = oldRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
+    String key = KeyGenUtils.getRecordKeyFromGenericRecord(oldRecord, keyGeneratorOpt);
     boolean copyOldRecord = true;
     if (keyToNewRecords.containsKey(key)) {
       // If we have duplicate records that we are updating, then the hoodie record will be deflated after
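Illustrative note (not part of the commit): validateAndSetAndKeyGenProps encodes a single invariant — a key generator must be supplied exactly when meta fields are not populated. Stated on its own below; the wrapper class and method name are hypothetical.

import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.keygen.BaseKeyGenerator;

class KeyGenInvariantSketch {
  // Allowed:  populateMetaFields = true  with Option.empty()
  //           populateMetaFields = false with Option.of(keyGenerator)
  // Any other combination fails with IllegalArgumentException.
  static void check(boolean populateMetaFields, Option<BaseKeyGenerator> keyGeneratorOpt) {
    ValidationUtils.checkArgument(populateMetaFields == !keyGeneratorOpt.isPresent());
  }
}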
@@ -23,8 +23,10 @@ import org.apache.hudi.common.engine.TaskContextSupplier;
 import org.apache.hudi.common.model.HoodieBaseFile;
 import org.apache.hudi.common.model.HoodieRecord;
 import org.apache.hudi.common.model.HoodieRecordPayload;
+import org.apache.hudi.common.util.Option;
 import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.hudi.exception.HoodieUpsertException;
+import org.apache.hudi.keygen.BaseKeyGenerator;
 import org.apache.hudi.table.HoodieTable;

 import org.apache.avro.generic.GenericRecord;
@@ -47,8 +49,9 @@ public class HoodieSortedMergeHandle<T extends HoodieRecordPayload, I, K, O> ext
   private Queue<String> newRecordKeysSorted = new PriorityQueue<>();

   public HoodieSortedMergeHandle(HoodieWriteConfig config, String instantTime, HoodieTable<T, I, K, O> hoodieTable,
-                                 Iterator<HoodieRecord<T>> recordItr, String partitionPath, String fileId, TaskContextSupplier taskContextSupplier) {
-    super(config, instantTime, hoodieTable, recordItr, partitionPath, fileId, taskContextSupplier);
+                                 Iterator<HoodieRecord<T>> recordItr, String partitionPath, String fileId, TaskContextSupplier taskContextSupplier,
+                                 Option<BaseKeyGenerator> keyGeneratorOpt) {
+    super(config, instantTime, hoodieTable, recordItr, partitionPath, fileId, taskContextSupplier, keyGeneratorOpt);
     newRecordKeysSorted.addAll(keyToNewRecords.keySet());
   }

@@ -57,9 +60,9 @@ public class HoodieSortedMergeHandle<T extends HoodieRecordPayload, I, K, O> ext
    */
   public HoodieSortedMergeHandle(HoodieWriteConfig config, String instantTime, HoodieTable<T, I, K, O> hoodieTable,
                                  Map<String, HoodieRecord<T>> keyToNewRecordsOrig, String partitionPath, String fileId,
-                                 HoodieBaseFile dataFileToBeMerged, TaskContextSupplier taskContextSupplier) {
+                                 HoodieBaseFile dataFileToBeMerged, TaskContextSupplier taskContextSupplier, Option<BaseKeyGenerator> keyGeneratorOpt) {
     super(config, instantTime, hoodieTable, keyToNewRecordsOrig, partitionPath, fileId, dataFileToBeMerged,
-        taskContextSupplier);
+        taskContextSupplier, keyGeneratorOpt);

     newRecordKeysSorted.addAll(keyToNewRecords.keySet());
   }
@@ -20,11 +20,13 @@ package org.apache.hudi.io.storage;

 import org.apache.hudi.common.engine.TaskContextSupplier;
 import org.apache.hudi.common.model.HoodieBaseFile;
-import org.apache.hudi.common.model.HoodieRecord;
 import org.apache.hudi.common.model.HoodieRecordPayload;
+import org.apache.hudi.common.util.Option;
 import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.hudi.exception.HoodieUpsertException;
 import org.apache.hudi.io.HoodieMergeHandle;
+import org.apache.hudi.keygen.BaseKeyGenerator;
+import org.apache.hudi.keygen.KeyGenUtils;
 import org.apache.hudi.table.HoodieTable;

 import org.apache.avro.generic.GenericRecord;
@@ -66,13 +68,14 @@ public class HoodieConcatHandle<T extends HoodieRecordPayload, I, K, O> extends
   private static final Logger LOG = LogManager.getLogger(HoodieConcatHandle.class);

   public HoodieConcatHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, Iterator recordItr,
-      String partitionPath, String fileId, TaskContextSupplier taskContextSupplier) {
-    super(config, instantTime, hoodieTable, recordItr, partitionPath, fileId, taskContextSupplier);
+      String partitionPath, String fileId, TaskContextSupplier taskContextSupplier, Option<BaseKeyGenerator> keyGeneratorOpt) {
+    super(config, instantTime, hoodieTable, recordItr, partitionPath, fileId, taskContextSupplier, keyGeneratorOpt);
   }

   public HoodieConcatHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, Map keyToNewRecords, String partitionPath, String fileId,
       HoodieBaseFile dataFileToBeMerged, TaskContextSupplier taskContextSupplier) {
-    super(config, instantTime, hoodieTable, keyToNewRecords, partitionPath, fileId, dataFileToBeMerged, taskContextSupplier);
+    super(config, instantTime, hoodieTable, keyToNewRecords, partitionPath, fileId, dataFileToBeMerged, taskContextSupplier,
+        Option.empty());
   }

   /**
@@ -80,7 +83,7 @@ public class HoodieConcatHandle<T extends HoodieRecordPayload, I, K, O> extends
    */
   @Override
   public void write(GenericRecord oldRecord) {
-    String key = oldRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
+    String key = KeyGenUtils.getRecordKeyFromGenericRecord(oldRecord, keyGeneratorOpt);
     try {
       fileWriter.writeAvro(key, oldRecord);
     } catch (IOException | RuntimeException e) {
@@ -24,6 +24,7 @@ import org.apache.hudi.common.bloom.BloomFilterFactory;
 import org.apache.hudi.common.engine.TaskContextSupplier;
 import org.apache.hudi.common.fs.FSUtils;
 import org.apache.hudi.common.model.HoodieRecordPayload;
+import org.apache.hudi.common.util.Option;
 import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.hudi.table.HoodieTable;

@@ -34,9 +35,9 @@ import org.apache.parquet.avro.AvroSchemaConverter;

 import java.io.IOException;

+import static org.apache.hudi.common.model.HoodieFileFormat.HFILE;
 import static org.apache.hudi.common.model.HoodieFileFormat.ORC;
 import static org.apache.hudi.common.model.HoodieFileFormat.PARQUET;
-import static org.apache.hudi.common.model.HoodieFileFormat.HFILE;

 public class HoodieFileWriterFactory {

@@ -45,7 +46,7 @@ public class HoodieFileWriterFactory {
       TaskContextSupplier taskContextSupplier) throws IOException {
     final String extension = FSUtils.getFileExtension(path.getName());
     if (PARQUET.getFileExtension().equals(extension)) {
-      return newParquetFileWriter(instantTime, path, config, schema, hoodieTable, taskContextSupplier);
+      return newParquetFileWriter(instantTime, path, config, schema, hoodieTable, taskContextSupplier, config.populateMetaFields());
     }
     if (HFILE.getFileExtension().equals(extension)) {
       return newHFileFileWriter(instantTime, path, config, schema, hoodieTable, taskContextSupplier);
@@ -58,16 +59,21 @@ public class HoodieFileWriterFactory {

   private static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieFileWriter<R> newParquetFileWriter(
       String instantTime, Path path, HoodieWriteConfig config, Schema schema, HoodieTable hoodieTable,
-      TaskContextSupplier taskContextSupplier) throws IOException {
-    BloomFilter filter = createBloomFilter(config);
-    HoodieAvroWriteSupport writeSupport =
-        new HoodieAvroWriteSupport(new AvroSchemaConverter(hoodieTable.getHadoopConf()).convert(schema), schema, filter);
+      TaskContextSupplier taskContextSupplier, boolean populateMetaFields) throws IOException {
+    return newParquetFileWriter(instantTime, path, config, schema, hoodieTable, taskContextSupplier, populateMetaFields, populateMetaFields);
+  }
+
+  private static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieFileWriter<R> newParquetFileWriter(
+      String instantTime, Path path, HoodieWriteConfig config, Schema schema, HoodieTable hoodieTable,
+      TaskContextSupplier taskContextSupplier, boolean populateMetaFields, boolean enableBloomFilter) throws IOException {
+    Option<BloomFilter> filter = enableBloomFilter ? Option.of(createBloomFilter(config)) : Option.empty();
+    HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter(hoodieTable.getHadoopConf()).convert(schema), schema, filter);

     HoodieAvroParquetConfig parquetConfig = new HoodieAvroParquetConfig(writeSupport, config.getParquetCompressionCodec(),
         config.getParquetBlockSize(), config.getParquetPageSize(), config.getParquetMaxFileSize(),
         hoodieTable.getHadoopConf(), config.getParquetCompressionRatio());

-    return new HoodieParquetWriter<>(instantTime, path, parquetConfig, schema, taskContextSupplier);
+    return new HoodieParquetWriter<>(instantTime, path, parquetConfig, schema, taskContextSupplier, populateMetaFields);
   }

   private static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieFileWriter<R> newHFileFileWriter(
@@ -49,9 +49,10 @@ public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends Indexe
   private final HoodieAvroWriteSupport writeSupport;
   private final String instantTime;
   private final TaskContextSupplier taskContextSupplier;
+  private final boolean populateMetaFields;

   public HoodieParquetWriter(String instantTime, Path file, HoodieAvroParquetConfig parquetConfig,
-      Schema schema, TaskContextSupplier taskContextSupplier) throws IOException {
+      Schema schema, TaskContextSupplier taskContextSupplier, boolean populateMetaFields) throws IOException {
     super(HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()),
         ParquetFileWriter.Mode.CREATE, parquetConfig.getWriteSupport(), parquetConfig.getCompressionCodecName(),
         parquetConfig.getBlockSize(), parquetConfig.getPageSize(), parquetConfig.getPageSize(),
@@ -69,14 +70,19 @@ public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends Indexe
     this.writeSupport = parquetConfig.getWriteSupport();
     this.instantTime = instantTime;
     this.taskContextSupplier = taskContextSupplier;
+    this.populateMetaFields = populateMetaFields;
   }

   @Override
   public void writeAvroWithMetadata(R avroRecord, HoodieRecord record) throws IOException {
+    if (populateMetaFields) {
       prepRecordWithMetadata(avroRecord, record, instantTime,
           taskContextSupplier.getPartitionIdSupplier().get(), recordIndex, file.getName());
       super.write(avroRecord);
       writeSupport.add(record.getRecordKey());
+    } else {
+      super.write(avroRecord);
+    }
   }

   @Override
@@ -87,8 +93,10 @@ public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends Indexe
   @Override
   public void writeAvro(String key, IndexedRecord object) throws IOException {
     super.write(object);
+    if (populateMetaFields) {
       writeSupport.add(key);
     }
+  }

   @Override
   public long getBytesWritten() {
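Illustrative note (not part of the commit): when the writer is constructed with populateMetaFields = false, writeAvroWithMetadata() falls through to a plain super.write(avroRecord) — no _hoodie_* columns are stamped and the record key is not added to the bloom-filter write support — and writeAvro() likewise skips writeSupport.add(key). This lines up with HoodieFileWriterFactory above, which only creates a BloomFilter when meta fields are enabled. A hedged construction sketch; the wrapper class and method name are hypothetical.

import org.apache.avro.Schema;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.engine.TaskContextSupplier;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.io.storage.HoodieAvroParquetConfig;
import org.apache.hudi.io.storage.HoodieParquetWriter;

import java.io.IOException;

class ParquetWriterSketch {
  // populateMetaFields = false => meta columns are not stamped and keys are not
  // registered with the bloom-filter write support; records are written as-is.
  @SuppressWarnings({"rawtypes", "unchecked"})
  static HoodieParquetWriter open(String instantTime, Path path, HoodieAvroParquetConfig parquetConfig,
      Schema schema, TaskContextSupplier taskContextSupplier, HoodieWriteConfig config) throws IOException {
    return new HoodieParquetWriter(instantTime, path, parquetConfig, schema, taskContextSupplier, config.populateMetaFields());
  }
}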
@@ -21,6 +21,8 @@ package org.apache.hudi.keygen;
 import org.apache.avro.generic.GenericRecord;
 import org.apache.hudi.avro.HoodieAvroUtils;
 import org.apache.hudi.common.config.TypedProperties;
+import org.apache.hudi.common.model.HoodieRecord;
+import org.apache.hudi.common.util.Option;
 import org.apache.hudi.common.util.PartitionPathEncodeUtils;
 import org.apache.hudi.common.util.ReflectionUtils;
 import org.apache.hudi.common.util.StringUtils;
@@ -41,6 +43,26 @@ public class KeyGenUtils {
   protected static final String DEFAULT_PARTITION_PATH = "default";
   public static final String DEFAULT_PARTITION_PATH_SEPARATOR = "/";

+  /**
+   * Fetches record key from the GenericRecord.
+   * @param genericRecord generic record of interest.
+   * @param keyGeneratorOpt Optional BaseKeyGenerator. If not, meta field will be used.
+   * @return the record key for the passed in generic record.
+   */
+  public static String getRecordKeyFromGenericRecord(GenericRecord genericRecord, Option<BaseKeyGenerator> keyGeneratorOpt) {
+    return keyGeneratorOpt.isPresent() ? keyGeneratorOpt.get().getRecordKey(genericRecord) : genericRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
+  }
+
+  /**
+   * Fetches partition path from the GenericRecord.
+   * @param genericRecord generic record of interest.
+   * @param keyGeneratorOpt Optional BaseKeyGenerator. If not, meta field will be used.
+   * @return the partition path for the passed in generic record.
+   */
+  public static String getPartitionPathFromGenericRecord(GenericRecord genericRecord, Option<BaseKeyGenerator> keyGeneratorOpt) {
+    return keyGeneratorOpt.isPresent() ? keyGeneratorOpt.get().getRecordKey(genericRecord) : genericRecord.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString();
+  }
+
   /**
    * Extracts the record key fields in strings out of the given record key,
    * this is the reverse operation of {@link #getRecordKey(GenericRecord, String)}.
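Illustrative note (not part of the commit): a hedged sketch of the dispatch the new KeyGenUtils helper implements. With meta fields populated, the key is read back from the _hoodie_record_key column; with virtual keys it is re-computed from the data columns by the configured key generator. The wrapper class and method names below are hypothetical.

import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.keygen.BaseKeyGenerator;
import org.apache.hudi.keygen.KeyGenUtils;

final class RecordKeyLookupSketch {
  // Meta-field path: keyGeneratorOpt is empty, the _hoodie_record_key column is read.
  static String keyFromMetaColumn(GenericRecord record) {
    return KeyGenUtils.getRecordKeyFromGenericRecord(record, Option.empty());
  }

  // Virtual-key path: the key generator re-computes the key from the data columns.
  static String keyFromKeyGenerator(GenericRecord record, BaseKeyGenerator keyGenerator) {
    return KeyGenUtils.getRecordKeyFromGenericRecord(record, Option.of(keyGenerator));
  }
}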
@@ -34,6 +34,7 @@ import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock;
 import org.apache.hudi.common.table.log.block.HoodieLogBlock;
 import org.apache.hudi.common.testutils.FileCreateUtils;
 import org.apache.hudi.common.testutils.HoodieTestTable;
+import org.apache.hudi.common.util.Option;
 import org.apache.hudi.config.HoodieStorageConfig;
 import org.apache.hudi.io.storage.HoodieAvroParquetConfig;
 import org.apache.hudi.io.storage.HoodieOrcConfig;
@@ -68,11 +69,13 @@ public class HoodieWriteableTestTable extends HoodieTestTable {

   protected final Schema schema;
   protected final BloomFilter filter;
+  protected final boolean populateMetaFields;

   protected HoodieWriteableTestTable(String basePath, FileSystem fs, HoodieTableMetaClient metaClient, Schema schema, BloomFilter filter) {
     super(basePath, fs, metaClient);
     this.schema = schema;
     this.filter = filter;
+    this.populateMetaFields = metaClient.getTableConfig().populateMetaFields();
   }

   @Override
@@ -91,21 +94,25 @@ public class HoodieWriteableTestTable extends HoodieTestTable {

     if (HoodieTableConfig.HOODIE_BASE_FILE_FORMAT_PROP.defaultValue().equals(HoodieFileFormat.PARQUET)) {
       HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(
-          new AvroSchemaConverter().convert(schema), schema, filter);
+          new AvroSchemaConverter().convert(schema), schema, Option.of(filter));
       HoodieAvroParquetConfig config = new HoodieAvroParquetConfig(writeSupport, CompressionCodecName.GZIP,
           ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, 120 * 1024 * 1024,
           new Configuration(), Double.parseDouble(HoodieStorageConfig.PARQUET_COMPRESSION_RATIO.defaultValue()));
       try (HoodieParquetWriter writer = new HoodieParquetWriter(
           currentInstantTime,
           new Path(Paths.get(basePath, partition, fileName).toString()),
-          config, schema, contextSupplier)) {
+          config, schema, contextSupplier, populateMetaFields)) {
         int seqId = 1;
         for (HoodieRecord record : records) {
           GenericRecord avroRecord = (GenericRecord) record.getData().getInsertValue(schema).get();
+          if (populateMetaFields) {
             HoodieAvroUtils.addCommitMetadataToRecord(avroRecord, currentInstantTime, String.valueOf(seqId++));
             HoodieAvroUtils.addHoodieKeyToRecord(avroRecord, record.getRecordKey(), record.getPartitionPath(), fileName);
             writer.writeAvro(record.getRecordKey(), avroRecord);
             filter.add(record.getRecordKey());
+          } else {
+            writer.writeAvro(record.getRecordKey(), avroRecord);
+          }
         }
       }
     } else if (HoodieTableConfig.HOODIE_BASE_FILE_FORMAT_PROP.defaultValue().equals(HoodieFileFormat.ORC)) {
@@ -130,7 +130,7 @@ public class FlinkHoodieSimpleIndex<T extends HoodieRecordPayload> extends Flink
       List<Pair<String, HoodieBaseFile>> latestBaseFiles) {

     List<HoodieKeyLocationFetchHandle<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>>> hoodieKeyLocationFetchHandles =
-        context.map(latestBaseFiles, partitionPathBaseFile -> new HoodieKeyLocationFetchHandle<>(config, hoodieTable, partitionPathBaseFile), parallelism);
+        context.map(latestBaseFiles, partitionPathBaseFile -> new HoodieKeyLocationFetchHandle<>(config, hoodieTable, partitionPathBaseFile, Option.empty()), parallelism);
     Map<HoodieKey, HoodieRecordLocation> recordLocations = new HashMap<>();
     hoodieKeyLocationFetchHandles.stream()
         .flatMap(handle -> handle.locations())
@@ -24,6 +24,7 @@ import org.apache.hudi.common.fs.FSUtils;
 import org.apache.hudi.common.model.HoodieBaseFile;
 import org.apache.hudi.common.model.HoodieRecord;
 import org.apache.hudi.common.model.HoodieRecordPayload;
+import org.apache.hudi.common.util.Option;
 import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.hudi.exception.HoodieException;
 import org.apache.hudi.exception.HoodieIOException;
@@ -65,7 +66,7 @@ public class FlinkMergeAndReplaceHandle<T extends HoodieRecordPayload, I, K, O>
       Iterator<HoodieRecord<T>> recordItr, String partitionPath, String fileId,
       TaskContextSupplier taskContextSupplier, Path basePath) {
     super(config, instantTime, hoodieTable, recordItr, partitionPath, fileId, taskContextSupplier,
-        new HoodieBaseFile(basePath.toString()));
+        new HoodieBaseFile(basePath.toString()), Option.empty());
     // delete invalid data files generated by task retry.
     if (getAttemptId() > 0) {
       deleteInvalidDataFile(getAttemptId() - 1);
@@ -23,6 +23,7 @@ import org.apache.hudi.common.engine.TaskContextSupplier;
 import org.apache.hudi.common.fs.FSUtils;
 import org.apache.hudi.common.model.HoodieRecord;
 import org.apache.hudi.common.model.HoodieRecordPayload;
+import org.apache.hudi.common.util.Option;
 import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.hudi.exception.HoodieException;
 import org.apache.hudi.exception.HoodieIOException;
@@ -65,7 +66,7 @@ public class FlinkMergeHandle<T extends HoodieRecordPayload, I, K, O>
   public FlinkMergeHandle(HoodieWriteConfig config, String instantTime, HoodieTable<T, I, K, O> hoodieTable,
       Iterator<HoodieRecord<T>> recordItr, String partitionPath, String fileId,
       TaskContextSupplier taskContextSupplier) {
-    super(config, instantTime, hoodieTable, recordItr, partitionPath, fileId, taskContextSupplier);
+    super(config, instantTime, hoodieTable, recordItr, partitionPath, fileId, taskContextSupplier, Option.empty());
     if (rolloverPaths == null) {
       // #makeOldAndNewFilePaths may already initialize it already
       rolloverPaths = new ArrayList<>();
@@ -351,10 +351,10 @@ public class HoodieFlinkCopyOnWriteTable<T extends HoodieRecordPayload> extends
       Map<String, HoodieRecord<T>> keyToNewRecords, HoodieBaseFile dataFileToBeMerged) {
     if (requireSortedRecords()) {
       return new HoodieSortedMergeHandle<>(config, instantTime, this, keyToNewRecords, partitionPath, fileId,
-          dataFileToBeMerged, taskContextSupplier);
+          dataFileToBeMerged, taskContextSupplier, Option.empty());
     } else {
       return new HoodieMergeHandle<>(config, instantTime, this, keyToNewRecords, partitionPath, fileId,
-          dataFileToBeMerged,taskContextSupplier);
+          dataFileToBeMerged,taskContextSupplier, Option.empty());
     }
   }

@@ -283,9 +283,9 @@ public abstract class BaseJavaCommitActionExecutor<T extends HoodieRecordPayload

   protected HoodieMergeHandle getUpdateHandle(String partitionPath, String fileId, Iterator<HoodieRecord<T>> recordItr) {
     if (table.requireSortedRecords()) {
-      return new HoodieSortedMergeHandle<>(config, instantTime, table, recordItr, partitionPath, fileId, taskContextSupplier);
+      return new HoodieSortedMergeHandle<>(config, instantTime, table, recordItr, partitionPath, fileId, taskContextSupplier, Option.empty());
     } else {
-      return new HoodieMergeHandle<>(config, instantTime, table, recordItr, partitionPath, fileId, taskContextSupplier);
+      return new HoodieMergeHandle<>(config, instantTime, table, recordItr, partitionPath, fileId, taskContextSupplier, Option.empty());
     }
   }

@@ -293,7 +293,7 @@ public abstract class BaseJavaCommitActionExecutor<T extends HoodieRecordPayload
       Map<String, HoodieRecord<T>> keyToNewRecords,
       HoodieBaseFile dataFileToBeMerged) {
     return new HoodieMergeHandle<>(config, instantTime, table, keyToNewRecords,
-        partitionPath, fileId, dataFileToBeMerged, taskContextSupplier);
+        partitionPath, fileId, dataFileToBeMerged, taskContextSupplier, Option.empty());
   }

   @Override
@@ -25,6 +25,9 @@
   }, {
     "name" : "_row_key",
     "type" : "string"
+  }, {
+    "name" : "partition_path",
+    "type" : "string"
   }, {
     "name" : "rider",
     "type" : "string"
@@ -21,6 +21,7 @@ package org.apache.hudi.index.simple;
 import org.apache.hudi.client.WriteStatus;
 import org.apache.hudi.client.utils.SparkMemoryUtils;
 import org.apache.hudi.client.common.HoodieSparkEngineContext;
+import org.apache.hudi.common.config.TypedProperties;
 import org.apache.hudi.common.engine.HoodieEngineContext;
 import org.apache.hudi.common.model.HoodieBaseFile;
 import org.apache.hudi.common.model.HoodieKey;
@@ -30,15 +31,19 @@ import org.apache.hudi.common.model.HoodieRecordPayload;
 import org.apache.hudi.common.util.Option;
 import org.apache.hudi.common.util.collection.Pair;
 import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.hudi.exception.HoodieIOException;
 import org.apache.hudi.index.HoodieIndexUtils;
 import org.apache.hudi.index.SparkHoodieIndex;
 import org.apache.hudi.io.HoodieKeyLocationFetchHandle;
+import org.apache.hudi.keygen.BaseKeyGenerator;
+import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory;
 import org.apache.hudi.table.HoodieTable;

 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;

+import java.io.IOException;
 import java.util.List;

 import scala.Tuple2;
@@ -146,8 +151,15 @@ public class SparkHoodieSimpleIndex<T extends HoodieRecordPayload> extends Spark
       List<Pair<String, HoodieBaseFile>> baseFiles) {
     JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context);
     int fetchParallelism = Math.max(1, Math.max(baseFiles.size(), parallelism));
+
+    try {
+      Option<BaseKeyGenerator> keyGeneratorOpt = config.populateMetaFields() ? Option.empty()
+          : Option.of((BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(config.getProps())));
       return jsc.parallelize(baseFiles, fetchParallelism)
-          .flatMapToPair(partitionPathBaseFile -> new HoodieKeyLocationFetchHandle(config, hoodieTable, partitionPathBaseFile)
+          .flatMapToPair(partitionPathBaseFile -> new HoodieKeyLocationFetchHandle(config, hoodieTable, partitionPathBaseFile, keyGeneratorOpt)
               .locations().map(x -> Tuple2.apply(((Pair)x).getLeft(), ((Pair)x).getRight())).iterator());
+    } catch (IOException e) {
+      throw new HoodieIOException("KeyGenerator instantiation failed ", e);
+    }
   }
 }
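Illustrative note (not part of the commit): the pattern introduced here — only instantiate a key generator when meta fields are disabled, and require it to be a BaseKeyGenerator — recurs at the other Spark call sites in this diff. The decision in isolation; the wrapper class and method name are hypothetical.

import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.keygen.BaseKeyGenerator;
import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory;

import java.io.IOException;

class KeyGeneratorResolutionSketch {
  // Meta fields on  -> no key generator needed, keys are already materialized in the file.
  // Meta fields off -> build the configured key generator; the cast assumes it extends BaseKeyGenerator.
  static Option<BaseKeyGenerator> resolve(HoodieWriteConfig config) throws IOException {
    return config.populateMetaFields()
        ? Option.empty()
        : Option.of((BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(config.getProps())));
  }
}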
@@ -97,6 +97,8 @@ public abstract class BuiltinKeyGenerator extends BaseKeyGenerator implements Sp
    * @param structType schema of the internalRow.
    * @return the partition path.
    */
+  @Override
+  @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING)
   public String getPartitionPath(InternalRow internalRow, StructType structType) {
     try {
       initDeserializer(structType);
@@ -19,6 +19,8 @@
 package org.apache.hudi.keygen;

 import org.apache.spark.sql.Row;
+import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.types.StructType;

 /**
  * Spark key generator interface.
@@ -28,4 +30,6 @@ public interface SparkKeyGeneratorInterface extends KeyGeneratorInterface {
   String getRecordKey(Row row);

   String getPartitionPath(Row row);
+
+  String getPartitionPath(InternalRow internalRow, StructType structType);
 }
@@ -27,6 +27,7 @@ import org.apache.hudi.avro.model.HoodieRollbackMetadata;
|
|||||||
import org.apache.hudi.avro.model.HoodieSavepointMetadata;
|
import org.apache.hudi.avro.model.HoodieSavepointMetadata;
|
||||||
import org.apache.hudi.client.WriteStatus;
|
import org.apache.hudi.client.WriteStatus;
|
||||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||||
|
import org.apache.hudi.common.config.TypedProperties;
|
||||||
import org.apache.hudi.common.engine.HoodieEngineContext;
|
import org.apache.hudi.common.engine.HoodieEngineContext;
|
||||||
import org.apache.hudi.common.model.HoodieBaseFile;
|
import org.apache.hudi.common.model.HoodieBaseFile;
|
||||||
 import org.apache.hudi.common.model.HoodieKey;
@@ -37,11 +38,14 @@ import org.apache.hudi.common.table.timeline.HoodieInstant;
 import org.apache.hudi.common.table.timeline.HoodieTimeline;
 import org.apache.hudi.common.util.Option;
 import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.hudi.exception.HoodieIOException;
 import org.apache.hudi.exception.HoodieNotSupportedException;
 import org.apache.hudi.exception.HoodieUpsertException;
 import org.apache.hudi.io.HoodieCreateHandle;
 import org.apache.hudi.io.HoodieMergeHandle;
 import org.apache.hudi.io.HoodieSortedMergeHandle;
+import org.apache.hudi.keygen.BaseKeyGenerator;
+import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory;
 import org.apache.hudi.table.action.HoodieWriteMetadata;
 import org.apache.hudi.table.action.bootstrap.HoodieBootstrapWriteMetadata;
 import org.apache.hudi.table.action.bootstrap.SparkBootstrapCommitActionExecutor;
@@ -210,12 +214,21 @@ public class HoodieSparkCopyOnWriteTable<T extends HoodieRecordPayload> extends

 protected HoodieMergeHandle getUpdateHandle(String instantTime, String partitionPath, String fileId,
 Map<String, HoodieRecord<T>> keyToNewRecords, HoodieBaseFile dataFileToBeMerged) {
+Option<BaseKeyGenerator> keyGeneratorOpt = Option.empty();
+if (!config.populateMetaFields()) {
+try {
+keyGeneratorOpt = Option.of((BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(config.getProps())));
+} catch (IOException e) {
+throw new HoodieIOException("Only BaseKeyGenerator (or any key generator that extends from BaseKeyGenerator) are supported when meta "
++ "columns are disabled. Please choose the right key generator if you wish to disable meta fields.", e);
+}
+}
 if (requireSortedRecords()) {
 return new HoodieSortedMergeHandle<>(config, instantTime, this, keyToNewRecords, partitionPath, fileId,
-dataFileToBeMerged, taskContextSupplier);
+dataFileToBeMerged, taskContextSupplier, keyGeneratorOpt);
 } else {
-return new HoodieMergeHandle<>(config, instantTime, this, keyToNewRecords, partitionPath, fileId,
-dataFileToBeMerged,taskContextSupplier);
+return new HoodieMergeHandle(config, instantTime, this, keyToNewRecords, partitionPath, fileId,
+dataFileToBeMerged, taskContextSupplier, keyGeneratorOpt);
 }
 }
 }

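Reviewer note (not part of the patch): the hunk above resolves the key generator once per merge handle and only when meta fields are disabled. A minimal, self-contained sketch of that resolution step, using only the calls the patch itself relies on (the class name `KeyGenResolution` is hypothetical):

```java
import java.io.IOException;

import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.keygen.BaseKeyGenerator;
import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory;

final class KeyGenResolution {
  // Empty when meta fields are materialized in the file; otherwise build the
  // key generator from the writer's properties so keys can be re-derived.
  static Option<BaseKeyGenerator> resolve(HoodieWriteConfig config) {
    if (config.populateMetaFields()) {
      return Option.empty();
    }
    try {
      return Option.of((BaseKeyGenerator) HoodieSparkKeyGeneratorFactory
          .createKeyGenerator(new TypedProperties(config.getProps())));
    } catch (IOException e) {
      throw new HoodieIOException("Only BaseKeyGenerator implementations are supported when meta fields are disabled", e);
    }
  }
}
```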
@@ -50,6 +50,7 @@ import org.apache.hudi.exception.HoodieClusteringException;
 import org.apache.hudi.io.IOUtils;
 import org.apache.hudi.io.storage.HoodieFileReader;
 import org.apache.hudi.io.storage.HoodieFileReaderFactory;
+import org.apache.hudi.keygen.KeyGenUtils;
 import org.apache.hudi.table.HoodieTable;
 import org.apache.hudi.table.action.HoodieWriteMetadata;
 import org.apache.hudi.table.action.cluster.strategy.ClusteringExecutionStrategy;
@@ -247,8 +248,8 @@ public class SparkExecuteClusteringCommitActionExecutor<T extends HoodieRecordPa
 */
 private HoodieRecord<? extends HoodieRecordPayload> transform(IndexedRecord indexedRecord) {
 GenericRecord record = (GenericRecord) indexedRecord;
-String key = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
-String partition = record.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString();
+String key = KeyGenUtils.getRecordKeyFromGenericRecord(record, keyGeneratorOpt);
+String partition = KeyGenUtils.getPartitionPathFromGenericRecord(record, keyGeneratorOpt);
 HoodieKey hoodieKey = new HoodieKey(key, partition);

 HoodieRecordPayload avroPayload = ReflectionUtils.loadPayload(table.getMetaClient().getTableConfig().getPayloadClass(),

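Reviewer note (not part of the patch): with virtual keys the `_hoodie_record_key` / `_hoodie_partition_path` columns are not written, so the clustering transform above re-derives both values through the key generator. A condensed sketch of the two extraction paths, assuming `keyGeneratorOpt` was resolved as in the write path (the class name is hypothetical):

```java
import org.apache.avro.generic.GenericRecord;

import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.keygen.BaseKeyGenerator;
import org.apache.hudi.keygen.KeyGenUtils;

final class HoodieKeyExtraction {
  // Meta fields populated: read the pre-materialized meta columns directly.
  static HoodieKey fromMetaColumns(GenericRecord record) {
    return new HoodieKey(
        record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(),
        record.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString());
  }

  // Virtual keys: derive the key and partition path from the data columns via the key generator.
  static HoodieKey fromKeyGenerator(GenericRecord record, Option<BaseKeyGenerator> keyGeneratorOpt) {
    return new HoodieKey(
        KeyGenUtils.getRecordKeyFromGenericRecord(record, keyGeneratorOpt),
        KeyGenUtils.getPartitionPathFromGenericRecord(record, keyGeneratorOpt));
  }
}
```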
@@ -20,6 +20,7 @@ package org.apache.hudi.table.action.commit;

 import org.apache.hudi.client.WriteStatus;
 import org.apache.hudi.client.utils.SparkMemoryUtils;
+import org.apache.hudi.common.config.TypedProperties;
 import org.apache.hudi.common.engine.HoodieEngineContext;
 import org.apache.hudi.common.model.HoodieCommitMetadata;
 import org.apache.hudi.common.model.HoodieKey;
@@ -37,6 +38,7 @@ import org.apache.hudi.common.util.ReflectionUtils;
 import org.apache.hudi.common.util.collection.Pair;
 import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.hudi.exception.HoodieCommitException;
+import org.apache.hudi.exception.HoodieIOException;
 import org.apache.hudi.exception.HoodieMetadataException;
 import org.apache.hudi.exception.HoodieUpsertException;
 import org.apache.hudi.execution.SparkLazyInsertIterable;
@@ -44,6 +46,8 @@ import org.apache.hudi.io.CreateHandleFactory;
 import org.apache.hudi.io.HoodieMergeHandle;
 import org.apache.hudi.io.HoodieSortedMergeHandle;
 import org.apache.hudi.io.storage.HoodieConcatHandle;
+import org.apache.hudi.keygen.BaseKeyGenerator;
+import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory;
 import org.apache.hudi.metadata.HoodieTableMetadataWriter;
 import org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter;
 import org.apache.hudi.table.HoodieSparkTable;
@@ -78,6 +82,7 @@ public abstract class BaseSparkCommitActionExecutor<T extends HoodieRecordPayloa
 BaseCommitActionExecutor<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>, HoodieWriteMetadata> {

 private static final Logger LOG = LogManager.getLogger(BaseSparkCommitActionExecutor.class);
+protected Option<BaseKeyGenerator> keyGeneratorOpt = Option.empty();

 public BaseSparkCommitActionExecutor(HoodieEngineContext context,
 HoodieWriteConfig config,
@@ -85,6 +90,7 @@ public abstract class BaseSparkCommitActionExecutor<T extends HoodieRecordPayloa
 String instantTime,
 WriteOperationType operationType) {
 super(context, config, table, instantTime, operationType, Option.empty());
+initKeyGenIfNeeded(config.populateMetaFields());
 }

 public BaseSparkCommitActionExecutor(HoodieEngineContext context,
@@ -94,6 +100,17 @@ public abstract class BaseSparkCommitActionExecutor<T extends HoodieRecordPayloa
 WriteOperationType operationType,
 Option extraMetadata) {
 super(context, config, table, instantTime, operationType, extraMetadata);
+initKeyGenIfNeeded(config.populateMetaFields());
+}

+private void initKeyGenIfNeeded(boolean populateMetaFields) {
+if (!populateMetaFields) {
+try {
+keyGeneratorOpt = Option.of((BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(config.getProps())));
+} catch (IOException e) {
+throw new HoodieIOException("Only BaseKeyGenerators are supported when meta columns are disabled ", e);
+}
+}
+}
 }

 private JavaRDD<HoodieRecord<T>> clusteringHandleUpdate(JavaRDD<HoodieRecord<T>> inputRecordsRDD) {
@@ -327,11 +344,12 @@ public abstract class BaseSparkCommitActionExecutor<T extends HoodieRecordPayloa

 protected HoodieMergeHandle getUpdateHandle(String partitionPath, String fileId, Iterator<HoodieRecord<T>> recordItr) {
 if (table.requireSortedRecords()) {
-return new HoodieSortedMergeHandle<>(config, instantTime, (HoodieSparkTable) table, recordItr, partitionPath, fileId, taskContextSupplier);
+return new HoodieSortedMergeHandle<>(config, instantTime, (HoodieSparkTable) table, recordItr, partitionPath, fileId, taskContextSupplier,
+keyGeneratorOpt);
 } else if (!WriteOperationType.isChangingRecords(operationType) && config.allowDuplicateInserts()) {
-return new HoodieConcatHandle<>(config, instantTime, table, recordItr, partitionPath, fileId, taskContextSupplier);
+return new HoodieConcatHandle<>(config, instantTime, table, recordItr, partitionPath, fileId, taskContextSupplier, keyGeneratorOpt);
 } else {
-return new HoodieMergeHandle<>(config, instantTime, table, recordItr, partitionPath, fileId, taskContextSupplier);
+return new HoodieMergeHandle<>(config, instantTime, table, recordItr, partitionPath, fileId, taskContextSupplier, keyGeneratorOpt);
 }
 }

@@ -24,6 +24,7 @@ import org.apache.hadoop.fs.Path;
 import org.apache.hudi.avro.model.HoodieClusteringPlan;
 import org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata;
 import org.apache.hudi.avro.model.HoodieCleanMetadata;
+import org.apache.hudi.common.config.TypedProperties;
 import org.apache.hudi.common.fs.ConsistencyGuardConfig;
 import org.apache.hudi.common.fs.FSUtils;
 import org.apache.hudi.common.model.FileSlice;
@@ -66,6 +67,9 @@ import org.apache.hudi.exception.HoodieUpsertException;
 import org.apache.hudi.index.HoodieIndex;
 import org.apache.hudi.index.HoodieIndex.IndexType;
 import org.apache.hudi.io.HoodieMergeHandle;
+import org.apache.hudi.keygen.BaseKeyGenerator;
+import org.apache.hudi.keygen.KeyGenerator;
+import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory;
 import org.apache.hudi.table.HoodieSparkCopyOnWriteTable;
 import org.apache.hudi.table.HoodieSparkTable;
 import org.apache.hudi.table.HoodieTable;
@@ -120,6 +124,7 @@ import static org.apache.hudi.config.HoodieClusteringConfig.CLUSTERING_EXECUTION
 import static org.apache.hudi.testutils.Assertions.assertNoWriteErrors;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNull;
 import static org.junit.jupiter.api.Assertions.assertThrows;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 import static org.junit.jupiter.api.Assertions.fail;
@@ -136,10 +141,32 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
 }
 };

-private static Stream<Arguments> configParams() {
+private static Stream<Arguments> smallInsertHandlingParams() {
 return Arrays.stream(new Boolean[][] {{true}, {false}}).map(Arguments::of);
 }

+private static Stream<Arguments> populateMetaFieldsParams() {
+return Arrays.stream(new Boolean[][] {{true}, {false}}).map(Arguments::of);
+}

+private static Stream<Arguments> rollbackFailedCommitsParams() {
+return Stream.of(
+Arguments.of(HoodieFailedWritesCleaningPolicy.LAZY, true),
+Arguments.of(HoodieFailedWritesCleaningPolicy.LAZY, false),
+Arguments.of(HoodieFailedWritesCleaningPolicy.NEVER, true),
+Arguments.of(HoodieFailedWritesCleaningPolicy.NEVER, false)
+);
+}

+private static Stream<Arguments> rollbackAfterConsistencyCheckFailureParams() {
+return Stream.of(
+Arguments.of(true, true),
+Arguments.of(true, false),
+Arguments.of(false, true),
+Arguments.of(false, false)
+);
+}

 private HoodieTestTable testTable;

 @BeforeEach
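Reviewer note (not part of the patch): the new `populateMetaFieldsParams()` supplier turns the plain `@Test` methods below into JUnit 5 parameterized tests, so each scenario now runs once with meta fields and once with virtual keys. A standalone illustration of the mechanism (the test class and assertion are placeholders):

```java
import java.util.Arrays;
import java.util.stream.Stream;

import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;

import static org.junit.jupiter.api.Assertions.assertNotNull;

class PopulateMetaFieldsParamsExample {

  private static Stream<Arguments> populateMetaFieldsParams() {
    return Arrays.stream(new Boolean[][] {{true}, {false}}).map(Arguments::of);
  }

  @ParameterizedTest
  @MethodSource("populateMetaFieldsParams")
  void runsOncePerFlag(boolean populateMetaFields) {
    // A real test would build its HoodieWriteConfig from this flag, adding
    // key-generator properties when populateMetaFields is false.
    assertNotNull(Boolean.valueOf(populateMetaFields));
  }
}
```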
@@ -150,50 +177,56 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
 /**
 * Test Auto Commit behavior for HoodieWriteClient insert API.
 */
-@Test
-public void testAutoCommitOnInsert() throws Exception {
-testAutoCommit(SparkRDDWriteClient::insert, false);
+@ParameterizedTest
+@MethodSource("populateMetaFieldsParams")
+public void testAutoCommitOnInsert(boolean populateMetaFields) throws Exception {
+testAutoCommit(SparkRDDWriteClient::insert, false, populateMetaFields);
 }

 /**
 * Test Auto Commit behavior for HoodieWriteClient insertPrepped API.
 */
-@Test
-public void testAutoCommitOnInsertPrepped() throws Exception {
-testAutoCommit(SparkRDDWriteClient::insertPreppedRecords, true);
+@ParameterizedTest
+@MethodSource("populateMetaFieldsParams")
+public void testAutoCommitOnInsertPrepped(boolean populateMetaFields) throws Exception {
+testAutoCommit(SparkRDDWriteClient::insertPreppedRecords, true, populateMetaFields);
 }

 /**
 * Test Auto Commit behavior for HoodieWriteClient upsert API.
 */
-@Test
-public void testAutoCommitOnUpsert() throws Exception {
-testAutoCommit(SparkRDDWriteClient::upsert, false);
+@ParameterizedTest
+@MethodSource("populateMetaFieldsParams")
+public void testAutoCommitOnUpsert(boolean populateMetaFields) throws Exception {
+testAutoCommit(SparkRDDWriteClient::upsert, false, populateMetaFields);
 }

 /**
 * Test Auto Commit behavior for HoodieWriteClient upsert Prepped API.
 */
-@Test
-public void testAutoCommitOnUpsertPrepped() throws Exception {
-testAutoCommit(SparkRDDWriteClient::upsertPreppedRecords, true);
+@ParameterizedTest
+@MethodSource("populateMetaFieldsParams")
+public void testAutoCommitOnUpsertPrepped(boolean populateMetaFields) throws Exception {
+testAutoCommit(SparkRDDWriteClient::upsertPreppedRecords, true, populateMetaFields);
 }

 /**
 * Test Auto Commit behavior for HoodieWriteClient bulk-insert API.
 */
-@Test
-public void testAutoCommitOnBulkInsert() throws Exception {
-testAutoCommit(SparkRDDWriteClient::bulkInsert, false);
+@ParameterizedTest
+@MethodSource("populateMetaFieldsParams")
+public void testAutoCommitOnBulkInsert(boolean populateMetaFields) throws Exception {
+testAutoCommit(SparkRDDWriteClient::bulkInsert, false, populateMetaFields);
 }

 /**
 * Test Auto Commit behavior for HoodieWriteClient bulk-insert prepped API.
 */
-@Test
-public void testAutoCommitOnBulkInsertPrepped() throws Exception {
+@ParameterizedTest
+@MethodSource("populateMetaFieldsParams")
+public void testAutoCommitOnBulkInsertPrepped(boolean populateMetaFields) throws Exception {
 testAutoCommit((writeClient, recordRDD, instantTime) -> writeClient.bulkInsertPreppedRecords(recordRDD, instantTime,
-Option.empty()), true);
+Option.empty()), true, populateMetaFields);
 }

 /**
@@ -203,15 +236,16 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
 * @throws Exception in case of failure
 */
 private void testAutoCommit(Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn,
-boolean isPrepped) throws Exception {
+boolean isPrepped, boolean populateMetaFields) throws Exception {
 // Set autoCommit false
-HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).build();
-try (SparkRDDWriteClient client = getHoodieWriteClient(cfg);) {
+HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder().withAutoCommit(false);
+addAppropriatePropsForPopulateMetaFields(cfgBuilder, populateMetaFields);
+try (SparkRDDWriteClient client = getHoodieWriteClient(cfgBuilder.build());) {

 String prevCommitTime = "000";
 String newCommitTime = "001";
 int numRecords = 200;
-JavaRDD<WriteStatus> result = insertFirstBatch(cfg, client, newCommitTime, prevCommitTime, numRecords, writeFn,
+JavaRDD<WriteStatus> result = insertFirstBatch(cfgBuilder.build(), client, newCommitTime, prevCommitTime, numRecords, writeFn,
 isPrepped, false, numRecords);

 assertFalse(testTable.commitExists(newCommitTime),
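Reviewer note: the body of `addAppropriatePropsForPopulateMetaFields(...)` is not included in the hunks shown here. Purely as a hedged sketch of what such a helper could look like (names and exact properties are assumptions, not the patch's actual implementation):

```java
import java.util.Properties;

import org.apache.hudi.config.HoodieWriteConfig;

final class TestConfigHelper {
  // Hypothetical sketch: propagate the flag to the write config and, when meta
  // fields are disabled, layer on the key-generator properties (record key field,
  // partition path field, key generator class) so keys can be derived from data columns.
  static void addAppropriatePropsForPopulateMetaFields(HoodieWriteConfig.Builder builder,
                                                       boolean populateMetaFields,
                                                       Properties keyGenProps) {
    builder.withPopulateMetaFields(populateMetaFields);
    if (!populateMetaFields) {
      builder.withProps(keyGenProps);
    }
  }
}
```

The real helper is called with just the builder and the flag, so it presumably pulls its key-generator properties from the test base (the `getPropertiesForKeyGen()` used elsewhere in this diff); the extra parameter above only keeps the sketch self-contained.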
@@ -225,25 +259,28 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
 /**
 * Test De-duplication behavior for HoodieWriteClient insert API.
 */
-@Test
-public void testDeduplicationOnInsert() throws Exception {
-testDeduplication(SparkRDDWriteClient::insert);
+@ParameterizedTest
+@MethodSource("populateMetaFieldsParams")
+public void testDeduplicationOnInsert(boolean populateMetaFields) throws Exception {
+testDeduplication(SparkRDDWriteClient::insert, populateMetaFields);
 }

 /**
 * Test De-duplication behavior for HoodieWriteClient bulk-insert API.
 */
-@Test
-public void testDeduplicationOnBulkInsert() throws Exception {
-testDeduplication(SparkRDDWriteClient::bulkInsert);
+@ParameterizedTest
+@MethodSource("populateMetaFieldsParams")
+public void testDeduplicationOnBulkInsert(boolean populateMetaFields) throws Exception {
+testDeduplication(SparkRDDWriteClient::bulkInsert, populateMetaFields);
 }

 /**
 * Test De-duplication behavior for HoodieWriteClient upsert API.
 */
-@Test
-public void testDeduplicationOnUpsert() throws Exception {
-testDeduplication(SparkRDDWriteClient::upsert);
+@ParameterizedTest
+@MethodSource("populateMetaFieldsParams")
+public void testDeduplicationOnUpsert(boolean populateMetaFields) throws Exception {
+testDeduplication(SparkRDDWriteClient::upsert, populateMetaFields);
 }

 /**
@@ -253,7 +290,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
 * @throws Exception in case of failure
 */
 private void testDeduplication(
-Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn) throws Exception {
+Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn, boolean populateMetaFields) throws Exception {
 String newCommitTime = "001";

 String recordKey = UUID.randomUUID().toString();
@@ -289,8 +326,11 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {

 // Perform write-action and check
 JavaRDD<HoodieRecord> recordList = jsc.parallelize(Arrays.asList(recordOne, recordTwo, recordThree), 1);
-try (SparkRDDWriteClient client = getHoodieWriteClient(getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY)
-.combineInput(true, true).build());) {
+HoodieWriteConfig.Builder configBuilder = getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY)
+.combineInput(true, true);
+addAppropriatePropsForPopulateMetaFields(configBuilder, populateMetaFields);

+try (SparkRDDWriteClient client = getHoodieWriteClient(configBuilder.build());) {
 client.startCommitWithTime(newCommitTime);
 List<WriteStatus> statuses = writeFn.apply(client, recordList, newCommitTime).collect();
 assertNoWriteErrors(statuses);
@@ -321,17 +361,23 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
 /**
 * Test Upsert API.
 */
-@Test
-public void testUpserts() throws Exception {
-testUpsertsInternal(getConfig(), SparkRDDWriteClient::upsert, false);
+@ParameterizedTest
+@MethodSource("populateMetaFieldsParams")
+public void testUpserts(boolean populateMetaFields) throws Exception {
+HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder();
+addAppropriatePropsForPopulateMetaFields(cfgBuilder, populateMetaFields);
+testUpsertsInternal(cfgBuilder.build(), SparkRDDWriteClient::upsert, false);
 }

 /**
 * Test UpsertPrepped API.
 */
-@Test
-public void testUpsertsPrepped() throws Exception {
-testUpsertsInternal(getConfig(), SparkRDDWriteClient::upsertPreppedRecords, true);
+@ParameterizedTest
+@MethodSource("populateMetaFieldsParams")
+public void testUpsertsPrepped(boolean populateMetaFields) throws Exception {
+HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder();
+addAppropriatePropsForPopulateMetaFields(cfgBuilder, populateMetaFields);
+testUpsertsInternal(cfgBuilder.build(), SparkRDDWriteClient::upsertPreppedRecords, true);
 }

 /**
@@ -348,9 +394,11 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
 HoodieWriteConfig hoodieWriteConfig = getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY)
 .withProps(config.getProps()).withTimelineLayoutVersion(
 VERSION_0).build();

 HoodieTableMetaClient.withPropertyBuilder()
 .fromMetaClient(metaClient)
 .setTimelineLayoutVersion(VERSION_0)
+.setPopulateMetaFields(config.populateMetaFields())
 .initTable(metaClient.getHadoopConf(), metaClient.getBasePath());

 SparkRDDWriteClient client = getHoodieWriteClient(hoodieWriteConfig);
@@ -360,7 +408,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
 String initCommitTime = "000";
 int numRecords = 200;
 insertFirstBatch(hoodieWriteConfig, client, newCommitTime, initCommitTime, numRecords, SparkRDDWriteClient::insert,
-isPrepped, true, numRecords);
+isPrepped, true, numRecords, config.populateMetaFields());

 // Write 2 (updates)
 String prevCommitTime = newCommitTime;
@@ -369,7 +417,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
 String commitTimeBetweenPrevAndNew = "002";
 updateBatch(hoodieWriteConfig, client, newCommitTime, prevCommitTime,
 Option.of(Arrays.asList(commitTimeBetweenPrevAndNew)), initCommitTime, numRecords, writeFn, isPrepped, true,
-numRecords, 200, 2);
+numRecords, 200, 2, config.populateMetaFields());

 // Delete 1
 prevCommitTime = newCommitTime;
@@ -378,7 +426,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {

 deleteBatch(hoodieWriteConfig, client, newCommitTime, prevCommitTime,
 initCommitTime, numRecords, SparkRDDWriteClient::delete, isPrepped, true,
-0, 150);
+0, 150, config.populateMetaFields());

 // Now simulate an upgrade and perform a restore operation
 HoodieWriteConfig newConfig = getConfigBuilder().withProps(config.getProps()).withTimelineLayoutVersion(
@@ -440,7 +488,9 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {

 try {
 HoodieMergeHandle handle = new HoodieMergeHandle(cfg, instantTime, table, new HashMap<>(),
-partitionPath, FSUtils.getFileId(baseFilePath.getName()), baseFile, new SparkTaskContextSupplier());
+partitionPath, FSUtils.getFileId(baseFilePath.getName()), baseFile, new SparkTaskContextSupplier(),
+config.populateMetaFields() ? Option.empty() :
+Option.of((BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(config.getProps()))));
 WriteStatus writeStatus = new WriteStatus(false, 0.0);
 writeStatus.setStat(new HoodieWriteStat());
 writeStatus.getStat().setNumWrites(0);
@@ -454,7 +504,9 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
 cfg.getProps().setProperty("hoodie.merge.data.validation.enabled", "true");
 HoodieWriteConfig cfg2 = HoodieWriteConfig.newBuilder().withProps(cfg.getProps()).build();
 HoodieMergeHandle handle = new HoodieMergeHandle(cfg2, newInstantTime, table, new HashMap<>(),
-partitionPath, FSUtils.getFileId(baseFilePath.getName()), baseFile, new SparkTaskContextSupplier());
+partitionPath, FSUtils.getFileId(baseFilePath.getName()), baseFile, new SparkTaskContextSupplier(),
+config.populateMetaFields() ? Option.empty() :
+Option.of((BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(config.getProps()))));
 WriteStatus writeStatus = new WriteStatus(false, 0.0);
 writeStatus.setStat(new HoodieWriteStat());
 writeStatus.getStat().setNumWrites(0);
@@ -470,17 +522,23 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
 /**
 * Test Insert API for HoodieConcatHandle.
 */
-@Test
-public void testInsertsWithHoodieConcatHandle() throws Exception {
-testHoodieConcatHandle(getConfig(), false);
+@ParameterizedTest
+@MethodSource("populateMetaFieldsParams")
+public void testInsertsWithHoodieConcatHandle(boolean populateMetaFields) throws Exception {
+HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder();
+addAppropriatePropsForPopulateMetaFields(cfgBuilder, populateMetaFields);
+testHoodieConcatHandle(cfgBuilder.build(), false);
 }

 /**
 * Test InsertPrepped API for HoodieConcatHandle.
 */
-@Test
-public void testInsertsPreppedWithHoodieConcatHandle() throws Exception {
-testHoodieConcatHandle(getConfig(), true);
+@ParameterizedTest
+@MethodSource("populateMetaFieldsParams")
+public void testInsertsPreppedWithHoodieConcatHandle(boolean populateMetaFields) throws Exception {
+HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder();
+addAppropriatePropsForPopulateMetaFields(cfgBuilder, populateMetaFields);
+testHoodieConcatHandle(cfgBuilder.build(), true);
 }

 /**
@@ -507,7 +565,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
 String initCommitTime = "000";
 int numRecords = 200;
 insertFirstBatch(hoodieWriteConfig, client, newCommitTime, initCommitTime, numRecords, SparkRDDWriteClient::insert,
-isPrepped, true, numRecords);
+isPrepped, true, numRecords, config.populateMetaFields());

 // Write 2 (updates)
 String prevCommitTime = newCommitTime;
@@ -520,15 +578,18 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {

 writeBatch(client, newCommitTime, prevCommitTime, Option.of(Arrays.asList(commitTimeBetweenPrevAndNew)), initCommitTime,
 numRecords, recordGenFunction, SparkRDDWriteClient::insert, true, numRecords, 300,
-2, false);
+2, false, config.populateMetaFields());
 }

 /**
 * Tests deletion of records.
 */
-@Test
-public void testDeletes() throws Exception {
-SparkRDDWriteClient client = getHoodieWriteClient(getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY).build());
+@ParameterizedTest
+@MethodSource("populateMetaFieldsParams")
+public void testDeletes(boolean populateMetaFields) throws Exception {
+HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY);
+addAppropriatePropsForPopulateMetaFields(cfgBuilder, populateMetaFields);
+SparkRDDWriteClient client = getHoodieWriteClient(cfgBuilder.build());
 /**
 * Write 1 (inserts and deletes) Write actual 200 insert records and ignore 100 delete records
 */
@@ -547,7 +608,8 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
 };
 writeBatch(client, newCommitTime, initCommitTime, Option.empty(), initCommitTime,
 // unused as genFn uses hard-coded number of inserts/updates/deletes
--1, recordGenFunction, SparkRDDWriteClient::upsert, true, 200, 200, 1, false);
+-1, recordGenFunction, SparkRDDWriteClient::upsert, true, 200, 200, 1, false,
+populateMetaFields);

 /**
 * Write 2 (deletes+writes).
@@ -564,7 +626,8 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
 return recordsInSecondBatch;
 };
 writeBatch(client, newCommitTime, prevCommitTime, Option.empty(), initCommitTime, 100, recordGenFunction,
-SparkRDDWriteClient::upsert, true, 50, 150, 2, false);
+SparkRDDWriteClient::upsert, true, 50, 150, 2, false,
+populateMetaFields);
 }

 /**
@@ -572,9 +635,12 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
 * not be available in read path.
 * @throws Exception
 */
-@Test
-public void testDeletesForInsertsInSameBatch() throws Exception {
-SparkRDDWriteClient client = getHoodieWriteClient(getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY).build());
+@ParameterizedTest
+@MethodSource("populateMetaFieldsParams")
+public void testDeletesForInsertsInSameBatch(boolean populateMetaFields) throws Exception {
+HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY);
+addAppropriatePropsForPopulateMetaFields(cfgBuilder, populateMetaFields);
+SparkRDDWriteClient client = getHoodieWriteClient(cfgBuilder.build());
 /**
 * Write 200 inserts and issue deletes to a subset(50) of inserts.
 */
@@ -593,7 +659,8 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
 };

 writeBatch(client, newCommitTime, initCommitTime, Option.empty(), initCommitTime,
--1, recordGenFunction, SparkRDDWriteClient::upsert, true, 150, 150, 1, false);
+-1, recordGenFunction, SparkRDDWriteClient::upsert, true, 150, 150, 1, false,
+populateMetaFields);
 }

 /**
@@ -793,7 +860,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
 Properties props = new Properties();
 props.setProperty(ASYNC_CLUSTERING_ENABLE_OPT_KEY.key(), "true");
 HoodieWriteConfig config = getSmallInsertWriteConfig(100,
-TRIP_EXAMPLE_SCHEMA, dataGen.getEstimatedFileSizeInBytes(150), props);
+TRIP_EXAMPLE_SCHEMA, dataGen.getEstimatedFileSizeInBytes(150), true, props);
 SparkRDDWriteClient client = getHoodieWriteClient(config);
 HoodieSparkCopyOnWriteTable table = (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, metaClient);

@@ -847,7 +914,10 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
 final String testPartitionPath = "2016/09/26";
 final int insertSplitLimit = 100;
 // setup the small file handling params
-HoodieWriteConfig config = getSmallInsertWriteConfig(insertSplitLimit); // hold upto 200 records max
+// hold upto 200 records max
+HoodieWriteConfig config = getSmallInsertWriteConfig(insertSplitLimit,
+TRIP_EXAMPLE_SCHEMA, dataGen.getEstimatedFileSizeInBytes(150));

 dataGen = new HoodieTestDataGenerator(new String[] {testPartitionPath});
 SparkRDDWriteClient client = getHoodieWriteClient(config);
 BaseFileUtils fileUtils = BaseFileUtils.getInstance(metaClient);
@@ -954,11 +1024,12 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
 * Test scenario of new file-group getting added during insert().
 */
 @ParameterizedTest
-@MethodSource("configParams")
+@MethodSource("smallInsertHandlingParams")
 public void testSmallInsertHandlingForInserts(boolean mergeAllowDuplicateInserts) throws Exception {
 final String testPartitionPath = "2016/09/26";
 final int insertSplitLimit = 100;
 // setup the small file handling params

 HoodieWriteConfig config = getSmallInsertWriteConfig(insertSplitLimit, false, mergeAllowDuplicateInserts); // hold upto 200 records max
 dataGen = new HoodieTestDataGenerator(new String[] {testPartitionPath});
 SparkRDDWriteClient client = getHoodieWriteClient(config);
@@ -1039,7 +1110,9 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
 final String testPartitionPath = "2016/09/26";
 final int insertSplitLimit = 100;
 // setup the small file handling params
-HoodieWriteConfig config = getSmallInsertWriteConfig(insertSplitLimit); // hold upto 200 records max
+// hold upto 200 records max
+HoodieWriteConfig config = getSmallInsertWriteConfig(insertSplitLimit,
+TRIP_EXAMPLE_SCHEMA, dataGen.getEstimatedFileSizeInBytes(150));
 dataGen = new HoodieTestDataGenerator(new String[] {testPartitionPath});

 SparkRDDWriteClient client = getHoodieWriteClient(config);
@@ -1100,31 +1173,34 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
 testDeletes(client, updateBatch3.getRight(), 10, file1, "007", 140, keysSoFar);
 }

-@Test
-public void testSimpleClustering() throws Exception {
+@ParameterizedTest
+@MethodSource("populateMetaFieldsParams")
+public void testSimpleClustering(boolean populateMetaFields) throws Exception {
 // setup clustering config
 HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder().withClusteringMaxNumGroups(10)
 .withClusteringTargetPartitions(0).withInlineClusteringNumCommits(1).build();
-testClustering(clusteringConfig);
+testClustering(clusteringConfig, populateMetaFields);
 }

-@Test
-public void testClusteringWithSortColumns() throws Exception {
+@ParameterizedTest
+@MethodSource("populateMetaFieldsParams")
+public void testClusteringWithSortColumns(boolean populateMetaFields) throws Exception {
 // setup clustering config
 HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder().withClusteringMaxNumGroups(10)
-.withClusteringSortColumns("_hoodie_record_key")
+.withClusteringSortColumns(populateMetaFields ? "_hoodie_record_key" : "_row_key")
 .withClusteringTargetPartitions(0).withInlineClusteringNumCommits(1).build();
-testClustering(clusteringConfig);
+testClustering(clusteringConfig, populateMetaFields);
 }

-@Test
-public void testPendingClusteringRollback() throws Exception {
+@ParameterizedTest
+@MethodSource("populateMetaFieldsParams")
+public void testPendingClusteringRollback(boolean populateMetaFields) throws Exception {
 // setup clustering config
 HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder().withClusteringMaxNumGroups(10)
 .withClusteringTargetPartitions(0).withInlineClusteringNumCommits(1).build();

-// start clustering, but dont commit
-List<HoodieRecord> allRecords = testClustering(clusteringConfig, false);
+// start clustering, but don't commit
+List<HoodieRecord> allRecords = testClustering(clusteringConfig, populateMetaFields);
 HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build();
 List<Pair<HoodieInstant, HoodieClusteringPlan>> pendingClusteringPlans =
 ClusteringUtils.getAllPendingClusteringPlans(metaClient).collect(Collectors.toList());
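Reviewer note (not part of the patch): the sort-column switch in the clustering test above reflects that `_hoodie_record_key` only exists when meta fields are populated; with virtual keys the test sorts on the raw key column of the test schema (`_row_key`) instead. Condensed into a small helper for illustration (the class name is hypothetical):

```java
import org.apache.hudi.config.HoodieClusteringConfig;

final class ClusteringSortColumnChoice {
  // Sort on the materialized meta column when present, otherwise on the data
  // column that the key generator derives the record key from.
  static HoodieClusteringConfig build(boolean populateMetaFields) {
    return HoodieClusteringConfig.newBuilder()
        .withClusteringMaxNumGroups(10)
        .withClusteringSortColumns(populateMetaFields ? "_hoodie_record_key" : "_row_key")
        .withClusteringTargetPartitions(0)
        .withInlineClusteringNumCommits(1)
        .build();
  }
}
```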
@@ -1132,7 +1208,9 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
|||||||
HoodieInstant pendingClusteringInstant = pendingClusteringPlans.get(0).getLeft();
|
HoodieInstant pendingClusteringInstant = pendingClusteringPlans.get(0).getLeft();
|
||||||
|
|
||||||
// complete another commit after pending clustering
|
// complete another commit after pending clustering
|
||||||
HoodieWriteConfig config = getConfigBuilder(HoodieFailedWritesCleaningPolicy.EAGER).build();
|
HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(HoodieFailedWritesCleaningPolicy.EAGER);
|
||||||
|
addAppropriatePropsForPopulateMetaFields(cfgBuilder, populateMetaFields);
|
||||||
|
HoodieWriteConfig config = cfgBuilder.build();
|
||||||
SparkRDDWriteClient client = getHoodieWriteClient(config);
|
SparkRDDWriteClient client = getHoodieWriteClient(config);
|
||||||
dataGen = new HoodieTestDataGenerator();
|
dataGen = new HoodieTestDataGenerator();
|
||||||
String commitTime = HoodieActiveTimeline.createNewInstantTime();
|
String commitTime = HoodieActiveTimeline.createNewInstantTime();
|
||||||
@@ -1146,13 +1224,14 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
|||||||
assertEquals(0, ClusteringUtils.getAllPendingClusteringPlans(metaClient).count());
|
assertEquals(0, ClusteringUtils.getAllPendingClusteringPlans(metaClient).count());
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<HoodieRecord> testClustering(HoodieClusteringConfig clusteringConfig) throws Exception {
|
private List<HoodieRecord> testClustering(HoodieClusteringConfig clusteringConfig, boolean populateMetaFields) throws Exception {
|
||||||
return testClustering(clusteringConfig, false);
|
return testClustering(clusteringConfig, false, populateMetaFields);
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<HoodieRecord> testClustering(HoodieClusteringConfig clusteringConfig, boolean completeClustering) throws Exception {
|
private List<HoodieRecord> testClustering(HoodieClusteringConfig clusteringConfig, boolean completeClustering, boolean populateMetaFields) throws Exception {
|
||||||
// create config to not update small files.
|
// create config to not update small files.
|
||||||
HoodieWriteConfig config = getSmallInsertWriteConfig(2000, false, 10);
|
HoodieWriteConfig config = getSmallInsertWriteConfig(2000, TRIP_EXAMPLE_SCHEMA, 10, false, populateMetaFields,
|
||||||
|
populateMetaFields ? new Properties() : getPropertiesForKeyGen());
|
||||||
SparkRDDWriteClient client = getHoodieWriteClient(config);
|
SparkRDDWriteClient client = getHoodieWriteClient(config);
|
||||||
dataGen = new HoodieTestDataGenerator();
|
dataGen = new HoodieTestDataGenerator();
|
||||||
String commitTime = HoodieActiveTimeline.createNewInstantTime();
|
String commitTime = HoodieActiveTimeline.createNewInstantTime();
|
||||||
@@ -1170,14 +1249,15 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
|||||||
assertEquals(0, fileIdIntersection.size());
|
assertEquals(0, fileIdIntersection.size());
|
||||||
|
|
||||||
config = getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY).withAutoCommit(completeClustering)
|
config = getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY).withAutoCommit(completeClustering)
|
||||||
.withClusteringConfig(clusteringConfig).build();
|
.withClusteringConfig(clusteringConfig)
|
||||||
|
.withProps(populateMetaFields ? new Properties() : getPropertiesForKeyGen()).build();
|
||||||
|
|
||||||
// create client with new config.
|
// create client with new config.
|
||||||
client = getHoodieWriteClient(config);
|
client = getHoodieWriteClient(config);
|
||||||
String clusteringCommitTime = client.scheduleClustering(Option.empty()).get().toString();
|
String clusteringCommitTime = client.scheduleClustering(Option.empty()).get().toString();
|
||||||
HoodieWriteMetadata<JavaRDD<WriteStatus>> clusterMetadata = client.cluster(clusteringCommitTime, completeClustering);
|
HoodieWriteMetadata<JavaRDD<WriteStatus>> clusterMetadata = client.cluster(clusteringCommitTime, completeClustering);
|
||||||
List<HoodieRecord> allRecords = Stream.concat(records1.stream(), records2.stream()).collect(Collectors.toList());
|
List<HoodieRecord> allRecords = Stream.concat(records1.stream(), records2.stream()).collect(Collectors.toList());
|
||||||
verifyRecordsWritten(clusteringCommitTime, allRecords, clusterMetadata.getWriteStatuses().collect());
|
verifyRecordsWritten(clusteringCommitTime, allRecords, clusterMetadata.getWriteStatuses().collect(), config);
|
||||||
Set<HoodieFileGroupId> insertedFileIds = new HashSet<>();
|
Set<HoodieFileGroupId> insertedFileIds = new HashSet<>();
|
||||||
insertedFileIds.addAll(fileIds1);
|
insertedFileIds.addAll(fileIds1);
|
||||||
insertedFileIds.addAll(fileIds2);
|
insertedFileIds.addAll(fileIds2);
|
||||||
@@ -1197,25 +1277,28 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
|||||||
/**
|
/**
|
||||||
* Test scenario of writing more file groups than existing number of file groups in partition.
|
* Test scenario of writing more file groups than existing number of file groups in partition.
|
||||||
*/
|
*/
|
||||||
@Test
|
@ParameterizedTest
|
||||||
public void testInsertOverwritePartitionHandlingWithMoreRecords() throws Exception {
|
@MethodSource("populateMetaFieldsParams")
|
||||||
verifyInsertOverwritePartitionHandling(1000, 3000);
|
public void testInsertOverwritePartitionHandlingWithMoreRecords(boolean populateMetaFields) throws Exception {
|
||||||
|
verifyInsertOverwritePartitionHandling(1000, 3000, populateMetaFields);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test scenario of writing fewer file groups than existing number of file groups in partition.
|
* Test scenario of writing fewer file groups than existing number of file groups in partition.
|
||||||
*/
|
*/
|
||||||
@Test
|
@ParameterizedTest
|
||||||
public void testInsertOverwritePartitionHandlingWithFewerRecords() throws Exception {
|
@MethodSource("populateMetaFieldsParams")
|
||||||
verifyInsertOverwritePartitionHandling(3000, 1000);
|
public void testInsertOverwritePartitionHandlingWithFewerRecords(boolean populateMetaFields) throws Exception {
|
||||||
|
verifyInsertOverwritePartitionHandling(3000, 1000, populateMetaFields);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test scenario of writing similar number file groups in partition.
|
* Test scenario of writing similar number file groups in partition.
|
||||||
*/
|
*/
|
||||||
@Test
|
@ParameterizedTest
|
||||||
public void testInsertOverwritePartitionHandlingWithSimilarNumberOfRecords() throws Exception {
|
@MethodSource("populateMetaFieldsParams")
|
||||||
verifyInsertOverwritePartitionHandling(3000, 3000);
|
public void testInsertOverwritePartitionHandlingWithSimilarNumberOfRecords(boolean populateMetaFields) throws Exception {
|
||||||
|
verifyInsertOverwritePartitionHandling(3000, 3000, populateMetaFields);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -1224,9 +1307,11 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
|||||||
*
|
*
|
||||||
* Verify that all records in step1 are overwritten
|
* Verify that all records in step1 are overwritten
|
||||||
*/
|
*/
|
||||||
private void verifyInsertOverwritePartitionHandling(int batch1RecordsCount, int batch2RecordsCount) throws Exception {
|
private void verifyInsertOverwritePartitionHandling(int batch1RecordsCount, int batch2RecordsCount, boolean populateMetaFields) throws Exception {
|
||||||
final String testPartitionPath = "americas";
|
final String testPartitionPath = "americas";
|
||||||
HoodieWriteConfig config = getSmallInsertWriteConfig(2000, false);
|
HoodieWriteConfig config = getSmallInsertWriteConfig(2000,
|
||||||
|
TRIP_EXAMPLE_SCHEMA, dataGen.getEstimatedFileSizeInBytes(150), populateMetaFields, populateMetaFields
|
||||||
|
? new Properties() : getPropertiesForKeyGen());
|
||||||
SparkRDDWriteClient client = getHoodieWriteClient(config);
|
SparkRDDWriteClient client = getHoodieWriteClient(config);
|
||||||
dataGen = new HoodieTestDataGenerator(new String[] {testPartitionPath});
|
dataGen = new HoodieTestDataGenerator(new String[] {testPartitionPath});
|
||||||
|
|
||||||
@@ -1247,7 +1332,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
|||||||
assertNoWriteErrors(statuses);
|
assertNoWriteErrors(statuses);
|
||||||
|
|
||||||
assertEquals(batch1Buckets, new HashSet<>(writeResult.getPartitionToReplaceFileIds().get(testPartitionPath)));
|
assertEquals(batch1Buckets, new HashSet<>(writeResult.getPartitionToReplaceFileIds().get(testPartitionPath)));
|
||||||
verifyRecordsWritten(commitTime2, inserts2, statuses);
|
verifyRecordsWritten(commitTime2, inserts2, statuses, config);
|
||||||
}
|
}
|
||||||
|
|
||||||
private Set<String> getFileIdsFromWriteStatus(List<WriteStatus> statuses) {
|
private Set<String> getFileIdsFromWriteStatus(List<WriteStatus> statuses) {
|
||||||
@@ -1257,35 +1342,38 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
   /**
    * Test scenario of writing fewer file groups for first partition than second an third partition.
    */
-  @Test
-  public void verifyDeletePartitionsHandlingWithFewerRecordsFirstPartition() throws Exception {
-    verifyDeletePartitionsHandling(1000, 3000, 3000);
+  @ParameterizedTest
+  @MethodSource("populateMetaFieldsParams")
+  public void verifyDeletePartitionsHandlingWithFewerRecordsFirstPartition(boolean populateMetaFields) throws Exception {
+    verifyDeletePartitionsHandling(1000, 3000, 3000, populateMetaFields);
   }

   /**
    * Test scenario of writing similar number file groups in partition.
    */
-  @Test
-  public void verifyDeletePartitionsHandlingWithSimilarNumberOfRecords() throws Exception {
-    verifyDeletePartitionsHandling(3000, 3000, 3000);
+  @ParameterizedTest
+  @MethodSource("populateMetaFieldsParams")
+  public void verifyDeletePartitionsHandlingWithSimilarNumberOfRecords(boolean populateMetaFields) throws Exception {
+    verifyDeletePartitionsHandling(3000, 3000, 3000, populateMetaFields);
   }

   /**
    * Test scenario of writing more file groups for first partition than second an third partition.
    */
-  @Test
-  public void verifyDeletePartitionsHandlingHandlingWithFewerRecordsSecondThirdPartition() throws Exception {
-    verifyDeletePartitionsHandling(3000, 1000, 1000);
+  @ParameterizedTest
+  @MethodSource("populateMetaFieldsParams")
+  public void verifyDeletePartitionsHandlingHandlingWithFewerRecordsSecondThirdPartition(boolean populateMetaFields) throws Exception {
+    verifyDeletePartitionsHandling(3000, 1000, 1000, populateMetaFields);
   }

-  private Set<String> insertPartitionRecordsWithCommit(SparkRDDWriteClient client, int recordsCount, String commitTime1, String partitionPath) {
+  private Set<String> insertPartitionRecordsWithCommit(SparkRDDWriteClient client, int recordsCount, String commitTime1, String partitionPath) throws IOException {
     client.startCommitWithTime(commitTime1);
     List<HoodieRecord> inserts1 = dataGen.generateInsertsForPartition(commitTime1, recordsCount, partitionPath);
     JavaRDD<HoodieRecord> insertRecordsRDD1 = jsc.parallelize(inserts1, 2);
     List<WriteStatus> statuses = client.upsert(insertRecordsRDD1, commitTime1).collect();
     assertNoWriteErrors(statuses);
     Set<String> batchBuckets = statuses.stream().map(s -> s.getFileId()).collect(Collectors.toSet());
-    verifyRecordsWritten(commitTime1, inserts1, statuses);
+    verifyRecordsWritten(commitTime1, inserts1, statuses, client.config);
     return batchBuckets;
   }

@@ -1306,8 +1394,11 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
    * 5) delete second and third partition and check result.
    *
    */
-  private void verifyDeletePartitionsHandling(int batch1RecordsCount, int batch2RecordsCount, int batch3RecordsCount) throws Exception {
-    HoodieWriteConfig config = getSmallInsertWriteConfig(2000, false);
+  private void verifyDeletePartitionsHandling(int batch1RecordsCount, int batch2RecordsCount, int batch3RecordsCount,
+                                              boolean populateMetaFields) throws Exception {
+    HoodieWriteConfig config = getSmallInsertWriteConfig(2000,
+        TRIP_EXAMPLE_SCHEMA, dataGen.getEstimatedFileSizeInBytes(150), populateMetaFields, populateMetaFields
+        ? new Properties() : getPropertiesForKeyGen());
     SparkRDDWriteClient client = getHoodieWriteClient(config);
     dataGen = new HoodieTestDataGenerator();

@@ -1360,7 +1451,8 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
   /**
    * Verify data in base files matches expected records and commit time.
    */
-  private void verifyRecordsWritten(String commitTime, List<HoodieRecord> expectedRecords, List<WriteStatus> allStatus) {
+  private void verifyRecordsWritten(String commitTime, List<HoodieRecord> expectedRecords, List<WriteStatus> allStatus,
+                                    HoodieWriteConfig config) throws IOException {
     List<GenericRecord> records = new ArrayList<>();
     for (WriteStatus status : allStatus) {
       Path filePath = new Path(basePath, status.getStat().getPath());
@@ -1369,20 +1461,29 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {

     Set<String> expectedKeys = recordsToRecordKeySet(expectedRecords);
     assertEquals(records.size(), expectedKeys.size());
+    if (config.populateMetaFields()) {
       for (GenericRecord record : records) {
         String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
         assertEquals(commitTime,
             record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString());
         assertTrue(expectedKeys.contains(recordKey));
       }
+    } else {
+      KeyGenerator keyGenerator = HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(config.getProps()));
+      for (GenericRecord record : records) {
+        String recordKey = keyGenerator.getKey(record).getRecordKey();
+        assertNull(record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD));
+        assertTrue(expectedKeys.contains(recordKey));
+      }
+    }
   }

-  private List<WriteStatus> writeAndVerifyBatch(SparkRDDWriteClient client, List<HoodieRecord> inserts, String commitTime) {
+  private List<WriteStatus> writeAndVerifyBatch(SparkRDDWriteClient client, List<HoodieRecord> inserts, String commitTime) throws IOException {
     client.startCommitWithTime(commitTime);
     JavaRDD<HoodieRecord> insertRecordsRDD1 = jsc.parallelize(inserts, 2);
     List<WriteStatus> statuses = client.upsert(insertRecordsRDD1, commitTime).collect();
     assertNoWriteErrors(statuses);
-    verifyRecordsWritten(commitTime, inserts, statuses);
+    verifyRecordsWritten(commitTime, inserts, statuses, client.config);
     return statuses;
   }

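Note on the verification above: with meta fields populated the test reads the key and commit time straight from the _hoodie_record_key and _hoodie_commit_time columns, while with virtual keys it must re-derive the key from the data columns. A minimal standalone restatement of that else branch, using records, expectedKeys and config exactly as in the method above:

    // Virtual-key verification pattern (restated from the else branch above):
    // meta columns are absent, so re-derive the record key via the configured KeyGenerator.
    KeyGenerator keyGenerator = HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(config.getProps()));
    for (GenericRecord record : records) {
      // _hoodie_commit_time is not written in this mode
      assertNull(record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD));
      String recordKey = keyGenerator.getKey(record).getRecordKey(); // derived, not stored
      assertTrue(expectedKeys.contains(recordKey));
    }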
@@ -1449,12 +1550,15 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
   /**
    * Test delete with delete api.
    */
-  @Test
-  public void testDeletesWithoutInserts() {
+  @ParameterizedTest
+  @MethodSource("populateMetaFieldsParams")
+  public void testDeletesWithoutInserts(boolean populateMetaFields) {
     final String testPartitionPath = "2016/09/26";
     final int insertSplitLimit = 100;
     // setup the small file handling params
-    HoodieWriteConfig config = getSmallInsertWriteConfig(insertSplitLimit, true); // hold upto 200 records max
+    HoodieWriteConfig config = getSmallInsertWriteConfig(insertSplitLimit,
+        TRIP_EXAMPLE_SCHEMA, dataGen.getEstimatedFileSizeInBytes(150), populateMetaFields, populateMetaFields
+        ? new Properties() : getPropertiesForKeyGen());
     dataGen = new HoodieTestDataGenerator(new String[] {testPartitionPath});
     SparkRDDWriteClient client = getHoodieWriteClient(config);

@@ -1473,13 +1577,15 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
   /**
    * Test to ensure commit metadata points to valid files.
    */
-  @Test
-  public void testCommitWritesRelativePaths() throws Exception {
+  @ParameterizedTest
+  @MethodSource("populateMetaFieldsParams")
+  public void testCommitWritesRelativePaths(boolean populateMetaFields) throws Exception {

-    HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).build();
-    try (SparkRDDWriteClient client = getHoodieWriteClient(cfg);) {
+    HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder().withAutoCommit(false);
+    addAppropriatePropsForPopulateMetaFields(cfgBuilder, populateMetaFields);
+    try (SparkRDDWriteClient client = getHoodieWriteClient(cfgBuilder.build());) {
       HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build();
-      HoodieSparkTable table = HoodieSparkTable.create(cfg, context, metaClient);
+      HoodieSparkTable table = HoodieSparkTable.create(cfgBuilder.build(), context, metaClient);

       String instantTime = "000";
       client.startCommitWithTime(instantTime);
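addAppropriatePropsForPopulateMetaFields(cfgBuilder, populateMetaFields) is called by several tests in this diff, but its body is not part of the hunks shown here. A plausible sketch, based only on how it is invoked and on the getPropertiesForKeyGen() / IndexType.SIMPLE pattern used elsewhere in this change (the real helper in the test base may differ):

    // Hypothetical sketch of the helper referenced above; not taken from the patch.
    private void addAppropriatePropsForPopulateMetaFields(HoodieWriteConfig.Builder builder, boolean populateMetaFields) {
      if (!populateMetaFields) {
        // virtual keys: supply key-generator props and fall back to an index that
        // does not rely on the _hoodie_record_key meta column
        builder.withProperties(getPropertiesForKeyGen())
            .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(IndexType.SIMPLE).build());
      }
    }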
@@ -1518,9 +1624,12 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
   /**
    * Test to ensure commit metadata points to valid files.10.
    */
-  @Test
-  public void testMetadataStatsOnCommit() throws Exception {
-    HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).build();
+  @ParameterizedTest
+  @MethodSource("populateMetaFieldsParams")
+  public void testMetadataStatsOnCommit(boolean populateMetaFields) throws Exception {
+    HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder().withAutoCommit(false);
+    addAppropriatePropsForPopulateMetaFields(cfgBuilder, populateMetaFields);
+    HoodieWriteConfig cfg = cfgBuilder.build();
     SparkRDDWriteClient client = getHoodieWriteClient(cfg);

     String instantTime0 = "000";
@@ -1607,16 +1716,24 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
     }
   }

-  private void testRollbackAfterConsistencyCheckFailureUsingFileList(boolean rollbackUsingMarkers, boolean enableOptimisticConsistencyGuard) throws Exception {
+  private void testRollbackAfterConsistencyCheckFailureUsingFileList(boolean rollbackUsingMarkers, boolean enableOptimisticConsistencyGuard,
+                                                                     boolean populateMetaFields) throws Exception {
     String instantTime = "000";
     HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build();

+    Properties properties = new Properties();
+    if (!populateMetaFields) {
+      properties = getPropertiesForKeyGen();
+    }
+
     HoodieWriteConfig cfg = !enableOptimisticConsistencyGuard ? getConfigBuilder().withRollbackUsingMarkers(rollbackUsingMarkers).withAutoCommit(false)
         .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true)
         .withMaxConsistencyCheckIntervalMs(1).withInitialConsistencyCheckIntervalMs(1).withEnableOptimisticConsistencyGuard(enableOptimisticConsistencyGuard).build()).build() :
         getConfigBuilder().withRollbackUsingMarkers(rollbackUsingMarkers).withAutoCommit(false)
         .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder()
         .withConsistencyCheckEnabled(true)
-        .withOptimisticConsistencyGuardSleepTimeMs(1).build()).build();
+        .withOptimisticConsistencyGuardSleepTimeMs(1).build())
+        .withProperties(properties).build();
     SparkRDDWriteClient client = getHoodieWriteClient(cfg);
     testConsistencyCheck(metaClient, instantTime, enableOptimisticConsistencyGuard);

@@ -1651,28 +1768,28 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
   }

   @ParameterizedTest
-  @ValueSource(booleans = {true, false})
-  public void testRollbackAfterConsistencyCheckFailureUsingFileList(boolean enableOptimisticConsistencyGuard) throws Exception {
-    testRollbackAfterConsistencyCheckFailureUsingFileList(false, enableOptimisticConsistencyGuard);
+  @MethodSource("rollbackAfterConsistencyCheckFailureParams")
+  public void testRollbackAfterConsistencyCheckFailureUsingFileList(boolean enableOptimisticConsistencyGuard, boolean populateMetCols) throws Exception {
+    testRollbackAfterConsistencyCheckFailureUsingFileList(false, enableOptimisticConsistencyGuard, populateMetCols);
   }

   @ParameterizedTest
-  @ValueSource(booleans = {true, false})
-  public void testRollbackAfterConsistencyCheckFailureUsingMarkers(boolean enableOptimisticConsistencyGuard) throws Exception {
-    testRollbackAfterConsistencyCheckFailureUsingFileList(true, enableOptimisticConsistencyGuard);
+  @MethodSource("rollbackAfterConsistencyCheckFailureParams")
+  public void testRollbackAfterConsistencyCheckFailureUsingMarkers(boolean enableOptimisticConsistencyGuard, boolean populateMetCols) throws Exception {
+    testRollbackAfterConsistencyCheckFailureUsingFileList(true, enableOptimisticConsistencyGuard, populateMetCols);
   }

   @ParameterizedTest
-  @EnumSource(value = HoodieFailedWritesCleaningPolicy.class, names = {"LAZY", "NEVER"})
-  public void testRollbackFailedCommits(HoodieFailedWritesCleaningPolicy cleaningPolicy) throws Exception {
+  @MethodSource("rollbackFailedCommitsParams")
+  public void testRollbackFailedCommits(HoodieFailedWritesCleaningPolicy cleaningPolicy, boolean populateMetaFields) throws Exception {
     HoodieTestUtils.init(hadoopConf, basePath);
     // Perform 2 failed writes to table
-    SparkRDDWriteClient client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy));
+    SparkRDDWriteClient client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
     writeBatch(client, "100", "100", Option.of(Arrays.asList("100")), "100",
         100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300,
         0, false);
     client.close();
-    client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy));
+    client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
     writeBatch(client, "200", "100", Option.of(Arrays.asList("200")), "200",
         100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300,
         0, false);
@@ -1680,7 +1797,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
     // refresh data generator to delete records generated from failed commits
     dataGen = new HoodieTestDataGenerator();
     // Perform 1 successful write
-    client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy));
+    client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
     writeBatch(client, "300", "200", Option.of(Arrays.asList("300")), "300",
         100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300,
         0, true);
@@ -1696,7 +1813,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
       conditionMet = client.getHeartbeatClient().isHeartbeatExpired("200");
       Thread.sleep(2000);
     }
-    client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy));
+    client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
     // Perform 1 successful write
     writeBatch(client, "400", "300", Option.of(Arrays.asList("400")), "400",
         100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300,
@@ -1732,11 +1849,12 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
     }
   }

-  @Test
-  public void testRollbackFailedCommitsToggleCleaningPolicy() throws Exception {
+  @ParameterizedTest
+  @MethodSource("populateMetaFieldsParams")
+  public void testRollbackFailedCommitsToggleCleaningPolicy(boolean populateMetaFields) throws Exception {
     HoodieTestUtils.init(hadoopConf, basePath);
     HoodieFailedWritesCleaningPolicy cleaningPolicy = HoodieFailedWritesCleaningPolicy.EAGER;
-    SparkRDDWriteClient client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy));
+    SparkRDDWriteClient client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
     // Perform 1 failed writes to table
     writeBatch(client, "100", "100", Option.of(Arrays.asList("100")), "100",
         100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300,
@@ -1745,12 +1863,12 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
     // Toggle cleaning policy to LAZY
     cleaningPolicy = HoodieFailedWritesCleaningPolicy.LAZY;
     // Perform 2 failed writes to table
-    client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy));
+    client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
     writeBatch(client, "200", "100", Option.of(Arrays.asList("200")), "200",
         100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300,
         0, false);
     client.close();
-    client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy));
+    client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
     writeBatch(client, "300", "200", Option.of(Arrays.asList("300")), "300",
         100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300,
         0, false);
@@ -1766,19 +1884,19 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
     assertTrue(timeline.getTimelineOfActions(
         CollectionUtils.createSet(HoodieTimeline.ROLLBACK_ACTION)).countInstants() == 3);
     // Perform 2 failed commits
-    client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy));
+    client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
     writeBatch(client, "400", "300", Option.of(Arrays.asList("400")), "400",
         100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300,
         0, false);
     client.close();
-    client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy));
+    client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
     writeBatch(client, "500", "400", Option.of(Arrays.asList("500")), "500",
         100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300,
         0, false);
     client.close();
     // Toggle cleaning policy to EAGER
     cleaningPolicy = HoodieFailedWritesCleaningPolicy.EAGER;
-    client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy));
+    client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
     client.startCommit();
     timeline = metaClient.getActiveTimeline().reload();
     assertTrue(timeline.getTimelineOfActions(
@@ -1786,18 +1904,19 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
     assertTrue(timeline.getCommitsTimeline().filterCompletedInstants().countInstants() == 0);
   }

-  @Test
-  public void testParallelInsertAndCleanPreviousFailedCommits() throws Exception {
+  @ParameterizedTest
+  @MethodSource("populateMetaFieldsParams")
+  public void testParallelInsertAndCleanPreviousFailedCommits(boolean populateMetaFields) throws Exception {
     HoodieFailedWritesCleaningPolicy cleaningPolicy = HoodieFailedWritesCleaningPolicy.LAZY;
     ExecutorService service = Executors.newFixedThreadPool(2);
     HoodieTestUtils.init(hadoopConf, basePath);
     // Perform 2 failed writes to table
-    SparkRDDWriteClient client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy));
+    SparkRDDWriteClient client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
     writeBatch(client, "100", "100", Option.of(Arrays.asList("100")), "100",
         100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 100,
         0, false);
     client.close();
-    client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy));
+    client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
     writeBatch(client, "200", "200", Option.of(Arrays.asList("200")), "200",
         100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 100,
         0, false);
@@ -1805,7 +1924,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
     // refresh data generator to delete records generated from failed commits
     dataGen = new HoodieTestDataGenerator();
     // Create a succesful commit
-    Future<JavaRDD<WriteStatus>> commit3 = service.submit(() -> writeBatch(new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy)),
+    Future<JavaRDD<WriteStatus>> commit3 = service.submit(() -> writeBatch(new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields)),
         "300", "200", Option.of(Arrays.asList("300")), "200", 100, dataGen::generateInserts,
         SparkRDDWriteClient::bulkInsert, false, 100, 100, 0, true));
     commit3.get();
@@ -1815,17 +1934,17 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
         CollectionUtils.createSet(HoodieTimeline.ROLLBACK_ACTION)).countInstants() == 0);
     assertTrue(metaClient.getActiveTimeline().filterInflights().countInstants() == 2);
     assertTrue(metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().countInstants() == 1);
-    client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy));
+    client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
     // Await till enough time passes such that the first 2 failed commits heartbeats are expired
     boolean conditionMet = false;
     while (!conditionMet) {
       conditionMet = client.getHeartbeatClient().isHeartbeatExpired("200");
       Thread.sleep(2000);
     }
-    Future<JavaRDD<WriteStatus>> commit4 = service.submit(() -> writeBatch(new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy)),
+    Future<JavaRDD<WriteStatus>> commit4 = service.submit(() -> writeBatch(new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields)),
         "400", "300", Option.of(Arrays.asList("400")), "400", 100, dataGen::generateInserts,
         SparkRDDWriteClient::bulkInsert, false, 100, 100, 0, true));
-    Future<HoodieCleanMetadata> clean1 = service.submit(() -> new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy)).clean());
+    Future<HoodieCleanMetadata> clean1 = service.submit(() -> new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields)).clean());
     commit4.get();
     clean1.get();
     HoodieActiveTimeline timeline = metaClient.getActiveTimeline().reload();
@@ -1878,11 +1997,13 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
     return Pair.of(markerFilePath, result);
   }

-  @Test
-  public void testMultiOperationsPerCommit() throws IOException {
-    HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false)
-        .withAllowMultiWriteOnSameInstant(true)
-        .build();
+  @ParameterizedTest
+  @MethodSource("populateMetaFieldsParams")
+  public void testMultiOperationsPerCommit(boolean populateMetaFields) throws IOException {
+    HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder().withAutoCommit(false)
+        .withAllowMultiWriteOnSameInstant(true);
+    addAppropriatePropsForPopulateMetaFields(cfgBuilder, populateMetaFields);
+    HoodieWriteConfig cfg = cfgBuilder.build();
     SparkRDDWriteClient client = getHoodieWriteClient(cfg);
     String firstInstantTime = "0000";
     client.startCommitWithTime(firstInstantTime);
@@ -1957,16 +2078,19 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
   }

   private HoodieWriteConfig getSmallInsertWriteConfig(int insertSplitSize, String schemaStr, long smallFileSize, boolean mergeAllowDuplicateInserts) {
-    return getSmallInsertWriteConfig(insertSplitSize, schemaStr, smallFileSize, mergeAllowDuplicateInserts, new Properties());
+    return getSmallInsertWriteConfig(insertSplitSize, schemaStr, smallFileSize, mergeAllowDuplicateInserts, true, new Properties());
   }

-  private HoodieWriteConfig getSmallInsertWriteConfig(int insertSplitSize, String schemaStr, long smallFileSize, Properties props) {
-    return getSmallInsertWriteConfig(insertSplitSize, schemaStr, smallFileSize, false, props);
+  private HoodieWriteConfig getSmallInsertWriteConfig(int insertSplitSize, String schemaStr, long smallFileSize, boolean populateMetaFields, Properties props) {
+    return getSmallInsertWriteConfig(insertSplitSize, schemaStr, smallFileSize, false, populateMetaFields, props);
   }

   private HoodieWriteConfig getSmallInsertWriteConfig(int insertSplitSize, String schemaStr, long smallFileSize, boolean mergeAllowDuplicateInserts,
-                                                      Properties props) {
+                                                      boolean populateMetaFields, Properties props) {
     HoodieWriteConfig.Builder builder = getConfigBuilder(schemaStr);
+    if (!populateMetaFields) {
+      builder.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(IndexType.SIMPLE).build());
+    }
     return builder
         .withCompactionConfig(
             HoodieCompactionConfig.newBuilder()
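getPropertiesForKeyGen(), used throughout this diff whenever populateMetaFields is false, lives in the shared test harness and is not part of the hunks shown. A plausible shape, for orientation only; the exact config keys and field names the harness uses are an assumption and may differ:

    // Illustrative only; the real helper is defined in the test harness.
    private Properties getPropertiesForKeyGen() {
      Properties props = new Properties();
      props.put("hoodie.populate.meta.fields", "false");                        // assumed key: disable meta columns (virtual keys)
      props.put("hoodie.datasource.write.recordkey.field", "_row_key");         // assumed field forming the record key
      props.put("hoodie.datasource.write.partitionpath.field", "partition_path"); // assumed partition path field
      return props;
    }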
@@ -1994,7 +2118,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
     return clusteringInstant;
   }

-  private HoodieWriteConfig getParallelWritingWriteConfig(HoodieFailedWritesCleaningPolicy cleaningPolicy) {
+  private HoodieWriteConfig getParallelWritingWriteConfig(HoodieFailedWritesCleaningPolicy cleaningPolicy, boolean populateMetaFields) {
     return getConfigBuilder()
         .withEmbeddedTimelineServerEnabled(false)
         .withCompactionConfig(HoodieCompactionConfig.newBuilder()
@@ -2002,7 +2126,8 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
             .withAutoClean(false).build())
         .withTimelineLayoutVersion(1)
         .withHeartbeatIntervalInMs(3 * 1000)
-        .withAutoCommit(false).build();
+        .withAutoCommit(false)
+        .withProperties(populateMetaFields ? new Properties() : getPropertiesForKeyGen()).build();
   }

 }
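The rollbackFailedCommitsParams and rollbackAfterConsistencyCheckFailureParams providers referenced by @MethodSource above are defined outside the hunks shown here. They presumably pair the pre-existing parameters with the new populateMetaFields flag, roughly along these lines (values are an assumption, shown only to illustrate the shape):

    // Plausible shapes of the providers referenced above; the actual definitions may differ.
    private static Stream<Arguments> rollbackFailedCommitsParams() {
      return Stream.of(
          Arguments.of(HoodieFailedWritesCleaningPolicy.LAZY, true),
          Arguments.of(HoodieFailedWritesCleaningPolicy.LAZY, false),
          Arguments.of(HoodieFailedWritesCleaningPolicy.NEVER, true));
    }

    private static Stream<Arguments> rollbackAfterConsistencyCheckFailureParams() {
      return Stream.of(
          Arguments.of(true, true),
          Arguments.of(true, false),
          Arguments.of(false, true),
          Arguments.of(false, false));
    }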
@@ -26,6 +26,7 @@ import org.apache.hudi.common.table.timeline.HoodieTimeline;
 import org.apache.hudi.common.testutils.HoodieTestUtils;
 import org.apache.hudi.common.testutils.RawTripTestPayload;
 import org.apache.hudi.common.util.BaseFileUtils;
+import org.apache.hudi.common.util.Option;
 import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.hudi.exception.HoodieUpsertException;
 import org.apache.hudi.io.HoodieCreateHandle;
@@ -121,7 +122,7 @@ public class TestUpdateSchemaEvolution extends HoodieClientTestHarness {
     jsc.parallelize(Arrays.asList(1)).map(x -> {
       Executable executable = () -> {
         HoodieMergeHandle mergeHandle = new HoodieMergeHandle(updateTable.getConfig(), "101", updateTable,
-            updateRecords.iterator(), updateRecords.get(0).getPartitionPath(), insertResult.getFileId(), supplier);
+            updateRecords.iterator(), updateRecords.get(0).getPartitionPath(), insertResult.getFileId(), supplier, Option.empty());
         List<GenericRecord> oldRecords = BaseFileUtils.getInstance(updateTable.getBaseFileFormat())
             .readAvroRecords(updateTable.getHadoopConf(),
                 new Path(updateTable.getConfig().getBasePath() + "/" + insertResult.getStat().getPath()),
@@ -23,10 +23,12 @@ import org.apache.hudi.common.fs.ConsistencyGuardConfig;
 import org.apache.hudi.common.model.EmptyHoodieRecordPayload;
 import org.apache.hudi.common.model.HoodieKey;
 import org.apache.hudi.common.model.HoodieRecord;
+import org.apache.hudi.common.model.HoodieTableType;
 import org.apache.hudi.common.table.HoodieTableMetaClient;
 import org.apache.hudi.common.table.view.FileSystemViewStorageConfig;
 import org.apache.hudi.common.table.view.FileSystemViewStorageType;
 import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
+import org.apache.hudi.common.testutils.HoodieTestUtils;
 import org.apache.hudi.common.testutils.RawTripTestPayload;
 import org.apache.hudi.common.util.Option;
 import org.apache.hudi.common.util.collection.Pair;
@@ -47,8 +49,10 @@ import org.apache.hadoop.fs.Path;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.Test;
 import org.junit.jupiter.params.ParameterizedTest;
-import org.junit.jupiter.params.provider.EnumSource;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;

 import java.io.IOException;
 import java.util.Arrays;
@@ -56,8 +60,10 @@ import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Properties;
 import java.util.Random;
 import java.util.UUID;
+import java.util.stream.Stream;

 import scala.Tuple2;

@@ -69,16 +75,34 @@ import static org.junit.jupiter.api.Assertions.fail;

 public class TestHoodieIndex extends HoodieClientTestHarness {

+  private static Stream<Arguments> indexTypeParams() {
+    Object[][] data = new Object[][] {
+        {IndexType.BLOOM, true},
+        {IndexType.GLOBAL_BLOOM, true},
+        {IndexType.SIMPLE, true},
+        {IndexType.GLOBAL_SIMPLE, true},
+        {IndexType.SIMPLE, false},
+        {IndexType.GLOBAL_SIMPLE, false}
+    };
+    return Stream.of(data).map(Arguments::of);
+  }
+
   private static final Schema SCHEMA = getSchemaFromResource(TestHoodieIndex.class, "/exampleSchema.avsc", true);
   private final Random random = new Random();
   private IndexType indexType;
   private HoodieIndex index;
   private HoodieWriteConfig config;

-  private void setUp(IndexType indexType) throws Exception {
+  private void setUp(IndexType indexType, boolean populateMetaFields) throws Exception {
     this.indexType = indexType;
-    initResources();
+    initPath();
+    initSparkContexts();
+    initTestDataGenerator();
+    initFileSystem();
+    metaClient = HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.COPY_ON_WRITE, populateMetaFields ? new Properties()
+        : getPropertiesForKeyGen());
     config = getConfigBuilder()
+        .withProperties(populateMetaFields ? new Properties() : getPropertiesForKeyGen())
         .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(indexType)
             .build()).withAutoCommit(false).build();
     writeClient = getHoodieWriteClient(config);
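The indexTypeParams() provider added above replaces the earlier @EnumSource: each IndexType is now paired with a populateMetaFields flag, and only SIMPLE and GLOBAL_SIMPLE are exercised with meta fields disabled. An equivalent, more explicit way to write the same provider, shown only as a readability comparison:

    // Equivalent provider using Arguments.of(...) directly; behaviour is identical.
    private static Stream<Arguments> indexTypeParams() {
      return Stream.of(
          Arguments.of(IndexType.BLOOM, true),
          Arguments.of(IndexType.GLOBAL_BLOOM, true),
          Arguments.of(IndexType.SIMPLE, true),
          Arguments.of(IndexType.GLOBAL_SIMPLE, true),
          Arguments.of(IndexType.SIMPLE, false),
          Arguments.of(IndexType.GLOBAL_SIMPLE, false));
    }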
@@ -91,9 +115,9 @@ public class TestHoodieIndex extends HoodieClientTestHarness {
   }

   @ParameterizedTest
-  @EnumSource(value = IndexType.class, names = {"BLOOM", "GLOBAL_BLOOM", "SIMPLE", "GLOBAL_SIMPLE"})
-  public void testSimpleTagLocationAndUpdate(IndexType indexType) throws Exception {
-    setUp(indexType);
+  @MethodSource("indexTypeParams")
+  public void testSimpleTagLocationAndUpdate(IndexType indexType, boolean populateMetaFields) throws Exception {
+    setUp(indexType, populateMetaFields);
     String newCommitTime = "001";
     int totalRecords = 10 + random.nextInt(20);
     List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, totalRecords);
@@ -141,9 +165,9 @@ public class TestHoodieIndex extends HoodieClientTestHarness {
   }

   @ParameterizedTest
-  @EnumSource(value = IndexType.class, names = {"BLOOM", "GLOBAL_BLOOM", "SIMPLE", "GLOBAL_SIMPLE"})
-  public void testTagLocationAndDuplicateUpdate(IndexType indexType) throws Exception {
-    setUp(indexType);
+  @MethodSource("indexTypeParams")
+  public void testTagLocationAndDuplicateUpdate(IndexType indexType, boolean populateMetaFields) throws Exception {
+    setUp(indexType, populateMetaFields);
     String newCommitTime = "001";
     int totalRecords = 10 + random.nextInt(20);
     List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, totalRecords);
@@ -191,9 +215,9 @@ public class TestHoodieIndex extends HoodieClientTestHarness {
   }

   @ParameterizedTest
-  @EnumSource(value = IndexType.class, names = {"BLOOM", "GLOBAL_BLOOM", "SIMPLE", "GLOBAL_SIMPLE"})
-  public void testSimpleTagLocationAndUpdateWithRollback(IndexType indexType) throws Exception {
-    setUp(indexType);
+  @MethodSource("indexTypeParams")
+  public void testSimpleTagLocationAndUpdateWithRollback(IndexType indexType, boolean populateMetaFields) throws Exception {
+    setUp(indexType, populateMetaFields);
     String newCommitTime = writeClient.startCommit();
     int totalRecords = 20 + random.nextInt(20);
     List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, totalRecords);
@@ -242,10 +266,18 @@ public class TestHoodieIndex extends HoodieClientTestHarness {
     assert (javaRDD.filter(record -> record.getCurrentLocation() != null).collect().size() == 0);
   }

+  private static Stream<Arguments> regularIndexTypeParams() {
+    Object[][] data = new Object[][] {
+        {IndexType.BLOOM, true},
+        {IndexType.SIMPLE, true}
+    };
+    return Stream.of(data).map(Arguments::of);
+  }
+
   @ParameterizedTest
-  @EnumSource(value = IndexType.class, names = {"BLOOM", "SIMPLE",})
-  public void testTagLocationAndFetchRecordLocations(IndexType indexType) throws Exception {
-    setUp(indexType);
+  @MethodSource("regularIndexTypeParams")
+  public void testTagLocationAndFetchRecordLocations(IndexType indexType, boolean populateMetaFields) throws Exception {
+    setUp(indexType, populateMetaFields);
     String p1 = "2016/01/31";
     String p2 = "2015/01/31";
     String rowKey1 = UUID.randomUUID().toString();
@@ -325,10 +357,9 @@ public class TestHoodieIndex extends HoodieClientTestHarness {
     }
   }

-  @ParameterizedTest
-  @EnumSource(value = IndexType.class, names = {"GLOBAL_SIMPLE"})
-  public void testSimpleGlobalIndexTagLocationWhenShouldUpdatePartitionPath(IndexType indexType) throws Exception {
-    setUp(indexType);
+  @Test
+  public void testSimpleGlobalIndexTagLocationWhenShouldUpdatePartitionPath() throws Exception {
+    setUp(IndexType.GLOBAL_SIMPLE, true);
     config = getConfigBuilder()
         .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(indexType)
             .withGlobalSimpleIndexUpdatePartitionPath(true)
@@ -18,20 +18,26 @@

 package org.apache.hudi.io;

+import org.apache.hudi.common.config.TypedProperties;
 import org.apache.hudi.common.engine.HoodieEngineContext;
 import org.apache.hudi.common.fs.ConsistencyGuardConfig;
 import org.apache.hudi.common.model.HoodieBaseFile;
 import org.apache.hudi.common.model.HoodieKey;
 import org.apache.hudi.common.model.HoodieRecord;
 import org.apache.hudi.common.model.HoodieRecordLocation;
+import org.apache.hudi.common.model.HoodieTableType;
 import org.apache.hudi.common.table.view.FileSystemViewStorageConfig;
 import org.apache.hudi.common.table.view.FileSystemViewStorageType;
+import org.apache.hudi.common.testutils.HoodieTestUtils;
+import org.apache.hudi.common.util.Option;
 import org.apache.hudi.common.util.collection.Pair;
 import org.apache.hudi.config.HoodieCompactionConfig;
 import org.apache.hudi.config.HoodieIndexConfig;
 import org.apache.hudi.config.HoodieStorageConfig;
 import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.hudi.index.HoodieIndexUtils;
+import org.apache.hudi.keygen.BaseKeyGenerator;
+import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory;
 import org.apache.hudi.table.HoodieSparkTable;
 import org.apache.hudi.table.HoodieTable;
 import org.apache.hudi.testutils.HoodieClientTestHarness;
@@ -40,7 +46,8 @@ import org.apache.hudi.testutils.MetadataMergeWriteStatus;

 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.ValueSource;

 import java.io.IOException;
 import java.util.ArrayList;
@@ -48,6 +55,7 @@ import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.Properties;

 import scala.Tuple2;

@@ -71,10 +79,6 @@ public class TestHoodieKeyLocationFetchHandle extends HoodieClientTestHarness {
     initPath();
     initTestDataGenerator();
     initFileSystem();
-    initMetaClient();
-    config = getConfigBuilder()
-        .withIndexConfig(HoodieIndexConfig.newBuilder()
-            .build()).build();
   }

   @AfterEach
@@ -82,8 +86,15 @@ public class TestHoodieKeyLocationFetchHandle extends HoodieClientTestHarness {
     cleanupResources();
   }

-  @Test
-  public void testFetchHandle() throws Exception {
+  @ParameterizedTest
+  @ValueSource(booleans = {true, false})
+  public void testFetchHandle(boolean populateMetaFields) throws Exception {
+    metaClient = HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.COPY_ON_WRITE, populateMetaFields ? new Properties() : getPropertiesForKeyGen());
+    config = getConfigBuilder()
+        .withProperties(getPropertiesForKeyGen())
+        .withIndexConfig(HoodieIndexConfig.newBuilder()
+            .build()).build();
     List<HoodieRecord> records = dataGen.generateInserts(makeNewCommitTime(), 100);
     Map<String, List<HoodieRecord>> partitionRecordsMap = recordsToPartitionRecordsMap(records);
     HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
@@ -93,8 +104,11 @@ public class TestHoodieKeyLocationFetchHandle extends HoodieClientTestHarness {

     List<Tuple2<String, HoodieBaseFile>> partitionPathFileIdPairs = loadAllFilesForPartitions(new ArrayList<>(partitionRecordsMap.keySet()), context, hoodieTable);

+    BaseKeyGenerator keyGenerator = (BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(getPropertiesForKeyGen()));
+
     for (Tuple2<String, HoodieBaseFile> entry : partitionPathFileIdPairs) {
-      HoodieKeyLocationFetchHandle fetcherHandle = new HoodieKeyLocationFetchHandle(config, hoodieTable, Pair.of(entry._1, entry._2));
+      HoodieKeyLocationFetchHandle fetcherHandle = new HoodieKeyLocationFetchHandle(config, hoodieTable, Pair.of(entry._1, entry._2),
+          populateMetaFields ? Option.empty() : Option.of(keyGenerator));
       Iterator<Pair<HoodieKey, HoodieRecordLocation>> result = fetcherHandle.locations().iterator();
       List<Tuple2<HoodieKey, HoodieRecordLocation>> actualList = new ArrayList<>();
       result.forEachRemaining(x -> actualList.add(new Tuple2<>(x.getLeft(), x.getRight())));
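The new trailing argument is an Option<BaseKeyGenerator>: empty when meta fields are populated, present when the table runs with virtual keys. A rough sketch of the key-resolution split such an Option implies; this is illustrative only, the actual HoodieKeyLocationFetchHandle implementation is not part of this test diff, and keyGeneratorOpt / record are placeholder names:

    // Illustrative only: how a caller-supplied Option<BaseKeyGenerator> splits key resolution.
    String recordKey = keyGeneratorOpt.isPresent()
        ? keyGeneratorOpt.get().getRecordKey(record)                      // virtual keys: derive from data columns
        : record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();  // meta fields: read _hoodie_record_key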
@@ -43,6 +43,7 @@ import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
 import org.apache.hudi.common.table.view.TableFileSystemView;
 import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
 import org.apache.hudi.common.testutils.HoodieTestTable;
+import org.apache.hudi.common.testutils.HoodieTestUtils;
 import org.apache.hudi.common.util.HoodieTimer;
 import org.apache.hudi.common.util.Option;
 import org.apache.hudi.config.HoodieCompactionConfig;
@@ -60,10 +61,10 @@ import org.apache.log4j.Logger;
 import org.apache.spark.api.java.JavaRDD;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.Disabled;
-import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.io.TempDir;
 import org.junit.jupiter.params.ParameterizedTest;
-import org.junit.jupiter.params.provider.EnumSource;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;

 import java.io.IOException;
 import java.nio.file.Files;
@@ -71,7 +72,9 @@ import java.nio.file.Paths;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
+import java.util.Properties;
 import java.util.stream.Collectors;
+import java.util.stream.Stream;

 import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA;
 import static org.junit.jupiter.api.Assertions.assertEquals;
@@ -91,16 +94,15 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {

   private HoodieTableType tableType;

-  public void init(HoodieTableType tableType) throws IOException {
+  public void init(HoodieTableType tableType, boolean populateMetaFields) throws IOException {
     this.tableType = tableType;
     initPath();
     initSparkContexts("TestHoodieMetadata");
     initFileSystem();
     fs.mkdirs(new Path(basePath));
-    initMetaClient(tableType);
+    metaClient = HoodieTestUtils.init(hadoopConf, basePath, tableType, populateMetaFields ? new Properties() : getPropertiesForKeyGen());
     initTestDataGenerator();
     metadataTableBasePath = HoodieTableMetadata.getMetadataTableBasePath(basePath);

   }

   @AfterEach
@@ -108,12 +110,25 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
 cleanupResources();
 }

+private static Stream<Arguments> populateMetaFieldsParams() {
+return Arrays.stream(new Boolean[][] {{true}, {false}}).map(Arguments::of);
+}
+
+private static Stream<Arguments> tableTypePopulateMetaFieldsParams() {
+return Stream.of(
+Arguments.of(HoodieTableType.COPY_ON_WRITE, true),
+Arguments.of(HoodieTableType.COPY_ON_WRITE, false),
+Arguments.of(HoodieTableType.MERGE_ON_READ, true)
+);
+}
+
 /**
 * Metadata Table bootstrap scenarios.
 */
-@Test
-public void testMetadataTableBootstrap() throws Exception {
-init(HoodieTableType.COPY_ON_WRITE);
+@ParameterizedTest
+@MethodSource("populateMetaFieldsParams")
+public void testMetadataTableBootstrap(boolean populateMetaFields) throws Exception {
+init(HoodieTableType.COPY_ON_WRITE, populateMetaFields);
 HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);

 // Metadata table should not exist until created for the first time
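For readers less familiar with JUnit 5 parameterization: @MethodSource points a @ParameterizedTest at a static provider returning Stream<Arguments>, and the test body runs once per Arguments row, with multi-value rows mapping positionally onto the method parameters. A self-contained sketch of the pattern (names and values are illustrative, not part of the patch):

import java.util.stream.Stream;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;
import static org.junit.jupiter.api.Assertions.assertNotNull;

class ParamSourceSketch {
  private static Stream<Arguments> tableTypeAndFlag() {
    // Each row becomes one invocation of the test below: (tableType, populateMetaFields).
    return Stream.of(
        Arguments.of("COPY_ON_WRITE", true),
        Arguments.of("COPY_ON_WRITE", false));
  }

  @ParameterizedTest
  @MethodSource("tableTypeAndFlag")
  void runsOncePerRow(String tableType, boolean populateMetaFields) {
    assertNotNull(tableType);
  }
}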
@@ -122,7 +137,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {

 // Metadata table is not created if disabled by config
 String firstCommitTime = HoodieActiveTimeline.createNewInstantTime();
-try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false))) {
+try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false, populateMetaFields))) {
 client.startCommitWithTime(firstCommitTime);
 client.insert(jsc.parallelize(dataGen.generateInserts(firstCommitTime, 5)), firstCommitTime);
 assertFalse(fs.exists(new Path(metadataTableBasePath)), "Metadata table should not be created");
@@ -131,7 +146,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {

 // Metadata table should not be created if any non-complete instants are present
 String secondCommitTime = HoodieActiveTimeline.createNewInstantTime();
-try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(false, true), true)) {
+try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(false, true, populateMetaFields), true)) {
 client.startCommitWithTime(secondCommitTime);
 client.insert(jsc.parallelize(dataGen.generateUpdates(secondCommitTime, 2)), secondCommitTime);
 // AutoCommit is false so no bootstrap
@@ -144,7 +159,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {

 // Metadata table created when enabled by config & sync is called
 secondCommitTime = HoodieActiveTimeline.createNewInstantTime();
-try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true), true)) {
+try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true, populateMetaFields), true)) {
 client.startCommitWithTime(secondCommitTime);
 client.insert(jsc.parallelize(dataGen.generateUpdates(secondCommitTime, 2)), secondCommitTime);
 client.syncTableMetadata();
@@ -167,7 +182,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
 });

 String thirdCommitTime = HoodieActiveTimeline.createNewInstantTime();
-try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true), true)) {
+try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true, populateMetaFields), true)) {
 client.startCommitWithTime(thirdCommitTime);
 client.insert(jsc.parallelize(dataGen.generateUpdates(thirdCommitTime, 2)), thirdCommitTime);
 client.syncTableMetadata();
@@ -184,10 +199,11 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
 /**
 * Only valid partition directories are added to the metadata.
 */
-@Test
-public void testOnlyValidPartitionsAdded() throws Exception {
+@ParameterizedTest
+@MethodSource("populateMetaFieldsParams")
+public void testOnlyValidPartitionsAdded(boolean populateMetaFields) throws Exception {
 // This test requires local file system
-init(HoodieTableType.COPY_ON_WRITE);
+init(HoodieTableType.COPY_ON_WRITE, populateMetaFields);
 HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);

 // Create an empty directory which is not a partition directory (lacks partition metadata)
@@ -207,7 +223,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
 .addCommit("002").withBaseFilesInPartition("p1", 10).withBaseFilesInPartition("p2", 10, 10, 10);

 final HoodieWriteConfig writeConfig =
-getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy.NEVER, true, true, false)
+getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy.NEVER, true, true, false, populateMetaFields)
 .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).withDirectoryFilterRegex(filterDirRegex).build()).build();
 try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, writeConfig)) {
 client.startCommitWithTime("005");
@@ -237,12 +253,12 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
 * Test various table operations sync to Metadata Table correctly.
 */
 @ParameterizedTest
-@EnumSource(HoodieTableType.class)
-public void testTableOperations(HoodieTableType tableType) throws Exception {
-init(tableType);
+@MethodSource("tableTypePopulateMetaFieldsParams")
+public void testTableOperations(HoodieTableType tableType, boolean populateMetaFields) throws Exception {
+init(tableType, populateMetaFields);
 HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);

-try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) {
+try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true, populateMetaFields))) {

 // Write 1 (Bulk insert)
 String newCommitTime = "001";
@@ -325,12 +341,12 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
 * Test rollback of various table operations sync to Metadata Table correctly.
 */
 @ParameterizedTest
-@EnumSource(HoodieTableType.class)
-public void testRollbackOperations(HoodieTableType tableType) throws Exception {
-init(tableType);
+@MethodSource("tableTypePopulateMetaFieldsParams")
+public void testRollbackOperations(HoodieTableType tableType, boolean populateMetaFields) throws Exception {
+init(tableType, populateMetaFields);
 HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);

-try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) {
+try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true, populateMetaFields))) {
 // Write 1 (Bulk insert)
 String newCommitTime = HoodieActiveTimeline.createNewInstantTime();
 List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 20);
@@ -403,7 +419,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {

 // Rollback of partial commits
 try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext,
-getWriteConfigBuilder(false, true, false).withRollbackUsingMarkers(false).build())) {
+getWriteConfigBuilder(false, true, false, populateMetaFields).withRollbackUsingMarkers(false).build())) {
 // Write updates and inserts
 String newCommitTime = HoodieActiveTimeline.createNewInstantTime();
 client.startCommitWithTime(newCommitTime);
@@ -417,7 +433,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {

 // Marker based rollback of partial commits
 try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext,
-getWriteConfigBuilder(false, true, false).withRollbackUsingMarkers(true).build())) {
+getWriteConfigBuilder(false, true, false, populateMetaFields).withRollbackUsingMarkers(true).build())) {
 // Write updates and inserts
 String newCommitTime = HoodieActiveTimeline.createNewInstantTime();
 client.startCommitWithTime(newCommitTime);
@@ -435,12 +451,12 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
 * Once explicit sync is called, metadata should match.
 */
 @ParameterizedTest
-@EnumSource(HoodieTableType.class)
-public void testRollbackUnsyncedCommit(HoodieTableType tableType) throws Exception {
-init(tableType);
+@MethodSource("tableTypePopulateMetaFieldsParams")
+public void testRollbackUnsyncedCommit(HoodieTableType tableType, boolean populateMetaFields) throws Exception {
+init(tableType, populateMetaFields);
 HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);

-try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) {
+try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true, populateMetaFields))) {
 // Initialize table with metadata
 String newCommitTime = HoodieActiveTimeline.createNewInstantTime();
 List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 20);
@@ -450,7 +466,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
 validateMetadata(client);
 }
 String newCommitTime = HoodieActiveTimeline.createNewInstantTime();
-try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false))) {
+try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false, populateMetaFields))) {
 // Commit with metadata disabled
 client.startCommitWithTime(newCommitTime);
 List<HoodieRecord> records = dataGen.generateUpdates(newCommitTime, 10);
@@ -459,7 +475,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
 client.rollback(newCommitTime);
 }

-try (SparkRDDWriteClient client = new SparkRDDWriteClient<>(engineContext, getWriteConfig(true, true))) {
+try (SparkRDDWriteClient client = new SparkRDDWriteClient<>(engineContext, getWriteConfig(true, true, populateMetaFields))) {
 assertFalse(metadata(client).isInSync());
 client.syncTableMetadata();
 validateMetadata(client);
@@ -470,10 +486,10 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
 * Test sync of table operations.
 */
 @ParameterizedTest
-@EnumSource(HoodieTableType.class)
+@MethodSource("tableTypePopulateMetaFieldsParams")
 @Disabled
-public void testSync(HoodieTableType tableType) throws Exception {
-init(tableType);
+public void testSync(HoodieTableType tableType, boolean populateMetaFields) throws Exception {
+init(tableType, populateMetaFields);
 HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);

 String newCommitTime;
@@ -481,7 +497,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
 List<WriteStatus> writeStatuses;

 // Initial commits without metadata table enabled
-try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false))) {
+try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false, populateMetaFields))) {
 newCommitTime = HoodieActiveTimeline.createNewInstantTime();
 records = dataGen.generateInserts(newCommitTime, 5);
 client.startCommitWithTime(newCommitTime);
@@ -496,7 +512,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
 }

 // Enable metadata table so it initialized by listing from file system
-try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) {
+try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true, populateMetaFields))) {
 // inserts
 newCommitTime = HoodieActiveTimeline.createNewInstantTime();
 client.startCommitWithTime(newCommitTime);
@@ -512,7 +528,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
 String restoreToInstant;
 String inflightActionTimestamp;
 String beforeInflightActionTimestamp;
-try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false))) {
+try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false, populateMetaFields))) {
 // updates
 newCommitTime = HoodieActiveTimeline.createNewInstantTime();
 client.startCommitWithTime(newCommitTime);
@@ -584,7 +600,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
 Path inflightCleanPath = new Path(metaClient.getMetaPath(), HoodieTimeline.makeInflightCleanerFileName(inflightActionTimestamp));
 fs.create(inflightCleanPath).close();

-try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) {
+try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true, populateMetaFields))) {
 // Restore cannot be done until the metadata table is in sync. See HUDI-1502 for details
 client.syncTableMetadata();

@@ -613,7 +629,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
 }

 // Enable metadata table and ensure it is synced
-try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) {
+try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true, populateMetaFields))) {
 client.restoreToInstant(restoreToInstant);
 assertFalse(metadata(client).isInSync());
@@ -629,13 +645,14 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
 /**
 * Instants on Metadata Table should be archived as per config. Metadata Table should be automatically compacted as per config.
 */
-@Test
-public void testCleaningArchivingAndCompaction() throws Exception {
-init(HoodieTableType.COPY_ON_WRITE);
+@ParameterizedTest
+@MethodSource("populateMetaFieldsParams")
+public void testCleaningArchivingAndCompaction(boolean populateMetaFields) throws Exception {
+init(HoodieTableType.COPY_ON_WRITE, populateMetaFields);
 HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);

 final int maxDeltaCommitsBeforeCompaction = 4;
-HoodieWriteConfig config = getWriteConfigBuilder(true, true, false)
+HoodieWriteConfig config = getWriteConfigBuilder(true, true, false, populateMetaFields)
 .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true)
 .archiveCommitsWith(6, 8).retainCommits(1)
 .withMaxNumDeltaCommitsBeforeCompaction(maxDeltaCommitsBeforeCompaction).build())
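As background for the config above: the metadata table is itself a small Hudi table, and the test pins its housekeeping knobs so the scenario is deterministic. A compaction is expected after maxDeltaCommitsBeforeCompaction delta commits, archival trims the timeline using the bounds given to archiveCommitsWith, and the cleaner retains a single commit. A hedged sketch of the same builder chain in isolation (the HoodieMetadataConfig import path is an assumption for this Hudi version):

import org.apache.hudi.common.config.HoodieMetadataConfig;

public class MetadataHousekeepingSketch {
  public static HoodieMetadataConfig metadataConfig(int maxDeltaCommitsBeforeCompaction) {
    return HoodieMetadataConfig.newBuilder()
        .enable(true)
        // Archive metadata-table instants once the timeline grows past the upper bound.
        .archiveCommitsWith(6, 8)
        // Cleaner on the metadata table keeps one commit.
        .retainCommits(1)
        // Compact the metadata table after this many delta commits.
        .withMaxNumDeltaCommitsBeforeCompaction(maxDeltaCommitsBeforeCompaction)
        .build();
  }
}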
@@ -676,14 +693,15 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
 /**
 * Test various error scenarios.
 */
-@Test
-public void testErrorCases() throws Exception {
-init(HoodieTableType.COPY_ON_WRITE);
+@ParameterizedTest
+@MethodSource("populateMetaFieldsParams")
+public void testErrorCases(boolean populateMetaFields) throws Exception {
+init(HoodieTableType.COPY_ON_WRITE, populateMetaFields);
 HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);

 // TESTCASE: If commit on the metadata table succeeds but fails on the dataset, then on next init the metadata table
 // should be rolled back to last valid commit.
-try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true), true)) {
+try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true, populateMetaFields), true)) {
 String newCommitTime = HoodieActiveTimeline.createNewInstantTime();
 client.startCommitWithTime(newCommitTime);
 List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 10);
@@ -704,7 +722,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
 commitInstantFileName), false));
 }

-try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true), true)) {
+try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true, populateMetaFields), true)) {
 String newCommitTime = client.startCommit();
 // Next insert
 List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 5);
@@ -721,11 +739,11 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
 */
 //@Test
 public void testNonPartitioned() throws Exception {
-init(HoodieTableType.COPY_ON_WRITE);
+init(HoodieTableType.COPY_ON_WRITE, true);
 HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);

 HoodieTestDataGenerator nonPartitionedGenerator = new HoodieTestDataGenerator(new String[] {""});
-try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) {
+try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true, true))) {
 // Write 1 (Bulk insert)
 String newCommitTime = "001";
 List<HoodieRecord> records = nonPartitionedGenerator.generateInserts(newCommitTime, 10);
@@ -741,12 +759,13 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
 /**
 * Test various metrics published by metadata table.
 */
-@Test
-public void testMetadataMetrics() throws Exception {
-init(HoodieTableType.COPY_ON_WRITE);
+@ParameterizedTest
+@MethodSource("populateMetaFieldsParams")
+public void testMetadataMetrics(boolean populateMetaFields) throws Exception {
+init(HoodieTableType.COPY_ON_WRITE, populateMetaFields);
 HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);

-try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfigBuilder(true, true, true).build())) {
+try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfigBuilder(true, true, true, populateMetaFields).build())) {
 // Write
 String newCommitTime = HoodieActiveTimeline.createNewInstantTime();
 List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 20);
@@ -769,15 +788,16 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
 /**
 * Test when reading from metadata table which is out of sync with dataset that results are still consistent.
 */
-@Test
-public void testMetadataOutOfSync() throws Exception {
-init(HoodieTableType.COPY_ON_WRITE);
+@ParameterizedTest
+@MethodSource("populateMetaFieldsParams")
+public void testMetadataOutOfSync(boolean populateMetaFields) throws Exception {
+init(HoodieTableType.COPY_ON_WRITE, populateMetaFields);
 HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);

-SparkRDDWriteClient unsyncedClient = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true));
+SparkRDDWriteClient unsyncedClient = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true, populateMetaFields));

 // Enable metadata so table is initialized
-try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) {
+try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true, populateMetaFields))) {
 // Perform Bulk Insert
 String newCommitTime = "001";
 client.startCommitWithTime(newCommitTime);
@@ -786,7 +806,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
 }

 // Perform commit operations with metadata disabled
-try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false))) {
+try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false, populateMetaFields))) {
 // Perform Insert
 String newCommitTime = "002";
 client.startCommitWithTime(newCommitTime);
@@ -811,7 +831,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
 validateMetadata(unsyncedClient);

 // Perform clean operation with metadata disabled
-try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false))) {
+try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false, populateMetaFields))) {
 // One more commit needed to trigger clean so upsert and compact
 String newCommitTime = "005";
 client.startCommitWithTime(newCommitTime);
@@ -833,7 +853,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
 validateMetadata(unsyncedClient);

 // Perform restore with metadata disabled
-try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false))) {
+try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false, populateMetaFields))) {
 client.restoreToInstant("004");
 }
@@ -1008,18 +1028,20 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
 }
 }

-private HoodieWriteConfig getWriteConfig(boolean autoCommit, boolean useFileListingMetadata) {
-return getWriteConfigBuilder(autoCommit, useFileListingMetadata, false).build();
+private HoodieWriteConfig getWriteConfig(boolean autoCommit, boolean useFileListingMetadata, boolean populateMetaFields) {
+return getWriteConfigBuilder(autoCommit, useFileListingMetadata, false, populateMetaFields).build();
 }

-private HoodieWriteConfig.Builder getWriteConfigBuilder(boolean autoCommit, boolean useFileListingMetadata, boolean enableMetrics) {
-return getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy.EAGER, autoCommit, useFileListingMetadata, enableMetrics);
+private HoodieWriteConfig.Builder getWriteConfigBuilder(boolean autoCommit, boolean useFileListingMetadata, boolean enableMetrics, boolean populateMetaFields) {
+return getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy.EAGER, autoCommit, useFileListingMetadata, enableMetrics, populateMetaFields);
 }

-private HoodieWriteConfig.Builder getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy policy, boolean autoCommit, boolean useFileListingMetadata, boolean enableMetrics) {
+private HoodieWriteConfig.Builder getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy policy, boolean autoCommit, boolean useFileListingMetadata,
+boolean enableMetrics, boolean populateMetaFields) {
 return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(TRIP_EXAMPLE_SCHEMA)
 .withParallelism(2, 2).withDeleteParallelism(2).withRollbackParallelism(2).withFinalizeWriteParallelism(2)
 .withAutoCommit(autoCommit)
+.withProperties(populateMetaFields ? new Properties() : getPropertiesForKeyGen())
 .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024 * 1024)
 .withInlineCompaction(false).withMaxNumDeltaCommitsBeforeCompaction(1)
 .withFailedWritesCleaningPolicy(policy)
@@ -1028,7 +1050,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
 .withEmbeddedTimelineServerEnabled(true).forTable("test-trip-table")
 .withFileSystemViewConfig(new FileSystemViewStorageConfig.Builder()
 .withEnableBackupForRemoteFileSystemView(false).build())
-.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
+.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(populateMetaFields ? HoodieIndex.IndexType.BLOOM : HoodieIndex.IndexType.SIMPLE).build())
 .withMetadataConfig(HoodieMetadataConfig.newBuilder()
 .enable(useFileListingMetadata)
 .enableMetrics(enableMetrics).build())
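Two details of the builder above are worth spelling out: when meta fields are disabled, the key-generator properties have to be injected via withProperties(), and the index falls back from BLOOM to SIMPLE, presumably because the bloom index relies on the _hoodie_record_key column and per-file bloom filters, neither of which exists for a virtual-key table. A condensed, hedged sketch of just that branch (the helper name and parameters are illustrative):

import java.util.Properties;
import org.apache.hudi.config.HoodieIndexConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.index.HoodieIndex;

public class PopulateMetaFieldsConfigSketch {
  public static HoodieWriteConfig.Builder baseBuilder(String basePath, String schema,
      boolean populateMetaFields, Properties keyGenProps) {
    return HoodieWriteConfig.newBuilder()
        .withPath(basePath)
        .withSchema(schema)
        // Virtual keys: record key / partition path fields are handed to the writer via properties.
        .withProperties(populateMetaFields ? new Properties() : keyGenProps)
        // Bloom index needs meta columns and bloom filters in base files; fall back to SIMPLE otherwise.
        .withIndexConfig(HoodieIndexConfig.newBuilder()
            .withIndexType(populateMetaFields ? HoodieIndex.IndexType.BLOOM : HoodieIndex.IndexType.SIMPLE)
            .build());
  }
}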
@@ -301,6 +301,13 @@ public class HoodieClientTestBase extends HoodieClientTestHarness {
 }
 }

+public JavaRDD<WriteStatus> insertFirstBatch(HoodieWriteConfig writeConfig, SparkRDDWriteClient client, String newCommitTime,
+String initCommitTime, int numRecordsInThisCommit,
+Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn, boolean isPreppedAPI,
+boolean assertForCommit, int expRecordsInThisCommit) throws Exception {
+return insertFirstBatch(writeConfig, client, newCommitTime, initCommitTime, numRecordsInThisCommit, writeFn, isPreppedAPI, assertForCommit, expRecordsInThisCommit, true);
+}
+
 /**
 * Helper to insert first batch of records and do regular assertions on the state after successful completion.
 *
@@ -319,12 +326,12 @@ public class HoodieClientTestBase extends HoodieClientTestHarness {
 public JavaRDD<WriteStatus> insertFirstBatch(HoodieWriteConfig writeConfig, SparkRDDWriteClient client, String newCommitTime,
 String initCommitTime, int numRecordsInThisCommit,
 Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn, boolean isPreppedAPI,
-boolean assertForCommit, int expRecordsInThisCommit) throws Exception {
+boolean assertForCommit, int expRecordsInThisCommit, boolean filterForCommitTimeWithAssert) throws Exception {
 final Function2<List<HoodieRecord>, String, Integer> recordGenFunction =
 generateWrapRecordsFn(isPreppedAPI, writeConfig, dataGen::generateInserts);

 return writeBatch(client, newCommitTime, initCommitTime, Option.empty(), initCommitTime, numRecordsInThisCommit,
-recordGenFunction, writeFn, assertForCommit, expRecordsInThisCommit, expRecordsInThisCommit, 1, false);
+recordGenFunction, writeFn, assertForCommit, expRecordsInThisCommit, expRecordsInThisCommit, 1, false, filterForCommitTimeWithAssert);
 }

 /**
@@ -355,6 +362,15 @@ public class HoodieClientTestBase extends HoodieClientTestHarness {
 recordGenFunction, writeFn, assertForCommit, expRecordsInThisCommit, expTotalRecords, expTotalCommits, false);
 }

+public JavaRDD<WriteStatus> updateBatch(HoodieWriteConfig writeConfig, SparkRDDWriteClient client, String newCommitTime,
+String prevCommitTime, Option<List<String>> commitTimesBetweenPrevAndNew, String initCommitTime,
+int numRecordsInThisCommit,
+Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn, boolean isPreppedAPI,
+boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, int expTotalCommits) throws Exception {
+return updateBatch(writeConfig, client, newCommitTime, prevCommitTime, commitTimesBetweenPrevAndNew, initCommitTime, numRecordsInThisCommit, writeFn,
+isPreppedAPI, assertForCommit, expRecordsInThisCommit, expTotalRecords, expTotalCommits, true);
+}
+
 /**
 * Helper to upsert batch of records and do regular assertions on the state after successful completion.
 *
@@ -378,13 +394,23 @@ public class HoodieClientTestBase extends HoodieClientTestHarness {
 String prevCommitTime, Option<List<String>> commitTimesBetweenPrevAndNew, String initCommitTime,
 int numRecordsInThisCommit,
 Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn, boolean isPreppedAPI,
-boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, int expTotalCommits) throws Exception {
+boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, int expTotalCommits,
+boolean filterForCommitTimeWithAssert) throws Exception {
 final Function2<List<HoodieRecord>, String, Integer> recordGenFunction =
 generateWrapRecordsFn(isPreppedAPI, writeConfig, dataGen::generateUniqueUpdates);

 return writeBatch(client, newCommitTime, prevCommitTime, commitTimesBetweenPrevAndNew, initCommitTime,
 numRecordsInThisCommit, recordGenFunction, writeFn, assertForCommit, expRecordsInThisCommit, expTotalRecords,
-expTotalCommits, false);
+expTotalCommits, false, filterForCommitTimeWithAssert);
+}
+
+public JavaRDD<WriteStatus> deleteBatch(HoodieWriteConfig writeConfig, SparkRDDWriteClient client, String newCommitTime,
+String prevCommitTime, String initCommitTime,
+int numRecordsInThisCommit,
+Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieKey>, String> deleteFn, boolean isPreppedAPI,
+boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords) throws Exception {
+return deleteBatch(writeConfig, client, newCommitTime, prevCommitTime, initCommitTime, numRecordsInThisCommit, deleteFn, isPreppedAPI,
+assertForCommit, expRecordsInThisCommit, expTotalRecords, true);
 }

 /**
@@ -408,13 +434,22 @@ public class HoodieClientTestBase extends HoodieClientTestHarness {
 String prevCommitTime, String initCommitTime,
 int numRecordsInThisCommit,
 Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieKey>, String> deleteFn, boolean isPreppedAPI,
-boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords) throws Exception {
+boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, boolean filterForCommitTimeWithAssert) throws Exception {
 final Function<Integer, List<HoodieKey>> keyGenFunction =
 generateWrapDeleteKeysFn(isPreppedAPI, writeConfig, dataGen::generateUniqueDeletes);

 return deleteBatch(client, newCommitTime, prevCommitTime, initCommitTime, numRecordsInThisCommit,
 keyGenFunction,
-deleteFn, assertForCommit, expRecordsInThisCommit, expTotalRecords);
+deleteFn, assertForCommit, expRecordsInThisCommit, expTotalRecords, filterForCommitTimeWithAssert);
+}
+
+public JavaRDD<WriteStatus> writeBatch(SparkRDDWriteClient client, String newCommitTime, String prevCommitTime,
+Option<List<String>> commitTimesBetweenPrevAndNew, String initCommitTime, int numRecordsInThisCommit,
+Function2<List<HoodieRecord>, String, Integer> recordGenFunction,
+Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn,
+boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, int expTotalCommits, boolean doCommit) throws Exception {
+return writeBatch(client, newCommitTime, prevCommitTime, commitTimesBetweenPrevAndNew, initCommitTime, numRecordsInThisCommit, recordGenFunction,
+writeFn, assertForCommit, expRecordsInThisCommit, expTotalRecords, expTotalCommits, doCommit, true);
 }

 /**
@@ -439,7 +474,8 @@ public class HoodieClientTestBase extends HoodieClientTestHarness {
 Option<List<String>> commitTimesBetweenPrevAndNew, String initCommitTime, int numRecordsInThisCommit,
 Function2<List<HoodieRecord>, String, Integer> recordGenFunction,
 Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn,
-boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, int expTotalCommits, boolean doCommit) throws Exception {
+boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, int expTotalCommits, boolean doCommit,
+boolean filterForCommitTimeWithAssert) throws Exception {

 // Write 1 (only inserts)
 client.startCommitWithTime(newCommitTime);
@@ -466,8 +502,10 @@ public class HoodieClientTestBase extends HoodieClientTestHarness {
 "Expecting " + expTotalCommits + " commits.");
 assertEquals(newCommitTime, timeline.lastInstant().get().getTimestamp(),
 "Latest commit should be " + newCommitTime);
+if (filterForCommitTimeWithAssert) { // when meta cols are disabled, we can't really do per commit assertion.
 assertEquals(expRecordsInThisCommit, HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
 "Must contain " + expRecordsInThisCommit + " records");
+}

 // Check the entire dataset has all records still
 String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length];
@@ -477,6 +515,7 @@ public class HoodieClientTestBase extends HoodieClientTestHarness {
 assertEquals(expTotalRecords, HoodieClientTestUtils.read(jsc, basePath, sqlContext, fs, fullPartitionPaths).count(),
 "Must contain " + expTotalRecords + " records");

+if (filterForCommitTimeWithAssert) {
 // Check that the incremental consumption from prevCommitTime
 assertEquals(HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
 HoodieClientTestUtils.countRecordsSince(jsc, basePath, sqlContext, timeline, prevCommitTime),
@@ -489,6 +528,7 @@ public class HoodieClientTestBase extends HoodieClientTestHarness {
 });
 }
 }
+}
 return result;
 }

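The new filterForCommitTimeWithAssert flag exists because readCommit() and countRecordsSince() filter rows on the commit-time meta column; when meta fields are not populated that column is never written, so per-commit and incremental-read assertions are skipped while the whole-dataset record count is still verified. A hedged illustration of the kind of filter that breaks without meta columns (assuming HoodieRecord.COMMIT_TIME_METADATA_FIELD names the column, as elsewhere in Hudi; the DataFrame handling is illustrative, not the helper's actual code):

import org.apache.hudi.common.model.HoodieRecord;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

public class CommitTimeFilterSketch {
  // Counts rows written by one commit by filtering on the commit-time meta column.
  // With populateMetaFields=false the column does not exist, hence the new guards in writeBatch()/deleteBatch().
  public static long recordsInCommit(Dataset<Row> snapshot, String commitTime) {
    return snapshot.filter(snapshot.col(HoodieRecord.COMMIT_TIME_METADATA_FIELD).equalTo(commitTime)).count();
  }
}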
@@ -510,7 +550,7 @@ public class HoodieClientTestBase extends HoodieClientTestHarness {
 String initCommitTime, int numRecordsInThisCommit,
 Function<Integer, List<HoodieKey>> keyGenFunction,
 Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieKey>, String> deleteFn,
-boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords) throws Exception {
+boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, boolean filerForCommitTimeWithAssert) throws Exception {

 // Delete 1 (only deletes)
 client.startCommitWithTime(newCommitTime);
@@ -534,8 +574,10 @@ public class HoodieClientTestBase extends HoodieClientTestHarness {
 "Expecting 3 commits.");
 assertEquals(newCommitTime, timeline.lastInstant().get().getTimestamp(),
 "Latest commit should be " + newCommitTime);
+if (filerForCommitTimeWithAssert) { // if meta cols are disabled, we can't do assertion based on assertion time
 assertEquals(expRecordsInThisCommit, HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
 "Must contain " + expRecordsInThisCommit + " records");
+}

 // Check the entire dataset has all records still
 String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length];
@@ -545,12 +587,14 @@ public class HoodieClientTestBase extends HoodieClientTestHarness {
 assertEquals(expTotalRecords, HoodieClientTestUtils.read(jsc, basePath, sqlContext, fs, fullPartitionPaths).count(),
 "Must contain " + expTotalRecords + " records");

+if (filerForCommitTimeWithAssert) {
 // Check that the incremental consumption from prevCommitTime
 assertEquals(HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
 HoodieClientTestUtils.countRecordsSince(jsc, basePath, sqlContext, timeline, prevCommitTime),
 "Incremental consumption from " + prevCommitTime + " should give no records in latest commit,"
 + " since it is a delete operation");
 }
+}
 return result;
 }

@@ -32,6 +32,7 @@ import org.apache.hudi.common.fs.FSUtils;
 import org.apache.hudi.common.model.HoodieRecord;
 import org.apache.hudi.common.model.HoodieRecordLocation;
 import org.apache.hudi.common.model.HoodieTableType;
+import org.apache.hudi.common.table.HoodieTableConfig;
 import org.apache.hudi.common.table.HoodieTableMetaClient;
 import org.apache.hudi.common.table.timeline.HoodieTimeline;
 import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
@@ -40,7 +41,9 @@ import org.apache.hudi.common.testutils.HoodieTestUtils;
 import org.apache.hudi.common.testutils.minicluster.HdfsTestService;
 import org.apache.hudi.common.util.Option;
 import org.apache.hudi.common.util.collection.Pair;
+import org.apache.hudi.config.HoodieIndexConfig;
 import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.hudi.index.HoodieIndex;
 import org.apache.hudi.table.WorkloadStat;
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
@@ -56,6 +59,7 @@ import java.io.IOException;
 import java.io.Serializable;
 import java.util.HashMap;
 import java.util.Map;
+import java.util.Properties;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;

@@ -225,6 +229,21 @@ public abstract class HoodieClientTestHarness extends HoodieCommonTestHarness im
 metaClient = HoodieTestUtils.init(hadoopConf, basePath, tableType);
 }

+protected Properties getPropertiesForKeyGen() {
+Properties properties = new Properties();
+properties.put(HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.key(), "false");
+properties.put("hoodie.datasource.write.recordkey.field","_row_key");
+properties.put("hoodie.datasource.write.partitionpath.field","partition_path");
+return properties;
+}
+
+protected void addAppropriatePropsForPopulateMetaFields(HoodieWriteConfig.Builder configBuilder, boolean populateMetaFields) {
+if (!populateMetaFields) {
+configBuilder.withProperties(getPropertiesForKeyGen())
+.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.SIMPLE).build());
+}
+}
+
 /**
 * Cleanups hoodie clients.
 */
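Subclasses are expected to call these helpers when they want a virtual-key variant of an existing test: getPropertiesForKeyGen() supplies the table and key-generator properties, while addAppropriatePropsForPopulateMetaFields() folds them, plus a SIMPLE index, into a write-config builder only when meta fields are turned off. A short hedged usage sketch, assuming it lives in a test class extending this harness (basePath and TRIP_EXAMPLE_SCHEMA come from the harness and the test data generator utilities):

// Sketch: wiring the new helpers from a subclass of HoodieClientTestHarness.
private HoodieWriteConfig virtualKeyAwareConfig(boolean populateMetaFields) {
  HoodieWriteConfig.Builder builder = HoodieWriteConfig.newBuilder()
      .withPath(basePath)
      .withSchema(TRIP_EXAMPLE_SCHEMA);
  // No-op when populateMetaFields is true; otherwise adds key-gen properties and a SIMPLE index.
  addAppropriatePropsForPopulateMetaFields(builder, populateMetaFields);
  return builder.build();
}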
@@ -25,6 +25,9 @@
 }, {
 "name" : "_row_key",
 "type" : "string"
+}, {
+"name" : "partition_path",
+"type" : "string"
 }, {
 "name" : "rider",
 "type" : "string"
@@ -244,6 +244,22 @@ public class HoodieAvroUtils {
 return recordSchema;
 }

+/**
+* Fetch schema for record key and partition path.
+*/
+public static Schema getSchemaForFields(Schema fileSchema, List<String> fields) {
+List<Schema.Field> toBeAddedFields = new ArrayList<>();
+Schema recordSchema = Schema.createRecord("HoodieRecordKey", "", "", false);
+
+for (Schema.Field schemaField: fileSchema.getFields()) {
+if (fields.contains(schemaField.name())) {
+toBeAddedFields.add(new Schema.Field(schemaField.name(), schemaField.schema(), schemaField.doc(), schemaField.defaultValue()));
+}
+}
+recordSchema.setFields(toBeAddedFields);
+return recordSchema;
+}
+
 public static GenericRecord addHoodieKeyToRecord(GenericRecord record, String recordKey, String partitionPath,
 String fileName) {
 record.put(HoodieRecord.FILENAME_METADATA_FIELD, fileName);
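The new getSchemaForFields() projects a file schema down to the named fields, preserving each field's schema, doc and default; with virtual keys this is how a reader can pull just the record-key and partition-path columns without any _hoodie_* meta columns. A small hedged usage sketch (field names follow the test schema changed earlier in this patch):

import java.util.Arrays;
import org.apache.avro.Schema;
import org.apache.hudi.avro.HoodieAvroUtils;

public class KeyFieldsSchemaSketch {
  // Projects the writer schema down to the columns a key generator needs.
  public static Schema keyFieldsSchema(Schema fileSchema) {
    return HoodieAvroUtils.getSchemaForFields(fileSchema, Arrays.asList("_row_key", "partition_path"));
  }
}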
@@ -20,6 +20,7 @@ package org.apache.hudi.avro;

 import org.apache.hudi.common.bloom.BloomFilter;
 import org.apache.hudi.common.bloom.HoodieDynamicBoundedBloomFilter;
+import org.apache.hudi.common.util.Option;

 import org.apache.avro.Schema;
 import org.apache.parquet.avro.AvroWriteSupport;
@@ -33,7 +34,7 @@ import java.util.HashMap;
 */
 public class HoodieAvroWriteSupport extends AvroWriteSupport {

-private BloomFilter bloomFilter;
+private Option<BloomFilter> bloomFilterOpt;
 private String minRecordKey;
 private String maxRecordKey;

@@ -44,29 +45,30 @@ public class HoodieAvroWriteSupport extends AvroWriteSupport {
 public static final String HOODIE_BLOOM_FILTER_TYPE_CODE = "hoodie_bloom_filter_type_code";
 public static final String HOODIE_AVRO_SCHEMA_METADATA_KEY = "orc.avro.schema";

-public HoodieAvroWriteSupport(MessageType schema, Schema avroSchema, BloomFilter bloomFilter) {
+public HoodieAvroWriteSupport(MessageType schema, Schema avroSchema, Option<BloomFilter> bloomFilterOpt) {
 super(schema, avroSchema);
-this.bloomFilter = bloomFilter;
+this.bloomFilterOpt = bloomFilterOpt;
 }

 @Override
 public WriteSupport.FinalizedWriteContext finalizeWrite() {
 HashMap<String, String> extraMetaData = new HashMap<>();
-if (bloomFilter != null) {
-extraMetaData.put(HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY, bloomFilter.serializeToString());
+if (bloomFilterOpt.isPresent()) {
+extraMetaData.put(HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY, bloomFilterOpt.get().serializeToString());
 if (minRecordKey != null && maxRecordKey != null) {
 extraMetaData.put(HOODIE_MIN_RECORD_KEY_FOOTER, minRecordKey);
 extraMetaData.put(HOODIE_MAX_RECORD_KEY_FOOTER, maxRecordKey);
 }
-if (bloomFilter.getBloomFilterTypeCode().name().contains(HoodieDynamicBoundedBloomFilter.TYPE_CODE_PREFIX)) {
-extraMetaData.put(HOODIE_BLOOM_FILTER_TYPE_CODE, bloomFilter.getBloomFilterTypeCode().name());
+if (bloomFilterOpt.get().getBloomFilterTypeCode().name().contains(HoodieDynamicBoundedBloomFilter.TYPE_CODE_PREFIX)) {
+extraMetaData.put(HOODIE_BLOOM_FILTER_TYPE_CODE, bloomFilterOpt.get().getBloomFilterTypeCode().name());
 }
 }
 return new WriteSupport.FinalizedWriteContext(extraMetaData);
 }

 public void add(String recordKey) {
-this.bloomFilter.add(recordKey);
+if (bloomFilterOpt.isPresent()) {
+this.bloomFilterOpt.get().add(recordKey);
 if (minRecordKey != null) {
 minRecordKey = minRecordKey.compareTo(recordKey) <= 0 ? minRecordKey : recordKey;
 } else {
@@ -80,3 +82,4 @@ public class HoodieAvroWriteSupport extends AvroWriteSupport {
 }
 }
 }
+}
|||||||
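A hedged usage sketch of the Option-based constructor introduced above (the helper class and method names are illustrative); with virtual keys the bloom filter can simply be left out:

import org.apache.avro.Schema;
import org.apache.hudi.avro.HoodieAvroWriteSupport;
import org.apache.hudi.common.bloom.BloomFilter;
import org.apache.hudi.common.util.Option;
import org.apache.parquet.avro.AvroSchemaConverter;

public class WriteSupportSketch {
  // Meta fields populated: pass the bloom filter so record-key footers get written.
  static HoodieAvroWriteSupport withFilter(Schema avroSchema, BloomFilter filter) {
    return new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(avroSchema), avroSchema, Option.of(filter));
  }

  // Meta fields disabled (virtual keys): no filter, so finalizeWrite() skips the footer metadata.
  static HoodieAvroWriteSupport withoutFilter(Schema avroSchema) {
    return new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(avroSchema), avroSchema, Option.empty());
  }
}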
@@ -319,12 +319,6 @@ public class HoodieTableMetaClient implements Serializable {
    * @param operationType operation type to be executed.
    */
   public void validateTableProperties(Properties properties, WriteOperationType operationType) {
-    // disabling meta fields are allowed only for bulk_insert operation
-    if (!Boolean.parseBoolean((String) properties.getOrDefault(HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.key(), HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.defaultValue()))
-        && operationType != WriteOperationType.BULK_INSERT) {
-      throw new HoodieException(HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.key() + " can only be disabled for " + WriteOperationType.BULK_INSERT
-          + " operation");
-    }
     // once meta fields are disabled, it cant be re-enabled for a given table.
     if (!getTableConfig().populateMetaFields()
         && Boolean.parseBoolean((String) properties.getOrDefault(HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.key(), HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.defaultValue()))) {
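A sketch, using the same property keys exercised elsewhere in this change, of assembling virtual-key table properties and running them through the relaxed validation (the field names and the chosen operation are illustrative):

import java.util.Properties;

import org.apache.hudi.common.model.WriteOperationType;
import org.apache.hudi.common.table.HoodieTableConfig;
import org.apache.hudi.common.table.HoodieTableMetaClient;

public class VirtualKeyValidationSketch {
  static void validate(HoodieTableMetaClient metaClient) {
    Properties props = new Properties();
    props.put(HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.key(), "false");
    props.put("hoodie.datasource.write.recordkey.field", "_row_key");            // example field names
    props.put("hoodie.datasource.write.partitionpath.field", "partition_path");
    // After this change the validation no longer rejects disabled meta fields outright;
    // the remaining guard only blocks re-enabling meta fields on an existing table.
    metaClient.validateTableProperties(props, WriteOperationType.BULK_INSERT);
  }
}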
@@ -34,6 +34,7 @@ import org.apache.hudi.common.model.HoodieFileFormat;
 import org.apache.hudi.common.model.HoodieKey;
 import org.apache.hudi.common.table.HoodieTableMetaClient;
 import org.apache.hudi.exception.HoodieException;
+import org.apache.hudi.keygen.BaseKeyGenerator;
 
 public abstract class BaseFileUtils {
 
@@ -170,6 +171,15 @@ public abstract class BaseFileUtils {
    */
   public abstract List<HoodieKey> fetchRecordKeyPartitionPath(Configuration configuration, Path filePath);
 
+  /**
+   * Fetch {@link HoodieKey}s from the given data file.
+   * @param configuration configuration to build fs object
+   * @param filePath The data file path
+   * @param keyGeneratorOpt instance of KeyGenerator.
+   * @return {@link List} of {@link HoodieKey}s fetched from the parquet file
+   */
+  public abstract List<HoodieKey> fetchRecordKeyPartitionPath(Configuration configuration, Path filePath, Option<BaseKeyGenerator> keyGeneratorOpt);
+
   /**
    * Read the Avro schema of the data file.
    * @param configuration Configuration
@@ -41,6 +41,8 @@ import org.apache.hudi.common.model.HoodieRecord;
 import org.apache.hudi.exception.HoodieException;
 import org.apache.hudi.exception.HoodieIOException;
 import org.apache.hudi.exception.MetadataNotFoundException;
+import org.apache.hudi.keygen.BaseKeyGenerator;
+
 import org.apache.orc.OrcFile;
 import org.apache.orc.OrcProto.UserMetadataItem;
 import org.apache.orc.Reader;
@@ -109,6 +111,11 @@ public class OrcUtils extends BaseFileUtils {
     return hoodieKeys;
   }
 
+  @Override
+  public List<HoodieKey> fetchRecordKeyPartitionPath(Configuration configuration, Path filePath, Option<BaseKeyGenerator> keyGeneratorOpt) {
+    throw new HoodieIOException("UnsupportedOperation : Disabling meta fields not yet supported for Orc");
+  }
+
   /**
    * NOTE: This literally reads the entire file contents, thus should be used with caution.
    */
@@ -24,6 +24,7 @@ import org.apache.hudi.common.model.HoodieKey;
 import org.apache.hudi.common.model.HoodieRecord;
 import org.apache.hudi.exception.HoodieIOException;
 import org.apache.hudi.exception.MetadataNotFoundException;
+import org.apache.hudi.keygen.BaseKeyGenerator;
 
 import org.apache.avro.Schema;
 import org.apache.avro.generic.GenericRecord;
@@ -115,23 +116,36 @@ public class ParquetUtils extends BaseFileUtils {
    */
   @Override
   public List<HoodieKey> fetchRecordKeyPartitionPath(Configuration configuration, Path filePath) {
-    List<HoodieKey> hoodieKeys = new ArrayList<>();
-    try {
-      if (!filePath.getFileSystem(configuration).exists(filePath)) {
-        return new ArrayList<>();
-      }
+    return fetchRecordKeyPartitionPathInternal(configuration, filePath, Option.empty());
   }
 
+  private List<HoodieKey> fetchRecordKeyPartitionPathInternal(Configuration configuration, Path filePath, Option<BaseKeyGenerator> keyGeneratorOpt) {
+    List<HoodieKey> hoodieKeys = new ArrayList<>();
+    try {
       Configuration conf = new Configuration(configuration);
       conf.addResource(FSUtils.getFs(filePath.toString(), conf).getConf());
-      Schema readSchema = HoodieAvroUtils.getRecordKeyPartitionPathSchema();
+      Schema readSchema = keyGeneratorOpt.map(keyGenerator -> {
+        List<String> fields = new ArrayList<>();
+        fields.addAll(keyGenerator.getRecordKeyFields());
+        fields.addAll(keyGenerator.getPartitionPathFields());
+        return HoodieAvroUtils.getSchemaForFields(readAvroSchema(conf, filePath), fields);
+      })
+          .orElse(HoodieAvroUtils.getRecordKeyPartitionPathSchema());
       AvroReadSupport.setAvroReadSchema(conf, readSchema);
       AvroReadSupport.setRequestedProjection(conf, readSchema);
       ParquetReader reader = AvroParquetReader.builder(filePath).withConf(conf).build();
       Object obj = reader.read();
       while (obj != null) {
         if (obj instanceof GenericRecord) {
-          String recordKey = ((GenericRecord) obj).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
-          String partitionPath = ((GenericRecord) obj).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString();
+          String recordKey = null;
+          String partitionPath = null;
+          if (keyGeneratorOpt.isPresent()) {
+            recordKey = keyGeneratorOpt.get().getRecordKey((GenericRecord) obj);
+            partitionPath = keyGeneratorOpt.get().getPartitionPath((GenericRecord) obj);
+          } else {
+            recordKey = ((GenericRecord) obj).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
+            partitionPath = ((GenericRecord) obj).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString();
+          }
           hoodieKeys.add(new HoodieKey(recordKey, partitionPath));
           obj = reader.read();
         }
@@ -142,6 +156,19 @@ public class ParquetUtils extends BaseFileUtils {
     return hoodieKeys;
   }
 
+  /**
+   * Fetch {@link HoodieKey}s from the given parquet file.
+   *
+   * @param configuration configuration to build fs object
+   * @param filePath The parquet file path.
+   * @param keyGeneratorOpt
+   * @return {@link List} of {@link HoodieKey}s fetched from the parquet file
+   */
+  @Override
+  public List<HoodieKey> fetchRecordKeyPartitionPath(Configuration configuration, Path filePath, Option<BaseKeyGenerator> keyGeneratorOpt) {
+    return fetchRecordKeyPartitionPathInternal(configuration, filePath, keyGeneratorOpt);
+  }
+
   public ParquetMetadata readMetadata(Configuration conf, Path parquetFilePath) {
     ParquetMetadata footer;
     try {
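A hedged sketch of reading keys back through the new overload; the ParquetUtils package and the key generator wiring are assumptions here, and the generator is expected to be configured with the table's record key and partition path fields:

import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ParquetUtils;   // package assumed
import org.apache.hudi.keygen.BaseKeyGenerator;

public class FetchVirtualKeysSketch {
  static List<HoodieKey> readKeys(Configuration conf, Path parquetFile, BaseKeyGenerator keyGen) {
    // With a key generator present, keys are rebuilt from the data columns instead of the
    // _hoodie_record_key / _hoodie_partition_path meta fields.
    return new ParquetUtils().fetchRecordKeyPartitionPath(conf, parquetFile, Option.of(keyGen));
  }
}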
@@ -22,6 +22,7 @@ import org.apache.hudi.common.bloom.BloomFilter;
 import org.apache.hudi.common.bloom.BloomFilterFactory;
 import org.apache.hudi.common.bloom.BloomFilterTypeCode;
 import org.apache.hudi.common.model.HoodieRecord;
+import org.apache.hudi.common.util.Option;
 
 import org.apache.avro.Schema;
 import org.apache.avro.generic.GenericData;
@@ -52,7 +53,7 @@ public class TestHoodieAvroWriteSupport {
         1000, 0.0001, 10000,
         BloomFilterTypeCode.SIMPLE.name());
     HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(
-        new AvroSchemaConverter().convert(schema), schema, filter);
+        new AvroSchemaConverter().convert(schema), schema, Option.of(filter));
     ParquetWriter writer = new ParquetWriter(new Path(filePath), writeSupport, CompressionCodecName.GZIP,
         120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE);
     for (String rowKey : rowKeys) {
@@ -90,6 +90,7 @@ public class HoodieTestDataGenerator {
   public static final int DEFAULT_PARTITION_DEPTH = 3;
   public static final String TRIP_SCHEMA_PREFIX = "{\"type\": \"record\"," + "\"name\": \"triprec\"," + "\"fields\": [ "
       + "{\"name\": \"timestamp\",\"type\": \"long\"}," + "{\"name\": \"_row_key\", \"type\": \"string\"},"
+      + "{\"name\": \"partition_path\", \"type\": \"string\"},"
      + "{\"name\": \"rider\", \"type\": \"string\"}," + "{\"name\": \"driver\", \"type\": \"string\"},"
      + "{\"name\": \"begin_lat\", \"type\": \"double\"}," + "{\"name\": \"begin_lon\", \"type\": \"double\"},"
      + "{\"name\": \"end_lat\", \"type\": \"double\"}," + "{\"name\": \"end_lon\", \"type\": \"double\"},";
@@ -123,7 +124,7 @@ public class HoodieTestDataGenerator {
       + "{\"name\":\"driver\",\"type\":\"string\"},{\"name\":\"fare\",\"type\":\"double\"},{\"name\": \"_hoodie_is_deleted\", \"type\": \"boolean\", \"default\": false}]}";
 
   public static final String NULL_SCHEMA = Schema.create(Schema.Type.NULL).toString();
-  public static final String TRIP_HIVE_COLUMN_TYPES = "bigint,string,string,string,double,double,double,double,int,bigint,float,binary,int,bigint,decimal(10,6),"
+  public static final String TRIP_HIVE_COLUMN_TYPES = "bigint,string,string,string,string,double,double,double,double,int,bigint,float,binary,int,bigint,decimal(10,6),"
       + "map<string,string>,struct<amount:double,currency:string>,array<struct<amount:double,currency:string>>,boolean";
 
 
@@ -208,7 +209,7 @@ public class HoodieTestDataGenerator {
   public static RawTripTestPayload generateRandomValue(
       HoodieKey key, String instantTime, boolean isFlattened) throws IOException {
     GenericRecord rec = generateGenericRecord(
-        key.getRecordKey(), "rider-" + instantTime, "driver-" + instantTime, 0,
+        key.getRecordKey(), key.getPartitionPath(), "rider-" + instantTime, "driver-" + instantTime, 0,
         false, isFlattened);
     return new RawTripTestPayload(rec.toString(), key.getRecordKey(), key.getPartitionPath(), TRIP_EXAMPLE_SCHEMA);
   }
@@ -230,7 +231,7 @@ public class HoodieTestDataGenerator {
    * Generates a new avro record of the above schema format for a delete.
    */
   public static RawTripTestPayload generateRandomDeleteValue(HoodieKey key, String instantTime) throws IOException {
-    GenericRecord rec = generateGenericRecord(key.getRecordKey(), "rider-" + instantTime, "driver-" + instantTime, 0,
+    GenericRecord rec = generateGenericRecord(key.getRecordKey(), key.getPartitionPath(), "rider-" + instantTime, "driver-" + instantTime, 0,
         true, false);
     return new RawTripTestPayload(Option.of(rec.toString()), key.getRecordKey(), key.getPartitionPath(), TRIP_EXAMPLE_SCHEMA, true);
   }
@@ -239,21 +240,22 @@ public class HoodieTestDataGenerator {
    * Generates a new avro record of the above schema format, retaining the key if optionally provided.
    */
   public static HoodieAvroPayload generateAvroPayload(HoodieKey key, String instantTime) {
-    GenericRecord rec = generateGenericRecord(key.getRecordKey(), "rider-" + instantTime, "driver-" + instantTime, 0);
+    GenericRecord rec = generateGenericRecord(key.getRecordKey(), key.getPartitionPath(), "rider-" + instantTime, "driver-" + instantTime, 0);
     return new HoodieAvroPayload(Option.of(rec));
   }
 
-  public static GenericRecord generateGenericRecord(String rowKey, String riderName, String driverName,
+  public static GenericRecord generateGenericRecord(String rowKey, String partitionPath, String riderName, String driverName,
                                                     long timestamp) {
-    return generateGenericRecord(rowKey, riderName, driverName, timestamp, false, false);
+    return generateGenericRecord(rowKey, partitionPath, riderName, driverName, timestamp, false, false);
   }
 
-  public static GenericRecord generateGenericRecord(String rowKey, String riderName, String driverName,
+  public static GenericRecord generateGenericRecord(String rowKey, String partitionPath, String riderName, String driverName,
                                                     long timestamp, boolean isDeleteRecord,
                                                     boolean isFlattened) {
     GenericRecord rec = new GenericData.Record(isFlattened ? FLATTENED_AVRO_SCHEMA : AVRO_SCHEMA);
     rec.put("_row_key", rowKey);
     rec.put("timestamp", timestamp);
+    rec.put("partition_path", partitionPath);
     rec.put("rider", riderName);
     rec.put("driver", driverName);
     rec.put("begin_lat", RAND.nextDouble());
@@ -807,7 +809,7 @@ public class HoodieTestDataGenerator {
   public List<GenericRecord> generateGenericRecords(int numRecords) {
     List<GenericRecord> list = new ArrayList<>();
     IntStream.range(0, numRecords).forEach(i -> {
-      list.add(generateGenericRecord(UUID.randomUUID().toString(), UUID.randomUUID().toString(), UUID.randomUUID()
+      list.add(generateGenericRecord(UUID.randomUUID().toString(), "0", UUID.randomUUID().toString(), UUID.randomUUID()
          .toString(), RAND.nextLong()));
     });
     return list;
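A small sketch of the updated test-data helper being called with the new explicit partition path argument (the values and the generator's package are illustrative assumptions):

import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;

public class DataGenSketch {
  static GenericRecord sampleRecord() {
    // Arguments: rowKey, partitionPath, riderName, driverName, timestamp.
    return HoodieTestDataGenerator.generateGenericRecord("key-001", "2016/03/15", "rider-001", "driver-001", 0L);
  }
}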
@@ -40,7 +40,7 @@ public class TestAvroOrcUtils extends HoodieCommonTestHarness {
     // The following types are tested:
     // DATE, DECIMAL, LONG, INT, BYTES, ARRAY, RECORD, MAP, STRING, FLOAT, DOUBLE
     TypeDescription orcSchema = TypeDescription.fromString("struct<"
-        + "timestamp:bigint,_row_key:string,rider:string,driver:string,begin_lat:double,"
+        + "timestamp:bigint,_row_key:string,partition_path:string,rider:string,driver:string,begin_lat:double,"
         + "begin_lon:double,end_lat:double,end_lon:double,"
         + "distance_in_meters:int,seconds_since_epoch:bigint,weight:float,nation:binary,"
         + "current_date:date,current_ts:bigint,height:decimal(10,6),"
@@ -23,11 +23,14 @@ import org.apache.hudi.avro.HoodieAvroWriteSupport;
 import org.apache.hudi.common.bloom.BloomFilter;
 import org.apache.hudi.common.bloom.BloomFilterFactory;
 import org.apache.hudi.common.bloom.BloomFilterTypeCode;
+import org.apache.hudi.common.config.TypedProperties;
 import org.apache.hudi.common.model.HoodieKey;
 import org.apache.hudi.common.model.HoodieRecord;
 import org.apache.hudi.common.testutils.HoodieCommonTestHarness;
 import org.apache.hudi.common.testutils.HoodieTestUtils;
+import org.apache.hudi.keygen.BaseKeyGenerator;
 
+import org.apache.avro.JsonProperties;
 import org.apache.avro.Schema;
 import org.apache.avro.generic.GenericData;
 import org.apache.avro.generic.GenericRecord;
@@ -50,6 +53,7 @@ import java.util.List;
 import java.util.Set;
 import java.util.UUID;
 
+import static org.apache.hudi.avro.HoodieAvroUtils.METADATA_FIELD_SCHEMA;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
@@ -151,6 +155,33 @@ public class TestParquetUtils extends HoodieCommonTestHarness {
     }
   }
 
+  @Test
+  public void testFetchRecordKeyPartitionPathVirtualKeysFromParquet() throws Exception {
+    List<String> rowKeys = new ArrayList<>();
+    List<HoodieKey> expected = new ArrayList<>();
+    String partitionPath = "path1";
+    for (int i = 0; i < 1000; i++) {
+      String rowKey = UUID.randomUUID().toString();
+      rowKeys.add(rowKey);
+      expected.add(new HoodieKey(rowKey, partitionPath));
+    }
+
+    String filePath = Paths.get(basePath, "test.parquet").toUri().toString();
+    Schema schema = getSchemaWithFields(Arrays.asList(new String[]{"abc", "def"}));
+    writeParquetFile(BloomFilterTypeCode.SIMPLE.name(), filePath, rowKeys, schema, true, partitionPath,
+        false, "abc", "def");
+
+    // Read and verify
+    List<HoodieKey> fetchedRows =
+        parquetUtils.fetchRecordKeyPartitionPath(HoodieTestUtils.getDefaultHadoopConf(), new Path(filePath),
+            Option.of(new TestBaseKeyGen("abc","def")));
+    assertEquals(rowKeys.size(), fetchedRows.size(), "Total count does not match");
+
+    for (HoodieKey entry : fetchedRows) {
+      assertTrue(expected.contains(entry), "Record key must be in the given filter");
+    }
+  }
+
   @Test
   public void testReadCounts() throws Exception {
     String filePath = Paths.get(basePath, "test.parquet").toUri().toString();
@@ -168,22 +199,73 @@ public class TestParquetUtils extends HoodieCommonTestHarness {
   }
 
   private void writeParquetFile(String typeCode, String filePath, List<String> rowKeys, Schema schema, boolean addPartitionPathField, String partitionPath) throws Exception {
+    writeParquetFile(typeCode, filePath, rowKeys, schema, addPartitionPathField, partitionPath,
+        true, null, null);
+  }
+
+  private void writeParquetFile(String typeCode, String filePath, List<String> rowKeys, Schema schema, boolean addPartitionPathField, String partitionPathValue,
+                                boolean useMetaFields, String recordFieldName, String partitionFieldName) throws Exception {
     // Write out a parquet file
     BloomFilter filter = BloomFilterFactory
         .createBloomFilter(1000, 0.0001, 10000, typeCode);
     HoodieAvroWriteSupport writeSupport =
-        new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, filter);
+        new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, Option.of(filter));
     ParquetWriter writer = new ParquetWriter(new Path(filePath), writeSupport, CompressionCodecName.GZIP,
         120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE);
     for (String rowKey : rowKeys) {
       GenericRecord rec = new GenericData.Record(schema);
-      rec.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, rowKey);
+      rec.put(useMetaFields ? HoodieRecord.RECORD_KEY_METADATA_FIELD : recordFieldName, rowKey);
       if (addPartitionPathField) {
-        rec.put(HoodieRecord.PARTITION_PATH_METADATA_FIELD, partitionPath);
+        rec.put(useMetaFields ? HoodieRecord.PARTITION_PATH_METADATA_FIELD : partitionFieldName, partitionPathValue);
       }
       writer.write(rec);
      writeSupport.add(rowKey);
     }
     writer.close();
   }
+
+  private static Schema getSchemaWithFields(List<String> fields) {
+    List<Schema.Field> toBeAddedFields = new ArrayList<>();
+    Schema recordSchema = Schema.createRecord("HoodieRecordKey", "", "", false);
+
+    for (String field: fields) {
+      Schema.Field schemaField =
+          new Schema.Field(field, METADATA_FIELD_SCHEMA, "", JsonProperties.NULL_VALUE);
+      toBeAddedFields.add(schemaField);
+    }
+    recordSchema.setFields(toBeAddedFields);
+    return recordSchema;
+  }
+
+  class TestBaseKeyGen extends BaseKeyGenerator {
+
+    private String recordKeyField;
+    private String partitionField;
+
+    public TestBaseKeyGen(String recordKeyField, String partitionField) {
+      super(new TypedProperties());
+      this.recordKeyField = recordKeyField;
+      this.partitionField = partitionField;
+    }
+
+    @Override
+    public String getRecordKey(GenericRecord record) {
+      return record.get(recordKeyField).toString();
+    }
+
+    @Override
+    public String getPartitionPath(GenericRecord record) {
+      return record.get(partitionField).toString();
+    }
+
+    @Override
+    public List<String> getRecordKeyFields() {
+      return Arrays.asList(new String[]{recordKeyField});
+    }
+
+    @Override
+    public List<String> getPartitionPathFields() {
+      return Arrays.asList(new String[]{partitionField});
    }
+  }
 }
@@ -562,7 +562,7 @@ public class TestBootstrap extends HoodieClientTestBase {
     final List<String> records = new ArrayList<>();
     IntStream.range(from, to).forEach(i -> {
       String id = "" + i;
-      records.add(generateGenericRecord("trip_" + id, "rider_" + id, "driver_" + id,
+      records.add(generateGenericRecord("trip_" + id, Long.toString(timestamp), "rider_" + id, "driver_" + id,
          timestamp, false, false).toString());
     });
     if (isPartitioned) {
@@ -29,6 +29,8 @@ import org.apache.hudi.exception.{HoodieException, HoodieKeyException}
 import org.apache.hudi.keygen._
 import org.apache.hudi.testutils.KeyGeneratorTestUtilities
 import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.types.StructType
 import org.junit.jupiter.api.Assertions.assertEquals
 import org.junit.jupiter.api.{BeforeEach, Test}
 import org.scalatest.Assertions.fail
@@ -259,6 +261,8 @@ class TestDataSourceDefaults {
       val genericRecord = converterFn.apply(row).asInstanceOf[GenericRecord]
       getKey(genericRecord).getPartitionPath
     }
+
+    override def getPartitionPath(internalRow: InternalRow, structType: StructType): String = null
   }
 
   @Test def testComplexKeyGenerator() = {
@@ -392,18 +392,19 @@ class HoodieSparkSqlWriterSuite extends FunSuite with Matchers {
     }
   }
 
-  List((DataSourceWriteOptions.COW_TABLE_TYPE_OPT_VAL, HoodieFileFormat.PARQUET.name()), (DataSourceWriteOptions.COW_TABLE_TYPE_OPT_VAL, HoodieFileFormat.ORC.name()),
-    (DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL, HoodieFileFormat.PARQUET.name()), (DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL, HoodieFileFormat.ORC.name()))
+  List((DataSourceWriteOptions.COW_TABLE_TYPE_OPT_VAL, HoodieFileFormat.PARQUET.name(), true), (DataSourceWriteOptions.COW_TABLE_TYPE_OPT_VAL, HoodieFileFormat.ORC.name(), true),
+    (DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL, HoodieFileFormat.PARQUET.name(), true), (DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL, HoodieFileFormat.ORC.name(), true),
+    (DataSourceWriteOptions.COW_TABLE_TYPE_OPT_VAL, HoodieFileFormat.PARQUET.name(), false))
     .foreach(t => {
       val tableType = t._1
       val baseFileFormat = t._2
-      test("test basic HoodieSparkSqlWriter functionality with datasource insert for " + tableType + " with " + baseFileFormat + "as the base file format") {
-        initSparkContext("test_insert_datasource")
+      val populateMetaFields = t._3
+      test("test basic HoodieSparkSqlWriter functionality with datasource insert for " + tableType + " with " + baseFileFormat + " as the base file format "
+        + " with populate meta fields " + populateMetaFields) {
+        initSparkContext("test_insert_base_file_format_datasource")
         val path = java.nio.file.Files.createTempDirectory("hoodie_test_path")
         try {
 
           val hoodieFooTableName = "hoodie_foo_tbl"
 
           //create a new table
           val fooTableModifier = Map("path" -> path.toAbsolutePath.toString,
             HoodieWriteConfig.TABLE_NAME.key -> hoodieFooTableName,
@@ -413,6 +414,7 @@ class HoodieSparkSqlWriterSuite extends FunSuite with Matchers {
             DataSourceWriteOptions.OPERATION_OPT_KEY.key -> DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL,
             DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY.key -> "_row_key",
             DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key -> "partition",
+            HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.key() -> String.valueOf(populateMetaFields),
             DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY.key -> classOf[SimpleKeyGenerator].getCanonicalName)
           val fooTableParams = HoodieWriterUtils.parametersWithWriteDefaults(fooTableModifier)
 
@@ -40,6 +40,7 @@ import org.apache.hudi.common.table.timeline.HoodieTimeline;
 import org.apache.hudi.common.testutils.SchemaTestUtil;
 import org.apache.hudi.common.testutils.minicluster.ZookeeperTestService;
 import org.apache.hudi.common.util.FileIOUtils;
+import org.apache.hudi.common.util.Option;
 import org.apache.hudi.hive.HiveSyncConfig;
 import org.apache.hudi.hive.HiveSyncTool;
 import org.apache.hudi.hive.ddl.HiveQueryDDLExecutor;
@@ -351,7 +352,7 @@ public class HiveTestUtil {
     org.apache.parquet.schema.MessageType parquetSchema = new AvroSchemaConverter().convert(schema);
     BloomFilter filter = BloomFilterFactory.createBloomFilter(1000, 0.0001, -1,
         BloomFilterTypeCode.SIMPLE.name());
-    HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(parquetSchema, schema, filter);
+    HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(parquetSchema, schema, Option.of(filter));
     ParquetWriter writer = new ParquetWriter(filePath, writeSupport, CompressionCodecName.GZIP, 120 * 1024 * 1024,
         ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED,
         ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED, ParquetWriter.DEFAULT_WRITER_VERSION, fileSystem.getConf());
@@ -373,7 +374,7 @@ public class HiveTestUtil {
     org.apache.parquet.schema.MessageType parquetSchema = new AvroSchemaConverter().convert(schema);
     BloomFilter filter = BloomFilterFactory.createBloomFilter(1000, 0.0001, -1,
         BloomFilterTypeCode.SIMPLE.name());
-    HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(parquetSchema, schema, filter);
+    HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(parquetSchema, schema, Option.of(filter));
     ParquetWriter writer = new ParquetWriter(filePath, writeSupport, CompressionCodecName.GZIP, 120 * 1024 * 1024,
         ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED,
         ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED, ParquetWriter.DEFAULT_WRITER_VERSION, fileSystem.getConf());
@@ -32,6 +32,7 @@ import org.apache.hudi.common.table.timeline.HoodieTimeline;
 import org.apache.hudi.common.testutils.SchemaTestUtil;
 import org.apache.hudi.common.testutils.minicluster.HdfsTestService;
 import org.apache.hudi.common.util.FileIOUtils;
+import org.apache.hudi.common.util.Option;
 
 import org.apache.avro.Schema;
 import org.apache.avro.generic.IndexedRecord;
@@ -226,7 +227,7 @@ public class TestCluster implements BeforeAllCallback, AfterAllCallback,
     org.apache.parquet.schema.MessageType parquetSchema = new AvroSchemaConverter().convert(schema);
     BloomFilter filter = BloomFilterFactory.createBloomFilter(1000, 0.0001, -1,
         BloomFilterTypeCode.SIMPLE.name());
-    HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(parquetSchema, schema, filter);
+    HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(parquetSchema, schema, Option.of(filter));
     ParquetWriter writer = new ParquetWriter(filePath, writeSupport, CompressionCodecName.GZIP, 120 * 1024 * 1024,
         ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED,
         ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED, ParquetWriter.DEFAULT_WRITER_VERSION, dfsCluster.getFileSystem().getConf());
@@ -231,7 +231,7 @@ public class TestHDFSParquetImporter extends FunctionalTestHarness implements Se
     long startTime = HoodieActiveTimeline.COMMIT_FORMATTER.parse("20170203000000").getTime() / 1000;
     List<GenericRecord> records = new ArrayList<GenericRecord>();
     for (long recordNum = 0; recordNum < 96; recordNum++) {
-      records.add(HoodieTestDataGenerator.generateGenericRecord(Long.toString(recordNum), "rider-" + recordNum,
+      records.add(HoodieTestDataGenerator.generateGenericRecord(Long.toString(recordNum), "0", "rider-" + recordNum,
          "driver-" + recordNum, startTime + TimeUnit.HOURS.toSeconds(recordNum)));
     }
     try (ParquetWriter<GenericRecord> writer = AvroParquetWriter.<GenericRecord>builder(srcFile)
@@ -249,12 +249,12 @@ public class TestHDFSParquetImporter extends FunctionalTestHarness implements Se
     List<GenericRecord> records = new ArrayList<GenericRecord>();
     // 10 for update
     for (long recordNum = 0; recordNum < 11; recordNum++) {
-      records.add(HoodieTestDataGenerator.generateGenericRecord(Long.toString(recordNum), "rider-upsert-" + recordNum,
+      records.add(HoodieTestDataGenerator.generateGenericRecord(Long.toString(recordNum), "0", "rider-upsert-" + recordNum,
          "driver-upsert" + recordNum, startTime + TimeUnit.HOURS.toSeconds(recordNum)));
     }
     // 4 for insert
     for (long recordNum = 96; recordNum < 100; recordNum++) {
-      records.add(HoodieTestDataGenerator.generateGenericRecord(Long.toString(recordNum), "rider-upsert-" + recordNum,
+      records.add(HoodieTestDataGenerator.generateGenericRecord(Long.toString(recordNum), "0", "rider-upsert-" + recordNum,
          "driver-upsert" + recordNum, startTime + TimeUnit.HOURS.toSeconds(recordNum)));
     }
     try (ParquetWriter<GenericRecord> writer = AvroParquetWriter.<GenericRecord>builder(srcFile)
@@ -25,6 +25,9 @@
   }, {
     "name" : "_row_key",
     "type" : "string"
+  }, {
+    "name" : "partition_path",
+    "type" : "string"
   }, {
     "name" : "rider",
     "type" : "string"
@@ -25,6 +25,9 @@
   }, {
     "name" : "_row_key",
     "type" : "string"
+  }, {
+    "name" : "partition_path",
+    "type" : "string"
   }, {
     "name" : "rider",
     "type" : "string"
@@ -16,4 +16,4 @@
 # limitations under the License.
 ###
 include=base.properties
-hoodie.deltastreamer.transformer.sql=SELECT a.timestamp, a._row_key, a.rider, a.driver, a.begin_lat, a.begin_lon, a.end_lat, a.end_lon, a.distance_in_meters, a.seconds_since_epoch, a.weight, a.nation, a.current_date, a.current_ts, a.height, a.city_to_state, a.fare, a.tip_history, a.`_hoodie_is_deleted`, CAST(1.0 AS DOUBLE) AS haversine_distance FROM <SRC> a
+hoodie.deltastreamer.transformer.sql=SELECT a.timestamp, a._row_key, a.partition_path, a.rider, a.driver, a.begin_lat, a.begin_lon, a.end_lat, a.end_lon, a.distance_in_meters, a.seconds_since_epoch, a.weight, a.nation, a.current_date, a.current_ts, a.height, a.city_to_state, a.fare, a.tip_history, a.`_hoodie_is_deleted`, CAST(1.0 AS DOUBLE) AS haversine_distance FROM <SRC> a
@@ -25,6 +25,9 @@
   }, {
     "name" : "_row_key",
     "type" : "string"
+  }, {
+    "name" : "partition_path",
+    "type" : "string"
   }, {
     "name" : "rider",
     "type" : "string"
@@ -25,6 +25,9 @@
   }, {
     "name" : "_row_key",
     "type" : "string"
+  }, {
+    "name" : "partition_path",
+    "type" : "string"
   }, {
     "name" : "rider",
     "type" : "string"