1
0

[HUDI-650] Modify handleUpdate path to validate partitionPath (#1368)

This commit is contained in:
satishkotha
2020-03-20 08:37:22 -07:00
committed by GitHub
parent eeab532d79
commit 83fb9651f3
12 changed files with 170 additions and 61 deletions

View File

@@ -52,7 +52,8 @@ public class MergeOnReadLazyInsertIterable<T extends HoodieRecordPayload> extend
List<WriteStatus> statuses = new ArrayList<>(); List<WriteStatus> statuses = new ArrayList<>();
// lazily initialize the handle, for the first time // lazily initialize the handle, for the first time
if (handle == null) { if (handle == null) {
handle = new HoodieAppendHandle(hoodieConfig, commitTime, hoodieTable, getNextFileId(idPrefix)); handle = new HoodieAppendHandle(hoodieConfig, commitTime, hoodieTable,
insertPayload.getPartitionPath(), getNextFileId(idPrefix));
} }
if (handle.canWrite(insertPayload)) { if (handle.canWrite(insertPayload)) {
// write the payload, if the handle has capacity // write the payload, if the handle has capacity
@@ -62,7 +63,8 @@ public class MergeOnReadLazyInsertIterable<T extends HoodieRecordPayload> extend
handle.close(); handle.close();
statuses.add(handle.getWriteStatus()); statuses.add(handle.getWriteStatus());
// Need to handle the rejected payload & open new handle // Need to handle the rejected payload & open new handle
handle = new HoodieAppendHandle(hoodieConfig, commitTime, hoodieTable, getNextFileId(idPrefix)); handle = new HoodieAppendHandle(hoodieConfig, commitTime, hoodieTable,
insertPayload.getPartitionPath(), getNextFileId(idPrefix));
handle.write(insertPayload, payload.insertValue, payload.exception); // we should be able to write 1 payload. handle.write(insertPayload, payload.insertValue, payload.exception); // we should be able to write 1 payload.
} }
} }

View File

@@ -74,7 +74,6 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieWri
// Buffer for holding records (to be deleted) in memory before they are flushed to disk // Buffer for holding records (to be deleted) in memory before they are flushed to disk
private List<HoodieKey> keysToDelete = new ArrayList<>(); private List<HoodieKey> keysToDelete = new ArrayList<>();
private String partitionPath;
private Iterator<HoodieRecord<T>> recordItr; private Iterator<HoodieRecord<T>> recordItr;
// Total number of records written during an append // Total number of records written during an append
private long recordsWritten = 0; private long recordsWritten = 0;
@@ -101,21 +100,21 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieWri
// Total number of new records inserted into the delta file // Total number of new records inserted into the delta file
private long insertRecordsWritten = 0; private long insertRecordsWritten = 0;
public HoodieAppendHandle(HoodieWriteConfig config, String commitTime, HoodieTable<T> hoodieTable, String fileId, public HoodieAppendHandle(HoodieWriteConfig config, String commitTime, HoodieTable<T> hoodieTable,
Iterator<HoodieRecord<T>> recordItr) { String partitionPath, String fileId, Iterator<HoodieRecord<T>> recordItr) {
super(config, commitTime, fileId, hoodieTable); super(config, commitTime, partitionPath, fileId, hoodieTable);
writeStatus.setStat(new HoodieDeltaWriteStat()); writeStatus.setStat(new HoodieDeltaWriteStat());
this.fileId = fileId; this.fileId = fileId;
this.recordItr = recordItr; this.recordItr = recordItr;
} }
public HoodieAppendHandle(HoodieWriteConfig config, String commitTime, HoodieTable<T> hoodieTable, String fileId) { public HoodieAppendHandle(HoodieWriteConfig config, String commitTime, HoodieTable<T> hoodieTable,
this(config, commitTime, hoodieTable, fileId, null); String partitionPath, String fileId) {
this(config, commitTime, hoodieTable, partitionPath, fileId, null);
} }
private void init(HoodieRecord record) { private void init(HoodieRecord record) {
if (doInit) { if (doInit) {
this.partitionPath = record.getPartitionPath();
// extract some information from the first record // extract some information from the first record
SliceView rtView = hoodieTable.getSliceView(); SliceView rtView = hoodieTable.getSliceView();
Option<FileSlice> fileSlice = rtView.getLatestFileSlice(partitionPath, fileId); Option<FileSlice> fileSlice = rtView.getLatestFileSlice(partitionPath, fileId);
@@ -295,6 +294,13 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieWri
} }
private void writeToBuffer(HoodieRecord<T> record) { private void writeToBuffer(HoodieRecord<T> record) {
if (!partitionPath.equals(record.getPartitionPath())) {
HoodieUpsertException failureEx = new HoodieUpsertException("mismatched partition path, record partition: "
+ record.getPartitionPath() + " but trying to insert into partition: " + partitionPath);
writeStatus.markFailure(record, failureEx, record.getData().getMetadata());
return;
}
// update the new location of the record, so we know where to find it next // update the new location of the record, so we know where to find it next
record.unseal(); record.unseal();
record.setNewLocation(new HoodieRecordLocation(instantTime, fileId)); record.setNewLocation(new HoodieRecordLocation(instantTime, fileId));

View File

@@ -57,7 +57,7 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieWri
public HoodieCreateHandle(HoodieWriteConfig config, String commitTime, HoodieTable<T> hoodieTable, public HoodieCreateHandle(HoodieWriteConfig config, String commitTime, HoodieTable<T> hoodieTable,
String partitionPath, String fileId) { String partitionPath, String fileId) {
super(config, commitTime, fileId, hoodieTable); super(config, commitTime, partitionPath, fileId, hoodieTable);
writeStatus.setFileId(fileId); writeStatus.setFileId(fileId);
writeStatus.setPartitionPath(partitionPath); writeStatus.setPartitionPath(partitionPath);

View File

@@ -70,9 +70,9 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
private boolean useWriterSchema; private boolean useWriterSchema;
public HoodieMergeHandle(HoodieWriteConfig config, String commitTime, HoodieTable<T> hoodieTable, public HoodieMergeHandle(HoodieWriteConfig config, String commitTime, HoodieTable<T> hoodieTable,
Iterator<HoodieRecord<T>> recordItr, String fileId) { Iterator<HoodieRecord<T>> recordItr, String partitionPath, String fileId) {
super(config, commitTime, fileId, hoodieTable); super(config, commitTime, partitionPath, fileId, hoodieTable);
String partitionPath = init(fileId, recordItr); init(fileId, recordItr);
init(fileId, partitionPath, hoodieTable.getBaseFileOnlyView().getLatestBaseFile(partitionPath, fileId).get()); init(fileId, partitionPath, hoodieTable.getBaseFileOnlyView().getLatestBaseFile(partitionPath, fileId).get());
} }
@@ -80,12 +80,12 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
* Called by compactor code path. * Called by compactor code path.
*/ */
public HoodieMergeHandle(HoodieWriteConfig config, String commitTime, HoodieTable<T> hoodieTable, public HoodieMergeHandle(HoodieWriteConfig config, String commitTime, HoodieTable<T> hoodieTable,
Map<String, HoodieRecord<T>> keyToNewRecords, String fileId, HoodieBaseFile dataFileToBeMerged) { Map<String, HoodieRecord<T>> keyToNewRecords, String partitionPath, String fileId,
super(config, commitTime, fileId, hoodieTable); HoodieBaseFile dataFileToBeMerged) {
super(config, commitTime, partitionPath, fileId, hoodieTable);
this.keyToNewRecords = keyToNewRecords; this.keyToNewRecords = keyToNewRecords;
this.useWriterSchema = true; this.useWriterSchema = true;
init(fileId, keyToNewRecords.get(keyToNewRecords.keySet().stream().findFirst().get()).getPartitionPath(), init(fileId, this.partitionPath, dataFileToBeMerged);
dataFileToBeMerged);
} }
public static Schema createHoodieWriteSchema(Schema originalSchema) { public static Schema createHoodieWriteSchema(Schema originalSchema) {
@@ -143,7 +143,7 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
/** /**
* Load the new incoming records in a map and return partitionPath. * Load the new incoming records in a map and return partitionPath.
*/ */
private String init(String fileId, Iterator<HoodieRecord<T>> newRecordsItr) { private void init(String fileId, Iterator<HoodieRecord<T>> newRecordsItr) {
try { try {
// Load the new records in a map // Load the new records in a map
long memoryForMerge = config.getMaxMemoryPerPartitionMerge(); long memoryForMerge = config.getMaxMemoryPerPartitionMerge();
@@ -153,10 +153,8 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
} catch (IOException io) { } catch (IOException io) {
throw new HoodieIOException("Cannot instantiate an ExternalSpillableMap", io); throw new HoodieIOException("Cannot instantiate an ExternalSpillableMap", io);
} }
String partitionPath = null;
while (newRecordsItr.hasNext()) { while (newRecordsItr.hasNext()) {
HoodieRecord<T> record = newRecordsItr.next(); HoodieRecord<T> record = newRecordsItr.next();
partitionPath = record.getPartitionPath();
// update the new location of the record, so we know where to find it next // update the new location of the record, so we know where to find it next
record.unseal(); record.unseal();
record.setNewLocation(new HoodieRecordLocation(instantTime, fileId)); record.setNewLocation(new HoodieRecordLocation(instantTime, fileId));
@@ -170,7 +168,6 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
+ ((ExternalSpillableMap) keyToNewRecords).getCurrentInMemoryMapSize() + "Number of entries in DiskBasedMap => " + ((ExternalSpillableMap) keyToNewRecords).getCurrentInMemoryMapSize() + "Number of entries in DiskBasedMap => "
+ ((ExternalSpillableMap) keyToNewRecords).getDiskBasedMapNumEntries() + "Size of file spilled to disk => " + ((ExternalSpillableMap) keyToNewRecords).getDiskBasedMapNumEntries() + "Size of file spilled to disk => "
+ ((ExternalSpillableMap) keyToNewRecords).getSizeOfFileOnDiskInBytes()); + ((ExternalSpillableMap) keyToNewRecords).getSizeOfFileOnDiskInBytes());
return partitionPath;
} }
private boolean writeUpdateRecord(HoodieRecord<T> hoodieRecord, Option<IndexedRecord> indexedRecord) { private boolean writeUpdateRecord(HoodieRecord<T> hoodieRecord, Option<IndexedRecord> indexedRecord) {
@@ -182,6 +179,12 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
private boolean writeRecord(HoodieRecord<T> hoodieRecord, Option<IndexedRecord> indexedRecord) { private boolean writeRecord(HoodieRecord<T> hoodieRecord, Option<IndexedRecord> indexedRecord) {
Option recordMetadata = hoodieRecord.getData().getMetadata(); Option recordMetadata = hoodieRecord.getData().getMetadata();
if (!partitionPath.equals(hoodieRecord.getPartitionPath())) {
HoodieUpsertException failureEx = new HoodieUpsertException("mismatched partition path, record partition: "
+ hoodieRecord.getPartitionPath() + " but trying to insert into partition: " + partitionPath);
writeStatus.markFailure(hoodieRecord, failureEx, recordMetadata);
return false;
}
try { try {
if (indexedRecord.isPresent()) { if (indexedRecord.isPresent()) {
// Convert GenericRecord to GenericRecord with hoodie commit metadata in schema // Convert GenericRecord to GenericRecord with hoodie commit metadata in schema

View File

@@ -52,11 +52,14 @@ public abstract class HoodieWriteHandle<T extends HoodieRecordPayload> extends H
protected final Schema writerSchema; protected final Schema writerSchema;
protected HoodieTimer timer; protected HoodieTimer timer;
protected final WriteStatus writeStatus; protected final WriteStatus writeStatus;
protected final String partitionPath;
protected final String fileId; protected final String fileId;
protected final String writeToken; protected final String writeToken;
public HoodieWriteHandle(HoodieWriteConfig config, String instantTime, String fileId, HoodieTable<T> hoodieTable) { public HoodieWriteHandle(HoodieWriteConfig config, String instantTime, String partitionPath,
String fileId, HoodieTable<T> hoodieTable) {
super(config, instantTime, hoodieTable); super(config, instantTime, hoodieTable);
this.partitionPath = partitionPath;
this.fileId = fileId; this.fileId = fileId;
this.writeToken = makeSparkWriteToken(); this.writeToken = makeSparkWriteToken();
this.originalSchema = new Schema.Parser().parse(config.getSchema()); this.originalSchema = new Schema.Parser().parse(config.getSchema());

View File

@@ -170,7 +170,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
throw new HoodieNotSupportedException("Compaction is not supported from a CopyOnWrite table"); throw new HoodieNotSupportedException("Compaction is not supported from a CopyOnWrite table");
} }
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileId, Iterator<HoodieRecord<T>> recordItr) public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String partitionPath, String fileId,
Iterator<HoodieRecord<T>> recordItr)
throws IOException { throws IOException {
// This is needed since sometimes some buckets are never picked in getPartition() and end up with 0 records // This is needed since sometimes some buckets are never picked in getPartition() and end up with 0 records
if (!recordItr.hasNext()) { if (!recordItr.hasNext()) {
@@ -178,14 +179,14 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
return Collections.singletonList((List<WriteStatus>) Collections.EMPTY_LIST).iterator(); return Collections.singletonList((List<WriteStatus>) Collections.EMPTY_LIST).iterator();
} }
// these are updates // these are updates
HoodieMergeHandle upsertHandle = getUpdateHandle(commitTime, fileId, recordItr); HoodieMergeHandle upsertHandle = getUpdateHandle(commitTime, partitionPath, fileId, recordItr);
return handleUpdateInternal(upsertHandle, commitTime, fileId); return handleUpdateInternal(upsertHandle, commitTime, fileId);
} }
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileId, public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String partitionPath, String fileId,
Map<String, HoodieRecord<T>> keyToNewRecords, HoodieBaseFile oldDataFile) throws IOException { Map<String, HoodieRecord<T>> keyToNewRecords, HoodieBaseFile oldDataFile) throws IOException {
// these are updates // these are updates
HoodieMergeHandle upsertHandle = getUpdateHandle(commitTime, fileId, keyToNewRecords, oldDataFile); HoodieMergeHandle upsertHandle = getUpdateHandle(commitTime, partitionPath, fileId, keyToNewRecords, oldDataFile);
return handleUpdateInternal(upsertHandle, commitTime, fileId); return handleUpdateInternal(upsertHandle, commitTime, fileId);
} }
@@ -220,13 +221,14 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
return Collections.singletonList(Collections.singletonList(upsertHandle.getWriteStatus())).iterator(); return Collections.singletonList(Collections.singletonList(upsertHandle.getWriteStatus())).iterator();
} }
protected HoodieMergeHandle getUpdateHandle(String commitTime, String fileId, Iterator<HoodieRecord<T>> recordItr) { protected HoodieMergeHandle getUpdateHandle(String commitTime, String partitionPath, String fileId, Iterator<HoodieRecord<T>> recordItr) {
return new HoodieMergeHandle<>(config, commitTime, this, recordItr, fileId); return new HoodieMergeHandle<>(config, commitTime, this, recordItr, partitionPath, fileId);
} }
protected HoodieMergeHandle getUpdateHandle(String commitTime, String fileId, protected HoodieMergeHandle getUpdateHandle(String commitTime, String partitionPath, String fileId,
Map<String, HoodieRecord<T>> keyToNewRecords, HoodieBaseFile dataFileToBeMerged) { Map<String, HoodieRecord<T>> keyToNewRecords, HoodieBaseFile dataFileToBeMerged) {
return new HoodieMergeHandle<>(config, commitTime, this, keyToNewRecords, fileId, dataFileToBeMerged); return new HoodieMergeHandle<>(config, commitTime, this, keyToNewRecords,
partitionPath, fileId, dataFileToBeMerged);
} }
public Iterator<List<WriteStatus>> handleInsert(String commitTime, String idPfx, Iterator<HoodieRecord<T>> recordItr) public Iterator<List<WriteStatus>> handleInsert(String commitTime, String idPfx, Iterator<HoodieRecord<T>> recordItr)
@@ -258,7 +260,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
if (btype.equals(BucketType.INSERT)) { if (btype.equals(BucketType.INSERT)) {
return handleInsert(commitTime, binfo.fileIdPrefix, recordItr); return handleInsert(commitTime, binfo.fileIdPrefix, recordItr);
} else if (btype.equals(BucketType.UPDATE)) { } else if (btype.equals(BucketType.UPDATE)) {
return handleUpdate(commitTime, binfo.fileIdPrefix, recordItr); return handleUpdate(commitTime, binfo.partitionPath, binfo.fileIdPrefix, recordItr);
} else { } else {
throw new HoodieUpsertException("Unknown bucketType " + btype + " for partition :" + partition); throw new HoodieUpsertException("Unknown bucketType " + btype + " for partition :" + partition);
} }
@@ -523,12 +525,14 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
BucketType bucketType; BucketType bucketType;
String fileIdPrefix; String fileIdPrefix;
String partitionPath;
@Override @Override
public String toString() { public String toString() {
final StringBuilder sb = new StringBuilder("BucketInfo {"); final StringBuilder sb = new StringBuilder("BucketInfo {");
sb.append("bucketType=").append(bucketType).append(", "); sb.append("bucketType=").append(bucketType).append(", ");
sb.append("fileIdPrefix=").append(fileIdPrefix); sb.append("fileIdPrefix=").append(fileIdPrefix).append(", ");
sb.append("partitionPath=").append(partitionPath);
sb.append('}'); sb.append('}');
return sb.toString(); return sb.toString();
} }
@@ -585,18 +589,22 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
private void assignUpdates(WorkloadProfile profile) { private void assignUpdates(WorkloadProfile profile) {
// each update location gets a partition // each update location gets a partition
WorkloadStat gStat = profile.getGlobalStat(); Set<Map.Entry<String, WorkloadStat>> partitionStatEntries = profile.getPartitionPathStatMap().entrySet();
for (Map.Entry<String, Pair<String, Long>> updateLocEntry : gStat.getUpdateLocationToCount().entrySet()) { for (Map.Entry<String, WorkloadStat> partitionStat : partitionStatEntries) {
addUpdateBucket(updateLocEntry.getKey()); for (Map.Entry<String, Pair<String, Long>> updateLocEntry :
partitionStat.getValue().getUpdateLocationToCount().entrySet()) {
addUpdateBucket(partitionStat.getKey(), updateLocEntry.getKey());
}
} }
} }
private int addUpdateBucket(String fileIdHint) { private int addUpdateBucket(String partitionPath, String fileIdHint) {
int bucket = totalBuckets; int bucket = totalBuckets;
updateLocationToBucket.put(fileIdHint, bucket); updateLocationToBucket.put(fileIdHint, bucket);
BucketInfo bucketInfo = new BucketInfo(); BucketInfo bucketInfo = new BucketInfo();
bucketInfo.bucketType = BucketType.UPDATE; bucketInfo.bucketType = BucketType.UPDATE;
bucketInfo.fileIdPrefix = fileIdHint; bucketInfo.fileIdPrefix = fileIdHint;
bucketInfo.partitionPath = partitionPath;
bucketInfoMap.put(totalBuckets, bucketInfo); bucketInfoMap.put(totalBuckets, bucketInfo);
totalBuckets++; totalBuckets++;
return bucket; return bucket;
@@ -631,7 +639,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
bucket = updateLocationToBucket.get(smallFile.location.getFileId()); bucket = updateLocationToBucket.get(smallFile.location.getFileId());
LOG.info("Assigning " + recordsToAppend + " inserts to existing update bucket " + bucket); LOG.info("Assigning " + recordsToAppend + " inserts to existing update bucket " + bucket);
} else { } else {
bucket = addUpdateBucket(smallFile.location.getFileId()); bucket = addUpdateBucket(partitionPath, smallFile.location.getFileId());
LOG.info("Assigning " + recordsToAppend + " inserts to new update bucket " + bucket); LOG.info("Assigning " + recordsToAppend + " inserts to new update bucket " + bucket);
} }
bucketNumbers.add(bucket); bucketNumbers.add(bucket);
@@ -655,6 +663,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
recordsPerBucket.add(totalUnassignedInserts / insertBuckets); recordsPerBucket.add(totalUnassignedInserts / insertBuckets);
BucketInfo bucketInfo = new BucketInfo(); BucketInfo bucketInfo = new BucketInfo();
bucketInfo.bucketType = BucketType.INSERT; bucketInfo.bucketType = BucketType.INSERT;
bucketInfo.partitionPath = partitionPath;
bucketInfo.fileIdPrefix = FSUtils.createNewFileIdPfx(); bucketInfo.fileIdPrefix = FSUtils.createNewFileIdPfx();
bucketInfoMap.put(totalBuckets, bucketInfo); bucketInfoMap.put(totalBuckets, bucketInfo);
totalBuckets++; totalBuckets++;

View File

@@ -98,15 +98,17 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends Hoodi
} }
@Override @Override
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileId, Iterator<HoodieRecord<T>> recordItr) public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String partitionPath,
String fileId, Iterator<HoodieRecord<T>> recordItr)
throws IOException { throws IOException {
LOG.info("Merging updates for commit " + commitTime + " for file " + fileId); LOG.info("Merging updates for commit " + commitTime + " for file " + fileId);
if (!index.canIndexLogFiles() && mergeOnReadUpsertPartitioner.getSmallFileIds().contains(fileId)) { if (!index.canIndexLogFiles() && mergeOnReadUpsertPartitioner.getSmallFileIds().contains(fileId)) {
LOG.info("Small file corrections for updates for commit " + commitTime + " for file " + fileId); LOG.info("Small file corrections for updates for commit " + commitTime + " for file " + fileId);
return super.handleUpdate(commitTime, fileId, recordItr); return super.handleUpdate(commitTime, partitionPath, fileId, recordItr);
} else { } else {
HoodieAppendHandle<T> appendHandle = new HoodieAppendHandle<>(config, commitTime, this, fileId, recordItr); HoodieAppendHandle<T> appendHandle = new HoodieAppendHandle<>(config, commitTime, this,
partitionPath, fileId, recordItr);
appendHandle.doAppend(); appendHandle.doAppend();
appendHandle.close(); appendHandle.close();
return Collections.singletonList(Collections.singletonList(appendHandle.getWriteStatus())).iterator(); return Collections.singletonList(Collections.singletonList(appendHandle.getWriteStatus())).iterator();

View File

@@ -95,6 +95,10 @@ public class WorkloadProfile<T extends HoodieRecordPayload> implements Serializa
return partitionPathStatMap.keySet(); return partitionPathStatMap.keySet();
} }
public HashMap<String, WorkloadStat> getPartitionPathStatMap() {
return partitionPathStatMap;
}
public WorkloadStat getWorkloadStat(String partitionPath) { public WorkloadStat getWorkloadStat(String partitionPath) {
return partitionPathStatMap.get(partitionPath); return partitionPathStatMap.get(partitionPath);
} }

View File

@@ -136,7 +136,8 @@ public class HoodieMergeOnReadTableCompactor implements HoodieCompactor {
// If the dataFile is present, there is a base parquet file present, perform updates else perform inserts into a // If the dataFile is present, there is a base parquet file present, perform updates else perform inserts into a
// new base parquet file. // new base parquet file.
if (oldDataFileOpt.isPresent()) { if (oldDataFileOpt.isPresent()) {
result = hoodieCopyOnWriteTable.handleUpdate(commitTime, operation.getFileId(), scanner.getRecords(), result = hoodieCopyOnWriteTable.handleUpdate(commitTime, operation.getPartitionPath(),
operation.getFileId(), scanner.getRecords(),
oldDataFileOpt.get()); oldDataFileOpt.get());
} else { } else {
result = hoodieCopyOnWriteTable.handleInsert(commitTime, operation.getPartitionPath(), operation.getFileId(), result = hoodieCopyOnWriteTable.handleInsert(commitTime, operation.getPartitionPath(), operation.getFileId(),

View File

@@ -118,7 +118,8 @@ public class TestUpdateSchemaEvolution extends HoodieClientTestHarness {
updateRecords.add(record1); updateRecords.add(record1);
try { try {
HoodieMergeHandle mergeHandle = new HoodieMergeHandle(config2, "101", table2, updateRecords.iterator(), fileId); HoodieMergeHandle mergeHandle = new HoodieMergeHandle(config2, "101", table2,
updateRecords.iterator(), record1.getPartitionPath(), fileId);
Configuration conf = new Configuration(); Configuration conf = new Configuration();
AvroReadSupport.setAvroReadSchema(conf, mergeHandle.getWriterSchema()); AvroReadSupport.setAvroReadSchema(conf, mergeHandle.getWriterSchema());
List<GenericRecord> oldRecords = ParquetUtils.readAvroRecords(conf, List<GenericRecord> oldRecords = ParquetUtils.readAvroRecords(conf,

View File

@@ -18,8 +18,10 @@
package org.apache.hudi.table; package org.apache.hudi.table;
import org.apache.hudi.common.HoodieClientTestHarness; import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.HoodieClientTestHarness;
import org.apache.hudi.common.HoodieClientTestUtils; import org.apache.hudi.common.HoodieClientTestUtils;
import org.apache.hudi.common.HoodieTestDataGenerator; import org.apache.hudi.common.HoodieTestDataGenerator;
import org.apache.hudi.common.TestRawTripPayload; import org.apache.hudi.common.TestRawTripPayload;
@@ -41,9 +43,6 @@ import org.apache.hudi.config.HoodieStorageConfig;
import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.io.HoodieCreateHandle; import org.apache.hudi.io.HoodieCreateHandle;
import org.apache.hudi.table.HoodieCopyOnWriteTable.UpsertPartitioner; import org.apache.hudi.table.HoodieCopyOnWriteTable.UpsertPartitioner;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager; import org.apache.log4j.LogManager;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.apache.parquet.avro.AvroReadSupport; import org.apache.parquet.avro.AvroReadSupport;
@@ -53,6 +52,7 @@ import org.junit.After;
import org.junit.Assert; import org.junit.Assert;
import org.junit.Before; import org.junit.Before;
import org.junit.Test; import org.junit.Test;
import scala.Tuple2;
import java.io.File; import java.io.File;
import java.util.ArrayList; import java.util.ArrayList;
@@ -61,8 +61,6 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.UUID; import java.util.UUID;
import scala.Tuple2;
import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue; import static org.junit.Assert.assertTrue;
@@ -209,8 +207,8 @@ public class TestCopyOnWriteTable extends HoodieClientTestHarness {
metaClient = HoodieTableMetaClient.reload(metaClient); metaClient = HoodieTableMetaClient.reload(metaClient);
final HoodieCopyOnWriteTable newTable = new HoodieCopyOnWriteTable(config, jsc); final HoodieCopyOnWriteTable newTable = new HoodieCopyOnWriteTable(config, jsc);
List<WriteStatus> statuses = jsc.parallelize(Arrays.asList(1)).map(x -> { List<WriteStatus> statuses = jsc.parallelize(Arrays.asList(1)).map(x -> {
return newTable.handleUpdate(newCommitTime, updatedRecord1.getCurrentLocation().getFileId(), return newTable.handleUpdate(newCommitTime, updatedRecord1.getPartitionPath(),
updatedRecords.iterator()); updatedRecord1.getCurrentLocation().getFileId(), updatedRecords.iterator());
}).flatMap(x -> HoodieClientTestUtils.collectStatuses(x).iterator()).collect(); }).flatMap(x -> HoodieClientTestUtils.collectStatuses(x).iterator()).collect();
// Check the updated file // Check the updated file
@@ -470,7 +468,7 @@ public class TestCopyOnWriteTable extends HoodieClientTestHarness {
@Test @Test
public void testInsertUpsertWithHoodieAvroPayload() throws Exception { public void testInsertUpsertWithHoodieAvroPayload() throws Exception {
HoodieWriteConfig config = makeHoodieClientConfigBuilder() HoodieWriteConfig config = makeHoodieClientConfigBuilder()
.withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1000 * 1024).build()).build(); .withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1000 * 1024).build()).build();
metaClient = HoodieTableMetaClient.reload(metaClient); metaClient = HoodieTableMetaClient.reload(metaClient);
final HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, jsc); final HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, jsc);
String commitTime = "000"; String commitTime = "000";
@@ -484,13 +482,15 @@ public class TestCopyOnWriteTable extends HoodieClientTestHarness {
String fileId = writeStatus.getFileId(); String fileId = writeStatus.getFileId();
metaClient.getFs().create(new Path(basePath + "/.hoodie/000.commit")).close(); metaClient.getFs().create(new Path(basePath + "/.hoodie/000.commit")).close();
final HoodieCopyOnWriteTable table2 = new HoodieCopyOnWriteTable(config, jsc); final HoodieCopyOnWriteTable table2 = new HoodieCopyOnWriteTable(config, jsc);
final List<HoodieRecord> updates = final List<HoodieRecord> updates =
dataGen.generateUpdatesWithHoodieAvroPayload(commitTime, writeStatus.getWrittenRecords()); dataGen.generateUpdatesWithHoodieAvroPayload(commitTime, inserts);
jsc.parallelize(Arrays.asList(1)).map(x -> { String partitionPath = updates.get(0).getPartitionPath();
return table2.handleUpdate("001", fileId, updates.iterator()); long numRecordsInPartition = updates.stream().filter(u -> u.getPartitionPath().equals(partitionPath)).count();
final List<List<WriteStatus>> updateStatus = jsc.parallelize(Arrays.asList(1)).map(x -> {
return table.handleUpdate(commitTime, partitionPath, fileId, updates.iterator());
}).map(x -> (List<WriteStatus>) HoodieClientTestUtils.collectStatuses(x)).collect(); }).map(x -> (List<WriteStatus>) HoodieClientTestUtils.collectStatuses(x)).collect();
assertEquals(updates.size() - numRecordsInPartition, updateStatus.get(0).get(0).getTotalErrorRecords());
} }
@After @After

View File

@@ -18,17 +18,20 @@
package org.apache.hudi.table; package org.apache.hudi.table;
import org.apache.hudi.common.HoodieClientTestHarness; import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.client.HoodieReadClient; import org.apache.hudi.client.HoodieReadClient;
import org.apache.hudi.client.HoodieWriteClient; import org.apache.hudi.client.HoodieWriteClient;
import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.HoodieClientTestHarness;
import org.apache.hudi.common.HoodieClientTestUtils; import org.apache.hudi.common.HoodieClientTestUtils;
import org.apache.hudi.common.HoodieMergeOnReadTestUtils; import org.apache.hudi.common.HoodieMergeOnReadTestUtils;
import org.apache.hudi.common.HoodieTestDataGenerator; import org.apache.hudi.common.HoodieTestDataGenerator;
import org.apache.hudi.common.TestRawTripPayload.MetadataMergeWriteStatus; import org.apache.hudi.common.TestRawTripPayload.MetadataMergeWriteStatus;
import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieFileGroup; import org.apache.hudi.common.model.HoodieFileGroup;
import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecord;
@@ -52,10 +55,6 @@ import org.apache.hudi.config.HoodieStorageConfig;
import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.index.HoodieIndex.IndexType; import org.apache.hudi.index.HoodieIndex.IndexType;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
import org.junit.After; import org.junit.After;
import org.junit.Assert; import org.junit.Assert;
@@ -1208,6 +1207,85 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness {
} }
} }
/**
 * Validates that table.handleUpdate(), when invoked for one specific partition with input records that span
 * multiple partitions, flags the records belonging to other partitions as errors in the returned WriteStatus.
 */
@Test
public void testHandleUpdateWithMultiplePartitions() throws Exception {
  final HoodieWriteConfig writeConfig = getConfig(true);
  try (HoodieWriteClient writeClient = getWriteClient(writeConfig)) {

    // Write 1 (only inserts, written as parquet file)
    final String insertCommitTime = "001";
    writeClient.startCommitWithTime(insertCommitTime);
    final List<HoodieRecord> insertedRecords = dataGen.generateInserts(insertCommitTime, 20);
    final List<WriteStatus> insertStatuses =
        writeClient.upsert(jsc.parallelize(insertedRecords, 1), insertCommitTime).collect();
    assertNoWriteErrors(insertStatuses);

    final HoodieTableMetaClient metaClient =
        new HoodieTableMetaClient(jsc.hadoopConfiguration(), writeConfig.getBasePath());
    final HoodieMergeOnReadTable mergeOnReadTable =
        (HoodieMergeOnReadTable) HoodieTable.getHoodieTable(metaClient, writeConfig, jsc);

    // The first write must show up as delta commit "001" ...
    final Option<HoodieInstant> firstDeltaCommit =
        metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
    assertTrue(firstDeltaCommit.isPresent());
    assertEquals("Delta commit should be 001", "001", firstDeltaCommit.get().getTimestamp());

    // ... and the commit timeline must still be empty.
    final Option<HoodieInstant> firstCommit =
        metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
    assertFalse(firstCommit.isPresent());

    final FileStatus[] dataFiles =
        HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), writeConfig.getBasePath());

    // A view built on the meta client's completed commit timeline must expose no base files.
    final BaseFileOnlyView commitTimelineView = new HoodieTableFileSystemView(
        metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), dataFiles);
    assertFalse(commitTimelineView.getLatestBaseFiles().findAny().isPresent());

    // A view built on the table's completed commits timeline must expose the files from the delta commit.
    final BaseFileOnlyView tableTimelineView = new HoodieTableFileSystemView(
        metaClient, mergeOnReadTable.getCompletedCommitsTimeline(), dataFiles);
    assertTrue("should list the parquet files we wrote in the delta commit",
        tableTimelineView.getLatestBaseFiles().findAny().isPresent());

    // Write 2 (only updates, written to .log file)
    final String updateCommitTime = "002";
    writeClient.startCommitWithTime(updateCommitTime);
    final List<HoodieRecord> updatedRecords = dataGen.generateUpdates(updateCommitTime, insertedRecords);
    final List<WriteStatus> updateStatuses =
        writeClient.upsert(jsc.parallelize(updatedRecords, 1), updateCommitTime).collect();
    assertNoWriteErrors(updateStatuses);

    // Write 3 (only deletes, written to .log file)
    final String deleteCommitTime = "004";
    // Partition and file id that handleUpdate is invoked for; taken from the second write.
    final String targetPartitionPath = updatedRecords.get(0).getPartitionPath();
    final String targetFileId = updateStatuses.get(0).getFileId();
    writeClient.startCommitWithTime(deleteCommitTime);
    final List<HoodieRecord> deleteRecords = dataGen.generateDeletesFromExistingRecords(updatedRecords);
    final JavaRDD<HoodieRecord> deleteRdd = jsc.parallelize(deleteRecords, 1);

    // initialize partitioner
    mergeOnReadTable.getUpsertPartitioner(new WorkloadProfile(deleteRdd));

    // Invoke handleUpdate once inside a Spark task, handing it every delete record regardless of partition.
    final List<List<WriteStatus>> deleteStatuses = jsc.parallelize(Arrays.asList(1))
        .map(ignored -> mergeOnReadTable.handleUpdate(
            deleteCommitTime, targetPartitionPath, targetFileId, deleteRecords.iterator()))
        .map(statusItr -> (List<WriteStatus>) HoodieClientTestUtils.collectStatuses(statusItr))
        .collect();

    // Verify there are errors because records are from multiple partitions (but handleUpdate is invoked for
    // specific partition)
    final WriteStatus deleteStatus = deleteStatuses.get(0).get(0);
    assertTrue(deleteStatus.hasErrors());

    // Every record outside the target partition is expected to be counted as an error record.
    final long recordsInTargetPartition = deleteRecords.stream()
        .filter(rec -> rec.getPartitionPath().equals(targetPartitionPath))
        .count();
    assertEquals(deleteRecords.size() - recordsInTargetPartition, deleteStatus.getTotalErrorRecords());
  }
}
/**
 * Returns the HoodieWriteConfig obtained by calling build() on the builder from getConfigBuilder(autoCommit).
 *
 * @param autoCommit passed through unchanged to getConfigBuilder
 * @return the built HoodieWriteConfig
 */
private HoodieWriteConfig getConfig(Boolean autoCommit) { private HoodieWriteConfig getConfig(Boolean autoCommit) {
return getConfigBuilder(autoCommit).build(); return getConfigBuilder(autoCommit).build();
} }