1
0

[HUDI-1357] Added a check to validate records are not lost during merges. (#2216)

- Turned off by default
This commit is contained in:
Prashant Wason
2020-12-01 13:44:57 -08:00
committed by GitHub
parent b826c53e33
commit ac23d2587f
6 changed files with 122 additions and 2 deletions

View File

@@ -117,6 +117,10 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
public static final String MAX_CONSISTENCY_CHECKS_PROP = "hoodie.consistency.check.max_checks";
public static int DEFAULT_MAX_CONSISTENCY_CHECKS = 7;
// Data validation check performed during merges before actual commits.
// The key below toggles that check; its default value is "false", so the check stays off unless explicitly enabled.
private static final String MERGE_DATA_VALIDATION_CHECK_ENABLED = "hoodie.merge.data.validation.enabled";
private static final String DEFAULT_MERGE_DATA_VALIDATION_CHECK_ENABLED = "false";
/**
* HUDI-858 : There are users who had been directly using RDD APIs and have relied on a behavior in 0.4.x to allow
* multiple write operations (upsert/bulk-insert/...) to be executed within a single commit.
@@ -282,6 +286,10 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
return BulkInsertSortMode.valueOf(sortMode.toUpperCase());
}
/**
 * Reports whether the merge data validation check is switched on.
 *
 * @return the boolean parsed from the {@code MERGE_DATA_VALIDATION_CHECK_ENABLED} property;
 *         {@code false} when the property is absent or is anything other than "true" (ignoring case)
 */
public boolean isMergeDataValidationCheckEnabled() {
  final String rawFlag = props.getProperty(MERGE_DATA_VALIDATION_CHECK_ENABLED);
  return Boolean.parseBoolean(rawFlag);
}
/**
* compaction properties.
*/
@@ -983,6 +991,11 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
return this;
}
/**
 * Records the merge data validation toggle in this builder's properties.
 *
 * @param enabled {@code true} to turn the merge data validation check on, {@code false} to turn it off
 * @return this builder, to allow call chaining
 */
public Builder withMergeDataValidationCheckEnabled(boolean enabled) {
  final String flagValue = Boolean.toString(enabled);
  props.setProperty(MERGE_DATA_VALIDATION_CHECK_ENABLED, flagValue);
  return this;
}
public Builder withProperties(Properties properties) {
this.props.putAll(properties);
return this;
@@ -1032,6 +1045,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
setDefaultOnCondition(props, !props.containsKey(AVRO_SCHEMA_VALIDATE), AVRO_SCHEMA_VALIDATE, DEFAULT_AVRO_SCHEMA_VALIDATE);
setDefaultOnCondition(props, !props.containsKey(BULKINSERT_SORT_MODE),
BULKINSERT_SORT_MODE, DEFAULT_BULKINSERT_SORT_MODE);
setDefaultOnCondition(props, !props.containsKey(MERGE_DATA_VALIDATION_CHECK_ENABLED),
MERGE_DATA_VALIDATION_CHECK_ENABLED, DEFAULT_MERGE_DATA_VALIDATION_CHECK_ENABLED);
// Make sure the props is propagated
setDefaultOnCondition(props, !isIndexConfigSet, HoodieIndexConfig.newBuilder().fromProperties(props).build());

View File

@@ -34,8 +34,11 @@ import org.apache.hudi.common.util.HoodieRecordSizeEstimator;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.ExternalSpillableMap;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieCorruptedDataException;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.exception.HoodieUpsertException;
import org.apache.hudi.io.storage.HoodieFileReader;
import org.apache.hudi.io.storage.HoodieFileReaderFactory;
import org.apache.hudi.io.storage.HoodieFileWriter;
import org.apache.hudi.table.HoodieTable;
@@ -292,6 +295,8 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload, I, K, O> extends H
runtimeStats.setTotalUpsertTime(timer.endTimer());
stat.setRuntimeStats(runtimeStats);
performMergeDataValidationCheck(writeStatus);
LOG.info(String.format("MergeHandle for partitionPath %s fileID %s, took %d ms.", stat.getPartitionPath(),
stat.getFileId(), runtimeStats.getTotalUpsertTime()));
@@ -301,6 +306,28 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload, I, K, O> extends H
}
}
/**
 * Verifies that the merge did not drop records, when the check is enabled in the write config.
 *
 * <p>The record count of the old base file is compared against the number of writes plus deletes
 * reported in the given write status. If the new total is smaller than the old count, the merge is
 * treated as having lost data.
 *
 * @param writeStatus status of the merge whose write and delete counts are validated
 * @throws HoodieUpsertException if the old file cannot be read to obtain its record count
 * @throws HoodieCorruptedDataException if writes plus deletes is less than the old file's record count
 */
public void performMergeDataValidationCheck(WriteStatus writeStatus) {
  if (!config.isMergeDataValidationCheckEnabled()) {
    return;
  }

  long oldNumWrites = 0;
  try {
    HoodieFileReader reader = HoodieFileReaderFactory.getFileReader(hoodieTable.getHadoopConf(), oldFilePath);
    try {
      oldNumWrites = reader.getTotalRecords();
    } finally {
      // Always release the reader, even when counting fails, so it is not leaked per merge handle.
      reader.close();
    }
  } catch (IOException e) {
    throw new HoodieUpsertException("Failed to check for merge data validation", e);
  }

  // Deleted records legitimately reduce the written count, so they are added back before comparing.
  if ((writeStatus.getStat().getNumWrites() + writeStatus.getStat().getNumDeletes()) < oldNumWrites) {
    throw new HoodieCorruptedDataException(
        String.format("Record write count decreased for file: %s, Partition Path: %s (%s:%d + %d < %s:%d)",
            writeStatus.getFileId(), writeStatus.getPartitionPath(),
            instantTime, writeStatus.getStat().getNumWrites(), writeStatus.getStat().getNumDeletes(),
            FSUtils.getCommitTime(oldFilePath.toString()), oldNumWrites));
  }
}
public Path getOldFilePath() {
return oldFilePath;
}