1
0

[HUDI-1357] Added a check to validate records are not lost during merges. (#2216)

- Turned off by default
This commit is contained in:
Prashant Wason
2020-12-01 13:44:57 -08:00
committed by GitHub
parent b826c53e33
commit ac23d2587f
6 changed files with 122 additions and 2 deletions

View File

@@ -39,6 +39,7 @@ import org.apache.parquet.avro.AvroReadSupport;
import org.apache.parquet.avro.AvroSchemaConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;
@@ -261,6 +262,22 @@ public class ParquetUtils {
return records;
}
/**
 * Counts the records stored in a parquet file.
 *
 * <p>The count is the sum of the row counts of every block listed in the
 * metadata that {@code readMetadata} returns for the file.
 *
 * @param conf Configuration handed to {@code readMetadata}
 * @param parquetFilePath path of the parquet file to inspect
 * @return total number of rows across all blocks; 0 when the metadata lists no blocks
 */
public static long getRowCount(Configuration conf, Path parquetFilePath) {
  final ParquetMetadata metadata = readMetadata(conf, parquetFilePath);
  // Each block carries its own row count; the file total is their sum.
  return metadata.getBlocks().stream()
      .mapToLong(BlockMetaData::getRowCount)
      .sum();
}
static class RecordKeysFilterFunction implements Function<String, Boolean> {
private final Set<String> candidateKeys;

View File

@@ -74,7 +74,6 @@ public class HoodieParquetReader<R extends IndexedRecord> implements HoodieFileR
/**
 * Returns the total number of records in the parquet file read by this reader.
 *
 * <p>Delegates to {@code ParquetUtils.getRowCount} using this reader's
 * {@code conf} and {@code path}. The earlier auto-generated stub that always
 * returned 0 has been removed so the real count is reported.
 *
 * @return the row count reported by {@code ParquetUtils.getRowCount(conf, path)}
 */
@Override
public long getTotalRecords() {
  return ParquetUtils.getRowCount(conf, path);
}
}