[HUDI-1357] Added a check to validate records are not lost during merges. (#2216)
- Turned off by default
This commit is contained in:
@@ -39,6 +39,7 @@ import org.apache.parquet.avro.AvroReadSupport;
|
||||
import org.apache.parquet.avro.AvroSchemaConverter;
|
||||
import org.apache.parquet.hadoop.ParquetFileReader;
|
||||
import org.apache.parquet.hadoop.ParquetReader;
|
||||
import org.apache.parquet.hadoop.metadata.BlockMetaData;
|
||||
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
|
||||
import org.apache.parquet.schema.MessageType;
|
||||
|
||||
@@ -261,6 +262,22 @@ public class ParquetUtils {
|
||||
return records;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of records in the parquet file.
|
||||
*
|
||||
* @param conf Configuration
|
||||
* @param parquetFilePath path of the file
|
||||
*/
|
||||
public static long getRowCount(Configuration conf, Path parquetFilePath) {
|
||||
ParquetMetadata footer;
|
||||
long rowCount = 0;
|
||||
footer = readMetadata(conf, parquetFilePath);
|
||||
for (BlockMetaData b : footer.getBlocks()) {
|
||||
rowCount += b.getRowCount();
|
||||
}
|
||||
return rowCount;
|
||||
}
|
||||
|
||||
static class RecordKeysFilterFunction implements Function<String, Boolean> {
|
||||
|
||||
private final Set<String> candidateKeys;
|
||||
|
||||
@@ -74,7 +74,6 @@ public class HoodieParquetReader<R extends IndexedRecord> implements HoodieFileR
|
||||
|
||||
@Override
|
||||
public long getTotalRecords() {
|
||||
// TODO Auto-generated method stub
|
||||
return 0;
|
||||
return ParquetUtils.getRowCount(conf, path);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -36,6 +36,7 @@ import org.apache.parquet.avro.AvroSchemaConverter;
|
||||
import org.apache.parquet.hadoop.ParquetWriter;
|
||||
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.params.ParameterizedTest;
|
||||
import org.junit.jupiter.params.provider.Arguments;
|
||||
import org.junit.jupiter.params.provider.MethodSource;
|
||||
@@ -147,6 +148,18 @@ public class TestParquetUtils extends HoodieCommonTestHarness {
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReadCounts() throws Exception {
|
||||
String filePath = basePath + "/test.parquet";
|
||||
List<String> rowKeys = new ArrayList<>();
|
||||
for (int i = 0; i < 123; i++) {
|
||||
rowKeys.add(UUID.randomUUID().toString());
|
||||
}
|
||||
writeParquetFile(BloomFilterTypeCode.SIMPLE.name(), filePath, rowKeys);
|
||||
|
||||
assertEquals(123, ParquetUtils.getRowCount(HoodieTestUtils.getDefaultHadoopConf(), new Path(filePath)));
|
||||
}
|
||||
|
||||
private void writeParquetFile(String typeCode, String filePath, List<String> rowKeys) throws Exception {
|
||||
writeParquetFile(typeCode, filePath, rowKeys, HoodieAvroUtils.getRecordKeySchema(), false, "");
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user