1
0

[HUDI-1357] Added a check to validate records are not lost during merges. (#2216)

- Turned off by default
This commit is contained in:
Prashant Wason
2020-12-01 13:44:57 -08:00
committed by GitHub
parent b826c53e33
commit ac23d2587f
6 changed files with 122 additions and 2 deletions

View File

@@ -39,6 +39,7 @@ import org.apache.parquet.avro.AvroReadSupport;
import org.apache.parquet.avro.AvroSchemaConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;
@@ -261,6 +262,22 @@ public class ParquetUtils {
return records;
}
/**
 * Returns the total number of records stored in the parquet file at the given path.
 *
 * @param conf Configuration
 * @param parquetFilePath path of the file
 */
public static long getRowCount(Configuration conf, Path parquetFilePath) {
  // The parquet footer records a row count per row group (block);
  // the file total is simply the sum over all of them.
  ParquetMetadata footer = readMetadata(conf, parquetFilePath);
  return footer.getBlocks().stream()
      .mapToLong(BlockMetaData::getRowCount)
      .sum();
}
static class RecordKeysFilterFunction implements Function<String, Boolean> {
private final Set<String> candidateKeys;

View File

@@ -74,7 +74,6 @@ public class HoodieParquetReader<R extends IndexedRecord> implements HoodieFileR
@Override
public long getTotalRecords() {
  // Fixed: the old hard-coded stub `return 0;` (with its auto-generated TODO)
  // made the real computation below unreachable. The count is now read from
  // the parquet footer via ParquetUtils.
  return ParquetUtils.getRowCount(conf, path);
}
}

View File

@@ -36,6 +36,7 @@ import org.apache.parquet.avro.AvroSchemaConverter;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;
@@ -147,6 +148,18 @@ public class TestParquetUtils extends HoodieCommonTestHarness {
}
}
/**
 * Verifies that {@code ParquetUtils.getRowCount} reports exactly the number of
 * records written to a parquet file.
 */
@Test
public void testReadCounts() throws Exception {
  final int expectedRowCount = 123;
  final String filePath = basePath + "/test.parquet";
  // Generate one random record key per expected row.
  final List<String> recordKeys = new ArrayList<>(expectedRowCount);
  int written = 0;
  while (written < expectedRowCount) {
    recordKeys.add(UUID.randomUUID().toString());
    written++;
  }
  writeParquetFile(BloomFilterTypeCode.SIMPLE.name(), filePath, recordKeys);
  assertEquals(expectedRowCount,
      ParquetUtils.getRowCount(HoodieTestUtils.getDefaultHadoopConf(), new Path(filePath)));
}
// Convenience overload: writes the given record keys using the record-key schema.
// NOTE(review): the trailing `false` and "" arguments take whatever meaning the
// 6-arg overload defines (not visible in this view) — confirm against it.
private void writeParquetFile(String typeCode, String filePath, List<String> rowKeys) throws Exception {
writeParquetFile(typeCode, filePath, rowKeys, HoodieAvroUtils.getRecordKeySchema(), false, "");
}