1
0

[HUDI-1357] Added a check to validate records are not lost during merges. (#2216)

- Turned off by default
This commit is contained in:
Prashant Wason
2020-12-01 13:44:57 -08:00
committed by GitHub
parent b826c53e33
commit ac23d2587f
6 changed files with 122 additions and 2 deletions

View File

@@ -39,6 +39,7 @@ import org.apache.parquet.avro.AvroReadSupport;
import org.apache.parquet.avro.AvroSchemaConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;
@@ -261,6 +262,22 @@ public class ParquetUtils {
return records;
}
/**
 * Returns the total number of records stored in the parquet file at the given path.
 *
 * @param conf Configuration
 * @param parquetFilePath path of the file
 */
public static long getRowCount(Configuration conf, Path parquetFilePath) {
  // The parquet footer records a row count per row group (block);
  // the file total is simply the sum over all of them.
  ParquetMetadata footer = readMetadata(conf, parquetFilePath);
  return footer.getBlocks().stream()
      .mapToLong(BlockMetaData::getRowCount)
      .sum();
}
static class RecordKeysFilterFunction implements Function<String, Boolean> {
private final Set<String> candidateKeys;

View File

@@ -74,7 +74,6 @@ public class HoodieParquetReader<R extends IndexedRecord> implements HoodieFileR
@Override
public long getTotalRecords() {
  // Fixed: the old hard-coded stub `return 0;` (with its auto-generated TODO)
  // made the real computation below unreachable. The count is now read from
  // the parquet footer via ParquetUtils.
  return ParquetUtils.getRowCount(conf, path);
}
}

View File

@@ -36,6 +36,7 @@ import org.apache.parquet.avro.AvroSchemaConverter;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;
@@ -147,6 +148,18 @@ public class TestParquetUtils extends HoodieCommonTestHarness {
}
}
/**
 * Verifies that {@code ParquetUtils.getRowCount} reports exactly the number of
 * records written to a parquet file.
 */
@Test
public void testReadCounts() throws Exception {
  final int expectedRowCount = 123;
  final String filePath = basePath + "/test.parquet";
  // Generate one random record key per expected row.
  final List<String> recordKeys = new ArrayList<>(expectedRowCount);
  int written = 0;
  while (written < expectedRowCount) {
    recordKeys.add(UUID.randomUUID().toString());
    written++;
  }
  writeParquetFile(BloomFilterTypeCode.SIMPLE.name(), filePath, recordKeys);
  assertEquals(expectedRowCount,
      ParquetUtils.getRowCount(HoodieTestUtils.getDefaultHadoopConf(), new Path(filePath)));
}
// Convenience overload: writes the given record keys using the record-key schema.
// NOTE(review): the trailing `false` and "" arguments take whatever meaning the
// 6-arg overload defines (not visible in this view) — confirm against it.
private void writeParquetFile(String typeCode, String filePath, List<String> rowKeys) throws Exception {
writeParquetFile(typeCode, filePath, rowKeys, HoodieAvroUtils.getRecordKeySchema(), false, "");
}