From 684e12e9fcfa45c6ac922a84fb3116ac8142bc18 Mon Sep 17 00:00:00 2001 From: Udit Mehrotra Date: Mon, 18 Jan 2021 07:29:53 -0800 Subject: [PATCH] [HUDI-1529] Add block size to the FileStatus objects returned from metadata table to avoid too many file splits (#2451) --- .../apache/hudi/metadata/TestHoodieBackedMetadata.java | 8 ++++++++ .../org/apache/hudi/metadata/BaseTableMetadata.java | 2 +- .../apache/hudi/metadata/HoodieMetadataPayload.java | 10 +++++++--- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/metadata/TestHoodieBackedMetadata.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/metadata/TestHoodieBackedMetadata.java index 3d770c737..16ee120ac 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/metadata/TestHoodieBackedMetadata.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/metadata/TestHoodieBackedMetadata.java @@ -801,6 +801,14 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness { // File sizes should be valid Arrays.stream(metaStatuses).forEach(s -> assertTrue(s.getLen() > 0)); + // Block sizes should be valid + Arrays.stream(metaStatuses).forEach(s -> assertTrue(s.getBlockSize() > 0)); + List fsBlockSizes = Arrays.stream(fsStatuses).map(FileStatus::getBlockSize).collect(Collectors.toList()); + Collections.sort(fsBlockSizes); + List metadataBlockSizes = Arrays.stream(metaStatuses).map(FileStatus::getBlockSize).collect(Collectors.toList()); + Collections.sort(metadataBlockSizes); + assertEquals(fsBlockSizes, metadataBlockSizes); + if ((fsFileNames.size() != metadataFilenames.size()) || (!fsFileNames.equals(metadataFilenames))) { LOG.info("*** File system listing = " + Arrays.toString(fsFileNames.toArray())); LOG.info("*** Metadata listing = " + Arrays.toString(metadataFilenames.toArray())); diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/BaseTableMetadata.java b/hudi-common/src/main/java/org/apache/hudi/metadata/BaseTableMetadata.java index 4ae71deb6..de0a3c4c5 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/BaseTableMetadata.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/BaseTableMetadata.java @@ -202,7 +202,7 @@ public abstract class BaseTableMetadata implements HoodieTableMetadata { throw new HoodieMetadataException("Metadata record for partition " + partitionName + " is inconsistent: " + hoodieRecord.get().getData()); } - statuses = hoodieRecord.get().getData().getFileStatuses(partitionPath); + statuses = hoodieRecord.get().getData().getFileStatuses(hadoopConf.get(), partitionPath); } if (validateLookups) { diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java index 0863f7ef8..9c6eb89b9 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java @@ -29,7 +29,9 @@ import org.apache.hudi.exception.HoodieMetadataException; import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import java.io.IOException; @@ -177,10 +179,12 @@ public class HoodieMetadataPayload implements HoodieRecordPayload new FileStatus(e.getValue().getSize(), false, 0, 0, 0, 0, null, null, null, - new Path(partitionPath, e.getKey()))) + .map(e -> new FileStatus(e.getValue().getSize(), false, 0, blockSize, 0, 0, + null, null, null, new Path(partitionPath, e.getKey()))) .toArray(FileStatus[]::new); }