1
0

[HUDI-3438] Avoid getSmallFiles if hoodie.parquet.small.file.limit is 0 (#4823)

Co-authored-by: Hui An <hui.an@shopee.com>
This commit is contained in:
RexAn
2022-02-18 21:57:04 +08:00
committed by GitHub
parent fba5822ee3
commit 5009138d04
5 changed files with 26 additions and 4 deletions

View File

@@ -165,7 +165,7 @@ public class UpsertPartitioner<T extends HoodieRecordPayload<T>> extends SparkHo
List<SmallFile> smallFiles =
filterSmallFilesInClustering(partitionPathToPendingClusteringFileGroupsId.getOrDefault(partitionPath, Collections.emptySet()),
partitionSmallFilesMap.get(partitionPath));
partitionSmallFilesMap.getOrDefault(partitionPath, new ArrayList<>()));
this.smallFiles.addAll(smallFiles);
@@ -241,6 +241,11 @@ public class UpsertPartitioner<T extends HoodieRecordPayload<T>> extends SparkHo
private Map<String, List<SmallFile>> getSmallFilesForPartitions(List<String> partitionPaths, HoodieEngineContext context) {
JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context);
Map<String, List<SmallFile>> partitionSmallFilesMap = new HashMap<>();
if (config.getParquetSmallFileLimit() <= 0) {
return partitionSmallFilesMap;
}
if (partitionPaths != null && partitionPaths.size() > 0) {
context.setJobStatus(this.getClass().getSimpleName(), "Getting small files from partitions");
JavaRDD<String> partitionPathRdds = jsc.parallelize(partitionPaths, partitionPaths.size());

View File

@@ -97,6 +97,10 @@ public class SparkUpsertDeltaCommitPartitioner<T extends HoodieRecordPayload<T>>
.collect(Collectors.toList());
}
if (config.getParquetSmallFileLimit() <= 0) {
return Collections.emptyList();
}
// If we cannot index log files, then we choose the smallest parquet file in the partition and add inserts to
// it. Doing this over time for a partition, we ensure that we handle small file issues.
return table.getSliceView()