[HUDI-3438] Avoid getSmallFiles if hoodie.parquet.small.file.limit is 0 (#4823)
Co-authored-by: Hui An <hui.an@shopee.com>
This commit is contained in:
@@ -165,7 +165,7 @@ public class UpsertPartitioner<T extends HoodieRecordPayload<T>> extends SparkHo
|
||||
|
||||
List<SmallFile> smallFiles =
|
||||
filterSmallFilesInClustering(partitionPathToPendingClusteringFileGroupsId.getOrDefault(partitionPath, Collections.emptySet()),
|
||||
partitionSmallFilesMap.get(partitionPath));
|
||||
partitionSmallFilesMap.getOrDefault(partitionPath, new ArrayList<>()));
|
||||
|
||||
this.smallFiles.addAll(smallFiles);
|
||||
|
||||
@@ -241,6 +241,11 @@ public class UpsertPartitioner<T extends HoodieRecordPayload<T>> extends SparkHo
|
||||
private Map<String, List<SmallFile>> getSmallFilesForPartitions(List<String> partitionPaths, HoodieEngineContext context) {
|
||||
JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context);
|
||||
Map<String, List<SmallFile>> partitionSmallFilesMap = new HashMap<>();
|
||||
|
||||
if (config.getParquetSmallFileLimit() <= 0) {
|
||||
return partitionSmallFilesMap;
|
||||
}
|
||||
|
||||
if (partitionPaths != null && partitionPaths.size() > 0) {
|
||||
context.setJobStatus(this.getClass().getSimpleName(), "Getting small files from partitions");
|
||||
JavaRDD<String> partitionPathRdds = jsc.parallelize(partitionPaths, partitionPaths.size());
|
||||
|
||||
@@ -97,6 +97,10 @@ public class SparkUpsertDeltaCommitPartitioner<T extends HoodieRecordPayload<T>>
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
if (config.getParquetSmallFileLimit() <= 0) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
// If we cannot index log files, then we choose the smallest parquet file in the partition and add inserts to
|
||||
// it. Doing this over time for a partition, we ensure that we handle small file issues
|
||||
return table.getSliceView()
|
||||
|
||||
Reference in New Issue
Block a user