[HUDI-724] Parallelize getSmallFiles for partitions (#1421)
Co-authored-by: Feichi Feng <feicfeng@amazon.com>
This commit is contained in:
@@ -489,9 +489,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> extends AbstractHo
|
||||
|
||||
private Partitioner getPartitioner(HoodieTable table, boolean isUpsert, WorkloadProfile profile) {
|
||||
if (isUpsert) {
|
||||
return table.getUpsertPartitioner(profile);
|
||||
return table.getUpsertPartitioner(profile, jsc);
|
||||
} else {
|
||||
return table.getInsertPartitioner(profile);
|
||||
return table.getInsertPartitioner(profile, jsc);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -81,6 +81,7 @@ import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.spark.api.java.function.PairFunction;
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
@@ -142,16 +143,16 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
||||
}
|
||||
|
||||
@Override
|
||||
public Partitioner getUpsertPartitioner(WorkloadProfile profile) {
|
||||
public Partitioner getUpsertPartitioner(WorkloadProfile profile, JavaSparkContext jsc) {
|
||||
if (profile == null) {
|
||||
throw new HoodieUpsertException("Need workload profile to construct the upsert partitioner.");
|
||||
}
|
||||
return new UpsertPartitioner(profile);
|
||||
return new UpsertPartitioner(profile, jsc);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Partitioner getInsertPartitioner(WorkloadProfile profile) {
|
||||
return getUpsertPartitioner(profile);
|
||||
public Partitioner getInsertPartitioner(WorkloadProfile profile, JavaSparkContext jsc) {
|
||||
return getUpsertPartitioner(profile, jsc);
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -573,14 +574,14 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
||||
*/
|
||||
protected HoodieRollingStatMetadata rollingStatMetadata;
|
||||
|
||||
UpsertPartitioner(WorkloadProfile profile) {
|
||||
UpsertPartitioner(WorkloadProfile profile, JavaSparkContext jsc) {
|
||||
updateLocationToBucket = new HashMap<>();
|
||||
partitionPathToInsertBuckets = new HashMap<>();
|
||||
bucketInfoMap = new HashMap<>();
|
||||
globalStat = profile.getGlobalStat();
|
||||
rollingStatMetadata = getRollingStats();
|
||||
assignUpdates(profile);
|
||||
assignInserts(profile);
|
||||
assignInserts(profile, jsc);
|
||||
|
||||
LOG.info("Total Buckets :" + totalBuckets + ", buckets info => " + bucketInfoMap + ", \n"
|
||||
+ "Partition to insert buckets => " + partitionPathToInsertBuckets + ", \n"
|
||||
@@ -610,18 +611,24 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
||||
return bucket;
|
||||
}
|
||||
|
||||
private void assignInserts(WorkloadProfile profile) {
|
||||
private void assignInserts(WorkloadProfile profile, JavaSparkContext jsc) {
|
||||
// for new inserts, compute buckets depending on how many records we have for each partition
|
||||
Set<String> partitionPaths = profile.getPartitionPaths();
|
||||
long averageRecordSize =
|
||||
averageBytesPerRecord(metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(),
|
||||
config.getCopyOnWriteRecordSizeEstimate());
|
||||
LOG.info("AvgRecordSize => " + averageRecordSize);
|
||||
|
||||
Map<String, List<SmallFile>> partitionSmallFilesMap =
|
||||
getSmallFilesForPartitions(new ArrayList<String>(partitionPaths), jsc);
|
||||
|
||||
for (String partitionPath : partitionPaths) {
|
||||
WorkloadStat pStat = profile.getWorkloadStat(partitionPath);
|
||||
if (pStat.getNumInserts() > 0) {
|
||||
|
||||
List<SmallFile> smallFiles = getSmallFiles(partitionPath);
|
||||
List<SmallFile> smallFiles = partitionSmallFilesMap.get(partitionPath);
|
||||
this.smallFiles.addAll(smallFiles);
|
||||
|
||||
LOG.info("For partitionPath : " + partitionPath + " Small Files => " + smallFiles);
|
||||
|
||||
long totalUnassignedInserts = pStat.getNumInserts();
|
||||
@@ -684,6 +691,18 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
||||
}
|
||||
}
|
||||
|
||||
private Map<String, List<SmallFile>> getSmallFilesForPartitions(List<String> partitionPaths, JavaSparkContext jsc) {
|
||||
|
||||
Map<String, List<SmallFile>> partitionSmallFilesMap = new HashMap<>();
|
||||
if (partitionPaths != null && partitionPaths.size() > 0) {
|
||||
JavaRDD<String> partitionPathRdds = jsc.parallelize(partitionPaths, partitionPaths.size());
|
||||
partitionSmallFilesMap = partitionPathRdds.mapToPair((PairFunction<String, String, List<SmallFile>>)
|
||||
partitionPath -> new Tuple2<>(partitionPath, getSmallFiles(partitionPath))).collectAsMap();
|
||||
}
|
||||
|
||||
return partitionSmallFilesMap;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a list of small files in the given partition path.
|
||||
*/
|
||||
@@ -706,8 +725,6 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
||||
sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename), FSUtils.getFileId(filename));
|
||||
sf.sizeBytes = file.getFileSize();
|
||||
smallFileLocations.add(sf);
|
||||
// Update the global small files list
|
||||
smallFiles.add(sf);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -89,11 +89,11 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends Hoodi
|
||||
}
|
||||
|
||||
@Override
|
||||
public Partitioner getUpsertPartitioner(WorkloadProfile profile) {
|
||||
public Partitioner getUpsertPartitioner(WorkloadProfile profile, JavaSparkContext jsc) {
|
||||
if (profile == null) {
|
||||
throw new HoodieUpsertException("Need workload profile to construct the upsert partitioner.");
|
||||
}
|
||||
mergeOnReadUpsertPartitioner = new MergeOnReadUpsertPartitioner(profile);
|
||||
mergeOnReadUpsertPartitioner = new MergeOnReadUpsertPartitioner(profile, jsc);
|
||||
return mergeOnReadUpsertPartitioner;
|
||||
}
|
||||
|
||||
@@ -325,8 +325,8 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends Hoodi
|
||||
*/
|
||||
class MergeOnReadUpsertPartitioner extends HoodieCopyOnWriteTable.UpsertPartitioner {
|
||||
|
||||
MergeOnReadUpsertPartitioner(WorkloadProfile profile) {
|
||||
super(profile);
|
||||
MergeOnReadUpsertPartitioner(WorkloadProfile profile, JavaSparkContext jsc) {
|
||||
super(profile, jsc);
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -376,16 +376,12 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends Hoodi
|
||||
sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename), FSUtils.getFileId(filename));
|
||||
sf.sizeBytes = getTotalFileSize(smallFileSlice);
|
||||
smallFileLocations.add(sf);
|
||||
// Update the global small files list
|
||||
smallFiles.add(sf);
|
||||
} else {
|
||||
HoodieLogFile logFile = smallFileSlice.getLogFiles().findFirst().get();
|
||||
sf.location = new HoodieRecordLocation(FSUtils.getBaseCommitTimeFromLogPath(logFile.getPath()),
|
||||
FSUtils.getFileIdFromLogPath(logFile.getPath()));
|
||||
sf.sizeBytes = getTotalFileSize(smallFileSlice);
|
||||
smallFileLocations.add(sf);
|
||||
// Update the global small files list
|
||||
smallFiles.add(sf);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -118,12 +118,12 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
||||
/**
|
||||
* Provides a partitioner to perform the upsert operation, based on the workload profile.
|
||||
*/
|
||||
public abstract Partitioner getUpsertPartitioner(WorkloadProfile profile);
|
||||
public abstract Partitioner getUpsertPartitioner(WorkloadProfile profile, JavaSparkContext jsc);
|
||||
|
||||
/**
|
||||
* Provides a partitioner to perform the insert operation, based on the workload profile.
|
||||
*/
|
||||
public abstract Partitioner getInsertPartitioner(WorkloadProfile profile);
|
||||
public abstract Partitioner getInsertPartitioner(WorkloadProfile profile, JavaSparkContext jsc);
|
||||
|
||||
/**
|
||||
* Return whether this HoodieTable implementation can benefit from workload profiling.
|
||||
|
||||
Reference in New Issue
Block a user