1
0

[HUDI-3007] Fix issues in HoodieRepairTool (#4564)

This commit is contained in:
Y Ethan Guo
2022-01-12 09:03:27 -08:00
committed by GitHub
parent 12e95771ee
commit 397795c7d0
7 changed files with 957 additions and 119 deletions

View File

@@ -721,6 +721,53 @@ public class FSUtils {
}
}
/**
* Lists file status at a certain level in the directory hierarchy.
* <p>
* E.g., given "/tmp/hoodie_table" as the rootPath, and 3 as the expected level,
* this method gives back the {@link FileStatus} of all files under
* "/tmp/hoodie_table/[*]/[*]/[*]/" folders.
*
* @param hoodieEngineContext {@link HoodieEngineContext} instance.
* @param fs {@link FileSystem} instance.
* @param rootPath Root path for the file listing.
* @param expectLevel Expected level of directory hierarchy for files to be added.
* @param parallelism Parallelism for the file listing.
* @return A list of file status of files at the level.
*/
public static List<FileStatus> getFileStatusAtLevel(
HoodieEngineContext hoodieEngineContext, FileSystem fs, Path rootPath,
int expectLevel, int parallelism) {
List<String> levelPaths = new ArrayList<>();
List<FileStatus> result = new ArrayList<>();
levelPaths.add(rootPath.toString());
for (int i = 0; i <= expectLevel; i++) {
result = FSUtils.parallelizeFilesProcess(hoodieEngineContext, fs, parallelism,
pairOfSubPathAndConf -> {
Path path = new Path(pairOfSubPathAndConf.getKey());
try {
FileSystem fileSystem = path.getFileSystem(pairOfSubPathAndConf.getValue().get());
return Arrays.stream(fileSystem.listStatus(path))
.collect(Collectors.toList());
} catch (IOException e) {
throw new HoodieIOException("Failed to list " + path, e);
}
},
levelPaths)
.values().stream()
.flatMap(list -> list.stream()).collect(Collectors.toList());
if (i < expectLevel) {
levelPaths = result.stream()
.filter(FileStatus::isDirectory)
.map(fileStatus -> fileStatus.getPath().toString())
.collect(Collectors.toList());
}
}
return result;
}
public interface SerializableFunction<T, R> extends Function<T, R>, Serializable {
}
}