[HUDI-2351] Extract common FS and IO utils for marker mechanism (#3529)
This commit is contained in:
@@ -29,6 +29,7 @@ import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.common.table.view.FileSystemViewStorageConfig;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.collection.ImmutablePair;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
@@ -49,8 +50,10 @@ import org.apache.log4j.Logger;
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
@@ -59,6 +62,7 @@ import java.util.Objects;
|
||||
import java.util.Set;
|
||||
import java.util.UUID;
|
||||
import java.util.function.Function;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
@@ -612,4 +616,87 @@ public class FSUtils {
|
||||
.filter(fileStatus -> !fileStatus.getPath().toString().contains(HoodieTableMetaClient.METAFOLDER_NAME))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
/**
|
||||
* Deletes a directory by deleting sub-paths in parallel on the file system.
|
||||
*
|
||||
* @param hoodieEngineContext {@code HoodieEngineContext} instance
|
||||
* @param fs file system
|
||||
* @param dirPath directory path
|
||||
* @param parallelism parallelism to use for sub-paths
|
||||
* @return {@code true} if the directory is delete; {@code false} otherwise.
|
||||
*/
|
||||
public static boolean deleteDir(
|
||||
HoodieEngineContext hoodieEngineContext, FileSystem fs, Path dirPath, int parallelism) {
|
||||
try {
|
||||
if (fs.exists(dirPath)) {
|
||||
FSUtils.parallelizeSubPathProcess(hoodieEngineContext, fs, dirPath, parallelism, e -> true,
|
||||
pairOfSubPathAndConf -> deleteSubPath(
|
||||
pairOfSubPathAndConf.getKey(), pairOfSubPathAndConf.getValue(), true)
|
||||
);
|
||||
boolean result = fs.delete(dirPath, false);
|
||||
LOG.info("Removed directory at " + dirPath);
|
||||
return result;
|
||||
}
|
||||
} catch (IOException ioe) {
|
||||
throw new HoodieIOException(ioe.getMessage(), ioe);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Processes sub-path in parallel.
|
||||
*
|
||||
* @param hoodieEngineContext {@code HoodieEngineContext} instance
|
||||
* @param fs file system
|
||||
* @param dirPath directory path
|
||||
* @param parallelism parallelism to use for sub-paths
|
||||
* @param subPathPredicate predicate to use to filter sub-paths for processing
|
||||
* @param pairFunction actual processing logic for each sub-path
|
||||
* @param <T> type of result to return for each sub-path
|
||||
* @return a map of sub-path to result of the processing
|
||||
*/
|
||||
public static <T> Map<String, T> parallelizeSubPathProcess(
|
||||
HoodieEngineContext hoodieEngineContext, FileSystem fs, Path dirPath, int parallelism,
|
||||
Predicate<FileStatus> subPathPredicate, SerializableFunction<Pair<String, SerializableConfiguration>, T> pairFunction) {
|
||||
Map<String, T> result = new HashMap<>();
|
||||
try {
|
||||
FileStatus[] fileStatuses = fs.listStatus(dirPath);
|
||||
List<String> subPaths = Arrays.stream(fileStatuses)
|
||||
.filter(subPathPredicate)
|
||||
.map(fileStatus -> fileStatus.getPath().toString())
|
||||
.collect(Collectors.toList());
|
||||
if (subPaths.size() > 0) {
|
||||
SerializableConfiguration conf = new SerializableConfiguration(fs.getConf());
|
||||
int actualParallelism = Math.min(subPaths.size(), parallelism);
|
||||
result = hoodieEngineContext.mapToPair(subPaths,
|
||||
subPath -> new ImmutablePair<>(subPath, pairFunction.apply(new ImmutablePair<>(subPath, conf))),
|
||||
actualParallelism);
|
||||
}
|
||||
} catch (IOException ioe) {
|
||||
throw new HoodieIOException(ioe.getMessage(), ioe);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Deletes a sub-path.
|
||||
*
|
||||
* @param subPathStr sub-path String
|
||||
* @param conf serializable config
|
||||
* @param recursive is recursive or not
|
||||
* @return {@code true} if the sub-path is deleted; {@code false} otherwise.
|
||||
*/
|
||||
public static boolean deleteSubPath(String subPathStr, SerializableConfiguration conf, boolean recursive) {
|
||||
try {
|
||||
Path subPath = new Path(subPathStr);
|
||||
FileSystem fileSystem = subPath.getFileSystem(conf.get());
|
||||
return fileSystem.delete(subPath, recursive);
|
||||
} catch (IOException e) {
|
||||
throw new HoodieIOException(e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
|
||||
public interface SerializableFunction<T, R> extends Function<T, R>, Serializable {
|
||||
}
|
||||
}
|
||||
|
||||
@@ -21,18 +21,23 @@ package org.apache.hudi.common.util;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.Closeable;
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.OutputStream;
|
||||
import java.io.PrintStream;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* Bunch of utility methods for working with files and byte streams.
|
||||
@@ -71,6 +76,20 @@ public class FileIOUtils {
|
||||
return new String(bos.toByteArray(), StandardCharsets.UTF_8);
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads the input stream into String lines.
|
||||
*
|
||||
* @param input {@code InputStream} instance.
|
||||
* @return String lines in a list.
|
||||
*/
|
||||
public static List<String> readAsUTFStringLines(InputStream input) {
|
||||
List<String> lines = new ArrayList<>();
|
||||
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(input, StandardCharsets.UTF_8));
|
||||
lines = bufferedReader.lines().collect(Collectors.toList());
|
||||
closeQuietly(bufferedReader);
|
||||
return lines;
|
||||
}
|
||||
|
||||
public static void copy(InputStream inputStream, OutputStream outputStream) throws IOException {
|
||||
byte[] buffer = new byte[1024];
|
||||
int len;
|
||||
|
||||
@@ -21,9 +21,9 @@ package org.apache.hudi.common.util;
|
||||
|
||||
import org.apache.hudi.common.config.SerializableConfiguration;
|
||||
import org.apache.hudi.common.engine.HoodieEngineContext;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.table.marker.MarkerType;
|
||||
import org.apache.hudi.common.util.collection.ImmutablePair;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
|
||||
@@ -35,20 +35,15 @@ import org.apache.hadoop.fs.Path;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static org.apache.hudi.common.util.FileIOUtils.closeQuietly;
|
||||
|
||||
@@ -178,43 +173,44 @@ public class MarkerUtils {
|
||||
Path dirPath = new Path(markerDir);
|
||||
try {
|
||||
if (fileSystem.exists(dirPath)) {
|
||||
FileStatus[] fileStatuses = fileSystem.listStatus(dirPath);
|
||||
Predicate<FileStatus> prefixFilter = fileStatus ->
|
||||
fileStatus.getPath().getName().startsWith(MARKERS_FILENAME_PREFIX);
|
||||
Predicate<FileStatus> markerTypeFilter = fileStatus ->
|
||||
!fileStatus.getPath().getName().equals(MARKER_TYPE_FILENAME);
|
||||
List<String> markerDirSubPaths = Arrays.stream(fileStatuses)
|
||||
.filter(prefixFilter.and(markerTypeFilter))
|
||||
.map(fileStatus -> fileStatus.getPath().toString())
|
||||
.collect(Collectors.toList());
|
||||
|
||||
if (markerDirSubPaths.size() > 0) {
|
||||
SerializableConfiguration conf = new SerializableConfiguration(fileSystem.getConf());
|
||||
int actualParallelism = Math.min(markerDirSubPaths.size(), parallelism);
|
||||
return context.mapToPair(markerDirSubPaths, markersFilePathStr -> {
|
||||
Path markersFilePath = new Path(markersFilePathStr);
|
||||
FileSystem fs = markersFilePath.getFileSystem(conf.get());
|
||||
FSDataInputStream fsDataInputStream = null;
|
||||
BufferedReader bufferedReader = null;
|
||||
Set<String> markers = new HashSet<>();
|
||||
try {
|
||||
LOG.debug("Read marker file: " + markersFilePathStr);
|
||||
fsDataInputStream = fs.open(markersFilePath);
|
||||
bufferedReader = new BufferedReader(new InputStreamReader(fsDataInputStream, StandardCharsets.UTF_8));
|
||||
markers = bufferedReader.lines().collect(Collectors.toSet());
|
||||
} catch (IOException e) {
|
||||
throw new HoodieIOException("Failed to read file " + markersFilePathStr, e);
|
||||
} finally {
|
||||
closeQuietly(bufferedReader);
|
||||
closeQuietly(fsDataInputStream);
|
||||
}
|
||||
return new ImmutablePair<>(markersFilePathStr, markers);
|
||||
}, actualParallelism);
|
||||
}
|
||||
return FSUtils.parallelizeSubPathProcess(
|
||||
context, fileSystem, dirPath, parallelism, prefixFilter.and(markerTypeFilter),
|
||||
pairOfSubPathAndConf -> {
|
||||
String markersFilePathStr = pairOfSubPathAndConf.getKey();
|
||||
SerializableConfiguration conf = pairOfSubPathAndConf.getValue();
|
||||
return readMarkersFromFile(new Path(markersFilePathStr), conf);
|
||||
});
|
||||
}
|
||||
return new HashMap<>();
|
||||
} catch (IOException ioe) {
|
||||
throw new HoodieIOException(ioe.getMessage(), ioe);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads the markers stored in the underlying file.
|
||||
*
|
||||
* @param markersFilePath file path for the markers
|
||||
* @param conf serializable config
|
||||
* @return markers in a {@code Set} of String.
|
||||
*/
|
||||
public static Set<String> readMarkersFromFile(Path markersFilePath, SerializableConfiguration conf) {
|
||||
FSDataInputStream fsDataInputStream = null;
|
||||
Set<String> markers = new HashSet<>();
|
||||
try {
|
||||
LOG.debug("Read marker file: " + markersFilePath);
|
||||
FileSystem fs = markersFilePath.getFileSystem(conf.get());
|
||||
fsDataInputStream = fs.open(markersFilePath);
|
||||
markers = new HashSet<>(FileIOUtils.readAsUTFStringLines(fsDataInputStream));
|
||||
} catch (IOException e) {
|
||||
throw new HoodieIOException("Failed to read MARKERS file " + markersFilePath, e);
|
||||
} finally {
|
||||
closeQuietly(fsDataInputStream);
|
||||
}
|
||||
return markers;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user