
FSUtils.getAllPartitionPaths() works based on .hoodie_partition_metadata

- clean/rollback/write paths covered by existing tests
- Snapshot copier fixed to also copy the metadata file, and its test fixed
- Existing tables need to be repaired by adding the metadata file, before this can be rolled out
Vinoth Chandar
2017-03-26 17:15:52 -07:00
committed by vinoth chandar
parent 3129770fd0
commit f9fd16069d
9 changed files with 214 additions and 207 deletions
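The last bullet merits a sketch: a one-off repair pass (not part of this commit) could stamp the partition metafile into every existing date partition so the new lookup can find it. The property key names below are assumptions; the real constants live in HoodiePartitionMetadata:

import com.uber.hoodie.common.util.FSUtils;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.io.IOException;
import java.util.Properties;

public class PartitionMetadataRepair {
  // Hypothetical one-off repair: write .hoodie_partition_metadata into every
  // existing date partition so getAllPartitionPaths() can discover it.
  public static void repair(FileSystem fs, String basePath, String commitTime) throws IOException {
    for (String partition : FSUtils.getAllFoldersThreeLevelsDown(fs, basePath)) {
      Path metafile = new Path(basePath + "/" + partition + "/.hoodie_partition_metadata");
      if (fs.exists(metafile)) {
        continue; // already repaired
      }
      Properties props = new Properties();
      props.setProperty("commitTime", commitTime); // assumed key name
      props.setProperty("partitionDepth", "3");    // assumed key name (PARTITION_DEPTH_KEY)
      try (FSDataOutputStream out = fs.create(metafile)) {
        props.store(out, "partition metadata");
      }
    }
  }
}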


@@ -71,7 +71,7 @@ public class HoodiePartitionMetadata {
}
public int getPartitionDepth() {
if (!props.contains(PARTITION_DEPTH_KEY)) {
if (!props.containsKey(PARTITION_DEPTH_KEY)) {
throw new HoodieException("Could not find partitionDepth in partition metafile");
}
return Integer.parseInt(props.getProperty(PARTITION_DEPTH_KEY));
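The one-line fix above matters because java.util.Properties extends Hashtable, and Hashtable.contains(Object) searches values, not keys, so the old check could never find PARTITION_DEPTH_KEY. A minimal sketch of the difference:

import java.util.Properties;

public class ContainsVsContainsKey {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("partitionDepth", "3");

    // Hashtable.contains() searches values, so a key lookup this way fails.
    System.out.println(props.contains("partitionDepth"));    // false
    // containsKey() is the correct membership test for a property key.
    System.out.println(props.containsKey("partitionDepth")); // true
  }
}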


@@ -19,6 +19,7 @@ package com.uber.hoodie.common.util;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.uber.hoodie.avro.model.HoodieCleanMetadata;
import com.uber.hoodie.avro.model.HoodieCleanPartitionMetadata;
import com.uber.hoodie.avro.model.HoodieRollbackMetadata;
@@ -28,10 +29,10 @@ import com.uber.hoodie.avro.model.HoodieSavepointPartitionMetadata;
import com.uber.hoodie.common.HoodieCleanStat;
import com.uber.hoodie.common.HoodieRollbackStat;
import com.uber.hoodie.common.model.HoodieAvroPayload;
import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.exception.HoodieIOException;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.DataFileWriter;
@@ -40,29 +41,20 @@ import org.apache.avro.file.SeekableByteArrayInput;
import org.apache.avro.file.SeekableInput;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.BinaryDecoder;
import org.apache.avro.io.BinaryEncoder;
import org.apache.avro.io.DatumReader;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.io.Decoder;
import org.apache.avro.io.DecoderFactory;
import org.apache.avro.io.EncoderFactory;
import org.apache.avro.mapred.FsInput;
import org.apache.avro.specific.SpecificDatumReader;
import org.apache.avro.specific.SpecificDatumWriter;
import org.apache.avro.specific.SpecificRecordBase;
import org.apache.hadoop.fs.AvroFSInput;
import org.apache.hadoop.fs.FileContext;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.io.ByteArrayOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
public class AvroUtils {


@@ -18,14 +18,19 @@ package com.uber.hoodie.common.util;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.uber.hoodie.common.model.HoodiePartitionMetadata;
import com.uber.hoodie.common.table.log.HoodieLogFile;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.exception.HoodieIOException;
import com.uber.hoodie.exception.InvalidHoodiePathException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
@@ -75,7 +80,7 @@ public class FSUtils {
fs = FileSystem.get(conf);
} catch (IOException e) {
throw new HoodieIOException("Failed to get instance of " + FileSystem.class.getName(),
e);
}
LOG.info(String.format("Hadoop Configuration: fs.defaultFS: [%s], Config:[%s], FileSystem: [%s]",
conf.getRaw("fs.defaultFS"), conf.toString(), fs.toString()));
@@ -111,25 +116,43 @@ public class FSUtils {
return fs.listStatus(path)[0].getLen();
}
// TODO (weiy): rename the function for better readability
public static String getFileId(String fullFileName) {
return fullFileName.split("_")[0];
}
/**
* Obtain all the partition paths, that are present in this table.
* Gets all partition paths assuming date partitioning (year, month, day) three levels down.
*/
public static List<String> getAllPartitionPaths(FileSystem fs, String basePath)
throws IOException {
List<String> partitionsToClean = new ArrayList<>();
// TODO(vc): For now, assume partitions are two levels down from base path.
public static List<String> getAllFoldersThreeLevelsDown(FileSystem fs, String basePath) throws IOException {
List<String> datePartitions = new ArrayList<>();
FileStatus[] folders = fs.globStatus(new Path(basePath + "/*/*/*"));
for (FileStatus status : folders) {
Path path = status.getPath();
partitionsToClean.add(String.format("%s/%s/%s", path.getParent().getParent().getName(),
path.getParent().getName(), path.getName()));
datePartitions.add(String.format("%s/%s/%s", path.getParent().getParent().getName(),
path.getParent().getName(), path.getName()));
}
return partitionsToClean;
return datePartitions;
}
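For a quick sense of what this helper returns, a usage sketch against a hypothetical local layout (the folders must already exist):

import com.uber.hoodie.common.util.FSUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import java.util.List;

public class ListDatePartitions {
  public static void main(String[] args) throws Exception {
    // Hypothetical table at /tmp/hoodie containing 2017/03/25 and 2017/03/26
    FileSystem fs = FileSystem.getLocal(new Configuration());
    List<String> partitions = FSUtils.getAllFoldersThreeLevelsDown(fs, "/tmp/hoodie");
    System.out.println(partitions); // e.g. [2017/03/25, 2017/03/26]
  }
}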
/**
* Obtain all the partition paths that are present in this table, denoted by the presence of {@link
* com.uber.hoodie.common.model.HoodiePartitionMetadata#HOODIE_PARTITION_METAFILE}
*/
public static List<String> getAllPartitionPaths(FileSystem fs, String basePathStr)
throws IOException {
List<String> partitions = new ArrayList<>();
Path basePath = new Path(basePathStr);
RemoteIterator<LocatedFileStatus> allFiles = fs.listFiles(new Path(basePathStr), true);
while (allFiles.hasNext()) {
Path filePath = allFiles.next().getPath();
if (filePath.getName().equals(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE)) {
String partitionFullPath = filePath.getParent().toString();
int partitionStartIndex = partitionFullPath.lastIndexOf(basePath.getName());
partitions.add(partitionFullPath.substring(partitionStartIndex + basePath.getName().length() + 1));
}
}
return partitions;
}
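To make the substring arithmetic above concrete: with a base path ending in trips and a metafile under trips/2017/03/26, lastIndexOf of the base folder name, plus its length, plus one, skips past "trips/" and leaves the relative partition path. A standalone sketch of the same computation:

public class RelativePartitionPath {
  public static void main(String[] args) {
    String baseName = "trips"; // new Path(basePath).getName()
    String partitionFullPath = "hdfs://namenode:8020/data/trips/2017/03/26"; // metafile parent

    int partitionStartIndex = partitionFullPath.lastIndexOf(baseName);
    // Skip past "trips" and the trailing '/' to get the relative partition path.
    System.out.println(partitionFullPath.substring(partitionStartIndex + baseName.length() + 1));
    // prints: 2017/03/26
  }
}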
public static String getFileExtension(String fullName) {
@@ -146,42 +169,34 @@ public class FSUtils {
/**
* Get the file extension from the log file
* @param logPath
* @return
*/
public static String getFileExtensionFromLog(Path logPath) {
Matcher matcher = LOG_FILE_PATTERN.matcher(logPath.getName());
if(!matcher.find()) {
if (!matcher.find()) {
throw new InvalidHoodiePathException(logPath, "LogFile");
}
return matcher.group(3) + "." + matcher.group(4);
}
/**
* Get the first part of the file name in the log file. That will be the fileId.
* Log file do not have commitTime in the file name.
*
* @param path
* @return
* Get the first part of the file name in the log file. That will be the fileId. Log files do not
* have commitTime in the file name.
*/
public static String getFileIdFromLogPath(Path path) {
Matcher matcher = LOG_FILE_PATTERN.matcher(path.getName());
if(!matcher.find()) {
if (!matcher.find()) {
throw new InvalidHoodiePathException(path, "LogFile");
}
return matcher.group(1);
}
/**
* Get the first part of the file name in the log file. That will be the fileId.
* Log file do not have commitTime in the file name.
*
* @param path
* @return
* Get the second part of the file name in the log file. That will be the baseCommitTime.
*/
public static String getBaseCommitTimeFromLogPath(Path path) {
Matcher matcher = LOG_FILE_PATTERN.matcher(path.getName());
if(!matcher.find()) {
if (!matcher.find()) {
throw new InvalidHoodiePathException(path, "LogFile");
}
return matcher.group(2);
@@ -189,20 +204,17 @@ public class FSUtils {
/**
* Get the last part of the file name in the log file and convert to int.
*
* @param logPath
* @return
*/
public static int getFileVersionFromLog(Path logPath) {
Matcher matcher = LOG_FILE_PATTERN.matcher(logPath.getName());
if(!matcher.find()) {
if (!matcher.find()) {
throw new InvalidHoodiePathException(logPath, "LogFile");
}
return Integer.parseInt(matcher.group(5));
}
public static String makeLogFileName(String fileId, String logFileExtension,
String baseCommitTime, int version) {
return String.format("%s_%s%s.%d", fileId, baseCommitTime, logFileExtension, version);
}
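For reference, the "%s_%s%s.%d" format above produces names like the following (values illustrative):

public class LogFileNameDemo {
  public static void main(String[] args) {
    // fileId, baseCommitTime, logFileExtension, version
    String name = String.format("%s_%s%s.%d", "fileId1", "20170326171552", ".log", 1);
    System.out.println(name); // fileId1_20170326171552.log.1
  }
}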
@@ -213,46 +225,30 @@ public class FSUtils {
/**
* Get the latest log file written from the list of log files passed in
*
* @param logFiles
* @return
*/
public static Optional<HoodieLogFile> getLatestLogFile(Stream<HoodieLogFile> logFiles) {
return logFiles.sorted(Comparator
.comparing(s -> s.getLogVersion(),
Comparator.reverseOrder())).findFirst();
}
/**
* Get all the log files for the passed in FileId in the partition path
*
* @param fs
* @param partitionPath
* @param fileId
* @param logFileExtension
* @return
*/
public static Stream<HoodieLogFile> getAllLogFiles(FileSystem fs, Path partitionPath,
final String fileId, final String logFileExtension, final String baseCommitTime) throws IOException {
return Arrays.stream(fs.listStatus(partitionPath,
path -> path.getName().startsWith(fileId) && path.getName().contains(logFileExtension)))
.map(HoodieLogFile::new).filter(s -> s.getBaseCommitTime().equals(baseCommitTime));
}
/**
* Get the latest log version for the fileId in the partition path
*
* @param fs
* @param partitionPath
* @param fileId
* @param logFileExtension
* @return
* @throws IOException
*/
public static Optional<Integer> getLatestLogVersion(FileSystem fs, Path partitionPath,
final String fileId, final String logFileExtension, final String baseCommitTime) throws IOException {
Optional<HoodieLogFile> latestLogFile =
getLatestLogFile(getAllLogFiles(fs, partitionPath, fileId, logFileExtension, baseCommitTime));
if (latestLogFile.isPresent()) {
return Optional.of(latestLogFile.get().getLogVersion());
}
@@ -260,26 +256,20 @@ public class FSUtils {
}
public static int getCurrentLogVersion(FileSystem fs, Path partitionPath,
final String fileId, final String logFileExtension, final String baseCommitTime) throws IOException {
Optional<Integer> currentVersion =
getLatestLogVersion(fs, partitionPath, fileId, logFileExtension, baseCommitTime);
// handle potential overflow
return (currentVersion.isPresent()) ? currentVersion.get() : 1;
}
/**
* computes the next log version for the specified fileId in the partition path
*
* @param fs
* @param partitionPath
* @param fileId
* @return
* @throws IOException
*/
public static int computeNextLogVersion(FileSystem fs, Path partitionPath, final String fileId,
final String logFileExtension, final String baseCommitTime) throws IOException {
Optional<Integer> currentVersion =
getLatestLogVersion(fs, partitionPath, fileId, logFileExtension, baseCommitTime);
// handle potential overflow
return (currentVersion.isPresent()) ? currentVersion.get() + 1 : 1;
}
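A small end-to-end sketch of the two helpers above, assuming names produced by makeLogFileName (paths and values are illustrative):

import com.uber.hoodie.common.util.FSUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class NextLogVersionDemo {
  public static void main(String[] args) throws Exception {
    FileSystem fs = FileSystem.getLocal(new Configuration());
    Path partition = new Path("/tmp/hoodie/2017/03/26");
    fs.mkdirs(partition);
    // Two existing log files for the same file group, base commit "100".
    fs.create(new Path(partition, "fileId1_100.log.1")).close();
    fs.create(new Path(partition, "fileId1_100.log.2")).close();

    // Latest on disk is version 2, so the next version to write should be 3.
    System.out.println(FSUtils.computeNextLogVersion(fs, partition, "fileId1", ".log", "100"));
  }
}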
@@ -297,17 +287,13 @@ public class FSUtils {
}
/**
* When a file was opened and the task died without closing the stream, another task executor cannot open because the existing lease will be active.
* We will try to recover the lease, from HDFS. If a data node went down, it takes about 10 minutes for the lease to be rocovered.
* But if the client dies, this should be instant.
*
* @param dfs
* @param p
* @return
* @throws IOException
* When a file was opened and the task died without closing the stream, another task executor
* cannot open because the existing lease will be active. We will try to recover the lease from
* HDFS. If a data node went down, it takes about 10 minutes for the lease to be recovered. But
* if the client dies, this should be instant.
*/
public static boolean recoverDFSFileLease(final DistributedFileSystem dfs, final Path p)
throws IOException, InterruptedException {
LOG.info("Recover lease on dfs file " + p);
// initiate the recovery
boolean recovered = false;
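The rest of the body is elided by this hunk; for context, a rough sketch of the polling loop such recovery typically performs, assuming the standard DistributedFileSystem recoverLease and isFileClosed client calls:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DistributedFileSystem;

public class LeaseRecoverySketch {
  // Ask the NameNode to recover the lease, then poll until the file is closed.
  public static boolean recover(DistributedFileSystem dfs, Path p) throws Exception {
    boolean recovered = dfs.recoverLease(p);
    while (!recovered) {
      Thread.sleep(1000); // instant if the old client died; minutes if a datanode did
      recovered = dfs.isFileClosed(p);
    }
    return recovered;
  }
}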
@@ -324,7 +310,7 @@ public class FSUtils {
}
public static void deleteOlderCleanMetaFiles(FileSystem fs, String metaPath,
Stream<HoodieInstant> instants) {
//TODO - this should be archived when archival is made general for all meta-data
// skip MIN_CLEAN_TO_KEEP and delete rest
instants.skip(MIN_CLEAN_TO_KEEP).map(s -> {
@@ -332,13 +318,13 @@ public class FSUtils {
return fs.delete(new Path(metaPath, s.getFileName()), false);
} catch (IOException e) {
throw new HoodieIOException("Could not delete clean meta files" + s.getFileName(),
e);
}
});
}
public static void deleteOlderRollbackMetaFiles(FileSystem fs, String metaPath,
Stream<HoodieInstant> instants) {
//TODO - this should be archived when archival is made general for all meta-data
// skip MIN_ROLLBACK_TO_KEEP and delete rest
instants.skip(MIN_ROLLBACK_TO_KEEP).map(s -> {
@@ -346,7 +332,7 @@ public class FSUtils {
return fs.delete(new Path(metaPath, s.getFileName()), false);
} catch (IOException e) {
throw new HoodieIOException(
"Could not delete rollback meta files " + s.getFileName(), e);
"Could not delete rollback meta files " + s.getFileName(), e);
}
});
}
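The skip-then-delete idiom above retains only the newest MIN_*_TO_KEEP meta files; a minimal standalone sketch of the same pattern, assuming the instants stream is ordered newest-first:

import java.util.stream.Stream;

public class KeepNewestN {
  public static void main(String[] args) {
    int minToKeep = 2; // stand-in for MIN_CLEAN_TO_KEEP / MIN_ROLLBACK_TO_KEEP

    // Newest-first, mirroring how the instants stream is passed in above.
    Stream.of("clean.5", "clean.4", "clean.3", "clean.2", "clean.1")
        .skip(minToKeep)                                  // keep the newest N
        .forEach(f -> System.out.println("delete " + f)); // delete everything older
  }
}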