1
0

HUDI-135 - Skip Meta folder when looking for partitions

This commit is contained in:
Balaji Varadarajan
2019-05-28 12:54:23 -07:00
committed by vinoth chandar
parent 33f5208c1e
commit 93f8f12a30
3 changed files with 126 additions and 21 deletions

View File

@@ -77,7 +77,7 @@ public class RepairsCommand implements CommandMarker {
String latestCommit = HoodieCLI.tableMetadata.getActiveTimeline().getCommitTimeline().lastInstant().get() String latestCommit = HoodieCLI.tableMetadata.getActiveTimeline().getCommitTimeline().lastInstant().get()
.getTimestamp(); .getTimestamp();
List<String> partitionPaths = FSUtils.getAllFoldersThreeLevelsDown(HoodieCLI.fs, List<String> partitionPaths = FSUtils.getAllPartitionFoldersThreeLevelsDown(HoodieCLI.fs,
HoodieCLI.tableMetadata.getBasePath()); HoodieCLI.tableMetadata.getBasePath());
Path basePath = new Path(HoodieCLI.tableMetadata.getBasePath()); Path basePath = new Path(HoodieCLI.tableMetadata.getBasePath());
String[][] rows = new String[partitionPaths.size() + 1][]; String[][] rows = new String[partitionPaths.size() + 1][];

View File

@@ -18,6 +18,7 @@ package com.uber.hoodie.common.util;
import static com.uber.hoodie.common.table.HoodieTableMetaClient.MARKER_EXTN; import static com.uber.hoodie.common.table.HoodieTableMetaClient.MARKER_EXTN;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions; import com.google.common.base.Preconditions;
import com.uber.hoodie.common.model.HoodieFileFormat; import com.uber.hoodie.common.model.HoodieFileFormat;
import com.uber.hoodie.common.model.HoodieLogFile; import com.uber.hoodie.common.model.HoodieLogFile;
@@ -67,6 +68,12 @@ public class FSUtils {
private static final long MIN_ROLLBACK_TO_KEEP = 10; private static final long MIN_ROLLBACK_TO_KEEP = 10;
private static final String HOODIE_ENV_PROPS_PREFIX = "HOODIE_ENV_"; private static final String HOODIE_ENV_PROPS_PREFIX = "HOODIE_ENV_";
private static final PathFilter ALLOW_ALL_FILTER = new PathFilter() {
@Override
public boolean accept(Path file) {
return true;
}
};
public static Configuration prepareHadoopConf(Configuration conf) { public static Configuration prepareHadoopConf(Configuration conf) {
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
@@ -152,16 +159,11 @@ public class FSUtils {
/** /**
* Gets all partition paths assuming date partitioning (year, month, day) three levels down. * Gets all partition paths assuming date partitioning (year, month, day) three levels down.
*/ */
public static List<String> getAllFoldersThreeLevelsDown(FileSystem fs, String basePath) public static List<String> getAllPartitionFoldersThreeLevelsDown(FileSystem fs, String basePath)
throws IOException { throws IOException {
List<String> datePartitions = new ArrayList<>(); List<String> datePartitions = new ArrayList<>();
// Avoid listing and including any folders under the metafolder // Avoid listing and including any folders under the metafolder
PathFilter filter = (path) -> { PathFilter filter = getExcludeMetaPathFilter();
if (path.toString().contains(HoodieTableMetaClient.METAFOLDER_NAME)) {
return false;
}
return true;
};
FileStatus[] folders = fs.globStatus(new Path(basePath + "/*/*/*"), filter); FileStatus[] folders = fs.globStatus(new Path(basePath + "/*/*/*"), filter);
for (FileStatus status : folders) { for (FileStatus status : folders) {
Path path = status.getPath(); Path path = status.getPath();
@@ -201,31 +203,53 @@ public class FSUtils {
partitions.add(getRelativePartitionPath(basePath, filePath.getParent())); partitions.add(getRelativePartitionPath(basePath, filePath.getParent()));
} }
return true; return true;
}); }, true);
return partitions; return partitions;
} }
public static final List<String> getAllDataFilesForMarkers(FileSystem fs, String basePath, String instantTs, public static final List<String> getAllDataFilesForMarkers(FileSystem fs, String basePath, String instantTs,
String markerDir) throws IOException { String markerDir) throws IOException {
List<String> dataFiles = new LinkedList<>(); List<String> dataFiles = new LinkedList<>();
FSUtils.processFiles(fs, markerDir, (status) -> { processFiles(fs, markerDir, (status) -> {
String pathStr = status.getPath().toString(); String pathStr = status.getPath().toString();
if (pathStr.endsWith(MARKER_EXTN)) { if (pathStr.endsWith(MARKER_EXTN)) {
dataFiles.add(FSUtils.translateMarkerToDataPath(basePath, pathStr, instantTs)); dataFiles.add(FSUtils.translateMarkerToDataPath(basePath, pathStr, instantTs));
} }
return true; return true;
}); }, false);
return dataFiles; return dataFiles;
} }
private static final void processFiles(FileSystem fs, String basePathStr, /**
Function<LocatedFileStatus, Boolean> consumer) throws IOException { * Recursively processes all files in the base-path. If excludeMetaFolder is set, the meta-folder and all its
RemoteIterator<LocatedFileStatus> allFiles = fs.listFiles(new Path(basePathStr), true); * subdirs are skipped
while (allFiles.hasNext()) { * @param fs File System
LocatedFileStatus status = allFiles.next(); * @param basePathStr Base-Path
boolean success = consumer.apply(status); * @param consumer Callback for processing
if (!success) { * @param excludeMetaFolder Exclude .hoodie folder
throw new HoodieException("Failed to process file-status=" + status); * @throws IOException
*/
@VisibleForTesting
static void processFiles(FileSystem fs, String basePathStr,
Function<FileStatus, Boolean> consumer, boolean excludeMetaFolder) throws IOException {
PathFilter pathFilter = excludeMetaFolder ? getExcludeMetaPathFilter() : ALLOW_ALL_FILTER;
FileStatus[] topLevelStatuses = fs.listStatus(new Path(basePathStr));
for (int i = 0; i < topLevelStatuses.length; i++) {
FileStatus child = topLevelStatuses[i];
if (child.isFile()) {
boolean success = consumer.apply(child);
if (!success) {
throw new HoodieException("Failed to process file-status=" + child);
}
} else if (pathFilter.accept(child.getPath())) {
RemoteIterator<LocatedFileStatus> itr = fs.listFiles(child.getPath(), true);
while (itr.hasNext()) {
FileStatus status = itr.next();
boolean success = consumer.apply(status);
if (!success) {
throw new HoodieException("Failed to process file-status=" + status);
}
}
} }
} }
} }
@@ -234,7 +258,7 @@ public class FSUtils {
boolean assumeDatePartitioning) boolean assumeDatePartitioning)
throws IOException { throws IOException {
if (assumeDatePartitioning) { if (assumeDatePartitioning) {
return getAllFoldersThreeLevelsDown(fs, basePathStr); return getAllPartitionFoldersThreeLevelsDown(fs, basePathStr);
} else { } else {
return getAllFoldersWithPartitionMetaFile(fs, basePathStr); return getAllFoldersWithPartitionMetaFile(fs, basePathStr);
} }
@@ -247,6 +271,16 @@ public class FSUtils {
return dotIndex == -1 ? "" : fileName.substring(dotIndex); return dotIndex == -1 ? "" : fileName.substring(dotIndex);
} }
private static PathFilter getExcludeMetaPathFilter() {
// Avoid listing and including any folders under the metafolder
return (path) -> {
if (path.toString().contains(HoodieTableMetaClient.METAFOLDER_NAME)) {
return false;
}
return true;
};
}
public static String getInstantTime(String name) { public static String getInstantTime(String name) {
return name.replace(getFileExtension(name), ""); return name.replace(getFileExtension(name), "");
} }
@@ -453,7 +487,6 @@ public class FSUtils {
Thread.sleep(1000); Thread.sleep(1000);
} }
return recovered; return recovered;
} }
public static void deleteOlderCleanMetaFiles(FileSystem fs, String metaPath, public static void deleteOlderCleanMetaFiles(FileSystem fs, String metaPath,

View File

@@ -20,8 +20,14 @@ import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue; import static org.junit.Assert.assertTrue;
import com.uber.hoodie.common.model.HoodieTestUtils; import com.uber.hoodie.common.model.HoodieTestUtils;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.exception.HoodieException;
import java.io.IOException;
import java.text.SimpleDateFormat; import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date; import java.util.Date;
import java.util.List;
import java.util.UUID; import java.util.UUID;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
@@ -30,6 +36,7 @@ import org.junit.Assert;
import org.junit.Rule; import org.junit.Rule;
import org.junit.Test; import org.junit.Test;
import org.junit.contrib.java.lang.system.EnvironmentVariables; import org.junit.contrib.java.lang.system.EnvironmentVariables;
import org.junit.rules.TemporaryFolder;
public class TestFSUtils { public class TestFSUtils {
@@ -55,6 +62,71 @@ public class TestFSUtils {
.equals("*_" + taskPartitionId + "_" + commitTime + ".parquet")); .equals("*_" + taskPartitionId + "_" + commitTime + ".parquet"));
} }
@Test
/**
* Tests if process Files return only paths excluding marker directories
* Cleaner, Rollback and compaction-scheduling logic was recursively processing all subfolders including that
* of ".hoodie" when looking for partition-paths. This causes a race when they try to list all folders (recursively)
* but the marker directory (that of compaction inside of ".hoodie" folder) is deleted underneath by compactor.
* This code tests the fix by ensuring ".hoodie" and their subfolders are never processed.
*/
public void testProcessFiles() throws Exception {
TemporaryFolder tmpFolder = new TemporaryFolder();
tmpFolder.create();
// All directories including marker dirs.
List<String> folders = Arrays.asList("2016/04/15", "2016/05/16", ".hoodie/.temp/2/2016/04/15",
".hoodie/.temp/2/2016/05/16");
HoodieTableMetaClient metaClient = HoodieTestUtils.init(tmpFolder.getRoot().getAbsolutePath());
String basePath = metaClient.getBasePath();
folders.stream().forEach(f -> {
try {
metaClient.getFs().mkdirs(new Path(new Path(basePath), f));
} catch (IOException e) {
throw new HoodieException(e);
}
});
// Files inside partitions and marker directories
List<String> files = Arrays.asList(
"2016/04/15/1_1-0-1_20190528120000.parquet",
"2016/05/16/2_1-0-1_20190528120000.parquet",
".hoodie/.temp/2/2016/05/16/2_1-0-1_20190528120000.parquet",
".hoodie/.temp/2/2016/04/15/1_1-0-1_20190528120000.parquet"
);
files.stream().forEach(f -> {
try {
metaClient.getFs().create(new Path(new Path(basePath), f));
} catch (IOException e) {
throw new HoodieException(e);
}
});
// Test excluding meta-folder
final List<String> collected = new ArrayList<>();
FSUtils.processFiles(metaClient.getFs(), basePath, (status) -> {
collected.add(status.getPath().toString());
return true;
}, true);
Assert.assertTrue("Hoodie MetaFolder MUST be skipped but got :" + collected, collected.stream()
.noneMatch(s -> s.contains(HoodieTableMetaClient.METAFOLDER_NAME)));
// Check if only files are listed
Assert.assertEquals(2, collected.size());
// Test including meta-folder
final List<String> collected2 = new ArrayList<>();
FSUtils.processFiles(metaClient.getFs(), basePath, (status) -> {
collected2.add(status.getPath().toString());
return true;
}, false);
Assert.assertFalse("Hoodie MetaFolder will be present :" + collected2, collected2.stream()
.noneMatch(s -> s.contains(HoodieTableMetaClient.METAFOLDER_NAME)));
// Check if only files are listed including hoodie.properties
Assert.assertEquals("Collected=" + collected2, 5, collected2.size());
}
@Test @Test
public void testGetCommitTime() { public void testGetCommitTime() {
String commitTime = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date()); String commitTime = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date());