1
0

[HUDI-1144] Speed up Spark read queries by caching the meta client in HoodieROTablePathFilter (#1919)

This commit is contained in:
Balaji Varadarajan
2020-08-05 09:19:10 -07:00
committed by GitHub
parent 7a2429f5ba
commit 9bcd3221fd
2 changed files with 22 additions and 8 deletions

View File

@@ -18,6 +18,8 @@
package org.apache.hudi.hadoop;
import java.util.Map;
import java.util.Set;
import org.apache.hadoop.conf.Configurable;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.common.model.HoodieBaseFile;
@@ -60,12 +62,17 @@ public class HoodieROTablePathFilter implements Configurable, PathFilter, Serial
* It's quite common to have all files from a given partition path be passed into accept(), so cache the check for
* hoodie metadata for known partition paths and the latest versions of files.
*/
private HashMap<String, HashSet<Path>> hoodiePathCache;
private Map<String, HashSet<Path>> hoodiePathCache;
/**
* Paths that are known to be non-hoodie tables.
*/
private HashSet<String> nonHoodiePathCache;
private Set<String> nonHoodiePathCache;
/**
* Cache of table meta clients, keyed by the table base path, so that a meta client is only constructed on a cache miss.
*/
Map<String, HoodieTableMetaClient> metaClientCache;
/**
* Hadoop configurations for the FileSystem.
@@ -82,6 +89,7 @@ public class HoodieROTablePathFilter implements Configurable, PathFilter, Serial
this.hoodiePathCache = new HashMap<>();
this.nonHoodiePathCache = new HashSet<>();
this.conf = new SerializableConfiguration(conf);
this.metaClientCache = new HashMap<>();
}
/**
@@ -149,7 +157,12 @@ public class HoodieROTablePathFilter implements Configurable, PathFilter, Serial
if (baseDir != null) {
try {
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(), baseDir.toString());
HoodieTableMetaClient metaClient = metaClientCache.get(baseDir.toString());
if (null == metaClient) {
metaClient = new HoodieTableMetaClient(fs.getConf(), baseDir.toString(), true);
metaClientCache.put(baseDir.toString(), metaClient);
}
HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient,
metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(), fs.listStatus(folder));
List<HoodieBaseFile> latestFiles = fsView.getLatestBaseFiles().collect(Collectors.toList());

View File

@@ -31,6 +31,7 @@ import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -55,9 +56,9 @@ public class TestHoodieROTablePathFilter extends HoodieCommonTestHarness {
HoodieTestUtils.createDataFile(basePath, "2017/01/01", "001", "f1");
HoodieTestUtils.createDataFile(basePath, "2017/01/01", "001", "f2");
HoodieTestUtils.createDataFile(basePath, "2017/01/01", "001", "f3");
HoodieTestUtils.createDataFile(basePath, "2017/01/02", "001", "f3");
HoodieTestUtils.createDataFile(basePath, "2017/01/01", "002", "f2");
HoodieTestUtils.createDataFile(basePath, "2017/01/01", "003", "f3");
HoodieTestUtils.createDataFile(basePath, "2017/01/02", "003", "f3");
HoodieROTablePathFilter pathFilter = new HoodieROTablePathFilter();
Path partitionPath = new Path("file://" + basePath + File.separator + "2017/01/01");
@@ -68,11 +69,11 @@ public class TestHoodieROTablePathFilter extends HoodieCommonTestHarness {
assertFalse(
pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "001", "f2"))));
assertTrue(
pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "001", "f3"))));
pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/02", "001", "f3"))));
assertTrue(
pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "002", "f2"))));
assertFalse(
pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "003", "f3"))));
pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/02", "003", "f3"))));
assertFalse(pathFilter.accept(new Path("file:///" + HoodieTestUtils.getCommitFilePath(basePath, "001"))));
assertFalse(pathFilter.accept(new Path("file:///" + HoodieTestUtils.getCommitFilePath(basePath, "002"))));
assertFalse(pathFilter.accept(new Path("file:///" + HoodieTestUtils.getInflightCommitFilePath(basePath, "003"))));
@@ -83,7 +84,7 @@ public class TestHoodieROTablePathFilter extends HoodieCommonTestHarness {
assertFalse(
pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "003", "f3"))));
assertEquals(1, pathFilter.metaClientCache.size());
}
@Test