[HUDI-1144] Speedup spark read queries by caching metaclient in HoodieROPathFilter (#1919)
This commit is contained in:
committed by
GitHub
parent
7a2429f5ba
commit
9bcd3221fd
@@ -18,6 +18,8 @@
|
||||
|
||||
package org.apache.hudi.hadoop;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import org.apache.hadoop.conf.Configurable;
|
||||
import org.apache.hudi.common.config.SerializableConfiguration;
|
||||
import org.apache.hudi.common.model.HoodieBaseFile;
|
||||
@@ -60,12 +62,17 @@ public class HoodieROTablePathFilter implements Configurable, PathFilter, Serial
|
||||
* It's quite common to have all files from a given partition path passed into accept(), so cache the check for hoodie
|
||||
* metadata for known partition paths and the latest versions of files.
|
||||
*/
|
||||
private HashMap<String, HashSet<Path>> hoodiePathCache;
|
||||
private Map<String, HashSet<Path>> hoodiePathCache;
|
||||
|
||||
/**
|
||||
* Paths that are known to be non-hoodie tables.
|
||||
*/
|
||||
private HashSet<String> nonHoodiePathCache;
|
||||
private Set<String> nonHoodiePathCache;
|
||||
|
||||
/**
|
||||
* Cache of table meta clients, keyed by the table base path string, so that an already-built client is reused.
|
||||
*/
|
||||
Map<String, HoodieTableMetaClient> metaClientCache;
|
||||
|
||||
/**
|
||||
* Hadoop configurations for the FileSystem.
|
||||
@@ -82,6 +89,7 @@ public class HoodieROTablePathFilter implements Configurable, PathFilter, Serial
|
||||
this.hoodiePathCache = new HashMap<>();
|
||||
this.nonHoodiePathCache = new HashSet<>();
|
||||
this.conf = new SerializableConfiguration(conf);
|
||||
this.metaClientCache = new HashMap<>();
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -149,7 +157,12 @@ public class HoodieROTablePathFilter implements Configurable, PathFilter, Serial
|
||||
|
||||
if (baseDir != null) {
|
||||
try {
|
||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(), baseDir.toString());
|
||||
HoodieTableMetaClient metaClient = metaClientCache.get(baseDir.toString());
|
||||
if (null == metaClient) {
|
||||
metaClient = new HoodieTableMetaClient(fs.getConf(), baseDir.toString(), true);
|
||||
metaClientCache.put(baseDir.toString(), metaClient);
|
||||
}
|
||||
|
||||
HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient,
|
||||
metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(), fs.listStatus(folder));
|
||||
List<HoodieBaseFile> latestFiles = fsView.getLatestBaseFiles().collect(Collectors.toList());
|
||||
|
||||
@@ -31,6 +31,7 @@ import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
@@ -55,9 +56,9 @@ public class TestHoodieROTablePathFilter extends HoodieCommonTestHarness {
|
||||
|
||||
HoodieTestUtils.createDataFile(basePath, "2017/01/01", "001", "f1");
|
||||
HoodieTestUtils.createDataFile(basePath, "2017/01/01", "001", "f2");
|
||||
HoodieTestUtils.createDataFile(basePath, "2017/01/01", "001", "f3");
|
||||
HoodieTestUtils.createDataFile(basePath, "2017/01/02", "001", "f3");
|
||||
HoodieTestUtils.createDataFile(basePath, "2017/01/01", "002", "f2");
|
||||
HoodieTestUtils.createDataFile(basePath, "2017/01/01", "003", "f3");
|
||||
HoodieTestUtils.createDataFile(basePath, "2017/01/02", "003", "f3");
|
||||
|
||||
HoodieROTablePathFilter pathFilter = new HoodieROTablePathFilter();
|
||||
Path partitionPath = new Path("file://" + basePath + File.separator + "2017/01/01");
|
||||
@@ -68,11 +69,11 @@ public class TestHoodieROTablePathFilter extends HoodieCommonTestHarness {
|
||||
assertFalse(
|
||||
pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "001", "f2"))));
|
||||
assertTrue(
|
||||
pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "001", "f3"))));
|
||||
pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/02", "001", "f3"))));
|
||||
assertTrue(
|
||||
pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "002", "f2"))));
|
||||
assertFalse(
|
||||
pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "003", "f3"))));
|
||||
pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/02", "003", "f3"))));
|
||||
assertFalse(pathFilter.accept(new Path("file:///" + HoodieTestUtils.getCommitFilePath(basePath, "001"))));
|
||||
assertFalse(pathFilter.accept(new Path("file:///" + HoodieTestUtils.getCommitFilePath(basePath, "002"))));
|
||||
assertFalse(pathFilter.accept(new Path("file:///" + HoodieTestUtils.getInflightCommitFilePath(basePath, "003"))));
|
||||
@@ -83,7 +84,7 @@ public class TestHoodieROTablePathFilter extends HoodieCommonTestHarness {
|
||||
|
||||
assertFalse(
|
||||
pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "003", "f3"))));
|
||||
|
||||
assertEquals(1, pathFilter.metaClientCache.size());
|
||||
}
|
||||
|
||||
@Test
|
||||
|
||||
Reference in New Issue
Block a user