1
0

[HUDI-1144] Speed up Spark read queries by caching the meta client in HoodieROTablePathFilter (#1919)

This commit is contained in:
Balaji Varadarajan
2020-08-05 09:19:10 -07:00
committed by GitHub
parent 7a2429f5ba
commit 9bcd3221fd
2 changed files with 22 additions and 8 deletions

View File

@@ -18,6 +18,8 @@
package org.apache.hudi.hadoop; package org.apache.hudi.hadoop;
import java.util.Map;
import java.util.Set;
import org.apache.hadoop.conf.Configurable; import org.apache.hadoop.conf.Configurable;
import org.apache.hudi.common.config.SerializableConfiguration; import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieBaseFile;
@@ -60,12 +62,17 @@ public class HoodieROTablePathFilter implements Configurable, PathFilter, Serial
* It's quite common to have all files from a given partition path passed into accept(), so cache the check for hoodie * It's quite common to have all files from a given partition path passed into accept(), so cache the check for hoodie
* metadata for known partition paths and the latest versions of files. * metadata for known partition paths and the latest versions of files.
*/ */
private HashMap<String, HashSet<Path>> hoodiePathCache; private Map<String, HashSet<Path>> hoodiePathCache;
/** /**
* Paths that are known to be non-hoodie tables. * Paths that are known to be non-hoodie tables.
*/ */
private HashSet<String> nonHoodiePathCache; private Set<String> nonHoodiePathCache;
/**
* Table Meta Client Cache.
*/
Map<String, HoodieTableMetaClient> metaClientCache;
/** /**
* Hadoop configurations for the FileSystem. * Hadoop configurations for the FileSystem.
@@ -82,6 +89,7 @@ public class HoodieROTablePathFilter implements Configurable, PathFilter, Serial
this.hoodiePathCache = new HashMap<>(); this.hoodiePathCache = new HashMap<>();
this.nonHoodiePathCache = new HashSet<>(); this.nonHoodiePathCache = new HashSet<>();
this.conf = new SerializableConfiguration(conf); this.conf = new SerializableConfiguration(conf);
this.metaClientCache = new HashMap<>();
} }
/** /**
@@ -149,7 +157,12 @@ public class HoodieROTablePathFilter implements Configurable, PathFilter, Serial
if (baseDir != null) { if (baseDir != null) {
try { try {
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(), baseDir.toString()); HoodieTableMetaClient metaClient = metaClientCache.get(baseDir.toString());
if (null == metaClient) {
metaClient = new HoodieTableMetaClient(fs.getConf(), baseDir.toString(), true);
metaClientCache.put(baseDir.toString(), metaClient);
}
HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient,
metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(), fs.listStatus(folder)); metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(), fs.listStatus(folder));
List<HoodieBaseFile> latestFiles = fsView.getLatestBaseFiles().collect(Collectors.toList()); List<HoodieBaseFile> latestFiles = fsView.getLatestBaseFiles().collect(Collectors.toList());

View File

@@ -31,6 +31,7 @@ import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -55,9 +56,9 @@ public class TestHoodieROTablePathFilter extends HoodieCommonTestHarness {
HoodieTestUtils.createDataFile(basePath, "2017/01/01", "001", "f1"); HoodieTestUtils.createDataFile(basePath, "2017/01/01", "001", "f1");
HoodieTestUtils.createDataFile(basePath, "2017/01/01", "001", "f2"); HoodieTestUtils.createDataFile(basePath, "2017/01/01", "001", "f2");
HoodieTestUtils.createDataFile(basePath, "2017/01/01", "001", "f3"); HoodieTestUtils.createDataFile(basePath, "2017/01/02", "001", "f3");
HoodieTestUtils.createDataFile(basePath, "2017/01/01", "002", "f2"); HoodieTestUtils.createDataFile(basePath, "2017/01/01", "002", "f2");
HoodieTestUtils.createDataFile(basePath, "2017/01/01", "003", "f3"); HoodieTestUtils.createDataFile(basePath, "2017/01/02", "003", "f3");
HoodieROTablePathFilter pathFilter = new HoodieROTablePathFilter(); HoodieROTablePathFilter pathFilter = new HoodieROTablePathFilter();
Path partitionPath = new Path("file://" + basePath + File.separator + "2017/01/01"); Path partitionPath = new Path("file://" + basePath + File.separator + "2017/01/01");
@@ -68,11 +69,11 @@ public class TestHoodieROTablePathFilter extends HoodieCommonTestHarness {
assertFalse( assertFalse(
pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "001", "f2")))); pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "001", "f2"))));
assertTrue( assertTrue(
pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "001", "f3")))); pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/02", "001", "f3"))));
assertTrue( assertTrue(
pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "002", "f2")))); pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "002", "f2"))));
assertFalse( assertFalse(
pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "003", "f3")))); pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/02", "003", "f3"))));
assertFalse(pathFilter.accept(new Path("file:///" + HoodieTestUtils.getCommitFilePath(basePath, "001")))); assertFalse(pathFilter.accept(new Path("file:///" + HoodieTestUtils.getCommitFilePath(basePath, "001"))));
assertFalse(pathFilter.accept(new Path("file:///" + HoodieTestUtils.getCommitFilePath(basePath, "002")))); assertFalse(pathFilter.accept(new Path("file:///" + HoodieTestUtils.getCommitFilePath(basePath, "002"))));
assertFalse(pathFilter.accept(new Path("file:///" + HoodieTestUtils.getInflightCommitFilePath(basePath, "003")))); assertFalse(pathFilter.accept(new Path("file:///" + HoodieTestUtils.getInflightCommitFilePath(basePath, "003"))));
@@ -83,7 +84,7 @@ public class TestHoodieROTablePathFilter extends HoodieCommonTestHarness {
assertFalse( assertFalse(
pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "003", "f3")))); pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "003", "f3"))));
assertEquals(1, pathFilter.metaClientCache.size());
} }
@Test @Test