From 9bcd3221fd440081dbae70e89d08539c3b484862 Mon Sep 17 00:00:00 2001 From: Balaji Varadarajan Date: Wed, 5 Aug 2020 09:19:10 -0700 Subject: [PATCH] [HUDI-1144] Speedup spark read queries by caching metaclient in HoodieROPathFilter (#1919) --- .../hudi/hadoop/HoodieROTablePathFilter.java | 19 ++++++++++++++++--- .../hadoop/TestHoodieROTablePathFilter.java | 11 ++++++----- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieROTablePathFilter.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieROTablePathFilter.java index 86199d25f..1e616f896 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieROTablePathFilter.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieROTablePathFilter.java @@ -18,6 +18,8 @@ package org.apache.hudi.hadoop; +import java.util.Map; +import java.util.Set; import org.apache.hadoop.conf.Configurable; import org.apache.hudi.common.config.SerializableConfiguration; import org.apache.hudi.common.model.HoodieBaseFile; @@ -60,12 +62,17 @@ public class HoodieROTablePathFilter implements Configurable, PathFilter, Serial * Its quite common, to have all files from a given partition path be passed into accept(), cache the check for hoodie * metadata for known partition paths and the latest versions of files. */ - private HashMap> hoodiePathCache; + private Map> hoodiePathCache; /** * Paths that are known to be non-hoodie tables. */ - private HashSet nonHoodiePathCache; + private Set nonHoodiePathCache; + + /** + * Table Meta Client Cache. + */ + Map metaClientCache; /** * Hadoop configurations for the FileSystem. @@ -82,6 +89,7 @@ public class HoodieROTablePathFilter implements Configurable, PathFilter, Serial this.hoodiePathCache = new HashMap<>(); this.nonHoodiePathCache = new HashSet<>(); this.conf = new SerializableConfiguration(conf); + this.metaClientCache = new HashMap<>(); } /** @@ -149,7 +157,12 @@ public class HoodieROTablePathFilter implements Configurable, PathFilter, Serial if (baseDir != null) { try { - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(), baseDir.toString()); + HoodieTableMetaClient metaClient = metaClientCache.get(baseDir.toString()); + if (null == metaClient) { + metaClient = new HoodieTableMetaClient(fs.getConf(), baseDir.toString(), true); + metaClientCache.put(baseDir.toString(), metaClient); + } + HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(), fs.listStatus(folder)); List latestFiles = fsView.getLatestBaseFiles().collect(Collectors.toList()); diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieROTablePathFilter.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieROTablePathFilter.java index 18e9afd73..f96f6cb29 100644 --- a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieROTablePathFilter.java +++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieROTablePathFilter.java @@ -31,6 +31,7 @@ import java.io.File; import java.io.IOException; import java.util.ArrayList; +import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -55,9 +56,9 @@ public class TestHoodieROTablePathFilter extends HoodieCommonTestHarness { HoodieTestUtils.createDataFile(basePath, "2017/01/01", "001", "f1"); HoodieTestUtils.createDataFile(basePath, "2017/01/01", "001", "f2"); - HoodieTestUtils.createDataFile(basePath, "2017/01/01", "001", "f3"); + HoodieTestUtils.createDataFile(basePath, "2017/01/02", "001", "f3"); HoodieTestUtils.createDataFile(basePath, "2017/01/01", "002", "f2"); - HoodieTestUtils.createDataFile(basePath, "2017/01/01", "003", "f3"); + HoodieTestUtils.createDataFile(basePath, "2017/01/02", "003", "f3"); HoodieROTablePathFilter pathFilter = new HoodieROTablePathFilter(); Path partitionPath = new Path("file://" + basePath + File.separator + "2017/01/01"); @@ -68,11 +69,11 @@ public class TestHoodieROTablePathFilter extends HoodieCommonTestHarness { assertFalse( pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "001", "f2")))); assertTrue( - pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "001", "f3")))); + pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/02", "001", "f3")))); assertTrue( pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "002", "f2")))); assertFalse( - pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "003", "f3")))); + pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/02", "003", "f3")))); assertFalse(pathFilter.accept(new Path("file:///" + HoodieTestUtils.getCommitFilePath(basePath, "001")))); assertFalse(pathFilter.accept(new Path("file:///" + HoodieTestUtils.getCommitFilePath(basePath, "002")))); assertFalse(pathFilter.accept(new Path("file:///" + HoodieTestUtils.getInflightCommitFilePath(basePath, "003")))); @@ -83,7 +84,7 @@ public class TestHoodieROTablePathFilter extends HoodieCommonTestHarness { assertFalse( pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "003", "f3")))); - + assertEquals(1, pathFilter.metaClientCache.size()); } @Test