1
0

[HUDI-25] Optimize HoodieInputformat.listStatus() for faster Hive incremental queries on Hoodie

Summary:
    - InputPathHandler class classifies inputPaths into incremental, non-incremental and non-Hoodie paths.
    - Incremental queries leverage HoodieCommitMetadata to get partitions that are affected and only lists those partitions as opposed to listing all partitions
    - listStatus() processes each category separately
This commit is contained in:
Bhavani Sudha Saktheeswaran
2019-07-17 11:51:49 -07:00
parent 480fc7869d
commit d09eacdc13
6 changed files with 602 additions and 119 deletions

View File

@@ -0,0 +1,181 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.hadoop;
import static org.junit.Assert.assertTrue;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.stream.Collectors;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hudi.common.minicluster.HdfsTestService;
import org.apache.hudi.common.model.HoodieAvroPayload;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.table.HoodieTableConfig;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
/**
 * Tests {@link InputPathHandler}'s classification of input paths into incremental,
 * snapshot and non-Hoodie groups, using a shared MiniDFSCluster with four tables:
 * two MERGE_ON_READ tables registered as incremental, one COPY_ON_WRITE snapshot
 * table, and one plain (non-Hoodie) directory tree.
 */
public class InputPathHandlerTest {
  // Incremental Table
  public static final String RAW_TRIPS_TEST_NAME = "raw_trips";
  public static final String MODEL_TRIPS_TEST_NAME = "model_trips";
  // snapshot Table
  public static final String ETL_TRIPS_TEST_NAME = "etl_trips";
  // non Hoodie table
  public static final String TRIPS_STATS_TEST_NAME = "trips_stats";

  private static MiniDFSCluster dfsCluster;
  private static DistributedFileSystem dfs;
  private static HdfsTestService hdfsTestService;
  private static InputPathHandler inputPathHandler;
  private static String basePathTable1 = null;
  private static String basePathTable2 = null;
  private static String basePathTable3 = null;
  private static String basePathTable4 = null; // non hoodie Path
  // Table names configured for incremental scan; built in initTables().
  private static List<String> incrementalTables;
  // Expected classification results, keyed by the table layout created in initTables().
  private static List<Path> incrementalPaths;
  private static List<Path> snapshotPaths;
  private static List<Path> nonHoodiePaths;
  private static List<Path> inputPaths;

  /**
   * Starts a MiniDFSCluster (once per class) and lays out the four test tables.
   *
   * @throws IOException if the cluster or table directories cannot be created
   */
  @BeforeClass
  public static void setUpDFS() throws IOException {
    // Need to closeAll to clear FileSystem.Cache, required because DFS and LocalFS used in the
    // same JVM
    FileSystem.closeAll();
    if (hdfsTestService == null) {
      hdfsTestService = new HdfsTestService();
      dfsCluster = hdfsTestService.start(true);
      // Create a temp folder as the base path
      dfs = dfsCluster.getFileSystem();
    }
    inputPaths = new ArrayList<>();
    incrementalPaths = new ArrayList<>();
    snapshotPaths = new ArrayList<>();
    nonHoodiePaths = new ArrayList<>();
    initTables();
  }

  /**
   * Shuts the mini cluster down and clears cached FileSystem instances.
   *
   * @throws Exception if the HDFS service fails to stop cleanly
   */
  @AfterClass
  public static void cleanUp() throws Exception {
    if (hdfsTestService != null) {
      hdfsTestService.stop();
      dfsCluster.shutdown();
      dfsCluster = null;
      dfs = null;
      hdfsTestService = null;
    }
    // Need to closeAll to clear FileSystem.Cache, required because DFS and LocalFS used in the
    // same JVM
    FileSystem.closeAll();
  }

  /**
   * Creates the four table base paths and their partition directories on DFS,
   * initializing Hoodie metadata for the first three tables only. Table 4 is left
   * without metadata so it is classified as non-Hoodie.
   *
   * @throws IOException on any filesystem failure
   */
  static void initTables() throws IOException {
    // Create a temp folder as the base path
    TemporaryFolder parentFolder = new TemporaryFolder();
    parentFolder.create();
    basePathTable1 = parentFolder.newFolder(RAW_TRIPS_TEST_NAME).getAbsolutePath();
    basePathTable2 = parentFolder.newFolder(MODEL_TRIPS_TEST_NAME).getAbsolutePath();
    basePathTable3 = parentFolder.newFolder(ETL_TRIPS_TEST_NAME).getAbsolutePath();
    basePathTable4 = parentFolder.newFolder(TRIPS_STATS_TEST_NAME).getAbsolutePath();

    // Tables 1 and 2: MERGE_ON_READ, queried incrementally.
    dfs.mkdirs(new Path(basePathTable1));
    initTableType(dfs.getConf(), basePathTable1, RAW_TRIPS_TEST_NAME, HoodieTableType.MERGE_ON_READ);
    incrementalPaths.addAll(generatePartitions(dfs, basePathTable1));
    dfs.mkdirs(new Path(basePathTable2));
    initTableType(dfs.getConf(), basePathTable2, MODEL_TRIPS_TEST_NAME, HoodieTableType.MERGE_ON_READ);
    incrementalPaths.addAll(generatePartitions(dfs, basePathTable2));

    // Table 3: COPY_ON_WRITE, queried as a snapshot.
    dfs.mkdirs(new Path(basePathTable3));
    initTableType(dfs.getConf(), basePathTable3, ETL_TRIPS_TEST_NAME, HoodieTableType.COPY_ON_WRITE);
    snapshotPaths.addAll(generatePartitions(dfs, basePathTable3));

    // Table 4: no Hoodie metadata initialized — must be classified as non-Hoodie.
    dfs.mkdirs(new Path(basePathTable4));
    nonHoodiePaths.addAll(generatePartitions(dfs, basePathTable4));

    inputPaths.addAll(incrementalPaths);
    inputPaths.addAll(snapshotPaths);
    inputPaths.addAll(nonHoodiePaths);

    incrementalTables = new ArrayList<>();
    incrementalTables.add(RAW_TRIPS_TEST_NAME);
    incrementalTables.add(MODEL_TRIPS_TEST_NAME);
  }

  /**
   * Initializes Hoodie table metadata (name, type, payload class) at the given base path.
   *
   * @param hadoopConf Hadoop configuration used to reach the filesystem
   * @param basePath   table base path on DFS
   * @param tableName  Hoodie table name property
   * @param tableType  COPY_ON_WRITE or MERGE_ON_READ
   * @return the meta client for the freshly initialized table
   * @throws IOException on any filesystem failure
   */
  static HoodieTableMetaClient initTableType(Configuration hadoopConf, String basePath,
      String tableName, HoodieTableType tableType) throws IOException {
    Properties properties = new Properties();
    properties.setProperty(HoodieTableConfig.HOODIE_TABLE_NAME_PROP_NAME, tableName);
    properties.setProperty(HoodieTableConfig.HOODIE_TABLE_TYPE_PROP_NAME, tableType.name());
    properties.setProperty(HoodieTableConfig.HOODIE_PAYLOAD_CLASS_PROP_NAME, HoodieAvroPayload.class.getName());
    return HoodieTableMetaClient.initTableAndGetMetaClient(hadoopConf, basePath, properties);
  }

  /**
   * Creates five date-style partition directories under the base path on DFS.
   *
   * @param dfs      the cluster filesystem
   * @param basePath table base path the partitions are created under
   * @return the created partition paths
   * @throws IOException on any filesystem failure
   */
  static List<Path> generatePartitions(DistributedFileSystem dfs, String basePath)
      throws IOException {
    List<Path> paths = new ArrayList<>();
    paths.add(new Path(basePath + Path.SEPARATOR + "2019/05/21"));
    paths.add(new Path(basePath + Path.SEPARATOR + "2019/05/22"));
    paths.add(new Path(basePath + Path.SEPARATOR + "2019/05/23"));
    paths.add(new Path(basePath + Path.SEPARATOR + "2019/05/24"));
    paths.add(new Path(basePath + Path.SEPARATOR + "2019/05/25"));
    for (Path path : paths) {
      dfs.mkdirs(path);
    }
    return paths;
  }

  /**
   * Feeds every partition path of all four tables to InputPathHandler and checks
   * that each path lands in exactly the expected group.
   *
   * @throws IOException on any filesystem failure
   */
  @Test
  public void testInputPathHandler() throws IOException {
    inputPathHandler = new InputPathHandler(dfs.getConf(), inputPaths.toArray(
        new Path[inputPaths.size()]), incrementalTables);
    List<Path> actualPaths = inputPathHandler.getGroupedIncrementalPaths().values().stream()
        .flatMap(List::stream).collect(Collectors.toList());
    assertTrue(actualComparesToExpected(actualPaths, incrementalPaths));
    actualPaths = inputPathHandler.getSnapshotPaths();
    assertTrue(actualComparesToExpected(actualPaths, snapshotPaths));
    actualPaths = inputPathHandler.getNonHoodieInputPaths();
    assertTrue(actualComparesToExpected(actualPaths, nonHoodiePaths));
  }

  /**
   * Order-insensitive multiset equality of two path lists.
   *
   * <p>Fix: the previous size + one-way {@code contains} check could accept lists
   * that differ when duplicates are present (e.g. [a, a] vs [a, b] have equal sizes
   * and every element of [a, a] is contained in [a, b]). Removing each actual path
   * from a mutable copy of the expected list compares true multiplicities.
   *
   * @param actualPaths   paths produced by InputPathHandler
   * @param expectedPaths paths created by initTables()
   * @return true iff both lists contain the same paths with the same multiplicities
   */
  private boolean actualComparesToExpected(List<Path> actualPaths, List<Path> expectedPaths) {
    if (actualPaths.size() != expectedPaths.size()) {
      return false;
    }
    List<Path> remaining = new ArrayList<>(expectedPaths);
    for (Path path : actualPaths) {
      // remove() returns false when the path (or a further duplicate of it) is absent.
      if (!remaining.remove(path)) {
        return false;
      }
    }
    return remaining.isEmpty();
  }
}

View File

@@ -18,8 +18,14 @@
package org.apache.hudi.hadoop;
import org.apache.hudi.common.util.FSUtils;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.io.ArrayWritable;
@@ -28,17 +34,17 @@ import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieTestUtils;
import org.apache.hudi.common.model.HoodieWriteStat;
import org.apache.hudi.common.util.FSUtils;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import java.io.File;
import java.io.IOException;
import static org.junit.Assert.assertEquals;
public class TestHoodieInputFormat {
public class TestHoodieParquetInputFormat {
private HoodieParquetInputFormat inputFormat;
private JobConf jobConf;
@@ -100,7 +106,7 @@ public class TestHoodieInputFormat {
public void testIncrementalSimple() throws IOException {
// initial commit
File partitionDir = InputFormatTestUtil.prepareTable(basePath, 10, "100");
InputFormatTestUtil.commit(basePath, "100");
createCommitFile(basePath, "100", "2016/05/01");
// Add the paths
FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());
@@ -112,28 +118,42 @@ public class TestHoodieInputFormat {
files.length);
}
/**
 * Writes a {@code <commitNumber>.commit} file under the table's .hoodie directory
 * whose metadata attributes one fake write-stat to the given partition.
 *
 * <p>Fix: the FileOutputStream was previously never closed if the write or the
 * JSON serialization threw, leaking the file handle; try-with-resources now
 * guarantees closure on every path.
 *
 * @param basePath      temporary folder holding the table base path
 * @param commitNumber  commit timestamp used as the file name stem
 * @param partitionPath partition the fake write-stat is recorded against
 * @throws IOException if the commit file cannot be created or written
 */
private void createCommitFile(TemporaryFolder basePath, String commitNumber, String partitionPath)
    throws IOException {
  List<HoodieWriteStat> writeStats = HoodieTestUtils.generateFakeHoodieWriteStat(1);
  HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
  writeStats.forEach(stat -> commitMetadata.addWriteStat(partitionPath, stat));
  File file = new File(basePath.getRoot().toString() + "/.hoodie/", commitNumber + ".commit");
  file.createNewFile();
  // try-with-resources closes the stream even when write()/toJsonString() throws.
  try (FileOutputStream fileOutputStream = new FileOutputStream(file)) {
    fileOutputStream.write(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8));
    fileOutputStream.flush();
  }
}
@Test
public void testIncrementalWithMultipleCommits() throws IOException {
// initial commit
File partitionDir = InputFormatTestUtil.prepareTable(basePath, 10, "100");
InputFormatTestUtil.commit(basePath, "100");
createCommitFile(basePath, "100", "2016/05/01");
// Add the paths
FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());
// update files
InputFormatTestUtil.simulateUpdates(partitionDir, "100", 5, "200", false);
InputFormatTestUtil.commit(basePath, "200");
createCommitFile(basePath, "200", "2016/05/01");
InputFormatTestUtil.simulateUpdates(partitionDir, "100", 4, "300", false);
InputFormatTestUtil.commit(basePath, "300");
createCommitFile(basePath, "300", "2016/05/01");
InputFormatTestUtil.simulateUpdates(partitionDir, "100", 3, "400", false);
InputFormatTestUtil.commit(basePath, "400");
createCommitFile(basePath, "400", "2016/05/01");
InputFormatTestUtil.simulateUpdates(partitionDir, "100", 2, "500", false);
InputFormatTestUtil.commit(basePath, "500");
createCommitFile(basePath, "500", "2016/05/01");
InputFormatTestUtil.simulateUpdates(partitionDir, "100", 1, "600", false);
InputFormatTestUtil.commit(basePath, "600");
createCommitFile(basePath, "600", "2016/05/01");
InputFormatTestUtil.setupIncremental(jobConf, "100", 1);
FileStatus[] files = inputFormat.listStatus(jobConf);
@@ -190,8 +210,24 @@ public class TestHoodieInputFormat {
2, 10);
}
private void ensureRecordsInCommit(String msg, String commit, int expectedNumberOfRecordsInCommit, int totalExpected)
throws IOException {
/**
 * Verifies that HoodieHiveUtil.getIncrementalTableNames() returns exactly the
 * tables whose consume-mode property is set to INCREMENTAL, ignoring a table
 * left in the default (snapshot) scan mode.
 *
 * @throws IOException if Job.getInstance fails
 */
@Test
public void testGetIncrementalTableNames() throws IOException {
  String[] expectedincrTables = {"db1.raw_trips", "db2.model_trips"};
  JobConf conf = new JobConf();
  // Flag both expected tables for incremental consumption.
  for (String table : expectedincrTables) {
    String incrementalMode = String.format(HoodieHiveUtil.HOODIE_CONSUME_MODE_PATTERN, table);
    conf.set(incrementalMode, HoodieHiveUtil.INCREMENTAL_SCAN_MODE);
  }
  // A third table stays in the default scan mode and must not be reported.
  String defaultmode = String.format(HoodieHiveUtil.HOODIE_CONSUME_MODE_PATTERN, "db3.first_trips");
  conf.set(defaultmode, HoodieHiveUtil.DEFAULT_SCAN_MODE);
  List<String> actualincrTables = HoodieHiveUtil.getIncrementalTableNames(Job.getInstance(conf));
  for (String expected : expectedincrTables) {
    assertTrue(actualincrTables.contains(expected));
  }
}
private void ensureRecordsInCommit(String msg, String commit, int expectedNumberOfRecordsInCommit,
int totalExpected) throws IOException {
int actualCount = 0;
int totalCount = 0;
InputSplit[] splits = inputFormat.getSplits(jobConf, 1);