1
0

[HUDI-2837] Add support for using database name in incremental query (#4083)

This commit is contained in:
董可伦
2022-01-23 14:11:27 +08:00
committed by GitHub
parent 64b1426005
commit 56cd8ffae0
19 changed files with 330 additions and 63 deletions

View File

@@ -121,7 +121,7 @@ public abstract class HoodieFileInputFormatBase extends FileInputFormat<NullWrit
continue;
}
List<Path> inputPaths = inputPathHandler.getGroupedIncrementalPaths().get(metaClient);
List<FileStatus> result = listStatusForIncrementalMode(job, metaClient, inputPaths);
List<FileStatus> result = listStatusForIncrementalMode(job, metaClient, inputPaths, table);
if (result != null) {
returns.addAll(result);
}
@@ -229,14 +229,14 @@ public abstract class HoodieFileInputFormatBase extends FileInputFormat<NullWrit
* partitions and then filtering based on the commits of interest, this logic first extracts the
* partitions touched by the desired commits and then lists only those partitions.
*/
protected List<FileStatus> listStatusForIncrementalMode(JobConf job, HoodieTableMetaClient tableMetaClient, List<Path> inputPaths) throws IOException {
String tableName = tableMetaClient.getTableConfig().getTableName();
protected List<FileStatus> listStatusForIncrementalMode(JobConf job, HoodieTableMetaClient tableMetaClient,
List<Path> inputPaths, String incrementalTable) throws IOException {
Job jobContext = Job.getInstance(job);
Option<HoodieTimeline> timeline = HoodieInputFormatUtils.getFilteredCommitsTimeline(jobContext, tableMetaClient);
if (!timeline.isPresent()) {
return null;
}
Option<List<HoodieInstant>> commitsToCheck = HoodieInputFormatUtils.getCommitsForIncrementalQuery(jobContext, tableName, timeline.get());
Option<List<HoodieInstant>> commitsToCheck = HoodieInputFormatUtils.getCommitsForIncrementalQuery(jobContext, incrementalTable, timeline.get());
if (!commitsToCheck.isPresent()) {
return null;
}

View File

@@ -19,11 +19,13 @@
package org.apache.hudi.hadoop;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.exception.InvalidTableException;
import org.apache.hudi.exception.TableNotFoundException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.hadoop.utils.HoodieHiveUtils;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
@@ -53,11 +55,12 @@ public class InputPathHandler {
public static final Logger LOG = LogManager.getLogger(InputPathHandler.class);
private final Configuration conf;
// tablename to metadata mapping for all Hoodie tables(both incremental & snapshot)
// tableName to metadata mapping for all Hoodie tables(both incremental & snapshot)
private final Map<String, HoodieTableMetaClient> tableMetaClientMap;
private final Map<HoodieTableMetaClient, List<Path>> groupedIncrementalPaths;
private final List<Path> snapshotPaths;
private final List<Path> nonHoodieInputPaths;
private boolean isIncrementalUseDatabase;
public InputPathHandler(Configuration conf, Path[] inputPaths, List<String> incrementalTables) throws IOException {
this.conf = conf;
@@ -65,13 +68,14 @@ public class InputPathHandler {
snapshotPaths = new ArrayList<>();
nonHoodieInputPaths = new ArrayList<>();
groupedIncrementalPaths = new HashMap<>();
this.isIncrementalUseDatabase = HoodieHiveUtils.isIncrementalUseDatabase(conf);
parseInputPaths(inputPaths, incrementalTables);
}
/**
* Takes in the original InputPaths and classifies each of them into incremental, snapshot and
* non-hoodie InputPaths. The logic is as follows:
* 1. Check if an inputPath starts with the same basepath as any of the metadata basepaths we know
* 1. Check if an inputPath starts with the same basePath as any of the metadata basePaths we know
* 1a. If yes, this belongs to a Hoodie table that we already know about. Simply classify this
* as incremental or snapshot - We can get the table name of this inputPath from the
* metadata. Then based on the list of incrementalTables, we can classify this inputPath.
@@ -95,19 +99,17 @@ public class InputPathHandler {
// We already know the base path for this inputPath.
basePathKnown = true;
// Check if this is for a snapshot query
String tableName = metaClient.getTableConfig().getTableName();
tagAsIncrementalOrSnapshot(inputPath, tableName, metaClient, incrementalTables);
tagAsIncrementalOrSnapshot(inputPath, metaClient, incrementalTables);
break;
}
}
if (!basePathKnown) {
// This path is for a table that we dont know about yet.
// This path is for a table that we don't know about yet.
HoodieTableMetaClient metaClient;
try {
metaClient = getTableMetaClientForBasePath(inputPath.getFileSystem(conf), inputPath);
String tableName = metaClient.getTableConfig().getTableName();
tableMetaClientMap.put(tableName, metaClient);
tagAsIncrementalOrSnapshot(inputPath, tableName, metaClient, incrementalTables);
tableMetaClientMap.put(getIncrementalTable(metaClient), metaClient);
tagAsIncrementalOrSnapshot(inputPath, metaClient, incrementalTables);
} catch (TableNotFoundException | InvalidTableException e) {
// This is a non Hoodie inputPath
LOG.info("Handling a non-hoodie path " + inputPath);
@@ -117,9 +119,8 @@ public class InputPathHandler {
}
}
private void tagAsIncrementalOrSnapshot(Path inputPath, String tableName,
HoodieTableMetaClient metaClient, List<String> incrementalTables) {
if (!incrementalTables.contains(tableName)) {
private void tagAsIncrementalOrSnapshot(Path inputPath, HoodieTableMetaClient metaClient, List<String> incrementalTables) {
if (!incrementalTables.contains(getIncrementalTable(metaClient))) {
snapshotPaths.add(inputPath);
} else {
// Group incremental Paths belonging to same table.
@@ -145,4 +146,11 @@ public class InputPathHandler {
/** Returns the input paths that were classified as not belonging to any Hoodie table. */
public List<Path> getNonHoodieInputPaths() {
return nonHoodieInputPaths;
}
/**
 * Builds the lookup key identifying this table for incremental-query classification.
 * When the "hoodie.incremental.use.database" flag is enabled (captured in
 * {@code isIncrementalUseDatabase}) and the table config declares a non-empty
 * database name, the key is {@code "<database>.<table>"}; otherwise it falls
 * back to the bare table name, preserving the pre-existing behavior.
 */
private String getIncrementalTable(HoodieTableMetaClient metaClient) {
String databaseName = metaClient.getTableConfig().getDatabaseName();
String tableName = metaClient.getTableConfig().getTableName();
return isIncrementalUseDatabase && !StringUtils.isNullOrEmpty(databaseName)
? databaseName + "." + tableName : tableName;
}
}

View File

@@ -113,9 +113,8 @@ public class HoodieParquetRealtimeInputFormat extends HoodieParquetInputFormat i
*/
@Override
protected List<FileStatus> listStatusForIncrementalMode(
JobConf job, HoodieTableMetaClient tableMetaClient, List<Path> inputPaths) throws IOException {
JobConf job, HoodieTableMetaClient tableMetaClient, List<Path> inputPaths, String incrementalTable) throws IOException {
List<FileStatus> result = new ArrayList<>();
String tableName = tableMetaClient.getTableConfig().getTableName();
Job jobContext = Job.getInstance(job);
// step1
@@ -123,7 +122,7 @@ public class HoodieParquetRealtimeInputFormat extends HoodieParquetInputFormat i
if (!timeline.isPresent()) {
return result;
}
HoodieTimeline commitsTimelineToReturn = HoodieInputFormatUtils.getHoodieTimelineForIncrementalQuery(jobContext, tableName, timeline.get());
HoodieTimeline commitsTimelineToReturn = HoodieInputFormatUtils.getHoodieTimelineForIncrementalQuery(jobContext, incrementalTable, timeline.get());
Option<List<HoodieInstant>> commitsToCheck = Option.of(commitsTimelineToReturn.getInstants().collect(Collectors.toList()));
if (!commitsToCheck.isPresent()) {
return result;

View File

@@ -18,6 +18,7 @@
package org.apache.hudi.hadoop.utils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.util.CollectionUtils;
@@ -43,6 +44,7 @@ public class HoodieHiveUtils {
public static final Logger LOG = LogManager.getLogger(HoodieHiveUtils.class);
public static final String HOODIE_INCREMENTAL_USE_DATABASE = "hoodie.incremental.use.database";
public static final String HOODIE_CONSUME_MODE_PATTERN = "hoodie.%s.consume.mode";
public static final String HOODIE_START_COMMIT_PATTERN = "hoodie.%s.consume.start.timestamp";
public static final String HOODIE_MAX_COMMIT_PATTERN = "hoodie.%s.consume.max.commits";
@@ -178,4 +180,8 @@ public class HoodieHiveUtils {
}
return timeline.findInstantsBeforeOrEquals(maxCommit);
}
/**
 * Reads the "hoodie.incremental.use.database" setting ({@code HOODIE_INCREMENTAL_USE_DATABASE})
 * from the given Hadoop configuration, defaulting to {@code false} when unset,
 * i.e. incremental queries do not qualify table names with the database name
 * unless explicitly enabled.
 */
public static boolean isIncrementalUseDatabase(Configuration conf) {
return conf.getBoolean(HOODIE_INCREMENTAL_USE_DATABASE, false);
}
}