[HUDI-1789] Support reading older snapshots (#2809)
* [HUDI-1789] In HoodieParquetInoutFormat we currently default to the latest version of base files. This PR attempts to add a new jobConf `hoodie.%s.consume.snapshot.time` This new config will allow us to read older snapshots. - Reusing hoodie.%s.consume.commit for point in time snapshot queries as well. - Adding javadocs and some more tests
This commit is contained in:
@@ -97,6 +97,13 @@ public class HoodieHiveUtils {
|
||||
return job.getConfiguration().get(startCommitTimestampName);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the n'th parent for the Path. Assumes the path has at-least n components
|
||||
*
|
||||
* @param path
|
||||
* @param n
|
||||
* @return
|
||||
*/
|
||||
public static Path getNthParent(Path path, int n) {
|
||||
Path parent = path;
|
||||
for (int i = 0; i < n; i++) {
|
||||
@@ -105,6 +112,12 @@ public class HoodieHiveUtils {
|
||||
return parent;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a list of tableNames for which hoodie.<tableName>.consume.mode is set to incremental else returns empty List
|
||||
*
|
||||
* @param job
|
||||
* @return
|
||||
*/
|
||||
public static List<String> getIncrementalTableNames(JobContext job) {
|
||||
Map<String, String> tablesModeMap = job.getConfiguration()
|
||||
.getValByRegex(HOODIE_CONSUME_MODE_PATTERN_STRING.pattern());
|
||||
@@ -122,19 +135,42 @@ public class HoodieHiveUtils {
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Depending on the configs hoodie.%s.consume.pending.commits and hoodie.%s.consume.commit of job
|
||||
*
|
||||
* (hoodie.<tableName>.consume.pending.commits, hoodie.<tableName>.consume.commit) ->
|
||||
* (true, validCommit) -> returns activeTimeline filtered until validCommit
|
||||
* (true, InValidCommit) -> Raises HoodieIOException
|
||||
* (true, notSet) -> Raises HoodieIOException
|
||||
* (false, validCommit) -> returns completedTimeline filtered until validCommit
|
||||
* (false, InValidCommit) -> Raises HoodieIOException
|
||||
* (false or notSet, notSet) -> returns completedTimeline unfiltered
|
||||
*
|
||||
* validCommit is one which exists in the timeline being checked and vice versa
|
||||
*
|
||||
* @param tableName
|
||||
* @param job
|
||||
* @param metaClient
|
||||
* @return
|
||||
*/
|
||||
public static HoodieTimeline getTableTimeline(final String tableName, final JobConf job, final HoodieTableMetaClient metaClient) {
|
||||
HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline();
|
||||
|
||||
boolean includePendingCommits = job.getBoolean(String.format(HOODIE_CONSUME_PENDING_COMMITS, tableName), false);
|
||||
if (includePendingCommits) {
|
||||
HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline();
|
||||
String maxCommit = job.get(String.format(HOODIE_CONSUME_COMMIT, tableName));
|
||||
if (maxCommit == null || !timeline.containsInstant(maxCommit)) {
|
||||
LOG.info("Timestamp configured for validation: " + maxCommit + " commits timeline:" + timeline + " table: " + tableName);
|
||||
throw new HoodieIOException("Valid timestamp is required for " + HOODIE_CONSUME_COMMIT + " in validate mode");
|
||||
}
|
||||
return timeline.findInstantsBeforeOrEquals(maxCommit);
|
||||
String maxCommit = job.get(String.format(HOODIE_CONSUME_COMMIT, tableName));
|
||||
|
||||
if (!includePendingCommits && maxCommit == null) {
|
||||
return timeline.filterCompletedInstants();
|
||||
}
|
||||
|
||||
// by default return all completed commits.
|
||||
return metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
|
||||
return filterIfInstantExists(tableName, includePendingCommits ? timeline : timeline.filterCompletedInstants(), maxCommit);
|
||||
}
|
||||
|
||||
private static HoodieTimeline filterIfInstantExists(String tableName, HoodieTimeline timeline, String maxCommit) {
|
||||
if (maxCommit == null || !timeline.containsInstant(maxCommit)) {
|
||||
LOG.info("Timestamp " + maxCommit + " doesn't exist in the commits timeline:" + timeline + " table: " + tableName);
|
||||
throw new HoodieIOException("Valid timestamp is required for " + HOODIE_CONSUME_COMMIT + " in snapshot mode");
|
||||
}
|
||||
return timeline.findInstantsBeforeOrEquals(maxCommit);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user