From ccd8cb240726deda35fabd2860edcbf4adeb1013 Mon Sep 17 00:00:00 2001 From: Prasanna Rajaperumal Date: Wed, 18 Jan 2017 01:00:36 -0800 Subject: [PATCH] Take 2: Refactor hoodie-common and create right abstractions for Hoodie Storage V2.0 - Refactored timelines to be a single timeline for all active events and one for archived events. CommitTimeline and other timelines can be inferred by applying a filter on the activeTimelime - Introduced HoodieInstant to abstract different types of action, commit time and if isInFlight - Implemented other review comments --- .../com/uber/hoodie/cli/HoodiePrompt.java | 8 +- .../hoodie/cli/commands/CommitsCommand.java | 66 ++-- .../cli/commands/HoodieSyncCommand.java | 27 +- .../hoodie/cli/commands/StatsCommand.java | 13 +- .../com/uber/hoodie/cli/utils/CommitUtil.java | 8 +- .../com/uber/hoodie/cli/DedupeSparkJob.scala | 4 +- .../com/uber/hoodie/HoodieReadClient.java | 28 +- .../com/uber/hoodie/HoodieWriteClient.java | 41 ++- .../com/uber/hoodie/index/HBaseIndex.java | 8 +- .../uber/hoodie/index/HoodieBloomIndex.java | 9 +- .../com/uber/hoodie/io/HoodieCleaner.java | 80 ++--- .../hoodie/io/HoodieCommitArchiveLog.java | 48 +-- .../com/uber/hoodie/io/HoodieIOHandle.java | 3 +- .../hoodie/table/HoodieCopyOnWriteTable.java | 22 +- .../com/uber/hoodie/TestHoodieClient.java | 31 +- .../hoodie/common/HoodieClientTestUtils.java | 5 +- .../common/HoodieTestDataGenerator.java | 3 +- .../hoodie/func/TestUpdateMapFunction.java | 3 +- .../hoodie/io/TestHoodieCommitArchiveLog.java | 51 +-- .../hoodie/table/TestCopyOnWriteTable.java | 4 +- ...StorageType.java => HoodieFileFormat.java} | 4 +- .../common/table/HoodieTableConfig.java | 19 +- .../common/table/HoodieTableMetaClient.java | 111 ++----- .../hoodie/common/table/HoodieTimeline.java | 181 ++++++----- .../common/table/TableFileSystemView.java | 12 +- .../timeline/HoodieActiveCommitTimeline.java | 77 ----- .../table/timeline/HoodieActiveTimeline.java | 252 +++++++++++++++ ...eline.java => HoodieArchivedTimeline.java} | 105 +++---- .../table/timeline/HoodieCleanerTimeline.java | 57 ---- .../table/timeline/HoodieDefaultTimeline.java | 292 +++--------------- .../common/table/timeline/HoodieInstant.java | 121 ++++++++ .../timeline/HoodieSavePointTimeline.java | 50 --- .../view/AbstractTableFileSystemView.java | 50 +-- .../table/view/ReadOptimizedTableView.java | 2 +- .../com/uber/hoodie/common/util/FSUtils.java | 24 +- .../hoodie/common/model/HoodieTestUtils.java | 19 +- .../table/HoodieTableMetaClientTest.java | 96 +++--- .../string/HoodieActiveTimelineTest.java | 126 ++++++++ .../string/HoodieDefaultTimelineTest.java | 104 ------- .../table/string/MockHoodieTimeline.java | 58 ++-- .../view/ReadOptimizedTableViewTest.java | 30 +- hoodie-hadoop-mr/pom.xml | 5 + .../uber/hoodie/hadoop/HoodieInputFormat.java | 11 +- .../utilities/HiveIncrementalPuller.java | 18 +- .../hoodie/utilities/HoodieDeltaStreamer.java | 4 +- .../utilities/HoodieSnapshotCopier.java | 10 +- 46 files changed, 1194 insertions(+), 1106 deletions(-) rename hoodie-common/src/main/java/com/uber/hoodie/common/model/{HoodieStorageType.java => HoodieFileFormat.java} (92%) delete mode 100644 hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieActiveCommitTimeline.java create mode 100644 hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieActiveTimeline.java rename hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/{HoodieArchivedCommitTimeline.java => HoodieArchivedTimeline.java} (52%) delete mode 100644 hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieCleanerTimeline.java create mode 100644 hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieInstant.java delete mode 100644 hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieSavePointTimeline.java create mode 100644 hoodie-common/src/test/java/com/uber/hoodie/common/table/string/HoodieActiveTimelineTest.java delete mode 100644 hoodie-common/src/test/java/com/uber/hoodie/common/table/string/HoodieDefaultTimelineTest.java diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/HoodiePrompt.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/HoodiePrompt.java index 31fca3eb0..66e189488 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/HoodiePrompt.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/HoodiePrompt.java @@ -16,6 +16,7 @@ package com.uber.hoodie.cli; +import com.uber.hoodie.common.table.HoodieTableConfig; import org.springframework.core.Ordered; import org.springframework.core.annotation.Order; import org.springframework.shell.plugin.support.DefaultPromptProvider; @@ -27,17 +28,18 @@ public class HoodiePrompt extends DefaultPromptProvider { @Override public String getPrompt() { + String tableName = HoodieCLI.tableMetadata.getTableConfig().getTableName(); switch (HoodieCLI.state) { case INIT: return "hoodie->"; case DATASET: - return "hoodie:" + HoodieCLI.tableMetadata.getTableConfig().getTableName() + "->"; + return "hoodie:" + tableName + "->"; case SYNC: - return "hoodie:" + HoodieCLI.tableMetadata.getTableConfig().getTableName() + " <==> " + return "hoodie:" + tableName + " <==> " + HoodieCLI.syncTableMetadata.getTableConfig().getTableName() + "->"; } if (HoodieCLI.tableMetadata != null) - return "hoodie:" + HoodieCLI.tableMetadata.getTableConfig().getTableName() + "->"; + return "hoodie:" + tableName + "->"; return "hoodie->"; } diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/CommitsCommand.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/CommitsCommand.java index a77c7ca57..8a3d45887 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/CommitsCommand.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/CommitsCommand.java @@ -24,6 +24,8 @@ import com.uber.hoodie.common.model.HoodieCommitMetadata; import com.uber.hoodie.common.model.HoodieWriteStat; import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.table.HoodieTimeline; +import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline; +import com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.common.util.NumericUtils; import org.apache.spark.launcher.SparkLauncher; @@ -38,6 +40,7 @@ import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.stream.Collectors; @Component @@ -67,15 +70,16 @@ public class CommitsCommand implements CommandMarker { @CliOption(key = { "limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10") final Integer limit) throws IOException { - HoodieTimeline timeline = HoodieCLI.tableMetadata.getActiveCommitTimeline(); - List commits = timeline.getInstants().collect(Collectors.toList()); + HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline(); + HoodieTimeline timeline = activeTimeline.getCommitTimeline().filterCompletedInstants(); + List commits = timeline.getInstants().collect(Collectors.toList()); String[][] rows = new String[commits.size()][]; Collections.reverse(commits); for (int i = 0; i < commits.size(); i++) { - String commit = commits.get(i); + HoodieInstant commit = commits.get(i); HoodieCommitMetadata commitMetadata = - HoodieCommitMetadata.fromBytes(timeline.readInstantDetails(commit).get()); - rows[i] = new String[] {commit, + HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(commit).get()); + rows[i] = new String[] {commit.getTimestamp(), NumericUtils.humanReadableByteCount(commitMetadata.fetchTotalBytesWritten()), String.valueOf(commitMetadata.fetchTotalFilesInsert()), String.valueOf(commitMetadata.fetchTotalFilesUpdated()), @@ -104,10 +108,14 @@ public class CommitsCommand implements CommandMarker { final String commitTime, @CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path") final String sparkPropertiesPath) throws Exception { - if (!HoodieCLI.tableMetadata.getActiveCommitTimeline().containsInstant(commitTime)) { - return "Commit " + commitTime + " not found in Commits " + HoodieCLI.tableMetadata - .getActiveCommitTimeline().getInstants().collect(Collectors.toList()); + HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline(); + HoodieTimeline timeline = activeTimeline.getCommitTimeline().filterCompletedInstants(); + HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime); + + if (!timeline.containsInstant(commitInstant)) { + return "Commit " + commitTime + " not found in Commits " + timeline; } + SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); sparkLauncher.addAppArgs(SparkMain.SparkCommand.ROLLBACK.toString(), commitTime, @@ -127,13 +135,15 @@ public class CommitsCommand implements CommandMarker { public String showCommitPartitions( @CliOption(key = {"commit"}, help = "Commit to show") final String commitTime) throws Exception { - HoodieTimeline timeline = HoodieCLI.tableMetadata.getActiveCommitTimeline(); - if (!timeline.containsInstant(commitTime)) { - return "Commit " + commitTime + " not found in Commits " + HoodieCLI.tableMetadata - .getActiveCommitTimeline().getInstants().collect(Collectors.toList()); + HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline(); + HoodieTimeline timeline = activeTimeline.getCommitTimeline().filterCompletedInstants(); + HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime); + + if (!timeline.containsInstant(commitInstant)) { + return "Commit " + commitTime + " not found in Commits " + timeline; } HoodieCommitMetadata meta = - HoodieCommitMetadata.fromBytes(timeline.readInstantDetails(commitTime).get()); + HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(commitInstant).get()); List rows = new ArrayList(); for (Map.Entry> entry : meta.getPartitionToWriteStats() .entrySet()) { @@ -173,13 +183,15 @@ public class CommitsCommand implements CommandMarker { public String showCommitFiles( @CliOption(key = {"commit"}, help = "Commit to show") final String commitTime) throws Exception { - HoodieTimeline timeline = HoodieCLI.tableMetadata.getActiveCommitTimeline(); - if (!timeline.containsInstant(commitTime)) { - return "Commit " + commitTime + " not found in Commits " + HoodieCLI.tableMetadata - .getActiveCommitTimeline().getInstants().collect(Collectors.toList()); + HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline(); + HoodieTimeline timeline = activeTimeline.getCommitTimeline().filterCompletedInstants(); + HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime); + + if (!timeline.containsInstant(commitInstant)) { + return "Commit " + commitTime + " not found in Commits " + timeline; } HoodieCommitMetadata meta = - HoodieCommitMetadata.fromBytes(timeline.readInstantDetails(commitTime).get()); + HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(commitInstant).get()); List rows = new ArrayList(); for (Map.Entry> entry : meta.getPartitionToWriteStats() .entrySet()) { @@ -208,26 +220,26 @@ public class CommitsCommand implements CommandMarker { @CliOption(key = {"path"}, help = "Path of the dataset to compare to") final String path) throws Exception { HoodieTableMetaClient target = new HoodieTableMetaClient(HoodieCLI.fs, path); - HoodieTimeline targetTimeline = target.getActiveCommitTimeline(); + HoodieTimeline targetTimeline = target.getActiveTimeline().getCommitTimeline().filterCompletedInstants();; HoodieTableMetaClient source = HoodieCLI.tableMetadata; - HoodieTimeline sourceTimeline = source.getActiveCommitTimeline(); + HoodieTimeline sourceTimeline = source.getActiveTimeline().getCommitTimeline().filterCompletedInstants();; String targetLatestCommit = - targetTimeline.getInstants().iterator().hasNext() ? "0" : targetTimeline.lastInstant().get(); + targetTimeline.getInstants().iterator().hasNext() ? "0" : targetTimeline.lastInstant().get().getTimestamp(); String sourceLatestCommit = - sourceTimeline.getInstants().iterator().hasNext() ? "0" : sourceTimeline.lastInstant().get(); + sourceTimeline.getInstants().iterator().hasNext() ? "0" : sourceTimeline.lastInstant().get().getTimestamp(); if (sourceLatestCommit != null && sourceTimeline - .compareInstants(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) { + .compareTimestamps(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) { // source is behind the target List commitsToCatchup = targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE) - .collect(Collectors.toList()); - return "Source " + source.getTableConfig().getTableName() + " is behind by " + commitsToCatchup.size() - + " commits. Commits to catch up - " + commitsToCatchup; + .getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList()); + return "Source " + source.getTableConfig().getTableName() + " is behind by " + + commitsToCatchup.size() + " commits. Commits to catch up - " + commitsToCatchup; } else { List commitsToCatchup = sourceTimeline.findInstantsAfter(targetLatestCommit, Integer.MAX_VALUE) - .collect(Collectors.toList()); + .getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList()); return "Source " + source.getTableConfig().getTableName() + " is ahead by " + commitsToCatchup.size() + " commits. Commits to catch up - " + commitsToCatchup; } diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/HoodieSyncCommand.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/HoodieSyncCommand.java index 61a3d7f2e..6ce7d76e7 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/HoodieSyncCommand.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/HoodieSyncCommand.java @@ -21,6 +21,7 @@ import com.uber.hoodie.cli.utils.HiveUtil; import com.uber.hoodie.cli.HoodieCLI; import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.table.HoodieTimeline; +import com.uber.hoodie.common.table.timeline.HoodieInstant; import org.springframework.shell.core.CommandMarker; import org.springframework.shell.core.annotation.CliAvailabilityIndicator; import org.springframework.shell.core.annotation.CliCommand; @@ -60,9 +61,9 @@ public class HoodieSyncCommand implements CommandMarker { "hivePass"}, mandatory = true, unspecifiedDefaultValue = "", help = "hive password to connect to") final String hivePass) throws Exception { HoodieTableMetaClient target = HoodieCLI.syncTableMetadata; - HoodieTimeline targetTimeline = target.getActiveCommitTimeline(); + HoodieTimeline targetTimeline = target.getActiveTimeline().getCommitTimeline(); HoodieTableMetaClient source = HoodieCLI.tableMetadata; - HoodieTimeline sourceTimeline = source.getActiveCommitTimeline(); + HoodieTimeline sourceTimeline = source.getActiveTimeline().getCommitTimeline(); long sourceCount = 0; long targetCount = 0; if ("complete".equals(mode)) { @@ -74,36 +75,40 @@ public class HoodieSyncCommand implements CommandMarker { } String targetLatestCommit = - targetTimeline.getInstants().iterator().hasNext() ? "0" : targetTimeline.lastInstant().get(); + targetTimeline.getInstants().iterator().hasNext() ? "0" : targetTimeline.lastInstant().get().getTimestamp(); String sourceLatestCommit = - sourceTimeline.getInstants().iterator().hasNext() ? "0" : sourceTimeline.lastInstant().get(); + sourceTimeline.getInstants().iterator().hasNext() ? "0" : sourceTimeline.lastInstant().get().getTimestamp(); if (sourceLatestCommit != null && sourceTimeline - .compareInstants(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) { + .compareTimestamps(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) { // source is behind the target - List commitsToCatchup = - targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE) + List commitsToCatchup = + targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE).getInstants() .collect(Collectors.toList()); if (commitsToCatchup.isEmpty()) { return "Count difference now is (count(" + target.getTableConfig().getTableName() + ") - count(" + source.getTableConfig().getTableName() + ") == " + (targetCount - sourceCount); } else { - long newInserts = CommitUtil.countNewRecords(target, commitsToCatchup); + long newInserts = CommitUtil.countNewRecords(target, + commitsToCatchup.stream().map(HoodieInstant::getTimestamp) + .collect(Collectors.toList())); return "Count difference now is (count(" + target.getTableConfig().getTableName() + ") - count(" + source.getTableConfig().getTableName() + ") == " + (targetCount - sourceCount) + ". Catch up count is " + newInserts; } } else { - List commitsToCatchup = - sourceTimeline.findInstantsAfter(targetLatestCommit, Integer.MAX_VALUE) + List commitsToCatchup = + sourceTimeline.findInstantsAfter(targetLatestCommit, Integer.MAX_VALUE).getInstants() .collect(Collectors.toList()); if (commitsToCatchup.isEmpty()) { return "Count difference now is (count(" + source.getTableConfig().getTableName() + ") - count(" + target.getTableConfig().getTableName() + ") == " + (sourceCount - targetCount); } else { - long newInserts = CommitUtil.countNewRecords(source, commitsToCatchup); + long newInserts = CommitUtil.countNewRecords(source, + commitsToCatchup.stream().map(HoodieInstant::getTimestamp) + .collect(Collectors.toList())); return "Count difference now is (count(" + source.getTableConfig().getTableName() + ") - count(" + target.getTableConfig().getTableName() + ") == " + (sourceCount - targetCount) + ". Catch up count is " + newInserts; diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/StatsCommand.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/StatsCommand.java index 8be833ab7..cef92ae67 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/StatsCommand.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/StatsCommand.java @@ -24,6 +24,8 @@ import com.uber.hoodie.cli.HoodieCLI; import com.uber.hoodie.cli.HoodiePrintHelper; import com.uber.hoodie.common.model.HoodieCommitMetadata; import com.uber.hoodie.common.table.HoodieTimeline; +import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline; +import com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.common.util.NumericUtils; @@ -53,21 +55,22 @@ public class StatsCommand implements CommandMarker { long totalRecordsUpserted = 0; long totalRecordsWritten = 0; - HoodieTimeline timeline = HoodieCLI.tableMetadata.getActiveCommitTimeline(); + HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline(); + HoodieTimeline timeline = activeTimeline.getCommitTimeline().filterCompletedInstants(); - String[][] rows = new String[new Long(timeline.getTotalInstants()).intValue() + 1][]; + String[][] rows = new String[new Long(timeline.countInstants()).intValue() + 1][]; int i = 0; DecimalFormat df = new DecimalFormat("#.00"); - for (String commitTime : timeline.getInstants().collect( + for (HoodieInstant commitTime : timeline.getInstants().collect( Collectors.toList())) { String waf = "0"; - HoodieCommitMetadata commit = HoodieCommitMetadata.fromBytes(timeline.readInstantDetails(commitTime).get()); + HoodieCommitMetadata commit = HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(commitTime).get()); if (commit.fetchTotalUpdateRecordsWritten() > 0) { waf = df.format( (float) commit.fetchTotalRecordsWritten() / commit .fetchTotalUpdateRecordsWritten()); } - rows[i++] = new String[] {commitTime, + rows[i++] = new String[] {commitTime.getTimestamp(), String.valueOf(commit.fetchTotalUpdateRecordsWritten()), String.valueOf(commit.fetchTotalRecordsWritten()), waf}; totalRecordsUpserted += commit.fetchTotalUpdateRecordsWritten(); diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/utils/CommitUtil.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/utils/CommitUtil.java index a9755cec1..8f5aabf4f 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/utils/CommitUtil.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/utils/CommitUtil.java @@ -19,6 +19,7 @@ package com.uber.hoodie.cli.utils; import com.uber.hoodie.common.model.HoodieCommitMetadata; import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.table.HoodieTimeline; +import com.uber.hoodie.common.table.timeline.HoodieInstant; import java.io.IOException; import java.util.List; @@ -27,10 +28,11 @@ public class CommitUtil { public static long countNewRecords(HoodieTableMetaClient target, List commitsToCatchup) throws IOException { long totalNew = 0; - HoodieTimeline timeline = target.getActiveCommitTimeline(); - timeline = timeline.reload(); + HoodieTimeline timeline = target.getActiveTimeline().reload().getCommitTimeline().filterCompletedInstants(); for(String commit:commitsToCatchup) { - HoodieCommitMetadata c = HoodieCommitMetadata.fromBytes(timeline.readInstantDetails(commit).get()); + HoodieCommitMetadata c = HoodieCommitMetadata.fromBytes(timeline + .getInstantDetails(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commit)) + .get()); totalNew += c.fetchTotalRecordsWritten() - c.fetchTotalUpdateRecordsWritten(); } return totalNew; diff --git a/hoodie-cli/src/main/scala/com/uber/hoodie/cli/DedupeSparkJob.scala b/hoodie-cli/src/main/scala/com/uber/hoodie/cli/DedupeSparkJob.scala index 182e1e3fc..c86d13ae7 100644 --- a/hoodie-cli/src/main/scala/com/uber/hoodie/cli/DedupeSparkJob.scala +++ b/hoodie-cli/src/main/scala/com/uber/hoodie/cli/DedupeSparkJob.scala @@ -78,7 +78,7 @@ class DedupeSparkJob (basePath: String, val fsView = new ReadOptimizedTableView(fs, metadata) val allFiles = fs.listStatus(new org.apache.hadoop.fs.Path(s"${basePath}/${duplicatedPartitionPath}")) - val latestFiles:java.util.List[HoodieDataFile] = fsView.streamLatestVersions(allFiles).collect(Collectors.toList[HoodieDataFile]()) + val latestFiles:java.util.List[HoodieDataFile] = fsView.getLatestVersions(allFiles).collect(Collectors.toList[HoodieDataFile]()) val filteredStatuses = latestFiles.map(f => f.getPath) LOG.info(s" List of files under partition: ${} => ${filteredStatuses.mkString(" ")}") @@ -129,7 +129,7 @@ class DedupeSparkJob (basePath: String, val fsView = new ReadOptimizedTableView(fs, metadata) val allFiles = fs.listStatus(new Path(s"${basePath}/${duplicatedPartitionPath}")) - val latestFiles:java.util.List[HoodieDataFile] = fsView.streamLatestVersions(allFiles).collect(Collectors.toList[HoodieDataFile]()) + val latestFiles:java.util.List[HoodieDataFile] = fsView.getLatestVersions(allFiles).collect(Collectors.toList[HoodieDataFile]()) val fileNameToPathMap = latestFiles.map(f => (f.getFileId, new Path(f.getPath))).toMap val dupeFixPlan = planDuplicateFix() diff --git a/hoodie-client/src/main/java/com/uber/hoodie/HoodieReadClient.java b/hoodie-client/src/main/java/com/uber/hoodie/HoodieReadClient.java index f11b5a5a6..19ab4427e 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/HoodieReadClient.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/HoodieReadClient.java @@ -25,6 +25,7 @@ import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.table.TableFileSystemView; +import com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.common.table.view.ReadOptimizedTableView; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.config.HoodieWriteConfig; @@ -89,8 +90,10 @@ public class HoodieReadClient implements Serializable { this.jsc = jsc; this.fs = FSUtils.getFs(); this.metaClient = new HoodieTableMetaClient(fs, basePath, true); - this.commitTimeline = metaClient.getActiveCommitTimeline(); - this.index = new HoodieBloomIndex(HoodieWriteConfig.newBuilder().withPath(basePath).build(), jsc); + this.commitTimeline = + metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(); + this.index = + new HoodieBloomIndex(HoodieWriteConfig.newBuilder().withPath(basePath).build(), jsc); this.sqlContextOpt = Optional.absent(); } @@ -191,7 +194,7 @@ public class HoodieReadClient implements Serializable { + metaClient.getBasePath()); } - List latestFiles = fileSystemView.streamLatestVersions(fs.globStatus(new Path(path))).collect( + List latestFiles = fileSystemView.getLatestVersions(fs.globStatus(new Path(path))).collect( Collectors.toList()); for (HoodieDataFile file : latestFiles) { filteredPaths.add(file.getPath()); @@ -212,17 +215,17 @@ public class HoodieReadClient implements Serializable { */ public Dataset readSince(String lastCommitTimestamp) { - List commitsToReturn = + List commitsToReturn = commitTimeline.findInstantsAfter(lastCommitTimestamp, Integer.MAX_VALUE) - .collect(Collectors.toList()); + .getInstants().collect(Collectors.toList()); //TODO: we can potentially trim this down to only affected partitions, using CommitMetadata try { // Go over the commit metadata, and obtain the new files that need to be read. HashMap fileIdToFullPath = new HashMap<>(); - for (String commit: commitsToReturn) { + for (HoodieInstant commit: commitsToReturn) { HoodieCommitMetadata metadata = - HoodieCommitMetadata.fromBytes(commitTimeline.readInstantDetails(commit).get()); + HoodieCommitMetadata.fromBytes(commitTimeline.getInstantDetails(commit).get()); // get files from each commit, and replace any previous versions fileIdToFullPath.putAll(metadata.getFileIdAndFullPaths()); } @@ -240,13 +243,15 @@ public class HoodieReadClient implements Serializable { */ public Dataset readCommit(String commitTime) { assertSqlContext(); - if (!commitTimeline.containsInstant(commitTime)) { + HoodieInstant commitInstant = + new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime); + if (!commitTimeline.containsInstant(commitInstant)) { new HoodieException("No commit exists at " + commitTime); } try { HoodieCommitMetadata commitMetdata = - HoodieCommitMetadata.fromBytes(commitTimeline.readInstantDetails(commitTime).get()); + HoodieCommitMetadata.fromBytes(commitTimeline.getInstantDetails(commitInstant).get()); Collection paths = commitMetdata.getFileIdAndFullPaths().values(); return sqlContextOpt.get().read() .parquet(paths.toArray(new String[paths.size()])) @@ -298,13 +303,14 @@ public class HoodieReadClient implements Serializable { * @return */ public List listCommitsSince(String commitTimestamp) { - return commitTimeline.findInstantsAfter(commitTimestamp, Integer.MAX_VALUE).collect(Collectors.toList()); + return commitTimeline.findInstantsAfter(commitTimestamp, Integer.MAX_VALUE).getInstants() + .map(HoodieInstant::getTimestamp).collect(Collectors.toList()); } /** * Returns the last successful commit (a successful write operation) into a Hoodie table. */ public String latestCommit() { - return commitTimeline.lastInstant().get(); + return commitTimeline.lastInstant().get().getTimestamp(); } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/HoodieWriteClient.java b/hoodie-client/src/main/java/com/uber/hoodie/HoodieWriteClient.java index a83963d4c..856533b8a 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/HoodieWriteClient.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/HoodieWriteClient.java @@ -25,6 +25,8 @@ import com.uber.hoodie.common.model.HoodieRecordPayload; import com.uber.hoodie.common.model.HoodieWriteStat; import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.table.HoodieTimeline; +import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline; +import com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.exception.HoodieCommitException; @@ -302,7 +304,7 @@ public class HoodieWriteClient implements Seriali logger.info("Comitting " + commitTime); HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, config.getBasePath(), true); - HoodieTimeline commitTimeline = metaClient.getActiveCommitTimeline(); + HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline(); List> stats = writeStatuses.mapToPair(new PairFunction() { @@ -319,7 +321,8 @@ public class HoodieWriteClient implements Seriali } try { - commitTimeline.saveInstantAsComplete(commitTime, + activeTimeline.saveAsComplete( + new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, commitTime), Optional.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8))); // Save was a success // We cannot have unbounded commit files. Archive commits if we have to archive @@ -356,17 +359,19 @@ public class HoodieWriteClient implements Seriali final Timer.Context context = metrics.getRollbackCtx(); HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, config.getBasePath(), true); - HoodieTimeline commitTimeline = metaClient.getActiveCommitTimeline(); + HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline(); + HoodieTimeline inflightTimeline = activeTimeline.getCommitTimeline().filterInflights(); + HoodieTimeline commitTimeline = activeTimeline.getCommitTimeline().filterCompletedInstants(); try { if (commitTimeline.lastInstant().isPresent() - && commitTimeline.findInstantsAfter(commitTime, Integer.MAX_VALUE).count() > 0) { + && !commitTimeline.findInstantsAfter(commitTime, Integer.MAX_VALUE).empty()) { throw new HoodieRollbackException("Found commits after time :" + commitTime + ", please rollback greater commits first"); } - List inflights = - commitTimeline.getInflightInstants().collect(Collectors.toList()); + List inflights = inflightTimeline.getInstants().map(HoodieInstant::getTimestamp) + .collect(Collectors.toList()); if (!inflights.isEmpty() && inflights.indexOf(commitTime) != inflights.size() - 1) { throw new HoodieRollbackException( "Found in-flight commits after time :" + commitTime + @@ -374,10 +379,12 @@ public class HoodieWriteClient implements Seriali } if (inflights.contains(commitTime) || (commitTimeline.lastInstant().isPresent() - && commitTimeline.lastInstant().get().equals(commitTime))) { + && commitTimeline.lastInstant().get().getTimestamp().equals(commitTime))) { // 1. Atomically unpublish this commit - if(commitTimeline.containsInstant(commitTime)) { - commitTimeline.revertInstantToInflight(commitTime); + if(!inflights.contains(commitTime)) { + // This is completed commit, first revert it to inflight to unpublish data + activeTimeline.revertToInflight( + new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime)); } // 2. Revert the index changes logger.info("Clean out index changes at time: " + commitTime); @@ -415,7 +422,8 @@ public class HoodieWriteClient implements Seriali }); // 4. Remove commit logger.info("Clean out metadata files at time: " + commitTime); - commitTimeline.removeInflightFromTimeline(commitTime); + activeTimeline.deleteInflight( + new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, commitTime)); if (context != null) { long durationInMs = metrics.getDurationInMs(context.stop()); @@ -446,7 +454,6 @@ public class HoodieWriteClient implements Seriali final Timer.Context context = metrics.getCleanCtx(); HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, config.getBasePath(), true); - HoodieTimeline commitTimeline = metaClient.getActiveCommitTimeline(); List partitionsToClean = FSUtils.getAllPartitionPaths(fs, metaClient.getBasePath()); // shuffle to distribute cleaning work across partitions evenly @@ -497,8 +504,9 @@ public class HoodieWriteClient implements Seriali logger.info("Generate a new commit time " + commitTime); HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, config.getBasePath(), true); - HoodieTimeline commitTimeline = metaClient.getActiveCommitTimeline(); - commitTimeline.saveInstantAsInflight(commitTime); + HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline(); + activeTimeline.createInflight( + new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, commitTime)); } public static SparkConf registerClasses(SparkConf conf) { @@ -534,14 +542,17 @@ public class HoodieWriteClient implements Seriali /** * Cleanup all inflight commits + * * @throws IOException */ private void rollbackInflightCommits() { HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, config.getBasePath(), true); - HoodieTimeline commitTimeline = metaClient.getActiveCommitTimeline(); + HoodieTimeline inflightTimeline = + metaClient.getActiveTimeline().getCommitTimeline().filterInflights(); - List commits = commitTimeline.getInflightInstants().collect(Collectors.toList()); + List commits = inflightTimeline.getInstants().map(HoodieInstant::getTimestamp) + .collect(Collectors.toList()); Collections.reverse(commits); for (String commit : commits) { rollback(commit); diff --git a/hoodie-client/src/main/java/com/uber/hoodie/index/HBaseIndex.java b/hoodie-client/src/main/java/com/uber/hoodie/index/HBaseIndex.java index 9be71cd4b..fcfff43d5 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/index/HBaseIndex.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/index/HBaseIndex.java @@ -19,6 +19,7 @@ package com.uber.hoodie.index; import com.google.common.base.Optional; import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.table.HoodieTimeline; +import com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.WriteStatus; import com.uber.hoodie.common.model.HoodieKey; @@ -128,9 +129,12 @@ public class HBaseIndex extends HoodieIndex { String fileId = Bytes.toString(result.getValue(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN)); - HoodieTimeline commitTimeline = metaClient.getActiveCommitTimeline(); + HoodieTimeline commitTimeline = + metaClient.getActiveTimeline().getCommitTimeline() + .filterCompletedInstants(); // if the last commit ts for this row is less than the system commit ts - if (commitTimeline.hasInstants() && commitTimeline.containsInstant(commitTs)) { + if (!commitTimeline.empty() && commitTimeline.containsInstant( + new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTs))) { rec.setCurrentLocation(new HoodieRecordLocation(commitTs, fileId)); } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/index/HoodieBloomIndex.java b/hoodie-client/src/main/java/com/uber/hoodie/index/HoodieBloomIndex.java index c5fd18ee2..3be461746 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/index/HoodieBloomIndex.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/index/HoodieBloomIndex.java @@ -22,6 +22,7 @@ import com.google.common.base.Optional; import com.uber.hoodie.common.model.HoodieDataFile; import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.table.TableFileSystemView; +import com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.common.table.view.ReadOptimizedTableView; import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.WriteStatus; @@ -218,13 +219,13 @@ public class HoodieBloomIndex extends HoodieIndex public Iterable> call(String partitionPath) { FileSystem fs = FSUtils.getFs(); TableFileSystemView view = new ReadOptimizedTableView(fs, metaClient); - java.util.Optional latestCommitTime = - metaClient.getActiveCommitTimeline().lastInstant(); + java.util.Optional latestCommitTime = + metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants().lastInstant(); List> list = new ArrayList<>(); if (latestCommitTime.isPresent()) { List filteredFiles = - view.streamLatestVersionInPartition(partitionPath, - latestCommitTime.get()).collect(Collectors.toList()); + view.getLatestVersionInPartition(partitionPath, + latestCommitTime.get().getTimestamp()).collect(Collectors.toList()); for (HoodieDataFile file : filteredFiles) { list.add(new Tuple2<>(partitionPath, file.getFileName())); } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCleaner.java b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCleaner.java index c47188b79..14bd732bb 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCleaner.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCleaner.java @@ -20,6 +20,7 @@ import com.uber.hoodie.common.model.HoodieDataFile; import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.table.TableFileSystemView; +import com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.common.table.view.ReadOptimizedTableView; import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.common.util.FSUtils; @@ -38,55 +39,56 @@ import java.util.stream.Collectors; /** * Cleaner is responsible for garbage collecting older files in a given partition path, such that - * + *

* 1) It provides sufficient time for existing queries running on older versions, to finish - * + *

* 2) It bounds the growth of the files in the file system - * + *

* TODO: Should all cleaning be done based on {@link com.uber.hoodie.common.model.HoodieCommitMetadata} - * - * */ public class HoodieCleaner { private static Logger logger = LogManager.getLogger(HoodieCleaner.class); + public enum CleaningPolicy { KEEP_LATEST_FILE_VERSIONS, KEEP_LATEST_COMMITS } + private final TableFileSystemView fileSystemView; private final HoodieTimeline commitTimeline; private HoodieTableMetaClient metaClient; private HoodieWriteConfig config; private FileSystem fs; - public HoodieCleaner(HoodieTableMetaClient metaClient, - HoodieWriteConfig config, - FileSystem fs) { + public HoodieCleaner(HoodieTableMetaClient metaClient, HoodieWriteConfig config, + FileSystem fs) { this.metaClient = metaClient; this.fileSystemView = new ReadOptimizedTableView(fs, metaClient); - this.commitTimeline = metaClient.getActiveCommitTimeline(); + this.commitTimeline = + metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(); this.config = config; this.fs = fs; } /** - * * Selects the older versions of files for cleaning, such that it bounds the number of versions of each file. * This policy is useful, if you are simply interested in querying the table, and you don't want too many * versions for a single file (i.e run it with versionsRetained = 1) * - * * @param partitionPath * @return * @throws IOException */ - private List getFilesToCleanKeepingLatestVersions(String partitionPath) throws IOException { - logger.info("Cleaning "+ partitionPath+", retaining latest "+ config.getCleanerFileVersionsRetained()+" file versions. "); - List> fileVersions = fileSystemView.streamEveryVersionInPartition(partitionPath).collect( - Collectors.toList()); + private List getFilesToCleanKeepingLatestVersions(String partitionPath) + throws IOException { + logger.info("Cleaning " + partitionPath + ", retaining latest " + config + .getCleanerFileVersionsRetained() + " file versions. "); + List> fileVersions = + fileSystemView.getEveryVersionInPartition(partitionPath) + .collect(Collectors.toList()); List deletePaths = new ArrayList<>(); for (List versionsForFileId : fileVersions) { @@ -99,10 +101,8 @@ public class HoodieCleaner { } // Delete the remaining files while (commitItr.hasNext()) { - deletePaths.add(String.format("%s/%s/%s", - config.getBasePath(), - partitionPath, - commitItr.next().getFileName())); + deletePaths.add(String.format("%s/%s/%s", config.getBasePath(), partitionPath, + commitItr.next().getFileName())); } } return deletePaths; @@ -111,17 +111,17 @@ public class HoodieCleaner { /** * Selects the versions for file for cleaning, such that it - * - * - Leaves the latest version of the file untouched - * - For older versions, - * - It leaves all the commits untouched which has occured in last config.getCleanerCommitsRetained() commits - * - It leaves ONE commit before this window. We assume that the max(query execution time) == commit_batch_time * config.getCleanerCommitsRetained(). This is 12 hours by default. - * This is essential to leave the file used by the query thats running for the max time. - * - * This provides the effect of having lookback into all changes that happened in the last X - * commits. (eg: if you retain 24 commits, and commit batch time is 30 mins, then you have 12 hrs of lookback) - * - * This policy is the default. + *

+ * - Leaves the latest version of the file untouched + * - For older versions, + * - It leaves all the commits untouched which has occured in last config.getCleanerCommitsRetained() commits + * - It leaves ONE commit before this window. We assume that the max(query execution time) == commit_batch_time * config.getCleanerCommitsRetained(). This is 12 hours by default. + * This is essential to leave the file used by the query thats running for the max time. + *

+ * This provides the effect of having lookback into all changes that happened in the last X + * commits. (eg: if you retain 24 commits, and commit batch time is 30 mins, then you have 12 hrs of lookback) + *

+ * This policy is the default. * * @param partitionPath * @return @@ -135,11 +135,12 @@ public class HoodieCleaner { List deletePaths = new ArrayList<>(); // determine if we have enough commits, to start cleaning. - if (commitTimeline.getTotalInstants() > commitsRetained) { - String earliestCommitToRetain = - commitTimeline.nthInstant(commitTimeline.getTotalInstants() - commitsRetained).get(); + if (commitTimeline.countInstants() > commitsRetained) { + HoodieInstant earliestCommitToRetain = + commitTimeline.nthInstant(commitTimeline.countInstants() - commitsRetained).get(); List> fileVersions = - fileSystemView.streamEveryVersionInPartition(partitionPath).collect(Collectors.toList()); + fileSystemView.getEveryVersionInPartition(partitionPath) + .collect(Collectors.toList()); for (List fileList : fileVersions) { String lastVersion = FSUtils.getCommitTime(fileList.get(0).getFileName()); String lastVersionBeforeEarliestCommitToRetain = @@ -160,8 +161,9 @@ public class HoodieCleaner { } // Always keep the last commit - if (commitTimeline.compareInstants(earliestCommitToRetain, fileCommitTime, - HoodieTimeline.GREATER)) { + if (commitTimeline + .compareTimestamps(earliestCommitToRetain.getTimestamp(), fileCommitTime, + HoodieTimeline.GREATER)) { // this is a commit, that should be cleaned. deletePaths.add(String .format("%s/%s/%s", config.getBasePath(), partitionPath, FSUtils @@ -177,10 +179,12 @@ public class HoodieCleaner { /** * Gets the latest version < commitTime. This version file could still be used by queries. */ - private String getLatestVersionBeforeCommit(List fileList, String commitTime) { + private String getLatestVersionBeforeCommit(List fileList, + HoodieInstant commitTime) { for (HoodieDataFile file : fileList) { String fileCommitTime = FSUtils.getCommitTime(file.getFileName()); - if (commitTimeline.compareInstants(commitTime, fileCommitTime, HoodieTimeline.GREATER)) { + if (commitTimeline.compareTimestamps(commitTime.getTimestamp(), fileCommitTime, + HoodieTimeline.GREATER)) { // fileList is sorted on the reverse, so the first commit we find <= commitTime is the one we want return fileCommitTime; } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCommitArchiveLog.java b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCommitArchiveLog.java index cc000410a..dfe8f8d10 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCommitArchiveLog.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCommitArchiveLog.java @@ -19,8 +19,8 @@ package com.uber.hoodie.io; import com.uber.hoodie.common.model.HoodieCommitMetadata; import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.table.HoodieTimeline; -import com.uber.hoodie.common.table.timeline.HoodieActiveCommitTimeline; -import com.uber.hoodie.common.table.timeline.HoodieArchivedCommitTimeline; +import com.uber.hoodie.common.table.timeline.HoodieArchivedTimeline; +import com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.common.file.HoodieAppendLog; import com.uber.hoodie.exception.HoodieCommitException; @@ -34,6 +34,7 @@ import org.apache.log4j.Logger; import java.io.IOException; import java.util.List; +import java.util.Optional; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -47,11 +48,10 @@ public class HoodieCommitArchiveLog { private final FileSystem fs; private final HoodieWriteConfig config; - public HoodieCommitArchiveLog(HoodieWriteConfig config, - FileSystem fs) { + public HoodieCommitArchiveLog(HoodieWriteConfig config, FileSystem fs) { this.fs = fs; this.config = config; - this.archiveFilePath = HoodieArchivedCommitTimeline + this.archiveFilePath = HoodieArchivedTimeline .getArchiveLogPath(config.getBasePath() + "/" + HoodieTableMetaClient.METAFOLDER_NAME); } @@ -59,7 +59,7 @@ public class HoodieCommitArchiveLog { * Check if commits need to be archived. If yes, archive commits. */ public boolean archiveIfRequired() { - List commitsToArchive = getCommitsToArchive().collect(Collectors.toList()); + List commitsToArchive = getCommitsToArchive().collect(Collectors.toList()); if (commitsToArchive.iterator().hasNext()) { log.info("Archiving commits " + commitsToArchive); archive(commitsToArchive); @@ -70,41 +70,42 @@ public class HoodieCommitArchiveLog { } } - private Stream getCommitsToArchive() { + private Stream getCommitsToArchive() { int maxCommitsToKeep = config.getMaxCommitsToKeep(); int minCommitsToKeep = config.getMinCommitsToKeep(); HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, config.getBasePath(), true); - HoodieTimeline commitTimeline = metaClient.getActiveCommitTimeline(); + HoodieTimeline commitTimeline = + metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(); - if (commitTimeline.hasInstants() && commitTimeline.getTotalInstants() > maxCommitsToKeep) { + if (!commitTimeline.empty() && commitTimeline.countInstants() > maxCommitsToKeep) { // Actually do the commits return commitTimeline.getInstants() - .limit(commitTimeline.getTotalInstants() - minCommitsToKeep); + .limit(commitTimeline.countInstants() - minCommitsToKeep); } return Stream.empty(); } - private boolean deleteCommits(List commitsToArchive) { + private boolean deleteCommits(List commitsToArchive) { log.info("Deleting commits " + commitsToArchive); HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, config.getBasePath(), true); - HoodieTimeline commitTimeline = metaClient.getActiveCommitTimeline(); + HoodieTimeline commitTimeline = + metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(); boolean success = true; - for(String commitToArchive:commitsToArchive) { - Path commitFile = new Path(metaClient.getMetaPath(), - ((HoodieActiveCommitTimeline) commitTimeline) - .getCompletedFileName(commitToArchive)); + for (HoodieInstant commitToArchive : commitsToArchive) { + Path commitFile = + new Path(metaClient.getMetaPath(), commitToArchive.getFileName()); try { if (fs.exists(commitFile)) { success &= fs.delete(commitFile, false); log.info("Archived and deleted commit file " + commitFile); } } catch (IOException e) { - throw new HoodieIOException( - "Failed to delete archived commit " + commitToArchive, e); + throw new HoodieIOException("Failed to delete archived commit " + commitToArchive, + e); } } return success; @@ -120,18 +121,19 @@ public class HoodieCommitArchiveLog { .compression(HoodieAppendLog.CompressionType.RECORD, new BZip2Codec())); } - private void archive(List commits) throws HoodieCommitException { + private void archive(List commits) throws HoodieCommitException { HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, config.getBasePath(), true); - HoodieTimeline commitTimeline = metaClient.getActiveCommitTimeline(); + HoodieTimeline commitTimeline = + metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(); HoodieAppendLog.Writer writer = null; try { writer = openWriter(); - for (String commitTime : commits) { - Text k = new Text(commitTime); + for (HoodieInstant commitTime : commits) { + Text k = new Text(commitTime.getTimestamp()); HoodieCommitMetadata commitMetadata = HoodieCommitMetadata - .fromBytes(commitTimeline.readInstantDetails(commitTime).get()); + .fromBytes(commitTimeline.getInstantDetails(commitTime).get()); Text v = new Text(commitMetadata.toJsonString()); writer.append(k, v); log.info("Wrote " + k); diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieIOHandle.java b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieIOHandle.java index 196f199d5..307f56023 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieIOHandle.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieIOHandle.java @@ -50,7 +50,8 @@ public abstract class HoodieIOHandle { this.config = config; this.fs = FSUtils.getFs(); this.metaClient = metaClient; - this.hoodieTimeline = metaClient.getActiveCommitTimeline(); + this.hoodieTimeline = + metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(); this.fileSystemView = new ReadOptimizedTableView(fs, metaClient); this.schema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema())); diff --git a/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieCopyOnWriteTable.java b/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieCopyOnWriteTable.java index f1de70095..20636fabf 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieCopyOnWriteTable.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieCopyOnWriteTable.java @@ -20,6 +20,7 @@ import com.uber.hoodie.common.model.HoodieDataFile; import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.table.TableFileSystemView; +import com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.common.table.view.ReadOptimizedTableView; import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.WriteStatus; @@ -291,13 +292,15 @@ public class HoodieCopyOnWriteTable extends Hoodi FileSystem fs = FSUtils.getFs(); List smallFileLocations = new ArrayList<>(); - HoodieTimeline commitTimeline = metaClient.getActiveCommitTimeline(); + HoodieTimeline commitTimeline = + metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(); TableFileSystemView fileSystemView = new ReadOptimizedTableView(fs, metaClient); - if (commitTimeline.hasInstants()) { // if we have some commits - String latestCommitTime = commitTimeline.lastInstant().get(); - List allFiles = fileSystemView.streamLatestVersionInPartition(partitionPath, latestCommitTime).collect( - Collectors.toList()); + if (!commitTimeline.empty()) { // if we have some commits + HoodieInstant latestCommitTime = commitTimeline.lastInstant().get(); + List allFiles = fileSystemView + .getLatestVersionInPartition(partitionPath, latestCommitTime.getTimestamp()) + .collect(Collectors.toList()); for (HoodieDataFile file : allFiles) { if (file.getFileSize() < config.getParquetSmallFileLimit()) { @@ -322,12 +325,13 @@ public class HoodieCopyOnWriteTable extends Hoodi */ private long averageBytesPerRecord() { long avgSize = 0L; - HoodieTimeline commitTimeline = metaClient.getActiveCommitTimeline(); + HoodieTimeline commitTimeline = + metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(); try { - if (commitTimeline.hasInstants()) { - String latestCommitTime = commitTimeline.lastInstant().get(); + if (!commitTimeline.empty()) { + HoodieInstant latestCommitTime = commitTimeline.lastInstant().get(); HoodieCommitMetadata commitMetadata = HoodieCommitMetadata - .fromBytes(commitTimeline.readInstantDetails(latestCommitTime).get()); + .fromBytes(commitTimeline.getInstantDetails(latestCommitTime).get()); avgSize = (long) Math.ceil( (1.0 * commitMetadata.fetchTotalBytesWritten()) / commitMetadata .fetchTotalRecordsWritten()); diff --git a/hoodie-client/src/test/java/com/uber/hoodie/TestHoodieClient.java b/hoodie-client/src/test/java/com/uber/hoodie/TestHoodieClient.java index 43313c496..37e5a4697 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/TestHoodieClient.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/TestHoodieClient.java @@ -29,6 +29,7 @@ import com.uber.hoodie.common.model.HoodieWriteStat; import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.table.TableFileSystemView; +import com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.common.table.view.ReadOptimizedTableView; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.common.util.ParquetUtils; @@ -41,7 +42,6 @@ import com.uber.hoodie.index.HoodieIndex; import com.uber.hoodie.io.HoodieCleaner; import org.apache.avro.generic.GenericRecord; -import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.spark.SparkConf; @@ -61,10 +61,8 @@ import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; -import java.util.Map; import java.util.Optional; import java.util.Set; -import java.util.SortedMap; import java.util.TreeSet; import java.util.stream.Collectors; @@ -293,15 +291,16 @@ public class TestHoodieClient implements Serializable { assertNoWriteErrors(statuses); HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); - HoodieTimeline timeline = metadata.getActiveCommitTimeline(); + HoodieTimeline timeline = + metadata.getActiveTimeline().getCommitTimeline(); TableFileSystemView fsView = new ReadOptimizedTableView(fs, metadata); // Need to ensure the following for (String partitionPath : dataGen.getPartitionPaths()) { // compute all the versions of all files, from time 0 HashMap> fileIdToVersions = new HashMap<>(); - for (String entry : timeline.getInstants().collect(Collectors.toList())) { - HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(timeline.readInstantDetails(entry).get()); + for (HoodieInstant entry : timeline.getInstants().collect(Collectors.toList())) { + HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(entry).get()); for (HoodieWriteStat wstat : commitMetadata.getWriteStats(partitionPath)) { if (!fileIdToVersions.containsKey(wstat.getFileId())) { @@ -312,7 +311,7 @@ public class TestHoodieClient implements Serializable { } - List> fileVersions = fsView.streamEveryVersionInPartition(partitionPath).collect(Collectors.toList()); + List> fileVersions = fsView.getEveryVersionInPartition(partitionPath).collect(Collectors.toList()); for (List entry : fileVersions) { // No file has no more than max versions String fileId = entry.iterator().next().getFileId(); @@ -373,13 +372,14 @@ public class TestHoodieClient implements Serializable { assertNoWriteErrors(statuses); HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); - HoodieTimeline activeTimeline = metadata.getActiveCommitTimeline(); - Optional earliestRetainedCommit = activeTimeline.nthFromLastInstant(maxCommits - 1); - Set acceptableCommits = + HoodieTimeline activeTimeline = metadata.getActiveTimeline().getCommitTimeline().filterCompletedInstants(); + Optional + earliestRetainedCommit = activeTimeline.nthFromLastInstant(maxCommits - 1); + Set acceptableCommits = activeTimeline.getInstants().collect(Collectors.toSet()); if (earliestRetainedCommit.isPresent()) { acceptableCommits.removeAll( - activeTimeline.findInstantsInRange("000", earliestRetainedCommit.get()) + activeTimeline.findInstantsInRange("000", earliestRetainedCommit.get().getTimestamp()).getInstants() .collect(Collectors.toSet())); acceptableCommits.add(earliestRetainedCommit.get()); } @@ -387,16 +387,16 @@ public class TestHoodieClient implements Serializable { TableFileSystemView fsView = new ReadOptimizedTableView(fs, metadata); // Need to ensure the following for (String partitionPath : dataGen.getPartitionPaths()) { - List> fileVersions = fsView.streamEveryVersionInPartition(partitionPath).collect(Collectors.toList()); + List> fileVersions = fsView.getEveryVersionInPartition(partitionPath).collect(Collectors.toList()); for (List entry : fileVersions) { Set commitTimes = new HashSet<>(); for(HoodieDataFile value:entry) { System.out.println("Data File - " + value); commitTimes.add(value.getCommitTime()); } - System.out.println("Existing commits " + activeTimeline.getInstants().collect(Collectors.toList())); assertEquals("Only contain acceptable versions of file should be present", - acceptableCommits, commitTimes); + acceptableCommits.stream().map(HoodieInstant::getTimestamp) + .collect(Collectors.toSet()), commitTimes); } } } @@ -637,9 +637,8 @@ public class TestHoodieClient implements Serializable { assertEquals("2 files needs to be committed.", 2, statuses.size()); HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); - HoodieTimeline activeTimeline = metadata.getActiveCommitTimeline(); TableFileSystemView fileSystemView = new ReadOptimizedTableView(fs, metadata); - List files = fileSystemView.streamLatestVersionInPartition(TEST_PARTITION_PATH, commitTime3).collect( + List files = fileSystemView.getLatestVersionInPartition(TEST_PARTITION_PATH, commitTime3).collect( Collectors.toList()); int numTotalInsertsInCommit3 = 0; for (HoodieDataFile file: files) { diff --git a/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieClientTestUtils.java b/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieClientTestUtils.java index a2f9eb49b..0f43910ab 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieClientTestUtils.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieClientTestUtils.java @@ -19,6 +19,7 @@ package com.uber.hoodie.common; import com.uber.hoodie.WriteStatus; import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.util.FSUtils; import java.io.File; @@ -60,11 +61,11 @@ public class HoodieClientTestUtils { public static void fakeCommitFile(String basePath, String commitTime) throws IOException { - fakeMetaFile(basePath, commitTime, HoodieTableMetaClient.COMMIT_EXTENSION); + fakeMetaFile(basePath, commitTime, HoodieTimeline.COMMIT_EXTENSION); } public static void fakeInFlightFile(String basePath, String commitTime) throws IOException { - fakeMetaFile(basePath, commitTime, HoodieTableMetaClient.INFLIGHT_FILE_SUFFIX); + fakeMetaFile(basePath, commitTime, HoodieTimeline.INFLIGHT_EXTENSION); } public static void fakeDataFile(String basePath, String partitionPath, String commitTime, String fileId) throws Exception { diff --git a/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieTestDataGenerator.java b/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieTestDataGenerator.java index b034b7ef9..800ee5e95 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieTestDataGenerator.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieTestDataGenerator.java @@ -20,6 +20,7 @@ import com.uber.hoodie.common.model.HoodieCommitMetadata; import com.uber.hoodie.common.model.HoodieKey; import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.common.util.HoodieAvroUtils; @@ -142,7 +143,7 @@ public class HoodieTestDataGenerator { public static void createCommitFile(String basePath, String commitTime) throws IOException { Path commitFile = - new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTableMetaClient.makeCommitFileName(commitTime)); + new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline.makeCommitFileName(commitTime)); FileSystem fs = FSUtils.getFs(); FSDataOutputStream os = fs.create(commitFile, true); HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata(); diff --git a/hoodie-client/src/test/java/com/uber/hoodie/func/TestUpdateMapFunction.java b/hoodie-client/src/test/java/com/uber/hoodie/func/TestUpdateMapFunction.java index ec6c2329f..093dd085d 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/func/TestUpdateMapFunction.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/func/TestUpdateMapFunction.java @@ -17,6 +17,7 @@ package com.uber.hoodie.func; import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.WriteStatus; import com.uber.hoodie.common.TestRawTripPayload; @@ -79,7 +80,7 @@ public class TestUpdateMapFunction { rowChange3)); Iterator> insertResult = table.handleInsert(records.iterator()); Path commitFile = - new Path(config.getBasePath() + "/.hoodie/" + HoodieTableMetaClient.makeCommitFileName("100")); + new Path(config.getBasePath() + "/.hoodie/" + HoodieTimeline.makeCommitFileName("100")); FSUtils.getFs().create(commitFile); // Now try an update with an evolved schema diff --git a/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCommitArchiveLog.java b/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCommitArchiveLog.java index 171103742..9229be3eb 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCommitArchiveLog.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCommitArchiveLog.java @@ -19,6 +19,7 @@ package com.uber.hoodie.io; import com.google.common.collect.Lists; import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.table.HoodieTimeline; +import com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.common.HoodieTestDataGenerator; import com.uber.hoodie.common.model.HoodieCommitMetadata; @@ -78,15 +79,16 @@ public class TestHoodieCommitArchiveLog { HoodieTestDataGenerator.createCommitFile(basePath, "102"); HoodieTestDataGenerator.createCommitFile(basePath, "103"); - HoodieTimeline timeline = metadata.getActiveCommitTimeline(); + HoodieTimeline timeline = + metadata.getActiveTimeline().getCommitTimeline().filterCompletedInstants(); - assertEquals("Loaded 4 commits and the count should match", 4, - timeline.getTotalInstants()); + assertEquals("Loaded 4 commits and the count should match", 4, timeline.countInstants()); boolean result = archiveLog.archiveIfRequired(); assertTrue(result); - timeline = timeline.reload(); + timeline = + metadata.getActiveTimeline().reload().getCommitTimeline().filterCompletedInstants(); assertEquals("Should not archive commits when maxCommitsToKeep is 5", 4, - timeline.getTotalInstants()); + timeline.countInstants()); } @Test @@ -104,19 +106,21 @@ public class TestHoodieCommitArchiveLog { HoodieTestDataGenerator.createCommitFile(basePath, "104"); HoodieTestDataGenerator.createCommitFile(basePath, "105"); - HoodieTimeline timeline = metadata.getActiveCommitTimeline(); - List originalCommits = timeline.getInstants().collect( - Collectors.toList()); + HoodieTimeline timeline = + metadata.getActiveTimeline().getCommitTimeline().filterCompletedInstants(); + List originalCommits = timeline.getInstants().collect(Collectors.toList()); - assertEquals("Loaded 6 commits and the count should match", 6, timeline.getTotalInstants()); + assertEquals("Loaded 6 commits and the count should match", 6, timeline.countInstants()); boolean result = archiveLog.archiveIfRequired(); assertTrue(result); - timeline = timeline.reload(); + timeline = + metadata.getActiveTimeline().reload().getCommitTimeline().filterCompletedInstants(); assertEquals( "Should archive commits when maxCommitsToKeep is 5 and now the commits length should be minCommitsToKeep which is 2", - 2, timeline.getTotalInstants()); + 2, timeline.countInstants()); assertEquals("Archive should not archive the last 2 commits", - Lists.newArrayList("104", "105"), timeline.getInstants().collect(Collectors.toList())); + Lists.newArrayList("104", "105"), + timeline.getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList())); // Remove all the commits from the original commits, make it ready to be checked against the read map timeline.getInstants().forEach(originalCommits::remove); @@ -134,7 +138,8 @@ public class TestHoodieCommitArchiveLog { assertEquals( "Read commits map should match the originalCommits - commitsLoadedAfterArchival", - originalCommits, new ArrayList<>(readCommits.keySet())); + originalCommits.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toList()), + new ArrayList<>(readCommits.keySet())); reader.close(); } @@ -153,15 +158,21 @@ public class TestHoodieCommitArchiveLog { HoodieTestDataGenerator.createCommitFile(basePath, "104"); HoodieTestDataGenerator.createCommitFile(basePath, "105"); - HoodieTimeline timeline = metadata.getActiveCommitTimeline(); - assertEquals("Loaded 6 commits and the count should match", 6, timeline.getTotalInstants()); + HoodieTimeline timeline = + metadata.getActiveTimeline().getCommitTimeline().filterCompletedInstants(); + assertEquals("Loaded 6 commits and the count should match", 6, timeline.countInstants()); boolean result = archiveLog.archiveIfRequired(); assertTrue(result); - timeline = timeline.reload(); - assertTrue("Archived commits should always be safe", timeline.containsOrBeforeTimelineStarts("100")); - assertTrue("Archived commits should always be safe", timeline.containsOrBeforeTimelineStarts("101")); - assertTrue("Archived commits should always be safe", timeline.containsOrBeforeTimelineStarts("102")); - assertTrue("Archived commits should always be safe", timeline.containsOrBeforeTimelineStarts("103")); + timeline = + metadata.getActiveTimeline().reload().getCommitTimeline().filterCompletedInstants(); + assertTrue("Archived commits should always be safe", + timeline.containsOrBeforeTimelineStarts("100")); + assertTrue("Archived commits should always be safe", + timeline.containsOrBeforeTimelineStarts("101")); + assertTrue("Archived commits should always be safe", + timeline.containsOrBeforeTimelineStarts("102")); + assertTrue("Archived commits should always be safe", + timeline.containsOrBeforeTimelineStarts("103")); } diff --git a/hoodie-client/src/test/java/com/uber/hoodie/table/TestCopyOnWriteTable.java b/hoodie-client/src/test/java/com/uber/hoodie/table/TestCopyOnWriteTable.java index be035be62..9201fc061 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/table/TestCopyOnWriteTable.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/table/TestCopyOnWriteTable.java @@ -184,8 +184,8 @@ public class TestCopyOnWriteTable { if (file.getName().endsWith(".parquet")) { if (FSUtils.getFileId(file.getName()) .equals(FSUtils.getFileId(parquetFile.getName())) && metadata - .getActiveCommitTimeline() - .compareInstants(FSUtils.getCommitTime(file.getName()), + .getActiveTimeline().getCommitTimeline() + .compareTimestamps(FSUtils.getCommitTime(file.getName()), FSUtils.getCommitTime(parquetFile.getName()), HoodieTimeline.GREATER)) { updatedParquetFile = file; break; diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieStorageType.java b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieFileFormat.java similarity index 92% rename from hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieStorageType.java rename to hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieFileFormat.java index 6c53078c7..73da01ca9 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieStorageType.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieFileFormat.java @@ -16,12 +16,12 @@ package com.uber.hoodie.common.model; -public enum HoodieStorageType { +public enum HoodieFileFormat { PARQUET(".parquet"); private final String extension; - HoodieStorageType(String extension) { + HoodieFileFormat(String extension) { this.extension = extension; } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/HoodieTableConfig.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/HoodieTableConfig.java index 38fcfd32c..33fd78152 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/table/HoodieTableConfig.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/HoodieTableConfig.java @@ -16,9 +16,8 @@ package com.uber.hoodie.common.table; -import com.uber.hoodie.common.model.HoodieStorageType; +import com.uber.hoodie.common.model.HoodieFileFormat; import com.uber.hoodie.common.model.HoodieTableType; -import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.exception.HoodieIOException; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; @@ -46,10 +45,10 @@ public class HoodieTableConfig implements Serializable { public static final String HOODIE_PROPERTIES_FILE = "hoodie.properties"; public static final String HOODIE_TABLE_NAME_PROP_NAME = "hoodie.table.name"; public static final String HOODIE_TABLE_TYPE_PROP_NAME = "hoodie.table.type"; - public static final String HOODIE_RO_STORAGE_FORMAT_PROP_NAME = - "hoodie.table.ro.storage.format"; + public static final String HOODIE_RO_FILE_FORMAT_PROP_NAME = + "hoodie.table.ro.file.format"; public static final HoodieTableType DEFAULT_TABLE_TYPE = HoodieTableType.COPY_ON_WRITE; - public static final HoodieStorageType DEFAULT_RO_STORAGE_FORMAT = HoodieStorageType.PARQUET; + public static final HoodieFileFormat DEFAULT_RO_FILE_FORMAT = HoodieFileFormat.PARQUET; private Properties props; public HoodieTableConfig(FileSystem fs, String metaPath) { @@ -127,12 +126,12 @@ public class HoodieTableConfig implements Serializable { /** * Get the Read Optimized Storage Format * - * @return HoodieStorageType for the Read Optimized Storage format + * @return HoodieFileFormat for the Read Optimized Storage format */ - public HoodieStorageType getROStorageFormat() { - if (props.contains(HOODIE_RO_STORAGE_FORMAT_PROP_NAME)) { - return HoodieStorageType.valueOf(props.getProperty(HOODIE_RO_STORAGE_FORMAT_PROP_NAME)); + public HoodieFileFormat getROFileFormat() { + if (props.contains(HOODIE_RO_FILE_FORMAT_PROP_NAME)) { + return HoodieFileFormat.valueOf(props.getProperty(HOODIE_RO_FILE_FORMAT_PROP_NAME)); } - return DEFAULT_RO_STORAGE_FORMAT; + return DEFAULT_RO_FILE_FORMAT; } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/HoodieTableMetaClient.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/HoodieTableMetaClient.java index f4a7ef44d..f80000bc9 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/table/HoodieTableMetaClient.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/HoodieTableMetaClient.java @@ -17,10 +17,8 @@ package com.uber.hoodie.common.table; import com.uber.hoodie.common.model.HoodieTableType; -import com.uber.hoodie.common.table.timeline.HoodieActiveCommitTimeline; -import com.uber.hoodie.common.table.timeline.HoodieArchivedCommitTimeline; -import com.uber.hoodie.common.table.timeline.HoodieCleanerTimeline; -import com.uber.hoodie.common.table.timeline.HoodieSavePointTimeline; +import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline; +import com.uber.hoodie.common.table.timeline.HoodieArchivedTimeline; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.exception.DatasetNotFoundException; import org.apache.hadoop.fs.FileStatus; @@ -30,8 +28,13 @@ import org.apache.hadoop.fs.PathFilter; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; +import java.io.Externalizable; import java.io.File; import java.io.IOException; +import java.io.ObjectInput; +import java.io.ObjectInputStream; +import java.io.ObjectOutput; +import java.io.ObjectOutputStream; import java.io.Serializable; import java.util.Objects; import java.util.Properties; @@ -50,28 +53,22 @@ import java.util.Properties; public class HoodieTableMetaClient implements Serializable { private final transient static Logger log = LogManager.getLogger(HoodieTableMetaClient.class); public static String METAFOLDER_NAME = ".hoodie"; - public static String COMMIT_EXTENSION = ".commit"; - public static String CLEAN_EXTENSION = ".clean"; - public static String SAVEPOINT_EXTENSION = ".savepoint"; - public static String INFLIGHT_FILE_SUFFIX = ".inflight"; private String basePath; private transient FileSystem fs; private String metaPath; private HoodieTableType tableType; private HoodieTableConfig tableConfig; - private HoodieTimeline activeCommitTimeline; - private HoodieTimeline archivedCommitTimeline; - private HoodieTimeline savePointTimeline; - private HoodieTimeline cleanerTimeline; + private HoodieActiveTimeline activeTimeline; + private HoodieArchivedTimeline archivedTimeline; public HoodieTableMetaClient(FileSystem fs, String basePath) throws DatasetNotFoundException { // Do not load any timeline by default this(fs, basePath, false); } - public HoodieTableMetaClient(FileSystem fs, String basePath, - boolean loadActiveCommitTimelineOnLoad) throws DatasetNotFoundException { + public HoodieTableMetaClient(FileSystem fs, String basePath, boolean loadActiveTimelineOnLoad) + throws DatasetNotFoundException { log.info("Loading HoodieTableMetaClient from " + basePath); this.basePath = basePath; this.fs = fs; @@ -82,14 +79,15 @@ public class HoodieTableMetaClient implements Serializable { this.tableConfig = new HoodieTableConfig(fs, metaPath); this.tableType = tableConfig.getTableType(); log.info("Finished Loading Table of type " + tableType + " from " + basePath); - if (loadActiveCommitTimelineOnLoad) { + if (loadActiveTimelineOnLoad) { log.info("Loading Active commit timeline for " + basePath); - getActiveCommitTimeline(); + getActiveTimeline(); } } /** * For serailizing and de-serializing + * * @deprecated */ public HoodieTableMetaClient() { @@ -97,6 +95,7 @@ public class HoodieTableMetaClient implements Serializable { /** * This method is only used when this object is deserialized in a spark executor. + * * @deprecated */ private void readObject(java.io.ObjectInputStream in) @@ -105,6 +104,11 @@ public class HoodieTableMetaClient implements Serializable { this.fs = FSUtils.getFs(); } + private void writeObject(java.io.ObjectOutputStream out) + throws IOException { + out.defaultWriteObject(); + } + /** * @return Base path */ @@ -134,16 +138,16 @@ public class HoodieTableMetaClient implements Serializable { } /** - * Get the active commits as a timeline + * Get the active instants as a timeline * - * @return Active commit timeline + * @return Active instants timeline * @throws IOException */ - public synchronized HoodieTimeline getActiveCommitTimeline() { - if (activeCommitTimeline == null) { - activeCommitTimeline = new HoodieActiveCommitTimeline(fs, metaPath); + public synchronized HoodieActiveTimeline getActiveTimeline() { + if (activeTimeline == null) { + activeTimeline = new HoodieActiveTimeline(fs, metaPath); } - return activeCommitTimeline; + return activeTimeline; } /** @@ -153,40 +157,13 @@ public class HoodieTableMetaClient implements Serializable { * @return Active commit timeline * @throws IOException */ - public HoodieTimeline getArchivedCommitTimeline() { - if (archivedCommitTimeline == null) { - archivedCommitTimeline = new HoodieArchivedCommitTimeline(fs, metaPath); + public synchronized HoodieArchivedTimeline getArchivedTimeline() { + if (archivedTimeline == null) { + archivedTimeline = new HoodieArchivedTimeline(fs, metaPath); } - return archivedCommitTimeline; + return archivedTimeline; } - /** - * Get the save points as a timeline. - * - * @return Savepoint timeline - * @throws IOException - */ - public HoodieTimeline getSavePointsTimeline() { - if (savePointTimeline == null) { - savePointTimeline = new HoodieSavePointTimeline(fs, metaPath); - } - return savePointTimeline; - } - - /** - * Get the cleaner activity as a timeline. - * - * @return Cleaner activity - * @throws IOException - */ - public HoodieTimeline getCleanerTimeline() { - if (cleanerTimeline == null) { - cleanerTimeline = new HoodieCleanerTimeline(fs, metaPath); - } - return cleanerTimeline; - } - - /** * Helper method to initialize a given path as a hoodie dataset with configs passed in as as Properties * @@ -215,34 +192,6 @@ public class HoodieTableMetaClient implements Serializable { } // HELPER METHODS TO CREATE META FILE NAMES - public static String makeCommitFileName(String commitTime) { - return commitTime + COMMIT_EXTENSION; - } - - public static String makeInflightCommitFileName(String commitTime) { - return commitTime + INFLIGHT_FILE_SUFFIX; - } - - public static String makeCleanerFileName(String instant) { - return instant + CLEAN_EXTENSION; - } - - public static String makeInflightCleanerFileName(String instant) { - return instant + CLEAN_EXTENSION + INFLIGHT_FILE_SUFFIX; - } - - public static String makeInflightSavePointFileName(String commitTime) { - return commitTime + SAVEPOINT_EXTENSION + INFLIGHT_FILE_SUFFIX; - } - - public static String makeSavePointFileName(String commitTime) { - return commitTime + SAVEPOINT_EXTENSION; - } - - public static String getCommitFromCommitFile(String commitFileName) { - return commitFileName.split("\\.")[0]; - } - public static FileStatus[] scanFiles(FileSystem fs, Path metaPath, PathFilter nameFilter) throws IOException { return fs.listStatus(metaPath, nameFilter); diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/HoodieTimeline.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/HoodieTimeline.java index 3baf142a7..c83c19c9d 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/table/HoodieTimeline.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/HoodieTimeline.java @@ -17,153 +17,125 @@ package com.uber.hoodie.common.table; import com.uber.hoodie.common.table.timeline.HoodieDefaultTimeline; +import com.uber.hoodie.common.table.timeline.HoodieInstant; +import com.uber.hoodie.common.util.FSUtils; -import java.io.IOException; import java.io.Serializable; import java.util.Optional; import java.util.function.BiPredicate; import java.util.stream.Stream; /** - * HoodieTimeline allows representation of meta-data events as a timeline. - * Instants are specific points in time represented as strings. - * in this format YYYYMMDDHHmmSS. e.g. 20170101193218 - * Any operation on the timeline starts with the inflight instant and then when complete marks - * the completed instant and removes the inflight instant. - * Completed instants are plainly referred to as just instants + * HoodieTimeline is a view of meta-data instants in the hoodie dataset. + * Instants are specific points in time represented as HoodieInstant. *

- * Timelines as immutable once created. Any operation to change the timeline (like create/delete instants) - * will not be reflected unless explicitly reloaded using the reload() + * Timelines are immutable once created and operations create new instance of + * timelines which filter on the instants and this can be chained. * * @see com.uber.hoodie.common.table.HoodieTableMetaClient * @see HoodieDefaultTimeline + * @see HoodieInstant * @since 0.3.0 */ public interface HoodieTimeline extends Serializable { + String COMMIT_ACTION = "commit"; + String CLEAN_ACTION = "clean"; + String SAVEPOINT_ACTION = "savepoint"; + String INFLIGHT_EXTENSION = ".inflight"; + String COMMIT_EXTENSION = "." + COMMIT_ACTION; + String CLEAN_EXTENSION = "." + CLEAN_ACTION; + String SAVEPOINT_EXTENSION = "." + SAVEPOINT_ACTION; + //this is to preserve backwards compatibility on commit in-flight filenames + String INFLIGHT_COMMIT_EXTENSION = INFLIGHT_EXTENSION; + String INFLIGHT_CLEAN_EXTENSION = "." + CLEAN_ACTION + INFLIGHT_EXTENSION; + String INFLIGHT_SAVEPOINT_EXTENSION = "." + SAVEPOINT_ACTION + INFLIGHT_EXTENSION; + + /** - * Find all the completed instants after startTs and before or on endTs + * Filter this timeline to just include the in-flights + * + * @return New instance of HoodieTimeline with just in-flights + */ + HoodieTimeline filterInflights(); + + /** + * Filter this timeline to just include the completed instants + * + * @return New instance of HoodieTimeline with just completed instants + */ + HoodieTimeline filterCompletedInstants(); + + + /** + * Create a new Timeline with instants after startTs and before or on endTs * * @param startTs * @param endTs - * @return Stream of instants */ - Stream findInstantsInRange(String startTs, String endTs); + HoodieTimeline findInstantsInRange(String startTs, String endTs); /** - * Find all the completed instants after startTs + * Create a new Timeline with all the instants after startTs * * @param commitTime * @param numCommits - * @return Stream of instants */ - Stream findInstantsAfter(String commitTime, int numCommits); + HoodieTimeline findInstantsAfter(String commitTime, int numCommits); /** - * If the timeline has any completed instants + * If the timeline has any instants * - * @return true if timeline is not empty + * @return true if timeline is empty */ - boolean hasInstants(); - - /** - * If the timeline has any in-complete instants - * - * @return true if timeline has any in-complete instants - */ - boolean hasInflightInstants(); + boolean empty(); /** * @return total number of completed instants */ - int getTotalInstants(); + int countInstants(); /** * @return first completed instant if available */ - Optional firstInstant(); + Optional firstInstant(); /** * @param n * @return nth completed instant from the first completed instant */ - Optional nthInstant(int n); + Optional nthInstant(int n); /** * @return last completed instant if available */ - Optional lastInstant(); + Optional lastInstant(); /** * @param n * @return nth completed instant going back from the last completed instant */ - Optional nthFromLastInstant(int n); + Optional nthFromLastInstant(int n); /** * @return true if the passed instant is present as a completed instant on the timeline */ - boolean containsInstant(String instant); + boolean containsInstant(HoodieInstant instant); /** * @return true if the passed instant is present as a completed instant on the timeline or * if the instant is before the first completed instant in the timeline */ - boolean containsOrBeforeTimelineStarts(String instant); + boolean containsOrBeforeTimelineStarts(String ts); /** * @return Get the stream of completed instants */ - Stream getInstants(); - - /** - * @return Get the stream of in-flight instants - */ - Stream getInflightInstants(); + Stream getInstants(); /** * @return true if the passed in instant is before the first completed instant in the timeline */ - boolean isInstantBeforeTimelineStarts(String instant); - - /** - * Register the passed in instant as a in-flight - * - * @param instant - */ - void saveInstantAsInflight(String instant); - - /** - * Register the passed in instant as a completed instant. - * It needs to have a corresponding in-flight instant, otherwise it will fail. - * Pass a optional byte[] to save with the instant. - * - * @param instant - * @param data - */ - void saveInstantAsComplete(String instant, Optional data); - - /** - * Un-Register a completed instant as in-flight. This is usually atomic way to - * revert the effects of a operation on hoodie datasets - * - * @param instant - */ - void revertInstantToInflight(String instant); - - /** - * Remove the in-flight instant from the timeline - * - * @param instant - */ - void removeInflightFromTimeline(String instant); - - /** - * Reload the timeline. Timelines are immutable once created. - * - * @return - * @throws IOException - */ - HoodieTimeline reload() throws IOException; + boolean isBeforeTimelineStarts(String ts); /** * Read the completed instant details @@ -171,7 +143,7 @@ public interface HoodieTimeline extends Serializable { * @param instant * @return */ - Optional readInstantDetails(String instant); + Optional getInstantDetails(HoodieInstant instant); /** * Helper methods to compare instants @@ -183,8 +155,55 @@ public interface HoodieTimeline extends Serializable { (commit1, commit2) -> commit1.compareTo(commit2) <= 0; BiPredicate LESSER = (commit1, commit2) -> commit1.compareTo(commit2) < 0; - default boolean compareInstants(String commit1, String commit2, + default boolean compareTimestamps(String commit1, String commit2, BiPredicate predicateToApply) { return predicateToApply.test(commit1, commit2); } + + static HoodieInstant getCompletedInstant(final HoodieInstant instant) { + return new HoodieInstant(false, instant.getAction(), instant.getTimestamp()); + } + + + static HoodieInstant getInflightInstant(final HoodieInstant instant) { + return new HoodieInstant(true, instant.getAction(), instant.getTimestamp()); + } + + static String makeCommitFileName(String commitTime) { + return commitTime + HoodieTimeline.COMMIT_EXTENSION; + } + + static String makeInflightCommitFileName(String commitTime) { + return commitTime + HoodieTimeline.INFLIGHT_COMMIT_EXTENSION; + } + + static String makeCleanerFileName(String instant) { + return instant + HoodieTimeline.CLEAN_EXTENSION; + } + + static String makeInflightCleanerFileName(String instant) { + return instant + HoodieTimeline.INFLIGHT_CLEAN_EXTENSION; + } + + static String makeInflightSavePointFileName(String commitTime) { + return commitTime + HoodieTimeline.INFLIGHT_SAVEPOINT_EXTENSION; + } + + static String makeSavePointFileName(String commitTime) { + return commitTime + HoodieTimeline.SAVEPOINT_EXTENSION; + } + + static String getCommitFromCommitFile(String commitFileName) { + return commitFileName.split("\\.")[0]; + } + + static String makeFileNameAsComplete(String fileName) { + return fileName.replace(HoodieTimeline.INFLIGHT_EXTENSION, ""); + } + + static String makeFileNameAsInflight(String fileName) { + return fileName + HoodieTimeline.INFLIGHT_EXTENSION; + } + + } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/TableFileSystemView.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/TableFileSystemView.java index e29c79bd0..fb3004e4a 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/table/TableFileSystemView.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/TableFileSystemView.java @@ -51,7 +51,7 @@ public interface TableFileSystemView { * @param maxCommitTime * @return */ - Stream streamLatestVersionInPartition(String partitionPathStr, + Stream getLatestVersionInPartition(String partitionPathStr, String maxCommitTime); /** @@ -60,7 +60,7 @@ public interface TableFileSystemView { * @param partitionPath * @return */ - Stream> streamEveryVersionInPartition(String partitionPath); + Stream> getEveryVersionInPartition(String partitionPath); /** * Stream all the versions from the passed in fileStatus[] with commit times containing in commitsToReturn. @@ -69,7 +69,7 @@ public interface TableFileSystemView { * @param commitsToReturn * @return */ - Stream streamLatestVersionInRange(FileStatus[] fileStatuses, + Stream getLatestVersionInRange(FileStatus[] fileStatuses, List commitsToReturn); /** @@ -79,15 +79,15 @@ public interface TableFileSystemView { * @param maxCommitToReturn * @return */ - Stream streamLatestVersionsBeforeOrOn(FileStatus[] fileStatuses, + Stream getLatestVersionsBeforeOrOn(FileStatus[] fileStatuses, String maxCommitToReturn); /** * Stream latest versions from the passed in FileStatus[]. - * Similar to calling streamLatestVersionsBeforeOrOn(fileStatuses, currentTimeAsCommitTime) + * Similar to calling getLatestVersionsBeforeOrOn(fileStatuses, currentTimeAsCommitTime) * * @param fileStatuses * @return */ - Stream streamLatestVersions(FileStatus[] fileStatuses); + Stream getLatestVersions(FileStatus[] fileStatuses); } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieActiveCommitTimeline.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieActiveCommitTimeline.java deleted file mode 100644 index 2133b3fdf..000000000 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieActiveCommitTimeline.java +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.uber.hoodie.common.table.timeline; - -import com.uber.hoodie.common.table.HoodieTableMetaClient; -import com.uber.hoodie.common.table.HoodieTimeline; -import com.uber.hoodie.exception.HoodieIOException; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; - -import java.io.IOException; -import java.util.Arrays; -import java.util.stream.Collectors; - -/** - * Active commit timeline. Much optimized for reading than the ArchivedTimeline. - */ -public class HoodieActiveCommitTimeline extends HoodieDefaultTimeline { - public HoodieActiveCommitTimeline(FileSystem fs, String metaPath) { - super(fs, metaPath); - String completedInstantExtension = HoodieTableMetaClient.COMMIT_EXTENSION; - String inflightInstantExtension = INFLIGHT_EXTENSION; - - FileStatus[] fileStatuses; - try { - fileStatuses = HoodieTableMetaClient.scanFiles(fs, new Path(metaPath), - path -> path.toString().endsWith(completedInstantExtension) || path.toString() - .endsWith(inflightInstantExtension)); - } catch (IOException e) { - throw new HoodieIOException("Failed to scan metadata", e); - } - this.instants = Arrays.stream(fileStatuses) - .filter(status -> status.getPath().getName().endsWith(completedInstantExtension)) - .map(fileStatus -> fileStatus.getPath().getName().replaceAll(completedInstantExtension, "")) - .sorted().collect(Collectors.toList()); - this.inflights = Arrays.stream(fileStatuses).filter( - status -> status.getPath().getName().endsWith(inflightInstantExtension)).map( - fileStatus -> fileStatus.getPath().getName() - .replaceAll(inflightInstantExtension, "")).sorted() - .collect(Collectors.toList()); - } - - @Override - public String getInflightFileName(String instant) { - return HoodieTableMetaClient.makeInflightCommitFileName(instant); - } - - @Override - public String getCompletedFileName(String instant) { - return HoodieTableMetaClient.makeCommitFileName(instant); - } - - @Override - protected String getTimelineName() { - return "commit"; - } - - @Override - public HoodieTimeline reload() throws IOException { - return new HoodieActiveCommitTimeline(fs, metaPath); - } -} diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieActiveTimeline.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieActiveTimeline.java new file mode 100644 index 000000000..17ba91116 --- /dev/null +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieActiveTimeline.java @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.common.table.timeline; + +import com.google.common.io.Closeables; +import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.table.HoodieTimeline; +import com.uber.hoodie.common.util.FSUtils; +import com.uber.hoodie.exception.HoodieIOException; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.io.Serializable; +import java.util.Arrays; +import java.util.Comparator; +import java.util.Optional; +import java.util.function.Function; +import java.util.function.Predicate; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +/** + * Represents the Active Timeline for the HoodieDataset. Instants for the last 12 hours (configurable) + * is in the ActiveTimeline and the rest are Archived. ActiveTimeline is a special timeline + * that allows for creation of instants on the timeline. + *

+ * The timeline is not automatically reloaded on any mutation operation, clients have to manually call reload() + * so that they can chain multiple mutations to the timeline and then call reload() once. + *

+ * This class can be serialized and de-serialized and on de-serialization the FileSystem is re-initialized. + */ +public class HoodieActiveTimeline extends HoodieDefaultTimeline { + private final transient static Logger log = LogManager.getLogger(HoodieActiveTimeline.class); + private String metaPath; + private transient FileSystem fs; + + + protected HoodieActiveTimeline(FileSystem fs, String metaPath, String[] includedExtensions) { + // Filter all the filter in the metapath and include only the extensions passed and + // convert them into HoodieInstant + try { + this.instants = + Arrays.stream(HoodieTableMetaClient.scanFiles(fs, new Path(metaPath), path -> { + // Include only the meta files with extensions that needs to be included + String extension = FSUtils.getFileExtension(path.getName()); + return Arrays.stream(includedExtensions).anyMatch(Predicate.isEqual(extension)); + })).sorted(Comparator.comparing( + // Sort the meta-data by the instant time (first part of the file name) + fileStatus -> FSUtils.getInstantTime(fileStatus.getPath().getName()))) + // create HoodieInstantMarkers from FileStatus, which extracts properties + .map(HoodieInstant::new).collect(Collectors.toList()); + log.info("Loaded instants " + instants); + } catch (IOException e) { + throw new HoodieIOException("Failed to scan metadata", e); + } + this.fs = fs; + this.metaPath = metaPath; + // multiple casts will make this lambda serializable - http://docs.oracle.com/javase/specs/jls/se8/html/jls-15.html#jls-15.16 + this.details = (Function> & Serializable) this::getInstantDetails; + } + + public HoodieActiveTimeline(FileSystem fs, String metaPath) { + this(fs, metaPath, + new String[] {COMMIT_EXTENSION, INFLIGHT_COMMIT_EXTENSION, SAVEPOINT_EXTENSION, + INFLIGHT_SAVEPOINT_EXTENSION, CLEAN_EXTENSION, INFLIGHT_CLEAN_EXTENSION}); + } + + /** + * For serialization and de-serialization only. + * @deprecated + */ + public HoodieActiveTimeline() { + } + + /** + * This method is only used when this object is deserialized in a spark executor. + * + * @deprecated + */ + private void readObject(java.io.ObjectInputStream in) + throws IOException, ClassNotFoundException { + in.defaultReadObject(); + this.fs = FSUtils.getFs(); + } + + /** + * Get only the commits (inflight and completed) in the active timeline + * + * @return + */ + public HoodieTimeline getCommitTimeline() { + return new HoodieDefaultTimeline(filterInstantsByAction(COMMIT_ACTION), + (Function> & Serializable) this::getInstantDetails); + } + + /** + * Get only the cleaner action (inflight and completed) in the active timeline + * + * @return + */ + public HoodieTimeline getCleanerTimeline() { + return new HoodieDefaultTimeline(filterInstantsByAction(CLEAN_ACTION), + (Function> & Serializable) this::getInstantDetails); + } + + /** + * Get only the save point action (inflight and completed) in the active timeline + * + * @return + */ + public HoodieTimeline getSavePointTimeline() { + return new HoodieDefaultTimeline(filterInstantsByAction(SAVEPOINT_ACTION), + (Function> & Serializable) this::getInstantDetails); + } + + + protected Stream filterInstantsByAction(String action) { + return instants.stream().filter(s -> s.getAction().equals(action)); + } + + public void createInflight(HoodieInstant instant) { + log.info("Creating a new in-flight instant " + instant); + // Create the in-flight file + createFileInMetaPath(instant.getFileName(), Optional.empty()); + } + + public void saveAsComplete(HoodieInstant instant, Optional data) { + log.info("Marking instant complete " + instant); + moveInflightToComplete(instant, HoodieTimeline.getCompletedInstant(instant), data); + log.info("Completed " + instant); + } + + public void revertToInflight(HoodieInstant instant) { + log.info("Reverting instant to inflight " + instant); + moveCompleteToInflight(instant, HoodieTimeline.getInflightInstant(instant)); + log.info("Reverted " + instant + " to inflight"); + } + + public void deleteInflight(HoodieInstant instant) { + log.info("Deleting in-flight " + instant); + Path inFlightCommitFilePath = new Path(metaPath, instant.getFileName()); + try { + boolean result = fs.delete(inFlightCommitFilePath, false); + if (result) { + log.info("Removed in-flight " + instant); + } else { + throw new HoodieIOException("Could not delete in-flight instant " + instant); + } + } catch (IOException e) { + throw new HoodieIOException( + "Could not remove inflight commit " + inFlightCommitFilePath, e); + } + } + + @Override + public Optional getInstantDetails(HoodieInstant instant) { + Path detailPath = new Path(metaPath, instant.getFileName()); + return readDataFromPath(detailPath); + } + + + protected void moveInflightToComplete(HoodieInstant inflight, HoodieInstant completed, + Optional data) { + Path commitFilePath = new Path(metaPath, completed.getFileName()); + try { + // open a new file and write the commit metadata in + Path inflightCommitFile = new Path(metaPath, inflight.getFileName()); + createFileInMetaPath(inflight.getFileName(), data); + boolean success = fs.rename(inflightCommitFile, commitFilePath); + if (!success) { + throw new HoodieIOException( + "Could not rename " + inflightCommitFile + " to " + commitFilePath); + } + } catch (IOException e) { + throw new HoodieIOException("Could not complete " + inflight, e); + } + } + + protected void moveCompleteToInflight(HoodieInstant completed, HoodieInstant inflight) { + Path inFlightCommitFilePath = new Path(metaPath, inflight.getFileName()); + try { + if (!fs.exists(inFlightCommitFilePath)) { + Path commitFilePath = new Path(metaPath, completed.getFileName()); + boolean success = fs.rename(commitFilePath, inFlightCommitFilePath); + if (!success) { + throw new HoodieIOException( + "Could not rename " + commitFilePath + " to " + inFlightCommitFilePath); + } + } + } catch (IOException e) { + throw new HoodieIOException("Could not complete revert " + completed, e); + } + } + + protected void createFileInMetaPath(String filename, Optional content) { + Path fullPath = new Path(metaPath, filename); + try { + if (!content.isPresent()) { + if (fs.createNewFile(fullPath)) { + log.info("Created a new file in meta path: " + fullPath); + return; + } + } else { + FSDataOutputStream fsout = fs.create(fullPath, true); + fsout.write(content.get()); + fsout.close(); + return; + } + throw new HoodieIOException("Failed to create file " + fullPath); + } catch (IOException e) { + throw new HoodieIOException("Failed to create file " + fullPath, e); + } + } + + protected Optional readDataFromPath(Path detailPath) { + FSDataInputStream is = null; + try { + is = fs.open(detailPath); + return Optional.of(IOUtils.toByteArray(is)); + } catch (IOException e) { + throw new HoodieIOException("Could not read commit details from " + detailPath, e); + } finally { + if (is != null) { + Closeables.closeQuietly(is); + } + } + } + + public HoodieActiveTimeline reload() { + return new HoodieActiveTimeline(fs, metaPath); + } +} diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieArchivedCommitTimeline.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieArchivedTimeline.java similarity index 52% rename from hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieArchivedCommitTimeline.java rename to hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieArchivedTimeline.java index d99ad2788..e47954b2a 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieArchivedCommitTimeline.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieArchivedTimeline.java @@ -18,30 +18,42 @@ package com.uber.hoodie.common.table.timeline; import com.google.common.io.Closeables; import com.uber.hoodie.common.table.HoodieTimeline; +import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.exception.HoodieIOException; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; import java.io.IOException; -import java.util.ArrayList; +import java.io.Serializable; import java.util.Arrays; import java.util.HashMap; import java.util.Map; import java.util.Optional; +import java.util.function.Function; +import java.util.stream.Collectors; /** - * Archived commit timeline. These commits are usually cleaned up and the meta data is archived for - * future triaging - * - * @since 0.3.0 + * Represents the Archived Timeline for the HoodieDataset. Instants for the last 12 hours (configurable) + * is in the ActiveTimeline and the rest are in ArchivedTimeline. + *

+ * Instants are read from the archive file during initialization and never refreshed. To refresh, clients + * need to call reload() + *

+ * This class can be serialized and de-serialized and on de-serialization the FileSystem is re-initialized. */ -public class HoodieArchivedCommitTimeline extends HoodieDefaultTimeline { +public class HoodieArchivedTimeline extends HoodieDefaultTimeline { private static final String HOODIE_COMMIT_ARCHIVE_LOG_FILE = "commits.archived"; - private transient Map readCommits = new HashMap<>(); + private transient FileSystem fs; + private String metaPath; + private Map readCommits = new HashMap<>(); - public HoodieArchivedCommitTimeline(FileSystem fs, String metaPath) { + private final transient static Logger log = LogManager.getLogger(HoodieArchivedTimeline.class); + + public HoodieArchivedTimeline(FileSystem fs, String metaPath) { // Read back the commits to make sure Path archiveLogPath = getArchiveLogPath(metaPath); try { @@ -55,8 +67,9 @@ public class HoodieArchivedCommitTimeline extends HoodieDefaultTimeline { // This is okay because only tooling will load the archived commit timeline today readCommits.put(key.toString(), Arrays.copyOf(val.getBytes(), val.getLength())); } - this.instants = new ArrayList<>(readCommits.keySet()); - this.inflights = new ArrayList<>(0); + this.instants = readCommits.keySet().stream().map( + s -> new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, s)).collect( + Collectors.toList()); } finally { Closeables.closeQuietly(reader); } @@ -64,58 +77,42 @@ public class HoodieArchivedCommitTimeline extends HoodieDefaultTimeline { throw new HoodieIOException( "Could not load archived commit timeline from path " + archiveLogPath, e); } + // multiple casts will make this lambda serializable - http://docs.oracle.com/javase/specs/jls/se8/html/jls-15.html#jls-15.16 + this.details = (Function> & Serializable) this::getInstantDetails; + this.fs = fs; + this.metaPath = metaPath; } - @Override - public void saveInstantAsInflight(String instant) { - throw new UnsupportedOperationException( - "Could not save inflight instant in ArchivedTimeline " + instant); + /** + * For serialization and de-serialization only. + * @deprecated + */ + public HoodieArchivedTimeline() { } - @Override - public void saveInstantAsComplete(String instant, Optional data) { - throw new UnsupportedOperationException( - "Could not save instant as complete in ArchivedTimeline " + instant); + /** + * This method is only used when this object is deserialized in a spark executor. + * + * @deprecated + */ + private void readObject(java.io.ObjectInputStream in) + throws IOException, ClassNotFoundException { + in.defaultReadObject(); + this.fs = FSUtils.getFs(); } - @Override - public void revertInstantToInflight(String instant) { - throw new UnsupportedOperationException( - "Could not revert instant in ArchivedTimeline " + instant); - } - - @Override - public void removeInflightFromTimeline(String instant) { - throw new UnsupportedOperationException( - "Could not delete inflight instant from ArchivedTimeline " + instant); - } - - @Override - public HoodieTimeline reload() throws IOException { - return new HoodieArchivedCommitTimeline(fs, metaPath); - } - - @Override - public Optional readInstantDetails(String instant) { - return Optional.ofNullable(readCommits.get(instant)); - } - - @Override - protected String getInflightFileName(String instant) { - throw new UnsupportedOperationException("No inflight filename for archived commits"); - } - - @Override - protected String getCompletedFileName(String instant) { - throw new UnsupportedOperationException("No inflight filename for archived commits"); - } - - @Override - protected String getTimelineName() { - return "archived-commits"; - } public static Path getArchiveLogPath(String metaPath) { return new Path(metaPath, HOODIE_COMMIT_ARCHIVE_LOG_FILE); } + + @Override + public Optional getInstantDetails(HoodieInstant instant) { + return Optional.ofNullable(readCommits.get(instant.getTimestamp())); + } + + public HoodieArchivedTimeline reload() { + return new HoodieArchivedTimeline(fs, metaPath); + } + } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieCleanerTimeline.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieCleanerTimeline.java deleted file mode 100644 index d8a9ed8e0..000000000 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieCleanerTimeline.java +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.uber.hoodie.common.table.timeline; - -import com.uber.hoodie.common.table.HoodieTableMetaClient; -import com.uber.hoodie.common.table.HoodieTimeline; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; - -import java.io.IOException; -import java.util.Optional; - -public class HoodieCleanerTimeline extends HoodieDefaultTimeline { - public HoodieCleanerTimeline(FileSystem fs, String path) { - super(fs, path, HoodieTableMetaClient.CLEAN_EXTENSION); - } - - @Override - public HoodieTimeline reload() throws IOException { - return new HoodieCleanerTimeline(fs, metaPath); - } - - @Override - public Optional readInstantDetails(String instant) { - // TODO - Nothing about the clean written today - this should change - return Optional.empty(); - } - - @Override - protected String getInflightFileName(String instant) { - return HoodieTableMetaClient.makeInflightCleanerFileName(instant); - } - - @Override - protected String getCompletedFileName(String instant) { - return HoodieTableMetaClient.makeCleanerFileName(instant); - } - - @Override - protected String getTimelineName() { - return "cleaner"; - } -} diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieDefaultTimeline.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieDefaultTimeline.java index c9143890d..0f43f9c4a 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieDefaultTimeline.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieDefaultTimeline.java @@ -16,329 +16,131 @@ package com.uber.hoodie.common.table.timeline; -import com.google.common.io.Closeables; -import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.table.HoodieTimeline; -import com.uber.hoodie.common.util.FSUtils; -import com.uber.hoodie.exception.HoodieIOException; -import org.apache.commons.io.IOUtils; -import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; -import java.io.IOException; -import java.util.Arrays; import java.util.List; import java.util.Optional; +import java.util.function.Function; import java.util.stream.Collectors; import java.util.stream.Stream; /** - * HoodieTimeline allows representation of meta-data events as a timeline. - * Instants are specific points in time represented as strings. - * in this format YYYYMMDDHHmmSS. e.g. 20170101193218 - * Any operation on the timeline starts with the inflight instant and then when complete marks - * the completed instant and removes the inflight instant. - * Completed instants are plainly referred to as just instants - *

- * Timelines as immutable once created. Any operation to change the timeline (like create/delete instants) - * will not be reflected unless explicitly reloaded using the reload() + * HoodieDefaultTimeline is a default implementation of the HoodieTimeline. + * It provides methods to inspect a List[HoodieInstant]. Function to get the details of the instant + * is passed in as a lamdba. * - * @see com.uber.hoodie.common.table.HoodieTableMetaClient * @see HoodieTimeline - * @since 0.3.0 */ -public abstract class HoodieDefaultTimeline implements HoodieTimeline { +public class HoodieDefaultTimeline implements HoodieTimeline { private final transient static Logger log = LogManager.getLogger(HoodieDefaultTimeline.class); - public static final String INFLIGHT_EXTENSION = ".inflight"; - protected String metaPath; - protected transient FileSystem fs; - protected List inflights; - protected List instants; + protected Function> details; + protected List instants; - public HoodieDefaultTimeline(FileSystem fs, String metaPath, String fileExtension) { - String completedInstantExtension = fileExtension; - String inflightInstantExtension = fileExtension + INFLIGHT_EXTENSION; - - FileStatus[] fileStatuses; - try { - fileStatuses = HoodieTableMetaClient.scanFiles(fs, new Path(metaPath), - path -> path.toString().endsWith(completedInstantExtension) || path.toString() - .endsWith(inflightInstantExtension)); - } catch (IOException e) { - throw new HoodieIOException("Failed to scan metadata", e); - } - this.instants = Arrays.stream(fileStatuses) - .filter(status -> status.getPath().getName().endsWith(completedInstantExtension)) - .map(fileStatus -> fileStatus.getPath().getName().replaceAll(completedInstantExtension, "")) - .sorted().collect(Collectors.toList()); - this.inflights = Arrays.stream(fileStatuses).filter( - status -> status.getPath().getName().endsWith(inflightInstantExtension)).map( - fileStatus -> fileStatus.getPath().getName() - .replaceAll(inflightInstantExtension, "")).sorted() - .collect(Collectors.toList()); - this.fs = fs; - this.metaPath = metaPath; - } - - public HoodieDefaultTimeline(Stream instants, Stream inflights) { + public HoodieDefaultTimeline(Stream instants, + Function> details) { this.instants = instants.collect(Collectors.toList()); - this.inflights = inflights.collect(Collectors.toList()); - } - - /** - * This constructor only supports backwards compatibility in inflight commits in ActiveCommitTimeline. - * This should never be used. - * - * @param fs - * @param metaPath - * @deprecated - */ - public HoodieDefaultTimeline(FileSystem fs, String metaPath) { - this.fs = fs; - this.metaPath = metaPath; + this.details = details; } /** * For serailizing and de-serializing + * * @deprecated */ public HoodieDefaultTimeline() { } + public HoodieTimeline filterInflights() { + return new HoodieDefaultTimeline(instants.stream().filter(HoodieInstant::isInflight), + details); + } - /** - * This method is only used when this object is deserialized in a spark executor. - * @deprecated - */ - private void readObject(java.io.ObjectInputStream in) - throws IOException, ClassNotFoundException { - in.defaultReadObject(); - this.fs = FSUtils.getFs(); + public HoodieTimeline filterCompletedInstants() { + return new HoodieDefaultTimeline(instants.stream().filter(s -> !s.isInflight()), details); } @Override - public Stream findInstantsInRange(String startTs, String endTs) { - return instants.stream().filter( - s -> compareInstants(s, startTs, GREATER) && compareInstants(s, endTs, - LESSER_OR_EQUAL)); + public HoodieDefaultTimeline findInstantsInRange(String startTs, String endTs) { + return new HoodieDefaultTimeline(instants.stream().filter( + s -> compareTimestamps(s.getTimestamp(), startTs, GREATER) && compareTimestamps( + s.getTimestamp(), endTs, LESSER_OR_EQUAL)), details); } @Override - public Stream findInstantsAfter(String commitTime, int numCommits) { - return instants.stream().filter(s -> compareInstants(s, commitTime, GREATER)) - .limit(numCommits); + public HoodieDefaultTimeline findInstantsAfter(String commitTime, int numCommits) { + return new HoodieDefaultTimeline( + instants.stream().filter(s -> compareTimestamps(s.getTimestamp(), commitTime, GREATER)) + .limit(numCommits), details); } @Override - public boolean hasInstants() { - return instants.stream().count() != 0; + public boolean empty() { + return !instants.stream().findFirst().isPresent(); } @Override - public boolean hasInflightInstants() { - return inflights.stream().count() != 0; - } - - @Override - public int getTotalInstants() { + public int countInstants() { return new Long(instants.stream().count()).intValue(); } @Override - public Optional firstInstant() { + public Optional firstInstant() { return instants.stream().findFirst(); } @Override - public Optional nthInstant(int n) { - if(!hasInstants() || n >= getTotalInstants()) { + public Optional nthInstant(int n) { + if (empty() || n >= countInstants()) { return Optional.empty(); } return Optional.of(instants.get(n)); } @Override - public Optional lastInstant() { - return hasInstants() ? nthInstant(getTotalInstants() - 1) : Optional.empty(); + public Optional lastInstant() { + return empty() ? Optional.empty() : nthInstant(countInstants() - 1); } @Override - public Optional nthFromLastInstant(int n) { - if(getTotalInstants() < n + 1) { + public Optional nthFromLastInstant(int n) { + if (countInstants() < n + 1) { return Optional.empty(); } - return nthInstant(getTotalInstants() - 1 - n); + return nthInstant(countInstants() - 1 - n); } @Override - public boolean containsInstant(String instant) { + public boolean containsInstant(HoodieInstant instant) { return instants.stream().anyMatch(s -> s.equals(instant)); } @Override public boolean containsOrBeforeTimelineStarts(String instant) { - return containsInstant(instant) || isInstantBeforeTimelineStarts(instant); + return instants.stream().anyMatch(s -> s.getTimestamp().equals(instant)) + || isBeforeTimelineStarts(instant); } @Override - public Stream getInstants() { + public Stream getInstants() { return instants.stream(); } @Override - public Stream getInflightInstants() { - return inflights.stream(); + public boolean isBeforeTimelineStarts(String instant) { + Optional firstCommit = firstInstant(); + return firstCommit.isPresent() && compareTimestamps(instant, + firstCommit.get().getTimestamp(), LESSER); } + @Override - public boolean isInstantBeforeTimelineStarts(String instant) { - Optional firstCommit = firstInstant(); - return firstCommit.isPresent() && compareInstants(instant, firstCommit.get(), LESSER); + public Optional getInstantDetails(HoodieInstant instant) { + return details.apply(instant); } - @Override - public void saveInstantAsInflight(String instant) { - log.info("Creating a new in-flight " + getTimelineName() + " " + instant); - // Create the in-flight file - createFileInMetaPath(getInflightFileName(instant), Optional.empty()); - } - - @Override - public void saveInstantAsComplete(String instant, Optional data) { - log.info("Marking complete " + getTimelineName() + " " + instant); - moveInflightToComplete(instant, data, getCompletedFileName(instant), - HoodieTableMetaClient.makeInflightCommitFileName(instant)); - log.info("Completed " + getTimelineName() + " " + instant); - } - - @Override - public void revertInstantToInflight(String instant) { - log.info("Reverting instant to inflight " + getTimelineName() + " " + instant); - moveCompleteToInflight(instant, getCompletedFileName(instant), - getInflightFileName(instant)); - log.info("Reverted " + getTimelineName() + " " + instant + " to inflight"); - } - - @Override - public void removeInflightFromTimeline(String instant) { - log.info("Removing in-flight " + getTimelineName() + " " + instant); - String inFlightCommitFileName = getInflightFileName(instant); - Path inFlightCommitFilePath = new Path(metaPath, inFlightCommitFileName); - try { - fs.delete(inFlightCommitFilePath, false); - log.info("Removed in-flight " + getTimelineName() + " " + instant); - } catch (IOException e) { - throw new HoodieIOException( - "Could not remove inflight commit " + inFlightCommitFilePath, e); - } - } - - @Override - public Optional readInstantDetails(String instant) { - Path detailPath = new Path(metaPath, getCompletedFileName(instant)); - return readDataFromPath(detailPath); - } - - - /** - * Get the in-flight instant file name - * - * @param instant - * @return - */ - protected abstract String getInflightFileName(String instant); - - /** - * Get the completed instant file name - * - * @param instant - * @return - */ - protected abstract String getCompletedFileName(String instant); - - /** - * Get the timeline name - * - * @return - */ - protected abstract String getTimelineName(); - - - protected void moveInflightToComplete(String instant, Optional data, - String commitFileName, String inflightFileName) { - Path commitFilePath = new Path(metaPath, commitFileName); - try { - // open a new file and write the commit metadata in - Path inflightCommitFile = new Path(metaPath, inflightFileName); - createFileInMetaPath(inflightFileName, data); - boolean success = fs.rename(inflightCommitFile, commitFilePath); - if (!success) { - throw new HoodieIOException( - "Could not rename " + inflightCommitFile + " to " + commitFilePath); - } - } catch (IOException e) { - throw new HoodieIOException("Could not complete commit " + instant, e); - } - } - - protected void moveCompleteToInflight(String instant, String commitFileName, - String inflightFileName) { - Path inFlightCommitFilePath = new Path(metaPath, inflightFileName); - try { - if (!fs.exists(inFlightCommitFilePath)) { - Path commitFilePath = new Path(metaPath, commitFileName); - boolean success = fs.rename(commitFilePath, inFlightCommitFilePath); - if (!success) { - throw new HoodieIOException( - "Could not rename " + commitFilePath + " to " + inFlightCommitFilePath); - } - } - } catch (IOException e) { - throw new HoodieIOException("Could not complete commit revert " + instant, e); - } - } - - protected void createFileInMetaPath(String filename, Optional content) { - Path fullPath = new Path(metaPath, filename); - try { - if (!content.isPresent()) { - if (fs.createNewFile(fullPath)) { - log.info("Created a new file in meta path: " + fullPath); - return; - } - } else { - FSDataOutputStream fsout = fs.create(fullPath, true); - fsout.write(content.get()); - fsout.close(); - return; - } - throw new HoodieIOException("Failed to create file " + fullPath); - } catch (IOException e) { - throw new HoodieIOException("Failed to create file " + fullPath, e); - } - } - - protected Optional readDataFromPath(Path detailPath) { - FSDataInputStream is = null; - try { - is = fs.open(detailPath); - return Optional.of(IOUtils.toByteArray(is)); - } catch (IOException e) { - throw new HoodieIOException("Could not read commit details from " + detailPath, e); - } finally { - if (is != null) { - Closeables.closeQuietly(is); - } - } - } - - @Override public String toString() { return this.getClass().getName() + ": " + instants.stream().map(Object::toString) diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieInstant.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieInstant.java new file mode 100644 index 000000000..1d67bbeb9 --- /dev/null +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieInstant.java @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.common.table.timeline; + +import com.google.common.io.Files; +import com.uber.hoodie.common.table.HoodieTimeline; +import org.apache.hadoop.fs.FileStatus; + +import java.io.Serializable; +import java.util.Objects; +import java.util.Optional; + +/** + * A Hoodie Instant represents a action done on a hoodie dataset. + * All actions start with a inflight instant and then create a completed instant after done. + * + * @see HoodieTimeline + */ +public class HoodieInstant implements Serializable { + private boolean isInflight = false; + private String action; + private String timestamp; + + /** + * Load the instant from the meta FileStatus + * @param fileStatus + */ + public HoodieInstant(FileStatus fileStatus) { + // First read the instant timestamp. [==>20170101193025<==].commit + String fileName = fileStatus.getPath().getName(); + String fileExtension = Files.getFileExtension(fileName); + timestamp = fileName.replace("." + fileExtension, ""); + + // Next read the action for this marker + action = fileExtension; + if(action.equals("inflight")) { + // This is to support backwards compatibility on how in-flight commit files were written + // General rule is inflight extension is ..inflight, but for commit it is .inflight + action = "commit"; + isInflight = true; + } else if (action.contains(HoodieTimeline.INFLIGHT_EXTENSION)) { + isInflight = true; + action = action.replace(HoodieTimeline.INFLIGHT_EXTENSION, ""); + } + } + + public HoodieInstant(boolean isInflight, String action, String timestamp) { + this.isInflight = isInflight; + this.action = action; + this.timestamp = timestamp; + } + + public boolean isInflight() { + return isInflight; + } + + public String getAction() { + return action; + } + + public String getTimestamp() { + return timestamp; + } + + /** + * Get the filename for this instant + * @return + */ + public String getFileName() { + if (HoodieTimeline.COMMIT_ACTION.equals(action)) { + return isInflight ? + HoodieTimeline.makeInflightCommitFileName(timestamp) : + HoodieTimeline.makeCommitFileName(timestamp); + } else if (HoodieTimeline.CLEAN_ACTION.equals(action)) { + return isInflight ? + HoodieTimeline.makeInflightCleanerFileName(timestamp) : + HoodieTimeline.makeCleanerFileName(timestamp); + } else if (HoodieTimeline.SAVEPOINT_ACTION.equals(action)) { + return isInflight ? + HoodieTimeline.makeInflightSavePointFileName(timestamp) : + HoodieTimeline.makeSavePointFileName(timestamp); + } + throw new IllegalArgumentException("Cannot get file name for unknown action " + action); + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + HoodieInstant that = (HoodieInstant) o; + return isInflight == that.isInflight && + Objects.equals(action, that.action) && + Objects.equals(timestamp, that.timestamp); + } + + @Override + public int hashCode() { + return Objects.hash(isInflight, action, timestamp); + } + + @Override + public String toString() { + return "[" + ((isInflight) ? "==>" : "") + timestamp + "__" + action + "]"; + } +} diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieSavePointTimeline.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieSavePointTimeline.java deleted file mode 100644 index 3bc1748ff..000000000 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieSavePointTimeline.java +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.uber.hoodie.common.table.timeline; - -import com.uber.hoodie.common.table.HoodieTableMetaClient; -import com.uber.hoodie.common.table.HoodieTimeline; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; - -import java.io.IOException; - -public class HoodieSavePointTimeline extends HoodieDefaultTimeline { - public HoodieSavePointTimeline(FileSystem fs, String metaPath) { - super(fs, metaPath, HoodieTableMetaClient.SAVEPOINT_EXTENSION); - } - - @Override - public HoodieTimeline reload() throws IOException { - return new HoodieSavePointTimeline(fs, metaPath); - } - - @Override - protected String getInflightFileName(String instant) { - return HoodieTableMetaClient.makeInflightSavePointFileName(instant); - } - - @Override - protected String getCompletedFileName(String instant) { - return HoodieTableMetaClient.makeSavePointFileName(instant); - } - - @Override - protected String getTimelineName() { - return "savepoint"; - } -} diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/view/AbstractTableFileSystemView.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/view/AbstractTableFileSystemView.java index e20700518..032d27096 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/table/view/AbstractTableFileSystemView.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/view/AbstractTableFileSystemView.java @@ -20,6 +20,7 @@ import com.uber.hoodie.common.model.HoodieDataFile; import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.table.TableFileSystemView; import com.uber.hoodie.common.table.HoodieTimeline; +import com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.exception.HoodieIOException; import org.apache.commons.lang3.tuple.Pair; import org.apache.hadoop.fs.FileStatus; @@ -38,7 +39,7 @@ import java.util.stream.Stream; /** * Common abstract implementation for multiple TableFileSystemView Implementations. * 2 possible implementations are ReadOptimizedView and RealtimeView - * + *

* Concrete implementations extending this abstract class, should only implement * listDataFilesInPartition which includes files to be included in the view * @@ -54,24 +55,26 @@ public abstract class AbstractTableFileSystemView implements TableFileSystemView public AbstractTableFileSystemView(FileSystem fs, HoodieTableMetaClient metaClient) { this.metaClient = metaClient; this.fs = fs; - this.activeCommitTimeline = metaClient.getActiveCommitTimeline(); + // Get the active timeline and filter only completed commits + this.activeCommitTimeline = + metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(); } public Stream getLatestDataFilesForFileId(final String partitionPath, String fileId) { - Optional lastInstant = activeCommitTimeline.lastInstant(); + Optional lastInstant = activeCommitTimeline.lastInstant(); if (lastInstant.isPresent()) { - return streamLatestVersionInPartition(partitionPath, lastInstant.get()) + return getLatestVersionInPartition(partitionPath, lastInstant.get().getTimestamp()) .filter(hoodieDataFile -> hoodieDataFile.getFileId().equals(fileId)); } return Stream.empty(); } @Override - public Stream streamLatestVersionInPartition(String partitionPathStr, + public Stream getLatestVersionInPartition(String partitionPathStr, String maxCommitTime) { try { - return streamLatestVersionsBeforeOrOn(listDataFilesInPartition(partitionPathStr), + return getLatestVersionsBeforeOrOn(listDataFilesInPartition(partitionPathStr), maxCommitTime); } catch (IOException e) { throw new HoodieIOException( @@ -81,11 +84,11 @@ public abstract class AbstractTableFileSystemView implements TableFileSystemView @Override - public Stream> streamEveryVersionInPartition(String partitionPath) { + public Stream> getEveryVersionInPartition(String partitionPath) { try { - if(activeCommitTimeline.lastInstant().isPresent()) { - return streamFilesByFileId(listDataFilesInPartition(partitionPath), - activeCommitTimeline.lastInstant().get()); + if (activeCommitTimeline.lastInstant().isPresent()) { + return getFilesByFileId(listDataFilesInPartition(partitionPath), + activeCommitTimeline.lastInstant().get().getTimestamp()); } return Stream.empty(); } catch (IOException e) { @@ -98,13 +101,14 @@ public abstract class AbstractTableFileSystemView implements TableFileSystemView throws IOException; @Override - public Stream streamLatestVersionInRange(FileStatus[] fileStatuses, + public Stream getLatestVersionInRange(FileStatus[] fileStatuses, List commitsToReturn) { - if (!activeCommitTimeline.hasInstants() || commitsToReturn.isEmpty()) { + if (activeCommitTimeline.empty() || commitsToReturn.isEmpty()) { return Stream.empty(); } try { - return streamFilesByFileId(fileStatuses, activeCommitTimeline.lastInstant().get()) + return getFilesByFileId(fileStatuses, + activeCommitTimeline.lastInstant().get().getTimestamp()) .map((Function, Optional>) fss -> { for (HoodieDataFile fs : fss) { if (commitsToReturn.contains(fs.getCommitTime())) { @@ -120,17 +124,18 @@ public abstract class AbstractTableFileSystemView implements TableFileSystemView } @Override - public Stream streamLatestVersionsBeforeOrOn(FileStatus[] fileStatuses, + public Stream getLatestVersionsBeforeOrOn(FileStatus[] fileStatuses, String maxCommitToReturn) { try { - if (!activeCommitTimeline.hasInstants()) { + if (activeCommitTimeline.empty()) { return Stream.empty(); } - return streamFilesByFileId(fileStatuses, activeCommitTimeline.lastInstant().get()) + return getFilesByFileId(fileStatuses, + activeCommitTimeline.lastInstant().get().getTimestamp()) .map((Function, Optional>) fss -> { for (HoodieDataFile fs1 : fss) { if (activeCommitTimeline - .compareInstants(fs1.getCommitTime(), maxCommitToReturn, + .compareTimestamps(fs1.getCommitTime(), maxCommitToReturn, HoodieTimeline.LESSER_OR_EQUAL)) { return Optional.of(fs1); } @@ -143,19 +148,20 @@ public abstract class AbstractTableFileSystemView implements TableFileSystemView } @Override - public Stream streamLatestVersions(FileStatus[] fileStatuses) { + public Stream getLatestVersions(FileStatus[] fileStatuses) { try { - if (!activeCommitTimeline.hasInstants()) { + if (activeCommitTimeline.empty()) { return Stream.empty(); } - return streamFilesByFileId(fileStatuses, activeCommitTimeline.lastInstant().get()) + return getFilesByFileId(fileStatuses, + activeCommitTimeline.lastInstant().get().getTimestamp()) .map(statuses -> statuses.get(0)); } catch (IOException e) { throw new HoodieIOException("Could not filter files for latest version ", e); } } - protected Stream> streamFilesByFileId(FileStatus[] files, + protected Stream> getFilesByFileId(FileStatus[] files, String maxCommitTime) throws IOException { return groupFilesByFileId(files, maxCommitTime).values().stream(); } @@ -173,7 +179,7 @@ public abstract class AbstractTableFileSystemView implements TableFileSystemView return Arrays.stream(files).flatMap(fileStatus -> { HoodieDataFile dataFile = new HoodieDataFile(fileStatus); if (activeCommitTimeline.containsOrBeforeTimelineStarts(dataFile.getCommitTime()) - && activeCommitTimeline.compareInstants(dataFile.getCommitTime(), maxCommitTime, + && activeCommitTimeline.compareTimestamps(dataFile.getCommitTime(), maxCommitTime, HoodieTimeline.LESSER_OR_EQUAL)) { return Stream.of(Pair.of(dataFile.getFileId(), dataFile)); } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/view/ReadOptimizedTableView.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/view/ReadOptimizedTableView.java index 6978326ee..53d440d8b 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/table/view/ReadOptimizedTableView.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/view/ReadOptimizedTableView.java @@ -36,7 +36,7 @@ public class ReadOptimizedTableView extends AbstractTableFileSystemView { Path partitionPath = new Path(metaClient.getBasePath(), partitionPathStr); try { return fs.listStatus(partitionPath, path -> path.getName() - .contains(metaClient.getTableConfig().getROStorageFormat().getFileExtension())); + .contains(metaClient.getTableConfig().getROFileFormat().getFileExtension())); } catch (IOException e) { throw new HoodieIOException( "Failed to list data files in partition " + partitionPathStr, e); diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/util/FSUtils.java b/hoodie-common/src/main/java/com/uber/hoodie/common/util/FSUtils.java index 09d592d4a..dc5b2c352 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/util/FSUtils.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/util/FSUtils.java @@ -16,6 +16,10 @@ package com.uber.hoodie.common.util; +import com.google.common.base.Preconditions; +import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.table.HoodieTimeline; +import com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.exception.HoodieIOException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; @@ -24,6 +28,7 @@ import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; +import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.List; @@ -91,18 +96,27 @@ public class FSUtils { /** * Obtain all the partition paths, that are present in this table. */ - public static List getAllPartitionPaths(FileSystem fs, String basePath) throws IOException { + public static List getAllPartitionPaths(FileSystem fs, String basePath) + throws IOException { List partitionsToClean = new ArrayList<>(); // TODO(vc): For now, assume partitions are two levels down from base path. FileStatus[] folders = fs.globStatus(new Path(basePath + "/*/*/*")); for (FileStatus status : folders) { Path path = status.getPath(); - partitionsToClean.add(String.format("%s/%s/%s", - path.getParent().getParent().getName(), - path.getParent().getName(), - path.getName())); + partitionsToClean.add(String.format("%s/%s/%s", path.getParent().getParent().getName(), + path.getParent().getName(), path.getName())); } return partitionsToClean; } + public static String getFileExtension(String fullName) { + Preconditions.checkNotNull(fullName); + String fileName = (new File(fullName)).getName(); + int dotIndex = fileName.indexOf('.'); + return dotIndex == -1 ? "" : fileName.substring(dotIndex); + } + + public static String getInstantTime(String name) { + return name.replace(getFileExtension(name), ""); + } } diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/model/HoodieTestUtils.java b/hoodie-common/src/test/java/com/uber/hoodie/common/model/HoodieTestUtils.java index b28284883..3a2aaa8dc 100644 --- a/hoodie-common/src/test/java/com/uber/hoodie/common/model/HoodieTestUtils.java +++ b/hoodie-common/src/test/java/com/uber/hoodie/common/model/HoodieTestUtils.java @@ -19,8 +19,10 @@ package com.uber.hoodie.common.model; import com.esotericsoftware.kryo.Kryo; import com.esotericsoftware.kryo.io.Input; import com.esotericsoftware.kryo.io.Output; +import com.esotericsoftware.kryo.serializers.JavaSerializer; import com.uber.hoodie.common.table.HoodieTableConfig; import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.util.FSUtils; import org.apache.hadoop.fs.FileSystem; @@ -30,6 +32,8 @@ import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; import java.io.Serializable; import java.text.SimpleDateFormat; import java.util.Date; @@ -66,13 +70,13 @@ public class HoodieTestUtils { public static final void createCommitFiles(String basePath, String... commitTimes) throws IOException { for (String commitTime: commitTimes) { - new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME+ "/" + HoodieTableMetaClient.makeCommitFileName(commitTime)).createNewFile(); + new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME+ "/" + HoodieTimeline.makeCommitFileName(commitTime)).createNewFile(); } } public static final void createInflightCommitFiles(String basePath, String... commitTimes) throws IOException { for (String commitTime: commitTimes) { - new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME+ "/" + HoodieTableMetaClient.makeInflightCommitFileName(commitTime)).createNewFile(); + new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME+ "/" + HoodieTimeline.makeInflightCommitFileName(commitTime)).createNewFile(); } } @@ -97,15 +101,15 @@ public class HoodieTestUtils { } public static final boolean doesCommitExist(String basePath, String commitTime) { - return new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME+ "/" + commitTime + HoodieTableMetaClient.COMMIT_EXTENSION).exists(); + return new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME+ "/" + commitTime + HoodieTimeline.COMMIT_EXTENSION).exists(); } public static final boolean doesInflightExist(String basePath, String commitTime) { - return new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME+ "/" + commitTime + HoodieTableMetaClient.INFLIGHT_FILE_SUFFIX).exists(); + return new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME+ "/" + commitTime + HoodieTimeline.INFLIGHT_EXTENSION).exists(); } public static String makeInflightTestFileName(String instant) { - return instant + TEST_EXTENSION + HoodieTableMetaClient.INFLIGHT_FILE_SUFFIX; + return instant + TEST_EXTENSION + HoodieTimeline.INFLIGHT_EXTENSION; } public static String makeTestFileName(String instant) { @@ -123,9 +127,12 @@ public class HoodieTestUtils { assert !iter1.hasNext() && !iter2.hasNext(); } - public static T serializeDeserialize(T object, Class clazz) { + public static T serializeDeserialize(T object, Class clazz) + throws IOException, ClassNotFoundException { // Using Kyro as the default serializer in Spark Jobs Kryo kryo = new Kryo(); + kryo.register(HoodieTableMetaClient.class, new JavaSerializer()); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); Output output = new Output(baos); kryo.writeObject(output, object); diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/table/HoodieTableMetaClientTest.java b/hoodie-common/src/test/java/com/uber/hoodie/common/table/HoodieTableMetaClientTest.java index 6ec275b15..b70e03c48 100644 --- a/hoodie-common/src/test/java/com/uber/hoodie/common/table/HoodieTableMetaClientTest.java +++ b/hoodie-common/src/test/java/com/uber/hoodie/common/table/HoodieTableMetaClientTest.java @@ -16,14 +16,12 @@ package com.uber.hoodie.common.table; -import com.esotericsoftware.kryo.Kryo; -import com.esotericsoftware.kryo.io.Input; -import com.esotericsoftware.kryo.io.Output; import com.google.common.collect.Lists; import com.uber.hoodie.common.model.HoodieTestUtils; -import com.uber.hoodie.common.table.timeline.HoodieArchivedCommitTimeline; +import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline; +import com.uber.hoodie.common.table.timeline.HoodieArchivedTimeline; +import com.uber.hoodie.common.table.timeline.HoodieInstant; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.ArrayFile; import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; @@ -31,13 +29,15 @@ import org.junit.Before; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; import java.io.IOException; import java.util.Optional; import java.util.stream.Collectors; -import static org.junit.Assert.*; +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; public class HoodieTableMetaClientTest { private HoodieTableMetaClient metaClient; @@ -61,48 +61,53 @@ public class HoodieTableMetaClientTest { } @Test - public void checkSerDe() throws IOException { - // check if this object is serialized and se-serialized, we are able to read from the file system + public void checkSerDe() throws IOException, ClassNotFoundException { + // check if this object is serialized and de-serialized, we are able to read from the file system HoodieTableMetaClient deseralizedMetaClient = HoodieTestUtils.serializeDeserialize(metaClient, HoodieTableMetaClient.class); - HoodieTimeline commitTimeline = metaClient.getActiveCommitTimeline(); - commitTimeline.saveInstantAsInflight("1"); - commitTimeline.saveInstantAsComplete("1", Optional.of("test-detail".getBytes())); + assertNotNull(deseralizedMetaClient); + HoodieActiveTimeline commitTimeline = deseralizedMetaClient.getActiveTimeline(); + HoodieInstant instant = + new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, "1"); + commitTimeline.createInflight(instant); + commitTimeline.saveAsComplete(instant, Optional.of("test-detail".getBytes())); commitTimeline = commitTimeline.reload(); - assertEquals("Commit should be 1", "1", commitTimeline.getInstants().findFirst().get()); + HoodieInstant completedInstant = HoodieTimeline.getCompletedInstant(instant); + assertEquals("Commit should be 1 and completed", completedInstant, + commitTimeline.getInstants().findFirst().get()); assertArrayEquals("Commit value should be \"test-detail\"", "test-detail".getBytes(), - commitTimeline.readInstantDetails("1").get()); + commitTimeline.getInstantDetails(completedInstant).get()); } @Test public void checkCommitTimeline() throws IOException { - HoodieTimeline commitTimeline = metaClient.getActiveCommitTimeline(); - assertFalse("Should be empty commit timeline", - commitTimeline.getInstants().findFirst().isPresent()); - assertFalse("Should be empty commit timeline", - commitTimeline.getInflightInstants().findFirst().isPresent()); - commitTimeline.saveInstantAsInflight("1"); - commitTimeline.saveInstantAsComplete("1", Optional.of("test-detail".getBytes())); + HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline(); + HoodieTimeline activeCommitTimeline = activeTimeline.getCommitTimeline(); + assertTrue("Should be empty commit timeline", activeCommitTimeline.empty()); + + HoodieInstant instant = + new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, "1"); + activeTimeline.createInflight(instant); + activeTimeline.saveAsComplete(instant, Optional.of("test-detail".getBytes())); // Commit timeline should not auto-reload every time getActiveCommitTimeline(), it should be cached - commitTimeline = metaClient.getActiveCommitTimeline(); - assertFalse("Should be empty commit timeline", - commitTimeline.getInstants().findFirst().isPresent()); - assertFalse("Should be empty commit timeline", - commitTimeline.getInflightInstants().findFirst().isPresent()); + activeTimeline = metaClient.getActiveTimeline(); + activeCommitTimeline = activeTimeline.getCommitTimeline(); + assertTrue("Should be empty commit timeline", activeCommitTimeline.empty()); - commitTimeline = commitTimeline.reload(); - assertTrue("Should be the 1 commit we made", - commitTimeline.getInstants().findFirst().isPresent()); - assertEquals("Commit should be 1", "1", commitTimeline.getInstants().findFirst().get()); + HoodieInstant completedInstant = HoodieTimeline.getCompletedInstant(instant); + activeTimeline = activeTimeline.reload(); + activeCommitTimeline = activeTimeline.getCommitTimeline(); + assertFalse("Should be the 1 commit we made", activeCommitTimeline.empty()); + assertEquals("Commit should be 1", completedInstant, + activeCommitTimeline.getInstants().findFirst().get()); assertArrayEquals("Commit value should be \"test-detail\"", "test-detail".getBytes(), - commitTimeline.readInstantDetails("1").get()); + activeCommitTimeline.getInstantDetails(completedInstant).get()); } @Test public void checkArchiveCommitTimeline() throws IOException { - Path archiveLogPath = - HoodieArchivedCommitTimeline.getArchiveLogPath(metaClient.getMetaPath()); + Path archiveLogPath = HoodieArchivedTimeline.getArchiveLogPath(metaClient.getMetaPath()); SequenceFile.Writer writer = SequenceFile .createWriter(HoodieTestUtils.fs.getConf(), SequenceFile.Writer.file(archiveLogPath), SequenceFile.Writer.keyClass(Text.class), @@ -114,13 +119,24 @@ public class HoodieTableMetaClientTest { IOUtils.closeStream(writer); - HoodieTimeline archivedTimeline = metaClient.getArchivedCommitTimeline(); - assertEquals(Lists.newArrayList("1", "2", "3"), + HoodieArchivedTimeline archivedTimeline = metaClient.getArchivedTimeline(); + + HoodieInstant instant1 = + new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "1"); + HoodieInstant instant2 = + new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "2"); + HoodieInstant instant3 = + new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "3"); + + assertEquals(Lists.newArrayList(instant1, instant2, instant3), archivedTimeline.getInstants().collect(Collectors.toList())); - System.out.println(new String( archivedTimeline.readInstantDetails("1").get())); - assertArrayEquals(new Text("data1").getBytes(), archivedTimeline.readInstantDetails("1").get()); - assertArrayEquals(new Text("data2").getBytes(), archivedTimeline.readInstantDetails("2").get()); - assertArrayEquals(new Text("data3").getBytes(), archivedTimeline.readInstantDetails("3").get()); + + assertArrayEquals(new Text("data1").getBytes(), + archivedTimeline.getInstantDetails(instant1).get()); + assertArrayEquals(new Text("data2").getBytes(), + archivedTimeline.getInstantDetails(instant2).get()); + assertArrayEquals(new Text("data3").getBytes(), + archivedTimeline.getInstantDetails(instant3).get()); } diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/table/string/HoodieActiveTimelineTest.java b/hoodie-common/src/test/java/com/uber/hoodie/common/table/string/HoodieActiveTimelineTest.java new file mode 100644 index 000000000..bbbde5945 --- /dev/null +++ b/hoodie-common/src/test/java/com/uber/hoodie/common/table/string/HoodieActiveTimelineTest.java @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.common.table.string; + +import com.uber.hoodie.common.model.HoodieTestUtils; +import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.table.HoodieTimeline; +import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline; +import com.uber.hoodie.common.table.timeline.HoodieInstant; +import org.apache.hadoop.fs.Path; +import org.junit.After; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; + +import java.io.IOException; +import java.util.Optional; +import java.util.stream.Stream; + +import static org.junit.Assert.*; + +public class HoodieActiveTimelineTest { + private HoodieActiveTimeline timeline; + private HoodieTableMetaClient metaClient; + @Rule + public final ExpectedException exception = ExpectedException.none(); + + @Before + public void setUp() throws Exception { + this.metaClient = HoodieTestUtils.initOnTemp(); + } + + @After + public void tearDown() throws Exception { + HoodieTestUtils.fs.delete(new Path(this.metaClient.getBasePath()), true); + } + + @Test + public void testLoadingInstantsFromFiles() throws IOException { + HoodieInstant instant1 = + new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "1"); + HoodieInstant instant2 = + new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "3"); + HoodieInstant instant3 = + new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "5"); + HoodieInstant instant4 = + new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "8"); + HoodieInstant instant5 = + new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, "9"); + + timeline = new HoodieActiveTimeline(HoodieTestUtils.fs, metaClient.getMetaPath()); + timeline.saveAsComplete(instant1, Optional.empty()); + timeline.saveAsComplete(instant2, Optional.empty()); + timeline.saveAsComplete(instant3, Optional.empty()); + timeline.saveAsComplete(instant4, Optional.empty()); + timeline.createInflight(instant5); + timeline = timeline.reload(); + + assertEquals("Total instants should be 5", 5, timeline.countInstants()); + HoodieTestUtils.assertStreamEquals("Check the instants stream", + Stream.of(instant1, instant2, instant3, instant4, instant5), timeline.getInstants()); + HoodieTestUtils.assertStreamEquals("Check the instants stream", + Stream.of(instant1, instant2, instant3, instant4, instant5), + timeline.getCommitTimeline().getInstants()); + HoodieTestUtils.assertStreamEquals("Check the instants stream", + Stream.of(instant1, instant2, instant3, instant4), + timeline.getCommitTimeline().filterCompletedInstants().getInstants()); + HoodieTestUtils.assertStreamEquals("Check the instants stream", Stream.of(instant5), + timeline.getCommitTimeline().filterInflights().getInstants()); + } + + @Test + public void testTimelineOperationsBasic() throws Exception { + timeline = new HoodieActiveTimeline(HoodieTestUtils.fs, metaClient.getMetaPath()); + assertTrue(timeline.empty()); + assertEquals("", 0, timeline.countInstants()); + assertEquals("", Optional.empty(), timeline.firstInstant()); + assertEquals("", Optional.empty(), timeline.nthInstant(5)); + assertEquals("", Optional.empty(), timeline.nthInstant(-1)); + assertEquals("", Optional.empty(), timeline.lastInstant()); + assertFalse("", timeline.containsInstant( + new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "01"))); + } + + @Test + public void testTimelineOperations() throws Exception { + timeline = new MockHoodieTimeline( + Stream.of("01", "03", "05", "07", "09", "11", "13", "15", "17", "19"), + Stream.of("21", "23")); + HoodieTestUtils.assertStreamEquals("", Stream.of("05", "07", "09", "11"), + timeline.getCommitTimeline().filterCompletedInstants().findInstantsInRange("04", "11") + .getInstants().map(HoodieInstant::getTimestamp)); + HoodieTestUtils.assertStreamEquals("", Stream.of("09", "11"), + timeline.getCommitTimeline().filterCompletedInstants().findInstantsAfter("07", 2) + .getInstants().map(HoodieInstant::getTimestamp)); + assertFalse(timeline.empty()); + assertFalse(timeline.getCommitTimeline().filterInflights().empty()); + assertEquals("", 12, timeline.countInstants()); + HoodieTimeline activeCommitTimeline = timeline.getCommitTimeline().filterCompletedInstants(); + assertEquals("", 10, activeCommitTimeline.countInstants()); + + assertEquals("", "01", activeCommitTimeline.firstInstant().get().getTimestamp()); + assertEquals("", "11", activeCommitTimeline.nthInstant(5).get().getTimestamp()); + assertEquals("", "19", activeCommitTimeline.lastInstant().get().getTimestamp()); + assertEquals("", "09", activeCommitTimeline.nthFromLastInstant(5).get().getTimestamp()); + assertTrue("", activeCommitTimeline.containsInstant( + new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "09"))); + assertFalse("", activeCommitTimeline.isBeforeTimelineStarts("02")); + assertTrue("", activeCommitTimeline.isBeforeTimelineStarts("00")); + } +} diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/table/string/HoodieDefaultTimelineTest.java b/hoodie-common/src/test/java/com/uber/hoodie/common/table/string/HoodieDefaultTimelineTest.java deleted file mode 100644 index d6d510d67..000000000 --- a/hoodie-common/src/test/java/com/uber/hoodie/common/table/string/HoodieDefaultTimelineTest.java +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.uber.hoodie.common.table.string; - -import com.uber.hoodie.common.model.HoodieTestUtils; -import com.uber.hoodie.common.table.HoodieTableMetaClient; -import com.uber.hoodie.common.table.HoodieTimeline; -import org.apache.hadoop.fs.Path; -import org.junit.After; -import org.junit.Before; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.ExpectedException; - -import java.io.IOException; -import java.util.Optional; -import java.util.stream.Stream; - -import static org.junit.Assert.*; - -public class HoodieDefaultTimelineTest { - private HoodieTimeline timeline; - private HoodieTableMetaClient metaClient; - @Rule - public final ExpectedException exception = ExpectedException.none(); - - @Before - public void setUp() throws Exception { - this.metaClient = HoodieTestUtils.initOnTemp(); - } - - @After - public void tearDown() throws Exception { - HoodieTestUtils.fs.delete(new Path(this.metaClient.getBasePath()), true); - } - - @Test - public void testLoadingInstantsFromFiles() throws IOException { - timeline = - new MockHoodieTimeline(HoodieTestUtils.fs, metaClient.getMetaPath(), ".test"); - timeline.saveInstantAsComplete("1", Optional.empty()); - timeline.saveInstantAsComplete("3", Optional.empty()); - timeline.saveInstantAsComplete("5", Optional.empty()); - timeline.saveInstantAsComplete("8", Optional.empty()); - timeline.saveInstantAsInflight("9"); - timeline = timeline.reload(); - - assertEquals("Total instants should be 4", 4, timeline.getTotalInstants()); - HoodieTestUtils - .assertStreamEquals("Check the instants stream", Stream.of("1", "3", "5", "8"), - timeline.getInstants()); - assertTrue("Inflights should be present in the timeline", timeline.hasInflightInstants()); - HoodieTestUtils.assertStreamEquals("Check the inflights stream", Stream.of("9"), - timeline.getInflightInstants()); - } - - @Test - public void testTimelineOperationsBasic() throws Exception { - timeline = new MockHoodieTimeline(Stream.empty(), Stream.empty()); - assertFalse(timeline.hasInstants()); - assertFalse(timeline.hasInflightInstants()); - assertEquals("", 0, timeline.getTotalInstants()); - assertEquals("", Optional.empty(), timeline.firstInstant()); - assertEquals("", Optional.empty(), timeline.nthInstant(5)); - assertEquals("", Optional.empty(), timeline.nthInstant(-1)); - assertEquals("", Optional.empty(), timeline.lastInstant()); - assertFalse("", timeline.containsInstant("01")); - } - - @Test - public void testTimelineOperations() throws Exception { - timeline = new MockHoodieTimeline( - Stream.of("01", "03", "05", "07", "09", "11", "13", "15", "17", "19"), - Stream.of("21", "23")); - HoodieTestUtils.assertStreamEquals("", Stream.of("05", "07", "09", "11"), - timeline.findInstantsInRange("04", "11")); - HoodieTestUtils - .assertStreamEquals("", Stream.of("09", "11"), timeline.findInstantsAfter("07", 2)); - assertTrue(timeline.hasInstants()); - assertTrue(timeline.hasInflightInstants()); - assertEquals("", 10, timeline.getTotalInstants()); - assertEquals("", "01", timeline.firstInstant().get()); - assertEquals("", "11", timeline.nthInstant(5).get()); - assertEquals("", "19", timeline.lastInstant().get()); - assertEquals("", "09", timeline.nthFromLastInstant(5).get()); - assertTrue("", timeline.containsInstant("09")); - assertFalse("", timeline.isInstantBeforeTimelineStarts("02")); - assertTrue("", timeline.isInstantBeforeTimelineStarts("00")); - } -} diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/table/string/MockHoodieTimeline.java b/hoodie-common/src/test/java/com/uber/hoodie/common/table/string/MockHoodieTimeline.java index ea72e88d2..050bbe145 100644 --- a/hoodie-common/src/test/java/com/uber/hoodie/common/table/string/MockHoodieTimeline.java +++ b/hoodie-common/src/test/java/com/uber/hoodie/common/table/string/MockHoodieTimeline.java @@ -16,51 +16,29 @@ package com.uber.hoodie.common.table.string; -import com.uber.hoodie.common.model.HoodieTestUtils; import com.uber.hoodie.common.table.HoodieTimeline; -import com.uber.hoodie.common.table.timeline.HoodieDefaultTimeline; -import org.apache.hadoop.fs.FileSystem; +import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline; +import com.uber.hoodie.common.table.timeline.HoodieInstant; import java.io.IOException; -import java.util.Optional; +import java.util.Comparator; +import java.util.function.Function; +import java.util.stream.Collectors; import java.util.stream.Stream; -public class MockHoodieTimeline extends HoodieDefaultTimeline { - private String fileExt; - - public MockHoodieTimeline(FileSystem fs, String metaPath, String fileExtension) +public class MockHoodieTimeline extends HoodieActiveTimeline { + public MockHoodieTimeline(Stream completed, Stream inflights) throws IOException { - super(fs, metaPath, fileExtension); - this.fileExt = fileExtension; - } - - public MockHoodieTimeline(Stream instants, Stream inflights) - throws IOException { - super(instants, inflights); - } - - @Override - public HoodieTimeline reload() throws IOException { - return new MockHoodieTimeline(fs, metaPath, fileExt); - } - - @Override - public Optional readInstantDetails(String instant) { - return Optional.empty(); - } - - @Override - protected String getInflightFileName(String instant) { - return HoodieTestUtils.makeInflightTestFileName(instant); - } - - @Override - protected String getCompletedFileName(String instant) { - return HoodieTestUtils.makeTestFileName(instant); - } - - @Override - protected String getTimelineName() { - return "mock-test"; + super(); + this.instants = Stream.concat(completed + .map(s -> new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, s)), + inflights.map( + s -> new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, s))) + .sorted(Comparator.comparing(new Function() { + @Override + public String apply(HoodieInstant hoodieInstant) { + return hoodieInstant.getFileName(); + } + })).collect(Collectors.toList()); } } diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/table/view/ReadOptimizedTableViewTest.java b/hoodie-common/src/test/java/com/uber/hoodie/common/table/view/ReadOptimizedTableViewTest.java index 8e45f7aa9..f1ec707a2 100644 --- a/hoodie-common/src/test/java/com/uber/hoodie/common/table/view/ReadOptimizedTableViewTest.java +++ b/hoodie-common/src/test/java/com/uber/hoodie/common/table/view/ReadOptimizedTableViewTest.java @@ -23,6 +23,8 @@ import com.uber.hoodie.common.model.HoodieTestUtils; import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.table.TableFileSystemView; +import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline; +import com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.common.util.FSUtils; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; @@ -40,6 +42,7 @@ import java.util.stream.Collectors; import static org.junit.Assert.*; +@SuppressWarnings("ResultOfMethodCallIgnored") public class ReadOptimizedTableViewTest { private HoodieTableMetaClient metaClient; private String basePath; @@ -77,8 +80,10 @@ public class ReadOptimizedTableViewTest { fsView.getLatestDataFilesForFileId(partitionPath, fileId).findFirst().isPresent()); // Make this commit safe - HoodieTimeline commitTimeline = metaClient.getActiveCommitTimeline(); - commitTimeline.saveInstantAsComplete(commitTime1, Optional.empty()); + HoodieActiveTimeline commitTimeline = metaClient.getActiveTimeline(); + HoodieInstant instant1 = + new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime1); + commitTimeline.saveAsComplete(instant1, Optional.empty()); refreshFsView(); assertEquals("", fileName1, fsView.getLatestDataFilesForFileId(partitionPath, fileId).findFirst().get() @@ -94,7 +99,9 @@ public class ReadOptimizedTableViewTest { .getFileName()); // Make it safe - commitTimeline.saveInstantAsComplete(commitTime2, Optional.empty()); + HoodieInstant instant2 = + new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime2); + commitTimeline.saveAsComplete(instant2, Optional.empty()); refreshFsView(); assertEquals("", fileName2, fsView.getLatestDataFilesForFileId(partitionPath, fileId).findFirst().get() @@ -140,7 +147,7 @@ public class ReadOptimizedTableViewTest { refreshFsView(); List statuses1 = - fsView.streamLatestVersionInPartition("2016/05/01", commitTime4) + fsView.getLatestVersionInPartition("2016/05/01", commitTime4) .collect(Collectors.toList()); assertEquals(statuses1.size(), 3); Set filenames = Sets.newHashSet(); @@ -153,7 +160,7 @@ public class ReadOptimizedTableViewTest { // Reset the max commit time List statuses2 = - fsView.streamLatestVersionInPartition("2016/05/01", commitTime3) + fsView.getLatestVersionInPartition("2016/05/01", commitTime3) .collect(Collectors.toList()); assertEquals(statuses2.size(), 3); filenames = Sets.newHashSet(); @@ -204,7 +211,7 @@ public class ReadOptimizedTableViewTest { refreshFsView(); List> statuses1 = - fsView.streamEveryVersionInPartition("2016/05/01").collect(Collectors.toList()); + fsView.getEveryVersionInPartition("2016/05/01").collect(Collectors.toList()); assertEquals(statuses1.size(), 3); for (List status : statuses1) { @@ -269,9 +276,9 @@ public class ReadOptimizedTableViewTest { assertEquals(statuses.length, 7); refreshFsView(); - List statuses1 = - fsView.streamLatestVersionInRange(statuses, Lists.newArrayList(commitTime2, commitTime3)) - .collect(Collectors.toList()); + List statuses1 = fsView + .getLatestVersionInRange(statuses, Lists.newArrayList(commitTime2, commitTime3)) + .collect(Collectors.toList()); assertEquals(statuses1.size(), 2); Set filenames = Sets.newHashSet(); for (HoodieDataFile status : statuses1) { @@ -320,7 +327,7 @@ public class ReadOptimizedTableViewTest { refreshFsView(); List statuses1 = - fsView.streamLatestVersionsBeforeOrOn(statuses, commitTime2) + fsView.getLatestVersionsBeforeOrOn(statuses, commitTime2) .collect(Collectors.toList()); assertEquals(statuses1.size(), 2); Set filenames = Sets.newHashSet(); @@ -371,8 +378,7 @@ public class ReadOptimizedTableViewTest { refreshFsView(); List statuses1 = - fsView.streamLatestVersions(statuses) - .collect(Collectors.toList()); + fsView.getLatestVersions(statuses).collect(Collectors.toList()); assertEquals(statuses1.size(), 3); Set filenames = Sets.newHashSet(); for (HoodieDataFile status : statuses1) { diff --git a/hoodie-hadoop-mr/pom.xml b/hoodie-hadoop-mr/pom.xml index cb1d3c144..26a112817 100644 --- a/hoodie-hadoop-mr/pom.xml +++ b/hoodie-hadoop-mr/pom.xml @@ -79,6 +79,11 @@ org.apache.avro avro + + com.esotericsoftware + kryo + test + diff --git a/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/HoodieInputFormat.java b/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/HoodieInputFormat.java index 21b96fe2a..e8b50d0fd 100644 --- a/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/HoodieInputFormat.java +++ b/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/HoodieInputFormat.java @@ -21,6 +21,7 @@ import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.table.TableFileSystemView; +import com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.common.table.view.ReadOptimizedTableView; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.exception.InvalidDatasetException; @@ -95,7 +96,7 @@ public class HoodieInputFormat extends MapredParquetInputFormat String tableName = metadata.getTableConfig().getTableName(); String mode = HoodieHiveUtil.readMode(Job.getInstance(job), tableName); TableFileSystemView fsView = new ReadOptimizedTableView(FSUtils.getFs(), metadata); - HoodieTimeline timeline = metadata.getActiveCommitTimeline(); + HoodieTimeline timeline = metadata.getActiveTimeline().getCommitTimeline().filterCompletedInstants(); if (HoodieHiveUtil.INCREMENTAL_SCAN_MODE.equals(mode)) { // this is of the form commitTs_partition_sequenceNumber String lastIncrementalTs = HoodieHiveUtil.readStartCommitTime(Job.getInstance(job), tableName); @@ -103,10 +104,10 @@ public class HoodieInputFormat extends MapredParquetInputFormat Integer maxCommits = HoodieHiveUtil.readMaxCommits(Job.getInstance(job), tableName); LOG.info("Last Incremental timestamp was set as " + lastIncrementalTs); List commitsToReturn = - timeline.findInstantsAfter(lastIncrementalTs, maxCommits) - .collect(Collectors.toList()); + timeline.findInstantsAfter(lastIncrementalTs, maxCommits).getInstants() + .map(HoodieInstant::getTimestamp).collect(Collectors.toList()); List filteredFiles = - fsView.streamLatestVersionInRange(value, commitsToReturn) + fsView.getLatestVersionInRange(value, commitsToReturn) .collect(Collectors.toList()); for (HoodieDataFile filteredFile : filteredFiles) { LOG.info("Processing incremental hoodie file - " + filteredFile.getPath()); @@ -116,7 +117,7 @@ public class HoodieInputFormat extends MapredParquetInputFormat "Total paths to process after hoodie incremental filter " + filteredFiles.size()); } else { // filter files on the latest commit found - List filteredFiles = fsView.streamLatestVersions(value).collect(Collectors.toList()); + List filteredFiles = fsView.getLatestVersions(value).collect(Collectors.toList()); LOG.info("Total paths to process after hoodie filter " + filteredFiles.size()); for (HoodieDataFile filteredFile : filteredFiles) { LOG.info("Processing latest hoodie file - " + filteredFile.getPath()); diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HiveIncrementalPuller.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HiveIncrementalPuller.java index 1dd6eabef..e3a8c07fe 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HiveIncrementalPuller.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HiveIncrementalPuller.java @@ -19,6 +19,7 @@ package com.uber.hoodie.utilities; import com.beust.jcommander.JCommander; import com.beust.jcommander.Parameter; import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.exception.HoodieException; import com.uber.hoodie.utilities.exception.HoodieIncrementalPullException; import com.uber.hoodie.utilities.exception.HoodieIncrementalPullSQLException; @@ -267,8 +268,12 @@ public class HiveIncrementalPuller { } HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, targetDataPath); - Optional lastCommit = metadata.getActiveCommitTimeline().lastInstant(); - return lastCommit.orElse("0"); + Optional + lastCommit = metadata.getActiveTimeline().getCommitTimeline().filterCompletedInstants().lastInstant(); + if(lastCommit.isPresent()) { + return lastCommit.get().getTimestamp(); + } + return "0"; } private boolean ensureTempPathExists(FileSystem fs, String lastCommitTime) @@ -299,13 +304,14 @@ public class HiveIncrementalPuller { private String getLastCommitTimePulled(FileSystem fs, String sourceTableLocation) throws IOException { HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, sourceTableLocation); - List commitsToSync = metadata.getActiveCommitTimeline() - .findInstantsAfter(config.fromCommitTime, config.maxCommits) + List commitsToSync = metadata.getActiveTimeline().getCommitTimeline().filterCompletedInstants() + .findInstantsAfter(config.fromCommitTime, config.maxCommits).getInstants().map(HoodieInstant::getTimestamp) .collect(Collectors.toList()); if (commitsToSync.isEmpty()) { log.warn("Nothing to sync. All commits in " + config.sourceTable + " are " + metadata - .getActiveCommitTimeline().getInstants().collect(Collectors.toList()) - + " and from commit time is " + config.fromCommitTime); + .getActiveTimeline().getCommitTimeline().filterCompletedInstants().getInstants() + .collect(Collectors.toList()) + " and from commit time is " + + config.fromCommitTime); return null; } log.info("Syncing commits " + commitsToSync); diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HoodieDeltaStreamer.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HoodieDeltaStreamer.java index 16da1d8ee..ea34b42d7 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HoodieDeltaStreamer.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HoodieDeltaStreamer.java @@ -66,11 +66,11 @@ public class HoodieDeltaStreamer implements Serializable { JavaSparkContext sc = getSparkContext(cfg); FileSystem fs = FSUtils.getFs(); HoodieTableMetaClient targetHoodieMetadata = new HoodieTableMetaClient(fs, cfg.targetPath); - HoodieTimeline timeline = targetHoodieMetadata.getActiveCommitTimeline(); + HoodieTimeline timeline = targetHoodieMetadata.getActiveTimeline().getCommitTimeline().filterCompletedInstants(); String lastCommitPulled = findLastCommitPulled(fs, cfg.dataPath); log.info("Last commit pulled on the source dataset is " + lastCommitPulled); if (!timeline.getInstants().iterator().hasNext() && timeline - .compareInstants(timeline.lastInstant().get(), lastCommitPulled, + .compareTimestamps(timeline.lastInstant().get().getTimestamp(), lastCommitPulled, HoodieTimeline.GREATER)) { // this should never be the case throw new IllegalStateException( diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HoodieSnapshotCopier.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HoodieSnapshotCopier.java index 6acf03fdf..464458fd7 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HoodieSnapshotCopier.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HoodieSnapshotCopier.java @@ -24,6 +24,7 @@ import com.uber.hoodie.common.table.HoodieTableConfig; import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.table.TableFileSystemView; +import com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.common.table.view.ReadOptimizedTableView; import com.uber.hoodie.common.util.FSUtils; import org.apache.hadoop.fs.FileStatus; @@ -66,7 +67,8 @@ public class HoodieSnapshotCopier implements Serializable { final HoodieTableMetaClient tableMetadata = new HoodieTableMetaClient(fs, baseDir); final TableFileSystemView fsView = new ReadOptimizedTableView(fs, tableMetadata); // Get the latest commit - final Optional latestCommit = tableMetadata.getActiveCommitTimeline().lastInstant(); + final Optional + latestCommit = tableMetadata.getActiveTimeline().getCommitTimeline().filterCompletedInstants().lastInstant(); if(!latestCommit.isPresent()) { logger.warn("No commits present. Nothing to snapshot"); } else { @@ -91,7 +93,7 @@ public class HoodieSnapshotCopier implements Serializable { FileSystem fs = FSUtils.getFs(); List> filePaths = new ArrayList<>(); for (HoodieDataFile hoodieDataFile : fsView - .streamLatestVersionInPartition(partition, latestCommit.get()) + .getLatestVersionInPartition(partition, latestCommit.get().getTimestamp()) .collect(Collectors.toList())) { filePaths.add(new Tuple2<>(partition, hoodieDataFile.getPath())); } @@ -124,8 +126,8 @@ public class HoodieSnapshotCopier implements Serializable { } else { String commitTime = FSUtils.getCommitFromCommitFile(commitFilePath.getName()); - return tableMetadata.getActiveCommitTimeline() - .compareInstants(commitTime, latestCommit.get(), HoodieTimeline.GREATER); + return tableMetadata.getActiveTimeline().getCommitTimeline() + .compareTimestamps(commitTime, latestCommit.get().getTimestamp(), HoodieTimeline.GREATER); } } });