diff --git a/hoodie-cli/pom.xml b/hoodie-cli/pom.xml index f4c384c85..d3a586624 100644 --- a/hoodie-cli/pom.xml +++ b/hoodie-cli/pom.xml @@ -61,14 +61,6 @@ - - org.apache.maven.plugins - maven-compiler-plugin - - 1.5 - 1.5 - - org.apache.maven.plugins maven-dependency-plugin @@ -203,6 +195,7 @@ joda-time 2.9.6 + diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/HoodieCLI.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/HoodieCLI.java index 5c8e6e9ca..0b8e8fced 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/HoodieCLI.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/HoodieCLI.java @@ -16,7 +16,7 @@ package com.uber.hoodie.cli; -import com.uber.hoodie.common.model.HoodieTableMetadata; +import com.uber.hoodie.common.table.HoodieTableMetaClient; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; @@ -26,8 +26,8 @@ public class HoodieCLI { public static Configuration conf; public static FileSystem fs; public static CLIState state = CLIState.INIT; - public static HoodieTableMetadata tableMetadata; - public static HoodieTableMetadata syncTableMetadata; + public static HoodieTableMetaClient tableMetadata; + public static HoodieTableMetaClient syncTableMetadata; public enum CLIState { @@ -48,7 +48,7 @@ public class HoodieCLI { } } - public static void setTableMetadata(HoodieTableMetadata tableMetadata) { + public static void setTableMetadata(HoodieTableMetaClient tableMetadata) { HoodieCLI.tableMetadata = tableMetadata; } } diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/HoodiePrompt.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/HoodiePrompt.java index e44c62dfd..31fca3eb0 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/HoodiePrompt.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/HoodiePrompt.java @@ -31,13 +31,13 @@ public class HoodiePrompt extends DefaultPromptProvider { case INIT: return "hoodie->"; case DATASET: - return "hoodie:" + HoodieCLI.tableMetadata.getTableName() + 
"->"; + return "hoodie:" + HoodieCLI.tableMetadata.getTableConfig().getTableName() + "->"; case SYNC: - return "hoodie:" + HoodieCLI.tableMetadata.getTableName() + " <==> " - + HoodieCLI.syncTableMetadata.getTableName() + "->"; + return "hoodie:" + HoodieCLI.tableMetadata.getTableConfig().getTableName() + " <==> " + + HoodieCLI.syncTableMetadata.getTableConfig().getTableName() + "->"; } if (HoodieCLI.tableMetadata != null) - return "hoodie:" + HoodieCLI.tableMetadata.getTableName() + "->"; + return "hoodie:" + HoodieCLI.tableMetadata.getTableConfig().getTableName() + "->"; return "hoodie->"; } diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/CommitsCommand.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/CommitsCommand.java index e2f415c08..a77c7ca57 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/CommitsCommand.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/CommitsCommand.java @@ -21,9 +21,9 @@ import com.uber.hoodie.cli.HoodiePrintHelper; import com.uber.hoodie.cli.utils.InputStreamConsumer; import com.uber.hoodie.cli.utils.SparkUtil; import com.uber.hoodie.common.model.HoodieCommitMetadata; -import com.uber.hoodie.common.model.HoodieCommits; -import com.uber.hoodie.common.model.HoodieTableMetadata; import com.uber.hoodie.common.model.HoodieWriteStat; +import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.util.NumericUtils; import org.apache.spark.launcher.SparkLauncher; @@ -38,7 +38,7 @@ import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Map; -import java.util.SortedMap; +import java.util.stream.Collectors; @Component public class CommitsCommand implements CommandMarker { @@ -67,17 +67,14 @@ public class CommitsCommand implements CommandMarker { @CliOption(key = { "limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10") final Integer 
limit) throws IOException { - SortedMap map = - HoodieCLI.tableMetadata.getAllCommitMetadata(); - int arraySize = - Math.min(limit, HoodieCLI.tableMetadata.getAllCommits().getCommitList().size()); - String[][] rows = new String[arraySize][]; - ArrayList commitList = - new ArrayList(HoodieCLI.tableMetadata.getAllCommits().getCommitList()); - Collections.reverse(commitList); - for (int i = 0; i < arraySize; i++) { - String commit = commitList.get(i); - HoodieCommitMetadata commitMetadata = map.get(commit); + HoodieTimeline timeline = HoodieCLI.tableMetadata.getActiveCommitTimeline(); + List commits = timeline.getInstants().collect(Collectors.toList()); + String[][] rows = new String[commits.size()][]; + Collections.reverse(commits); + for (int i = 0; i < commits.size(); i++) { + String commit = commits.get(i); + HoodieCommitMetadata commitMetadata = + HoodieCommitMetadata.fromBytes(timeline.readInstantDetails(commit).get()); rows[i] = new String[] {commit, NumericUtils.humanReadableByteCount(commitMetadata.fetchTotalBytesWritten()), String.valueOf(commitMetadata.fetchTotalFilesInsert()), @@ -95,10 +92,10 @@ public class CommitsCommand implements CommandMarker { @CliCommand(value = "commits refresh", help = "Refresh the commits") public String refreshCommits() throws IOException { - HoodieTableMetadata metadata = - new HoodieTableMetadata(HoodieCLI.fs, HoodieCLI.tableMetadata.getBasePath()); + HoodieTableMetaClient metadata = + new HoodieTableMetaClient(HoodieCLI.fs, HoodieCLI.tableMetadata.getBasePath()); HoodieCLI.setTableMetadata(metadata); - return "Metadata for table " + metadata.getTableName() + " refreshed."; + return "Metadata for table " + metadata.getTableConfig().getTableName() + " refreshed."; } @CliCommand(value = "commit rollback", help = "Rollback a commit") @@ -107,9 +104,9 @@ public class CommitsCommand implements CommandMarker { final String commitTime, @CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path") final String 
sparkPropertiesPath) throws Exception { - if (!HoodieCLI.tableMetadata.getAllCommits().contains(commitTime)) { + if (!HoodieCLI.tableMetadata.getActiveCommitTimeline().containsInstant(commitTime)) { return "Commit " + commitTime + " not found in Commits " + HoodieCLI.tableMetadata - .getAllCommits(); + .getActiveCommitTimeline().getInstants().collect(Collectors.toList()); } SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); sparkLauncher.addAppArgs(SparkMain.SparkCommand.ROLLBACK.toString(), @@ -130,11 +127,13 @@ public class CommitsCommand implements CommandMarker { public String showCommitPartitions( @CliOption(key = {"commit"}, help = "Commit to show") final String commitTime) throws Exception { - if (!HoodieCLI.tableMetadata.getAllCommits().contains(commitTime)) { + HoodieTimeline timeline = HoodieCLI.tableMetadata.getActiveCommitTimeline(); + if (!timeline.containsInstant(commitTime)) { return "Commit " + commitTime + " not found in Commits " + HoodieCLI.tableMetadata - .getAllCommits(); + .getActiveCommitTimeline().getInstants().collect(Collectors.toList()); } - HoodieCommitMetadata meta = HoodieCLI.tableMetadata.getAllCommitMetadata().get(commitTime); + HoodieCommitMetadata meta = + HoodieCommitMetadata.fromBytes(timeline.readInstantDetails(commitTime).get()); List rows = new ArrayList(); for (Map.Entry> entry : meta.getPartitionToWriteStats() .entrySet()) { @@ -174,11 +173,13 @@ public class CommitsCommand implements CommandMarker { public String showCommitFiles( @CliOption(key = {"commit"}, help = "Commit to show") final String commitTime) throws Exception { - if (!HoodieCLI.tableMetadata.getAllCommits().contains(commitTime)) { + HoodieTimeline timeline = HoodieCLI.tableMetadata.getActiveCommitTimeline(); + if (!timeline.containsInstant(commitTime)) { return "Commit " + commitTime + " not found in Commits " + HoodieCLI.tableMetadata - .getAllCommits(); + .getActiveCommitTimeline().getInstants().collect(Collectors.toList()); } - 
HoodieCommitMetadata meta = HoodieCLI.tableMetadata.getAllCommitMetadata().get(commitTime); + HoodieCommitMetadata meta = + HoodieCommitMetadata.fromBytes(timeline.readInstantDetails(commitTime).get()); List rows = new ArrayList(); for (Map.Entry> entry : meta.getPartitionToWriteStats() .entrySet()) { @@ -206,23 +207,29 @@ public class CommitsCommand implements CommandMarker { public String compareCommits( @CliOption(key = {"path"}, help = "Path of the dataset to compare to") final String path) throws Exception { - HoodieTableMetadata target = new HoodieTableMetadata(HoodieCLI.fs, path); - HoodieTableMetadata source = HoodieCLI.tableMetadata; + HoodieTableMetaClient target = new HoodieTableMetaClient(HoodieCLI.fs, path); + HoodieTimeline targetTimeline = target.getActiveCommitTimeline(); + HoodieTableMetaClient source = HoodieCLI.tableMetadata; + HoodieTimeline sourceTimeline = source.getActiveCommitTimeline(); String targetLatestCommit = - target.isCommitsEmpty() ? "0" : target.getAllCommits().lastCommit(); + targetTimeline.getInstants().iterator().hasNext() ? "0" : targetTimeline.lastInstant().get(); String sourceLatestCommit = - source.isCommitsEmpty() ? "0" : source.getAllCommits().lastCommit(); + sourceTimeline.getInstants().iterator().hasNext() ? 
"0" : sourceTimeline.lastInstant().get(); - if (sourceLatestCommit != null && HoodieCommits - .isCommit1After(targetLatestCommit, sourceLatestCommit)) { + if (sourceLatestCommit != null && sourceTimeline + .compareInstants(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) { // source is behind the target - List commitsToCatchup = target.findCommitsSinceTs(sourceLatestCommit); - return "Source " + source.getTableName() + " is behind by " + commitsToCatchup.size() + List commitsToCatchup = + targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE) + .collect(Collectors.toList()); + return "Source " + source.getTableConfig().getTableName() + " is behind by " + commitsToCatchup.size() + " commits. Commits to catch up - " + commitsToCatchup; } else { - List commitsToCatchup = source.findCommitsSinceTs(targetLatestCommit); - return "Source " + source.getTableName() + " is ahead by " + commitsToCatchup.size() - + " commits. Commits to catch up - " + commitsToCatchup; + List commitsToCatchup = + sourceTimeline.findInstantsAfter(targetLatestCommit, Integer.MAX_VALUE) + .collect(Collectors.toList()); + return "Source " + source.getTableConfig().getTableName() + " is ahead by " + + commitsToCatchup.size() + " commits. 
Commits to catch up - " + commitsToCatchup; } } @@ -235,10 +242,10 @@ public class CommitsCommand implements CommandMarker { public String syncCommits( @CliOption(key = {"path"}, help = "Path of the dataset to compare to") final String path) throws Exception { - HoodieCLI.syncTableMetadata = new HoodieTableMetadata(HoodieCLI.fs, path); + HoodieCLI.syncTableMetadata = new HoodieTableMetaClient(HoodieCLI.fs, path); HoodieCLI.state = HoodieCLI.CLIState.SYNC; - return "Load sync state between " + HoodieCLI.tableMetadata.getTableName() + " and " - + HoodieCLI.syncTableMetadata.getTableName(); + return "Load sync state between " + HoodieCLI.tableMetadata.getTableConfig().getTableName() + + " and " + HoodieCLI.syncTableMetadata.getTableConfig().getTableName(); } } diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/DatasetsCommand.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/DatasetsCommand.java index 5646566eb..9e17da1a4 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/DatasetsCommand.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/DatasetsCommand.java @@ -17,9 +17,7 @@ package com.uber.hoodie.cli.commands; import com.uber.hoodie.cli.HoodieCLI; -import com.uber.hoodie.common.model.HoodieTableMetadata; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; +import com.uber.hoodie.common.table.HoodieTableMetaClient; import org.springframework.shell.core.CommandMarker; import org.springframework.shell.core.annotation.CliCommand; import org.springframework.shell.core.annotation.CliOption; @@ -31,12 +29,13 @@ import java.io.IOException; public class DatasetsCommand implements CommandMarker { @CliCommand(value = "connect", help = "Connect to a hoodie dataset") public String connect( - @CliOption(key = {"path"}, mandatory = true, help = "Base Path of the dataset") - final String path) throws IOException { + @CliOption(key = {"path"}, mandatory = true, help = "Base Path of the 
dataset") + final String path) throws IOException { boolean initialized = HoodieCLI.initConf(); HoodieCLI.initFS(initialized); - HoodieCLI.setTableMetadata(new HoodieTableMetadata(HoodieCLI.fs, path)); + HoodieCLI.setTableMetadata(new HoodieTableMetaClient(HoodieCLI.fs, path)); HoodieCLI.state = HoodieCLI.CLIState.DATASET; - return "Metadata for table " + HoodieCLI.tableMetadata.getTableName() + " loaded"; + return "Metadata for table " + HoodieCLI.tableMetadata.getTableConfig().getTableName() + + " loaded"; } } diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/HoodieSyncCommand.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/HoodieSyncCommand.java index 3a3767042..61a3d7f2e 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/HoodieSyncCommand.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/HoodieSyncCommand.java @@ -19,8 +19,8 @@ package com.uber.hoodie.cli.commands; import com.uber.hoodie.cli.utils.CommitUtil; import com.uber.hoodie.cli.utils.HiveUtil; import com.uber.hoodie.cli.HoodieCLI; -import com.uber.hoodie.common.model.HoodieCommits; -import com.uber.hoodie.common.model.HoodieTableMetadata; +import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.table.HoodieTimeline; import org.springframework.shell.core.CommandMarker; import org.springframework.shell.core.annotation.CliAvailabilityIndicator; import org.springframework.shell.core.annotation.CliCommand; @@ -28,6 +28,7 @@ import org.springframework.shell.core.annotation.CliOption; import org.springframework.stereotype.Component; import java.util.List; +import java.util.stream.Collectors; @Component public class HoodieSyncCommand implements CommandMarker { @@ -58,8 +59,10 @@ public class HoodieSyncCommand implements CommandMarker { @CliOption(key = { "hivePass"}, mandatory = true, unspecifiedDefaultValue = "", help = "hive password to connect to") final String hivePass) throws Exception { - HoodieTableMetadata 
target = HoodieCLI.syncTableMetadata; - HoodieTableMetadata source = HoodieCLI.tableMetadata; + HoodieTableMetaClient target = HoodieCLI.syncTableMetadata; + HoodieTimeline targetTimeline = target.getActiveCommitTimeline(); + HoodieTableMetaClient source = HoodieCLI.tableMetadata; + HoodieTimeline sourceTimeline = source.getActiveCommitTimeline(); long sourceCount = 0; long targetCount = 0; if ("complete".equals(mode)) { @@ -71,33 +74,39 @@ public class HoodieSyncCommand implements CommandMarker { } String targetLatestCommit = - target.isCommitsEmpty() ? "0" : target.getAllCommits().lastCommit(); + targetTimeline.getInstants().iterator().hasNext() ? "0" : targetTimeline.lastInstant().get(); String sourceLatestCommit = - source.isCommitsEmpty() ? "0" : source.getAllCommits().lastCommit(); + sourceTimeline.getInstants().iterator().hasNext() ? "0" : sourceTimeline.lastInstant().get(); - if (sourceLatestCommit != null && HoodieCommits - .isCommit1After(targetLatestCommit, sourceLatestCommit)) { + if (sourceLatestCommit != null && sourceTimeline + .compareInstants(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) { // source is behind the target - List commitsToCatchup = target.findCommitsSinceTs(sourceLatestCommit); + List commitsToCatchup = + targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE) + .collect(Collectors.toList()); if (commitsToCatchup.isEmpty()) { - return "Count difference now is (count(" + target.getTableName() + ") - count(" - + source.getTableName() + ") == " + (targetCount - sourceCount); + return "Count difference now is (count(" + target.getTableConfig().getTableName() + + ") - count(" + source.getTableConfig().getTableName() + ") == " + (targetCount + - sourceCount); } else { long newInserts = CommitUtil.countNewRecords(target, commitsToCatchup); - return "Count difference now is (count(" + target.getTableName() + ") - count(" - + source.getTableName() + ") == " + (targetCount - sourceCount) - + ". 
Catch up count is " + newInserts; + return "Count difference now is (count(" + target.getTableConfig().getTableName() + + ") - count(" + source.getTableConfig().getTableName() + ") == " + (targetCount + - sourceCount) + ". Catch up count is " + newInserts; } } else { - List commitsToCatchup = source.findCommitsSinceTs(targetLatestCommit); + List commitsToCatchup = + sourceTimeline.findInstantsAfter(targetLatestCommit, Integer.MAX_VALUE) + .collect(Collectors.toList()); if (commitsToCatchup.isEmpty()) { - return "Count difference now is (count(" + source.getTableName() + ") - count(" - + target.getTableName() + ") == " + (sourceCount - targetCount); + return "Count difference now is (count(" + source.getTableConfig().getTableName() + + ") - count(" + target.getTableConfig().getTableName() + ") == " + (sourceCount + - targetCount); } else { long newInserts = CommitUtil.countNewRecords(source, commitsToCatchup); - return "Count difference now is (count(" + source.getTableName() + ") - count(" - + target.getTableName() + ") == " + (sourceCount - targetCount) - + ". Catch up count is " + newInserts; + return "Count difference now is (count(" + source.getTableConfig().getTableName() + + ") - count(" + target.getTableConfig().getTableName() + ") == " + (sourceCount + - targetCount) + ". 
Catch up count is " + newInserts; } } diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/StatsCommand.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/StatsCommand.java index 8a4e68e57..8be833ab7 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/StatsCommand.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/StatsCommand.java @@ -23,6 +23,7 @@ import com.codahale.metrics.UniformReservoir; import com.uber.hoodie.cli.HoodieCLI; import com.uber.hoodie.cli.HoodiePrintHelper; import com.uber.hoodie.common.model.HoodieCommitMetadata; +import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.common.util.NumericUtils; @@ -38,7 +39,7 @@ import org.springframework.stereotype.Component; import java.io.IOException; import java.text.DecimalFormat; import java.util.HashMap; -import java.util.Map; +import java.util.stream.Collectors; @Component public class StatsCommand implements CommandMarker { @@ -52,22 +53,25 @@ public class StatsCommand implements CommandMarker { long totalRecordsUpserted = 0; long totalRecordsWritten = 0; - String[][] rows = new String[HoodieCLI.tableMetadata.getAllCommitMetadata().size() + 1][]; + HoodieTimeline timeline = HoodieCLI.tableMetadata.getActiveCommitTimeline(); + + String[][] rows = new String[new Long(timeline.getTotalInstants()).intValue() + 1][]; int i = 0; DecimalFormat df = new DecimalFormat("#.00"); - for (Map.Entry commit : HoodieCLI.tableMetadata - .getAllCommitMetadata().entrySet()) { + for (String commitTime : timeline.getInstants().collect( + Collectors.toList())) { String waf = "0"; - if (commit.getValue().fetchTotalUpdateRecordsWritten() > 0) { + HoodieCommitMetadata commit = HoodieCommitMetadata.fromBytes(timeline.readInstantDetails(commitTime).get()); + if (commit.fetchTotalUpdateRecordsWritten() > 0) { waf = df.format( - (float) commit.getValue().fetchTotalRecordsWritten() / commit.getValue() + (float) 
commit.fetchTotalRecordsWritten() / commit .fetchTotalUpdateRecordsWritten()); } - rows[i++] = new String[] {commit.getKey(), - String.valueOf(commit.getValue().fetchTotalUpdateRecordsWritten()), - String.valueOf(commit.getValue().fetchTotalRecordsWritten()), waf}; - totalRecordsUpserted += commit.getValue().fetchTotalUpdateRecordsWritten(); - totalRecordsWritten += commit.getValue().fetchTotalRecordsWritten(); + rows[i++] = new String[] {commitTime, + String.valueOf(commit.fetchTotalUpdateRecordsWritten()), + String.valueOf(commit.fetchTotalRecordsWritten()), waf}; + totalRecordsUpserted += commit.fetchTotalUpdateRecordsWritten(); + totalRecordsWritten += commit.fetchTotalRecordsWritten(); } String waf = "0"; if (totalRecordsUpserted > 0) { diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/utils/CommitUtil.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/utils/CommitUtil.java index a92036402..a9755cec1 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/utils/CommitUtil.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/utils/CommitUtil.java @@ -17,20 +17,20 @@ package com.uber.hoodie.cli.utils; import com.uber.hoodie.common.model.HoodieCommitMetadata; -import com.uber.hoodie.common.model.HoodieTableMetadata; +import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.table.HoodieTimeline; import java.io.IOException; import java.util.List; -import java.util.SortedMap; -import java.util.TreeMap; public class CommitUtil { - public static long countNewRecords(HoodieTableMetadata target, List commitsToCatchup) + public static long countNewRecords(HoodieTableMetaClient target, List commitsToCatchup) throws IOException { long totalNew = 0; - SortedMap meta = target.getAllCommitMetadata(); + HoodieTimeline timeline = target.getActiveCommitTimeline(); + timeline = timeline.reload(); for(String commit:commitsToCatchup) { - HoodieCommitMetadata c = meta.get(commit); + HoodieCommitMetadata c = 
HoodieCommitMetadata.fromBytes(timeline.readInstantDetails(commit).get()); totalNew += c.fetchTotalRecordsWritten() - c.fetchTotalUpdateRecordsWritten(); } return totalNew; diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/utils/HiveUtil.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/utils/HiveUtil.java index c348fbb2a..1d4b00349 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/utils/HiveUtil.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/utils/HiveUtil.java @@ -16,7 +16,7 @@ package com.uber.hoodie.cli.utils; -import com.uber.hoodie.common.model.HoodieTableMetadata; +import com.uber.hoodie.common.table.HoodieTableMetaClient; import org.apache.commons.dbcp.BasicDataSource; import org.joda.time.DateTime; @@ -53,7 +53,7 @@ public class HiveUtil { return ds; } - public static long countRecords(String jdbcUrl, HoodieTableMetadata source, String dbName, String user, String pass) throws SQLException { + public static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String dbName, String user, String pass) throws SQLException { Connection conn = HiveUtil.getConnection(jdbcUrl, user, pass); ResultSet rs = null; Statement stmt = conn.createStatement(); @@ -62,13 +62,13 @@ public class HiveUtil { stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat" ); stmt.execute("set hive.stats.autogather=false" ); rs = stmt.executeQuery( - "select count(`_hoodie_commit_time`) as cnt from " + dbName + "." + source + "select count(`_hoodie_commit_time`) as cnt from " + dbName + "." 
+ source.getTableConfig() .getTableName()); long count = -1; if(rs.next()) { count = rs.getLong("cnt"); } - System.out.println("Total records in " + source.getTableName() + " is " + count); + System.out.println("Total records in " + source.getTableConfig().getTableName() + " is " + count); return count; } finally { if (rs != null) { @@ -80,7 +80,7 @@ public class HiveUtil { } } - public static long countRecords(String jdbcUrl, HoodieTableMetadata source, String srcDb, + public static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String srcDb, int partitions, String user, String pass) throws SQLException { DateTime dateTime = DateTime.now(); String endDateStr = @@ -94,7 +94,7 @@ public class HiveUtil { return countRecords(jdbcUrl, source, srcDb, startDateStr, endDateStr, user, pass); } - private static long countRecords(String jdbcUrl, HoodieTableMetadata source, String srcDb, String startDateStr, + private static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String srcDb, String startDateStr, String endDateStr, String user, String pass) throws SQLException { Connection conn = HiveUtil.getConnection(jdbcUrl, user, pass); ResultSet rs = null; @@ -104,7 +104,7 @@ public class HiveUtil { stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat"); stmt.execute("set hive.stats.autogather=false"); rs = stmt.executeQuery( - "select count(`_hoodie_commit_time`) as cnt from " + srcDb + "." + source + "select count(`_hoodie_commit_time`) as cnt from " + srcDb + "." 
+ source.getTableConfig() .getTableName() + " where datestr>'" + startDateStr + "' and datestr<='" + endDateStr + "'"); if(rs.next()) { diff --git a/hoodie-cli/src/main/scala/com/uber/hoodie/cli/DedupeSparkJob.scala b/hoodie-cli/src/main/scala/com/uber/hoodie/cli/DedupeSparkJob.scala index 7bacade3a..182e1e3fc 100644 --- a/hoodie-cli/src/main/scala/com/uber/hoodie/cli/DedupeSparkJob.scala +++ b/hoodie-cli/src/main/scala/com/uber/hoodie/cli/DedupeSparkJob.scala @@ -16,7 +16,11 @@ package com.uber.hoodie.cli -import com.uber.hoodie.common.model.{HoodieRecord, HoodieTableMetadata} +import java.util.stream.Collectors + +import com.uber.hoodie.common.model.{HoodieDataFile, HoodieRecord} +import com.uber.hoodie.common.table.HoodieTableMetaClient +import com.uber.hoodie.common.table.view.ReadOptimizedTableView import com.uber.hoodie.common.util.FSUtils import com.uber.hoodie.exception.HoodieException import org.apache.hadoop.fs.{FileSystem, FileUtil, Path} @@ -70,9 +74,12 @@ class DedupeSparkJob (basePath: String, val tmpTableName = s"htbl_${System.currentTimeMillis()}" val dedupeTblName = s"${tmpTableName}_dupeKeys" - val metadata = new HoodieTableMetadata(fs, basePath) + val metadata = new HoodieTableMetaClient(fs, basePath) + val fsView = new ReadOptimizedTableView(fs, metadata) + val allFiles = fs.listStatus(new org.apache.hadoop.fs.Path(s"${basePath}/${duplicatedPartitionPath}")) - val filteredStatuses = metadata.getLatestVersions(allFiles).map(f => f.getPath.toString); + val latestFiles:java.util.List[HoodieDataFile] = fsView.streamLatestVersions(allFiles).collect(Collectors.toList[HoodieDataFile]()) + val filteredStatuses = latestFiles.map(f => f.getPath) LOG.info(s" List of files under partition: ${} => ${filteredStatuses.mkString(" ")}") val df = sqlContext.parquetFile(filteredStatuses:_*) @@ -118,9 +125,13 @@ class DedupeSparkJob (basePath: String, def fixDuplicates(dryRun: Boolean = true) = { - val metadata = new HoodieTableMetadata(fs, basePath) + val metadata 
= new HoodieTableMetaClient(fs, basePath) + val fsView = new ReadOptimizedTableView(fs, metadata) + val allFiles = fs.listStatus(new Path(s"${basePath}/${duplicatedPartitionPath}")) - val fileNameToPathMap = metadata.getLatestVersions(allFiles).map(f => (FSUtils.getFileId(f.getPath.getName), f.getPath)).toMap; + val latestFiles:java.util.List[HoodieDataFile] = fsView.streamLatestVersions(allFiles).collect(Collectors.toList[HoodieDataFile]()) + + val fileNameToPathMap = latestFiles.map(f => (f.getFileId, new Path(f.getPath))).toMap val dupeFixPlan = planDuplicateFix() // 1. Copy all latest files into the temp fix path diff --git a/hoodie-client/pom.xml b/hoodie-client/pom.xml index e37402f1b..13aa1d7ca 100644 --- a/hoodie-client/pom.xml +++ b/hoodie-client/pom.xml @@ -28,8 +28,8 @@ - org.codehaus.mojo - cobertura-maven-plugin + org.jacoco + jacoco-maven-plugin org.apache.maven.plugins diff --git a/hoodie-client/src/main/java/com/uber/hoodie/HoodieReadClient.java b/hoodie-client/src/main/java/com/uber/hoodie/HoodieReadClient.java index c2a4500ea..f11b5a5a6 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/HoodieReadClient.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/HoodieReadClient.java @@ -19,17 +19,18 @@ package com.uber.hoodie; import com.google.common.base.Optional; import com.uber.hoodie.common.model.HoodieCommitMetadata; -import com.uber.hoodie.common.model.HoodieCommits; +import com.uber.hoodie.common.model.HoodieDataFile; import com.uber.hoodie.common.model.HoodieKey; import com.uber.hoodie.common.model.HoodieRecord; -import com.uber.hoodie.common.model.HoodieTableMetadata; -import com.uber.hoodie.common.model.HoodieWriteStat; +import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.table.HoodieTimeline; +import com.uber.hoodie.common.table.TableFileSystemView; +import com.uber.hoodie.common.table.view.ReadOptimizedTableView; import com.uber.hoodie.common.util.FSUtils; import 
com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.exception.HoodieException; import com.uber.hoodie.index.HoodieBloomIndex; -import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; @@ -53,6 +54,7 @@ import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Set; +import java.util.stream.Collectors; import scala.Tuple2; @@ -75,7 +77,8 @@ public class HoodieReadClient implements Serializable { * BloomIndex */ private transient final HoodieBloomIndex index; - private HoodieTableMetadata metadata; + private final HoodieTimeline commitTimeline; + private HoodieTableMetaClient metaClient; private transient Optional sqlContextOpt; @@ -85,7 +88,8 @@ public class HoodieReadClient implements Serializable { public HoodieReadClient(JavaSparkContext jsc, String basePath) { this.jsc = jsc; this.fs = FSUtils.getFs(); - this.metadata = new HoodieTableMetadata(fs, basePath); + this.metaClient = new HoodieTableMetaClient(fs, basePath, true); + this.commitTimeline = metaClient.getActiveCommitTimeline(); this.index = new HoodieBloomIndex(HoodieWriteConfig.newBuilder().withPath(basePath).build(), jsc); this.sqlContextOpt = Optional.absent(); } @@ -127,7 +131,7 @@ public class HoodieReadClient implements Serializable { assertSqlContext(); JavaPairRDD> keyToFileRDD = - index.fetchRecordLocation(hoodieKeys, metadata); + index.fetchRecordLocation(hoodieKeys, metaClient); List paths = keyToFileRDD .filter(new Function>, Boolean>() { @Override @@ -177,17 +181,20 @@ public class HoodieReadClient implements Serializable { public Dataset read(String... 
paths) { assertSqlContext(); List filteredPaths = new ArrayList<>(); + TableFileSystemView fileSystemView = new ReadOptimizedTableView(fs, metaClient); + try { for (String path : paths) { - if (!path.contains(metadata.getBasePath())) { + if (!path.contains(metaClient.getBasePath())) { throw new HoodieException("Path " + path + " does not seem to be a part of a Hoodie dataset at base path " - + metadata.getBasePath()); + + metaClient.getBasePath()); } - FileStatus[] latestFiles = metadata.getLatestVersions(fs.globStatus(new Path(path))); - for (FileStatus file : latestFiles) { - filteredPaths.add(file.getPath().toString()); + List latestFiles = fileSystemView.streamLatestVersions(fs.globStatus(new Path(path))).collect( + Collectors.toList()); + for (HoodieDataFile file : latestFiles) { + filteredPaths.add(file.getPath()); } } return sqlContextOpt.get().read() @@ -205,15 +212,19 @@ public class HoodieReadClient implements Serializable { */ public Dataset readSince(String lastCommitTimestamp) { - List commitsToReturn = metadata.findCommitsAfter(lastCommitTimestamp, Integer.MAX_VALUE); + List commitsToReturn = + commitTimeline.findInstantsAfter(lastCommitTimestamp, Integer.MAX_VALUE) + .collect(Collectors.toList()); //TODO: we can potentially trim this down to only affected partitions, using CommitMetadata try { // Go over the commit metadata, and obtain the new files that need to be read. 
HashMap fileIdToFullPath = new HashMap<>(); for (String commit: commitsToReturn) { + HoodieCommitMetadata metadata = + HoodieCommitMetadata.fromBytes(commitTimeline.readInstantDetails(commit).get()); // get files from each commit, and replace any previous versions - fileIdToFullPath.putAll(metadata.getCommitMetadata(commit).getFileIdAndFullPaths()); + fileIdToFullPath.putAll(metadata.getFileIdAndFullPaths()); } return sqlContextOpt.get().read() @@ -229,13 +240,13 @@ public class HoodieReadClient implements Serializable { */ public Dataset readCommit(String commitTime) { assertSqlContext(); - HoodieCommits commits = metadata.getAllCommits(); - if (!commits.contains(commitTime)) { + if (!commitTimeline.containsInstant(commitTime)) { new HoodieException("No commit exists at " + commitTime); } try { - HoodieCommitMetadata commitMetdata = metadata.getCommitMetadata(commitTime); + HoodieCommitMetadata commitMetdata = + HoodieCommitMetadata.fromBytes(commitTimeline.readInstantDetails(commitTime).get()); Collection paths = commitMetdata.getFileIdAndFullPaths().values(); return sqlContextOpt.get().read() .parquet(paths.toArray(new String[paths.size()])) @@ -253,7 +264,7 @@ public class HoodieReadClient implements Serializable { */ public JavaPairRDD> checkExists( JavaRDD hoodieKeys) { - return index.fetchRecordLocation(hoodieKeys, metadata); + return index.fetchRecordLocation(hoodieKeys, metaClient); } /** @@ -264,7 +275,7 @@ public class HoodieReadClient implements Serializable { * @return A subset of hoodieRecords RDD, with existing records filtered out. 
*/ public JavaRDD filterExists(JavaRDD hoodieRecords) { - JavaRDD recordsWithLocation = index.tagLocation(hoodieRecords, metadata); + JavaRDD recordsWithLocation = index.tagLocation(hoodieRecords, metaClient); return recordsWithLocation.filter(new Function() { @Override public Boolean call(HoodieRecord v1) throws Exception { @@ -287,13 +298,13 @@ public class HoodieReadClient implements Serializable { * @return */ public List listCommitsSince(String commitTimestamp) { - return metadata.getAllCommits().findCommitsAfter(commitTimestamp, Integer.MAX_VALUE); + return commitTimeline.findInstantsAfter(commitTimestamp, Integer.MAX_VALUE).collect(Collectors.toList()); } /** * Returns the last successful commit (a successful write operation) into a Hoodie table. */ public String latestCommit() { - return metadata.getAllCommits().lastCommit(); + return commitTimeline.lastInstant().get(); } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/HoodieWriteClient.java b/hoodie-client/src/main/java/com/uber/hoodie/HoodieWriteClient.java index f367ad27b..a83963d4c 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/HoodieWriteClient.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/HoodieWriteClient.java @@ -22,8 +22,9 @@ import com.uber.hoodie.common.model.HoodieKey; import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecordLocation; import com.uber.hoodie.common.model.HoodieRecordPayload; -import com.uber.hoodie.common.model.HoodieTableMetadata; import com.uber.hoodie.common.model.HoodieWriteStat; +import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.exception.HoodieCommitException; @@ -31,7 +32,7 @@ import com.uber.hoodie.exception.HoodieIOException; import com.uber.hoodie.exception.HoodieInsertException; import 
com.uber.hoodie.exception.HoodieRollbackException; import com.uber.hoodie.exception.HoodieUpsertException; -import com.uber.hoodie.func.BulkInsertMapFunction; +import com.uber.hoodie.func.InsertMapFunction; import com.uber.hoodie.index.HoodieIndex; import com.uber.hoodie.io.HoodieCleaner; import com.uber.hoodie.io.HoodieCommitArchiveLog; @@ -39,7 +40,6 @@ import com.uber.hoodie.metrics.HoodieMetrics; import com.uber.hoodie.table.HoodieTable; import com.uber.hoodie.table.WorkloadProfile; -import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -67,16 +67,19 @@ import java.util.Collections; import java.util.Date; import java.util.Iterator; import java.util.List; +import java.util.Optional; +import java.util.stream.Collectors; import scala.Option; import scala.Tuple2; /** - * Hoodie Write Client helps you build datasets on HDFS [insert()] and then perform efficient - * mutations on a HDFS dataset [upsert()] + * Hoodie Write Client helps you build datasets on HDFS [insert()] and then + * perform efficient mutations on a HDFS dataset [upsert()] + * + * Note that, at any given time, there can only be one Spark job performing + * these operatons on a Hoodie dataset. * - * Note that, at any given time, there can only be one Spark job performing these operatons on a - * Hoodie dataset. 
*/ public class HoodieWriteClient implements Serializable { @@ -111,7 +114,8 @@ public class HoodieWriteClient implements Seriali this.config = clientConfig; this.index = HoodieIndex.createIndex(config, jsc); this.metrics = new HoodieMetrics(config, config.getTableName()); - this.archiveLog = new HoodieCommitArchiveLog(clientConfig); + this.archiveLog = new HoodieCommitArchiveLog(clientConfig, fs); + if (rollbackInFlight) { rollbackInflightCommits(); } @@ -125,9 +129,9 @@ public class HoodieWriteClient implements Seriali * @return A subset of hoodieRecords RDD, with existing records filtered out. */ public JavaRDD> filterExists(JavaRDD> hoodieRecords) { - final HoodieTableMetadata metadata = - new HoodieTableMetadata(fs, config.getBasePath(), config.getTableName()); - JavaRDD> recordsWithLocation = index.tagLocation(hoodieRecords, metadata); + HoodieTableMetaClient metaClient = + new HoodieTableMetaClient(fs, config.getBasePath(), true); + JavaRDD> recordsWithLocation = index.tagLocation(hoodieRecords, metaClient); return recordsWithLocation.filter(new Function, Boolean>() { @Override public Boolean call(HoodieRecord v1) throws Exception { @@ -140,19 +144,74 @@ public class HoodieWriteClient implements Seriali * Upserts a bunch of new records into the Hoodie table, at the supplied commitTime */ public JavaRDD upsert(JavaRDD> records, final String commitTime) { - final HoodieTableMetadata metadata = - new HoodieTableMetadata(fs, config.getBasePath(), config.getTableName()); writeContext = metrics.getCommitCtx(); + HoodieTableMetaClient metaClient = + new HoodieTableMetaClient(fs, config.getBasePath(), true); + + final HoodieTable table = + HoodieTable.getHoodieTable(metaClient, commitTime, config); try { // De-dupe/merge if needed JavaRDD> dedupedRecords = - combineOnCondition(config.shouldCombineBeforeUpsert(), records, - config.getUpsertShuffleParallelism()); + combineOnCondition(config.shouldCombineBeforeUpsert(), records, + config.getUpsertShuffleParallelism()); 
// perform index loop up to get existing location of records - JavaRDD> taggedRecords = index.tagLocation(dedupedRecords, metadata); - return upsertRecordsInternal(taggedRecords, commitTime, metadata, true); + JavaRDD> taggedRecords = index.tagLocation(dedupedRecords, metaClient); + + // Cache the tagged records, so we don't end up computing both + taggedRecords.persist(StorageLevel.MEMORY_AND_DISK_SER()); + + + WorkloadProfile profile = null; + if (table.isWorkloadProfileNeeded()) { + profile = new WorkloadProfile(taggedRecords); + logger.info("Workload profile :" + profile); + } + + // obtain the upsert partitioner, and the run the tagger records through that & get a partitioned RDD. + final Partitioner upsertPartitioner = table.getUpsertPartitioner(profile); + JavaRDD> partitionedRecords = taggedRecords.mapToPair( + new PairFunction, Tuple2>, HoodieRecord>() { + @Override + public Tuple2>, HoodieRecord> call( + HoodieRecord record) throws Exception { + return new Tuple2<>(new Tuple2<>(record.getKey(), + Option.apply(record.getCurrentLocation())), record); + } + }).partitionBy(upsertPartitioner).map( + new Function>, HoodieRecord>, HoodieRecord>() { + @Override + public HoodieRecord call( + Tuple2>, HoodieRecord> tuple) + throws Exception { + return tuple._2(); + } + }); + + + // Perform the actual writing. + JavaRDD upsertStatusRDD = partitionedRecords.mapPartitionsWithIndex( + new Function2>, Iterator>>() { + @Override + public Iterator> call(Integer partition, + Iterator> recordItr) throws Exception { + return table.handleUpsertPartition(partition, recordItr, upsertPartitioner); + } + }, true).flatMap(new FlatMapFunction, WriteStatus>() { + @Override + public Iterable call(List writeStatuses) + throws Exception { + return writeStatuses; + } + }); + + // Update the index back. 
+ JavaRDD resultRDD = index.updateLocation(upsertStatusRDD, metaClient); + resultRDD = resultRDD.persist(config.getWriteStatusStorageLevel()); + commitOnAutoCommit(commitTime, resultRDD); + return resultRDD; } catch (Throwable e) { if (e instanceof HoodieUpsertException) { throw (HoodieUpsertException) e; @@ -161,38 +220,8 @@ public class HoodieWriteClient implements Seriali } } - /** - * Inserts the given HoodieRecords, into the table. This API is intended to be used for normal - * writes. - * - * This implementation skips the index check and is able to leverage benefits such as - * small file handling/blocking alignment, as with upsert(), by profiling the workload - * - * @param records HoodieRecords to insert - * @param commitTime Commit Time handle - * @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts - */ - public JavaRDD insert(JavaRDD> records, final String commitTime) { - final HoodieTableMetadata metadata = - new HoodieTableMetadata(fs, config.getBasePath(), config.getTableName()); - writeContext = metrics.getCommitCtx(); - try { - // De-dupe/merge if needed - JavaRDD> dedupedRecords = - combineOnCondition(config.shouldCombineBeforeInsert(), records, - config.getInsertShuffleParallelism()); - - return upsertRecordsInternal(dedupedRecords, commitTime, metadata, false); - } catch (Throwable e) { - if (e instanceof HoodieInsertException) { - throw e; - } - throw new HoodieInsertException("Failed to insert for commit time " + commitTime, e); - } - } - private void commitOnAutoCommit(String commitTime, JavaRDD resultRDD) { - if (config.shouldAutoCommit()) { + if(config.shouldAutoCommit()) { logger.info("Auto commit enabled: Committing " + commitTime); boolean commitResult = commit(commitTime, resultRDD); if (!commitResult) { @@ -204,146 +233,65 @@ public class HoodieWriteClient implements Seriali } private JavaRDD> combineOnCondition(boolean condition, - JavaRDD> records, int parallelism) { - if (condition) { + JavaRDD> records, int 
parallelism) { + if(condition) { return deduplicateRecords(records, parallelism); } return records; } - private JavaRDD> partition(JavaRDD> dedupedRecords, Partitioner partitioner) { - return dedupedRecords.mapToPair( - new PairFunction, Tuple2>, HoodieRecord>() { - @Override - public Tuple2>, HoodieRecord> call( - HoodieRecord record) throws Exception { - return new Tuple2<>(new Tuple2<>(record.getKey(), - Option.apply(record.getCurrentLocation())), record); - } - }).partitionBy(partitioner).map( - new Function>, HoodieRecord>, HoodieRecord>() { - @Override - public HoodieRecord call( - Tuple2>, HoodieRecord> tuple) - throws Exception { - return tuple._2(); - } - }); - } - - private Partitioner getPartitioner(HoodieTable table, boolean isUpsert, WorkloadProfile profile) { - if (isUpsert) { - return table.getUpsertPartitioner(profile); - } else { - return table.getInsertPartitioner(profile); - } - } - - private JavaRDD updateIndexAndCommitIfNeeded(JavaRDD writeStatusRDD, - HoodieTableMetadata metadata, - String commitTime) { - // Update the index back - JavaRDD statuses = index.updateLocation(writeStatusRDD, metadata); - // Trigger the insert and collect statuses - statuses = statuses.persist(config.getWriteStatusStorageLevel()); - commitOnAutoCommit(commitTime, statuses); - return statuses; - } - - private JavaRDD upsertRecordsInternal(JavaRDD> preppedRecords, - String commitTime, - HoodieTableMetadata metadata, - final boolean isUpsert) { - - final HoodieTable table = - HoodieTable.getHoodieTable(metadata.getTableType(), commitTime, config, metadata); - - // Cache the tagged records, so we don't end up computing both - preppedRecords.persist(StorageLevel.MEMORY_AND_DISK_SER()); - - WorkloadProfile profile = null; - if (table.isWorkloadProfileNeeded()) { - profile = new WorkloadProfile(preppedRecords); - logger.info("Workload profile :" + profile); - } - - // partition using the insert partitioner - final Partitioner partitioner = getPartitioner(table, isUpsert, 
profile); - JavaRDD> partitionedRecords = partition(preppedRecords, partitioner); - JavaRDD writeStatusRDD = partitionedRecords.mapPartitionsWithIndex( - new Function2>, Iterator>>() { - @Override - public Iterator> call(Integer partition, - Iterator> recordItr) throws Exception { - if (isUpsert) { - return table.handleUpsertPartition(partition, recordItr, partitioner); - } else { - return table.handleInsertPartition(partition, recordItr, partitioner); - } - } - }, true).flatMap(new FlatMapFunction, WriteStatus>() { - @Override - public Iterator call(List writeStatuses) - throws Exception { - return writeStatuses.iterator(); - } - }); - - return updateIndexAndCommitIfNeeded(writeStatusRDD, metadata, commitTime); - } - - /** - * Loads the given HoodieRecords, as inserts into the table. This is suitable for doing big bulk - * loads into a Hoodie table for the very first time (e.g: converting an existing dataset to - * Hoodie). + * Loads the given HoodieRecords, as inserts into the table. + * (This implementation uses sortBy and attempts to control the numbers of files with less memory) * - * This implementation uses sortBy (which does range partitioning based on reservoir sampling) and - * attempts to control the numbers of files with less memory compared to the {@link - * HoodieWriteClient#insert(JavaRDD, String)} - * - * @param records HoodieRecords to insert + * @param records HoodieRecords to insert * @param commitTime Commit Time handle * @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts + * */ - public JavaRDD bulkInsert(JavaRDD> records, final String commitTime) { - final HoodieTableMetadata metadata = - new HoodieTableMetadata(fs, config.getBasePath(), config.getTableName()); + public JavaRDD insert(JavaRDD> records, final String commitTime) { writeContext = metrics.getCommitCtx(); + HoodieTableMetaClient metaClient = + new HoodieTableMetaClient(fs, config.getBasePath(), true); + try { // De-dupe/merge if needed JavaRDD> 
dedupedRecords = - combineOnCondition(config.shouldCombineBeforeInsert(), records, - config.getInsertShuffleParallelism()); + combineOnCondition(config.shouldCombineBeforeInsert(), records, + config.getInsertShuffleParallelism()); // Now, sort the records and line them up nicely for loading. JavaRDD> sortedRecords = - dedupedRecords.sortBy(new Function, String>() { - @Override - public String call(HoodieRecord record) { - // Let's use "partitionPath + key" as the sort key. Spark, will ensure - // the records split evenly across RDD partitions, such that small partitions fit - // into 1 RDD partition, while big ones spread evenly across multiple RDD partitions - return String - .format("%s+%s", record.getPartitionPath(), record.getRecordKey()); - } - }, true, config.getInsertShuffleParallelism()); + dedupedRecords.sortBy(new Function, String>() { + @Override + public String call(HoodieRecord record) { + // Let's use "partitionPath + key" as the sort key. Spark, will ensure + // the records split evenly across RDD partitions, such that small partitions fit + // into 1 RDD partition, while big ones spread evenly across multiple RDD partitions + return String + .format("%s+%s", record.getPartitionPath(), record.getRecordKey()); + } + }, true, config.getInsertShuffleParallelism()); JavaRDD writeStatusRDD = sortedRecords - .mapPartitionsWithIndex(new BulkInsertMapFunction(commitTime, config, metadata), - true).flatMap(new FlatMapFunction, WriteStatus>() { - @Override - public Iterator call(List writeStatuses) - throws Exception { - return writeStatuses.iterator(); - } - }); - - return updateIndexAndCommitIfNeeded(writeStatusRDD, metadata, commitTime); + .mapPartitionsWithIndex(new InsertMapFunction(commitTime, config, metaClient), + true).flatMap(new FlatMapFunction, WriteStatus>() { + @Override + public Iterable call(List writeStatuses) + throws Exception { + return writeStatuses; + } + }); + // Update the index back + JavaRDD statuses = 
index.updateLocation(writeStatusRDD, metaClient); + // Trigger the insert and collect statuses + statuses = statuses.persist(config.getWriteStatusStorageLevel()); + commitOnAutoCommit(commitTime, statuses); + return statuses; } catch (Throwable e) { if (e instanceof HoodieInsertException) { throw e; } - throw new HoodieInsertException("Failed to bulk insert for commit time " + commitTime, e); + throw new HoodieInsertException("Failed to insert for commit time " + commitTime, e); } } @@ -352,118 +300,110 @@ public class HoodieWriteClient implements Seriali */ public boolean commit(String commitTime, JavaRDD writeStatuses) { logger.info("Comitting " + commitTime); - Path commitFile = - new Path(config.getBasePath() + "/.hoodie/" + FSUtils.makeCommitFileName(commitTime)); - try { + HoodieTableMetaClient metaClient = + new HoodieTableMetaClient(fs, config.getBasePath(), true); + HoodieTimeline commitTimeline = metaClient.getActiveCommitTimeline(); - if (fs.exists(commitFile)) { - throw new HoodieCommitException("Duplicate commit found. " + commitTime); - } - - List> stats = - writeStatuses.mapToPair(new PairFunction() { - @Override - public Tuple2 call(WriteStatus writeStatus) - throws Exception { - return new Tuple2<>(writeStatus.getPartitionPath(), writeStatus.getStat()); - } - }).collect(); - - HoodieCommitMetadata metadata = new HoodieCommitMetadata(); - for (Tuple2 stat : stats) { - metadata.addWriteStat(stat._1(), stat._2()); - } - - // open a new file and write the commit metadata in - Path inflightCommitFile = new Path(config.getBasePath() + "/.hoodie/" + FSUtils - .makeInflightCommitFileName(commitTime)); - FSDataOutputStream fsout = fs.create(inflightCommitFile, true); - fsout.writeBytes(new String(metadata.toJsonString().getBytes(StandardCharsets.UTF_8), - StandardCharsets.UTF_8)); - fsout.close(); - - boolean success = fs.rename(inflightCommitFile, commitFile); - if (success) { - // We cannot have unbounded commit files. 
Archive commits if we have to archive - archiveLog.archiveIfRequired(); - // Call clean to cleanup if there is anything to cleanup after the commit, - clean(); - if (writeContext != null) { - long durationInMs = metrics.getDurationInMs(writeContext.stop()); - metrics.updateCommitMetrics(FORMATTER.parse(commitTime).getTime(), durationInMs, - metadata); - writeContext = null; + List> stats = + writeStatuses.mapToPair(new PairFunction() { + @Override + public Tuple2 call(WriteStatus writeStatus) + throws Exception { + return new Tuple2<>(writeStatus.getPartitionPath(), writeStatus.getStat()); } + }).collect(); + + HoodieCommitMetadata metadata = new HoodieCommitMetadata(); + for (Tuple2 stat : stats) { + metadata.addWriteStat(stat._1(), stat._2()); + } + + try { + commitTimeline.saveInstantAsComplete(commitTime, + Optional.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8))); + // Save was a success + // We cannot have unbounded commit files. Archive commits if we have to archive + archiveLog.archiveIfRequired(); + // Call clean to cleanup if there is anything to cleanup after the commit, + clean(); + if (writeContext != null) { + long durationInMs = metrics.getDurationInMs(writeContext.stop()); + metrics.updateCommitMetrics(FORMATTER.parse(commitTime).getTime(), durationInMs, + metadata); + writeContext = null; } - logger.info("Status of the commit " + commitTime + ": " + success); - return success; + logger.info("Status of the commit " + commitTime); } catch (IOException e) { throw new HoodieCommitException( - "Failed to commit " + config.getBasePath() + " at time " + commitTime, e); + "Failed to commit " + config.getBasePath() + " at time " + commitTime, e); } catch (ParseException e) { throw new HoodieCommitException( - "Commit time is not of valid format.Failed to commit " + config.getBasePath() - + " at time " + commitTime, e); + "Commit time is not of valid format.Failed to commit " + config.getBasePath() + + " at time " + commitTime, e); } + return 
true; } /** - * Rollback the (inflight/committed) record changes with the given commit time. Three steps: (0) - * Obtain the commit or rollback file (1) clean indexing data, (2) clean new generated parquet - * files. (3) Finally delete .commit or .inflight file, + * Rollback the (inflight/committed) record changes with the given commit time. + * Three steps: + * (1) Atomically unpublish this commit + * (2) clean indexing data, + * (3) clean new generated parquet files. + * (4) Finally delete .commit or .inflight file, */ public boolean rollback(final String commitTime) throws HoodieRollbackException { - final Timer.Context context = metrics.getRollbackCtx(); - final HoodieTableMetadata metadata = - new HoodieTableMetadata(fs, config.getBasePath(), config.getTableName()); - final String metaPath = config.getBasePath() + "/" + HoodieTableMetadata.METAFOLDER_NAME; + HoodieTableMetaClient metaClient = + new HoodieTableMetaClient(fs, config.getBasePath(), true); + HoodieTimeline commitTimeline = metaClient.getActiveCommitTimeline(); + try { - // 0. Obtain the commit/.inflight file, to work on - FileStatus[] commitFiles = - fs.globStatus(new Path(metaPath + "/" + commitTime + ".*")); - if (commitFiles.length != 1) { - throw new HoodieRollbackException("Expected exactly one .commit or .inflight file for commitTime: " + commitTime); + if (commitTimeline.lastInstant().isPresent() + && commitTimeline.findInstantsAfter(commitTime, Integer.MAX_VALUE).count() > 0) { + throw new HoodieRollbackException("Found commits after time :" + commitTime + + ", please rollback greater commits first"); } - // we first need to unpublish the commit by making it .inflight again. 
(this will ensure no future queries see this data) - Path filePath = commitFiles[0].getPath(); - if (filePath.getName().endsWith(HoodieTableMetadata.COMMIT_FILE_SUFFIX)) { - if (metadata.findCommitsAfter(commitTime, Integer.MAX_VALUE).size() > 0) { - throw new HoodieRollbackException("Found commits after time :" + commitTime + + List inflights = + commitTimeline.getInflightInstants().collect(Collectors.toList()); + if (!inflights.isEmpty() && inflights.indexOf(commitTime) != inflights.size() - 1) { + throw new HoodieRollbackException( + "Found in-flight commits after time :" + commitTime + ", please rollback greater commits first"); - } - Path newInflightPath = new Path(metaPath + "/" + commitTime + HoodieTableMetadata.INFLIGHT_FILE_SUFFIX); - if (!fs.rename(filePath, newInflightPath)) { - throw new HoodieRollbackException("Unable to rename .commit file to .inflight for commitTime:" + commitTime); - } - filePath = newInflightPath; } - // 1. Revert the index changes - logger.info("Clean out index changes at time: " + commitTime); - if (!index.rollbackCommit(commitTime)) { - throw new HoodieRollbackException("Clean out index changes failed, for time :" + commitTime); - } + if (inflights.contains(commitTime) || (commitTimeline.lastInstant().isPresent() + && commitTimeline.lastInstant().get().equals(commitTime))) { + // 1. Atomically unpublish this commit + if(commitTimeline.containsInstant(commitTime)) { + commitTimeline.revertInstantToInflight(commitTime); + } + // 2. Revert the index changes + logger.info("Clean out index changes at time: " + commitTime); + if (!index.rollbackCommit(commitTime)) { + throw new HoodieRollbackException( + "Clean out index changes failed, for time :" + commitTime); + } - // 2. Delete the new generated parquet files - logger.info("Clean out all parquet files generated at time: " + commitTime); - final Accumulator numFilesDeletedAccu = jsc.accumulator(0); - jsc.parallelize(FSUtils.getAllPartitionPaths(fs, metadata.getBasePath())) + // 3. 
Delete the new generated parquet files + logger.info("Clean out all parquet files generated at time: " + commitTime); + final Accumulator numFilesDeletedAccu = jsc.accumulator(0); + jsc.parallelize(FSUtils.getAllPartitionPaths(fs, metaClient.getBasePath())) .foreach(new VoidFunction() { @Override public void call(String partitionPath) throws Exception { // Scan all partitions files with this commit time FileSystem fs = FSUtils.getFs(); FileStatus[] toBeDeleted = - fs.listStatus(new Path(config.getBasePath(), partitionPath), - new PathFilter() { - @Override - public boolean accept(Path path) { - return commitTime - .equals(FSUtils.getCommitTime(path.getName())); - } - }); + fs.listStatus(new Path(config.getBasePath(), partitionPath), + new PathFilter() { + @Override + public boolean accept(Path path) { + return commitTime + .equals(FSUtils.getCommitTime(path.getName())); + } + }); for (FileStatus file : toBeDeleted) { boolean success = fs.delete(file.getPath(), false); logger.info("Delete file " + file.getPath() + "\t" + success); @@ -473,24 +413,20 @@ public class HoodieWriteClient implements Seriali } } }); + // 4. Remove commit + logger.info("Clean out metadata files at time: " + commitTime); + commitTimeline.removeInflightFromTimeline(commitTime); - // 3. 
Clean out metadata (.commit or .tmp) - logger.info("Clean out metadata files at time: " + commitTime); - if (!fs.delete(filePath, false)) { - logger.error("Deleting file " + filePath + " failed."); - throw new HoodieRollbackException("Delete file " + filePath + " failed."); + if (context != null) { + long durationInMs = metrics.getDurationInMs(context.stop()); + int numFilesDeleted = numFilesDeletedAccu.value(); + metrics.updateRollbackMetrics(durationInMs, numFilesDeleted); + } } - - if (context != null) { - long durationInMs = metrics.getDurationInMs(context.stop()); - int numFilesDeleted = numFilesDeletedAccu.value(); - metrics.updateRollbackMetrics(durationInMs, numFilesDeleted); - } - return true; } catch (IOException e) { throw new HoodieRollbackException("Failed to rollback " + - config.getBasePath() + " at commit time" + commitTime, e); + config.getBasePath() + " at commit time" + commitTime, e); } } @@ -504,35 +440,38 @@ public class HoodieWriteClient implements Seriali /** * Clean up any stale/old files/data lying around (either on file storage or index storage) */ - private void clean() throws HoodieIOException { + private void clean() throws HoodieIOException { try { logger.info("Cleaner started"); final Timer.Context context = metrics.getCleanCtx(); - final HoodieTableMetadata metadata = new HoodieTableMetadata(fs, config.getBasePath(), config.getTableName()); - List partitionsToClean = FSUtils.getAllPartitionPaths(fs, metadata.getBasePath()); + HoodieTableMetaClient metaClient = + new HoodieTableMetaClient(fs, config.getBasePath(), true); + HoodieTimeline commitTimeline = metaClient.getActiveCommitTimeline(); + + List partitionsToClean = FSUtils.getAllPartitionPaths(fs, metaClient.getBasePath()); // shuffle to distribute cleaning work across partitions evenly Collections.shuffle(partitionsToClean); logger.info("Partitions to clean up : " + partitionsToClean + ", with policy " + config.getCleanerPolicy()); - if (partitionsToClean.isEmpty()) { + 
if(partitionsToClean.isEmpty()) { logger.info("Nothing to clean here mom. It is already clean"); return; } int cleanerParallelism = Math.min(partitionsToClean.size(), config.getCleanerParallelism()); int numFilesDeleted = jsc.parallelize(partitionsToClean, cleanerParallelism) - .map(new Function() { - @Override - public Integer call(String partitionPathToClean) throws Exception { - FileSystem fs = FSUtils.getFs(); - HoodieCleaner cleaner = new HoodieCleaner(metadata, config, fs); - return cleaner.clean(partitionPathToClean); - } - }).reduce(new Function2() { - @Override - public Integer call(Integer v1, Integer v2) throws Exception { - return v1 + v2; - } - }); + .map(new Function() { + @Override + public Integer call(String partitionPathToClean) throws Exception { + FileSystem fs = FSUtils.getFs(); + HoodieCleaner cleaner = new HoodieCleaner(metaClient, config, fs); + return cleaner.clean(partitionPathToClean); + } + }).reduce(new Function2() { + @Override + public Integer call(Integer v1, Integer v2) throws Exception { + return v1 + v2; + } + }); logger.info("Cleaned " + numFilesDeleted + " files"); // Emit metrics (duration, numFilesDeleted) if needed if (context != null) { @@ -556,21 +495,10 @@ public class HoodieWriteClient implements Seriali public void startCommitWithTime(String commitTime) { logger.info("Generate a new commit time " + commitTime); - // Create the in-flight commit file - Path inflightCommitFilePath = new Path( - config.getBasePath() + "/.hoodie/" + FSUtils.makeInflightCommitFileName(commitTime)); - try { - if (fs.createNewFile(inflightCommitFilePath)) { - logger.info("Create an inflight commit file " + inflightCommitFilePath); - return; - } - throw new HoodieCommitException( - "Failed to create the inflight commit file " + inflightCommitFilePath); - } catch (IOException e) { - // handled below - throw new HoodieCommitException( - "Failed to create the inflight commit file " + inflightCommitFilePath, e); - } + HoodieTableMetaClient metaClient 
= + new HoodieTableMetaClient(fs, config.getBasePath(), true); + HoodieTimeline commitTimeline = metaClient.getActiveCommitTimeline(); + commitTimeline.saveInstantAsInflight(commitTime); } public static SparkConf registerClasses(SparkConf conf) { @@ -606,10 +534,16 @@ public class HoodieWriteClient implements Seriali /** * Cleanup all inflight commits + * @throws IOException */ private void rollbackInflightCommits() { - final HoodieTableMetadata metadata = new HoodieTableMetadata(fs, config.getBasePath(), config.getTableName()); - for (String commit : metadata.getAllInflightCommits()) { + HoodieTableMetaClient metaClient = + new HoodieTableMetaClient(fs, config.getBasePath(), true); + HoodieTimeline commitTimeline = metaClient.getActiveCommitTimeline(); + + List commits = commitTimeline.getInflightInstants().collect(Collectors.toList()); + Collections.reverse(commits); + for (String commit : commits) { rollback(commit); } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/func/BulkInsertMapFunction.java b/hoodie-client/src/main/java/com/uber/hoodie/func/BulkInsertMapFunction.java index 55959270b..da6b526f1 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/func/BulkInsertMapFunction.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/func/BulkInsertMapFunction.java @@ -16,11 +16,11 @@ package com.uber.hoodie.func; +import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.WriteStatus; import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecordPayload; -import com.uber.hoodie.common.model.HoodieTableMetadata; import org.apache.spark.api.java.function.Function2; import java.util.Iterator; @@ -30,23 +30,23 @@ import java.util.List; /** * Map function that handles a sorted stream of HoodieRecords */ -public class BulkInsertMapFunction +public class InsertMapFunction implements Function2>, Iterator>> { private String commitTime; private 
HoodieWriteConfig config; - private HoodieTableMetadata metadata; + private HoodieTableMetaClient metaClient; - public BulkInsertMapFunction(String commitTime, HoodieWriteConfig config, - HoodieTableMetadata metadata) { + public InsertMapFunction(String commitTime, HoodieWriteConfig config, + HoodieTableMetaClient metaClient) { this.commitTime = commitTime; this.config = config; - this.metadata = metadata; + this.metaClient = metaClient; } @Override public Iterator> call(Integer partition, Iterator> sortedRecordItr) throws Exception { - return new LazyInsertIterable<>(sortedRecordItr, config, commitTime, metadata); + return new LazyInsertIterable<>(sortedRecordItr, config, commitTime, metaClient); } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/func/LazyInsertIterable.java b/hoodie-client/src/main/java/com/uber/hoodie/func/LazyInsertIterable.java index 579191651..ab369da7e 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/func/LazyInsertIterable.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/func/LazyInsertIterable.java @@ -16,11 +16,11 @@ package com.uber.hoodie.func; +import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.WriteStatus; import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecordPayload; -import com.uber.hoodie.common.model.HoodieTableMetadata; import com.uber.hoodie.io.HoodieIOHandle; import com.uber.hoodie.io.HoodieInsertHandle; @@ -40,17 +40,17 @@ public class LazyInsertIterable extends LazyItera private final HoodieWriteConfig hoodieConfig; private final String commitTime; - private final HoodieTableMetadata tableMetadata; + private final HoodieTableMetaClient metaClient; private Set partitionsCleaned; private HoodieInsertHandle handle; public LazyInsertIterable(Iterator> sortedRecordItr, HoodieWriteConfig config, - String commitTime, HoodieTableMetadata metadata) { + String commitTime, 
HoodieTableMetaClient metaClient) { super(sortedRecordItr); this.partitionsCleaned = new HashSet<>(); this.hoodieConfig = config; this.commitTime = commitTime; - this.tableMetadata = metadata; + this.metaClient = metaClient; } @Override protected void start() { @@ -78,7 +78,7 @@ public class LazyInsertIterable extends LazyItera // lazily initialize the handle, for the first time if (handle == null) { handle = - new HoodieInsertHandle(hoodieConfig, commitTime, tableMetadata, + new HoodieInsertHandle(hoodieConfig, commitTime, metaClient, record.getPartitionPath()); } @@ -90,7 +90,7 @@ public class LazyInsertIterable extends LazyItera statuses.add(handle.close()); // Need to handle the rejected record & open new handle handle = - new HoodieInsertHandle(hoodieConfig, commitTime, tableMetadata, + new HoodieInsertHandle(hoodieConfig, commitTime, metaClient, record.getPartitionPath()); handle.write(record); // we should be able to write 1 record. break; diff --git a/hoodie-client/src/main/java/com/uber/hoodie/index/HBaseIndex.java b/hoodie-client/src/main/java/com/uber/hoodie/index/HBaseIndex.java index 98e889b65..9be71cd4b 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/index/HBaseIndex.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/index/HBaseIndex.java @@ -17,12 +17,13 @@ package com.uber.hoodie.index; import com.google.common.base.Optional; +import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.WriteStatus; import com.uber.hoodie.common.model.HoodieKey; import com.uber.hoodie.common.model.HoodieRecordLocation; import com.uber.hoodie.common.model.HoodieRecordPayload; -import com.uber.hoodie.common.model.HoodieTableMetadata; import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.config.HoodieIndexConfig; @@ -65,7 +66,7 @@ public class HBaseIndex extends HoodieIndex { @Override public JavaPairRDD> 
fetchRecordLocation( - JavaRDD hoodieKeys, HoodieTableMetadata metadata) { + JavaRDD hoodieKeys, HoodieTableMetaClient metaClient) { throw new UnsupportedOperationException("HBase index does not implement check exist yet"); } @@ -91,10 +92,10 @@ public class HBaseIndex extends HoodieIndex { class LocationTagFunction implements Function2>, Iterator>> { - private final HoodieTableMetadata metadata; + private final HoodieTableMetaClient metaClient; - LocationTagFunction(HoodieTableMetadata metadata) { - this.metadata = metadata; + LocationTagFunction(HoodieTableMetaClient metaClient) { + this.metaClient = metaClient; } @Override @@ -127,8 +128,9 @@ public class HBaseIndex extends HoodieIndex { String fileId = Bytes.toString(result.getValue(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN)); + HoodieTimeline commitTimeline = metaClient.getActiveCommitTimeline(); // if the last commit ts for this row is less than the system commit ts - if (!metadata.isCommitsEmpty() && metadata.isCommitTsSafe(commitTs)) { + if (commitTimeline.hasInstants() && commitTimeline.containsInstant(commitTs)) { rec.setCurrentLocation(new HoodieRecordLocation(commitTs, fileId)); } } @@ -155,8 +157,8 @@ public class HBaseIndex extends HoodieIndex { @Override public JavaRDD> tagLocation(JavaRDD> recordRDD, - HoodieTableMetadata metadata) { - return recordRDD.mapPartitionsWithIndex(this.new LocationTagFunction(metadata), true); + HoodieTableMetaClient metaClient) { + return recordRDD.mapPartitionsWithIndex(this.new LocationTagFunction(metaClient), true); } class UpdateLocationTask implements Function2, Iterator> { @@ -217,7 +219,7 @@ public class HBaseIndex extends HoodieIndex { @Override public JavaRDD updateLocation(JavaRDD writeStatusRDD, - HoodieTableMetadata metadata) { + HoodieTableMetaClient metaClient) { return writeStatusRDD.mapPartitionsWithIndex(new UpdateLocationTask(), true); } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/index/HoodieBloomIndex.java 
b/hoodie-client/src/main/java/com/uber/hoodie/index/HoodieBloomIndex.java index e5622d99e..c5fd18ee2 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/index/HoodieBloomIndex.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/index/HoodieBloomIndex.java @@ -19,16 +19,18 @@ package com.uber.hoodie.index; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Optional; +import com.uber.hoodie.common.model.HoodieDataFile; +import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.table.TableFileSystemView; +import com.uber.hoodie.common.table.view.ReadOptimizedTableView; import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.WriteStatus; import com.uber.hoodie.common.model.HoodieKey; import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecordLocation; import com.uber.hoodie.common.model.HoodieRecordPayload; -import com.uber.hoodie.common.model.HoodieTableMetadata; import com.uber.hoodie.common.util.FSUtils; -import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; @@ -43,6 +45,7 @@ import org.apache.spark.api.java.function.PairFunction; import scala.Tuple2; import java.util.*; +import java.util.stream.Collectors; /** * Indexing mechanism based on bloom filter. 
Each parquet file includes its row_key bloom filter in @@ -63,10 +66,7 @@ public class HoodieBloomIndex extends HoodieIndex } @Override - /** - * - */ - public JavaRDD> tagLocation(JavaRDD> recordRDD, final HoodieTableMetadata metadata) { + public JavaRDD> tagLocation(JavaRDD> recordRDD, final HoodieTableMetaClient metaClient) { // Step 1: Extract out thinner JavaPairRDD of (partitionPath, recordKey) JavaPairRDD partitionRecordKeyPairRDD = recordRDD @@ -79,7 +79,7 @@ public class HoodieBloomIndex extends HoodieIndex // Lookup indexes for all the partition/recordkey pair JavaPairRDD rowKeyFilenamePairRDD = - lookupIndex(partitionRecordKeyPairRDD, metadata); + lookupIndex(partitionRecordKeyPairRDD, metaClient); // Cache the result, for subsequent stages. rowKeyFilenamePairRDD.cache(); @@ -93,7 +93,7 @@ public class HoodieBloomIndex extends HoodieIndex } public JavaPairRDD> fetchRecordLocation( - JavaRDD hoodieKeys, final HoodieTableMetadata metadata) { + JavaRDD hoodieKeys, final HoodieTableMetaClient metaClient) { JavaPairRDD partitionRecordKeyPairRDD = hoodieKeys.mapToPair(new PairFunction() { @Override @@ -104,7 +104,7 @@ public class HoodieBloomIndex extends HoodieIndex // Lookup indexes for all the partition/recordkey pair JavaPairRDD rowKeyFilenamePairRDD = - lookupIndex(partitionRecordKeyPairRDD, metadata); + lookupIndex(partitionRecordKeyPairRDD, metaClient); JavaPairRDD rowKeyHoodieKeyPairRDD = hoodieKeys.mapToPair(new PairFunction() { @@ -115,17 +115,17 @@ public class HoodieBloomIndex extends HoodieIndex }); return rowKeyHoodieKeyPairRDD.leftOuterJoin(rowKeyFilenamePairRDD).mapToPair( - new PairFunction>>, HoodieKey, Optional>() { + new PairFunction>>, HoodieKey, Optional>() { @Override public Tuple2> call( - Tuple2>> keyPathTuple) + Tuple2>> keyPathTuple) throws Exception { Optional recordLocationPath; if (keyPathTuple._2._2.isPresent()) { String fileName = keyPathTuple._2._2.get(); String partitionPath = keyPathTuple._2._1.getPartitionPath(); 
recordLocationPath = Optional - .of(new Path(new Path(metadata.getBasePath(), partitionPath), fileName) + .of(new Path(new Path(metaClient.getBasePath(), partitionPath), fileName) .toUri().getPath()); } else { recordLocationPath = Optional.absent(); @@ -140,19 +140,19 @@ public class HoodieBloomIndex extends HoodieIndex * record keys already present and drop the record keys if not present * * @param partitionRecordKeyPairRDD - * @param metadata + * @param metaClient * @return */ private JavaPairRDD lookupIndex( - JavaPairRDD partitionRecordKeyPairRDD, final HoodieTableMetadata metadata) { + JavaPairRDD partitionRecordKeyPairRDD, final HoodieTableMetaClient metaClient) { // Obtain records per partition, in the incoming records - Map recordsPerPartition = partitionRecordKeyPairRDD.countByKey(); + Map recordsPerPartition = partitionRecordKeyPairRDD.countByKey(); List affectedPartitionPathList = new ArrayList<>(recordsPerPartition.keySet()); // Step 2: Load all involved files as pairs JavaPairRDD partitionFilePairRDD = - loadInvolvedFiles(affectedPartitionPathList, metadata); - Map filesPerPartition = partitionFilePairRDD.countByKey(); + loadInvolvedFiles(affectedPartitionPathList, metaClient); + Map filesPerPartition = partitionFilePairRDD.countByKey(); // Compute total subpartitions, to split partitions into. Map subpartitionCountMap = @@ -174,7 +174,7 @@ public class HoodieBloomIndex extends HoodieIndex * @param filesPerPartition * @return */ - private Map computeSubPartitions(Map recordsPerPartition, Map filesPerPartition) { + private Map computeSubPartitions(Map recordsPerPartition, Map filesPerPartition) { Map subpartitionCountMap = new HashMap<>(); long totalRecords = 0; long totalFiles = 0; @@ -210,21 +210,28 @@ public class HoodieBloomIndex extends HoodieIndex * Load all involved files as pair RDD. 
*/ @VisibleForTesting - JavaPairRDD loadInvolvedFiles(List partitions, final HoodieTableMetadata metadata) { + JavaPairRDD loadInvolvedFiles(List partitions, + final HoodieTableMetaClient metaClient) { return jsc.parallelize(partitions, Math.max(partitions.size(), 1)) - .flatMapToPair(new PairFlatMapFunction() { - @Override - public Iterator> call(String partitionPath) { - FileSystem fs = FSUtils.getFs(); - String latestCommitTime = metadata.getAllCommits().lastCommit(); - FileStatus[] filteredStatus = metadata.getLatestVersionInPartition(fs, partitionPath, latestCommitTime); - List> list = new ArrayList<>(); - for (FileStatus fileStatus : filteredStatus) { - list.add(new Tuple2<>(partitionPath, fileStatus.getPath().getName())); + .flatMapToPair(new PairFlatMapFunction() { + @Override + public Iterable> call(String partitionPath) { + FileSystem fs = FSUtils.getFs(); + TableFileSystemView view = new ReadOptimizedTableView(fs, metaClient); + java.util.Optional latestCommitTime = + metaClient.getActiveCommitTimeline().lastInstant(); + List> list = new ArrayList<>(); + if (latestCommitTime.isPresent()) { + List filteredFiles = + view.streamLatestVersionInPartition(partitionPath, + latestCommitTime.get()).collect(Collectors.toList()); + for (HoodieDataFile file : filteredFiles) { + list.add(new Tuple2<>(partitionPath, file.getFileName())); } - return list.iterator(); } - }); + return list; + } + }); } @Override @@ -261,8 +268,8 @@ public class HoodieBloomIndex extends HoodieIndex }) .flatMapToPair(new PairFlatMapFunction>, String, String>() { @Override - public Iterator> call(List> exploded) throws Exception { - return exploded.iterator(); + public Iterable> call(List> exploded) throws Exception { + return exploded; } }); @@ -323,7 +330,7 @@ public class HoodieBloomIndex extends HoodieIndex /** * Find out pair. All workload grouped by file-level. 
* - * // Join PairRDD(PartitionPath, RecordKey) and PairRDD(PartitionPath, File) and then repartition such that + * // Join PairRDD(PartitionPath, RecordKey) and PairRDD(PartitionPath, File) & then repartition such that // each RDD partition is a file, then for each file, we do (1) load bloom filter, (2) load rowKeys, (3) Tag rowKey // Make sure the parallelism is atleast the groupby parallelism for tagging location */ @@ -362,9 +369,9 @@ public class HoodieBloomIndex extends HoodieIndex .mapPartitionsWithIndex(new HoodieBloomIndexCheckFunction(config.getBasePath()), true) .flatMap(new FlatMapFunction, IndexLookupResult>() { @Override - public Iterator call(List indexLookupResults) + public Iterable call(List indexLookupResults) throws Exception { - return indexLookupResults.iterator(); + return indexLookupResults; } }).filter(new Function() { @Override @@ -373,13 +380,13 @@ public class HoodieBloomIndex extends HoodieIndex } }).flatMapToPair(new PairFlatMapFunction() { @Override - public Iterator> call(IndexLookupResult lookupResult) + public Iterable> call(IndexLookupResult lookupResult) throws Exception { List> vals = new ArrayList<>(); for (String recordKey : lookupResult.getMatchingRecordKeys()) { vals.add(new Tuple2<>(recordKey, lookupResult.getFileName())); } - return vals.iterator(); + return vals; } }); } @@ -399,9 +406,9 @@ public class HoodieBloomIndex extends HoodieIndex // Here as the recordRDD might have more data than rowKeyRDD (some rowKeys' fileId is null), so we do left outer join. 
return rowKeyRecordPairRDD.leftOuterJoin(rowKeyFilenamePairRDD).values().map( - new Function, org.apache.spark.api.java.Optional>, HoodieRecord>() { + new Function, Optional>, HoodieRecord>() { @Override - public HoodieRecord call(Tuple2, org.apache.spark.api.java.Optional> v1) throws Exception { + public HoodieRecord call(Tuple2, Optional> v1) throws Exception { HoodieRecord record = v1._1(); if (v1._2().isPresent()) { String filename = v1._2().get(); @@ -416,7 +423,7 @@ public class HoodieBloomIndex extends HoodieIndex } @Override - public JavaRDD updateLocation(JavaRDD writeStatusRDD, HoodieTableMetadata metadata) { + public JavaRDD updateLocation(JavaRDD writeStatusRDD, HoodieTableMetaClient metaClient) { return writeStatusRDD; } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/index/HoodieIndex.java b/hoodie-client/src/main/java/com/uber/hoodie/index/HoodieIndex.java index 17a1d26ad..7cc0a3404 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/index/HoodieIndex.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/index/HoodieIndex.java @@ -17,11 +17,12 @@ package com.uber.hoodie.index; import com.google.common.base.Optional; +import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.WriteStatus; import com.uber.hoodie.common.model.HoodieKey; import com.uber.hoodie.common.model.HoodieRecordPayload; -import com.uber.hoodie.common.model.HoodieTableMetadata; import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.exception.HoodieIndexException; @@ -58,18 +59,18 @@ public abstract class HoodieIndex implements Seri * value is present, it is the path component (without scheme) of the URI underlying file * * @param hoodieKeys - * @param metadata + * @param metaClient * @return */ public abstract JavaPairRDD> fetchRecordLocation( - JavaRDD hoodieKeys, final HoodieTableMetadata metadata); + JavaRDD hoodieKeys, 
final HoodieTableMetaClient metaClient); /** * Looks up the index and tags each incoming record with a location of a file that contains the * row (if it is actually present) */ public abstract JavaRDD> tagLocation(JavaRDD> recordRDD, - HoodieTableMetadata metadata) throws + HoodieTableMetaClient metaClient) throws HoodieIndexException; /** @@ -78,8 +79,7 @@ public abstract class HoodieIndex implements Seri * TODO(vc): We may need to propagate the record as well in a WriteStatus class */ public abstract JavaRDD updateLocation(JavaRDD writeStatusRDD, - HoodieTableMetadata metadata) throws - HoodieIndexException; + HoodieTableMetaClient metaClient) throws HoodieIndexException; /** * Rollback the efffects of the commit made at commitTime. diff --git a/hoodie-client/src/main/java/com/uber/hoodie/index/InMemoryHashIndex.java b/hoodie-client/src/main/java/com/uber/hoodie/index/InMemoryHashIndex.java index 775aaf9fb..b3b13d25b 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/index/InMemoryHashIndex.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/index/InMemoryHashIndex.java @@ -17,13 +17,13 @@ package com.uber.hoodie.index; import com.google.common.base.Optional; +import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.WriteStatus; import com.uber.hoodie.common.model.HoodieKey; import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecordLocation; import com.uber.hoodie.common.model.HoodieRecordPayload; -import com.uber.hoodie.common.model.HoodieTableMetadata; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; @@ -55,7 +55,7 @@ public class InMemoryHashIndex extends HoodieInde @Override public JavaPairRDD> fetchRecordLocation( - JavaRDD hoodieKeys, final HoodieTableMetadata metadata) { + JavaRDD hoodieKeys, final HoodieTableMetaClient metaClient) { throw new UnsupportedOperationException("InMemory index does not 
implement check exist yet"); } @@ -81,13 +81,13 @@ public class InMemoryHashIndex extends HoodieInde @Override public JavaRDD> tagLocation(JavaRDD> recordRDD, - HoodieTableMetadata metadata) { + HoodieTableMetaClient metaClient) { return recordRDD.mapPartitionsWithIndex(this.new LocationTagFunction(), true); } @Override public JavaRDD updateLocation(JavaRDD writeStatusRDD, - HoodieTableMetadata metadata) { + HoodieTableMetaClient metaClient) { return writeStatusRDD.map(new Function() { @Override public WriteStatus call(WriteStatus writeStatus) { diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCleaner.java b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCleaner.java index e2dd4d77f..c47188b79 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCleaner.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCleaner.java @@ -16,9 +16,12 @@ package com.uber.hoodie.io; +import com.uber.hoodie.common.model.HoodieDataFile; +import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.table.HoodieTimeline; +import com.uber.hoodie.common.table.TableFileSystemView; +import com.uber.hoodie.common.table.view.ReadOptimizedTableView; import com.uber.hoodie.config.HoodieWriteConfig; -import com.uber.hoodie.common.model.HoodieCommits; -import com.uber.hoodie.common.model.HoodieTableMetadata; import com.uber.hoodie.common.util.FSUtils; import org.apache.hadoop.fs.FileStatus; @@ -31,7 +34,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; import java.util.List; -import java.util.Map; +import java.util.stream.Collectors; /** * Cleaner is responsible for garbage collecting older files in a given partition path, such that @@ -45,26 +48,25 @@ import java.util.Map; * */ public class HoodieCleaner { + private static Logger logger = LogManager.getLogger(HoodieCleaner.class); public enum CleaningPolicy { KEEP_LATEST_FILE_VERSIONS, KEEP_LATEST_COMMITS } - - private static Logger 
logger = LogManager.getLogger(HoodieCleaner.class); - - - private HoodieTableMetadata metadata; - + private final TableFileSystemView fileSystemView; + private final HoodieTimeline commitTimeline; + private HoodieTableMetaClient metaClient; private HoodieWriteConfig config; - private FileSystem fs; - public HoodieCleaner(HoodieTableMetadata metadata, + public HoodieCleaner(HoodieTableMetaClient metaClient, HoodieWriteConfig config, FileSystem fs) { - this.metadata = metadata; + this.metaClient = metaClient; + this.fileSystemView = new ReadOptimizedTableView(fs, metaClient); + this.commitTimeline = metaClient.getActiveCommitTimeline(); this.config = config; this.fs = fs; } @@ -83,13 +85,13 @@ public class HoodieCleaner { */ private List getFilesToCleanKeepingLatestVersions(String partitionPath) throws IOException { logger.info("Cleaning "+ partitionPath+", retaining latest "+ config.getCleanerFileVersionsRetained()+" file versions. "); - Map> fileVersions = metadata.getAllVersionsInPartition(fs, partitionPath); + List> fileVersions = fileSystemView.streamEveryVersionInPartition(partitionPath).collect( + Collectors.toList()); List deletePaths = new ArrayList<>(); - for (String file : fileVersions.keySet()) { - List commitList = fileVersions.get(file); + for (List versionsForFileId : fileVersions) { int keepVersions = config.getCleanerFileVersionsRetained(); - Iterator commitItr = commitList.iterator(); + Iterator commitItr = versionsForFileId.iterator(); while (commitItr.hasNext() && keepVersions > 0) { // Skip this most recent version commitItr.next(); @@ -100,7 +102,7 @@ public class HoodieCleaner { deletePaths.add(String.format("%s/%s/%s", config.getBasePath(), partitionPath, - commitItr.next().getPath().getName())); + commitItr.next().getFileName())); } } return deletePaths; @@ -133,22 +135,20 @@ public class HoodieCleaner { List deletePaths = new ArrayList<>(); // determine if we have enough commits, to start cleaning. 
- HoodieCommits commits = metadata.getAllCommits(); - if (commits.getNumCommits() > commitsRetained) { + if (commitTimeline.getTotalInstants() > commitsRetained) { String earliestCommitToRetain = - commits.nthCommit(commits.getNumCommits() - commitsRetained); - Map> fileVersions = - metadata.getAllVersionsInPartition(fs, partitionPath); - for (String file : fileVersions.keySet()) { - List fileList = fileVersions.get(file); - String lastVersion = FSUtils.getCommitTime(fileList.get(0).getPath().getName()); + commitTimeline.nthInstant(commitTimeline.getTotalInstants() - commitsRetained).get(); + List> fileVersions = + fileSystemView.streamEveryVersionInPartition(partitionPath).collect(Collectors.toList()); + for (List fileList : fileVersions) { + String lastVersion = FSUtils.getCommitTime(fileList.get(0).getFileName()); String lastVersionBeforeEarliestCommitToRetain = getLatestVersionBeforeCommit(fileList, earliestCommitToRetain); // Ensure there are more than 1 version of the file (we only clean old files from updates) // i.e always spare the last commit. - for (FileStatus afile : fileList) { - String fileCommitTime = FSUtils.getCommitTime(afile.getPath().getName()); + for (HoodieDataFile afile : fileList) { + String fileCommitTime = afile.getCommitTime(); // Dont delete the latest commit and also the last commit before the earliest commit we are retaining // The window of commit retain == max query run time. So a query could be running which still // uses this file. @@ -160,11 +160,12 @@ public class HoodieCleaner { } // Always keep the last commit - if (HoodieCommits.isCommit1After(earliestCommitToRetain, fileCommitTime)) { + if (commitTimeline.compareInstants(earliestCommitToRetain, fileCommitTime, + HoodieTimeline.GREATER)) { // this is a commit, that should be cleaned. 
deletePaths.add(String - .format("%s/%s/%s", config.getBasePath(), partitionPath, - FSUtils.maskWithoutTaskPartitionId(fileCommitTime, file))); + .format("%s/%s/%s", config.getBasePath(), partitionPath, FSUtils + .maskWithoutTaskPartitionId(fileCommitTime, afile.getFileId()))); } } } @@ -176,10 +177,10 @@ public class HoodieCleaner { /** * Gets the latest version < commitTime. This version file could still be used by queries. */ - private String getLatestVersionBeforeCommit(List fileList, String commitTime) { - for (FileStatus file : fileList) { - String fileCommitTime = FSUtils.getCommitTime(file.getPath().getName()); - if (HoodieCommits.isCommit1After(commitTime, fileCommitTime)) { + private String getLatestVersionBeforeCommit(List fileList, String commitTime) { + for (HoodieDataFile file : fileList) { + String fileCommitTime = FSUtils.getCommitTime(file.getFileName()); + if (commitTimeline.compareInstants(commitTime, fileCommitTime, HoodieTimeline.GREATER)) { // fileList is sorted on the reverse, so the first commit we find <= commitTime is the one we want return fileCommitTime; } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCommitArchiveLog.java b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCommitArchiveLog.java index 679314e7a..cc000410a 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCommitArchiveLog.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCommitArchiveLog.java @@ -16,10 +16,13 @@ package com.uber.hoodie.io; +import com.uber.hoodie.common.model.HoodieCommitMetadata; +import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.table.HoodieTimeline; +import com.uber.hoodie.common.table.timeline.HoodieActiveCommitTimeline; +import com.uber.hoodie.common.table.timeline.HoodieArchivedCommitTimeline; import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.common.file.HoodieAppendLog; -import com.uber.hoodie.common.model.HoodieTableMetadata; -import 
com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.exception.HoodieCommitException; import com.uber.hoodie.exception.HoodieIOException; import org.apache.hadoop.fs.FileSystem; @@ -30,65 +33,70 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import java.io.IOException; -import java.util.ArrayList; import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; /** * Log to hold older historical commits, to bound the growth of .commit files */ public class HoodieCommitArchiveLog { private static Logger log = LogManager.getLogger(HoodieCommitArchiveLog.class); - private static final String HOODIE_COMMIT_ARCHIVE_LOG_FILE = "commits.archived"; private final Path archiveFilePath; private final FileSystem fs; private final HoodieWriteConfig config; - public HoodieCommitArchiveLog(HoodieWriteConfig config) { - this.archiveFilePath = - new Path(config.getBasePath(), - HoodieTableMetadata.METAFOLDER_NAME + "/" +HOODIE_COMMIT_ARCHIVE_LOG_FILE); - this.fs = FSUtils.getFs(); + public HoodieCommitArchiveLog(HoodieWriteConfig config, + FileSystem fs) { + this.fs = fs; this.config = config; + this.archiveFilePath = HoodieArchivedCommitTimeline + .getArchiveLogPath(config.getBasePath() + "/" + HoodieTableMetaClient.METAFOLDER_NAME); } /** * Check if commits need to be archived. If yes, archive commits. 
*/ public boolean archiveIfRequired() { - HoodieTableMetadata metadata = new HoodieTableMetadata(fs, config.getBasePath()); - List commitsToArchive = getCommitsToArchive(metadata); - if (!commitsToArchive.isEmpty()) { + List commitsToArchive = getCommitsToArchive().collect(Collectors.toList()); + if (commitsToArchive.iterator().hasNext()) { log.info("Archiving commits " + commitsToArchive); - archive(metadata, commitsToArchive); - return deleteCommits(metadata, commitsToArchive); + archive(commitsToArchive); + return deleteCommits(commitsToArchive); } else { log.info("No Commits to archive"); return true; } } - private List getCommitsToArchive(HoodieTableMetadata metadata) { + private Stream getCommitsToArchive() { int maxCommitsToKeep = config.getMaxCommitsToKeep(); int minCommitsToKeep = config.getMinCommitsToKeep(); - List commits = metadata.getAllCommits().getCommitList(); - List commitsToArchive = new ArrayList(); - if (commits.size() > maxCommitsToKeep) { + HoodieTableMetaClient metaClient = + new HoodieTableMetaClient(fs, config.getBasePath(), true); + HoodieTimeline commitTimeline = metaClient.getActiveCommitTimeline(); + + if (commitTimeline.hasInstants() && commitTimeline.getTotalInstants() > maxCommitsToKeep) { // Actually do the commits - commitsToArchive = commits.subList(0, commits.size() - minCommitsToKeep); + return commitTimeline.getInstants() + .limit(commitTimeline.getTotalInstants() - minCommitsToKeep); } - return commitsToArchive; + return Stream.empty(); } - private boolean deleteCommits(HoodieTableMetadata metadata, List commitsToArchive) { + private boolean deleteCommits(List commitsToArchive) { log.info("Deleting commits " + commitsToArchive); + HoodieTableMetaClient metaClient = + new HoodieTableMetaClient(fs, config.getBasePath(), true); + HoodieTimeline commitTimeline = metaClient.getActiveCommitTimeline(); + boolean success = true; for(String commitToArchive:commitsToArchive) { - Path commitFile = - new Path(metadata.getBasePath() + "/" 
+ - HoodieTableMetadata.METAFOLDER_NAME + "/" + - FSUtils.makeCommitFileName(commitToArchive)); + Path commitFile = new Path(metaClient.getMetaPath(), + ((HoodieActiveCommitTimeline) commitTimeline) + .getCompletedFileName(commitToArchive)); try { if (fs.exists(commitFile)) { success &= fs.delete(commitFile, false); @@ -112,14 +120,19 @@ public class HoodieCommitArchiveLog { .compression(HoodieAppendLog.CompressionType.RECORD, new BZip2Codec())); } - private void archive(HoodieTableMetadata metadata, List commits) - throws HoodieCommitException { + private void archive(List commits) throws HoodieCommitException { + HoodieTableMetaClient metaClient = + new HoodieTableMetaClient(fs, config.getBasePath(), true); + HoodieTimeline commitTimeline = metaClient.getActiveCommitTimeline(); + HoodieAppendLog.Writer writer = null; try { writer = openWriter(); for (String commitTime : commits) { Text k = new Text(commitTime); - Text v = new Text(metadata.getCommitMetadata(commitTime).toJsonString()); + HoodieCommitMetadata commitMetadata = HoodieCommitMetadata + .fromBytes(commitTimeline.readInstantDetails(commitTime).get()); + Text v = new Text(commitMetadata.toJsonString()); writer.append(k, v); log.info("Wrote " + k); } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieIOHandle.java b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieIOHandle.java index 28f3f86ea..196f199d5 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieIOHandle.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieIOHandle.java @@ -16,9 +16,12 @@ package com.uber.hoodie.io; +import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.table.HoodieTimeline; +import com.uber.hoodie.common.table.TableFileSystemView; +import com.uber.hoodie.common.table.view.ReadOptimizedTableView; import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.common.model.HoodieRecordPayload; -import 
com.uber.hoodie.common.model.HoodieTableMetadata; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.common.util.HoodieAvroUtils; import com.uber.hoodie.exception.HoodieIOException; @@ -36,15 +39,19 @@ public abstract class HoodieIOHandle { protected final String commitTime; protected final HoodieWriteConfig config; protected final FileSystem fs; - protected final HoodieTableMetadata metadata; + protected final HoodieTableMetaClient metaClient; + protected final HoodieTimeline hoodieTimeline; + protected final TableFileSystemView fileSystemView; protected final Schema schema; public HoodieIOHandle(HoodieWriteConfig config, String commitTime, - HoodieTableMetadata metadata) { + HoodieTableMetaClient metaClient) { this.commitTime = commitTime; this.config = config; this.fs = FSUtils.getFs(); - this.metadata = metadata; + this.metaClient = metaClient; + this.hoodieTimeline = metaClient.getActiveCommitTimeline(); + this.fileSystemView = new ReadOptimizedTableView(fs, metaClient); this.schema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema())); } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieInsertHandle.java b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieInsertHandle.java index 0b3e862d8..e1a787dad 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieInsertHandle.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieInsertHandle.java @@ -16,12 +16,12 @@ package com.uber.hoodie.io; +import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.WriteStatus; import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecordLocation; import com.uber.hoodie.common.model.HoodieRecordPayload; -import com.uber.hoodie.common.model.HoodieTableMetadata; import com.uber.hoodie.common.model.HoodieWriteStat; import com.uber.hoodie.common.util.FSUtils; import 
com.uber.hoodie.exception.HoodieInsertException; @@ -45,7 +45,7 @@ public class HoodieInsertHandle extends HoodieIOH private int recordsWritten = 0; public HoodieInsertHandle(HoodieWriteConfig config, String commitTime, - HoodieTableMetadata metadata, String partitionPath) { + HoodieTableMetaClient metadata, String partitionPath) { super(config, commitTime, metadata); this.status = new WriteStatus(); status.setFileId(UUID.randomUUID().toString()); diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieUpdateHandle.java b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieUpdateHandle.java index 92dbff783..0c7fdfe2a 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieUpdateHandle.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieUpdateHandle.java @@ -16,12 +16,12 @@ package com.uber.hoodie.io; +import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.WriteStatus; import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecordLocation; import com.uber.hoodie.common.model.HoodieRecordPayload; -import com.uber.hoodie.common.model.HoodieTableMetadata; import com.uber.hoodie.common.model.HoodieWriteStat; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.exception.HoodieUpsertException; @@ -52,10 +52,10 @@ import java.util.Iterator; public HoodieUpdateHandle(HoodieWriteConfig config, String commitTime, - HoodieTableMetadata metadata, + HoodieTableMetaClient metaClient, Iterator> recordItr, String fileId) { - super(config, commitTime, metadata); + super(config, commitTime, metaClient); WriteStatus writeStatus = new WriteStatus(); writeStatus.setStat(new HoodieWriteStat()); this.writeStatus = writeStatus; @@ -74,7 +74,9 @@ import java.util.Iterator; HoodieRecord record = newRecordsItr.next(); // If the first record, we need to extract some info out if (oldFilePath == null) { - String latestValidFilePath = 
metadata.getFilenameForRecord(fs, record, fileId); + String latestValidFilePath = fileSystemView + .getLatestDataFilesForFileId(record.getPartitionPath(), fileId).findFirst() + .get().getFileName(); writeStatus.getStat().setPrevCommit(FSUtils.getCommitTime(latestValidFilePath)); oldFilePath = new Path( config.getBasePath() + "/" + record.getPartitionPath() + "/" @@ -102,14 +104,14 @@ import java.util.Iterator; } // Create the writer for writing the new version file storageWriter = HoodieStorageWriterFactory - .getStorageWriter(commitTime, newFilePath, metadata, config, schema); + .getStorageWriter(commitTime, newFilePath, metaClient, config, schema); } catch (Exception e) { logger.error("Error in update task at commit " + commitTime, e); writeStatus.setGlobalError(e); throw new HoodieUpsertException( "Failed to initialize HoodieUpdateHandle for FileId: " + fileId + " on commit " - + commitTime + " on HDFS path " + metadata.getBasePath()); + + commitTime + " on HDFS path " + metaClient.getBasePath()); } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieStorageWriterFactory.java b/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieStorageWriterFactory.java index c393b638f..3fedde19f 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieStorageWriterFactory.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieStorageWriterFactory.java @@ -16,11 +16,11 @@ package com.uber.hoodie.io.storage; +import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.avro.HoodieAvroWriteSupport; import com.uber.hoodie.common.BloomFilter; import com.uber.hoodie.common.model.HoodieRecordPayload; -import com.uber.hoodie.common.model.HoodieTableMetadata; import com.uber.hoodie.common.util.FSUtils; import org.apache.avro.Schema; import org.apache.avro.generic.IndexedRecord; @@ -32,7 +32,7 @@ import java.io.IOException; public class 
HoodieStorageWriterFactory { public static HoodieStorageWriter getStorageWriter( - String commitTime, Path path, HoodieTableMetadata metadata, HoodieWriteConfig config, Schema schema) + String commitTime, Path path, HoodieTableMetaClient metaClient, HoodieWriteConfig config, Schema schema) throws IOException { //TODO - based on the metadata choose the implementation of HoodieStorageWriter // Currently only parquet is supported diff --git a/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieCopyOnWriteTable.java b/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieCopyOnWriteTable.java index d8edb000b..f1de70095 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieCopyOnWriteTable.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieCopyOnWriteTable.java @@ -16,6 +16,11 @@ package com.uber.hoodie.table; +import com.uber.hoodie.common.model.HoodieDataFile; +import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.table.HoodieTimeline; +import com.uber.hoodie.common.table.TableFileSystemView; +import com.uber.hoodie.common.table.view.ReadOptimizedTableView; import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.WriteStatus; import com.uber.hoodie.common.model.HoodieCommitMetadata; @@ -23,7 +28,6 @@ import com.uber.hoodie.common.model.HoodieKey; import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecordLocation; import com.uber.hoodie.common.model.HoodieRecordPayload; -import com.uber.hoodie.common.model.HoodieTableMetadata; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.exception.HoodieInsertException; import com.uber.hoodie.exception.HoodieUpsertException; @@ -33,7 +37,6 @@ import com.uber.hoodie.io.HoodieUpdateHandle; import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; import 
org.apache.hadoop.fs.FileSystem; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -52,6 +55,7 @@ import java.util.List; import java.util.Map; import java.util.Random; import java.util.Set; +import java.util.stream.Collectors; import scala.Option; import scala.Tuple2; @@ -133,8 +137,8 @@ public class HoodieCopyOnWriteTable extends Hoodi } - public HoodieCopyOnWriteTable(String commitTime, HoodieWriteConfig config, HoodieTableMetadata metadata) { - super(commitTime, config, metadata); + public HoodieCopyOnWriteTable(String commitTime, HoodieWriteConfig config, HoodieTableMetaClient metaClient) { + super(commitTime, config, metaClient); } /** @@ -287,21 +291,22 @@ public class HoodieCopyOnWriteTable extends Hoodi FileSystem fs = FSUtils.getFs(); List smallFileLocations = new ArrayList<>(); - if (metadata.getAllCommits().getNumCommits() > 0) { // if we have some commits - String latestCommitTime = metadata.getAllCommits().lastCommit(); - FileStatus[] allFiles = metadata.getLatestVersionInPartition(fs, partitionPath, latestCommitTime); + HoodieTimeline commitTimeline = metaClient.getActiveCommitTimeline(); + TableFileSystemView fileSystemView = new ReadOptimizedTableView(fs, metaClient); - if (allFiles != null && allFiles.length > 0) { - for (FileStatus fileStatus : allFiles) { - if (fileStatus.getLen() < config.getParquetSmallFileLimit()) { - String filename = fileStatus.getPath().getName(); - SmallFile sf = new SmallFile(); - sf.location = new HoodieRecordLocation( - FSUtils.getCommitTime(filename), - FSUtils.getFileId(filename)); - sf.sizeBytes = fileStatus.getLen(); - smallFileLocations.add(sf); - } + if (commitTimeline.hasInstants()) { // if we have some commits + String latestCommitTime = commitTimeline.lastInstant().get(); + List allFiles = fileSystemView.streamLatestVersionInPartition(partitionPath, latestCommitTime).collect( + Collectors.toList()); + + for (HoodieDataFile file : allFiles) { + if (file.getFileSize() < 
config.getParquetSmallFileLimit()) { + String filename = file.getFileName(); + SmallFile sf = new SmallFile(); + sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename), + FSUtils.getFileId(filename)); + sf.sizeBytes = file.getFileSize(); + smallFileLocations.add(sf); } } } @@ -317,11 +322,15 @@ public class HoodieCopyOnWriteTable extends Hoodi */ private long averageBytesPerRecord() { long avgSize = 0L; + HoodieTimeline commitTimeline = metaClient.getActiveCommitTimeline(); try { - if (metadata.getAllCommits().getNumCommits() > 0) { - String latestCommitTime = metadata.getAllCommits().lastCommit(); - HoodieCommitMetadata commitMetadata = metadata.getCommitMetadata(latestCommitTime); - avgSize =(long) Math.ceil((1.0 * commitMetadata.fetchTotalBytesWritten())/commitMetadata.fetchTotalRecordsWritten()); + if (commitTimeline.hasInstants()) { + String latestCommitTime = commitTimeline.lastInstant().get(); + HoodieCommitMetadata commitMetadata = HoodieCommitMetadata + .fromBytes(commitTimeline.readInstantDetails(latestCommitTime).get()); + avgSize = (long) Math.ceil( + (1.0 * commitMetadata.fetchTotalBytesWritten()) / commitMetadata + .fetchTotalRecordsWritten()); } } catch (Throwable t) { // make this fail safe. 
@@ -389,7 +398,7 @@ public class HoodieCopyOnWriteTable extends Hoodi public Iterator> handleUpdate(String fileLoc, Iterator> recordItr) throws Exception { // these are updates HoodieUpdateHandle upsertHandle = - new HoodieUpdateHandle<>(config, commitTime, metadata, recordItr, fileLoc); + new HoodieUpdateHandle<>(config, commitTime, metaClient, recordItr, fileLoc); if (upsertHandle.getOldFilePath() == null) { throw new HoodieUpsertException("Error in finding the old file path at commit " + commitTime +" at fileLoc: " + fileLoc); @@ -424,7 +433,7 @@ public class HoodieCopyOnWriteTable extends Hoodi } public Iterator> handleInsert(Iterator> recordItr) throws Exception { - return new LazyInsertIterable<>(recordItr, config, commitTime, metadata); + return new LazyInsertIterable<>(recordItr, config, commitTime, metaClient); } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieTable.java b/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieTable.java index c79a55144..557882357 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieTable.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieTable.java @@ -16,11 +16,11 @@ package com.uber.hoodie.table; +import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.WriteStatus; import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecordPayload; -import com.uber.hoodie.common.model.HoodieTableMetadata; import com.uber.hoodie.common.model.HoodieTableType; import com.uber.hoodie.exception.HoodieException; @@ -39,12 +39,13 @@ public abstract class HoodieTable implements Seri protected final HoodieWriteConfig config; - protected final HoodieTableMetadata metadata; + protected final HoodieTableMetaClient metaClient; - protected HoodieTable(String commitTime, HoodieWriteConfig config, HoodieTableMetadata metadata) { + protected HoodieTable(String commitTime, 
HoodieWriteConfig config, + HoodieTableMetaClient metaClient) { this.commitTime = commitTime; this.config = config; - this.metadata = metadata; + this.metaClient = metaClient; } /** @@ -81,8 +82,7 @@ public abstract class HoodieTable implements Seri * @param partitioner */ public abstract Iterator> handleUpsertPartition(Integer partition, - Iterator> recordIterator, - Partitioner partitioner); + Iterator> recordIterator, Partitioner partitioner); /** * Perform the ultimate IO for a given inserted (RDD) partition @@ -96,14 +96,13 @@ public abstract class HoodieTable implements Seri Partitioner partitioner); - public static HoodieTable getHoodieTable(HoodieTableType type, + public static HoodieTable getHoodieTable(HoodieTableMetaClient metaClient, String commitTime, - HoodieWriteConfig config, - HoodieTableMetadata metadata) { - if (type == HoodieTableType.COPY_ON_WRITE) { - return new HoodieCopyOnWriteTable(commitTime, config, metadata); + HoodieWriteConfig config) { + if (metaClient.getTableType() == HoodieTableType.COPY_ON_WRITE) { + return new HoodieCopyOnWriteTable(commitTime, config, metaClient); } else { - throw new HoodieException("Unsupported table type :"+ type); + throw new HoodieException("Unsupported table type :"+ metaClient.getTableType()); } } } diff --git a/hoodie-client/src/test/java/com/uber/hoodie/TestHoodieClient.java b/hoodie-client/src/test/java/com/uber/hoodie/TestHoodieClient.java index 8981309e2..43313c496 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/TestHoodieClient.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/TestHoodieClient.java @@ -21,12 +21,15 @@ import com.google.common.collect.Iterables; import com.uber.hoodie.common.HoodieClientTestUtils; import com.uber.hoodie.common.HoodieTestDataGenerator; import com.uber.hoodie.common.model.HoodieCommitMetadata; -import com.uber.hoodie.common.model.HoodieCommits; +import com.uber.hoodie.common.model.HoodieDataFile; import com.uber.hoodie.common.model.HoodieKey; import 
com.uber.hoodie.common.model.HoodieRecord; -import com.uber.hoodie.common.model.HoodieTableMetadata; import com.uber.hoodie.common.model.HoodieTestUtils; import com.uber.hoodie.common.model.HoodieWriteStat; +import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.table.HoodieTimeline; +import com.uber.hoodie.common.table.TableFileSystemView; +import com.uber.hoodie.common.table.view.ReadOptimizedTableView; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.common.util.ParquetUtils; import com.uber.hoodie.config.HoodieWriteConfig; @@ -59,9 +62,11 @@ import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.Set; import java.util.SortedMap; import java.util.TreeSet; +import java.util.stream.Collectors; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; @@ -87,7 +92,7 @@ public class TestHoodieClient implements Serializable { TemporaryFolder folder = new TemporaryFolder(); folder.create(); basePath = folder.getRoot().getAbsolutePath(); - HoodieTestUtils.initializeHoodieDirectory(basePath); + HoodieTestUtils.init(basePath); dataGen = new HoodieTestDataGenerator(); } @@ -200,7 +205,7 @@ public class TestHoodieClient implements Serializable { assertEquals("Latest commit should be 001",readClient.latestCommit(), newCommitTime); assertEquals("Must contain 200 records", readClient.readCommit(newCommitTime).count(), records.size()); // Should have 100 records in table (check using Index), all in locations marked at commit - List taggedRecords = index.tagLocation(jsc.parallelize(records, 1), new HoodieTableMetadata(fs, basePath)).collect(); + List taggedRecords = index.tagLocation(jsc.parallelize(records, 1), new HoodieTableMetaClient(fs, basePath)).collect(); checkTaggedRecords(taggedRecords, "001"); /** @@ -226,7 +231,7 @@ public class TestHoodieClient implements Serializable { assertEquals("Latest 
commit should be 004",readClient.latestCommit(), newCommitTime); // Index should be able to locate all updates in correct locations. - taggedRecords = index.tagLocation(jsc.parallelize(dedupedRecords, 1), new HoodieTableMetadata(fs, basePath)).collect(); + taggedRecords = index.tagLocation(jsc.parallelize(dedupedRecords, 1), new HoodieTableMetaClient(fs, basePath)).collect(); checkTaggedRecords(taggedRecords, "004"); // Check the entire dataset has 100 records still @@ -273,7 +278,7 @@ public class TestHoodieClient implements Serializable { assertEquals("Expecting a single commit.", new HoodieReadClient(jsc, basePath).listCommitsSince("000").size(), 1); // Should have 100 records in table (check using Index), all in locations marked at commit - List taggedRecords = index.tagLocation(jsc.parallelize(records, 1), new HoodieTableMetadata(fs, basePath)).collect(); + List taggedRecords = index.tagLocation(jsc.parallelize(records, 1), new HoodieTableMetaClient(fs, basePath)).collect(); checkTaggedRecords(taggedRecords, newCommitTime); // Keep doing some writes and clean inline. Make sure we have expected number of files remaining. 
@@ -287,34 +292,39 @@ public class TestHoodieClient implements Serializable { // Verify there are no errors assertNoWriteErrors(statuses); - HoodieTableMetadata metadata = new HoodieTableMetadata(fs, basePath); - SortedMap commitMetadata = metadata.getAllCommitMetadata(); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); + HoodieTimeline timeline = metadata.getActiveCommitTimeline(); + TableFileSystemView fsView = new ReadOptimizedTableView(fs, metadata); // Need to ensure the following for (String partitionPath : dataGen.getPartitionPaths()) { // compute all the versions of all files, from time 0 HashMap> fileIdToVersions = new HashMap<>(); - for (Map.Entry entry : commitMetadata.entrySet()) { - for (HoodieWriteStat wstat : entry.getValue().getWriteStats(partitionPath)) { + for (String entry : timeline.getInstants().collect(Collectors.toList())) { + HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(timeline.readInstantDetails(entry).get()); + + for (HoodieWriteStat wstat : commitMetadata.getWriteStats(partitionPath)) { if (!fileIdToVersions.containsKey(wstat.getFileId())) { - fileIdToVersions.put(wstat.getFileId(), new TreeSet()); + fileIdToVersions.put(wstat.getFileId(), new TreeSet<>()); } - fileIdToVersions.get(wstat.getFileId()).add(entry.getKey()); + fileIdToVersions.get(wstat.getFileId()).add(FSUtils.getCommitTime(new Path(wstat.getFullPath()).getName())); } } - Map> fileVersions = metadata.getAllVersionsInPartition(fs, partitionPath); - for (Map.Entry> entry : fileVersions.entrySet()) { - List versions = entry.getValue(); + + List> fileVersions = fsView.streamEveryVersionInPartition(partitionPath).collect(Collectors.toList()); + for (List entry : fileVersions) { // No file has no more than max versions - assertTrue("fileId " + entry.getKey() + " has more than " + maxVersions + " versions", - versions.size() <= maxVersions); + String fileId = entry.iterator().next().getFileId(); + + assertTrue("fileId " + fileId + 
" has more than " + maxVersions + " versions", + entry.size() <= maxVersions); // Each file, has the latest N versions (i.e cleaning gets rid of older versions) - List commitedVersions = new ArrayList<>(fileIdToVersions.get(entry.getKey())); - for (int i = 0; i < versions.size(); i++) { - assertEquals("File " + entry.getKey() + " does not have latest versions" + versions + " on commits" + commitedVersions, - FSUtils.getCommitTime(Iterables.get(versions, i).getPath().getName()), + List commitedVersions = new ArrayList<>(fileIdToVersions.get(fileId)); + for (int i = 0; i < entry.size(); i++) { + assertEquals("File " + fileId + " does not have latest versions on commits" + commitedVersions, + Iterables.get(entry, i).getCommitTime(), commitedVersions.get(commitedVersions.size() - 1 - i)); } } @@ -349,7 +359,7 @@ public class TestHoodieClient implements Serializable { // verify that there is a commit assertEquals("Expecting a single commit.", new HoodieReadClient(jsc, basePath).listCommitsSince("000").size(), 1); // Should have 100 records in table (check using Index), all in locations marked at commit - List taggedRecords = index.tagLocation(jsc.parallelize(records, 1), new HoodieTableMetadata(fs, basePath)).collect(); + List taggedRecords = index.tagLocation(jsc.parallelize(records, 1), new HoodieTableMetaClient(fs, basePath)).collect(); checkTaggedRecords(taggedRecords, newCommitTime); // Keep doing some writes and clean inline. Make sure we have expected number of files remaining. 
@@ -362,23 +372,29 @@ public class TestHoodieClient implements Serializable { // Verify there are no errors assertNoWriteErrors(statuses); - HoodieTableMetadata metadata = new HoodieTableMetadata(fs, basePath); - HoodieCommits commits = metadata.getAllCommits(); - String earliestRetainedCommit = commits.lastCommit(maxCommits - 1); - Set acceptableCommits = new HashSet<>(commits.getCommitList()); - if (earliestRetainedCommit != null) { - acceptableCommits.removeAll(commits.findCommitsInRange("000", earliestRetainedCommit)); - acceptableCommits.add(earliestRetainedCommit); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); + HoodieTimeline activeTimeline = metadata.getActiveCommitTimeline(); + Optional earliestRetainedCommit = activeTimeline.nthFromLastInstant(maxCommits - 1); + Set acceptableCommits = + activeTimeline.getInstants().collect(Collectors.toSet()); + if (earliestRetainedCommit.isPresent()) { + acceptableCommits.removeAll( + activeTimeline.findInstantsInRange("000", earliestRetainedCommit.get()) + .collect(Collectors.toSet())); + acceptableCommits.add(earliestRetainedCommit.get()); } + TableFileSystemView fsView = new ReadOptimizedTableView(fs, metadata); // Need to ensure the following for (String partitionPath : dataGen.getPartitionPaths()) { - Map> fileVersions = metadata.getAllVersionsInPartition(fs, partitionPath); - for (Map.Entry> entry : fileVersions.entrySet()) { - Set commitTimes = new HashSet<>(entry.getValue().size()); - for(FileStatus value:entry.getValue()) { - commitTimes.add(FSUtils.getCommitTime(value.getPath().getName())); + List> fileVersions = fsView.streamEveryVersionInPartition(partitionPath).collect(Collectors.toList()); + for (List entry : fileVersions) { + Set commitTimes = new HashSet<>(); + for(HoodieDataFile value:entry) { + System.out.println("Data File - " + value); + commitTimes.add(value.getCommitTime()); } + System.out.println("Existing commits " + 
activeTimeline.getInstants().collect(Collectors.toList())); assertEquals("Only contain acceptable versions of file should be present", acceptableCommits, commitTimes); } @@ -620,13 +636,16 @@ public class TestHoodieClient implements Serializable { assertNoWriteErrors(statuses); assertEquals("2 files needs to be committed.", 2, statuses.size()); - HoodieTableMetadata metadata = new HoodieTableMetadata(fs, basePath); - FileStatus[] files = metadata.getLatestVersionInPartition(fs, TEST_PARTITION_PATH, commitTime3); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); + HoodieTimeline activeTimeline = metadata.getActiveCommitTimeline(); + TableFileSystemView fileSystemView = new ReadOptimizedTableView(fs, metadata); + List files = fileSystemView.streamLatestVersionInPartition(TEST_PARTITION_PATH, commitTime3).collect( + Collectors.toList()); int numTotalInsertsInCommit3 = 0; - for (FileStatus file: files) { - if (file.getPath().getName().contains(file1)) { - assertEquals("Existing file should be expanded", commitTime3, FSUtils.getCommitTime(file.getPath().getName())); - records = ParquetUtils.readAvroRecords(file.getPath()); + for (HoodieDataFile file: files) { + if (file.getFileName().contains(file1)) { + assertEquals("Existing file should be expanded", commitTime3, file.getCommitTime()); + records = ParquetUtils.readAvroRecords(new Path(file.getPath())); for (GenericRecord record: records) { String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); String recordCommitTime = record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(); @@ -641,8 +660,8 @@ public class TestHoodieClient implements Serializable { } assertEquals("All keys added in commit 2 must be updated in commit3 correctly", 0, keys2.size()); } else { - assertEquals("New file must be written for commit 3", commitTime3, FSUtils.getCommitTime(file.getPath().getName())); - records = ParquetUtils.readAvroRecords(file.getPath()); + assertEquals("New file must 
be written for commit 3", commitTime3, file.getCommitTime()); + records = ParquetUtils.readAvroRecords(new Path(file.getPath())); for (GenericRecord record: records) { String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); assertEquals("only expect commit3", commitTime3, record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString()); diff --git a/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieClientTestUtils.java b/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieClientTestUtils.java index 63095f592..a2f9eb49b 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieClientTestUtils.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieClientTestUtils.java @@ -18,7 +18,7 @@ package com.uber.hoodie.common; import com.uber.hoodie.WriteStatus; import com.uber.hoodie.common.model.HoodieRecord; -import com.uber.hoodie.common.model.HoodieTableMetadata; +import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.util.FSUtils; import java.io.File; @@ -53,18 +53,18 @@ public class HoodieClientTestUtils { } private static void fakeMetaFile(String basePath, String commitTime, String suffix) throws IOException { - String parentPath = basePath + "/"+ HoodieTableMetadata.METAFOLDER_NAME; + String parentPath = basePath + "/"+ HoodieTableMetaClient.METAFOLDER_NAME; new File(parentPath).mkdirs(); new File(parentPath + "/" + commitTime + suffix).createNewFile(); } public static void fakeCommitFile(String basePath, String commitTime) throws IOException { - fakeMetaFile(basePath, commitTime, HoodieTableMetadata.COMMIT_FILE_SUFFIX); + fakeMetaFile(basePath, commitTime, HoodieTableMetaClient.COMMIT_EXTENSION); } public static void fakeInFlightFile(String basePath, String commitTime) throws IOException { - fakeMetaFile(basePath, commitTime, HoodieTableMetadata.INFLIGHT_FILE_SUFFIX); + fakeMetaFile(basePath, commitTime, HoodieTableMetaClient.INFLIGHT_FILE_SUFFIX); } public static void 
fakeDataFile(String basePath, String partitionPath, String commitTime, String fileId) throws Exception { diff --git a/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieTestDataGenerator.java b/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieTestDataGenerator.java index 530d798d9..b034b7ef9 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieTestDataGenerator.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieTestDataGenerator.java @@ -19,7 +19,7 @@ package com.uber.hoodie.common; import com.uber.hoodie.common.model.HoodieCommitMetadata; import com.uber.hoodie.common.model.HoodieKey; import com.uber.hoodie.common.model.HoodieRecord; -import com.uber.hoodie.common.model.HoodieTableMetadata; +import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.common.util.HoodieAvroUtils; @@ -29,8 +29,6 @@ import org.apache.avro.generic.GenericRecord; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; import java.io.IOException; import java.nio.charset.StandardCharsets; @@ -144,7 +142,7 @@ public class HoodieTestDataGenerator { public static void createCommitFile(String basePath, String commitTime) throws IOException { Path commitFile = - new Path(basePath + "/" + HoodieTableMetadata.METAFOLDER_NAME + "/" + FSUtils.makeCommitFileName(commitTime)); + new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTableMetaClient.makeCommitFileName(commitTime)); FileSystem fs = FSUtils.getFs(); FSDataOutputStream os = fs.create(commitFile, true); HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata(); diff --git a/hoodie-client/src/test/java/com/uber/hoodie/func/TestUpdateMapFunction.java b/hoodie-client/src/test/java/com/uber/hoodie/func/TestUpdateMapFunction.java index a954759c5..ec6c2329f 100644 --- 
a/hoodie-client/src/test/java/com/uber/hoodie/func/TestUpdateMapFunction.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/func/TestUpdateMapFunction.java @@ -16,13 +16,13 @@ package com.uber.hoodie.func; +import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.WriteStatus; import com.uber.hoodie.common.TestRawTripPayload; import com.uber.hoodie.common.model.HoodieKey; import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecordLocation; -import com.uber.hoodie.common.model.HoodieTableMetadata; import com.uber.hoodie.common.model.HoodieTestUtils; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.table.HoodieCopyOnWriteTable; @@ -48,14 +48,14 @@ public class TestUpdateMapFunction { TemporaryFolder folder = new TemporaryFolder(); folder.create(); this.basePath = folder.getRoot().getAbsolutePath(); - HoodieTestUtils.initializeHoodieDirectory(basePath); + HoodieTestUtils.init(basePath); } @Test public void testSchemaEvolutionOnUpdate() throws Exception { // Create a bunch of records with a old version of schema HoodieWriteConfig config = makeHoodieClientConfig("/exampleSchema.txt"); - HoodieTableMetadata metadata = new HoodieTableMetadata(FSUtils.getFs(), basePath); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath); HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable("100", config, metadata); String recordStr1 = @@ -79,13 +79,13 @@ public class TestUpdateMapFunction { rowChange3)); Iterator> insertResult = table.handleInsert(records.iterator()); Path commitFile = - new Path(config.getBasePath() + "/.hoodie/" + FSUtils.makeCommitFileName("100")); + new Path(config.getBasePath() + "/.hoodie/" + HoodieTableMetaClient.makeCommitFileName("100")); FSUtils.getFs().create(commitFile); // Now try an update with an evolved schema // Evolved schema does not have guarantee on preserving the original field 
ordering config = makeHoodieClientConfig("/exampleEvolvedSchema.txt"); - metadata = new HoodieTableMetadata(FSUtils.getFs(), basePath); + metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath); String fileId = insertResult.next().get(0).getFileId(); System.out.println(fileId); diff --git a/hoodie-client/src/test/java/com/uber/hoodie/index/TestHoodieBloomIndex.java b/hoodie-client/src/test/java/com/uber/hoodie/index/TestHoodieBloomIndex.java index 24091fa71..faa866ec6 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/index/TestHoodieBloomIndex.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/index/TestHoodieBloomIndex.java @@ -19,13 +19,13 @@ package com.uber.hoodie.index; import com.google.common.base.Optional; import com.google.common.collect.Lists; +import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.avro.HoodieAvroWriteSupport; import com.uber.hoodie.common.BloomFilter; import com.uber.hoodie.common.TestRawTripPayload; import com.uber.hoodie.common.model.HoodieKey; import com.uber.hoodie.common.model.HoodieRecord; -import com.uber.hoodie.common.model.HoodieTableMetadata; import com.uber.hoodie.common.model.HoodieTestUtils; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.common.util.HoodieAvroUtils; @@ -74,7 +74,7 @@ public class TestHoodieBloomIndex { TemporaryFolder folder = new TemporaryFolder(); folder.create(); basePath = folder.getRoot().getAbsolutePath(); - HoodieTestUtils.initializeHoodieDirectory(basePath); + HoodieTestUtils.init(basePath); } @Test @@ -126,7 +126,7 @@ public class TestHoodieBloomIndex { new File(basePath + "/2015/03/12/3_0_20150312101010.parquet").createNewFile(); new File(basePath + "/2015/03/12/4_0_20150312101010.parquet").createNewFile(); List partitions = Arrays.asList("2016/01/21", "2016/04/01", "2015/03/12"); - HoodieTableMetadata metadata = new HoodieTableMetadata(fs, basePath, "testTable"); + 
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); JavaPairRDD rdd = index.loadInvolvedFiles(partitions, metadata); // Still 0, as no valid commit assertEquals(rdd.count(), 0); @@ -135,7 +135,7 @@ public class TestHoodieBloomIndex { new File(basePath + "/.hoodie").mkdirs(); new File(basePath + "/.hoodie/20160401010101.commit").createNewFile(); new File(basePath + "/.hoodie/20150312101010.commit").createNewFile(); - metadata = new HoodieTableMetadata(fs, basePath, "testTable"); + metadata = new HoodieTableMetaClient(fs, basePath); rdd = index.loadInvolvedFiles(partitions, metadata); final List> filesList = rdd.collect(); assertEquals(filesList.size(), 4); @@ -212,7 +212,7 @@ public class TestHoodieBloomIndex { // We have some records to be tagged (two different partitions) JavaRDD recordRDD = jsc.emptyRDD(); // Also create the metadata and config - HoodieTableMetadata metadata = new HoodieTableMetadata(fs, basePath, "testTable"); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); // Let's tag @@ -248,7 +248,7 @@ public class TestHoodieBloomIndex { JavaRDD recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4)); // Also create the metadata and config - HoodieTableMetadata metadata = new HoodieTableMetadata(fs, basePath, "testTable"); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); // Let's tag @@ -266,7 +266,7 @@ public class TestHoodieBloomIndex { String filename3 = writeParquetFile("2015/01/31", Arrays.asList(record4), schema, null, true); // We do the tag again - metadata = new HoodieTableMetadata(fs, basePath, "testTable"); + metadata = new HoodieTableMetaClient(fs, basePath); taggedRecordRDD = bloomIndex.tagLocation(recordRDD, metadata); // Check results @@ -309,7 +309,7 @@ public class 
TestHoodieBloomIndex { JavaRDD keysRDD = jsc.parallelize(Arrays.asList(key1, key2, key3, key4)); // Also create the metadata and config - HoodieTableMetadata metadata = new HoodieTableMetadata(fs, basePath, "testTable"); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); // Let's tag @@ -327,7 +327,7 @@ public class TestHoodieBloomIndex { String filename3 = writeParquetFile("2015/01/31", Arrays.asList(record4), schema, null, true); // We do the tag again - metadata = new HoodieTableMetadata(fs, basePath, "testTable"); + metadata = new HoodieTableMetaClient(fs, basePath); taggedRecordRDD = bloomIndex.fetchRecordLocation(keysRDD, metadata); // Check results @@ -375,7 +375,7 @@ public class TestHoodieBloomIndex { // We do the tag JavaRDD recordRDD = jsc.parallelize(Arrays.asList(record1, record2)); - HoodieTableMetadata metadata = new HoodieTableMetadata(fs, basePath, "testTable"); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, jsc); JavaRDD taggedRecordRDD = bloomIndex.tagLocation(recordRDD, metadata); @@ -421,8 +421,8 @@ public class TestHoodieBloomIndex { if (createCommitTime) { // Also make sure the commit is valid - new File(basePath + "/" + HoodieTableMetadata.METAFOLDER_NAME).mkdirs(); - new File(basePath + "/" + HoodieTableMetadata.METAFOLDER_NAME + "/" + commitTime + ".commit").createNewFile(); + new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME).mkdirs(); + new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitTime + ".commit").createNewFile(); } return filename; } diff --git a/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCleaner.java b/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCleaner.java index 
4b5e6b629..b7d453ba4 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCleaner.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCleaner.java @@ -16,8 +16,8 @@ package com.uber.hoodie.io; +import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.config.HoodieWriteConfig; -import com.uber.hoodie.common.model.HoodieTableMetadata; import com.uber.hoodie.common.model.HoodieTestUtils; import com.uber.hoodie.common.util.FSUtils; @@ -34,10 +34,12 @@ public class TestHoodieCleaner { private String basePath = null; private String[] partitionPaths = {"2016/01/01", "2016/02/02"}; + private HoodieTableMetaClient metaClient; @Before public void init() throws Exception { - this.basePath = HoodieTestUtils.initializeTempHoodieBasePath(); + this.metaClient = HoodieTestUtils.initOnTemp(); + this.basePath = metaClient.getBasePath(); } @Test @@ -53,7 +55,7 @@ public class TestHoodieCleaner { String file1P0C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "000"); String file1P1C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "000"); - HoodieTableMetadata metadata = new HoodieTableMetadata(FSUtils.getFs(), basePath, "testTable"); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath); HoodieCleaner cleaner = new HoodieCleaner(metadata, config, FSUtils.getFs()); assertEquals("Must not clean any files" , 0, cleaner.clean(partitionPaths[0])); assertEquals("Must not clean any files" , 0, cleaner.clean(partitionPaths[1])); @@ -67,7 +69,7 @@ public class TestHoodieCleaner { String file2P1C1 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "001"); // insert HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "001", file1P0C0); // update HoodieTestUtils.createDataFile(basePath, partitionPaths[1], "001", file1P1C0); // update - metadata = new HoodieTableMetadata(FSUtils.getFs(), basePath, "testTable"); + metadata = new 
HoodieTableMetaClient(FSUtils.getFs(), basePath); cleaner = new HoodieCleaner(metadata, config, FSUtils.getFs()); assertEquals("Must clean 1 file" , 1, cleaner.clean(partitionPaths[0])); assertEquals("Must clean 1 file" , 1, cleaner.clean(partitionPaths[1])); @@ -82,7 +84,7 @@ public class TestHoodieCleaner { HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file1P0C0); // update HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file2P0C1); // update String file3P0C2 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "002"); - metadata = new HoodieTableMetadata(FSUtils.getFs(), basePath, "testTable"); + metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath); cleaner = new HoodieCleaner(metadata, config, FSUtils.getFs()); assertEquals("Must clean two files" , 2, cleaner.clean(partitionPaths[0])); assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file1P0C0)); @@ -110,7 +112,7 @@ public class TestHoodieCleaner { String file1P0C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "000"); String file1P1C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "000"); - HoodieTableMetadata metadata = new HoodieTableMetadata(FSUtils.getFs(), basePath, "testTable"); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath); HoodieCleaner cleaner = new HoodieCleaner(metadata, config, FSUtils.getFs()); assertEquals("Must not clean any files" , 0, cleaner.clean(partitionPaths[0])); assertEquals("Must not clean any files" , 0, cleaner.clean(partitionPaths[1])); @@ -124,7 +126,7 @@ public class TestHoodieCleaner { String file2P1C1 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "001"); // insert HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "001", file1P0C0); // update HoodieTestUtils.createDataFile(basePath, partitionPaths[1], "001", file1P1C0); // update - metadata = new 
HoodieTableMetadata(FSUtils.getFs(), basePath, "testTable"); + metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath); cleaner = new HoodieCleaner(metadata, config, FSUtils.getFs()); assertEquals("Must not clean any files" , 0, cleaner.clean(partitionPaths[0])); assertEquals("Must not clean any files" , 0, cleaner.clean(partitionPaths[1])); @@ -139,7 +141,7 @@ public class TestHoodieCleaner { HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file1P0C0); // update HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file2P0C1); // update String file3P0C2 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "002"); - metadata = new HoodieTableMetadata(FSUtils.getFs(), basePath, "testTable"); + metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath); cleaner = new HoodieCleaner(metadata, config, FSUtils.getFs()); assertEquals( "Must not clean any file. We have to keep 1 version before the latest commit time to keep", @@ -153,7 +155,7 @@ public class TestHoodieCleaner { HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "003", file1P0C0); // update HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "003", file2P0C1); // update String file4P0C3 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "003"); - metadata = new HoodieTableMetadata(FSUtils.getFs(), basePath, "testTable"); + metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath); cleaner = new HoodieCleaner(metadata, config, FSUtils.getFs()); assertEquals( "Must not clean one old file", 1, cleaner.clean(partitionPaths[0])); diff --git a/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCommitArchiveLog.java b/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCommitArchiveLog.java index d9f785eda..171103742 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCommitArchiveLog.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCommitArchiveLog.java @@ -17,10 +17,11 @@ 
package com.uber.hoodie.io; import com.google.common.collect.Lists; +import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.common.HoodieTestDataGenerator; import com.uber.hoodie.common.model.HoodieCommitMetadata; -import com.uber.hoodie.common.model.HoodieTableMetadata; import com.uber.hoodie.common.model.HoodieTestUtils; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.config.HoodieCompactionConfig; @@ -32,8 +33,11 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; import java.io.IOException; +import java.util.ArrayList; +import java.util.List; import java.util.SortedMap; import java.util.TreeMap; +import java.util.stream.Collectors; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; @@ -47,7 +51,7 @@ public class TestHoodieCommitArchiveLog { TemporaryFolder folder = new TemporaryFolder(); folder.create(); basePath = folder.getRoot().getAbsolutePath(); - HoodieTestUtils.initializeHoodieDirectory(basePath); + HoodieTestUtils.init(basePath); fs = FSUtils.getFs(); } @@ -56,7 +60,7 @@ public class TestHoodieCommitArchiveLog { HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) .forTable("test-trip-table").build(); - HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg); + HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg, fs); boolean result = archiveLog.archiveIfRequired(); assertTrue(result); } @@ -67,20 +71,22 @@ public class TestHoodieCommitArchiveLog { .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) .forTable("test-trip-table").withCompactionConfig( HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 5).build()).build(); - HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg); + 
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); + HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg, fs); HoodieTestDataGenerator.createCommitFile(basePath, "100"); HoodieTestDataGenerator.createCommitFile(basePath, "101"); HoodieTestDataGenerator.createCommitFile(basePath, "102"); HoodieTestDataGenerator.createCommitFile(basePath, "103"); - HoodieTableMetadata metadata = new HoodieTableMetadata(fs, basePath); + HoodieTimeline timeline = metadata.getActiveCommitTimeline(); + assertEquals("Loaded 4 commits and the count should match", 4, - metadata.getAllCommits().getCommitList().size()); + timeline.getTotalInstants()); boolean result = archiveLog.archiveIfRequired(); assertTrue(result); - metadata = new HoodieTableMetadata(fs, basePath); + timeline = timeline.reload(); assertEquals("Should not archive commits when maxCommitsToKeep is 5", 4, - metadata.getAllCommits().getCommitList().size()); + timeline.getTotalInstants()); } @Test @@ -89,7 +95,8 @@ public class TestHoodieCommitArchiveLog { .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) .forTable("test-trip-table").withCompactionConfig( HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 5).build()).build(); - HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); + HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg, fs); HoodieTestDataGenerator.createCommitFile(basePath, "100"); HoodieTestDataGenerator.createCommitFile(basePath, "101"); HoodieTestDataGenerator.createCommitFile(basePath, "102"); @@ -97,24 +104,22 @@ public class TestHoodieCommitArchiveLog { HoodieTestDataGenerator.createCommitFile(basePath, "104"); HoodieTestDataGenerator.createCommitFile(basePath, "105"); - HoodieTableMetadata metadata = new HoodieTableMetadata(fs, basePath); - SortedMap originalCommits = new TreeMap<>(metadata.getAllCommitMetadata()); + 
HoodieTimeline timeline = metadata.getActiveCommitTimeline(); + List originalCommits = timeline.getInstants().collect( + Collectors.toList()); - assertEquals("Loaded 6 commits and the count should match", 6, - metadata.getAllCommits().getCommitList().size()); + assertEquals("Loaded 6 commits and the count should match", 6, timeline.getTotalInstants()); boolean result = archiveLog.archiveIfRequired(); assertTrue(result); - metadata = new HoodieTableMetadata(fs, basePath); + timeline = timeline.reload(); assertEquals( "Should archive commits when maxCommitsToKeep is 5 and now the commits length should be minCommitsToKeep which is 2", - 2, metadata.getAllCommits().getCommitList().size()); + 2, timeline.getTotalInstants()); assertEquals("Archive should not archive the last 2 commits", - Lists.newArrayList("104", "105"), metadata.getAllCommits().getCommitList()); + Lists.newArrayList("104", "105"), timeline.getInstants().collect(Collectors.toList())); // Remove all the commits from the original commits, make it ready to be checked against the read map - for(String key:metadata.getAllCommitMetadata().keySet()) { - originalCommits.remove(key); - } + timeline.getInstants().forEach(originalCommits::remove); // Read back the commits to make sure SequenceFile.Reader reader = new SequenceFile.Reader(fs.getConf(), @@ -129,7 +134,7 @@ public class TestHoodieCommitArchiveLog { assertEquals( "Read commits map should match the originalCommits - commitsLoadedAfterArchival", - originalCommits, readCommits); + originalCommits, new ArrayList<>(readCommits.keySet())); reader.close(); } @@ -139,7 +144,8 @@ public class TestHoodieCommitArchiveLog { .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) .forTable("test-trip-table").withCompactionConfig( HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 5).build()).build(); - HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, 
basePath); + HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg, fs); HoodieTestDataGenerator.createCommitFile(basePath, "100"); HoodieTestDataGenerator.createCommitFile(basePath, "101"); HoodieTestDataGenerator.createCommitFile(basePath, "102"); @@ -147,16 +153,15 @@ public class TestHoodieCommitArchiveLog { HoodieTestDataGenerator.createCommitFile(basePath, "104"); HoodieTestDataGenerator.createCommitFile(basePath, "105"); - HoodieTableMetadata metadata = new HoodieTableMetadata(fs, basePath); - assertEquals("Loaded 6 commits and the count should match", 6, - metadata.getAllCommits().getCommitList().size()); + HoodieTimeline timeline = metadata.getActiveCommitTimeline(); + assertEquals("Loaded 6 commits and the count should match", 6, timeline.getTotalInstants()); boolean result = archiveLog.archiveIfRequired(); assertTrue(result); - metadata = new HoodieTableMetadata(fs, basePath); - assertTrue("Archived commits should always be safe", metadata.isCommitTsSafe("100")); - assertTrue("Archived commits should always be safe", metadata.isCommitTsSafe("101")); - assertTrue("Archived commits should always be safe", metadata.isCommitTsSafe("102")); - assertTrue("Archived commits should always be safe", metadata.isCommitTsSafe("103")); + timeline = timeline.reload(); + assertTrue("Archived commits should always be safe", timeline.containsOrBeforeTimelineStarts("100")); + assertTrue("Archived commits should always be safe", timeline.containsOrBeforeTimelineStarts("101")); + assertTrue("Archived commits should always be safe", timeline.containsOrBeforeTimelineStarts("102")); + assertTrue("Archived commits should always be safe", timeline.containsOrBeforeTimelineStarts("103")); } diff --git a/hoodie-client/src/test/java/com/uber/hoodie/table/TestCopyOnWriteTable.java b/hoodie-client/src/test/java/com/uber/hoodie/table/TestCopyOnWriteTable.java index 1b389c875..be035be62 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/table/TestCopyOnWriteTable.java 
+++ b/hoodie-client/src/test/java/com/uber/hoodie/table/TestCopyOnWriteTable.java @@ -16,17 +16,17 @@ package com.uber.hoodie.table; +import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.WriteStatus; import com.uber.hoodie.common.BloomFilter; import com.uber.hoodie.common.HoodieClientTestUtils; import com.uber.hoodie.common.HoodieTestDataGenerator; import com.uber.hoodie.common.TestRawTripPayload; -import com.uber.hoodie.common.model.HoodieCommits; import com.uber.hoodie.common.model.HoodieKey; import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecordLocation; -import com.uber.hoodie.common.model.HoodieTableMetadata; import com.uber.hoodie.common.model.HoodieTestUtils; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.common.util.ParquetUtils; @@ -76,7 +76,7 @@ public class TestCopyOnWriteTable { TemporaryFolder folder = new TemporaryFolder(); folder.create(); this.basePath = folder.getRoot().getAbsolutePath(); - HoodieTestUtils.initializeHoodieDirectory(basePath); + HoodieTestUtils.init(basePath); } @Test @@ -87,9 +87,10 @@ public class TestCopyOnWriteTable { HoodieRecord record = mock(HoodieRecord.class); when(record.getPartitionPath()).thenReturn(partitionPath); - String commitTime = HoodieTestUtils.getNewCommitTime(); + String commitTime = HoodieTestUtils.makeNewCommitTime(); HoodieWriteConfig config = makeHoodieClientConfig(); - HoodieInsertHandle io = new HoodieInsertHandle(config, commitTime, null, partitionPath); + HoodieInsertHandle io = new HoodieInsertHandle(config, commitTime, + new HoodieTableMetaClient(FSUtils.getFs(), basePath), partitionPath); Path newPath = io.makeNewPath(record.getPartitionPath(), unitNumber, fileName); assertTrue(newPath.toString().equals(this.basePath + "/" + partitionPath + "/" + FSUtils .makeDataFileName(commitTime, unitNumber, fileName))); 
@@ -110,8 +111,8 @@ public class TestCopyOnWriteTable { public void testUpdateRecords() throws Exception { // Prepare the AvroParquetIO HoodieWriteConfig config = makeHoodieClientConfig(); - String firstCommitTime = HoodieTestUtils.getNewCommitTime(); - HoodieTableMetadata metadata = new HoodieTableMetadata(FSUtils.getFs(), basePath, "testTable"); + String firstCommitTime = HoodieTestUtils.makeNewCommitTime(); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath); String partitionPath = "/2016/01/31"; HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(firstCommitTime, config, metadata); @@ -148,7 +149,7 @@ public class TestCopyOnWriteTable { assertTrue(filter.mightContain(record.getRecordKey())); } // Create a commit file - new File(this.basePath + "/" + HoodieTableMetadata.METAFOLDER_NAME + "/" + new File(this.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + FSUtils.getCommitTime(parquetFile.getName()) + ".commit").createNewFile(); // Read the parquet file, check the record content @@ -172,8 +173,8 @@ public class TestCopyOnWriteTable { List updatedRecords = Arrays.asList(updatedRecord1, insertedRecord1); Thread.sleep(1000); - String newCommitTime = HoodieTestUtils.getNewCommitTime(); - metadata = new HoodieTableMetadata(FSUtils.getFs(), basePath, "testTable"); + String newCommitTime = HoodieTestUtils.makeNewCommitTime(); + metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath); table = new HoodieCopyOnWriteTable(newCommitTime, config, metadata); Iterator> iter = table.handleUpdate(updatedRecord1.getCurrentLocation().getFileId(), updatedRecords.iterator()); @@ -181,9 +182,11 @@ public class TestCopyOnWriteTable { File updatedParquetFile = null; for (File file : new File(basePath + "/2016/01/31").listFiles()) { if (file.getName().endsWith(".parquet")) { - if (FSUtils.getFileId(file.getName()).equals(FSUtils.getFileId(parquetFile.getName())) - && HoodieCommits - 
.isCommit1After(FSUtils.getCommitTime(file.getName()), FSUtils.getCommitTime(parquetFile.getName()))) { + if (FSUtils.getFileId(file.getName()) + .equals(FSUtils.getFileId(parquetFile.getName())) && metadata + .getActiveCommitTimeline() + .compareInstants(FSUtils.getCommitTime(file.getName()), + FSUtils.getCommitTime(parquetFile.getName()), HoodieTimeline.GREATER)) { updatedParquetFile = file; break; } @@ -236,9 +239,9 @@ public class TestCopyOnWriteTable { @Test public void testInsertWithPartialFailures() throws Exception { HoodieWriteConfig config = makeHoodieClientConfig(); - String commitTime = HoodieTestUtils.getNewCommitTime(); + String commitTime = HoodieTestUtils.makeNewCommitTime(); FileSystem fs = FSUtils.getFs(); - HoodieTableMetadata metadata = new HoodieTableMetadata(fs, basePath, "testTable"); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(commitTime, config, metadata); // Write a few records, and get atleast one file @@ -275,8 +278,8 @@ public class TestCopyOnWriteTable { @Test public void testInsertRecords() throws Exception { HoodieWriteConfig config = makeHoodieClientConfig(); - String commitTime = HoodieTestUtils.getNewCommitTime(); - HoodieTableMetadata metadata = new HoodieTableMetadata(FSUtils.getFs(), basePath, "testTable"); + String commitTime = HoodieTestUtils.makeNewCommitTime(); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath); HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(commitTime, config, metadata); // Case 1: @@ -322,8 +325,8 @@ public class TestCopyOnWriteTable { HoodieWriteConfig config = makeHoodieClientConfigBuilder().withStorageConfig( HoodieStorageConfig.newBuilder().limitFileSize(64 * 1024).parquetBlockSize(64 * 1024) .parquetPageSize(64 * 1024).build()).build(); - String commitTime = HoodieTestUtils.getNewCommitTime(); - HoodieTableMetadata metadata = new 
HoodieTableMetadata(FSUtils.getFs(), basePath, "testTable"); + String commitTime = HoodieTestUtils.makeNewCommitTime(); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath); HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(commitTime, config, metadata); List records = new ArrayList<>(); @@ -367,7 +370,7 @@ public class TestCopyOnWriteTable { HoodieClientTestUtils.fakeCommitFile(basePath, "001"); HoodieClientTestUtils.fakeDataFile(basePath, TEST_PARTITION_PATH, "001", "file1", fileSize); - HoodieTableMetadata metadata = new HoodieTableMetadata(FSUtils.getFs(), basePath, "testTable"); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath); HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable("001", config, metadata); HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[]{TEST_PARTITION_PATH}); diff --git a/hoodie-common/pom.xml b/hoodie-common/pom.xml index 1c23d8fe8..7c81674b6 100644 --- a/hoodie-common/pom.xml +++ b/hoodie-common/pom.xml @@ -28,8 +28,8 @@ - org.codehaus.mojo - cobertura-maven-plugin + org.jacoco + jacoco-maven-plugin org.apache.maven.plugins @@ -90,5 +90,14 @@ 1.10.19 test + + org.apache.commons + commons-lang3 + + + com.esotericsoftware + kryo + test + diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieCommitMetadata.java b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieCommitMetadata.java index 1e7448475..ff221fc0b 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieCommitMetadata.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieCommitMetadata.java @@ -24,6 +24,7 @@ import org.codehaus.jackson.map.ObjectMapper; import java.io.IOException; import java.io.Serializable; +import java.nio.charset.Charset; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -187,4 +188,8 @@ public class HoodieCommitMetadata implements Serializable 
{ public int hashCode() { return partitionToWriteStats != null ? partitionToWriteStats.hashCode() : 0; } + + public static HoodieCommitMetadata fromBytes(byte[] bytes) throws IOException { + return fromJsonString(new String(bytes, Charset.forName("utf-8"))); + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieCommits.java b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieCommits.java deleted file mode 100644 index 4094f78c2..000000000 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieCommits.java +++ /dev/null @@ -1,191 +0,0 @@ -/* - * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.uber.hoodie.common.model; - - -import java.io.Serializable; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - -/** - * Manages the commit meta and provides operations on the commit timeline - */ -public class HoodieCommits implements Serializable { - - private List commitList; - - public HoodieCommits(List commitList) { - this.commitList = new ArrayList<>(commitList); - Collections.sort(this.commitList); - this.commitList = Collections.unmodifiableList(this.commitList); - } - - /** - * Returns the commits which are in the range (startsTs, endTs]. 
- * - * @param startTs - exclusive start commit ts - * @param endTs - inclusive end commit ts - */ - public List findCommitsInRange(String startTs, String endTs) { - if (commitList.isEmpty()) { - return Collections.EMPTY_LIST; - } - int startIndex = 0; - if (startTs != null) { - startIndex = Collections.binarySearch(commitList, startTs); - // If startIndex is negative - if (startIndex < 0) { - startIndex = -(startIndex + 1); - } - } - - int endIndex = Collections.binarySearch(commitList, endTs); - // If endIndex is negative - if (endIndex < 0) { - endIndex = -(endIndex + 1); - } - - if (endIndex < startIndex) { - throw new IllegalArgumentException( - "Start Commit Ts " + startTs + " cannot be less than end commit ts" + endTs); - } - List returns = new ArrayList<>(commitList.subList(startIndex, endIndex)); - if(endIndex < commitList.size()) { - // Be inclusive of the endIndex - returns.add(commitList.get(endIndex)); - } - return Collections.unmodifiableList(returns); - } - - /** - * Finds the list of commits on or before asOfTs - */ - public List findCommitsAfter(String commitTimeStamp, int numCommits) { - if (commitList.isEmpty()) { - return null; - } - - int startIndex = Collections.binarySearch(commitList, commitTimeStamp); - if (startIndex < 0) { - startIndex = -(startIndex + 1); - } else { - // we found asOfTs at startIndex. We want to exclude it. 
- startIndex++; - } - - - List commits = new ArrayList<>(); - while (numCommits > 0 && startIndex < commitList.size()) { - commits.add(commitList.get(startIndex)); - startIndex++; - numCommits--; - } - - return Collections.unmodifiableList(commits); - } - - @Override - public String toString() { - final StringBuilder sb = new StringBuilder("HoodieCommits{"); - sb.append("commitList=").append(commitList); - sb.append('}'); - return sb.toString(); - } - - public boolean isEmpty() { - return commitList.isEmpty(); - } - - public int getNumCommits() { - return commitList.size(); - } - - public String firstCommit() { - return commitList.isEmpty() ? null : commitList.get(0); - } - - public String nthCommit(int n) { - return commitList.isEmpty() || n >= commitList.size() ? null : commitList.get(n); - } - - public String lastCommit() { - return commitList.isEmpty() ? null : commitList.get(commitList.size() - 1); - } - - /** - * Returns the nth commit from the latest commit such that lastCommit(0) gteq lastCommit() - */ - public String lastCommit(int n) { - if (commitList.size() < n + 1) { - return null; - } - return commitList.get(commitList.size() - 1 - n); - } - - public boolean contains(String commitTs) { - return commitList.contains(commitTs); - } - - public String max(String commit1, String commit2) { - if (commit1 == null && commit2 == null) { - return null; - } - if (commit1 == null) { - return commit2; - } - if (commit2 == null) { - return commit1; - } - return (isCommit1BeforeOrOn(commit1, commit2) ? 
commit2 : commit1); - } - - public static boolean isCommit1BeforeOrOn(String commit1, String commit2) { - return commit1.compareTo(commit2) <= 0; - } - - public static boolean isCommit1After(String commit1, String commit2) { - return commit1.compareTo(commit2) > 0; - } - - public List getCommitList() { - return commitList; - } - - public boolean isCommitBeforeEarliestCommit(String commitTs) { - return isCommit1BeforeOrOn(commitTs, firstCommit()); - } - - @Override - public boolean equals(Object o) { - if (this == o) - return true; - if (o == null || getClass() != o.getClass()) - return false; - - HoodieCommits that = (HoodieCommits) o; - - return commitList != null ? commitList.equals(that.commitList) : that.commitList == null; - - } - - @Override - public int hashCode() { - return commitList != null ? commitList.hashCode() : 0; - } - -} diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieDataFile.java b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieDataFile.java new file mode 100644 index 000000000..fc02110b5 --- /dev/null +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieDataFile.java @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.uber.hoodie.common.model; + +import com.uber.hoodie.common.util.FSUtils; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathFilter; + +import java.util.Comparator; + +public class HoodieDataFile { + private FileStatus fileStatus; + + public HoodieDataFile(FileStatus fileStatus) { + this.fileStatus = fileStatus; + } + + public String getFileId() { + return FSUtils.getFileId(fileStatus.getPath().getName()); + } + + public String getCommitTime() { + return FSUtils.getCommitTime(fileStatus.getPath().getName()); + } + + public String getPath() { + return fileStatus.getPath().toString(); + } + + public String getFileName() { + return fileStatus.getPath().getName(); + } + + public FileStatus getFileStatus() { + return fileStatus; + } + + public static Comparator getCommitTimeComparator() { + return (o1, o2) -> { + // reverse the order + return o2.getCommitTime().compareTo(o1.getCommitTime()); + }; + } + + public long getFileSize() { + return fileStatus.getLen(); + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("HoodieDataFile{"); + sb.append("fileStatus=").append(fileStatus); + sb.append('}'); + return sb.toString(); + } +} diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieFile.java b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieFile.java deleted file mode 100644 index ca3f46b2d..000000000 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieFile.java +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.uber.hoodie.common.model; - -import com.uber.hoodie.common.util.FSUtils; - -import org.apache.hadoop.fs.FileStatus; - -public class HoodieFile { - - private final FileStatus fileStatus; - private String fileNameWithoutCommitTs; - private String commitTs; - - public HoodieFile(FileStatus fileStatus) { - this.fileStatus = fileStatus; - String fileName = fileStatus.getPath().getName(); - this.fileNameWithoutCommitTs = FSUtils.getFileId(fileName); - this.commitTs = FSUtils.getCommitTime(fileName); - } - - public String getFileNameWithoutCommitTs() { - return fileNameWithoutCommitTs; - } - - public String getCommitTs() { - return commitTs; - } - - @Override - public String toString() { - final StringBuilder sb = new StringBuilder("HoodieFile{"); - sb.append("fileStatus=").append(fileStatus); - sb.append(", fileNameWithoutCommitTs='").append(fileNameWithoutCommitTs).append('\''); - sb.append(", commitTs='").append(commitTs).append('\''); - sb.append('}'); - return sb.toString(); - } - - public FileStatus getFileStatus() { - return fileStatus; - } -} diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieStorageType.java b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieStorageType.java new file mode 100644 index 000000000..6c53078c7 --- /dev/null +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieStorageType.java @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. 
/**
 * Storage formats supported for the read-optimized view of a Hoodie dataset,
 * together with the file extension each format uses on disk.
 */
public enum HoodieStorageType {
  PARQUET(".parquet");

  // File-name suffix (including the leading dot) for data files written in this format.
  private final String fileExtension;

  HoodieStorageType(String fileExtension) {
    this.fileExtension = fileExtension;
  }

  /**
   * @return the file extension (including the leading dot) for this storage type
   */
  public String getFileExtension() {
    return fileExtension;
  }
}
- */ - -package com.uber.hoodie.common.model; - -import com.uber.hoodie.common.util.FSUtils; - -import com.uber.hoodie.exception.DatasetNotFoundException; -import com.uber.hoodie.exception.HoodieIOException; -import com.uber.hoodie.exception.InvalidDatasetException; -import org.apache.commons.io.IOUtils; -import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.PathFilter; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - -import java.io.FileNotFoundException; -import java.io.IOException; -import java.io.Serializable; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.Date; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Properties; -import java.util.SortedMap; -import java.util.TreeMap; - -/** - * Manages all file system level interactions for the Hoodie tables. 
- */ -public class HoodieTableMetadata implements Serializable { - public static final String MAX_COMMIT_TS = String.valueOf(Long.MAX_VALUE); - public static final String HOODIE_TABLE_NAME_PROP_NAME = "hoodie.table.name"; - public static final String HOODIE_TABLE_TYPE_PROP_NAME = "hoodie.table.type"; - public static final HoodieTableType DEFAULT_TABLE_TYPE = HoodieTableType.COPY_ON_WRITE; - - public static final String HOODIE_PROPERTIES_FILE = "hoodie.properties"; - private static final String HOODIE_HDRONE_PROFILE_DEFAULT_VALUE = "HOODIE"; - private static final java.lang.String HOODIE_HDRONE_PROFILE_PROP_NAME = - "hoodie.hdrone.dataset.profile"; - - private static Logger log = LogManager.getLogger(HoodieTableMetadata.class); - private transient final FileSystem fs; - private transient final Path metadataFolder; - private final Properties properties; - private HoodieCommits commits; - private List inflightCommits; - private String basePath; - - public static final String METAFOLDER_NAME = ".hoodie"; - public static final String COMMIT_FILE_SUFFIX = ".commit"; - public static final String INFLIGHT_FILE_SUFFIX = ".inflight"; - - /** - * Constructor which initializes the hoodie table metadata. It will initialize the meta-data if not already present. 
- * - * @param fs - * @param basePath - * @param tableName - */ - public HoodieTableMetadata(FileSystem fs, String basePath, String tableName) { - this(fs, basePath, tableName, true); - } - - /** - * Constructor which loads the hoodie table metadata, It requires the meta-data to be present already - * @param fs - * @param basePath - */ - public HoodieTableMetadata(FileSystem fs, String basePath) { - this(fs, basePath, null, false); - } - - private HoodieTableMetadata(FileSystem fs, String basePath, String tableName, - boolean initOnMissing) { - this.fs = fs; - this.basePath = basePath; - - try { - Path basePathDir = new Path(this.basePath); - if (!fs.exists(basePathDir)) { - if (initOnMissing) { - fs.mkdirs(basePathDir); - } else { - throw new DatasetNotFoundException(this.basePath); - } - } - - if (!fs.isDirectory(new Path(basePath))) { - throw new DatasetNotFoundException(this.basePath); - } - - this.metadataFolder = new Path(this.basePath, METAFOLDER_NAME); - Path propertyPath = new Path(metadataFolder, HOODIE_PROPERTIES_FILE); - if (!fs.exists(propertyPath)) { - if (initOnMissing) { - // create .hoodie folder if it does not exist. - createHoodieProperties(metadataFolder, tableName); - } else { - throw new InvalidDatasetException(this.basePath); - } - } - - // Load meta data - this.commits = new HoodieCommits(scanCommits(COMMIT_FILE_SUFFIX)); - this.inflightCommits = scanCommits(INFLIGHT_FILE_SUFFIX); - this.properties = readHoodieProperties(); - log.info("All commits :" + commits); - } catch (IOException e) { - throw new HoodieIOException("Could not load HoodieMetadata from path " + basePath, e); - } - } - - /** - * Returns all the commit metadata for this table. Reads all the commit files from HDFS. - * Expensive operation, use with caution. 
- * - * @return SortedMap of CommitTime,HoodieCommitMetadata - */ - public SortedMap getAllCommitMetadata() { - try { - TreeMap metadataMap = new TreeMap<>(); - for (String commitTs : commits.getCommitList()) { - metadataMap.put(commitTs, getCommitMetadata(commitTs)); - } - return Collections.unmodifiableSortedMap(metadataMap); - } catch (IOException e) { - throw new HoodieIOException("Could not load all commits for table " + getTableName(), - e); - } - } - - public HoodieCommitMetadata getCommitMetadata(String commitTime) throws IOException { - FSDataInputStream is = fs.open(new Path(metadataFolder, FSUtils.makeCommitFileName(commitTime))); - try { - String jsonStr = IOUtils.toString(is); - return HoodieCommitMetadata.fromJsonString(jsonStr); - } finally { - is.close(); - } - } - - public HoodieTableType getTableType() { - return HoodieTableType.valueOf(properties.getProperty(HOODIE_TABLE_TYPE_PROP_NAME)); - } - - /** - * Lookup the file name for specified HoodieRecord - * - * TODO(vc): This metadata needs to be cached in each executor, statically, and used across, if - * we need to be nicer to the NameNode - */ - public String getFilenameForRecord(FileSystem fs, final HoodieRecord record) { - String fileId = record.getCurrentLocation().getFileId(); - return getFilenameForRecord(fs, record, fileId); - } - - - public String getFilenameForRecord(FileSystem fs, final HoodieRecord record, String fileId) { - try { - FileStatus[] files = fs.listStatus(new Path(basePath, record.getPartitionPath())); - Map> fileIdToVersions = - groupFilesByFileId(files, commits.lastCommit()); - // If the record is not found - if(!fileIdToVersions.containsKey(fileId)) { - throw new FileNotFoundException("Cannot find valid versions for fileId " + fileId); - } - - List statuses = fileIdToVersions.get(fileId); - return statuses.get(0).getPath().getName(); - } catch (IOException e) { - throw new HoodieIOException( - "Could not get Filename for record " + record, e); - } - } - - - - /** - * Get 
only the latest file in the partition with precondition commitTime(file) lt maxCommitTime - * - * @param fs - * @param partitionPathStr - * @param maxCommitTime - * @return - */ - public FileStatus[] getLatestVersionInPartition(FileSystem fs, String partitionPathStr, - String maxCommitTime) { - try { - Path partitionPath = new Path(basePath, partitionPathStr); - if(!fs.exists(partitionPath)) { - return new FileStatus[0]; - } - FileStatus[] files = fs.listStatus(partitionPath); - Map> fileIdToVersions = - groupFilesByFileId(files, commits.lastCommit()); - HashMap validFiles = new HashMap<>(); - for (String fileId : fileIdToVersions.keySet()) { - List versions = fileIdToVersions.get(fileId); - for (FileStatus file : versions) { - String filename = file.getPath().getName(); - String commitTime = FSUtils.getCommitTime(filename); - if (HoodieCommits.isCommit1BeforeOrOn(commitTime, maxCommitTime)) { - validFiles.put(fileId, file); - break; - } - } - } - return validFiles.values().toArray(new FileStatus[validFiles.size()]); - } catch (IOException e) { - throw new HoodieIOException( - "Could not get latest versions in Partition " + partitionPathStr, e); - } - } - - /** - * Get ALL the data files in partition grouped by fileId and sorted by the commitTime - * Given a partition path, provide all the files with a list of their commits, sorted by commit time. - */ - public Map> getAllVersionsInPartition(FileSystem fs, String partitionPath) { - try { - FileStatus[] files = fs.listStatus(new Path(basePath, partitionPath)); - return groupFilesByFileId(files, commits.lastCommit()); - } catch (IOException e) { - throw new HoodieIOException( - "Could not load all file versions in partition " + partitionPath, e); - } - } - - /** - * Get all the versions of files, within the commit range provided. 
- * - * @param commitsToReturn - commits to include - */ - public FileStatus[] getLatestVersionInRange(FileStatus[] fileStatuses, List commitsToReturn) { - if (commitsToReturn.isEmpty()) { - return new FileStatus[0]; - } - try { - Map> fileIdToVersions = - groupFilesByFileId(fileStatuses, commits.lastCommit()); - - List statuses = new ArrayList<>(); - for (List entry : fileIdToVersions.values()) { - for (FileStatus status : entry) { - String commitTime = FSUtils.getCommitTime(status.getPath().getName()); - if (commitsToReturn.contains(commitTime)) { - statuses.add(status); - break; - } - } - } - return statuses.toArray(new FileStatus[statuses.size()]); - } catch (IOException e) { - throw new HoodieIOException("Could not filter files from commits " + commitsToReturn, e); - } - } - - /** - * - * Get the latest versions of all the files. - * - * @param fileStatuses - * @return - */ - public FileStatus[] getLatestVersions(FileStatus[] fileStatuses) { - try { - Map> fileIdToVersions = - groupFilesByFileId(fileStatuses, commits.lastCommit()); - - List statuses = new ArrayList<>(); - for(List entry:fileIdToVersions.values()) { - // first file is the latest one - statuses.add(entry.get(0)); - } - return statuses.toArray(new FileStatus[statuses.size()]); - } catch (IOException e) { - throw new HoodieIOException("Could not filter files for latest version ", e); - } - } - - - /** - * Get the base path for the Hoodie Table - * - * @return - */ - public String getBasePath() { - return basePath; - } - - - public boolean isCommitsEmpty() { - return commits.isEmpty(); - } - - public boolean isCommitTsSafe(String commitTs) { - return !isCommitsEmpty() && (commits.isCommitBeforeEarliestCommit(commitTs) || commits - .contains(commitTs)); - } - - public List findCommitsSinceTs(String startTs) { - return commits.findCommitsInRange(startTs, MAX_COMMIT_TS); - } - - public List findCommitsInRange(String startTs, String endTs) { - return commits.findCommitsInRange(startTs, endTs); - } - - 
public List findCommitsAfter(String startTs, Integer maxCommits) { - return commits.findCommitsAfter(startTs, maxCommits); - } - - public HoodieCommits getAllCommits() { - return commits; - } - - public List getAllInflightCommits() { - return inflightCommits; - } - - @Override - public String toString() { - final StringBuilder sb = new StringBuilder("HoodieTableMetadata{"); - sb.append("commits=").append(commits); - sb.append('}'); - return sb.toString(); - } - - public String getTableName() { - return properties.getProperty(HOODIE_TABLE_NAME_PROP_NAME); - } - - public String getHDroneDatasetProfile() { - return properties.getProperty(HOODIE_HDRONE_PROFILE_PROP_NAME, HOODIE_HDRONE_PROFILE_DEFAULT_VALUE); - } - - /** - * Initialize the hoodie meta directory and any necessary files inside the meta (including the hoodie.properties) - * - * @param metadataFolder - * @param tableName - * @throws IOException - */ - private void createHoodieProperties(Path metadataFolder, String tableName) throws IOException { - if (!fs.exists(metadataFolder)) { - fs.mkdirs(metadataFolder); - } - Path propertyPath = new Path(metadataFolder, HOODIE_PROPERTIES_FILE); - FSDataOutputStream outputStream = fs.create(propertyPath); - try { - Properties props = new Properties(); - props.setProperty(HOODIE_TABLE_NAME_PROP_NAME, tableName); - props.setProperty(HOODIE_TABLE_TYPE_PROP_NAME, DEFAULT_TABLE_TYPE.name()); - props - .store(outputStream, "Properties saved on " + new Date(System.currentTimeMillis())); - } finally { - outputStream.close(); - } - } - - /** - * Loads the hoodie table properties from the hoodie.properties file under the .hoodie path - */ - private Properties readHoodieProperties() throws IOException { - Properties props = new Properties(); - Path propertyPath = new Path(metadataFolder, HOODIE_PROPERTIES_FILE); - FSDataInputStream inputStream = fs.open(propertyPath); - try { - props.load(inputStream); - } finally { - inputStream.close(); - } - return props; - } - - /** - * Scan 
the commit times (only choosing commit file with the given suffix) - */ - private List scanCommits(final String commitFileSuffix) throws IOException { - log.info("Attempting to load the commits under " + metadataFolder + " with suffix " + commitFileSuffix); - final List commitFiles = new ArrayList<>(); - fs.listStatus(metadataFolder, new PathFilter() { - @Override - public boolean accept(Path path) { - if (path.getName().endsWith(commitFileSuffix)) { - commitFiles.add(path.getName().split("\\.")[0]); - return true; - } - return false; - } - }); - return commitFiles; - } - - /** - * Takes a bunch of file versions, and returns a map keyed by fileId, with the necessary - * version safety checking. Returns a map of commitTime and Sorted list of FileStats - * ( by reverse commit time ) - * - * @param maxCommitTime maximum permissible commit time - * - * @return - */ - private Map> groupFilesByFileId(FileStatus[] files, - String maxCommitTime) throws IOException { - HashMap> fileIdtoVersions = new HashMap<>(); - for (FileStatus file : files) { - String filename = file.getPath().getName(); - String fileId = FSUtils.getFileId(filename); - String commitTime = FSUtils.getCommitTime(filename); - if (isCommitTsSafe(commitTime) && HoodieCommits - .isCommit1BeforeOrOn(commitTime, maxCommitTime)) { - if (!fileIdtoVersions.containsKey(fileId)) { - fileIdtoVersions.put(fileId, new ArrayList()); - } - fileIdtoVersions.get(fileId).add(file); - } - } - for (Map.Entry> entry : fileIdtoVersions.entrySet()) { - Collections.sort(fileIdtoVersions.get(entry.getKey()), new Comparator() { - @Override - public int compare(FileStatus o1, FileStatus o2) { - String o1CommitTime = FSUtils.getCommitTime(o1.getPath().getName()); - String o2CommitTime = FSUtils.getCommitTime(o2.getPath().getName()); - // Reverse the order - return o2CommitTime.compareTo(o1CommitTime); - } - }); - } - return fileIdtoVersions; - } - - @Override - public boolean equals(Object o) { - if (this == o) - return true; - if (o 
== null || getClass() != o.getClass()) - return false; - - HoodieTableMetadata metadata = (HoodieTableMetadata) o; - - if (commits != null ? !commits.equals(metadata.commits) : metadata.commits != null) - return false; - return basePath != null ? basePath.equals(metadata.basePath) : metadata.basePath == null; - - } - - @Override - public int hashCode() { - int result = commits != null ? commits.hashCode() : 0; - result = 31 * result + (basePath != null ? basePath.hashCode() : 0); - return result; - } - -} - diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/HoodieTableConfig.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/HoodieTableConfig.java new file mode 100644 index 000000000..38fcfd32c --- /dev/null +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/HoodieTableConfig.java @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.uber.hoodie.common.table; + +import com.uber.hoodie.common.model.HoodieStorageType; +import com.uber.hoodie.common.model.HoodieTableType; +import com.uber.hoodie.common.util.FSUtils; +import com.uber.hoodie.exception.HoodieIOException; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.io.Serializable; +import java.util.Date; +import java.util.Properties; + +/** + * Configurations on the Hoodie Table like type of ingestion, storage formats, hive table name etc + * Configurations are loaded from hoodie.properties, these properties are usually set during initializing a path as hoodie base path + * and never changes during the lifetime of a hoodie dataset. + * + * @see HoodieTableMetaClient + * @since 0.3.0 + */ +public class HoodieTableConfig implements Serializable { + private final transient static Logger log = LogManager.getLogger(HoodieTableConfig.class); + + public static final String HOODIE_PROPERTIES_FILE = "hoodie.properties"; + public static final String HOODIE_TABLE_NAME_PROP_NAME = "hoodie.table.name"; + public static final String HOODIE_TABLE_TYPE_PROP_NAME = "hoodie.table.type"; + public static final String HOODIE_RO_STORAGE_FORMAT_PROP_NAME = + "hoodie.table.ro.storage.format"; + public static final HoodieTableType DEFAULT_TABLE_TYPE = HoodieTableType.COPY_ON_WRITE; + public static final HoodieStorageType DEFAULT_RO_STORAGE_FORMAT = HoodieStorageType.PARQUET; + private Properties props; + + public HoodieTableConfig(FileSystem fs, String metaPath) { + Properties props = new Properties(); + Path propertyPath = new Path(metaPath, HOODIE_PROPERTIES_FILE); + log.info("Loading dataset properties from " + propertyPath); + try { + try (FSDataInputStream inputStream = fs.open(propertyPath)) { + 
props.load(inputStream); + } + } catch (IOException e) { + throw new HoodieIOException("Could not load Hoodie properties from " + propertyPath, e); + } + this.props = props; + } + + /** + * For serailizing and de-serializing + * @deprecated + */ + public HoodieTableConfig() { + } + + /** + * Initialize the hoodie meta directory and any necessary files inside the meta (including the hoodie.properties) + * + * @param metadataFolder + * @param properties + * @throws IOException + */ + public static void createHoodieProperties(FileSystem fs, Path metadataFolder, + Properties properties) throws IOException { + if (!fs.exists(metadataFolder)) { + fs.mkdirs(metadataFolder); + } + Path propertyPath = new Path(metadataFolder, HOODIE_PROPERTIES_FILE); + FSDataOutputStream outputStream = fs.create(propertyPath); + try { + if (!properties.containsKey(HOODIE_TABLE_NAME_PROP_NAME)) { + throw new IllegalArgumentException( + HOODIE_TABLE_NAME_PROP_NAME + " property needs to be specified"); + } + if (!properties.containsKey(HOODIE_TABLE_TYPE_PROP_NAME)) { + properties.setProperty(HOODIE_TABLE_TYPE_PROP_NAME, DEFAULT_TABLE_TYPE.name()); + } + properties + .store(outputStream, "Properties saved on " + new Date(System.currentTimeMillis())); + } finally { + outputStream.close(); + } + } + + + /** + * Read the table type from the table properties and if not found, return the default + * + * @return + */ + public HoodieTableType getTableType() { + if (props.contains(HOODIE_TABLE_TYPE_PROP_NAME)) { + return HoodieTableType.valueOf(props.getProperty(HOODIE_TABLE_TYPE_PROP_NAME)); + } + return DEFAULT_TABLE_TYPE; + } + + /** + * Read the table name + * + * @return + */ + public String getTableName() { + return props.getProperty(HOODIE_TABLE_NAME_PROP_NAME); + } + + /** + * Get the Read Optimized Storage Format + * + * @return HoodieStorageType for the Read Optimized Storage format + */ + public HoodieStorageType getROStorageFormat() { + if 
(props.contains(HOODIE_RO_STORAGE_FORMAT_PROP_NAME)) { + return HoodieStorageType.valueOf(props.getProperty(HOODIE_RO_STORAGE_FORMAT_PROP_NAME)); + } + return DEFAULT_RO_STORAGE_FORMAT; + } +} diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/HoodieTableMetaClient.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/HoodieTableMetaClient.java new file mode 100644 index 000000000..f4a7ef44d --- /dev/null +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/HoodieTableMetaClient.java @@ -0,0 +1,275 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.uber.hoodie.common.table; + +import com.uber.hoodie.common.model.HoodieTableType; +import com.uber.hoodie.common.table.timeline.HoodieActiveCommitTimeline; +import com.uber.hoodie.common.table.timeline.HoodieArchivedCommitTimeline; +import com.uber.hoodie.common.table.timeline.HoodieCleanerTimeline; +import com.uber.hoodie.common.table.timeline.HoodieSavePointTimeline; +import com.uber.hoodie.common.util.FSUtils; +import com.uber.hoodie.exception.DatasetNotFoundException; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathFilter; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.File; +import java.io.IOException; +import java.io.Serializable; +import java.util.Objects; +import java.util.Properties; + +/** + * HoodieTableMetaClient allows to access meta-data about a hoodie table + * It returns meta-data about commits, savepoints, compactions, cleanups as a HoodieTimeline + * Create an instance of the HoodieTableMetaClient with FileSystem and basePath to start getting the meta-data. + *

+ * All the timelines are computed lazily, once computed the timeline is cached and never refreshed. + * Use the HoodieTimeline.reload() to refresh timelines. + * + * @see HoodieTimeline + * @since 0.3.0 + */ +public class HoodieTableMetaClient implements Serializable { + private final transient static Logger log = LogManager.getLogger(HoodieTableMetaClient.class); + public static String METAFOLDER_NAME = ".hoodie"; + public static String COMMIT_EXTENSION = ".commit"; + public static String CLEAN_EXTENSION = ".clean"; + public static String SAVEPOINT_EXTENSION = ".savepoint"; + public static String INFLIGHT_FILE_SUFFIX = ".inflight"; + + private String basePath; + private transient FileSystem fs; + private String metaPath; + private HoodieTableType tableType; + private HoodieTableConfig tableConfig; + private HoodieTimeline activeCommitTimeline; + private HoodieTimeline archivedCommitTimeline; + private HoodieTimeline savePointTimeline; + private HoodieTimeline cleanerTimeline; + + public HoodieTableMetaClient(FileSystem fs, String basePath) throws DatasetNotFoundException { + // Do not load any timeline by default + this(fs, basePath, false); + } + + public HoodieTableMetaClient(FileSystem fs, String basePath, + boolean loadActiveCommitTimelineOnLoad) throws DatasetNotFoundException { + log.info("Loading HoodieTableMetaClient from " + basePath); + this.basePath = basePath; + this.fs = fs; + Path basePathDir = new Path(this.basePath); + this.metaPath = basePath + File.separator + METAFOLDER_NAME; + Path metaPathDir = new Path(this.metaPath); + DatasetNotFoundException.checkValidDataset(fs, basePathDir, metaPathDir); + this.tableConfig = new HoodieTableConfig(fs, metaPath); + this.tableType = tableConfig.getTableType(); + log.info("Finished Loading Table of type " + tableType + " from " + basePath); + if (loadActiveCommitTimelineOnLoad) { + log.info("Loading Active commit timeline for " + basePath); + getActiveCommitTimeline(); + } + } + + /** + * For serailizing 
and de-serializing + * @deprecated + */ + public HoodieTableMetaClient() { + } + + /** + * This method is only used when this object is deserialized in a spark executor. + * @deprecated + */ + private void readObject(java.io.ObjectInputStream in) + throws IOException, ClassNotFoundException { + in.defaultReadObject(); + this.fs = FSUtils.getFs(); + } + + /** + * @return Base path + */ + public String getBasePath() { + return basePath; + } + + /** + * @return Hoodie Table Type + */ + public HoodieTableType getTableType() { + return tableType; + } + + /** + * @return Meta path + */ + public String getMetaPath() { + return metaPath; + } + + /** + * @return Table Config + */ + public HoodieTableConfig getTableConfig() { + return tableConfig; + } + + /** + * Get the active commits as a timeline + * + * @return Active commit timeline + * @throws IOException + */ + public synchronized HoodieTimeline getActiveCommitTimeline() { + if (activeCommitTimeline == null) { + activeCommitTimeline = new HoodieActiveCommitTimeline(fs, metaPath); + } + return activeCommitTimeline; + } + + /** + * Get the archived commits as a timeline. This is costly operation, as all data from the + * archived files are read. This should not be used, unless for historical debugging purposes + * + * @return Active commit timeline + * @throws IOException + */ + public HoodieTimeline getArchivedCommitTimeline() { + if (archivedCommitTimeline == null) { + archivedCommitTimeline = new HoodieArchivedCommitTimeline(fs, metaPath); + } + return archivedCommitTimeline; + } + + /** + * Get the save points as a timeline. + * + * @return Savepoint timeline + * @throws IOException + */ + public HoodieTimeline getSavePointsTimeline() { + if (savePointTimeline == null) { + savePointTimeline = new HoodieSavePointTimeline(fs, metaPath); + } + return savePointTimeline; + } + + /** + * Get the cleaner activity as a timeline. 
+ * + * @return Cleaner activity + * @throws IOException + */ + public HoodieTimeline getCleanerTimeline() { + if (cleanerTimeline == null) { + cleanerTimeline = new HoodieCleanerTimeline(fs, metaPath); + } + return cleanerTimeline; + } + + + /** + * Helper method to initialize a given path as a hoodie dataset with configs passed in as as Properties + * + * @param fs + * @param basePath + * @param props + * @return Instance of HoodieTableMetaClient + * @throws IOException + */ + public static HoodieTableMetaClient initializePathAsHoodieDataset(FileSystem fs, + String basePath, Properties props) throws IOException { + log.info("Initializing " + basePath + " as hoodie dataset " + basePath); + Path basePathDir = new Path(basePath); + if (!fs.exists(basePathDir)) { + fs.mkdirs(basePathDir); + } + Path metaPathDir = new Path(basePath, METAFOLDER_NAME); + if (!fs.exists(metaPathDir)) { + fs.mkdirs(metaPathDir); + } + HoodieTableConfig.createHoodieProperties(fs, metaPathDir, props); + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath); + log.info("Finished initializing Table of type " + metaClient.getTableConfig().getTableType() + + " from " + basePath); + return metaClient; + } + + // HELPER METHODS TO CREATE META FILE NAMES + public static String makeCommitFileName(String commitTime) { + return commitTime + COMMIT_EXTENSION; + } + + public static String makeInflightCommitFileName(String commitTime) { + return commitTime + INFLIGHT_FILE_SUFFIX; + } + + public static String makeCleanerFileName(String instant) { + return instant + CLEAN_EXTENSION; + } + + public static String makeInflightCleanerFileName(String instant) { + return instant + CLEAN_EXTENSION + INFLIGHT_FILE_SUFFIX; + } + + public static String makeInflightSavePointFileName(String commitTime) { + return commitTime + SAVEPOINT_EXTENSION + INFLIGHT_FILE_SUFFIX; + } + + public static String makeSavePointFileName(String commitTime) { + return commitTime + SAVEPOINT_EXTENSION; + } + + public 
static String getCommitFromCommitFile(String commitFileName) { + return commitFileName.split("\\.")[0]; + } + + public static FileStatus[] scanFiles(FileSystem fs, Path metaPath, PathFilter nameFilter) + throws IOException { + return fs.listStatus(metaPath, nameFilter); + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + HoodieTableMetaClient that = (HoodieTableMetaClient) o; + return Objects.equals(basePath, that.basePath) && tableType == that.tableType; + } + + @Override + public int hashCode() { + return Objects.hash(basePath, tableType); + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("HoodieTableMetaClient{"); + sb.append("basePath='").append(basePath).append('\''); + sb.append(", metaPath='").append(metaPath).append('\''); + sb.append(", tableType=").append(tableType); + sb.append('}'); + return sb.toString(); + } +} diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/HoodieTimeline.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/HoodieTimeline.java new file mode 100644 index 000000000..3baf142a7 --- /dev/null +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/HoodieTimeline.java @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.uber.hoodie.common.table; + +import com.uber.hoodie.common.table.timeline.HoodieDefaultTimeline; + +import java.io.IOException; +import java.io.Serializable; +import java.util.Optional; +import java.util.function.BiPredicate; +import java.util.stream.Stream; + +/** + * HoodieTimeline allows representation of meta-data events as a timeline. + * Instants are specific points in time represented as strings. + * in this format YYYYMMDDHHmmSS. e.g. 20170101193218 + * Any operation on the timeline starts with the inflight instant and then when complete marks + * the completed instant and removes the inflight instant. + * Completed instants are plainly referred to as just instants + *

+ * Timelines as immutable once created. Any operation to change the timeline (like create/delete instants) + * will not be reflected unless explicitly reloaded using the reload() + * + * @see com.uber.hoodie.common.table.HoodieTableMetaClient + * @see HoodieDefaultTimeline + * @since 0.3.0 + */ +public interface HoodieTimeline extends Serializable { + /** + * Find all the completed instants after startTs and before or on endTs + * + * @param startTs + * @param endTs + * @return Stream of instants + */ + Stream findInstantsInRange(String startTs, String endTs); + + /** + * Find all the completed instants after startTs + * + * @param commitTime + * @param numCommits + * @return Stream of instants + */ + Stream findInstantsAfter(String commitTime, int numCommits); + + /** + * If the timeline has any completed instants + * + * @return true if timeline is not empty + */ + boolean hasInstants(); + + /** + * If the timeline has any in-complete instants + * + * @return true if timeline has any in-complete instants + */ + boolean hasInflightInstants(); + + /** + * @return total number of completed instants + */ + int getTotalInstants(); + + /** + * @return first completed instant if available + */ + Optional firstInstant(); + + /** + * @param n + * @return nth completed instant from the first completed instant + */ + Optional nthInstant(int n); + + /** + * @return last completed instant if available + */ + Optional lastInstant(); + + /** + * @param n + * @return nth completed instant going back from the last completed instant + */ + Optional nthFromLastInstant(int n); + + /** + * @return true if the passed instant is present as a completed instant on the timeline + */ + boolean containsInstant(String instant); + + /** + * @return true if the passed instant is present as a completed instant on the timeline or + * if the instant is before the first completed instant in the timeline + */ + boolean containsOrBeforeTimelineStarts(String instant); + + /** + * @return Get the 
stream of completed instants + */ + Stream getInstants(); + + /** + * @return Get the stream of in-flight instants + */ + Stream getInflightInstants(); + + /** + * @return true if the passed in instant is before the first completed instant in the timeline + */ + boolean isInstantBeforeTimelineStarts(String instant); + + /** + * Register the passed in instant as a in-flight + * + * @param instant + */ + void saveInstantAsInflight(String instant); + + /** + * Register the passed in instant as a completed instant. + * It needs to have a corresponding in-flight instant, otherwise it will fail. + * Pass a optional byte[] to save with the instant. + * + * @param instant + * @param data + */ + void saveInstantAsComplete(String instant, Optional data); + + /** + * Un-Register a completed instant as in-flight. This is usually atomic way to + * revert the effects of a operation on hoodie datasets + * + * @param instant + */ + void revertInstantToInflight(String instant); + + /** + * Remove the in-flight instant from the timeline + * + * @param instant + */ + void removeInflightFromTimeline(String instant); + + /** + * Reload the timeline. Timelines are immutable once created. 
+ * + * @return + * @throws IOException + */ + HoodieTimeline reload() throws IOException; + + /** + * Read the completed instant details + * + * @param instant + * @return + */ + Optional readInstantDetails(String instant); + + /** + * Helper methods to compare instants + **/ + BiPredicate GREATER_OR_EQUAL = + (commit1, commit2) -> commit1.compareTo(commit2) >= 0; + BiPredicate GREATER = (commit1, commit2) -> commit1.compareTo(commit2) > 0; + BiPredicate LESSER_OR_EQUAL = + (commit1, commit2) -> commit1.compareTo(commit2) <= 0; + BiPredicate LESSER = (commit1, commit2) -> commit1.compareTo(commit2) < 0; + + default boolean compareInstants(String commit1, String commit2, + BiPredicate predicateToApply) { + return predicateToApply.test(commit1, commit2); + } +} diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/TableFileSystemView.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/TableFileSystemView.java new file mode 100644 index 000000000..e29c79bd0 --- /dev/null +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/TableFileSystemView.java @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.uber.hoodie.common.table; + +import com.uber.hoodie.common.model.HoodieDataFile; +import com.uber.hoodie.common.model.HoodieRecord; +import org.apache.hadoop.fs.FileStatus; + +import java.util.List; +import java.util.stream.Stream; + +/** + * Interface for viewing the table file system. + * Dependening on the Hoodie Table Type - The view of the filesystem changes. + *

+ * ReadOptimizedView - Lets queries run only on organized columnar data files at the expense of latency + * WriteOptimizedView - Lets queries run on columnar data as well as delta files (sequential) at the expense of query execution time + * @since 0.3.0 + */ +public interface TableFileSystemView { + /** + * Stream all the data files for a specific FileId. + * This usually has a single RO file and multiple WO files if present. + * + * @param partitionPath + * @param fileId + * @return + */ + Stream getLatestDataFilesForFileId(final String partitionPath, + final String fileId); + + /** + * Stream all the latest version data files in the given partition + * with precondition that commitTime(file) before maxCommitTime + * + * @param partitionPathStr + * @param maxCommitTime + * @return + */ + Stream streamLatestVersionInPartition(String partitionPathStr, + String maxCommitTime); + + /** + * Stream all the data file versions grouped by FileId for a given partition + * + * @param partitionPath + * @return + */ + Stream> streamEveryVersionInPartition(String partitionPath); + + /** + * Stream all the versions from the passed in fileStatus[] with commit times containing in commitsToReturn. + * + * @param fileStatuses + * @param commitsToReturn + * @return + */ + Stream streamLatestVersionInRange(FileStatus[] fileStatuses, + List commitsToReturn); + + /** + * Stream the latest version from the passed in FileStatus[] with commit times less than maxCommitToReturn + * + * @param fileStatuses + * @param maxCommitToReturn + * @return + */ + Stream streamLatestVersionsBeforeOrOn(FileStatus[] fileStatuses, + String maxCommitToReturn); + + /** + * Stream latest versions from the passed in FileStatus[]. 
+ * Similar to calling streamLatestVersionsBeforeOrOn(fileStatuses, currentTimeAsCommitTime) + * + * @param fileStatuses + * @return + */ + Stream streamLatestVersions(FileStatus[] fileStatuses); +} diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieActiveCommitTimeline.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieActiveCommitTimeline.java new file mode 100644 index 000000000..2133b3fdf --- /dev/null +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieActiveCommitTimeline.java @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.common.table.timeline; + +import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.table.HoodieTimeline; +import com.uber.hoodie.exception.HoodieIOException; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import java.io.IOException; +import java.util.Arrays; +import java.util.stream.Collectors; + +/** + * Active commit timeline. Much optimized for reading than the ArchivedTimeline. 
+ */ +public class HoodieActiveCommitTimeline extends HoodieDefaultTimeline { + public HoodieActiveCommitTimeline(FileSystem fs, String metaPath) { + super(fs, metaPath); + String completedInstantExtension = HoodieTableMetaClient.COMMIT_EXTENSION; + String inflightInstantExtension = INFLIGHT_EXTENSION; + + FileStatus[] fileStatuses; + try { + fileStatuses = HoodieTableMetaClient.scanFiles(fs, new Path(metaPath), + path -> path.toString().endsWith(completedInstantExtension) || path.toString() + .endsWith(inflightInstantExtension)); + } catch (IOException e) { + throw new HoodieIOException("Failed to scan metadata", e); + } + this.instants = Arrays.stream(fileStatuses) + .filter(status -> status.getPath().getName().endsWith(completedInstantExtension)) + .map(fileStatus -> fileStatus.getPath().getName().replaceAll(completedInstantExtension, "")) + .sorted().collect(Collectors.toList()); + this.inflights = Arrays.stream(fileStatuses).filter( + status -> status.getPath().getName().endsWith(inflightInstantExtension)).map( + fileStatus -> fileStatus.getPath().getName() + .replaceAll(inflightInstantExtension, "")).sorted() + .collect(Collectors.toList()); + } + + @Override + public String getInflightFileName(String instant) { + return HoodieTableMetaClient.makeInflightCommitFileName(instant); + } + + @Override + public String getCompletedFileName(String instant) { + return HoodieTableMetaClient.makeCommitFileName(instant); + } + + @Override + protected String getTimelineName() { + return "commit"; + } + + @Override + public HoodieTimeline reload() throws IOException { + return new HoodieActiveCommitTimeline(fs, metaPath); + } +} diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieArchivedCommitTimeline.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieArchivedCommitTimeline.java new file mode 100644 index 000000000..d99ad2788 --- /dev/null +++ 
b/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieArchivedCommitTimeline.java @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.common.table.timeline; + +import com.google.common.io.Closeables; +import com.uber.hoodie.common.table.HoodieTimeline; +import com.uber.hoodie.exception.HoodieIOException; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; +import java.util.Optional; + +/** + * Archived commit timeline. 
These commits are usually cleaned up and the meta data is archived for + * future triaging + * + * @since 0.3.0 + */ +public class HoodieArchivedCommitTimeline extends HoodieDefaultTimeline { + private static final String HOODIE_COMMIT_ARCHIVE_LOG_FILE = "commits.archived"; + private transient Map readCommits = new HashMap<>(); + + public HoodieArchivedCommitTimeline(FileSystem fs, String metaPath) { + // Read back the commits to make sure + Path archiveLogPath = getArchiveLogPath(metaPath); + try { + SequenceFile.Reader reader = + new SequenceFile.Reader(fs.getConf(), SequenceFile.Reader.file(archiveLogPath)); + try { + Text key = new Text(); + Text val = new Text(); + while (reader.next(key, val)) { + // TODO - limit the number of commits loaded in memory. this could get very large. + // This is okay because only tooling will load the archived commit timeline today + readCommits.put(key.toString(), Arrays.copyOf(val.getBytes(), val.getLength())); + } + this.instants = new ArrayList<>(readCommits.keySet()); + this.inflights = new ArrayList<>(0); + } finally { + Closeables.closeQuietly(reader); + } + } catch (IOException e) { + throw new HoodieIOException( + "Could not load archived commit timeline from path " + archiveLogPath, e); + } + } + + @Override + public void saveInstantAsInflight(String instant) { + throw new UnsupportedOperationException( + "Could not save inflight instant in ArchivedTimeline " + instant); + } + + @Override + public void saveInstantAsComplete(String instant, Optional data) { + throw new UnsupportedOperationException( + "Could not save instant as complete in ArchivedTimeline " + instant); + } + + @Override + public void revertInstantToInflight(String instant) { + throw new UnsupportedOperationException( + "Could not revert instant in ArchivedTimeline " + instant); + } + + @Override + public void removeInflightFromTimeline(String instant) { + throw new UnsupportedOperationException( + "Could not delete inflight instant from ArchivedTimeline 
" + instant); + } + + @Override + public HoodieTimeline reload() throws IOException { + return new HoodieArchivedCommitTimeline(fs, metaPath); + } + + @Override + public Optional readInstantDetails(String instant) { + return Optional.ofNullable(readCommits.get(instant)); + } + + @Override + protected String getInflightFileName(String instant) { + throw new UnsupportedOperationException("No inflight filename for archived commits"); + } + + @Override + protected String getCompletedFileName(String instant) { + throw new UnsupportedOperationException("No inflight filename for archived commits"); + } + + @Override + protected String getTimelineName() { + return "archived-commits"; + } + + public static Path getArchiveLogPath(String metaPath) { + return new Path(metaPath, HOODIE_COMMIT_ARCHIVE_LOG_FILE); + } +} diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieCleanerTimeline.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieCleanerTimeline.java new file mode 100644 index 000000000..d8a9ed8e0 --- /dev/null +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieCleanerTimeline.java @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.uber.hoodie.common.table.timeline; + +import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.table.HoodieTimeline; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import java.io.IOException; +import java.util.Optional; + +public class HoodieCleanerTimeline extends HoodieDefaultTimeline { + public HoodieCleanerTimeline(FileSystem fs, String path) { + super(fs, path, HoodieTableMetaClient.CLEAN_EXTENSION); + } + + @Override + public HoodieTimeline reload() throws IOException { + return new HoodieCleanerTimeline(fs, metaPath); + } + + @Override + public Optional readInstantDetails(String instant) { + // TODO - Nothing about the clean written today - this should change + return Optional.empty(); + } + + @Override + protected String getInflightFileName(String instant) { + return HoodieTableMetaClient.makeInflightCleanerFileName(instant); + } + + @Override + protected String getCompletedFileName(String instant) { + return HoodieTableMetaClient.makeCleanerFileName(instant); + } + + @Override + protected String getTimelineName() { + return "cleaner"; + } +} diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieDefaultTimeline.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieDefaultTimeline.java new file mode 100644 index 000000000..c9143890d --- /dev/null +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieDefaultTimeline.java @@ -0,0 +1,348 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.common.table.timeline; + +import com.google.common.io.Closeables; +import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.table.HoodieTimeline; +import com.uber.hoodie.common.util.FSUtils; +import com.uber.hoodie.exception.HoodieIOException; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.Optional; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +/** + * HoodieTimeline allows representation of meta-data events as a timeline. + * Instants are specific points in time represented as strings. + * in this format YYYYMMDDHHmmSS. e.g. 20170101193218 + * Any operation on the timeline starts with the inflight instant and then when complete marks + * the completed instant and removes the inflight instant. + * Completed instants are plainly referred to as just instants + *

+ * Timelines as immutable once created. Any operation to change the timeline (like create/delete instants) + * will not be reflected unless explicitly reloaded using the reload() + * + * @see com.uber.hoodie.common.table.HoodieTableMetaClient + * @see HoodieTimeline + * @since 0.3.0 + */ +public abstract class HoodieDefaultTimeline implements HoodieTimeline { + private final transient static Logger log = LogManager.getLogger(HoodieDefaultTimeline.class); + + public static final String INFLIGHT_EXTENSION = ".inflight"; + protected String metaPath; + protected transient FileSystem fs; + protected List inflights; + protected List instants; + + public HoodieDefaultTimeline(FileSystem fs, String metaPath, String fileExtension) { + String completedInstantExtension = fileExtension; + String inflightInstantExtension = fileExtension + INFLIGHT_EXTENSION; + + FileStatus[] fileStatuses; + try { + fileStatuses = HoodieTableMetaClient.scanFiles(fs, new Path(metaPath), + path -> path.toString().endsWith(completedInstantExtension) || path.toString() + .endsWith(inflightInstantExtension)); + } catch (IOException e) { + throw new HoodieIOException("Failed to scan metadata", e); + } + this.instants = Arrays.stream(fileStatuses) + .filter(status -> status.getPath().getName().endsWith(completedInstantExtension)) + .map(fileStatus -> fileStatus.getPath().getName().replaceAll(completedInstantExtension, "")) + .sorted().collect(Collectors.toList()); + this.inflights = Arrays.stream(fileStatuses).filter( + status -> status.getPath().getName().endsWith(inflightInstantExtension)).map( + fileStatus -> fileStatus.getPath().getName() + .replaceAll(inflightInstantExtension, "")).sorted() + .collect(Collectors.toList()); + this.fs = fs; + this.metaPath = metaPath; + } + + public HoodieDefaultTimeline(Stream instants, Stream inflights) { + this.instants = instants.collect(Collectors.toList()); + this.inflights = inflights.collect(Collectors.toList()); + } + + /** + * This constructor only 
supports backwards compatibility in inflight commits in ActiveCommitTimeline. + * This should never be used. + * + * @param fs + * @param metaPath + * @deprecated + */ + public HoodieDefaultTimeline(FileSystem fs, String metaPath) { + this.fs = fs; + this.metaPath = metaPath; + } + + /** + * For serailizing and de-serializing + * @deprecated + */ + public HoodieDefaultTimeline() { + } + + + /** + * This method is only used when this object is deserialized in a spark executor. + * @deprecated + */ + private void readObject(java.io.ObjectInputStream in) + throws IOException, ClassNotFoundException { + in.defaultReadObject(); + this.fs = FSUtils.getFs(); + } + + @Override + public Stream findInstantsInRange(String startTs, String endTs) { + return instants.stream().filter( + s -> compareInstants(s, startTs, GREATER) && compareInstants(s, endTs, + LESSER_OR_EQUAL)); + } + + @Override + public Stream findInstantsAfter(String commitTime, int numCommits) { + return instants.stream().filter(s -> compareInstants(s, commitTime, GREATER)) + .limit(numCommits); + } + + @Override + public boolean hasInstants() { + return instants.stream().count() != 0; + } + + @Override + public boolean hasInflightInstants() { + return inflights.stream().count() != 0; + } + + @Override + public int getTotalInstants() { + return new Long(instants.stream().count()).intValue(); + } + + @Override + public Optional firstInstant() { + return instants.stream().findFirst(); + } + + @Override + public Optional nthInstant(int n) { + if(!hasInstants() || n >= getTotalInstants()) { + return Optional.empty(); + } + return Optional.of(instants.get(n)); + } + + @Override + public Optional lastInstant() { + return hasInstants() ? 
nthInstant(getTotalInstants() - 1) : Optional.empty(); + } + + @Override + public Optional nthFromLastInstant(int n) { + if(getTotalInstants() < n + 1) { + return Optional.empty(); + } + return nthInstant(getTotalInstants() - 1 - n); + } + + @Override + public boolean containsInstant(String instant) { + return instants.stream().anyMatch(s -> s.equals(instant)); + } + + @Override + public boolean containsOrBeforeTimelineStarts(String instant) { + return containsInstant(instant) || isInstantBeforeTimelineStarts(instant); + } + + @Override + public Stream getInstants() { + return instants.stream(); + } + + @Override + public Stream getInflightInstants() { + return inflights.stream(); + } + + @Override + public boolean isInstantBeforeTimelineStarts(String instant) { + Optional firstCommit = firstInstant(); + return firstCommit.isPresent() && compareInstants(instant, firstCommit.get(), LESSER); + } + + @Override + public void saveInstantAsInflight(String instant) { + log.info("Creating a new in-flight " + getTimelineName() + " " + instant); + // Create the in-flight file + createFileInMetaPath(getInflightFileName(instant), Optional.empty()); + } + + @Override + public void saveInstantAsComplete(String instant, Optional data) { + log.info("Marking complete " + getTimelineName() + " " + instant); + moveInflightToComplete(instant, data, getCompletedFileName(instant), + HoodieTableMetaClient.makeInflightCommitFileName(instant)); + log.info("Completed " + getTimelineName() + " " + instant); + } + + @Override + public void revertInstantToInflight(String instant) { + log.info("Reverting instant to inflight " + getTimelineName() + " " + instant); + moveCompleteToInflight(instant, getCompletedFileName(instant), + getInflightFileName(instant)); + log.info("Reverted " + getTimelineName() + " " + instant + " to inflight"); + } + + @Override + public void removeInflightFromTimeline(String instant) { + log.info("Removing in-flight " + getTimelineName() + " " + instant); + String 
inFlightCommitFileName = getInflightFileName(instant); + Path inFlightCommitFilePath = new Path(metaPath, inFlightCommitFileName); + try { + fs.delete(inFlightCommitFilePath, false); + log.info("Removed in-flight " + getTimelineName() + " " + instant); + } catch (IOException e) { + throw new HoodieIOException( + "Could not remove inflight commit " + inFlightCommitFilePath, e); + } + } + + @Override + public Optional readInstantDetails(String instant) { + Path detailPath = new Path(metaPath, getCompletedFileName(instant)); + return readDataFromPath(detailPath); + } + + + /** + * Get the in-flight instant file name + * + * @param instant + * @return + */ + protected abstract String getInflightFileName(String instant); + + /** + * Get the completed instant file name + * + * @param instant + * @return + */ + protected abstract String getCompletedFileName(String instant); + + /** + * Get the timeline name + * + * @return + */ + protected abstract String getTimelineName(); + + + protected void moveInflightToComplete(String instant, Optional data, + String commitFileName, String inflightFileName) { + Path commitFilePath = new Path(metaPath, commitFileName); + try { + // open a new file and write the commit metadata in + Path inflightCommitFile = new Path(metaPath, inflightFileName); + createFileInMetaPath(inflightFileName, data); + boolean success = fs.rename(inflightCommitFile, commitFilePath); + if (!success) { + throw new HoodieIOException( + "Could not rename " + inflightCommitFile + " to " + commitFilePath); + } + } catch (IOException e) { + throw new HoodieIOException("Could not complete commit " + instant, e); + } + } + + protected void moveCompleteToInflight(String instant, String commitFileName, + String inflightFileName) { + Path inFlightCommitFilePath = new Path(metaPath, inflightFileName); + try { + if (!fs.exists(inFlightCommitFilePath)) { + Path commitFilePath = new Path(metaPath, commitFileName); + boolean success = fs.rename(commitFilePath, 
inFlightCommitFilePath); + if (!success) { + throw new HoodieIOException( + "Could not rename " + commitFilePath + " to " + inFlightCommitFilePath); + } + } + } catch (IOException e) { + throw new HoodieIOException("Could not complete commit revert " + instant, e); + } + } + + protected void createFileInMetaPath(String filename, Optional content) { + Path fullPath = new Path(metaPath, filename); + try { + if (!content.isPresent()) { + if (fs.createNewFile(fullPath)) { + log.info("Created a new file in meta path: " + fullPath); + return; + } + } else { + FSDataOutputStream fsout = fs.create(fullPath, true); + fsout.write(content.get()); + fsout.close(); + return; + } + throw new HoodieIOException("Failed to create file " + fullPath); + } catch (IOException e) { + throw new HoodieIOException("Failed to create file " + fullPath, e); + } + } + + protected Optional readDataFromPath(Path detailPath) { + FSDataInputStream is = null; + try { + is = fs.open(detailPath); + return Optional.of(IOUtils.toByteArray(is)); + } catch (IOException e) { + throw new HoodieIOException("Could not read commit details from " + detailPath, e); + } finally { + if (is != null) { + Closeables.closeQuietly(is); + } + } + } + + + @Override + public String toString() { + return this.getClass().getName() + ": " + instants.stream().map(Object::toString) + .collect(Collectors.joining(",")); + } + +} diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieSavePointTimeline.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieSavePointTimeline.java new file mode 100644 index 000000000..3bc1748ff --- /dev/null +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieSavePointTimeline.java @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.common.table.timeline; + +import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.table.HoodieTimeline; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import java.io.IOException; + +public class HoodieSavePointTimeline extends HoodieDefaultTimeline { + public HoodieSavePointTimeline(FileSystem fs, String metaPath) { + super(fs, metaPath, HoodieTableMetaClient.SAVEPOINT_EXTENSION); + } + + @Override + public HoodieTimeline reload() throws IOException { + return new HoodieSavePointTimeline(fs, metaPath); + } + + @Override + protected String getInflightFileName(String instant) { + return HoodieTableMetaClient.makeInflightSavePointFileName(instant); + } + + @Override + protected String getCompletedFileName(String instant) { + return HoodieTableMetaClient.makeSavePointFileName(instant); + } + + @Override + protected String getTimelineName() { + return "savepoint"; + } +} diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/view/AbstractTableFileSystemView.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/view/AbstractTableFileSystemView.java new file mode 100644 index 000000000..e20700518 --- /dev/null +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/view/AbstractTableFileSystemView.java @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. 
(hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.common.table.view; + +import com.uber.hoodie.common.model.HoodieDataFile; +import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.table.TableFileSystemView; +import com.uber.hoodie.common.table.HoodieTimeline; +import com.uber.hoodie.exception.HoodieIOException; +import org.apache.commons.lang3.tuple.Pair; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.function.Function; +import java.util.stream.Collector; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +/** + * Common abstract implementation for multiple TableFileSystemView Implementations. 
+ * 2 possible implementations are ReadOptimizedView and RealtimeView + * + * Concrete implementations extending this abstract class, should only implement + * listDataFilesInPartition which includes files to be included in the view + * + * @see TableFileSystemView + * @see ReadOptimizedTableView + * @since 0.3.0 + */ +public abstract class AbstractTableFileSystemView implements TableFileSystemView { + protected final HoodieTableMetaClient metaClient; + protected final transient FileSystem fs; + protected final HoodieTimeline activeCommitTimeline; + + public AbstractTableFileSystemView(FileSystem fs, HoodieTableMetaClient metaClient) { + this.metaClient = metaClient; + this.fs = fs; + this.activeCommitTimeline = metaClient.getActiveCommitTimeline(); + } + + public Stream getLatestDataFilesForFileId(final String partitionPath, + String fileId) { + Optional lastInstant = activeCommitTimeline.lastInstant(); + if (lastInstant.isPresent()) { + return streamLatestVersionInPartition(partitionPath, lastInstant.get()) + .filter(hoodieDataFile -> hoodieDataFile.getFileId().equals(fileId)); + } + return Stream.empty(); + } + + @Override + public Stream streamLatestVersionInPartition(String partitionPathStr, + String maxCommitTime) { + try { + return streamLatestVersionsBeforeOrOn(listDataFilesInPartition(partitionPathStr), + maxCommitTime); + } catch (IOException e) { + throw new HoodieIOException( + "Could not get latest versions in Partition " + partitionPathStr, e); + } + } + + + @Override + public Stream> streamEveryVersionInPartition(String partitionPath) { + try { + if(activeCommitTimeline.lastInstant().isPresent()) { + return streamFilesByFileId(listDataFilesInPartition(partitionPath), + activeCommitTimeline.lastInstant().get()); + } + return Stream.empty(); + } catch (IOException e) { + throw new HoodieIOException( + "Could not load all file versions in partition " + partitionPath, e); + } + } + + protected abstract FileStatus[] listDataFilesInPartition(String 
partitionPathStr) + throws IOException; + + @Override + public Stream streamLatestVersionInRange(FileStatus[] fileStatuses, + List commitsToReturn) { + if (!activeCommitTimeline.hasInstants() || commitsToReturn.isEmpty()) { + return Stream.empty(); + } + try { + return streamFilesByFileId(fileStatuses, activeCommitTimeline.lastInstant().get()) + .map((Function, Optional>) fss -> { + for (HoodieDataFile fs : fss) { + if (commitsToReturn.contains(fs.getCommitTime())) { + return Optional.of(fs); + } + } + return Optional.empty(); + }).filter(Optional::isPresent).map(Optional::get); + } catch (IOException e) { + throw new HoodieIOException("Could not filter files from commits " + commitsToReturn, + e); + } + } + + @Override + public Stream streamLatestVersionsBeforeOrOn(FileStatus[] fileStatuses, + String maxCommitToReturn) { + try { + if (!activeCommitTimeline.hasInstants()) { + return Stream.empty(); + } + return streamFilesByFileId(fileStatuses, activeCommitTimeline.lastInstant().get()) + .map((Function, Optional>) fss -> { + for (HoodieDataFile fs1 : fss) { + if (activeCommitTimeline + .compareInstants(fs1.getCommitTime(), maxCommitToReturn, + HoodieTimeline.LESSER_OR_EQUAL)) { + return Optional.of(fs1); + } + } + return Optional.empty(); + }).filter(Optional::isPresent).map(Optional::get); + } catch (IOException e) { + throw new HoodieIOException("Could not filter files for latest version ", e); + } + } + + @Override + public Stream streamLatestVersions(FileStatus[] fileStatuses) { + try { + if (!activeCommitTimeline.hasInstants()) { + return Stream.empty(); + } + return streamFilesByFileId(fileStatuses, activeCommitTimeline.lastInstant().get()) + .map(statuses -> statuses.get(0)); + } catch (IOException e) { + throw new HoodieIOException("Could not filter files for latest version ", e); + } + } + + protected Stream> streamFilesByFileId(FileStatus[] files, + String maxCommitTime) throws IOException { + return groupFilesByFileId(files, 
maxCommitTime).values().stream(); + } + + /** + * Filters the list of FileStatus to exclude non-committed data files and group by FileID + * and sort the actial files by commit time (newer commit first) + * + * @param files Files to filter and group from + * @param maxCommitTime maximum permissible commit time + * @return Grouped map by fileId + */ + private Map> groupFilesByFileId(FileStatus[] files, + String maxCommitTime) throws IOException { + return Arrays.stream(files).flatMap(fileStatus -> { + HoodieDataFile dataFile = new HoodieDataFile(fileStatus); + if (activeCommitTimeline.containsOrBeforeTimelineStarts(dataFile.getCommitTime()) + && activeCommitTimeline.compareInstants(dataFile.getCommitTime(), maxCommitTime, + HoodieTimeline.LESSER_OR_EQUAL)) { + return Stream.of(Pair.of(dataFile.getFileId(), dataFile)); + } + return Stream.empty(); + }).collect(Collectors + .groupingBy(Pair::getKey, Collectors.mapping(Pair::getValue, toSortedFileStatus()))); + } + + private Collector> toSortedFileStatus() { + return Collectors.collectingAndThen(Collectors.toList(), + l -> l.stream().sorted(HoodieDataFile.getCommitTimeComparator()) + .collect(Collectors.toList())); + } + + +} diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/view/ReadOptimizedTableView.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/view/ReadOptimizedTableView.java new file mode 100644 index 000000000..6978326ee --- /dev/null +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/view/ReadOptimizedTableView.java @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.common.table.view; + +import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.exception.HoodieIOException; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import java.io.IOException; + +/** + * ReadOptimized view which includes only the ROStorageformat files + */ +public class ReadOptimizedTableView extends AbstractTableFileSystemView { + public ReadOptimizedTableView(FileSystem fs, HoodieTableMetaClient metaClient) { + super(fs, metaClient); + } + + protected FileStatus[] listDataFilesInPartition(String partitionPathStr) { + Path partitionPath = new Path(metaClient.getBasePath(), partitionPathStr); + try { + return fs.listStatus(partitionPath, path -> path.getName() + .contains(metaClient.getTableConfig().getROStorageFormat().getFileExtension())); + } catch (IOException e) { + throw new HoodieIOException( + "Failed to list data files in partition " + partitionPathStr, e); + } + } + + +} diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/util/FSUtils.java b/hoodie-common/src/main/java/com/uber/hoodie/common/util/FSUtils.java index 1926946db..09d592d4a 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/util/FSUtils.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/util/FSUtils.java @@ -16,7 +16,6 @@ package com.uber.hoodie.common.util; -import com.uber.hoodie.common.model.HoodieTableMetadata; import com.uber.hoodie.exception.HoodieIOException; import 
org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; @@ -68,14 +67,6 @@ public class FSUtils { return String.format("*_*_%s.parquet", commitTime); } - public static String makeInflightCommitFileName(String commitTime) { - return commitTime + HoodieTableMetadata.INFLIGHT_FILE_SUFFIX; - } - - public static String makeCommitFileName(String commitTime) { - return commitTime + HoodieTableMetadata.COMMIT_FILE_SUFFIX; - } - public static String getCommitFromCommitFile(String commitFileName) { return commitFileName.split("\\.")[0]; } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/exception/DatasetNotFoundException.java b/hoodie-common/src/main/java/com/uber/hoodie/exception/DatasetNotFoundException.java index 3981529e4..701f0162f 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/exception/DatasetNotFoundException.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/exception/DatasetNotFoundException.java @@ -16,6 +16,11 @@ package com.uber.hoodie.exception; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import java.io.IOException; + /** *

* Exception thrown to indicate that a hoodie dataset was not found on the path provided @@ -29,4 +34,21 @@ public class DatasetNotFoundException extends HoodieException { private static String getErrorMessage(String basePath) { return "Hoodie dataset not found in path " + basePath; } + + public static void checkValidDataset(FileSystem fs, Path basePathDir, Path metaPathDir) + throws DatasetNotFoundException { + // Check if the base path is found + try { + if (!fs.exists(basePathDir) || !fs.isDirectory(basePathDir)) { + throw new DatasetNotFoundException(basePathDir.toString()); + } + // Check if the meta path is found + if (!fs.exists(metaPathDir) || !fs.isDirectory(metaPathDir)) { + throw new DatasetNotFoundException(metaPathDir.toString()); + } + } catch (IOException e) { + throw new HoodieIOException( + "Could not check if dataset " + basePathDir + " is valid dataset", e); + } + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/exception/HoodieIOException.java b/hoodie-common/src/main/java/com/uber/hoodie/exception/HoodieIOException.java index 2e0b838b7..74f7ed164 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/exception/HoodieIOException.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/exception/HoodieIOException.java @@ -24,13 +24,17 @@ import java.io.IOException; *

*/ public class HoodieIOException extends HoodieException { - private final IOException ioException; + private IOException ioException; public HoodieIOException(String msg, IOException t) { super(msg, t); this.ioException = t; } + public HoodieIOException(String msg) { + super(msg); + } + public IOException getIOException() { return ioException; } diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/TestBloomFilter.java b/hoodie-common/src/test/java/com/uber/hoodie/common/TestBloomFilter.java index fa90c3e73..54f93c49d 100644 --- a/hoodie-common/src/test/java/com/uber/hoodie/common/TestBloomFilter.java +++ b/hoodie-common/src/test/java/com/uber/hoodie/common/TestBloomFilter.java @@ -1,17 +1,17 @@ /* - * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ package com.uber.hoodie.common; diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/model/HoodieTestUtils.java b/hoodie-common/src/test/java/com/uber/hoodie/common/model/HoodieTestUtils.java index df43aa89f..b28284883 100644 --- a/hoodie-common/src/test/java/com/uber/hoodie/common/model/HoodieTestUtils.java +++ b/hoodie-common/src/test/java/com/uber/hoodie/common/model/HoodieTestUtils.java @@ -1,73 +1,78 @@ /* - * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ package com.uber.hoodie.common.model; +import com.esotericsoftware.kryo.Kryo; +import com.esotericsoftware.kryo.io.Input; +import com.esotericsoftware.kryo.io.Output; +import com.uber.hoodie.common.table.HoodieTableConfig; +import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.util.FSUtils; +import org.apache.hadoop.fs.FileSystem; import org.junit.rules.TemporaryFolder; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; import java.io.File; -import java.io.FileWriter; import java.io.IOException; +import java.io.Serializable; import java.text.SimpleDateFormat; import java.util.Date; +import java.util.Iterator; import java.util.Properties; import java.util.UUID; +import java.util.stream.Stream; + +import static org.junit.Assert.assertEquals; public class HoodieTestUtils { - + public static FileSystem fs = FSUtils.getFs(); + public static final String TEST_EXTENSION = ".test"; public static final String RAW_TRIPS_TEST_NAME = "raw_trips"; public static final int DEFAULT_TASK_PARTITIONID = 1; - public static final void initializeHoodieDirectory(String basePath) throws IOException { - new File(basePath + "/" + HoodieTableMetadata.METAFOLDER_NAME).mkdirs(); + public static HoodieTableMetaClient init(String basePath) throws IOException { Properties properties = new Properties(); - properties.setProperty(HoodieTableMetadata.HOODIE_TABLE_NAME_PROP_NAME, RAW_TRIPS_TEST_NAME); - properties.setProperty(HoodieTableMetadata.HOODIE_TABLE_TYPE_PROP_NAME, HoodieTableMetadata.DEFAULT_TABLE_TYPE.name()); - FileWriter fileWriter = new FileWriter(new File(basePath + "/.hoodie/hoodie.properties")); - try { - properties.store(fileWriter, ""); - } finally { - fileWriter.close(); - } + properties.setProperty(HoodieTableConfig.HOODIE_TABLE_NAME_PROP_NAME, RAW_TRIPS_TEST_NAME); + return HoodieTableMetaClient.initializePathAsHoodieDataset(fs, basePath, properties); } - public static final String initializeTempHoodieBasePath() 
throws IOException { + public static HoodieTableMetaClient initOnTemp() throws IOException { // Create a temp folder as the base path TemporaryFolder folder = new TemporaryFolder(); folder.create(); String basePath = folder.getRoot().getAbsolutePath(); - HoodieTestUtils.initializeHoodieDirectory(basePath); - return basePath; + return HoodieTestUtils.init(basePath); } - public static final String getNewCommitTime() { + public static String makeNewCommitTime() { return new SimpleDateFormat("yyyyMMddHHmmss").format(new Date()); } public static final void createCommitFiles(String basePath, String... commitTimes) throws IOException { for (String commitTime: commitTimes) { - new File(basePath + "/" + HoodieTableMetadata.METAFOLDER_NAME+ "/" + FSUtils.makeCommitFileName(commitTime)).createNewFile(); + new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME+ "/" + HoodieTableMetaClient.makeCommitFileName(commitTime)).createNewFile(); } } public static final void createInflightCommitFiles(String basePath, String... 
commitTimes) throws IOException { for (String commitTime: commitTimes) { - new File(basePath + "/" + HoodieTableMetadata.METAFOLDER_NAME+ "/" + FSUtils.makeInflightCommitFileName(commitTime)).createNewFile(); + new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME+ "/" + HoodieTableMetaClient.makeInflightCommitFileName(commitTime)).createNewFile(); } } @@ -92,10 +97,43 @@ public class HoodieTestUtils { } public static final boolean doesCommitExist(String basePath, String commitTime) { - return new File(basePath + "/" + HoodieTableMetadata.METAFOLDER_NAME+ "/" + commitTime + HoodieTableMetadata.COMMIT_FILE_SUFFIX).exists(); + return new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME+ "/" + commitTime + HoodieTableMetaClient.COMMIT_EXTENSION).exists(); } public static final boolean doesInflightExist(String basePath, String commitTime) { - return new File(basePath + "/" + HoodieTableMetadata.METAFOLDER_NAME+ "/" + commitTime + HoodieTableMetadata.INFLIGHT_FILE_SUFFIX).exists(); + return new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME+ "/" + commitTime + HoodieTableMetaClient.INFLIGHT_FILE_SUFFIX).exists(); + } + + public static String makeInflightTestFileName(String instant) { + return instant + TEST_EXTENSION + HoodieTableMetaClient.INFLIGHT_FILE_SUFFIX; + } + + public static String makeTestFileName(String instant) { + return instant + TEST_EXTENSION; + } + + public static String makeCommitFileName(String instant) { + return instant + ".commit"; + } + + public static void assertStreamEquals(String message, Stream expected, Stream actual) { + Iterator iter1 = expected.iterator(), iter2 = actual.iterator(); + while(iter1.hasNext() && iter2.hasNext()) + assertEquals(message, iter1.next(), iter2.next()); + assert !iter1.hasNext() && !iter2.hasNext(); + } + + public static T serializeDeserialize(T object, Class clazz) { + // Using Kyro as the default serializer in Spark Jobs + Kryo kryo = new Kryo(); + ByteArrayOutputStream baos = 
new ByteArrayOutputStream(); + Output output = new Output(baos); + kryo.writeObject(output, object); + output.close(); + + Input input = new Input(new ByteArrayInputStream(baos.toByteArray())); + T deseralizedObject = kryo.readObject(input, clazz); + input.close(); + return deseralizedObject; } } diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/model/TestHoodieCommits.java b/hoodie-common/src/test/java/com/uber/hoodie/common/model/TestHoodieCommits.java deleted file mode 100644 index dbe92e4ce..000000000 --- a/hoodie-common/src/test/java/com/uber/hoodie/common/model/TestHoodieCommits.java +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.uber.hoodie.common.model; - - -import org.junit.Test; - -import java.util.Arrays; - -import static org.junit.Assert.*; - -/** - * - */ -public class TestHoodieCommits { - - @Test - public void testHoodieCommits() throws Exception { - HoodieCommits commits = new HoodieCommits(Arrays.asList("001", "005", "004", "002")); - assertFalse(commits.contains("003")); - assertTrue(commits.contains("002")); - assertEquals(Arrays.asList("004", "005"), commits.findCommitsAfter("003", 2)); - assertEquals(Arrays.asList("001", "002", "004"), commits.findCommitsInRange("000", "004")); - assertEquals(commits.lastCommit(), commits.lastCommit(0)); - assertEquals("001", commits.lastCommit(3)); - assertEquals(null, commits.lastCommit(4)); - - assertEquals(commits.max("001", "000"), "001"); - assertFalse(HoodieCommits.isCommit1After("001", "002")); - assertFalse(HoodieCommits.isCommit1After("001", "001")); - assertTrue(HoodieCommits.isCommit1After("003", "002")); - assertTrue(HoodieCommits.isCommit1BeforeOrOn("003", "003")); - } -} diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/model/TestHoodieTableMetadata.java b/hoodie-common/src/test/java/com/uber/hoodie/common/model/TestHoodieTableMetadata.java deleted file mode 100644 index a07bd8699..000000000 --- a/hoodie-common/src/test/java/com/uber/hoodie/common/model/TestHoodieTableMetadata.java +++ /dev/null @@ -1,242 +0,0 @@ -/* - * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.uber.hoodie.common.model; - -import com.google.common.collect.Sets; - -import com.uber.hoodie.common.util.FSUtils; - -import com.uber.hoodie.exception.HoodieIOException; -import com.uber.hoodie.exception.HoodieRecordMissingException; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.junit.After; -import org.junit.Before; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.ExpectedException; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.*; - -import static org.junit.Assert.*; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; - -public class TestHoodieTableMetadata { - private String basePath = null; - private HoodieTableMetadata metadata = null; - @Rule - public final ExpectedException exception = ExpectedException.none(); - - @Before - public void init() throws Exception { - basePath = HoodieTestUtils.initializeTempHoodieBasePath(); - metadata = new HoodieTableMetadata(FSUtils.getFs(), basePath, "testTable"); - } - - @Test - public void testScanCommitTs() throws Exception { - // Empty commit dir - assertTrue(metadata.getAllCommits().isEmpty()); - - // Create some commit files - new File(basePath + "/.hoodie/20160504123032.commit").createNewFile(); - new File(basePath + "/.hoodie/20160503122032.commit").createNewFile(); - metadata = new HoodieTableMetadata(FSUtils.getFs(), basePath, "testTable"); - List list = metadata.getAllCommits().getCommitList(); - assertEquals(list.size(), 2); - assertTrue(list.contains("20160504123032")); - assertTrue(list.contains("20160503122032")); - - // Check the .inflight files - assertTrue(metadata.getAllInflightCommits().isEmpty()); - new File(basePath + "/.hoodie/20160505123032.inflight").createNewFile(); - new 
File(basePath + "/.hoodie/20160506122032.inflight").createNewFile(); - metadata = new HoodieTableMetadata(FSUtils.getFs(), basePath, "testTable"); - list = metadata.getAllInflightCommits(); - assertEquals(list.size(), 2); - assertTrue(list.contains("20160505123032")); - assertTrue(list.contains("20160506122032")); - } - - @Test - public void testGetLastValidFileNameForRecord() throws Exception { - FileSystem fs = FSUtils.getFs(); - String partitionPath = "2016/05/01"; - new File(basePath + "/" + partitionPath).mkdirs(); - String fileId = UUID.randomUUID().toString(); - HoodieRecord record = mock(HoodieRecord.class); - when(record.getPartitionPath()).thenReturn(partitionPath); - when(record.getCurrentLocation()).thenReturn(new HoodieRecordLocation("001", fileId)); - - // First, no commit for this record - exception.expect(HoodieIOException.class); - metadata.getFilenameForRecord(fs, record); - - // Only one commit, but is not safe - String commitTime1 = "20160501123212"; - String fileName1 = FSUtils.makeDataFileName(commitTime1, 1, fileId); - new File(basePath + "/" + partitionPath + "/" + fileName1).createNewFile(); - assertNull(metadata.getFilenameForRecord(fs, record)); - - // Make this commit safe - new File(basePath + "/.hoodie/" + commitTime1 + ".commit").createNewFile(); - metadata = new HoodieTableMetadata(fs, basePath, "testTable"); - assertTrue(metadata.getFilenameForRecord(fs, record).equals(fileName1)); - - // Do another commit, but not safe - String commitTime2 = "20160502123012"; - String fileName2 = FSUtils.makeDataFileName(commitTime2, 1, fileId); - new File(basePath + "/" + partitionPath + "/" + fileName2).createNewFile(); - assertTrue(metadata.getFilenameForRecord(fs, record).equals(fileName1)); - - // Make it safe - new File(basePath + "/.hoodie/" + commitTime2 + ".commit").createNewFile(); - metadata = new HoodieTableMetadata(fs, basePath, "testTable"); - assertTrue(metadata.getFilenameForRecord(fs, record).equals(fileName2)); - } - - @Test - 
public void testGetAllPartitionPaths() throws IOException { - FileSystem fs = FSUtils.getFs(); - - // Empty - List partitions = FSUtils.getAllPartitionPaths(fs, basePath); - assertEquals(partitions.size(), 0); - - // Add some dirs - new File(basePath + "/2016/04/01").mkdirs(); - new File(basePath + "/2015/04/01").mkdirs(); - partitions = FSUtils.getAllPartitionPaths(fs, basePath); - assertEquals(partitions.size(), 2); - assertTrue(partitions.contains("2016/04/01")); - assertTrue(partitions.contains("2015/04/01")); - } - - @Test - public void testGetFileVersionsInPartition() throws IOException { - // Put some files in the partition - String fullPartitionPath = basePath + "/2016/05/01/"; - new File(fullPartitionPath).mkdirs(); - - String commitTime1 = "20160501123032"; - String commitTime2 = "20160502123032"; - String commitTime3 = "20160503123032"; - String commitTime4 = "20160504123032"; - - HoodieTestUtils.createCommitFiles(basePath, commitTime1, commitTime2, commitTime3, commitTime4); - - String fileId1 = UUID.randomUUID().toString(); - String fileId2 = UUID.randomUUID().toString(); - String fileId3 = UUID.randomUUID().toString(); - - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, 1, fileId1)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, 1, fileId1)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, 1, fileId2)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime2, 1, fileId2)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, 1, fileId2)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, 1, fileId3)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, 1, fileId3)).createNewFile(); - - metadata = new HoodieTableMetadata(FSUtils.getFs(), basePath, "testTable"); - - Map> fileVersions = 
metadata.getAllVersionsInPartition(FSUtils.getFs(), "2016/05/01"); - assertEquals(fileVersions.get(fileId1).size(), 2); - assertEquals(fileVersions.get(fileId2).size(), 3); - assertEquals(fileVersions.get(fileId3).size(), 2); - String commitTs = FSUtils.getCommitTime(fileVersions.get(fileId1).get(fileVersions.get(fileId1).size() - 1).getPath().getName()); - assertTrue(commitTs.equals(commitTime1)); - commitTs = FSUtils.getCommitTime(fileVersions.get(fileId1).get(fileVersions.get(fileId1).size() - 2).getPath().getName()); - assertTrue(commitTs.equals(commitTime4)); - } - - @Test - public void testGetOnlyLatestVersionFiles() throws Exception { - // Put some files in the partition - String fullPartitionPath = basePath + "/2016/05/01/"; - new File(fullPartitionPath).mkdirs(); - String commitTime1 = "20160501123032"; - String commitTime2 = "20160502123032"; - String commitTime3 = "20160503123032"; - String commitTime4 = "20160504123032"; - String fileId1 = UUID.randomUUID().toString(); - String fileId2 = UUID.randomUUID().toString(); - String fileId3 = UUID.randomUUID().toString(); - - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, 1, fileId1)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, 1, fileId1)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, 1, fileId2)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime2, 1, fileId2)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, 1, fileId2)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, 1, fileId3)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, 1, fileId3)).createNewFile(); - - new File(basePath + "/.hoodie/" + commitTime1 + ".commit").createNewFile(); - new File(basePath + "/.hoodie/" + commitTime2 + ".commit").createNewFile(); - new File(basePath + "/.hoodie/" + commitTime3 + 
".commit").createNewFile(); - new File(basePath + "/.hoodie/" + commitTime4 + ".commit").createNewFile(); - - // Now we list the entire partition - FileSystem fs = FSUtils.getFs(); - FileStatus[] statuses = fs.listStatus(new Path(fullPartitionPath)); - assertEquals(statuses.length, 7); - - metadata = new HoodieTableMetadata(fs, basePath, "testTable"); - FileStatus[] statuses1 = metadata - .getLatestVersionInPartition(fs, "2016/05/01", commitTime4); - assertEquals(statuses1.length, 3); - Set filenames = Sets.newHashSet(); - for (FileStatus status : statuses1) { - filenames.add(status.getPath().getName()); - } - assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime4, 1, fileId1))); - assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, 1, fileId2))); - assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime4, 1, fileId3))); - - // Reset the max commit time - FileStatus[] statuses2 = metadata - .getLatestVersionInPartition(fs, "2016/05/01", commitTime3); - assertEquals(statuses2.length, 3); - filenames = Sets.newHashSet(); - for (FileStatus status : statuses2) { - filenames.add(status.getPath().getName()); - } - assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime1, 1, fileId1))); - assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, 1, fileId2))); - assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, 1, fileId3))); - } - - @Test - public void testCommitTimeComparison() { - String commitTime1 = "20160504123032"; - String commitTime2 = "20151231203159"; - assertTrue(HoodieCommits.isCommit1After(commitTime1, commitTime2)); - assertTrue(HoodieCommits.isCommit1BeforeOrOn(commitTime1, commitTime1)); - assertTrue(HoodieCommits.isCommit1BeforeOrOn(commitTime2, commitTime1)); - } - - @After - public void cleanup() { - if (basePath != null) { - new File(basePath).delete(); - } - } -} diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/table/HoodieTableMetaClientTest.java 
b/hoodie-common/src/test/java/com/uber/hoodie/common/table/HoodieTableMetaClientTest.java new file mode 100644 index 000000000..6ec275b15 --- /dev/null +++ b/hoodie-common/src/test/java/com/uber/hoodie/common/table/HoodieTableMetaClientTest.java @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.common.table; + +import com.esotericsoftware.kryo.Kryo; +import com.esotericsoftware.kryo.io.Input; +import com.esotericsoftware.kryo.io.Output; +import com.google.common.collect.Lists; +import com.uber.hoodie.common.model.HoodieTestUtils; +import com.uber.hoodie.common.table.timeline.HoodieArchivedCommitTimeline; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.ArrayFile; +import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; +import org.junit.Before; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.util.Optional; +import java.util.stream.Collectors; + +import static org.junit.Assert.*; + +public class HoodieTableMetaClientTest { + private HoodieTableMetaClient metaClient; + private String basePath; + + @Before + public void init() throws IOException { + TemporaryFolder folder = new TemporaryFolder(); + folder.create(); + 
this.basePath = folder.getRoot().getAbsolutePath(); + metaClient = HoodieTestUtils.init(basePath); + } + + @Test + public void checkMetadata() { + assertEquals("Table name should be raw_trips", HoodieTestUtils.RAW_TRIPS_TEST_NAME, + metaClient.getTableConfig().getTableName()); + assertEquals("Basepath should be the one assigned", basePath, metaClient.getBasePath()); + assertEquals("Metapath should be ${basepath}/.hoodie", basePath + "/.hoodie", + metaClient.getMetaPath()); + } + + @Test + public void checkSerDe() throws IOException { + // check if this object is serialized and se-serialized, we are able to read from the file system + HoodieTableMetaClient deseralizedMetaClient = + HoodieTestUtils.serializeDeserialize(metaClient, HoodieTableMetaClient.class); + HoodieTimeline commitTimeline = metaClient.getActiveCommitTimeline(); + commitTimeline.saveInstantAsInflight("1"); + commitTimeline.saveInstantAsComplete("1", Optional.of("test-detail".getBytes())); + commitTimeline = commitTimeline.reload(); + assertEquals("Commit should be 1", "1", commitTimeline.getInstants().findFirst().get()); + assertArrayEquals("Commit value should be \"test-detail\"", "test-detail".getBytes(), + commitTimeline.readInstantDetails("1").get()); + } + + @Test + public void checkCommitTimeline() throws IOException { + HoodieTimeline commitTimeline = metaClient.getActiveCommitTimeline(); + assertFalse("Should be empty commit timeline", + commitTimeline.getInstants().findFirst().isPresent()); + assertFalse("Should be empty commit timeline", + commitTimeline.getInflightInstants().findFirst().isPresent()); + commitTimeline.saveInstantAsInflight("1"); + commitTimeline.saveInstantAsComplete("1", Optional.of("test-detail".getBytes())); + + // Commit timeline should not auto-reload every time getActiveCommitTimeline(), it should be cached + commitTimeline = metaClient.getActiveCommitTimeline(); + assertFalse("Should be empty commit timeline", + 
commitTimeline.getInstants().findFirst().isPresent()); + assertFalse("Should be empty commit timeline", + commitTimeline.getInflightInstants().findFirst().isPresent()); + + commitTimeline = commitTimeline.reload(); + assertTrue("Should be the 1 commit we made", + commitTimeline.getInstants().findFirst().isPresent()); + assertEquals("Commit should be 1", "1", commitTimeline.getInstants().findFirst().get()); + assertArrayEquals("Commit value should be \"test-detail\"", "test-detail".getBytes(), + commitTimeline.readInstantDetails("1").get()); + } + + @Test + public void checkArchiveCommitTimeline() throws IOException { + Path archiveLogPath = + HoodieArchivedCommitTimeline.getArchiveLogPath(metaClient.getMetaPath()); + SequenceFile.Writer writer = SequenceFile + .createWriter(HoodieTestUtils.fs.getConf(), SequenceFile.Writer.file(archiveLogPath), + SequenceFile.Writer.keyClass(Text.class), + SequenceFile.Writer.valueClass(Text.class)); + + writer.append(new Text("1"), new Text("data1")); + writer.append(new Text("2"), new Text("data2")); + writer.append(new Text("3"), new Text("data3")); + + IOUtils.closeStream(writer); + + HoodieTimeline archivedTimeline = metaClient.getArchivedCommitTimeline(); + assertEquals(Lists.newArrayList("1", "2", "3"), + archivedTimeline.getInstants().collect(Collectors.toList())); + System.out.println(new String( archivedTimeline.readInstantDetails("1").get())); + assertArrayEquals(new Text("data1").getBytes(), archivedTimeline.readInstantDetails("1").get()); + assertArrayEquals(new Text("data2").getBytes(), archivedTimeline.readInstantDetails("2").get()); + assertArrayEquals(new Text("data3").getBytes(), archivedTimeline.readInstantDetails("3").get()); + } + + + +} diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/table/string/HoodieDefaultTimelineTest.java b/hoodie-common/src/test/java/com/uber/hoodie/common/table/string/HoodieDefaultTimelineTest.java new file mode 100644 index 000000000..d6d510d67 --- /dev/null +++ 
b/hoodie-common/src/test/java/com/uber/hoodie/common/table/string/HoodieDefaultTimelineTest.java @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.common.table.string; + +import com.uber.hoodie.common.model.HoodieTestUtils; +import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.table.HoodieTimeline; +import org.apache.hadoop.fs.Path; +import org.junit.After; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; + +import java.io.IOException; +import java.util.Optional; +import java.util.stream.Stream; + +import static org.junit.Assert.*; + +public class HoodieDefaultTimelineTest { + private HoodieTimeline timeline; + private HoodieTableMetaClient metaClient; + @Rule + public final ExpectedException exception = ExpectedException.none(); + + @Before + public void setUp() throws Exception { + this.metaClient = HoodieTestUtils.initOnTemp(); + } + + @After + public void tearDown() throws Exception { + HoodieTestUtils.fs.delete(new Path(this.metaClient.getBasePath()), true); + } + + @Test + public void testLoadingInstantsFromFiles() throws IOException { + timeline = + new MockHoodieTimeline(HoodieTestUtils.fs, metaClient.getMetaPath(), ".test"); + timeline.saveInstantAsComplete("1", Optional.empty()); + timeline.saveInstantAsComplete("3", 
Optional.empty()); + timeline.saveInstantAsComplete("5", Optional.empty()); + timeline.saveInstantAsComplete("8", Optional.empty()); + timeline.saveInstantAsInflight("9"); + timeline = timeline.reload(); + + assertEquals("Total instants should be 4", 4, timeline.getTotalInstants()); + HoodieTestUtils + .assertStreamEquals("Check the instants stream", Stream.of("1", "3", "5", "8"), + timeline.getInstants()); + assertTrue("Inflights should be present in the timeline", timeline.hasInflightInstants()); + HoodieTestUtils.assertStreamEquals("Check the inflights stream", Stream.of("9"), + timeline.getInflightInstants()); + } + + @Test + public void testTimelineOperationsBasic() throws Exception { + timeline = new MockHoodieTimeline(Stream.empty(), Stream.empty()); + assertFalse(timeline.hasInstants()); + assertFalse(timeline.hasInflightInstants()); + assertEquals("", 0, timeline.getTotalInstants()); + assertEquals("", Optional.empty(), timeline.firstInstant()); + assertEquals("", Optional.empty(), timeline.nthInstant(5)); + assertEquals("", Optional.empty(), timeline.nthInstant(-1)); + assertEquals("", Optional.empty(), timeline.lastInstant()); + assertFalse("", timeline.containsInstant("01")); + } + + @Test + public void testTimelineOperations() throws Exception { + timeline = new MockHoodieTimeline( + Stream.of("01", "03", "05", "07", "09", "11", "13", "15", "17", "19"), + Stream.of("21", "23")); + HoodieTestUtils.assertStreamEquals("", Stream.of("05", "07", "09", "11"), + timeline.findInstantsInRange("04", "11")); + HoodieTestUtils + .assertStreamEquals("", Stream.of("09", "11"), timeline.findInstantsAfter("07", 2)); + assertTrue(timeline.hasInstants()); + assertTrue(timeline.hasInflightInstants()); + assertEquals("", 10, timeline.getTotalInstants()); + assertEquals("", "01", timeline.firstInstant().get()); + assertEquals("", "11", timeline.nthInstant(5).get()); + assertEquals("", "19", timeline.lastInstant().get()); + assertEquals("", "09", 
timeline.nthFromLastInstant(5).get()); + assertTrue("", timeline.containsInstant("09")); + assertFalse("", timeline.isInstantBeforeTimelineStarts("02")); + assertTrue("", timeline.isInstantBeforeTimelineStarts("00")); + } +} diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/table/string/MockHoodieTimeline.java b/hoodie-common/src/test/java/com/uber/hoodie/common/table/string/MockHoodieTimeline.java new file mode 100644 index 000000000..ea72e88d2 --- /dev/null +++ b/hoodie-common/src/test/java/com/uber/hoodie/common/table/string/MockHoodieTimeline.java @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.uber.hoodie.common.table.string; + +import com.uber.hoodie.common.model.HoodieTestUtils; +import com.uber.hoodie.common.table.HoodieTimeline; +import com.uber.hoodie.common.table.timeline.HoodieDefaultTimeline; +import org.apache.hadoop.fs.FileSystem; + +import java.io.IOException; +import java.util.Optional; +import java.util.stream.Stream; + +public class MockHoodieTimeline extends HoodieDefaultTimeline { + private String fileExt; + + public MockHoodieTimeline(FileSystem fs, String metaPath, String fileExtension) + throws IOException { + super(fs, metaPath, fileExtension); + this.fileExt = fileExtension; + } + + public MockHoodieTimeline(Stream instants, Stream inflights) + throws IOException { + super(instants, inflights); + } + + @Override + public HoodieTimeline reload() throws IOException { + return new MockHoodieTimeline(fs, metaPath, fileExt); + } + + @Override + public Optional readInstantDetails(String instant) { + return Optional.empty(); + } + + @Override + protected String getInflightFileName(String instant) { + return HoodieTestUtils.makeInflightTestFileName(instant); + } + + @Override + protected String getCompletedFileName(String instant) { + return HoodieTestUtils.makeTestFileName(instant); + } + + @Override + protected String getTimelineName() { + return "mock-test"; + } +} diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/table/view/ReadOptimizedTableViewTest.java b/hoodie-common/src/test/java/com/uber/hoodie/common/table/view/ReadOptimizedTableViewTest.java new file mode 100644 index 000000000..8e45f7aa9 --- /dev/null +++ b/hoodie-common/src/test/java/com/uber/hoodie/common/table/view/ReadOptimizedTableViewTest.java @@ -0,0 +1,385 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.common.table.view; + +import com.google.common.collect.Lists; +import com.google.common.collect.Sets; +import com.uber.hoodie.common.model.HoodieDataFile; +import com.uber.hoodie.common.model.HoodieTestUtils; +import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.table.HoodieTimeline; +import com.uber.hoodie.common.table.TableFileSystemView; +import com.uber.hoodie.common.util.FSUtils; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.junit.Before; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.io.File; +import java.io.IOException; +import java.util.List; +import java.util.Optional; +import java.util.Set; +import java.util.UUID; +import java.util.stream.Collectors; + +import static org.junit.Assert.*; + +public class ReadOptimizedTableViewTest { + private HoodieTableMetaClient metaClient; + private String basePath; + private TableFileSystemView fsView; + + @Before + public void init() throws IOException { + TemporaryFolder folder = new TemporaryFolder(); + folder.create(); + this.basePath = folder.getRoot().getAbsolutePath(); + metaClient = HoodieTestUtils.init(basePath); + fsView = new ReadOptimizedTableView(HoodieTestUtils.fs, metaClient); + } + + private void refreshFsView() { + metaClient = new HoodieTableMetaClient(HoodieTestUtils.fs, basePath, true); + fsView = new ReadOptimizedTableView(HoodieTestUtils.fs, metaClient); + } + + @Test + public void testGetLatestDataFilesForFileId() throws IOException { + 
String partitionPath = "2016/05/01"; + new File(basePath + "/" + partitionPath).mkdirs(); + String fileId = UUID.randomUUID().toString(); + + assertFalse("No commit, should not find any data file", + fsView.getLatestDataFilesForFileId(partitionPath, fileId).findFirst().isPresent()); + + // Only one commit, but is not safe + String commitTime1 = "1"; + String fileName1 = FSUtils.makeDataFileName(commitTime1, 1, fileId); + new File(basePath + "/" + partitionPath + "/" + fileName1).createNewFile(); + refreshFsView(); + assertFalse("No commit, should not find any data file", + fsView.getLatestDataFilesForFileId(partitionPath, fileId).findFirst().isPresent()); + + // Make this commit safe + HoodieTimeline commitTimeline = metaClient.getActiveCommitTimeline(); + commitTimeline.saveInstantAsComplete(commitTime1, Optional.empty()); + refreshFsView(); + assertEquals("", fileName1, + fsView.getLatestDataFilesForFileId(partitionPath, fileId).findFirst().get() + .getFileName()); + + // Do another commit, but not safe + String commitTime2 = "2"; + String fileName2 = FSUtils.makeDataFileName(commitTime2, 1, fileId); + new File(basePath + "/" + partitionPath + "/" + fileName2).createNewFile(); + refreshFsView(); + assertEquals("", fileName1, + fsView.getLatestDataFilesForFileId(partitionPath, fileId).findFirst().get() + .getFileName()); + + // Make it safe + commitTimeline.saveInstantAsComplete(commitTime2, Optional.empty()); + refreshFsView(); + assertEquals("", fileName2, + fsView.getLatestDataFilesForFileId(partitionPath, fileId).findFirst().get() + .getFileName()); + } + + @Test + public void testStreamLatestVersionInPartition() throws IOException { + // Put some files in the partition + String fullPartitionPath = basePath + "/2016/05/01/"; + new File(fullPartitionPath).mkdirs(); + String commitTime1 = "1"; + String commitTime2 = "2"; + String commitTime3 = "3"; + String commitTime4 = "4"; + String fileId1 = UUID.randomUUID().toString(); + String fileId2 = 
UUID.randomUUID().toString(); + String fileId3 = UUID.randomUUID().toString(); + + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, 1, fileId1)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, 1, fileId1)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, 1, fileId2)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime2, 1, fileId2)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, 1, fileId2)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, 1, fileId3)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, 1, fileId3)) + .createNewFile(); + + new File(basePath + "/.hoodie/" + commitTime1 + ".commit").createNewFile(); + new File(basePath + "/.hoodie/" + commitTime2 + ".commit").createNewFile(); + new File(basePath + "/.hoodie/" + commitTime3 + ".commit").createNewFile(); + new File(basePath + "/.hoodie/" + commitTime4 + ".commit").createNewFile(); + + // Now we list the entire partition + FileStatus[] statuses = HoodieTestUtils.fs.listStatus(new Path(fullPartitionPath)); + assertEquals(statuses.length, 7); + + refreshFsView(); + List statuses1 = + fsView.streamLatestVersionInPartition("2016/05/01", commitTime4) + .collect(Collectors.toList()); + assertEquals(statuses1.size(), 3); + Set filenames = Sets.newHashSet(); + for (HoodieDataFile status : statuses1) { + filenames.add(status.getFileName()); + } + assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime4, 1, fileId1))); + assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, 1, fileId2))); + assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime4, 1, fileId3))); + + // Reset the max commit time + List statuses2 = + fsView.streamLatestVersionInPartition("2016/05/01", commitTime3) + .collect(Collectors.toList()); + 
assertEquals(statuses2.size(), 3); + filenames = Sets.newHashSet(); + for (HoodieDataFile status : statuses2) { + filenames.add(status.getFileName()); + } + assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime1, 1, fileId1))); + assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, 1, fileId2))); + assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, 1, fileId3))); + } + + @Test + public void testStreamEveryVersionInPartition() throws IOException { + // Put some files in the partition + String fullPartitionPath = basePath + "/2016/05/01/"; + new File(fullPartitionPath).mkdirs(); + String commitTime1 = "1"; + String commitTime2 = "2"; + String commitTime3 = "3"; + String commitTime4 = "4"; + String fileId1 = UUID.randomUUID().toString(); + String fileId2 = UUID.randomUUID().toString(); + String fileId3 = UUID.randomUUID().toString(); + + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, 1, fileId1)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, 1, fileId1)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, 1, fileId2)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime2, 1, fileId2)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, 1, fileId2)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, 1, fileId3)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, 1, fileId3)) + .createNewFile(); + + new File(basePath + "/.hoodie/" + commitTime1 + ".commit").createNewFile(); + new File(basePath + "/.hoodie/" + commitTime2 + ".commit").createNewFile(); + new File(basePath + "/.hoodie/" + commitTime3 + ".commit").createNewFile(); + new File(basePath + "/.hoodie/" + commitTime4 + ".commit").createNewFile(); + + // Now we list the entire partition + FileStatus[] statuses = 
HoodieTestUtils.fs.listStatus(new Path(fullPartitionPath)); + assertEquals(statuses.length, 7); + + refreshFsView(); + List> statuses1 = + fsView.streamEveryVersionInPartition("2016/05/01").collect(Collectors.toList()); + assertEquals(statuses1.size(), 3); + + for (List status : statuses1) { + String fileId = status.get(0).getFileId(); + Set filenames = Sets.newHashSet(); + for (HoodieDataFile dataFile : status) { + assertEquals("All same fileId should be grouped", fileId, dataFile.getFileId()); + filenames.add(dataFile.getFileName()); + } + if (fileId.equals(fileId1)) { + assertEquals(filenames, + Sets.newHashSet(FSUtils.makeDataFileName(commitTime1, 1, fileId1), + FSUtils.makeDataFileName(commitTime4, 1, fileId1))); + } else if (fileId.equals(fileId2)) { + assertEquals(filenames, + Sets.newHashSet(FSUtils.makeDataFileName(commitTime1, 1, fileId2), + FSUtils.makeDataFileName(commitTime2, 1, fileId2), + FSUtils.makeDataFileName(commitTime3, 1, fileId2))); + } else { + assertEquals(filenames, + Sets.newHashSet(FSUtils.makeDataFileName(commitTime3, 1, fileId3), + FSUtils.makeDataFileName(commitTime4, 1, fileId3))); + } + } + } + + @Test + public void streamLatestVersionInRange() throws IOException { + // Put some files in the partition + String fullPartitionPath = basePath + "/2016/05/01/"; + new File(fullPartitionPath).mkdirs(); + String commitTime1 = "1"; + String commitTime2 = "2"; + String commitTime3 = "3"; + String commitTime4 = "4"; + String fileId1 = UUID.randomUUID().toString(); + String fileId2 = UUID.randomUUID().toString(); + String fileId3 = UUID.randomUUID().toString(); + + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, 1, fileId1)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, 1, fileId1)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, 1, fileId2)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime2, 1, fileId2)) 
+ .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, 1, fileId2)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, 1, fileId3)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, 1, fileId3)) + .createNewFile(); + + new File(basePath + "/.hoodie/" + commitTime1 + ".commit").createNewFile(); + new File(basePath + "/.hoodie/" + commitTime2 + ".commit").createNewFile(); + new File(basePath + "/.hoodie/" + commitTime3 + ".commit").createNewFile(); + new File(basePath + "/.hoodie/" + commitTime4 + ".commit").createNewFile(); + + // Now we list the entire partition + FileStatus[] statuses = HoodieTestUtils.fs.listStatus(new Path(fullPartitionPath)); + assertEquals(statuses.length, 7); + + refreshFsView(); + List statuses1 = + fsView.streamLatestVersionInRange(statuses, Lists.newArrayList(commitTime2, commitTime3)) + .collect(Collectors.toList()); + assertEquals(statuses1.size(), 2); + Set filenames = Sets.newHashSet(); + for (HoodieDataFile status : statuses1) { + filenames.add(status.getFileName()); + } + assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, 1, fileId2))); + assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, 1, fileId3))); + } + + @Test + public void streamLatestVersionsBefore() throws IOException { + // Put some files in the partition + String fullPartitionPath = basePath + "/2016/05/01/"; + new File(fullPartitionPath).mkdirs(); + String commitTime1 = "1"; + String commitTime2 = "2"; + String commitTime3 = "3"; + String commitTime4 = "4"; + String fileId1 = UUID.randomUUID().toString(); + String fileId2 = UUID.randomUUID().toString(); + String fileId3 = UUID.randomUUID().toString(); + + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, 1, fileId1)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, 1, fileId1)) + .createNewFile(); + new 
File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, 1, fileId2)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime2, 1, fileId2)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, 1, fileId2)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, 1, fileId3)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, 1, fileId3)) + .createNewFile(); + + new File(basePath + "/.hoodie/" + commitTime1 + ".commit").createNewFile(); + new File(basePath + "/.hoodie/" + commitTime2 + ".commit").createNewFile(); + new File(basePath + "/.hoodie/" + commitTime3 + ".commit").createNewFile(); + new File(basePath + "/.hoodie/" + commitTime4 + ".commit").createNewFile(); + + // Now we list the entire partition + FileStatus[] statuses = HoodieTestUtils.fs.listStatus(new Path(fullPartitionPath)); + assertEquals(statuses.length, 7); + + refreshFsView(); + List statuses1 = + fsView.streamLatestVersionsBeforeOrOn(statuses, commitTime2) + .collect(Collectors.toList()); + assertEquals(statuses1.size(), 2); + Set filenames = Sets.newHashSet(); + for (HoodieDataFile status : statuses1) { + filenames.add(status.getFileName()); + } + assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime1, 1, fileId1))); + assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime2, 1, fileId2))); + + } + + @Test + public void streamLatestVersions() throws IOException { + // Put some files in the partition + String fullPartitionPath = basePath + "/2016/05/01/"; + new File(fullPartitionPath).mkdirs(); + String commitTime1 = "1"; + String commitTime2 = "2"; + String commitTime3 = "3"; + String commitTime4 = "4"; + String fileId1 = UUID.randomUUID().toString(); + String fileId2 = UUID.randomUUID().toString(); + String fileId3 = UUID.randomUUID().toString(); + + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, 1, fileId1)) + 
.createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, 1, fileId1)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, 1, fileId2)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime2, 1, fileId2)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, 1, fileId2)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, 1, fileId3)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, 1, fileId3)) + .createNewFile(); + + new File(basePath + "/.hoodie/" + commitTime1 + ".commit").createNewFile(); + new File(basePath + "/.hoodie/" + commitTime2 + ".commit").createNewFile(); + new File(basePath + "/.hoodie/" + commitTime3 + ".commit").createNewFile(); + new File(basePath + "/.hoodie/" + commitTime4 + ".commit").createNewFile(); + + // Now we list the entire partition + FileStatus[] statuses = HoodieTestUtils.fs.listStatus(new Path(fullPartitionPath)); + assertEquals(statuses.length, 7); + + refreshFsView(); + List statuses1 = + fsView.streamLatestVersions(statuses) + .collect(Collectors.toList()); + assertEquals(statuses1.size(), 3); + Set filenames = Sets.newHashSet(); + for (HoodieDataFile status : statuses1) { + filenames.add(status.getFileName()); + } + assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime4, 1, fileId1))); + assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, 1, fileId2))); + assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime4, 1, fileId3))); + } +} diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/util/TestFSUtils.java b/hoodie-common/src/test/java/com/uber/hoodie/common/util/TestFSUtils.java index 7156991c6..c5d19b50e 100644 --- a/hoodie-common/src/test/java/com/uber/hoodie/common/util/TestFSUtils.java +++ b/hoodie-common/src/test/java/com/uber/hoodie/common/util/TestFSUtils.java @@ 
-1,17 +1,17 @@ /* - * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ package com.uber.hoodie.common.util; @@ -52,13 +52,6 @@ public class TestFSUtils { assertTrue(FSUtils.getCommitTime(fullFileName).equals(commitTime)); } - @Test - public void testGetCommitFromCommitFile() { - String commitTime = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date()); - String commitFileName = FSUtils.makeCommitFileName(commitTime); - assertTrue(FSUtils.getCommitFromCommitFile(commitFileName).equals(commitTime)); - } - @Test public void testGetFileNameWithoutMeta() { String commitTime = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date()); diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/util/TestNumericUtils.java b/hoodie-common/src/test/java/com/uber/hoodie/common/util/TestNumericUtils.java index 9a916d4da..99f8a67f2 100644 --- a/hoodie-common/src/test/java/com/uber/hoodie/common/util/TestNumericUtils.java +++ b/hoodie-common/src/test/java/com/uber/hoodie/common/util/TestNumericUtils.java @@ -1,17 +1,17 @@ /* - * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ package com.uber.hoodie.common.util; diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/util/TestParquetUtils.java b/hoodie-common/src/test/java/com/uber/hoodie/common/util/TestParquetUtils.java index 86efe691a..4145ed6ed 100644 --- a/hoodie-common/src/test/java/com/uber/hoodie/common/util/TestParquetUtils.java +++ b/hoodie-common/src/test/java/com/uber/hoodie/common/util/TestParquetUtils.java @@ -1,17 +1,17 @@ /* - * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. */ package com.uber.hoodie.common.util; diff --git a/hoodie-hadoop-mr/pom.xml b/hoodie-hadoop-mr/pom.xml index 31c4ac152..cb1d3c144 100644 --- a/hoodie-hadoop-mr/pom.xml +++ b/hoodie-hadoop-mr/pom.xml @@ -90,7 +90,7 @@ org.apache.maven.plugins maven-shade-plugin - 2.3 + 2.4 package @@ -98,6 +98,7 @@ shade + ${project.build.directory}/dependency-reduced-pom.xml true diff --git a/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/HoodieInputFormat.java b/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/HoodieInputFormat.java index 550de5cf4..21b96fe2a 100644 --- a/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/HoodieInputFormat.java +++ b/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/HoodieInputFormat.java @@ -16,8 +16,13 @@ package com.uber.hoodie.hadoop; +import com.uber.hoodie.common.model.HoodieDataFile; import com.uber.hoodie.common.model.HoodieRecord; -import com.uber.hoodie.common.model.HoodieTableMetadata; +import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.table.HoodieTimeline; +import com.uber.hoodie.common.table.TableFileSystemView; +import com.uber.hoodie.common.table.view.ReadOptimizedTableView; +import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.exception.InvalidDatasetException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -52,6 +57,7 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.stream.Collectors; import static parquet.filter2.predicate.FilterApi.and; import static parquet.filter2.predicate.FilterApi.binaryColumn; @@ -73,11 +79,11 @@ public class HoodieInputFormat extends MapredParquetInputFormat public FileStatus[] listStatus(JobConf job) throws IOException { // Get all the file status from FileInputFormat and then do the filter FileStatus[] fileStatuses 
= super.listStatus(job); - Map> groupedFileStatus = groupFileStatus(fileStatuses); + Map> groupedFileStatus = groupFileStatus(fileStatuses); LOG.info("Found a total of " + groupedFileStatus.size() + " groups"); List returns = new ArrayList(); - for(Map.Entry> entry:groupedFileStatus.entrySet()) { - HoodieTableMetadata metadata = entry.getKey(); + for(Map.Entry> entry:groupedFileStatus.entrySet()) { + HoodieTableMetaClient metadata = entry.getKey(); if(metadata == null) { // Add all the paths which are not hoodie specific returns.addAll(entry.getValue()); @@ -86,31 +92,35 @@ public class HoodieInputFormat extends MapredParquetInputFormat FileStatus[] value = entry.getValue().toArray(new FileStatus[entry.getValue().size()]); LOG.info("Hoodie Metadata initialized with completed commit Ts as :" + metadata); - String tableName = metadata.getTableName(); + String tableName = metadata.getTableConfig().getTableName(); String mode = HoodieHiveUtil.readMode(Job.getInstance(job), tableName); + TableFileSystemView fsView = new ReadOptimizedTableView(FSUtils.getFs(), metadata); + HoodieTimeline timeline = metadata.getActiveCommitTimeline(); if (HoodieHiveUtil.INCREMENTAL_SCAN_MODE.equals(mode)) { // this is of the form commitTs_partition_sequenceNumber String lastIncrementalTs = HoodieHiveUtil.readStartCommitTime(Job.getInstance(job), tableName); // Total number of commits to return in this batch. Set this to -1 to get all the commits. 
Integer maxCommits = HoodieHiveUtil.readMaxCommits(Job.getInstance(job), tableName); LOG.info("Last Incremental timestamp was set as " + lastIncrementalTs); - List - commitsToReturn = metadata.findCommitsAfter(lastIncrementalTs, maxCommits); - FileStatus[] filteredFiles = - metadata.getLatestVersionInRange(value, commitsToReturn); - for (FileStatus filteredFile : filteredFiles) { + List commitsToReturn = + timeline.findInstantsAfter(lastIncrementalTs, maxCommits) + .collect(Collectors.toList()); + List filteredFiles = + fsView.streamLatestVersionInRange(value, commitsToReturn) + .collect(Collectors.toList()); + for (HoodieDataFile filteredFile : filteredFiles) { LOG.info("Processing incremental hoodie file - " + filteredFile.getPath()); - returns.add(filteredFile); + returns.add(filteredFile.getFileStatus()); } LOG.info( - "Total paths to process after hoodie incremental filter " + filteredFiles.length); + "Total paths to process after hoodie incremental filter " + filteredFiles.size()); } else { // filter files on the latest commit found - FileStatus[] filteredFiles = metadata.getLatestVersions(value); - LOG.info("Total paths to process after hoodie filter " + filteredFiles.length); - for (FileStatus filteredFile : filteredFiles) { + List filteredFiles = fsView.streamLatestVersions(value).collect(Collectors.toList()); + LOG.info("Total paths to process after hoodie filter " + filteredFiles.size()); + for (HoodieDataFile filteredFile : filteredFiles) { LOG.info("Processing latest hoodie file - " + filteredFile.getPath()); - returns.add(filteredFile); + returns.add(filteredFile.getFileStatus()); } } } @@ -118,18 +128,18 @@ public class HoodieInputFormat extends MapredParquetInputFormat } - private Map> groupFileStatus(FileStatus[] fileStatuses) + private Map> groupFileStatus(FileStatus[] fileStatuses) throws IOException { // This assumes the paths for different tables are grouped together - Map> grouped = new HashMap<>(); - HoodieTableMetadata metadata = null; + 
Map> grouped = new HashMap<>(); + HoodieTableMetaClient metadata = null; String nonHoodieBasePath = null; for(FileStatus status:fileStatuses) { if ((metadata == null && nonHoodieBasePath == null) || (metadata == null && !status.getPath().toString() .contains(nonHoodieBasePath)) || (metadata != null && !status.getPath().toString() .contains(metadata.getBasePath()))) { try { - metadata = getTableMetadata(status.getPath().getParent()); + metadata = getTableMetaClient(status.getPath().getParent()); nonHoodieBasePath = null; } catch (InvalidDatasetException e) { LOG.info("Handling a non-hoodie path " + status.getPath()); @@ -138,7 +148,7 @@ public class HoodieInputFormat extends MapredParquetInputFormat status.getPath().getParent().toString(); } if(!grouped.containsKey(metadata)) { - grouped.put(metadata, new ArrayList()); + grouped.put(metadata, new ArrayList<>()); } } grouped.get(metadata).add(status); @@ -242,12 +252,12 @@ public class HoodieInputFormat extends MapredParquetInputFormat * @return * @throws IOException */ - private HoodieTableMetadata getTableMetadata(Path dataPath) throws IOException { + private HoodieTableMetaClient getTableMetaClient(Path dataPath) throws IOException { FileSystem fs = dataPath.getFileSystem(conf); // TODO - remove this hard-coding. Pass this in job conf, somehow. 
Or read the Table Location Path baseDir = dataPath.getParent().getParent().getParent(); LOG.info("Reading hoodie metadata from path " + baseDir.toString()); - return new HoodieTableMetadata(fs, baseDir.toString()); + return new HoodieTableMetaClient(fs, baseDir.toString()); } } diff --git a/hoodie-hadoop-mr/src/test/java/com/uber/hoodie/hadoop/InputFormatTestUtil.java b/hoodie-hadoop-mr/src/test/java/com/uber/hoodie/hadoop/InputFormatTestUtil.java index 6a016c4a6..21fdee633 100644 --- a/hoodie-hadoop-mr/src/test/java/com/uber/hoodie/hadoop/InputFormatTestUtil.java +++ b/hoodie-hadoop-mr/src/test/java/com/uber/hoodie/hadoop/InputFormatTestUtil.java @@ -41,7 +41,7 @@ public class InputFormatTestUtil { public static File prepareDataset(TemporaryFolder basePath, int numberOfFiles, String commitNumber) throws IOException { basePath.create(); - HoodieTestUtils.initializeHoodieDirectory(basePath.getRoot().toString()); + HoodieTestUtils.init(basePath.getRoot().toString()); File partitionPath = basePath.newFolder("2016", "05", "01"); for (int i = 0; i < numberOfFiles; i++) { File dataFile = @@ -95,7 +95,7 @@ public class InputFormatTestUtil { public static File prepareParquetDataset(TemporaryFolder basePath, Schema schema, int numberOfFiles, int numberOfRecords, String commitNumber) throws IOException { basePath.create(); - HoodieTestUtils.initializeHoodieDirectory(basePath.getRoot().toString()); + HoodieTestUtils.init(basePath.getRoot().toString()); File partitionPath = basePath.newFolder("2016", "05", "01"); AvroParquetWriter parquetWriter; for (int i = 0; i < numberOfFiles; i++) { diff --git a/hoodie-utilities/pom.xml b/hoodie-utilities/pom.xml index 21f914125..d44fbf298 100644 --- a/hoodie-utilities/pom.xml +++ b/hoodie-utilities/pom.xml @@ -28,16 +28,8 @@ - org.codehaus.mojo - cobertura-maven-plugin - - - org.apache.maven.plugins - maven-compiler-plugin - - 1.7 - 1.7 - + org.jacoco + jacoco-maven-plugin org.apache.maven.plugins diff --git 
a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HiveIncrementalPuller.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HiveIncrementalPuller.java index 96f7c2c39..1dd6eabef 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HiveIncrementalPuller.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HiveIncrementalPuller.java @@ -18,7 +18,7 @@ package com.uber.hoodie.utilities; import com.beust.jcommander.JCommander; import com.beust.jcommander.Parameter; -import com.uber.hoodie.common.model.HoodieTableMetadata; +import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.exception.HoodieException; import com.uber.hoodie.utilities.exception.HoodieIncrementalPullException; import com.uber.hoodie.utilities.exception.HoodieIncrementalPullSQLException; @@ -44,7 +44,9 @@ import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Statement; import java.util.List; +import java.util.Optional; import java.util.Scanner; +import java.util.stream.Collectors; /** * Utility to pull data after a given commit, based on the supplied HiveQL and save the delta as another hive temporary table. @@ -263,9 +265,10 @@ public class HiveIncrementalPuller { if(!fs.exists(new Path(targetDataPath)) || !fs.exists(new Path(targetDataPath + "/.hoodie"))) { return "0"; } - HoodieTableMetadata metadata = new HoodieTableMetadata(fs, targetDataPath); - String lastCommit = metadata.getAllCommits().lastCommit(); - return lastCommit == null ? 
"0" : lastCommit; + HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, targetDataPath); + + Optional lastCommit = metadata.getActiveCommitTimeline().lastInstant(); + return lastCommit.orElse("0"); } private boolean ensureTempPathExists(FileSystem fs, String lastCommitTime) @@ -295,12 +298,14 @@ public class HiveIncrementalPuller { } private String getLastCommitTimePulled(FileSystem fs, String sourceTableLocation) throws IOException { - HoodieTableMetadata metadata = new HoodieTableMetadata(fs, sourceTableLocation); - List commitsToSync = - metadata.getAllCommits().findCommitsAfter(config.fromCommitTime, config.maxCommits); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, sourceTableLocation); + List commitsToSync = metadata.getActiveCommitTimeline() + .findInstantsAfter(config.fromCommitTime, config.maxCommits) + .collect(Collectors.toList()); if (commitsToSync.isEmpty()) { log.warn("Nothing to sync. All commits in " + config.sourceTable + " are " + metadata - .getAllCommits().getCommitList() + " and from commit time is " + config.fromCommitTime); + .getActiveCommitTimeline().getInstants().collect(Collectors.toList()) + + " and from commit time is " + config.fromCommitTime); return null; } log.info("Syncing commits " + commitsToSync); diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HoodieDeltaStreamer.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HoodieDeltaStreamer.java index 167c9cd1f..16da1d8ee 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HoodieDeltaStreamer.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HoodieDeltaStreamer.java @@ -22,10 +22,10 @@ import com.beust.jcommander.JCommander; import com.beust.jcommander.Parameter; import com.uber.hoodie.HoodieWriteClient; import com.uber.hoodie.common.HoodieJsonPayload; -import com.uber.hoodie.common.model.HoodieCommits; import com.uber.hoodie.common.model.HoodieKey; import 
com.uber.hoodie.common.model.HoodieRecord; -import com.uber.hoodie.common.model.HoodieTableMetadata; +import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.config.HoodieIndexConfig; import com.uber.hoodie.config.HoodieWriteConfig; @@ -65,22 +65,23 @@ public class HoodieDeltaStreamer implements Serializable { private void sync() throws Exception { JavaSparkContext sc = getSparkContext(cfg); FileSystem fs = FSUtils.getFs(); - HoodieTableMetadata targetHoodieMetadata = - new HoodieTableMetadata(fs, cfg.targetPath, cfg.targetTableName); + HoodieTableMetaClient targetHoodieMetadata = new HoodieTableMetaClient(fs, cfg.targetPath); + HoodieTimeline timeline = targetHoodieMetadata.getActiveCommitTimeline(); String lastCommitPulled = findLastCommitPulled(fs, cfg.dataPath); log.info("Last commit pulled on the source dataset is " + lastCommitPulled); - if (!targetHoodieMetadata.getAllCommits().isEmpty() && HoodieCommits - .isCommit1After(targetHoodieMetadata.getAllCommits().lastCommit(), lastCommitPulled)) { + if (!timeline.getInstants().iterator().hasNext() && timeline + .compareInstants(timeline.lastInstant().get(), lastCommitPulled, + HoodieTimeline.GREATER)) { // this should never be the case throw new IllegalStateException( - "Last commit pulled from source table " + lastCommitPulled - + " is before the last commit in the target table " + targetHoodieMetadata - .getAllCommits().lastCommit()); + "Last commit pulled from source table " + lastCommitPulled + + " is before the last commit in the target table " + timeline.lastInstant() + .get()); } - if (!cfg.override && targetHoodieMetadata.getAllCommits().contains(lastCommitPulled)) { + if (!cfg.override && timeline.containsOrBeforeTimelineStarts(lastCommitPulled)) { throw new IllegalStateException( - "Target Table already has the commit " + lastCommitPulled - + ". 
Not overriding as cfg.override is false"); + "Target Table already has the commit " + lastCommitPulled + + ". Not overriding as cfg.override is false"); } syncTill(lastCommitPulled, targetHoodieMetadata, sc); } @@ -98,7 +99,7 @@ public class HoodieDeltaStreamer implements Serializable { return commitTimes.get(0); } - private void syncTill(String lastCommitPulled, HoodieTableMetadata target, + private void syncTill(String lastCommitPulled, HoodieTableMetaClient target, JavaSparkContext sc) throws Exception { // Step 1 : Scan incrementally and get the input records as a RDD of source format String dataPath = cfg.dataPath + "/" + lastCommitPulled; @@ -159,13 +160,13 @@ public class HoodieDeltaStreamer implements Serializable { // }) } - private HoodieWriteConfig getHoodieClientConfig(HoodieTableMetadata metadata) + private HoodieWriteConfig getHoodieClientConfig(HoodieTableMetaClient metadata) throws Exception { final String schemaStr = Files.toString(new File(cfg.schemaFile), Charset.forName("UTF-8")); return HoodieWriteConfig.newBuilder().withPath(metadata.getBasePath()) .withSchema(schemaStr) .withParallelism(cfg.groupByParallelism, cfg.groupByParallelism) - .forTable(metadata.getTableName()).withIndexConfig( + .forTable(metadata.getTableConfig().getTableName()).withIndexConfig( HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) .build(); } diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HoodieSnapshotCopier.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HoodieSnapshotCopier.java index 4196fe449..6acf03fdf 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HoodieSnapshotCopier.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HoodieSnapshotCopier.java @@ -19,8 +19,12 @@ package com.uber.hoodie.utilities; import com.beust.jcommander.JCommander; import com.beust.jcommander.Parameter; -import com.uber.hoodie.common.model.HoodieCommits; -import 
com.uber.hoodie.common.model.HoodieTableMetadata; +import com.uber.hoodie.common.model.HoodieDataFile; +import com.uber.hoodie.common.table.HoodieTableConfig; +import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.table.HoodieTimeline; +import com.uber.hoodie.common.table.TableFileSystemView; +import com.uber.hoodie.common.table.view.ReadOptimizedTableView; import com.uber.hoodie.common.util.FSUtils; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; @@ -40,6 +44,8 @@ import java.io.Serializable; import java.util.ArrayList; import java.util.Iterator; import java.util.List; +import java.util.Optional; +import java.util.stream.Collectors; /** * Hoodie snapshot copy job which copies latest files from all partitions to another place, for snapshot backup. @@ -57,11 +63,15 @@ public class HoodieSnapshotCopier implements Serializable { public void snapshot(JavaSparkContext jsc, String baseDir, final String outputDir) throws IOException { FileSystem fs = FSUtils.getFs(); - final HoodieTableMetadata tableMetadata = new HoodieTableMetadata(fs, baseDir); - + final HoodieTableMetaClient tableMetadata = new HoodieTableMetaClient(fs, baseDir); + final TableFileSystemView fsView = new ReadOptimizedTableView(fs, tableMetadata); // Get the latest commit - final String latestCommit = tableMetadata.getAllCommits().lastCommit(); - logger.info(String.format("Starting to snapshot latest version files which are also no-late-than %s.", latestCommit)); + final Optional latestCommit = tableMetadata.getActiveCommitTimeline().lastInstant(); + if(!latestCommit.isPresent()) { + logger.warn("No commits present. 
Nothing to snapshot"); + } else { + logger.info(String.format("Starting to snapshot latest version files which are also no-late-than %s.", latestCommit.get())); + } List partitions = FSUtils.getAllPartitionPaths(fs, baseDir); if (partitions.size() > 0) { @@ -80,8 +90,10 @@ public class HoodieSnapshotCopier implements Serializable { // Only take latest version files <= latestCommit. FileSystem fs = FSUtils.getFs(); List> filePaths = new ArrayList<>(); - for (FileStatus fileStatus : tableMetadata.getLatestVersionInPartition(fs, partition, latestCommit)) { - filePaths.add(new Tuple2<>(partition, fileStatus.getPath().toString())); + for (HoodieDataFile hoodieDataFile : fsView + .streamLatestVersionInPartition(partition, latestCommit.get()) + .collect(Collectors.toList())) { + filePaths.add(new Tuple2<>(partition, hoodieDataFile.getPath())); } return filePaths.iterator(); } @@ -102,22 +114,25 @@ public class HoodieSnapshotCopier implements Serializable { }); // Also copy the .commit files - logger.info(String.format("Copying .commit files which are no-late-than %s.", latestCommit)); + logger.info(String.format("Copying .commit files which are no-late-than %s.", latestCommit.get())); FileStatus[] commitFilesToCopy = fs.listStatus( - new Path(baseDir + "/" + HoodieTableMetadata.METAFOLDER_NAME), new PathFilter() { + new Path(baseDir + "/" + HoodieTableMetaClient.METAFOLDER_NAME), new PathFilter() { @Override public boolean accept(Path commitFilePath) { - if (commitFilePath.getName().equals(HoodieTableMetadata.HOODIE_PROPERTIES_FILE)) { + if (commitFilePath.getName().equals(HoodieTableConfig.HOODIE_PROPERTIES_FILE)) { return true; } else { - String commitTime = FSUtils.getCommitFromCommitFile(commitFilePath.getName()); - return HoodieCommits.isCommit1BeforeOrOn(commitTime, latestCommit); + String commitTime = + FSUtils.getCommitFromCommitFile(commitFilePath.getName()); + return tableMetadata.getActiveCommitTimeline() + .compareInstants(commitTime, latestCommit.get(), 
HoodieTimeline.GREATER); } } }); for (FileStatus commitStatus : commitFilesToCopy) { - Path targetFilePath = - new Path(outputDir + "/" + HoodieTableMetadata.METAFOLDER_NAME + "/" + commitStatus.getPath().getName()); + Path targetFilePath = new Path( + outputDir + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitStatus + .getPath().getName()); if (! fs.exists(targetFilePath.getParent())) { fs.mkdirs(targetFilePath.getParent()); } diff --git a/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/TestHoodieSnapshotCopier.java b/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/TestHoodieSnapshotCopier.java index 6a0343412..868d4617b 100644 --- a/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/TestHoodieSnapshotCopier.java +++ b/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/TestHoodieSnapshotCopier.java @@ -46,7 +46,7 @@ public class TestHoodieSnapshotCopier { folder.create(); rootPath = folder.getRoot().getAbsolutePath(); basePath = rootPath + "/" + HoodieTestUtils.RAW_TRIPS_TEST_NAME; - HoodieTestUtils.initializeHoodieDirectory(basePath); + HoodieTestUtils.init(basePath); outputPath = rootPath + "/output"; fs = FSUtils.getFs(); // Start a local Spark job diff --git a/pom.xml b/pom.xml index af990c7a9..5e81073d1 100644 --- a/pom.xml +++ b/pom.xml @@ -126,8 +126,8 @@ org.apache.maven.plugins maven-compiler-plugin - 1.7 - 1.7 + 1.8 + 1.8 @@ -146,9 +146,15 @@ maven-surefire-plugin ${maven-surefire-plugin.version} + + ${surefireArgLine} file:${project.build.testOutputDirectory}/log4j-surefire.properties + + + **/IT*.java + @@ -164,24 +170,74 @@ ${maven-jar-plugin.version} - org.codehaus.mojo - cobertura-maven-plugin - 2.7 - - - html - xml - - + org.jacoco + jacoco-maven-plugin + 0.7.8 + + pre-unit-test + + prepare-agent + + + + ${project.build.directory}/coverage-reports/jacoco-ut.exec + + surefireArgLine + + + + + post-unit-test test - cobertura + report + + + ${project.build.directory}/coverage-reports/jacoco-ut.exec + + 
${project.reporting.outputDirectory}/jacoco-ut + + + + + + + + + + + + + + + + + + + + + + + + + + + org.apache.rat @@ -422,6 +478,11 @@ hive-metastore ${hive.version}-cdh${cdh.version} + + org.apache.commons + commons-lang3 + 3.4 + junit @@ -446,6 +507,14 @@ test 1.10.19 + + + com.esotericsoftware + kryo + 4.0.0 + test + +