diff --git a/hoodie-cli/pom.xml b/hoodie-cli/pom.xml index 40a6cc928..d590153a8 100644 --- a/hoodie-cli/pom.xml +++ b/hoodie-cli/pom.xml @@ -15,7 +15,9 @@ ~ limitations under the License. --> - + hoodie com.uber.hoodie @@ -117,7 +119,7 @@ - + org.apache.rat apache-rat-plugin diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/HoodieCLI.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/HoodieCLI.java index 0b8e8fced..27f0ab516 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/HoodieCLI.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/HoodieCLI.java @@ -17,38 +17,38 @@ package com.uber.hoodie.cli; import com.uber.hoodie.common.table.HoodieTableMetaClient; +import java.io.IOException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; -import java.io.IOException; - public class HoodieCLI { - public static Configuration conf; - public static FileSystem fs; - public static CLIState state = CLIState.INIT; - public static HoodieTableMetaClient tableMetadata; - public static HoodieTableMetaClient syncTableMetadata; + + public static Configuration conf; + public static FileSystem fs; + public static CLIState state = CLIState.INIT; + public static HoodieTableMetaClient tableMetadata; + public static HoodieTableMetaClient syncTableMetadata; - public enum CLIState { - INIT, DATASET, SYNC + public enum CLIState { + INIT, DATASET, SYNC + } + + public static boolean initConf() { + if (HoodieCLI.conf == null) { + HoodieCLI.conf = new Configuration(); + return true; } + return false; + } - public static boolean initConf() { - if (HoodieCLI.conf == null) { - HoodieCLI.conf = new Configuration(); - return true; - } - return false; + public static void initFS(boolean force) throws IOException { + if (fs == null || force) { + fs = FileSystem.get(conf); } + } - public static void initFS(boolean force) throws IOException { - if(fs == null || force) { - fs = FileSystem.get(conf); - } - } - - public static void 
setTableMetadata(HoodieTableMetaClient tableMetadata) { - HoodieCLI.tableMetadata = tableMetadata; - } + public static void setTableMetadata(HoodieTableMetaClient tableMetadata) { + HoodieCLI.tableMetadata = tableMetadata; + } } diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/HoodieHistoryFileNameProvider.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/HoodieHistoryFileNameProvider.java index aecf1de49..01440a274 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/HoodieHistoryFileNameProvider.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/HoodieHistoryFileNameProvider.java @@ -25,13 +25,13 @@ import org.springframework.stereotype.Component; @Order(Ordered.HIGHEST_PRECEDENCE) public class HoodieHistoryFileNameProvider extends DefaultHistoryFileNameProvider { - public String getHistoryFileName() { - return "hoodie-cmd.log"; - } + public String getHistoryFileName() { + return "hoodie-cmd.log"; + } - @Override - public String getProviderName() { - return "Hoodie file name provider"; - } + @Override + public String getProviderName() { + return "Hoodie file name provider"; + } } diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/HoodiePrintHelper.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/HoodiePrintHelper.java index 34b2c2414..b6625718b 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/HoodiePrintHelper.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/HoodiePrintHelper.java @@ -17,18 +17,17 @@ package com.uber.hoodie.cli; import dnl.utils.text.table.TextTable; - import java.io.ByteArrayOutputStream; import java.io.PrintStream; import java.nio.charset.Charset; public class HoodiePrintHelper { - public static String print(String[] header, String[][] rows) { - TextTable textTable = new TextTable(header, rows); - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - PrintStream ps = new PrintStream(baos); - textTable.printTable(ps, 4); - return new String(baos.toByteArray(), Charset.forName("utf-8")); - } + 
public static String print(String[] header, String[][] rows) { + TextTable textTable = new TextTable(header, rows); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + PrintStream ps = new PrintStream(baos); + textTable.printTable(ps, 4); + return new String(baos.toByteArray(), Charset.forName("utf-8")); + } } diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/HoodiePrompt.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/HoodiePrompt.java index 268ec1721..2839cac99 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/HoodiePrompt.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/HoodiePrompt.java @@ -16,7 +16,6 @@ package com.uber.hoodie.cli; -import com.uber.hoodie.common.table.HoodieTableConfig; import org.springframework.core.Ordered; import org.springframework.core.annotation.Order; import org.springframework.shell.plugin.support.DefaultPromptProvider; @@ -26,27 +25,27 @@ import org.springframework.stereotype.Component; @Order(Ordered.HIGHEST_PRECEDENCE) public class HoodiePrompt extends DefaultPromptProvider { - @Override - public String getPrompt() { - if (HoodieCLI.tableMetadata != null) { - String tableName = HoodieCLI.tableMetadata.getTableConfig().getTableName(); - switch (HoodieCLI.state) { - case INIT: - return "hoodie->"; - case DATASET: - return "hoodie:" + tableName + "->"; - case SYNC: - return "hoodie:" + tableName + " <==> " - + HoodieCLI.syncTableMetadata.getTableConfig().getTableName() + "->"; - } - return "hoodie:" + tableName + "->"; - } - return "hoodie->"; + @Override + public String getPrompt() { + if (HoodieCLI.tableMetadata != null) { + String tableName = HoodieCLI.tableMetadata.getTableConfig().getTableName(); + switch (HoodieCLI.state) { + case INIT: + return "hoodie->"; + case DATASET: + return "hoodie:" + tableName + "->"; + case SYNC: + return "hoodie:" + tableName + " <==> " + + HoodieCLI.syncTableMetadata.getTableConfig().getTableName() + "->"; + } + return "hoodie:" + tableName + "->"; } + return 
"hoodie->"; + } - @Override - public String getProviderName() { - return "Hoodie provider"; - } + @Override + public String getProviderName() { + return "Hoodie provider"; + } } diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/HoodieSplashScreen.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/HoodieSplashScreen.java index fa27d5749..d6a16891a 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/HoodieSplashScreen.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/HoodieSplashScreen.java @@ -22,34 +22,39 @@ import org.springframework.shell.plugin.support.DefaultBannerProvider; import org.springframework.shell.support.util.OsUtils; import org.springframework.stereotype.Component; -@Component @Order(Ordered.HIGHEST_PRECEDENCE) public class HoodieSplashScreen +@Component +@Order(Ordered.HIGHEST_PRECEDENCE) +public class HoodieSplashScreen extends DefaultBannerProvider { - private static String screen = "============================================" + OsUtils.LINE_SEPARATOR + - "* *" + OsUtils.LINE_SEPARATOR + - "* _ _ _ _ *" + OsUtils.LINE_SEPARATOR + - "* | | | | | (_) *" + OsUtils.LINE_SEPARATOR + - "* | |__| | ___ ___ __| |_ ___ *" + OsUtils.LINE_SEPARATOR + - "* | __ |/ _ \\ / _ \\ / _` | |/ _ \\ *" + - OsUtils.LINE_SEPARATOR + - "* | | | | (_) | (_) | (_| | | __/ *" + OsUtils.LINE_SEPARATOR + - "* |_| |_|\\___/ \\___/ \\__,_|_|\\___| *" + - OsUtils.LINE_SEPARATOR + - "* *" + OsUtils.LINE_SEPARATOR + - "============================================" + OsUtils.LINE_SEPARATOR; - public String getBanner() { - return screen; - } + private static String screen = + "============================================" + OsUtils.LINE_SEPARATOR + + "* *" + OsUtils.LINE_SEPARATOR + + "* _ _ _ _ *" + OsUtils.LINE_SEPARATOR + + "* | | | | | (_) *" + OsUtils.LINE_SEPARATOR + + "* | |__| | ___ ___ __| |_ ___ *" + OsUtils.LINE_SEPARATOR + + "* | __ |/ _ \\ / _ \\ / _` | |/ _ \\ *" + + OsUtils.LINE_SEPARATOR + + "* | | | | (_) | (_) | (_| | | __/ *" + 
OsUtils.LINE_SEPARATOR + + "* |_| |_|\\___/ \\___/ \\__,_|_|\\___| *" + + OsUtils.LINE_SEPARATOR + + "* *" + OsUtils.LINE_SEPARATOR + + "============================================" + OsUtils.LINE_SEPARATOR; - public String getVersion() { - return "1.0"; - } + public String getBanner() { + return screen; + } - public String getWelcomeMessage() { - return "Welcome to Hoodie CLI. Please type help if you are looking for help. "; - } + public String getVersion() { + return "1.0"; + } - @Override public String getProviderName() { - return "Hoodie Banner"; - } + public String getWelcomeMessage() { + return "Welcome to Hoodie CLI. Please type help if you are looking for help. "; + } + + @Override + public String getProviderName() { + return "Hoodie Banner"; + } } diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/Main.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/Main.java index 779df13f0..c0d7924ff 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/Main.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/Main.java @@ -16,18 +16,16 @@ package com.uber.hoodie.cli; +import java.io.IOException; import org.springframework.shell.Bootstrap; -import java.io.IOException; - public class Main { - /** - * Main class that delegates to Spring Shell's Bootstrap class in order to simplify debugging inside an IDE - * - * @param args - * @throws IOException - */ - public static void main(String[] args) throws IOException { - Bootstrap.main(args); - } + + /** + * Main class that delegates to Spring Shell's Bootstrap class in order to simplify debugging + * inside an IDE + */ + public static void main(String[] args) throws IOException { + Bootstrap.main(args); + } } diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/ArchivedCommitsCommand.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/ArchivedCommitsCommand.java index 147734990..069c6564a 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/ArchivedCommitsCommand.java +++ 
b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/ArchivedCommitsCommand.java @@ -24,6 +24,10 @@ import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.table.log.HoodieLogFormat; import com.uber.hoodie.common.table.log.block.HoodieAvroDataBlock; import com.uber.hoodie.common.util.FSUtils; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Collectors; import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; import org.apache.hadoop.fs.FileStatus; @@ -34,90 +38,90 @@ import org.springframework.shell.core.annotation.CliCommand; import org.springframework.shell.core.annotation.CliOption; import org.springframework.stereotype.Component; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import java.util.stream.Collectors; - @Component public class ArchivedCommitsCommand implements CommandMarker { - @CliAvailabilityIndicator({"show archived commits"}) - public boolean isShowArchivedCommitAvailable() { - return HoodieCLI.tableMetadata != null; + @CliAvailabilityIndicator({"show archived commits"}) + public boolean isShowArchivedCommitAvailable() { + return HoodieCLI.tableMetadata != null; + } + + @CliCommand(value = "show archived commits", help = "Read commits from archived files and show details") + public String showCommits( + @CliOption(key = { + "limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10") + final Integer limit) throws IOException { + + System.out + .println("===============> Showing only " + limit + " archived commits <==============="); + FileStatus[] fsStatuses = FSUtils.getFs().globStatus( + new Path(HoodieCLI.tableMetadata.getBasePath() + "/.hoodie/.commits_.archive*")); + List allCommits = new ArrayList<>(); + for (FileStatus fs : fsStatuses) { + //read the archived file + HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(), + new 
HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema(), false); + + List readRecords = new ArrayList<>(); + //read the avro blocks + while (reader.hasNext()) { + HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next(); + List records = blk.getRecords(); + readRecords.addAll(records); + } + List readCommits = readRecords.stream().map(r -> (GenericRecord) r) + .map(r -> readCommit(r)).limit(limit).collect(Collectors.toList()); + allCommits.addAll(readCommits); } + return HoodiePrintHelper.print( + new String[]{"CommitTime", "CommitType", "CommitDetails"}, + allCommits.toArray(new String[allCommits.size()][])); + } - @CliCommand(value = "show archived commits", help = "Read commits from archived files and show details") - public String showCommits( - @CliOption(key = {"limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10") - final Integer limit) throws IOException { - - System.out.println("===============> Showing only " + limit + " archived commits <==============="); - FileStatus [] fsStatuses = FSUtils.getFs().globStatus(new Path(HoodieCLI.tableMetadata.getBasePath() + "/.hoodie/.commits_.archive*")); - List allCommits = new ArrayList<>(); - for(FileStatus fs : fsStatuses) { - //read the archived file - HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(), - new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema(), false); - - List readRecords = new ArrayList<>(); - //read the avro blocks - while (reader.hasNext()) { - HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next(); - List records = blk.getRecords(); - readRecords.addAll(records); - } - List readCommits = readRecords.stream().map(r -> (GenericRecord)r).map(r -> readCommit(r)).limit(limit).collect(Collectors.toList()); - allCommits.addAll(readCommits); + private String[] readCommit(GenericRecord record) { + List commitDetails = new ArrayList<>(); + try { + switch (record.get("actionType").toString()) { + case 
HoodieTimeline.CLEAN_ACTION: { + commitDetails.add(record.get("commitTime").toString()); + commitDetails.add(record.get("actionType").toString()); + commitDetails.add(record.get("hoodieCleanMetadata").toString()); + break; } - return HoodiePrintHelper.print( - new String[] {"CommitTime", "CommitType", "CommitDetails"}, allCommits.toArray(new String[allCommits.size()][])); - } - - private String[] readCommit(GenericRecord record) { - List commitDetails = new ArrayList<>(); - try { - switch (record.get("actionType").toString()) { - case HoodieTimeline.CLEAN_ACTION: { - commitDetails.add(record.get("commitTime").toString()); - commitDetails.add(record.get("actionType").toString()); - commitDetails.add(record.get("hoodieCleanMetadata").toString()); - break; - } - case HoodieTimeline.COMMIT_ACTION: { - commitDetails.add(record.get("commitTime").toString()); - commitDetails.add(record.get("actionType").toString()); - commitDetails.add(record.get("hoodieCommitMetadata").toString()); - break; - } - case HoodieTimeline.COMPACTION_ACTION: { - commitDetails.add(record.get("commitTime").toString()); - commitDetails.add(record.get("actionType").toString()); - commitDetails.add(record.get("hoodieCompactionMetadata").toString()); - break; - } - case HoodieTimeline.DELTA_COMMIT_ACTION: { - commitDetails.add(record.get("commitTime").toString()); - commitDetails.add(record.get("actionType").toString()); - commitDetails.add(record.get("hoodieCommitMetadata").toString()); - break; - } - case HoodieTimeline.ROLLBACK_ACTION: { - commitDetails.add(record.get("commitTime").toString()); - commitDetails.add(record.get("actionType").toString()); - commitDetails.add(record.get("hoodieRollbackMetadata").toString()); - break; - } - case HoodieTimeline.SAVEPOINT_ACTION: { - commitDetails.add(record.get("commitTime").toString()); - commitDetails.add(record.get("actionType").toString()); - commitDetails.add(record.get("hoodieSavePointMetadata").toString()); - break; - } - } - } catch (Exception e) 
{ - e.printStackTrace(); + case HoodieTimeline.COMMIT_ACTION: { + commitDetails.add(record.get("commitTime").toString()); + commitDetails.add(record.get("actionType").toString()); + commitDetails.add(record.get("hoodieCommitMetadata").toString()); + break; } - return commitDetails.toArray(new String[commitDetails.size()]); + case HoodieTimeline.COMPACTION_ACTION: { + commitDetails.add(record.get("commitTime").toString()); + commitDetails.add(record.get("actionType").toString()); + commitDetails.add(record.get("hoodieCompactionMetadata").toString()); + break; + } + case HoodieTimeline.DELTA_COMMIT_ACTION: { + commitDetails.add(record.get("commitTime").toString()); + commitDetails.add(record.get("actionType").toString()); + commitDetails.add(record.get("hoodieCommitMetadata").toString()); + break; + } + case HoodieTimeline.ROLLBACK_ACTION: { + commitDetails.add(record.get("commitTime").toString()); + commitDetails.add(record.get("actionType").toString()); + commitDetails.add(record.get("hoodieRollbackMetadata").toString()); + break; + } + case HoodieTimeline.SAVEPOINT_ACTION: { + commitDetails.add(record.get("commitTime").toString()); + commitDetails.add(record.get("actionType").toString()); + commitDetails.add(record.get("hoodieSavePointMetadata").toString()); + break; + } + } + } catch (Exception e) { + e.printStackTrace(); } -} \ No newline at end of file + return commitDetails.toArray(new String[commitDetails.size()]); + } +} diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/CleansCommand.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/CleansCommand.java index 160b9f3c6..1b5a9602d 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/CleansCommand.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/CleansCommand.java @@ -24,89 +24,90 @@ import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline; import 
com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.common.util.AvroUtils; -import org.springframework.shell.core.CommandMarker; -import org.springframework.shell.core.annotation.CliAvailabilityIndicator; -import org.springframework.shell.core.annotation.CliCommand; -import org.springframework.shell.core.annotation.CliOption; -import org.springframework.stereotype.Component; - import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Map; import java.util.stream.Collectors; +import org.springframework.shell.core.CommandMarker; +import org.springframework.shell.core.annotation.CliAvailabilityIndicator; +import org.springframework.shell.core.annotation.CliCommand; +import org.springframework.shell.core.annotation.CliOption; +import org.springframework.stereotype.Component; @Component public class CleansCommand implements CommandMarker { - @CliAvailabilityIndicator({"cleans show"}) - public boolean isShowAvailable() { - return HoodieCLI.tableMetadata != null; - } - @CliAvailabilityIndicator({"cleans refresh"}) - public boolean isRefreshAvailable() { - return HoodieCLI.tableMetadata != null; - } + @CliAvailabilityIndicator({"cleans show"}) + public boolean isShowAvailable() { + return HoodieCLI.tableMetadata != null; + } - @CliAvailabilityIndicator({"clean showpartitions"}) - public boolean isCommitShowAvailable() { - return HoodieCLI.tableMetadata != null; - } + @CliAvailabilityIndicator({"cleans refresh"}) + public boolean isRefreshAvailable() { + return HoodieCLI.tableMetadata != null; + } - @CliCommand(value = "cleans show", help = "Show the cleans") - public String showCleans() throws IOException { - HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline(); - HoodieTimeline timeline = activeTimeline.getCleanerTimeline().filterCompletedInstants(); - List cleans = timeline.getInstants().collect(Collectors.toList()); - String[][] rows = new 
String[cleans.size()][]; - Collections.reverse(cleans); - for (int i = 0; i < cleans.size(); i++) { - HoodieInstant clean = cleans.get(i); - HoodieCleanMetadata cleanMetadata = - AvroUtils.deserializeHoodieCleanMetadata(timeline.getInstantDetails(clean).get()); - rows[i] = new String[] {clean.getTimestamp(), cleanMetadata.getEarliestCommitToRetain(), - String.valueOf(cleanMetadata.getTotalFilesDeleted()), - String.valueOf(cleanMetadata.getTimeTakenInMillis())}; - } - return HoodiePrintHelper.print( - new String[] {"CleanTime", "EarliestCommandRetained", "Total Files Deleted", - "Total Time Taken"}, rows); - } + @CliAvailabilityIndicator({"clean showpartitions"}) + public boolean isCommitShowAvailable() { + return HoodieCLI.tableMetadata != null; + } - @CliCommand(value = "cleans refresh", help = "Refresh the commits") - public String refreshCleans() throws IOException { - HoodieTableMetaClient metadata = - new HoodieTableMetaClient(HoodieCLI.fs, HoodieCLI.tableMetadata.getBasePath()); - HoodieCLI.setTableMetadata(metadata); - return "Metadata for table " + metadata.getTableConfig().getTableName() + " refreshed."; + @CliCommand(value = "cleans show", help = "Show the cleans") + public String showCleans() throws IOException { + HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline(); + HoodieTimeline timeline = activeTimeline.getCleanerTimeline().filterCompletedInstants(); + List cleans = timeline.getInstants().collect(Collectors.toList()); + String[][] rows = new String[cleans.size()][]; + Collections.reverse(cleans); + for (int i = 0; i < cleans.size(); i++) { + HoodieInstant clean = cleans.get(i); + HoodieCleanMetadata cleanMetadata = + AvroUtils.deserializeHoodieCleanMetadata(timeline.getInstantDetails(clean).get()); + rows[i] = new String[]{clean.getTimestamp(), cleanMetadata.getEarliestCommitToRetain(), + String.valueOf(cleanMetadata.getTotalFilesDeleted()), + String.valueOf(cleanMetadata.getTimeTakenInMillis())}; } + return 
HoodiePrintHelper.print( + new String[]{"CleanTime", "EarliestCommandRetained", "Total Files Deleted", + "Total Time Taken"}, rows); + } - @CliCommand(value = "clean showpartitions", help = "Show partition level details of a clean") - public String showCleanPartitions( - @CliOption(key = {"clean"}, help = "clean to show") - final String commitTime) throws Exception { - HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline(); - HoodieTimeline timeline = activeTimeline.getCleanerTimeline().filterCompletedInstants(); - HoodieInstant cleanInstant = - new HoodieInstant(false, HoodieTimeline.CLEAN_ACTION, commitTime); + @CliCommand(value = "cleans refresh", help = "Refresh the commits") + public String refreshCleans() throws IOException { + HoodieTableMetaClient metadata = + new HoodieTableMetaClient(HoodieCLI.fs, HoodieCLI.tableMetadata.getBasePath()); + HoodieCLI.setTableMetadata(metadata); + return "Metadata for table " + metadata.getTableConfig().getTableName() + " refreshed."; + } - if (!timeline.containsInstant(cleanInstant)) { - return "Clean " + commitTime + " not found in metadata " + timeline; - } - HoodieCleanMetadata cleanMetadata = - AvroUtils.deserializeHoodieCleanMetadata(timeline.getInstantDetails(cleanInstant).get()); - List rows = new ArrayList<>(); - for (Map.Entry entry : cleanMetadata.getPartitionMetadata().entrySet()) { - String path = entry.getKey(); - HoodieCleanPartitionMetadata stats = entry.getValue(); - String policy = stats.getPolicy(); - String totalSuccessDeletedFiles = String.valueOf(stats.getSuccessDeleteFiles().size()); - String totalFailedDeletedFiles = String.valueOf(stats.getFailedDeleteFiles().size()); - rows.add(new String[] {path, policy, totalSuccessDeletedFiles, totalFailedDeletedFiles}); - } - return HoodiePrintHelper.print( - new String[] {"Partition Path", "Cleaning policy", "Total Files Successfully Deleted", - "Total Failed Deletions"}, rows.toArray(new String[rows.size()][])); + @CliCommand(value = 
"clean showpartitions", help = "Show partition level details of a clean") + public String showCleanPartitions( + @CliOption(key = {"clean"}, help = "clean to show") + final String commitTime) throws Exception { + HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline(); + HoodieTimeline timeline = activeTimeline.getCleanerTimeline().filterCompletedInstants(); + HoodieInstant cleanInstant = + new HoodieInstant(false, HoodieTimeline.CLEAN_ACTION, commitTime); + + if (!timeline.containsInstant(cleanInstant)) { + return "Clean " + commitTime + " not found in metadata " + timeline; } + HoodieCleanMetadata cleanMetadata = + AvroUtils.deserializeHoodieCleanMetadata(timeline.getInstantDetails(cleanInstant).get()); + List rows = new ArrayList<>(); + for (Map.Entry entry : cleanMetadata + .getPartitionMetadata().entrySet()) { + String path = entry.getKey(); + HoodieCleanPartitionMetadata stats = entry.getValue(); + String policy = stats.getPolicy(); + String totalSuccessDeletedFiles = String.valueOf(stats.getSuccessDeleteFiles().size()); + String totalFailedDeletedFiles = String.valueOf(stats.getFailedDeleteFiles().size()); + rows.add(new String[]{path, policy, totalSuccessDeletedFiles, totalFailedDeletedFiles}); + } + return HoodiePrintHelper.print( + new String[]{"Partition Path", "Cleaning policy", "Total Files Successfully Deleted", + "Total Failed Deletions"}, rows.toArray(new String[rows.size()][])); + } } diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/CommitsCommand.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/CommitsCommand.java index 3caacfa81..c1a9e6dd9 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/CommitsCommand.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/CommitsCommand.java @@ -27,7 +27,12 @@ import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline; import 
com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.common.util.NumericUtils; - +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; import org.apache.spark.launcher.SparkLauncher; import org.springframework.shell.core.CommandMarker; import org.springframework.shell.core.annotation.CliAvailabilityIndicator; @@ -35,228 +40,236 @@ import org.springframework.shell.core.annotation.CliCommand; import org.springframework.shell.core.annotation.CliOption; import org.springframework.stereotype.Component; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; - @Component public class CommitsCommand implements CommandMarker { - @CliAvailabilityIndicator({"commits show"}) - public boolean isShowAvailable() { - return HoodieCLI.tableMetadata != null; + + @CliAvailabilityIndicator({"commits show"}) + public boolean isShowAvailable() { + return HoodieCLI.tableMetadata != null; + } + + @CliAvailabilityIndicator({"commits refresh"}) + public boolean isRefreshAvailable() { + return HoodieCLI.tableMetadata != null; + } + + @CliAvailabilityIndicator({"commit rollback"}) + public boolean isRollbackAvailable() { + return HoodieCLI.tableMetadata != null; + } + + @CliAvailabilityIndicator({"commit show"}) + public boolean isCommitShowAvailable() { + return HoodieCLI.tableMetadata != null; + } + + @CliCommand(value = "commits show", help = "Show the commits") + public String showCommits( + @CliOption(key = { + "limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10") + final Integer limit) throws IOException { + HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline(); + HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline() + .filterCompletedInstants(); + 
List commits = timeline.getInstants().collect(Collectors.toList()); + String[][] rows = new String[commits.size()][]; + Collections.reverse(commits); + for (int i = 0; i < commits.size(); i++) { + HoodieInstant commit = commits.get(i); + HoodieCommitMetadata commitMetadata = + HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(commit).get()); + rows[i] = new String[]{commit.getTimestamp(), + NumericUtils.humanReadableByteCount(commitMetadata.fetchTotalBytesWritten()), + String.valueOf(commitMetadata.fetchTotalFilesInsert()), + String.valueOf(commitMetadata.fetchTotalFilesUpdated()), + String.valueOf(commitMetadata.fetchTotalPartitionsWritten()), + String.valueOf(commitMetadata.fetchTotalRecordsWritten()), + String.valueOf(commitMetadata.fetchTotalUpdateRecordsWritten()), + String.valueOf(commitMetadata.fetchTotalWriteErrors())}; + } + return HoodiePrintHelper.print( + new String[]{"CommitTime", "Total Written (B)", "Total Files Added", + "Total Files Updated", "Total Partitions Written", "Total Records Written", + "Total Update Records Written", "Total Errors"}, rows); + } + + @CliCommand(value = "commits refresh", help = "Refresh the commits") + public String refreshCommits() throws IOException { + HoodieTableMetaClient metadata = + new HoodieTableMetaClient(HoodieCLI.fs, HoodieCLI.tableMetadata.getBasePath()); + HoodieCLI.setTableMetadata(metadata); + return "Metadata for table " + metadata.getTableConfig().getTableName() + " refreshed."; + } + + @CliCommand(value = "commit rollback", help = "Rollback a commit") + public String rollbackCommit( + @CliOption(key = {"commit"}, help = "Commit to rollback") + final String commitTime, + @CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path") + final String sparkPropertiesPath) throws Exception { + HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline(); + HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline() + .filterCompletedInstants(); + 
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, + commitTime); + + if (!timeline.containsInstant(commitInstant)) { + return "Commit " + commitTime + " not found in Commits " + timeline; } - @CliAvailabilityIndicator({"commits refresh"}) - public boolean isRefreshAvailable() { - return HoodieCLI.tableMetadata != null; + SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); + sparkLauncher.addAppArgs(SparkMain.SparkCommand.ROLLBACK.toString(), + commitTime, + HoodieCLI.tableMetadata.getBasePath()); + Process process = sparkLauncher.launch(); + InputStreamConsumer.captureOutput(process); + int exitCode = process.waitFor(); + // Refresh the current + refreshCommits(); + if (exitCode != 0) { + return "Commit " + commitTime + " failed to roll back"; } + return "Commit " + commitTime + " rolled back"; + } - @CliAvailabilityIndicator({"commit rollback"}) - public boolean isRollbackAvailable() { - return HoodieCLI.tableMetadata != null; + @CliCommand(value = "commit showpartitions", help = "Show partition level details of a commit") + public String showCommitPartitions( + @CliOption(key = {"commit"}, help = "Commit to show") + final String commitTime) throws Exception { + HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline(); + HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline() + .filterCompletedInstants(); + HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, + commitTime); + + if (!timeline.containsInstant(commitInstant)) { + return "Commit " + commitTime + " not found in Commits " + timeline; } - - @CliAvailabilityIndicator({"commit show"}) - public boolean isCommitShowAvailable() { - return HoodieCLI.tableMetadata != null; - } - - @CliCommand(value = "commits show", help = "Show the commits") - public String showCommits( - @CliOption(key = { - "limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10") - 
final Integer limit) throws IOException { - HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline(); - HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline().filterCompletedInstants(); - List commits = timeline.getInstants().collect(Collectors.toList()); - String[][] rows = new String[commits.size()][]; - Collections.reverse(commits); - for (int i = 0; i < commits.size(); i++) { - HoodieInstant commit = commits.get(i); - HoodieCommitMetadata commitMetadata = - HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(commit).get()); - rows[i] = new String[] {commit.getTimestamp(), - NumericUtils.humanReadableByteCount(commitMetadata.fetchTotalBytesWritten()), - String.valueOf(commitMetadata.fetchTotalFilesInsert()), - String.valueOf(commitMetadata.fetchTotalFilesUpdated()), - String.valueOf(commitMetadata.fetchTotalPartitionsWritten()), - String.valueOf(commitMetadata.fetchTotalRecordsWritten()), - String.valueOf(commitMetadata.fetchTotalUpdateRecordsWritten()), - String.valueOf(commitMetadata.fetchTotalWriteErrors())}; - } - return HoodiePrintHelper.print( - new String[] {"CommitTime", "Total Written (B)", "Total Files Added", - "Total Files Updated", "Total Partitions Written", "Total Records Written", - "Total Update Records Written", "Total Errors"}, rows); - } - - @CliCommand(value = "commits refresh", help = "Refresh the commits") - public String refreshCommits() throws IOException { - HoodieTableMetaClient metadata = - new HoodieTableMetaClient(HoodieCLI.fs, HoodieCLI.tableMetadata.getBasePath()); - HoodieCLI.setTableMetadata(metadata); - return "Metadata for table " + metadata.getTableConfig().getTableName() + " refreshed."; - } - - @CliCommand(value = "commit rollback", help = "Rollback a commit") - public String rollbackCommit( - @CliOption(key = {"commit"}, help = "Commit to rollback") - final String commitTime, - @CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path") - final String 
sparkPropertiesPath) throws Exception { - HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline(); - HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline().filterCompletedInstants(); - HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime); - - if (!timeline.containsInstant(commitInstant)) { - return "Commit " + commitTime + " not found in Commits " + timeline; - } - - SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); - sparkLauncher.addAppArgs(SparkMain.SparkCommand.ROLLBACK.toString(), - commitTime, - HoodieCLI.tableMetadata.getBasePath()); - Process process = sparkLauncher.launch(); - InputStreamConsumer.captureOutput(process); - int exitCode = process.waitFor(); - // Refresh the current - refreshCommits(); - if (exitCode != 0) { - return "Commit " + commitTime + " failed to roll back"; - } - return "Commit " + commitTime + " rolled back"; - } - - @CliCommand(value = "commit showpartitions", help = "Show partition level details of a commit") - public String showCommitPartitions( - @CliOption(key = {"commit"}, help = "Commit to show") - final String commitTime) throws Exception { - HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline(); - HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline().filterCompletedInstants(); - HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime); - - if (!timeline.containsInstant(commitInstant)) { - return "Commit " + commitTime + " not found in Commits " + timeline; - } - HoodieCommitMetadata meta = - HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(commitInstant).get()); - List rows = new ArrayList(); - for (Map.Entry> entry : meta.getPartitionToWriteStats() - .entrySet()) { - String path = entry.getKey(); - List stats = entry.getValue(); - long totalFilesAdded = 0; - long totalFilesUpdated = 0; - long 
totalRecordsUpdated = 0; - long totalRecordsInserted = 0; - long totalBytesWritten = 0; - long totalWriteErrors = 0; - for (HoodieWriteStat stat : stats) { - if (stat.getPrevCommit().equals(HoodieWriteStat.NULL_COMMIT)) { - totalFilesAdded += 1; - totalRecordsInserted += stat.getNumWrites(); - } else { - totalFilesUpdated += 1; - totalRecordsUpdated += stat.getNumUpdateWrites(); - } - totalBytesWritten += stat.getTotalWriteBytes(); - totalWriteErrors += stat.getTotalWriteErrors(); - } - rows.add(new String[] {path, String.valueOf(totalFilesAdded), - String.valueOf(totalFilesUpdated), String.valueOf(totalRecordsInserted), - String.valueOf(totalRecordsUpdated), - NumericUtils.humanReadableByteCount(totalBytesWritten), - String.valueOf(totalWriteErrors)}); - - } - return HoodiePrintHelper.print( - new String[] {"Partition Path", "Total Files Added", "Total Files Updated", - "Total Records Inserted", "Total Records Updated", "Total Bytes Written", - "Total Errors"}, rows.toArray(new String[rows.size()][])); - } - - @CliCommand(value = "commit showfiles", help = "Show file level details of a commit") - public String showCommitFiles( - @CliOption(key = {"commit"}, help = "Commit to show") - final String commitTime) throws Exception { - HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline(); - HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline().filterCompletedInstants(); - HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime); - - if (!timeline.containsInstant(commitInstant)) { - return "Commit " + commitTime + " not found in Commits " + timeline; - } - HoodieCommitMetadata meta = - HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(commitInstant).get()); - List rows = new ArrayList(); - for (Map.Entry> entry : meta.getPartitionToWriteStats() - .entrySet()) { - String path = entry.getKey(); - List stats = entry.getValue(); - for (HoodieWriteStat stat : stats) { - 
rows.add(new String[] {path, stat.getFileId(), stat.getPrevCommit(), - String.valueOf(stat.getNumUpdateWrites()), String.valueOf(stat.getNumWrites()), - String.valueOf(stat.getTotalWriteBytes()), - String.valueOf(stat.getTotalWriteErrors())}); - } - } - return HoodiePrintHelper.print( - new String[] {"Partition Path", "File ID", "Previous Commit", "Total Records Updated", - "Total Records Written", "Total Bytes Written", "Total Errors"}, - rows.toArray(new String[rows.size()][])); - } - - @CliAvailabilityIndicator({"commits compare"}) - public boolean isCompareCommitsAvailable() { - return HoodieCLI.tableMetadata != null; - } - - @CliCommand(value = "commits compare", help = "Compare commits with another Hoodie dataset") - public String compareCommits( - @CliOption(key = {"path"}, help = "Path of the dataset to compare to") - final String path) throws Exception { - HoodieTableMetaClient target = new HoodieTableMetaClient(HoodieCLI.fs, path); - HoodieTimeline targetTimeline = target.getActiveTimeline().getCommitsAndCompactionsTimeline().filterCompletedInstants();; - HoodieTableMetaClient source = HoodieCLI.tableMetadata; - HoodieTimeline sourceTimeline = source.getActiveTimeline().getCommitsAndCompactionsTimeline().filterCompletedInstants();; - String targetLatestCommit = - targetTimeline.getInstants().iterator().hasNext() ? "0" : targetTimeline.lastInstant().get().getTimestamp(); - String sourceLatestCommit = - sourceTimeline.getInstants().iterator().hasNext() ? 
"0" : sourceTimeline.lastInstant().get().getTimestamp(); - - if (sourceLatestCommit != null && - HoodieTimeline.compareTimestamps(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) { - // source is behind the target - List commitsToCatchup = - targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE) - .getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList()); - return "Source " + source.getTableConfig().getTableName() + " is behind by " - + commitsToCatchup.size() + " commits. Commits to catch up - " + commitsToCatchup; + HoodieCommitMetadata meta = + HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(commitInstant).get()); + List rows = new ArrayList(); + for (Map.Entry> entry : meta.getPartitionToWriteStats() + .entrySet()) { + String path = entry.getKey(); + List stats = entry.getValue(); + long totalFilesAdded = 0; + long totalFilesUpdated = 0; + long totalRecordsUpdated = 0; + long totalRecordsInserted = 0; + long totalBytesWritten = 0; + long totalWriteErrors = 0; + for (HoodieWriteStat stat : stats) { + if (stat.getPrevCommit().equals(HoodieWriteStat.NULL_COMMIT)) { + totalFilesAdded += 1; + totalRecordsInserted += stat.getNumWrites(); } else { - List commitsToCatchup = - sourceTimeline.findInstantsAfter(targetLatestCommit, Integer.MAX_VALUE) - .getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList()); - return "Source " + source.getTableConfig().getTableName() + " is ahead by " - + commitsToCatchup.size() + " commits. 
Commits to catch up - " + commitsToCatchup; + totalFilesUpdated += 1; + totalRecordsUpdated += stat.getNumUpdateWrites(); } - } + totalBytesWritten += stat.getTotalWriteBytes(); + totalWriteErrors += stat.getTotalWriteErrors(); + } + rows.add(new String[]{path, String.valueOf(totalFilesAdded), + String.valueOf(totalFilesUpdated), String.valueOf(totalRecordsInserted), + String.valueOf(totalRecordsUpdated), + NumericUtils.humanReadableByteCount(totalBytesWritten), + String.valueOf(totalWriteErrors)}); - @CliAvailabilityIndicator({"commits sync"}) - public boolean isSyncCommitsAvailable() { - return HoodieCLI.tableMetadata != null; } + return HoodiePrintHelper.print( + new String[]{"Partition Path", "Total Files Added", "Total Files Updated", + "Total Records Inserted", "Total Records Updated", "Total Bytes Written", + "Total Errors"}, rows.toArray(new String[rows.size()][])); + } - @CliCommand(value = "commits sync", help = "Compare commits with another Hoodie dataset") - public String syncCommits( - @CliOption(key = {"path"}, help = "Path of the dataset to compare to") - final String path) throws Exception { - HoodieCLI.syncTableMetadata = new HoodieTableMetaClient(HoodieCLI.fs, path); - HoodieCLI.state = HoodieCLI.CLIState.SYNC; - return "Load sync state between " + HoodieCLI.tableMetadata.getTableConfig().getTableName() - + " and " + HoodieCLI.syncTableMetadata.getTableConfig().getTableName(); + @CliCommand(value = "commit showfiles", help = "Show file level details of a commit") + public String showCommitFiles( + @CliOption(key = {"commit"}, help = "Commit to show") + final String commitTime) throws Exception { + HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline(); + HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline() + .filterCompletedInstants(); + HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, + commitTime); + + if (!timeline.containsInstant(commitInstant)) { + return 
"Commit " + commitTime + " not found in Commits " + timeline; } + HoodieCommitMetadata meta = + HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(commitInstant).get()); + List rows = new ArrayList(); + for (Map.Entry> entry : meta.getPartitionToWriteStats() + .entrySet()) { + String path = entry.getKey(); + List stats = entry.getValue(); + for (HoodieWriteStat stat : stats) { + rows.add(new String[]{path, stat.getFileId(), stat.getPrevCommit(), + String.valueOf(stat.getNumUpdateWrites()), String.valueOf(stat.getNumWrites()), + String.valueOf(stat.getTotalWriteBytes()), + String.valueOf(stat.getTotalWriteErrors())}); + } + } + return HoodiePrintHelper.print( + new String[]{"Partition Path", "File ID", "Previous Commit", "Total Records Updated", + "Total Records Written", "Total Bytes Written", "Total Errors"}, + rows.toArray(new String[rows.size()][])); + } + + @CliAvailabilityIndicator({"commits compare"}) + public boolean isCompareCommitsAvailable() { + return HoodieCLI.tableMetadata != null; + } + + @CliCommand(value = "commits compare", help = "Compare commits with another Hoodie dataset") + public String compareCommits( + @CliOption(key = {"path"}, help = "Path of the dataset to compare to") + final String path) throws Exception { + HoodieTableMetaClient target = new HoodieTableMetaClient(HoodieCLI.fs, path); + HoodieTimeline targetTimeline = target.getActiveTimeline().getCommitsAndCompactionsTimeline() + .filterCompletedInstants(); + ; + HoodieTableMetaClient source = HoodieCLI.tableMetadata; + HoodieTimeline sourceTimeline = source.getActiveTimeline().getCommitsAndCompactionsTimeline() + .filterCompletedInstants(); + ; + String targetLatestCommit = + targetTimeline.getInstants().iterator().hasNext() ? "0" + : targetTimeline.lastInstant().get().getTimestamp(); + String sourceLatestCommit = + sourceTimeline.getInstants().iterator().hasNext() ? 
"0" + : sourceTimeline.lastInstant().get().getTimestamp(); + + if (sourceLatestCommit != null && + HoodieTimeline + .compareTimestamps(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) { + // source is behind the target + List commitsToCatchup = + targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE) + .getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList()); + return "Source " + source.getTableConfig().getTableName() + " is behind by " + + commitsToCatchup.size() + " commits. Commits to catch up - " + commitsToCatchup; + } else { + List commitsToCatchup = + sourceTimeline.findInstantsAfter(targetLatestCommit, Integer.MAX_VALUE) + .getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList()); + return "Source " + source.getTableConfig().getTableName() + " is ahead by " + + commitsToCatchup.size() + " commits. Commits to catch up - " + commitsToCatchup; + } + } + + @CliAvailabilityIndicator({"commits sync"}) + public boolean isSyncCommitsAvailable() { + return HoodieCLI.tableMetadata != null; + } + + @CliCommand(value = "commits sync", help = "Compare commits with another Hoodie dataset") + public String syncCommits( + @CliOption(key = {"path"}, help = "Path of the dataset to compare to") + final String path) throws Exception { + HoodieCLI.syncTableMetadata = new HoodieTableMetaClient(HoodieCLI.fs, path); + HoodieCLI.state = HoodieCLI.CLIState.SYNC; + return "Load sync state between " + HoodieCLI.tableMetadata.getTableConfig().getTableName() + + " and " + HoodieCLI.syncTableMetadata.getTableConfig().getTableName(); + } } diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/DatasetsCommand.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/DatasetsCommand.java index 9e17da1a4..fc1f22a3a 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/DatasetsCommand.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/DatasetsCommand.java @@ -18,24 +18,24 @@ 
package com.uber.hoodie.cli.commands; import com.uber.hoodie.cli.HoodieCLI; import com.uber.hoodie.common.table.HoodieTableMetaClient; +import java.io.IOException; import org.springframework.shell.core.CommandMarker; import org.springframework.shell.core.annotation.CliCommand; import org.springframework.shell.core.annotation.CliOption; import org.springframework.stereotype.Component; -import java.io.IOException; - @Component public class DatasetsCommand implements CommandMarker { - @CliCommand(value = "connect", help = "Connect to a hoodie dataset") - public String connect( - @CliOption(key = {"path"}, mandatory = true, help = "Base Path of the dataset") - final String path) throws IOException { - boolean initialized = HoodieCLI.initConf(); - HoodieCLI.initFS(initialized); - HoodieCLI.setTableMetadata(new HoodieTableMetaClient(HoodieCLI.fs, path)); - HoodieCLI.state = HoodieCLI.CLIState.DATASET; - return "Metadata for table " + HoodieCLI.tableMetadata.getTableConfig().getTableName() - + " loaded"; - } + + @CliCommand(value = "connect", help = "Connect to a hoodie dataset") + public String connect( + @CliOption(key = {"path"}, mandatory = true, help = "Base Path of the dataset") + final String path) throws IOException { + boolean initialized = HoodieCLI.initConf(); + HoodieCLI.initFS(initialized); + HoodieCLI.setTableMetadata(new HoodieTableMetaClient(HoodieCLI.fs, path)); + HoodieCLI.state = HoodieCLI.CLIState.DATASET; + return "Metadata for table " + HoodieCLI.tableMetadata.getTableConfig().getTableName() + + " loaded"; + } } diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/HDFSParquetImportCommand.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/HDFSParquetImportCommand.java index ca6e1ab05..c9d2f98b5 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/HDFSParquetImportCommand.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/HDFSParquetImportCommand.java @@ -33,58 +33,59 @@ import 
org.springframework.stereotype.Component; @Component public class HDFSParquetImportCommand implements CommandMarker { - private static Logger log = LogManager.getLogger(HDFSParquetImportCommand.class); + private static Logger log = LogManager.getLogger(HDFSParquetImportCommand.class); - @CliCommand(value = "hdfsparquetimport", help = "Imports hdfs dataset to a hoodie dataset") - public String convert( - @CliOption(key = "srcPath", mandatory = true, help = "Base path for the input dataset") - final String srcPath, - @CliOption(key = "srcType", mandatory = true, help = "Source type for the input dataset") - final String srcType, - @CliOption(key = "targetPath", mandatory = true, help = "Base path for the target hoodie dataset") - final String targetPath, - @CliOption(key = "tableName", mandatory = true, help = "Table name") - final String tableName, - @CliOption(key = "tableType", mandatory = true, help = "Table type") - final String tableType, - @CliOption(key = "rowKeyField", mandatory = true, help = "Row key field name") - final String rowKeyField, - @CliOption(key = "partitionPathField", mandatory = true, help = "Partition path field name") - final String partitionPathField, - @CliOption(key = {"parallelism"}, mandatory = true, help = "Parallelism for hoodie insert") - final String parallelism, - @CliOption(key = "schemaFilePath", mandatory = true, help = "Path for Avro schema file") - final String schemaFilePath, - @CliOption(key = "format", mandatory = true, help = "Format for the input data") - final String format, - @CliOption(key = "sparkMemory", mandatory = true, help = "Spark executor memory") - final String sparkMemory, - @CliOption(key = "retry", mandatory = true, help = "Number of retries") - final String retry) - throws Exception { + @CliCommand(value = "hdfsparquetimport", help = "Imports hdfs dataset to a hoodie dataset") + public String convert( + @CliOption(key = "srcPath", mandatory = true, help = "Base path for the input dataset") + final String 
srcPath, + @CliOption(key = "srcType", mandatory = true, help = "Source type for the input dataset") + final String srcType, + @CliOption(key = "targetPath", mandatory = true, help = "Base path for the target hoodie dataset") + final String targetPath, + @CliOption(key = "tableName", mandatory = true, help = "Table name") + final String tableName, + @CliOption(key = "tableType", mandatory = true, help = "Table type") + final String tableType, + @CliOption(key = "rowKeyField", mandatory = true, help = "Row key field name") + final String rowKeyField, + @CliOption(key = "partitionPathField", mandatory = true, help = "Partition path field name") + final String partitionPathField, + @CliOption(key = {"parallelism"}, mandatory = true, help = "Parallelism for hoodie insert") + final String parallelism, + @CliOption(key = "schemaFilePath", mandatory = true, help = "Path for Avro schema file") + final String schemaFilePath, + @CliOption(key = "format", mandatory = true, help = "Format for the input data") + final String format, + @CliOption(key = "sparkMemory", mandatory = true, help = "Spark executor memory") + final String sparkMemory, + @CliOption(key = "retry", mandatory = true, help = "Number of retries") + final String retry) + throws Exception { - validate(format, srcType); + validate(format, srcType); - boolean initialized = HoodieCLI.initConf(); - HoodieCLI.initFS(initialized); - String sparkPropertiesPath = Utils - .getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())); - SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); + boolean initialized = HoodieCLI.initConf(); + HoodieCLI.initFS(initialized); + String sparkPropertiesPath = Utils + .getDefaultPropertiesFile( + scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())); + SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); - sparkLauncher.addAppArgs(SparkCommand.IMPORT.toString(), srcPath, 
targetPath, tableName, - tableType, rowKeyField, partitionPathField, parallelism, schemaFilePath, sparkMemory, - retry); - Process process = sparkLauncher.launch(); - InputStreamConsumer.captureOutput(process); - int exitCode = process.waitFor(); - if (exitCode != 0) { - return "Failed to import dataset to hoodie format"; - } - return "Dataset imported to hoodie format"; + sparkLauncher.addAppArgs(SparkCommand.IMPORT.toString(), srcPath, targetPath, tableName, + tableType, rowKeyField, partitionPathField, parallelism, schemaFilePath, sparkMemory, + retry); + Process process = sparkLauncher.launch(); + InputStreamConsumer.captureOutput(process); + int exitCode = process.waitFor(); + if (exitCode != 0) { + return "Failed to import dataset to hoodie format"; } + return "Dataset imported to hoodie format"; + } - private void validate(String format, String srcType) { - (new HDFSParquetImporter.FormatValidator()).validate("format", format); - (new HDFSParquetImporter.SourceTypeValidator()).validate("srcType", srcType); - } + private void validate(String format, String srcType) { + (new HDFSParquetImporter.FormatValidator()).validate("format", format); + (new HDFSParquetImporter.SourceTypeValidator()).validate("srcType", srcType); + } } diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/HoodieSyncCommand.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/HoodieSyncCommand.java index 15fc04a43..b19608bed 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/HoodieSyncCommand.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/HoodieSyncCommand.java @@ -16,105 +16,109 @@ package com.uber.hoodie.cli.commands; +import com.uber.hoodie.cli.HoodieCLI; import com.uber.hoodie.cli.utils.CommitUtil; import com.uber.hoodie.cli.utils.HiveUtil; -import com.uber.hoodie.cli.HoodieCLI; import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.table.HoodieTimeline; import 
com.uber.hoodie.common.table.timeline.HoodieInstant; +import java.util.List; +import java.util.stream.Collectors; import org.springframework.shell.core.CommandMarker; import org.springframework.shell.core.annotation.CliAvailabilityIndicator; import org.springframework.shell.core.annotation.CliCommand; import org.springframework.shell.core.annotation.CliOption; import org.springframework.stereotype.Component; -import java.util.List; -import java.util.stream.Collectors; - @Component public class HoodieSyncCommand implements CommandMarker { - @CliAvailabilityIndicator({"sync validate"}) - public boolean isSyncVerificationAvailable() { - return HoodieCLI.tableMetadata != null && HoodieCLI.syncTableMetadata != null; + + @CliAvailabilityIndicator({"sync validate"}) + public boolean isSyncVerificationAvailable() { + return HoodieCLI.tableMetadata != null && HoodieCLI.syncTableMetadata != null; + } + + @CliCommand(value = "sync validate", help = "Validate the sync by counting the number of records") + public String validateSync( + @CliOption(key = {"mode"}, unspecifiedDefaultValue = "complete", help = "Check mode") + final String mode, + @CliOption(key = { + "sourceDb"}, unspecifiedDefaultValue = "rawdata", help = "source database") + final String srcDb, + @CliOption(key = { + "targetDb"}, unspecifiedDefaultValue = "dwh_hoodie", help = "target database") + final String tgtDb, + @CliOption(key = { + "partitionCount"}, unspecifiedDefaultValue = "5", help = "total number of recent partitions to validate") + final int partitionCount, + @CliOption(key = { + "hiveServerUrl"}, mandatory = true, help = "hiveServerURL to connect to") + final String hiveServerUrl, + @CliOption(key = { + "hiveUser"}, mandatory = false, unspecifiedDefaultValue = "", help = "hive username to connect to") + final String hiveUser, + @CliOption(key = { + "hivePass"}, mandatory = true, unspecifiedDefaultValue = "", help = "hive password to connect to") + final String hivePass) throws Exception { + 
HoodieTableMetaClient target = HoodieCLI.syncTableMetadata; + HoodieTimeline targetTimeline = target.getActiveTimeline().getCommitsAndCompactionsTimeline(); + HoodieTableMetaClient source = HoodieCLI.tableMetadata; + HoodieTimeline sourceTimeline = source.getActiveTimeline().getCommitsAndCompactionsTimeline(); + long sourceCount = 0; + long targetCount = 0; + if ("complete".equals(mode)) { + sourceCount = HiveUtil.countRecords(hiveServerUrl, source, srcDb, hiveUser, hivePass); + targetCount = HiveUtil.countRecords(hiveServerUrl, target, tgtDb, hiveUser, hivePass); + } else if ("latestPartitions".equals(mode)) { + sourceCount = HiveUtil + .countRecords(hiveServerUrl, source, srcDb, partitionCount, hiveUser, hivePass); + targetCount = HiveUtil + .countRecords(hiveServerUrl, target, tgtDb, partitionCount, hiveUser, hivePass); } - @CliCommand(value = "sync validate", help = "Validate the sync by counting the number of records") - public String validateSync( - @CliOption(key = {"mode"}, unspecifiedDefaultValue = "complete", help = "Check mode") - final String mode, - @CliOption(key = { - "sourceDb"}, unspecifiedDefaultValue = "rawdata", help = "source database") - final String srcDb, - @CliOption(key = { - "targetDb"}, unspecifiedDefaultValue = "dwh_hoodie", help = "target database") - final String tgtDb, - @CliOption(key = { - "partitionCount"}, unspecifiedDefaultValue = "5", help = "total number of recent partitions to validate") - final int partitionCount, - @CliOption(key = { - "hiveServerUrl"}, mandatory = true, help = "hiveServerURL to connect to") - final String hiveServerUrl, - @CliOption(key = { - "hiveUser"}, mandatory = false, unspecifiedDefaultValue = "", help = "hive username to connect to") - final String hiveUser, - @CliOption(key = { - "hivePass"}, mandatory = true, unspecifiedDefaultValue = "", help = "hive password to connect to") - final String hivePass) throws Exception { - HoodieTableMetaClient target = HoodieCLI.syncTableMetadata; - HoodieTimeline 
targetTimeline = target.getActiveTimeline().getCommitsAndCompactionsTimeline(); - HoodieTableMetaClient source = HoodieCLI.tableMetadata; - HoodieTimeline sourceTimeline = source.getActiveTimeline().getCommitsAndCompactionsTimeline(); - long sourceCount = 0; - long targetCount = 0; - if ("complete".equals(mode)) { - sourceCount = HiveUtil.countRecords(hiveServerUrl, source, srcDb, hiveUser, hivePass); - targetCount = HiveUtil.countRecords(hiveServerUrl, target, tgtDb, hiveUser, hivePass); - } else if ("latestPartitions".equals(mode)) { - sourceCount = HiveUtil.countRecords(hiveServerUrl, source, srcDb, partitionCount, hiveUser, hivePass); - targetCount = HiveUtil.countRecords(hiveServerUrl, target, tgtDb, partitionCount, hiveUser, hivePass); - } + String targetLatestCommit = + targetTimeline.getInstants().iterator().hasNext() ? "0" + : targetTimeline.lastInstant().get().getTimestamp(); + String sourceLatestCommit = + sourceTimeline.getInstants().iterator().hasNext() ? "0" + : sourceTimeline.lastInstant().get().getTimestamp(); - String targetLatestCommit = - targetTimeline.getInstants().iterator().hasNext() ? "0" : targetTimeline.lastInstant().get().getTimestamp(); - String sourceLatestCommit = - sourceTimeline.getInstants().iterator().hasNext() ? 
"0" : sourceTimeline.lastInstant().get().getTimestamp(); + if (sourceLatestCommit != null && HoodieTimeline + .compareTimestamps(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) { + // source is behind the target + List commitsToCatchup = + targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE).getInstants() + .collect(Collectors.toList()); + if (commitsToCatchup.isEmpty()) { + return "Count difference now is (count(" + target.getTableConfig().getTableName() + + ") - count(" + source.getTableConfig().getTableName() + ") == " + (targetCount + - sourceCount); + } else { + long newInserts = CommitUtil.countNewRecords(target, + commitsToCatchup.stream().map(HoodieInstant::getTimestamp) + .collect(Collectors.toList())); + return "Count difference now is (count(" + target.getTableConfig().getTableName() + + ") - count(" + source.getTableConfig().getTableName() + ") == " + (targetCount + - sourceCount) + ". Catch up count is " + newInserts; + } + } else { + List commitsToCatchup = + sourceTimeline.findInstantsAfter(targetLatestCommit, Integer.MAX_VALUE).getInstants() + .collect(Collectors.toList()); + if (commitsToCatchup.isEmpty()) { + return "Count difference now is (count(" + source.getTableConfig().getTableName() + + ") - count(" + target.getTableConfig().getTableName() + ") == " + (sourceCount + - targetCount); + } else { + long newInserts = CommitUtil.countNewRecords(source, + commitsToCatchup.stream().map(HoodieInstant::getTimestamp) + .collect(Collectors.toList())); + return "Count difference now is (count(" + source.getTableConfig().getTableName() + + ") - count(" + target.getTableConfig().getTableName() + ") == " + (sourceCount + - targetCount) + ". 
Catch up count is " + newInserts; + } - if (sourceLatestCommit != null && HoodieTimeline - .compareTimestamps(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) { - // source is behind the target - List commitsToCatchup = - targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE).getInstants() - .collect(Collectors.toList()); - if (commitsToCatchup.isEmpty()) { - return "Count difference now is (count(" + target.getTableConfig().getTableName() - + ") - count(" + source.getTableConfig().getTableName() + ") == " + (targetCount - - sourceCount); - } else { - long newInserts = CommitUtil.countNewRecords(target, - commitsToCatchup.stream().map(HoodieInstant::getTimestamp) - .collect(Collectors.toList())); - return "Count difference now is (count(" + target.getTableConfig().getTableName() - + ") - count(" + source.getTableConfig().getTableName() + ") == " + (targetCount - - sourceCount) + ". Catch up count is " + newInserts; - } - } else { - List commitsToCatchup = - sourceTimeline.findInstantsAfter(targetLatestCommit, Integer.MAX_VALUE).getInstants() - .collect(Collectors.toList()); - if (commitsToCatchup.isEmpty()) { - return "Count difference now is (count(" + source.getTableConfig().getTableName() - + ") - count(" + target.getTableConfig().getTableName() + ") == " + (sourceCount - - targetCount); - } else { - long newInserts = CommitUtil.countNewRecords(source, - commitsToCatchup.stream().map(HoodieInstant::getTimestamp) - .collect(Collectors.toList())); - return "Count difference now is (count(" + source.getTableConfig().getTableName() - + ") - count(" + target.getTableConfig().getTableName() + ") == " + (sourceCount - - targetCount) + ". 
Catch up count is " + newInserts; - } - - } } + } } diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/RepairsCommand.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/RepairsCommand.java index e7998d9d8..1db6075e1 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/RepairsCommand.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/RepairsCommand.java @@ -22,7 +22,8 @@ import com.uber.hoodie.cli.utils.InputStreamConsumer; import com.uber.hoodie.cli.utils.SparkUtil; import com.uber.hoodie.common.model.HoodiePartitionMetadata; import com.uber.hoodie.common.util.FSUtils; - +import java.io.IOException; +import java.util.List; import org.apache.hadoop.fs.Path; import org.apache.spark.launcher.SparkLauncher; import org.springframework.shell.core.CommandMarker; @@ -31,80 +32,80 @@ import org.springframework.shell.core.annotation.CliCommand; import org.springframework.shell.core.annotation.CliOption; import org.springframework.stereotype.Component; -import java.io.IOException; -import java.util.List; - @Component public class RepairsCommand implements CommandMarker { - @CliAvailabilityIndicator({"repair deduplicate"}) - public boolean isRepairDeduplicateAvailable() { - return HoodieCLI.tableMetadata != null; + @CliAvailabilityIndicator({"repair deduplicate"}) + public boolean isRepairDeduplicateAvailable() { + return HoodieCLI.tableMetadata != null; + } + + @CliAvailabilityIndicator({"repair addpartitionmeta"}) + public boolean isRepairAddPartitionMetaAvailable() { + return HoodieCLI.tableMetadata != null; + } + + @CliCommand(value = "repair deduplicate", help = "De-duplicate a partition path contains duplicates & produce repaired files to replace with") + public String deduplicate( + @CliOption(key = { + "duplicatedPartitionPath"}, help = "Partition Path containing the duplicates", mandatory = true) + final String duplicatedPartitionPath, + @CliOption(key = { + "repairedOutputPath"}, help = "Location to place the 
repaired files", mandatory = true) + final String repairedOutputPath, + @CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path", mandatory = true) + final String sparkPropertiesPath) throws Exception { + SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); + sparkLauncher + .addAppArgs(SparkMain.SparkCommand.DEDUPLICATE.toString(), duplicatedPartitionPath, + repairedOutputPath, HoodieCLI.tableMetadata.getBasePath()); + Process process = sparkLauncher.launch(); + InputStreamConsumer.captureOutput(process); + int exitCode = process.waitFor(); + + if (exitCode != 0) { + return "Deduplicated files placed in: " + repairedOutputPath; } + return "Deduplication failed "; + } - @CliAvailabilityIndicator({"repair addpartitionmeta"}) - public boolean isRepairAddPartitionMetaAvailable() { - return HoodieCLI.tableMetadata != null; - } - @CliCommand(value = "repair deduplicate", help = "De-duplicate a partition path contains duplicates & produce repaired files to replace with") - public String deduplicate( - @CliOption(key = { - "duplicatedPartitionPath"}, help = "Partition Path containing the duplicates", mandatory = true) - final String duplicatedPartitionPath, - @CliOption(key = {"repairedOutputPath"}, help = "Location to place the repaired files", mandatory = true) - final String repairedOutputPath, - @CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path", mandatory = true) - final String sparkPropertiesPath) throws Exception { - SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); - sparkLauncher - .addAppArgs(SparkMain.SparkCommand.DEDUPLICATE.toString(), duplicatedPartitionPath, - repairedOutputPath, HoodieCLI.tableMetadata.getBasePath()); - Process process = sparkLauncher.launch(); - InputStreamConsumer.captureOutput(process); - int exitCode = process.waitFor(); + @CliCommand(value = "repair addpartitionmeta", help = "Add partition metadata to a dataset, if not present") + public String 
addPartitionMeta( + @CliOption(key = {"dryrun"}, + help = "Should we actually add or just print what would be done", + unspecifiedDefaultValue = "true") + final boolean dryRun) throws IOException { - if (exitCode != 0) { - return "Deduplicated files placed in: " + repairedOutputPath; + String latestCommit = HoodieCLI.tableMetadata.getActiveTimeline().getCommitTimeline() + .lastInstant().get().getTimestamp(); + List partitionPaths = FSUtils.getAllFoldersThreeLevelsDown(HoodieCLI.fs, + HoodieCLI.tableMetadata.getBasePath()); + Path basePath = new Path(HoodieCLI.tableMetadata.getBasePath()); + String[][] rows = new String[partitionPaths.size() + 1][]; + + int ind = 0; + for (String partition : partitionPaths) { + Path partitionPath = new Path(basePath, partition); + String[] row = new String[3]; + row[0] = partition; + row[1] = "Yes"; + row[2] = "None"; + if (!HoodiePartitionMetadata.hasPartitionMetadata(HoodieCLI.fs, partitionPath)) { + row[1] = "No"; + if (!dryRun) { + HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata( + HoodieCLI.fs, + latestCommit, + basePath, + partitionPath); + partitionMetadata.trySave(0); } - return "Deduplication failed "; + } + rows[ind++] = row; } - - - @CliCommand(value = "repair addpartitionmeta", help = "Add partition metadata to a dataset, if not present") - public String addPartitionMeta( - @CliOption(key = {"dryrun"}, - help = "Should we actually add or just print what would be done", - unspecifiedDefaultValue = "true") - final boolean dryRun) throws IOException { - - String latestCommit = HoodieCLI.tableMetadata.getActiveTimeline().getCommitTimeline().lastInstant().get().getTimestamp(); - List partitionPaths = FSUtils.getAllFoldersThreeLevelsDown(HoodieCLI.fs, - HoodieCLI.tableMetadata.getBasePath()); - Path basePath = new Path(HoodieCLI.tableMetadata.getBasePath()); - String[][] rows = new String[partitionPaths.size() + 1][]; - - int ind = 0; - for (String partition: partitionPaths) { - Path partitionPath = new 
Path(basePath, partition); - String[] row = new String[3]; - row[0] = partition; row[1] = "Yes"; row[2] = "None"; - if (!HoodiePartitionMetadata.hasPartitionMetadata(HoodieCLI.fs, partitionPath)) { - row[1] = "No"; - if (!dryRun) { - HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata( - HoodieCLI.fs, - latestCommit, - basePath, - partitionPath); - partitionMetadata.trySave(0); - } - } - rows[ind++] = row; - } - - return HoodiePrintHelper.print( - new String[] {"Partition Path", "Metadata Present?", "Action"}, rows); - } + return HoodiePrintHelper.print( + new String[]{"Partition Path", "Metadata Present?", "Action"}, rows); + } } diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/SavepointsCommand.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/SavepointsCommand.java index 350c9d81e..4f5b2c9a3 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/SavepointsCommand.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/SavepointsCommand.java @@ -27,6 +27,10 @@ import com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.config.HoodieIndexConfig; import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.index.HoodieIndex; +import java.io.IOException; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.launcher.SparkLauncher; import org.springframework.shell.core.CommandMarker; @@ -35,122 +39,118 @@ import org.springframework.shell.core.annotation.CliCommand; import org.springframework.shell.core.annotation.CliOption; import org.springframework.stereotype.Component; -import java.io.IOException; -import java.util.Collections; -import java.util.List; -import java.util.stream.Collectors; - @Component public class SavepointsCommand implements CommandMarker { - @CliAvailabilityIndicator({"savepoints show"}) - public boolean isShowAvailable() { - return 
HoodieCLI.tableMetadata != null; + + @CliAvailabilityIndicator({"savepoints show"}) + public boolean isShowAvailable() { + return HoodieCLI.tableMetadata != null; + } + + @CliAvailabilityIndicator({"savepoints refresh"}) + public boolean isRefreshAvailable() { + return HoodieCLI.tableMetadata != null; + } + + + @CliAvailabilityIndicator({"savepoint create"}) + public boolean isCreateSavepointAvailable() { + return HoodieCLI.tableMetadata != null; + } + + @CliAvailabilityIndicator({"savepoint rollback"}) + public boolean isRollbackToSavepointAvailable() { + return HoodieCLI.tableMetadata != null && !HoodieCLI.tableMetadata.getActiveTimeline() + .getSavePointTimeline().filterCompletedInstants().empty(); + } + + @CliCommand(value = "savepoints show", help = "Show the savepoints") + public String showSavepoints() throws IOException { + HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline(); + HoodieTimeline timeline = activeTimeline.getSavePointTimeline().filterCompletedInstants(); + List commits = timeline.getInstants().collect(Collectors.toList()); + String[][] rows = new String[commits.size()][]; + Collections.reverse(commits); + for (int i = 0; i < commits.size(); i++) { + HoodieInstant commit = commits.get(i); + rows[i] = new String[]{commit.getTimestamp()}; + } + return HoodiePrintHelper.print(new String[]{"SavepointTime"}, rows); + } + + @CliCommand(value = "savepoint create", help = "Savepoint a commit") + public String savepoint( + @CliOption(key = {"commit"}, help = "Commit to savepoint") + final String commitTime, + @CliOption(key = {"user"}, help = "User who is creating the savepoint") + final String user, + @CliOption(key = {"comments"}, help = "Comments for creating the savepoint") + final String comments) throws Exception { + HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline(); + HoodieTimeline timeline = activeTimeline.getCommitTimeline().filterCompletedInstants(); + HoodieInstant + commitInstant 
= new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime); + + if (!timeline.containsInstant(commitInstant)) { + return "Commit " + commitTime + " not found in Commits " + timeline; } - @CliAvailabilityIndicator({"savepoints refresh"}) - public boolean isRefreshAvailable() { - return HoodieCLI.tableMetadata != null; + HoodieWriteClient client = createHoodieClient(null, HoodieCLI.tableMetadata.getBasePath()); + if (client.savepoint(commitTime, user, comments)) { + // Refresh the current + refreshMetaClient(); + return String.format("The commit \"%s\" has been savepointed.", commitTime); + } + return String.format("Failed: Could not savepoint commit \"%s\".", commitTime); + } + + @CliCommand(value = "savepoint rollback", help = "Savepoint a commit") + public String rollbackToSavepoint( + @CliOption(key = {"savepoint"}, help = "Savepoint to rollback") + final String commitTime, + @CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path") + final String sparkPropertiesPath) throws Exception { + HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline(); + HoodieTimeline timeline = activeTimeline.getCommitTimeline().filterCompletedInstants(); + HoodieInstant + commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime); + + if (!timeline.containsInstant(commitInstant)) { + return "Commit " + commitTime + " not found in Commits " + timeline; } - - @CliAvailabilityIndicator({"savepoint create"}) - public boolean isCreateSavepointAvailable() { - return HoodieCLI.tableMetadata != null; - } - - @CliAvailabilityIndicator({"savepoint rollback"}) - public boolean isRollbackToSavepointAvailable() { - return HoodieCLI.tableMetadata != null && !HoodieCLI.tableMetadata.getActiveTimeline().getSavePointTimeline().filterCompletedInstants().empty(); - } - - @CliCommand(value = "savepoints show", help = "Show the savepoints") - public String showSavepoints() throws IOException { - HoodieActiveTimeline 
activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline(); - HoodieTimeline timeline = activeTimeline.getSavePointTimeline().filterCompletedInstants(); - List commits = timeline.getInstants().collect(Collectors.toList()); - String[][] rows = new String[commits.size()][]; - Collections.reverse(commits); - for (int i = 0; i < commits.size(); i++) { - HoodieInstant commit = commits.get(i); - rows[i] = new String[] {commit.getTimestamp()}; - } - return HoodiePrintHelper.print(new String[] {"SavepointTime"}, rows); - } - - @CliCommand(value = "savepoint create", help = "Savepoint a commit") - public String savepoint( - @CliOption(key = {"commit"}, help = "Commit to savepoint") - final String commitTime, - @CliOption(key = {"user"}, help = "User who is creating the savepoint") - final String user, - @CliOption(key = {"comments"}, help = "Comments for creating the savepoint") - final String comments) throws Exception { - HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline(); - HoodieTimeline timeline = activeTimeline.getCommitTimeline().filterCompletedInstants(); - HoodieInstant - commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime); - - if (!timeline.containsInstant(commitInstant)) { - return "Commit " + commitTime + " not found in Commits " + timeline; - } - - HoodieWriteClient client = createHoodieClient(null, HoodieCLI.tableMetadata.getBasePath()); - if (client.savepoint(commitTime, user, comments)) { - // Refresh the current - refreshMetaClient(); - return String.format("The commit \"%s\" has been savepointed.", commitTime); - } - return String.format("Failed: Could not savepoint commit \"%s\".", commitTime); - } - - @CliCommand(value = "savepoint rollback", help = "Savepoint a commit") - public String rollbackToSavepoint( - @CliOption(key = {"savepoint"}, help = "Savepoint to rollback") - final String commitTime, - @CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path") - final String 
sparkPropertiesPath) throws Exception { - HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline(); - HoodieTimeline timeline = activeTimeline.getCommitTimeline().filterCompletedInstants(); - HoodieInstant - commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime); - - if (!timeline.containsInstant(commitInstant)) { - return "Commit " + commitTime + " not found in Commits " + timeline; - } - - SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); - sparkLauncher.addAppArgs(SparkMain.SparkCommand.ROLLBACK_TO_SAVEPOINT.toString(), - commitTime, - HoodieCLI.tableMetadata.getBasePath()); - Process process = sparkLauncher.launch(); - InputStreamConsumer.captureOutput(process); - int exitCode = process.waitFor(); - // Refresh the current - refreshMetaClient(); - if (exitCode != 0) { - return "Savepoint " + commitTime + " failed to roll back"; - } - return "Savepoint " + commitTime + " rolled back"; + SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); + sparkLauncher.addAppArgs(SparkMain.SparkCommand.ROLLBACK_TO_SAVEPOINT.toString(), + commitTime, + HoodieCLI.tableMetadata.getBasePath()); + Process process = sparkLauncher.launch(); + InputStreamConsumer.captureOutput(process); + int exitCode = process.waitFor(); + // Refresh the current + refreshMetaClient(); + if (exitCode != 0) { + return "Savepoint " + commitTime + " failed to roll back"; } + return "Savepoint " + commitTime + " rolled back"; + } - @CliCommand(value = "savepoints refresh", help = "Refresh the savepoints") - public String refreshMetaClient() throws IOException { - HoodieTableMetaClient metadata = - new HoodieTableMetaClient(HoodieCLI.fs, HoodieCLI.tableMetadata.getBasePath()); - HoodieCLI.setTableMetadata(metadata); - return "Metadata for table " + metadata.getTableConfig().getTableName() + " refreshed."; - } - - private static HoodieWriteClient createHoodieClient(JavaSparkContext jsc, String basePath) - 
throws Exception { - HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) - .withIndexConfig( - HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) - .build(); - return new HoodieWriteClient(jsc, config, false); - } + @CliCommand(value = "savepoints refresh", help = "Refresh the savepoints") + public String refreshMetaClient() throws IOException { + HoodieTableMetaClient metadata = + new HoodieTableMetaClient(HoodieCLI.fs, HoodieCLI.tableMetadata.getBasePath()); + HoodieCLI.setTableMetadata(metadata); + return "Metadata for table " + metadata.getTableConfig().getTableName() + " refreshed."; + } + private static HoodieWriteClient createHoodieClient(JavaSparkContext jsc, String basePath) + throws Exception { + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) + .withIndexConfig( + HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) + .build(); + return new HoodieWriteClient(jsc, config, false); + } } diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/SparkMain.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/SparkMain.java index 1564b87b2..aba2d9da8 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/SparkMain.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/SparkMain.java @@ -30,109 +30,110 @@ import org.apache.spark.sql.SQLContext; public class SparkMain { - protected final static Logger LOG = Logger.getLogger(SparkMain.class); + protected final static Logger LOG = Logger.getLogger(SparkMain.class); - /** - * Commands - */ - enum SparkCommand { - ROLLBACK, - DEDUPLICATE, - ROLLBACK_TO_SAVEPOINT, - SAVEPOINT, - IMPORT + /** + * Commands + */ + enum SparkCommand { + ROLLBACK, + DEDUPLICATE, + ROLLBACK_TO_SAVEPOINT, + SAVEPOINT, + IMPORT + } + + public static void main(String[] args) throws Exception { + String command = args[0]; + LOG.info("Invoking SparkMain:" + command); + + SparkCommand cmd = 
SparkCommand.valueOf(command); + + JavaSparkContext jsc = SparkUtil.initJavaSparkConf("hoodie-cli-" + command); + int returnCode = 0; + switch (cmd) { + case ROLLBACK: + assert (args.length == 3); + returnCode = rollback(jsc, args[1], args[2]); + break; + case DEDUPLICATE: + assert (args.length == 4); + returnCode = deduplicatePartitionPath(jsc, args[1], args[2], args[3]); + break; + case ROLLBACK_TO_SAVEPOINT: + assert (args.length == 3); + returnCode = rollbackToSavepoint(jsc, args[1], args[2]); + break; + case IMPORT: + assert (args.length == 11); + returnCode = dataImport(jsc, args[1], args[2], args[3], args[4], args[5], args[6], + Integer.parseInt(args[7]), args[8], SparkUtil.DEFUALT_SPARK_MASTER, args[9], + Integer.parseInt(args[10])); + break; } - public static void main(String[] args) throws Exception { - String command = args[0]; - LOG.info("Invoking SparkMain:" + command); + System.exit(returnCode); + } - SparkCommand cmd = SparkCommand.valueOf(command); + private static int dataImport(JavaSparkContext jsc, String srcPath, String targetPath, + String tableName, String tableType, String rowKey, String partitionKey, int parallelism, + String schemaFile, String sparkMaster, String sparkMemory, int retry) throws Exception { + HDFSParquetImporter.Config cfg = new HDFSParquetImporter.Config(); + cfg.srcPath = srcPath; + cfg.targetPath = targetPath; + cfg.tableName = tableName; + cfg.tableType = tableType; + cfg.rowKey = rowKey; + cfg.partitionKey = partitionKey; + cfg.parallelism = parallelism; + cfg.schemaFile = schemaFile; + jsc.getConf().set("spark.executor.memory", sparkMemory); + return new HDFSParquetImporter(cfg).dataImport(jsc, retry); + } - JavaSparkContext jsc = SparkUtil.initJavaSparkConf("hoodie-cli-" + command); - int returnCode = 0; - switch(cmd) { - case ROLLBACK: - assert (args.length == 3); - returnCode = rollback(jsc, args[1], args[2]); - break; - case DEDUPLICATE: - assert (args.length == 4); - returnCode = deduplicatePartitionPath(jsc, 
args[1], args[2], args[3]); - break; - case ROLLBACK_TO_SAVEPOINT: - assert (args.length == 3); - returnCode = rollbackToSavepoint(jsc, args[1], args[2]); - break; - case IMPORT: - assert (args.length == 11); - returnCode = dataImport(jsc, args[1], args[2], args[3], args[4], args[5], args[6], - Integer.parseInt(args[7]), args[8], SparkUtil.DEFUALT_SPARK_MASTER, args[9], - Integer.parseInt(args[10])); - break; - } + private static int deduplicatePartitionPath(JavaSparkContext jsc, + String duplicatedPartitionPath, + String repairedOutputPath, + String basePath) + throws Exception { + DedupeSparkJob job = new DedupeSparkJob(basePath, + duplicatedPartitionPath, repairedOutputPath, new SQLContext(jsc), FSUtils.getFs()); + job.fixDuplicates(true); + return 0; + } - System.exit(returnCode); + private static int rollback(JavaSparkContext jsc, String commitTime, String basePath) + throws Exception { + HoodieWriteClient client = createHoodieClient(jsc, basePath); + if (client.rollback(commitTime)) { + LOG.info(String.format("The commit \"%s\" rolled back.", commitTime)); + return 0; + } else { + LOG.info(String.format("The commit \"%s\" failed to roll back.", commitTime)); + return -1; } + } - private static int dataImport(JavaSparkContext jsc, String srcPath, String targetPath, - String tableName, String tableType, String rowKey, String partitionKey, int parallelism, - String schemaFile, String sparkMaster, String sparkMemory, int retry) throws Exception { - HDFSParquetImporter.Config cfg = new HDFSParquetImporter.Config(); - cfg.srcPath = srcPath; - cfg.targetPath = targetPath; - cfg.tableName = tableName; - cfg.tableType = tableType; - cfg.rowKey = rowKey; - cfg.partitionKey = partitionKey; - cfg.parallelism = parallelism; - cfg.schemaFile = schemaFile; - jsc.getConf().set("spark.executor.memory", sparkMemory); - return new HDFSParquetImporter(cfg).dataImport(jsc, retry); + private static int rollbackToSavepoint(JavaSparkContext jsc, String savepointTime, + String 
basePath) + throws Exception { + HoodieWriteClient client = createHoodieClient(jsc, basePath); + if (client.rollbackToSavepoint(savepointTime)) { + LOG.info(String.format("The commit \"%s\" rolled back.", savepointTime)); + return 0; + } else { + LOG.info(String.format("The commit \"%s\" failed to roll back.", savepointTime)); + return -1; } + } - private static int deduplicatePartitionPath(JavaSparkContext jsc, - String duplicatedPartitionPath, - String repairedOutputPath, - String basePath) - throws Exception { - DedupeSparkJob job = new DedupeSparkJob(basePath, - duplicatedPartitionPath,repairedOutputPath,new SQLContext(jsc), FSUtils.getFs()); - job.fixDuplicates(true); - return 0; - } - - private static int rollback(JavaSparkContext jsc, String commitTime, String basePath) - throws Exception { - HoodieWriteClient client = createHoodieClient(jsc, basePath); - if (client.rollback(commitTime)) { - LOG.info(String.format("The commit \"%s\" rolled back.", commitTime)); - return 0; - } else { - LOG.info(String.format("The commit \"%s\" failed to roll back.", commitTime)); - return -1; - } - } - - private static int rollbackToSavepoint(JavaSparkContext jsc, String savepointTime, String basePath) - throws Exception { - HoodieWriteClient client = createHoodieClient(jsc, basePath); - if (client.rollbackToSavepoint(savepointTime)) { - LOG.info(String.format("The commit \"%s\" rolled back.", savepointTime)); - return 0; - } else { - LOG.info(String.format("The commit \"%s\" failed to roll back.", savepointTime)); - return -1; - } - } - - private static HoodieWriteClient createHoodieClient(JavaSparkContext jsc, String basePath) - throws Exception { - HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) - .withIndexConfig( - HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) - .build(); - return new HoodieWriteClient(jsc, config); - } + private static HoodieWriteClient createHoodieClient(JavaSparkContext jsc, String 
basePath) + throws Exception { + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) + .withIndexConfig( + HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) + .build(); + return new HoodieWriteClient(jsc, config); + } } diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/StatsCommand.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/StatsCommand.java index cef92ae67..cb61eef0b 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/StatsCommand.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/StatsCommand.java @@ -28,7 +28,10 @@ import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline; import com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.common.util.NumericUtils; - +import java.io.IOException; +import java.text.DecimalFormat; +import java.util.HashMap; +import java.util.stream.Collectors; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -38,106 +41,105 @@ import org.springframework.shell.core.annotation.CliCommand; import org.springframework.shell.core.annotation.CliOption; import org.springframework.stereotype.Component; -import java.io.IOException; -import java.text.DecimalFormat; -import java.util.HashMap; -import java.util.stream.Collectors; - @Component public class StatsCommand implements CommandMarker { - @CliAvailabilityIndicator({"stats wa"}) - public boolean isWriteAmpAvailable() { - return HoodieCLI.tableMetadata != null; + + @CliAvailabilityIndicator({"stats wa"}) + public boolean isWriteAmpAvailable() { + return HoodieCLI.tableMetadata != null; + } + + @CliCommand(value = "stats wa", help = "Write Amplification. 
Ratio of how many records were upserted to how many records were actually written") + public String writeAmplificationStats() throws IOException { + long totalRecordsUpserted = 0; + long totalRecordsWritten = 0; + + HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline(); + HoodieTimeline timeline = activeTimeline.getCommitTimeline().filterCompletedInstants(); + + String[][] rows = new String[new Long(timeline.countInstants()).intValue() + 1][]; + int i = 0; + DecimalFormat df = new DecimalFormat("#.00"); + for (HoodieInstant commitTime : timeline.getInstants().collect( + Collectors.toList())) { + String waf = "0"; + HoodieCommitMetadata commit = HoodieCommitMetadata + .fromBytes(activeTimeline.getInstantDetails(commitTime).get()); + if (commit.fetchTotalUpdateRecordsWritten() > 0) { + waf = df.format( + (float) commit.fetchTotalRecordsWritten() / commit + .fetchTotalUpdateRecordsWritten()); + } + rows[i++] = new String[]{commitTime.getTimestamp(), + String.valueOf(commit.fetchTotalUpdateRecordsWritten()), + String.valueOf(commit.fetchTotalRecordsWritten()), waf}; + totalRecordsUpserted += commit.fetchTotalUpdateRecordsWritten(); + totalRecordsWritten += commit.fetchTotalRecordsWritten(); + } + String waf = "0"; + if (totalRecordsUpserted > 0) { + waf = df.format((float) totalRecordsWritten / totalRecordsUpserted); + } + rows[i] = new String[]{"Total", String.valueOf(totalRecordsUpserted), + String.valueOf(totalRecordsWritten), waf}; + return HoodiePrintHelper.print( + new String[]{"CommitTime", "Total Upserted", "Total Written", + "Write Amplifiation Factor"}, rows); + + } + + + private String[] printFileSizeHistogram(String commitTime, Snapshot s) { + return new String[]{ + commitTime, + NumericUtils.humanReadableByteCount(s.getMin()), + NumericUtils.humanReadableByteCount(s.getValue(0.1)), + NumericUtils.humanReadableByteCount(s.getMedian()), + NumericUtils.humanReadableByteCount(s.getMean()), + 
NumericUtils.humanReadableByteCount(s.get95thPercentile()), + NumericUtils.humanReadableByteCount(s.getMax()), + String.valueOf(s.size()), + NumericUtils.humanReadableByteCount(s.getStdDev()) + }; + } + + @CliCommand(value = "stats filesizes", help = "File Sizes. Display summary stats on sizes of files") + public String fileSizeStats( + @CliOption(key = { + "partitionPath"}, help = "regex to select files, eg: 2016/08/02", unspecifiedDefaultValue = "*/*/*") + final String globRegex) throws IOException { + + FileSystem fs = HoodieCLI.fs; + String globPath = String.format("%s/%s/*", + HoodieCLI.tableMetadata.getBasePath(), + globRegex); + FileStatus[] statuses = fs.globStatus(new Path(globPath)); + + // max, min, #small files < 10MB, 50th, avg, 95th + final int MAX_FILES = 1000000; + Histogram globalHistogram = new Histogram(new UniformReservoir(MAX_FILES)); + HashMap commitHistoMap = new HashMap(); + for (FileStatus fileStatus : statuses) { + String commitTime = FSUtils.getCommitTime(fileStatus.getPath().getName()); + long sz = fileStatus.getLen(); + if (!commitHistoMap.containsKey(commitTime)) { + commitHistoMap.put(commitTime, new Histogram(new UniformReservoir(MAX_FILES))); + } + commitHistoMap.get(commitTime).update(sz); + globalHistogram.update(sz); } - @CliCommand(value = "stats wa", help = "Write Amplification. 
Ratio of how many records were upserted to how many records were actually written") - public String writeAmplificationStats() throws IOException { - long totalRecordsUpserted = 0; - long totalRecordsWritten = 0; - - HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline(); - HoodieTimeline timeline = activeTimeline.getCommitTimeline().filterCompletedInstants(); - - String[][] rows = new String[new Long(timeline.countInstants()).intValue() + 1][]; - int i = 0; - DecimalFormat df = new DecimalFormat("#.00"); - for (HoodieInstant commitTime : timeline.getInstants().collect( - Collectors.toList())) { - String waf = "0"; - HoodieCommitMetadata commit = HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(commitTime).get()); - if (commit.fetchTotalUpdateRecordsWritten() > 0) { - waf = df.format( - (float) commit.fetchTotalRecordsWritten() / commit - .fetchTotalUpdateRecordsWritten()); - } - rows[i++] = new String[] {commitTime.getTimestamp(), - String.valueOf(commit.fetchTotalUpdateRecordsWritten()), - String.valueOf(commit.fetchTotalRecordsWritten()), waf}; - totalRecordsUpserted += commit.fetchTotalUpdateRecordsWritten(); - totalRecordsWritten += commit.fetchTotalRecordsWritten(); - } - String waf = "0"; - if (totalRecordsUpserted > 0) { - waf = df.format((float) totalRecordsWritten / totalRecordsUpserted); - } - rows[i] = new String[] {"Total", String.valueOf(totalRecordsUpserted), - String.valueOf(totalRecordsWritten), waf}; - return HoodiePrintHelper.print( - new String[] {"CommitTime", "Total Upserted", "Total Written", - "Write Amplifiation Factor"}, rows); - + String[][] rows = new String[commitHistoMap.size() + 1][]; + int ind = 0; + for (String commitTime : commitHistoMap.keySet()) { + Snapshot s = commitHistoMap.get(commitTime).getSnapshot(); + rows[ind++] = printFileSizeHistogram(commitTime, s); } + Snapshot s = globalHistogram.getSnapshot(); + rows[ind++] = printFileSizeHistogram("ALL", s); - - private String[] 
printFileSizeHistogram(String commitTime, Snapshot s) { - return new String[]{ - commitTime, - NumericUtils.humanReadableByteCount(s.getMin()), - NumericUtils.humanReadableByteCount(s.getValue(0.1)), - NumericUtils.humanReadableByteCount(s.getMedian()), - NumericUtils.humanReadableByteCount(s.getMean()), - NumericUtils.humanReadableByteCount(s.get95thPercentile()), - NumericUtils.humanReadableByteCount(s.getMax()), - String.valueOf(s.size()), - NumericUtils.humanReadableByteCount(s.getStdDev()) - }; - } - - @CliCommand(value = "stats filesizes", help = "File Sizes. Display summary stats on sizes of files") - public String fileSizeStats( - @CliOption(key = {"partitionPath"}, help = "regex to select files, eg: 2016/08/02", unspecifiedDefaultValue = "*/*/*") - final String globRegex) throws IOException { - - FileSystem fs = HoodieCLI.fs; - String globPath = String.format("%s/%s/*", - HoodieCLI.tableMetadata.getBasePath(), - globRegex); - FileStatus[] statuses = fs.globStatus(new Path(globPath)); - - // max, min, #small files < 10MB, 50th, avg, 95th - final int MAX_FILES = 1000000; - Histogram globalHistogram = new Histogram(new UniformReservoir(MAX_FILES)); - HashMap commitHistoMap = new HashMap(); - for (FileStatus fileStatus: statuses) { - String commitTime = FSUtils.getCommitTime(fileStatus.getPath().getName()); - long sz = fileStatus.getLen(); - if (!commitHistoMap.containsKey(commitTime)) { - commitHistoMap.put(commitTime, new Histogram(new UniformReservoir(MAX_FILES))); - } - commitHistoMap.get(commitTime).update(sz); - globalHistogram.update(sz); - } - - String[][] rows = new String[commitHistoMap.size() + 1][]; - int ind = 0; - for (String commitTime: commitHistoMap.keySet()) { - Snapshot s = commitHistoMap.get(commitTime).getSnapshot(); - rows[ind++] = printFileSizeHistogram(commitTime, s); - } - Snapshot s = globalHistogram.getSnapshot(); - rows[ind++] = printFileSizeHistogram("ALL", s); - - return HoodiePrintHelper.print( - new String[] {"CommitTime", 
"Min", "10th", "50th", "avg", "95th", "Max", "NumFiles", "StdDev"}, rows); - } + return HoodiePrintHelper.print( + new String[]{"CommitTime", "Min", "10th", "50th", "avg", "95th", "Max", "NumFiles", + "StdDev"}, rows); + } } diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/UtilsCommand.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/UtilsCommand.java index b5abb6a6e..3733a8c0c 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/UtilsCommand.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/UtilsCommand.java @@ -23,12 +23,13 @@ import org.springframework.stereotype.Component; @Component public class UtilsCommand implements CommandMarker { - @CliCommand(value = "utils loadClass", help = "Load a class" ) - public String loadClass( - @CliOption(key = {"class"}, help = "Check mode" ) final String clazz - ) throws Exception { - Class klass = Class.forName(clazz); - return klass.getProtectionDomain().getCodeSource().getLocation().toExternalForm(); - } + + @CliCommand(value = "utils loadClass", help = "Load a class") + public String loadClass( + @CliOption(key = {"class"}, help = "Check mode") final String clazz + ) throws Exception { + Class klass = Class.forName(clazz); + return klass.getProtectionDomain().getCodeSource().getLocation().toExternalForm(); + } } diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/utils/CommitUtil.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/utils/CommitUtil.java index 8f5aabf4f..71ed5aca6 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/utils/CommitUtil.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/utils/CommitUtil.java @@ -20,21 +20,22 @@ import com.uber.hoodie.common.model.HoodieCommitMetadata; import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.table.timeline.HoodieInstant; - import java.io.IOException; import java.util.List; public class CommitUtil { - 
public static long countNewRecords(HoodieTableMetaClient target, List commitsToCatchup) - throws IOException { - long totalNew = 0; - HoodieTimeline timeline = target.getActiveTimeline().reload().getCommitTimeline().filterCompletedInstants(); - for(String commit:commitsToCatchup) { - HoodieCommitMetadata c = HoodieCommitMetadata.fromBytes(timeline - .getInstantDetails(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commit)) - .get()); - totalNew += c.fetchTotalRecordsWritten() - c.fetchTotalUpdateRecordsWritten(); - } - return totalNew; + + public static long countNewRecords(HoodieTableMetaClient target, List commitsToCatchup) + throws IOException { + long totalNew = 0; + HoodieTimeline timeline = target.getActiveTimeline().reload().getCommitTimeline() + .filterCompletedInstants(); + for (String commit : commitsToCatchup) { + HoodieCommitMetadata c = HoodieCommitMetadata.fromBytes(timeline + .getInstantDetails(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commit)) + .get()); + totalNew += c.fetchTotalRecordsWritten() - c.fetchTotalUpdateRecordsWritten(); } + return totalNew; + } } diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/utils/HiveUtil.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/utils/HiveUtil.java index 1d4b00349..52ec668c2 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/utils/HiveUtil.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/utils/HiveUtil.java @@ -17,107 +17,112 @@ package com.uber.hoodie.cli.utils; import com.uber.hoodie.common.table.HoodieTableMetaClient; -import org.apache.commons.dbcp.BasicDataSource; -import org.joda.time.DateTime; - -import javax.sql.DataSource; import java.sql.Connection; import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Statement; +import javax.sql.DataSource; +import org.apache.commons.dbcp.BasicDataSource; +import org.joda.time.DateTime; public class HiveUtil { - private static String driverName = "org.apache.hive.jdbc.HiveDriver"; - static { - try 
{ - Class.forName(driverName); - } catch (ClassNotFoundException e) { - throw new IllegalStateException("Could not find " + driverName + " in classpath. ", e); - } + private static String driverName = "org.apache.hive.jdbc.HiveDriver"; + + static { + try { + Class.forName(driverName); + } catch (ClassNotFoundException e) { + throw new IllegalStateException("Could not find " + driverName + " in classpath. ", e); } + } - private static Connection connection; + private static Connection connection; - private static Connection getConnection(String jdbcUrl, String user, String pass) throws SQLException { - DataSource ds = getDatasource(jdbcUrl, user, pass); - return ds.getConnection(); + private static Connection getConnection(String jdbcUrl, String user, String pass) + throws SQLException { + DataSource ds = getDatasource(jdbcUrl, user, pass); + return ds.getConnection(); + } + + private static DataSource getDatasource(String jdbcUrl, String user, String pass) { + BasicDataSource ds = new BasicDataSource(); + ds.setDriverClassName(driverName); + ds.setUrl(jdbcUrl); + ds.setUsername(user); + ds.setPassword(pass); + return ds; + } + + public static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String dbName, + String user, String pass) throws SQLException { + Connection conn = HiveUtil.getConnection(jdbcUrl, user, pass); + ResultSet rs = null; + Statement stmt = conn.createStatement(); + try { + //stmt.execute("set mapred.job.queue.name="); + stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat"); + stmt.execute("set hive.stats.autogather=false"); + rs = stmt.executeQuery( + "select count(`_hoodie_commit_time`) as cnt from " + dbName + "." 
+ source + .getTableConfig() + .getTableName()); + long count = -1; + if (rs.next()) { + count = rs.getLong("cnt"); + } + System.out + .println("Total records in " + source.getTableConfig().getTableName() + " is " + count); + return count; + } finally { + if (rs != null) { + rs.close(); + } + if (stmt != null) { + stmt.close(); + } } + } - private static DataSource getDatasource(String jdbcUrl, String user, String pass) { - BasicDataSource ds = new BasicDataSource(); - ds.setDriverClassName(driverName); - ds.setUrl(jdbcUrl); - ds.setUsername(user); - ds.setPassword(pass); - return ds; - } + public static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String srcDb, + int partitions, String user, String pass) throws SQLException { + DateTime dateTime = DateTime.now(); + String endDateStr = + dateTime.getYear() + "-" + String.format("%02d", dateTime.getMonthOfYear()) + "-" + + String.format("%02d", dateTime.getDayOfMonth()); + dateTime = dateTime.minusDays(partitions); + String startDateStr = + dateTime.getYear() + "-" + String.format("%02d", dateTime.getMonthOfYear()) + "-" + + String.format("%02d", dateTime.getDayOfMonth()); + System.out.println("Start date " + startDateStr + " and end date " + endDateStr); + return countRecords(jdbcUrl, source, srcDb, startDateStr, endDateStr, user, pass); + } - public static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String dbName, String user, String pass) throws SQLException { - Connection conn = HiveUtil.getConnection(jdbcUrl, user, pass); - ResultSet rs = null; - Statement stmt = conn.createStatement(); - try { - //stmt.execute("set mapred.job.queue.name="); - stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat" ); - stmt.execute("set hive.stats.autogather=false" ); - rs = stmt.executeQuery( - "select count(`_hoodie_commit_time`) as cnt from " + dbName + "." 
+ source.getTableConfig() - .getTableName()); - long count = -1; - if(rs.next()) { - count = rs.getLong("cnt"); - } - System.out.println("Total records in " + source.getTableConfig().getTableName() + " is " + count); - return count; - } finally { - if (rs != null) { - rs.close(); - } - if (stmt != null) { - stmt.close(); - } - } - } - - public static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String srcDb, - int partitions, String user, String pass) throws SQLException { - DateTime dateTime = DateTime.now(); - String endDateStr = - dateTime.getYear() + "-" + String.format("%02d", dateTime.getMonthOfYear()) + "-" + - String.format("%02d", dateTime.getDayOfMonth()); - dateTime = dateTime.minusDays(partitions); - String startDateStr = - dateTime.getYear() + "-" + String.format("%02d", dateTime.getMonthOfYear()) + "-" + - String.format("%02d", dateTime.getDayOfMonth()); - System.out.println("Start date " + startDateStr + " and end date " + endDateStr); - return countRecords(jdbcUrl, source, srcDb, startDateStr, endDateStr, user, pass); - } - - private static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String srcDb, String startDateStr, - String endDateStr, String user, String pass) throws SQLException { - Connection conn = HiveUtil.getConnection(jdbcUrl, user, pass); - ResultSet rs = null; - Statement stmt = conn.createStatement(); - try { - //stmt.execute("set mapred.job.queue.name="); - stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat"); - stmt.execute("set hive.stats.autogather=false"); - rs = stmt.executeQuery( - "select count(`_hoodie_commit_time`) as cnt from " + srcDb + "." 
+ source.getTableConfig() - .getTableName() + " where datestr>'" + startDateStr + "' and datestr<='" - + endDateStr + "'"); - if(rs.next()) { - return rs.getLong("cnt"); - } - return -1; - } finally { - if (rs != null) { - rs.close(); - } - if (stmt != null) { - stmt.close(); - } - } + private static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String srcDb, + String startDateStr, + String endDateStr, String user, String pass) throws SQLException { + Connection conn = HiveUtil.getConnection(jdbcUrl, user, pass); + ResultSet rs = null; + Statement stmt = conn.createStatement(); + try { + //stmt.execute("set mapred.job.queue.name="); + stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat"); + stmt.execute("set hive.stats.autogather=false"); + rs = stmt.executeQuery( + "select count(`_hoodie_commit_time`) as cnt from " + srcDb + "." + source.getTableConfig() + .getTableName() + " where datestr>'" + startDateStr + "' and datestr<='" + + endDateStr + "'"); + if (rs.next()) { + return rs.getLong("cnt"); + } + return -1; + } finally { + if (rs != null) { + rs.close(); + } + if (stmt != null) { + stmt.close(); + } } + } } diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/utils/InputStreamConsumer.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/utils/InputStreamConsumer.java index 8da872ef3..4d926cea3 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/utils/InputStreamConsumer.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/utils/InputStreamConsumer.java @@ -23,34 +23,37 @@ import java.io.InputStreamReader; import java.util.logging.Logger; public class InputStreamConsumer extends Thread { - protected final static Logger LOG = Logger.getLogger(InputStreamConsumer.class.getName()); - private InputStream is; - public InputStreamConsumer(InputStream is) { - this.is = is; - } - @Override - public void run() { - try { - InputStreamReader isr = new InputStreamReader(is); - BufferedReader br = new 
BufferedReader(isr); - String line; - while ( (line = br.readLine()) != null) - LOG.info(line); - } catch (IOException ioe) { - LOG.severe(ioe.toString()); - ioe.printStackTrace(); - } - } + protected final static Logger LOG = Logger.getLogger(InputStreamConsumer.class.getName()); + private InputStream is; - public static void captureOutput(Process p) { - InputStreamConsumer stdout; - InputStreamConsumer errout; - errout = new InputStreamConsumer(p.getErrorStream()); - stdout = new InputStreamConsumer(p.getInputStream()); - errout.start(); - stdout.start(); + public InputStreamConsumer(InputStream is) { + this.is = is; + } + + @Override + public void run() { + try { + InputStreamReader isr = new InputStreamReader(is); + BufferedReader br = new BufferedReader(isr); + String line; + while ((line = br.readLine()) != null) { + LOG.info(line); + } + } catch (IOException ioe) { + LOG.severe(ioe.toString()); + ioe.printStackTrace(); } + } + + public static void captureOutput(Process p) { + InputStreamConsumer stdout; + InputStreamConsumer errout; + errout = new InputStreamConsumer(p.getErrorStream()); + stdout = new InputStreamConsumer(p.getInputStream()); + errout.start(); + stdout.start(); + } } diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/utils/SparkUtil.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/utils/SparkUtil.java index 5cb5e4bd2..4b4ab2a2d 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/utils/SparkUtil.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/utils/SparkUtil.java @@ -18,59 +18,54 @@ package com.uber.hoodie.cli.utils; import com.uber.hoodie.HoodieWriteClient; import com.uber.hoodie.cli.commands.SparkMain; - +import java.io.File; +import java.net.URISyntaxException; import org.apache.log4j.Logger; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.launcher.SparkLauncher; -import java.io.File; -import java.net.URISyntaxException; - public class SparkUtil { - 
public static Logger logger = Logger.getLogger(SparkUtil.class); - public static final String DEFUALT_SPARK_MASTER = "yarn-client"; + public static Logger logger = Logger.getLogger(SparkUtil.class); + public static final String DEFUALT_SPARK_MASTER = "yarn-client"; - /** - * - * TODO: Need to fix a bunch of hardcoded stuff here eg: history server, spark distro - * - * @return - * @throws URISyntaxException - */ - public static SparkLauncher initLauncher(String propertiesFile) throws URISyntaxException { - String currentJar = new File( - SparkUtil.class.getProtectionDomain().getCodeSource().getLocation().toURI().getPath()) - .getAbsolutePath(); - SparkLauncher sparkLauncher = - new SparkLauncher().setAppResource(currentJar) - .setMainClass(SparkMain.class.getName()) - .setPropertiesFile(propertiesFile); - File libDirectory = new File(new File(currentJar).getParent(), "lib"); - for (String library : libDirectory.list()) { - sparkLauncher.addJar(new File(libDirectory, library).getAbsolutePath()); - } - return sparkLauncher; + /** + * TODO: Need to fix a bunch of hardcoded stuff here eg: history server, spark distro + */ + public static SparkLauncher initLauncher(String propertiesFile) throws URISyntaxException { + String currentJar = new File( + SparkUtil.class.getProtectionDomain().getCodeSource().getLocation().toURI().getPath()) + .getAbsolutePath(); + SparkLauncher sparkLauncher = + new SparkLauncher().setAppResource(currentJar) + .setMainClass(SparkMain.class.getName()) + .setPropertiesFile(propertiesFile); + File libDirectory = new File(new File(currentJar).getParent(), "lib"); + for (String library : libDirectory.list()) { + sparkLauncher.addJar(new File(libDirectory, library).getAbsolutePath()); } + return sparkLauncher; + } - public static JavaSparkContext initJavaSparkConf(String name) { - SparkConf sparkConf = new SparkConf().setAppName(name); - sparkConf.setMaster(DEFUALT_SPARK_MASTER); - sparkConf.set("spark.serializer", 
"org.apache.spark.serializer.KryoSerializer"); - sparkConf.set("spark.driver.maxResultSize", "2g"); - sparkConf.set("spark.eventLog.overwrite", "true"); - sparkConf.set("spark.eventLog.enabled", "true"); + public static JavaSparkContext initJavaSparkConf(String name) { + SparkConf sparkConf = new SparkConf().setAppName(name); + sparkConf.setMaster(DEFUALT_SPARK_MASTER); + sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + sparkConf.set("spark.driver.maxResultSize", "2g"); + sparkConf.set("spark.eventLog.overwrite", "true"); + sparkConf.set("spark.eventLog.enabled", "true"); - // Configure hadoop conf - sparkConf.set("spark.hadoop.mapred.output.compress", "true"); - sparkConf.set("spark.hadoop.mapred.output.compression.codec", "true"); - sparkConf.set("spark.hadoop.mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec"); - sparkConf.set("spark.hadoop.mapred.output.compression.type", "BLOCK"); + // Configure hadoop conf + sparkConf.set("spark.hadoop.mapred.output.compress", "true"); + sparkConf.set("spark.hadoop.mapred.output.compression.codec", "true"); + sparkConf.set("spark.hadoop.mapred.output.compression.codec", + "org.apache.hadoop.io.compress.GzipCodec"); + sparkConf.set("spark.hadoop.mapred.output.compression.type", "BLOCK"); - sparkConf = HoodieWriteClient.registerClasses(sparkConf); - JavaSparkContext jsc = new JavaSparkContext(sparkConf); - jsc.hadoopConfiguration().setBoolean("parquet.enable.summary-metadata", false); - return jsc; - } + sparkConf = HoodieWriteClient.registerClasses(sparkConf); + JavaSparkContext jsc = new JavaSparkContext(sparkConf); + jsc.hadoopConfiguration().setBoolean("parquet.enable.summary-metadata", false); + return jsc; + } } diff --git a/hoodie-cli/src/main/resources/META-INF/spring/spring-shell-plugin.xml b/hoodie-cli/src/main/resources/META-INF/spring/spring-shell-plugin.xml index 900c41dd7..2b4563658 100644 --- 
a/hoodie-cli/src/main/resources/META-INF/spring/spring-shell-plugin.xml +++ b/hoodie-cli/src/main/resources/META-INF/spring/spring-shell-plugin.xml @@ -16,11 +16,11 @@ --> - + diff --git a/hoodie-cli/src/main/scala/com/uber/hoodie/cli/DedupeSparkJob.scala b/hoodie-cli/src/main/scala/com/uber/hoodie/cli/DedupeSparkJob.scala index 249a8cc62..82c97e0a4 100644 --- a/hoodie-cli/src/main/scala/com/uber/hoodie/cli/DedupeSparkJob.scala +++ b/hoodie-cli/src/main/scala/com/uber/hoodie/cli/DedupeSparkJob.scala @@ -34,11 +34,11 @@ import scala.collection.mutable._ /** * Spark job to de-duplicate data present in a partition path */ -class DedupeSparkJob (basePath: String, - duplicatedPartitionPath: String, - repairOutputPath: String, - sqlContext: SQLContext, - fs: FileSystem) { +class DedupeSparkJob(basePath: String, + duplicatedPartitionPath: String, + repairOutputPath: String, + sqlContext: SQLContext, + fs: FileSystem) { val sparkHelper = new SparkHelper(sqlContext, fs) @@ -50,8 +50,9 @@ class DedupeSparkJob (basePath: String, * @param tblName * @return */ - def getDupeKeyDF(tblName: String) : DataFrame = { - val dupeSql = s""" + def getDupeKeyDF(tblName: String): DataFrame = { + val dupeSql = + s""" select `${HoodieRecord.RECORD_KEY_METADATA_FIELD}` as dupe_key, count(*) as dupe_cnt from ${tblName} @@ -69,7 +70,7 @@ class DedupeSparkJob (basePath: String, * * @return */ - private def planDuplicateFix() : HashMap[String, HashSet[String]] = { + private def planDuplicateFix(): HashMap[String, HashSet[String]] = { val tmpTableName = s"htbl_${System.currentTimeMillis()}" val dedupeTblName = s"${tmpTableName}_dupeKeys" @@ -78,17 +79,18 @@ class DedupeSparkJob (basePath: String, val allFiles = fs.listStatus(new org.apache.hadoop.fs.Path(s"${basePath}/${duplicatedPartitionPath}")) val fsView = new HoodieTableFileSystemView(metadata, metadata.getActiveTimeline.getCommitTimeline.filterCompletedInstants(), allFiles) - val latestFiles:java.util.List[HoodieDataFile] = 
fsView.getLatestDataFiles().collect(Collectors.toList[HoodieDataFile]()) + val latestFiles: java.util.List[HoodieDataFile] = fsView.getLatestDataFiles().collect(Collectors.toList[HoodieDataFile]()) val filteredStatuses = latestFiles.map(f => f.getPath) LOG.info(s" List of files under partition: ${} => ${filteredStatuses.mkString(" ")}") - val df = sqlContext.parquetFile(filteredStatuses:_*) + val df = sqlContext.parquetFile(filteredStatuses: _*) df.registerTempTable(tmpTableName) val dupeKeyDF = getDupeKeyDF(tmpTableName) dupeKeyDF.registerTempTable(dedupeTblName) // Obtain necessary satellite information for duplicate rows - val dupeDataSql = s""" + val dupeDataSql = + s""" SELECT `_hoodie_record_key`, `_hoodie_partition_path`, `_hoodie_file_name`, `_hoodie_commit_time` FROM ${tmpTableName} h JOIN ${dedupeTblName} d @@ -111,9 +113,9 @@ class DedupeSparkJob (basePath: String, rows.foreach(r => { val c = r(3).asInstanceOf[String].toLong - if (c != maxCommit){ + if (c != maxCommit) { val f = r(2).asInstanceOf[String].split("_")(0) - if (!fileToDeleteKeyMap.contains(f)){ + if (!fileToDeleteKeyMap.contains(f)) { fileToDeleteKeyMap(f) = HashSet[String]() } fileToDeleteKeyMap(f).add(key) @@ -130,28 +132,30 @@ class DedupeSparkJob (basePath: String, val allFiles = fs.listStatus(new Path(s"${basePath}/${duplicatedPartitionPath}")) val fsView = new HoodieTableFileSystemView(metadata, metadata.getActiveTimeline.getCommitTimeline.filterCompletedInstants(), allFiles) - val latestFiles:java.util.List[HoodieDataFile] = fsView.getLatestDataFiles().collect(Collectors.toList[HoodieDataFile]()) + val latestFiles: java.util.List[HoodieDataFile] = fsView.getLatestDataFiles().collect(Collectors.toList[HoodieDataFile]()) val fileNameToPathMap = latestFiles.map(f => (f.getFileId, new Path(f.getPath))).toMap val dupeFixPlan = planDuplicateFix() // 1. 
Copy all latest files into the temp fix path - fileNameToPathMap.foreach{ case(fileName, filePath) => { + fileNameToPathMap.foreach { case (fileName, filePath) => { val badSuffix = if (dupeFixPlan.contains(fileName)) ".bad" else "" val dstPath = new Path(s"${repairOutputPath}/${filePath.getName}${badSuffix}") LOG.info(s"Copying from ${filePath} to ${dstPath}") FileUtil.copy(fs, filePath, fs, dstPath, false, true, fs.getConf) - }} + } + } // 2. Remove duplicates from the bad files - dupeFixPlan.foreach{case(fileName, keysToSkip) => { + dupeFixPlan.foreach { case (fileName, keysToSkip) => { val commitTime = FSUtils.getCommitTime(fileNameToPathMap(fileName).getName) val badFilePath = new Path(s"${repairOutputPath}/${fileNameToPathMap(fileName).getName}.bad") val newFilePath = new Path(s"${repairOutputPath}/${fileNameToPathMap(fileName).getName}") LOG.info(" Skipping and writing new file for : " + fileName) SparkHelpers.skipKeysAndWriteNewFile(commitTime, fs, badFilePath, newFilePath, dupeFixPlan(fileName)) fs.delete(badFilePath, false) - }} + } + } // 3. Check that there are no duplicates anymore. val df = sqlContext.read.parquet(s"${repairOutputPath}/*.parquet") @@ -186,6 +190,7 @@ class DedupeSparkJob (basePath: String, LOG.info(s"[FOR REAL!!!] 
Copying from ${srcPath} to ${dstPath}") FileUtil.copy(fs, srcPath, fs, dstPath, false, true, fs.getConf) } - }} + } + } } } diff --git a/hoodie-cli/src/main/scala/com/uber/hoodie/cli/SparkHelpers.scala b/hoodie-cli/src/main/scala/com/uber/hoodie/cli/SparkHelpers.scala index 0323d6f87..3fc18c8e4 100644 --- a/hoodie-cli/src/main/scala/com/uber/hoodie/cli/SparkHelpers.scala +++ b/hoodie-cli/src/main/scala/com/uber/hoodie/cli/SparkHelpers.scala @@ -17,9 +17,9 @@ package com.uber.hoodie.cli import com.uber.hoodie.avro.HoodieAvroWriteSupport -import com.uber.hoodie.common.{BloomFilter, HoodieJsonPayload} import com.uber.hoodie.common.model.HoodieRecord import com.uber.hoodie.common.util.ParquetUtils +import com.uber.hoodie.common.{BloomFilter, HoodieJsonPayload} import com.uber.hoodie.config.{HoodieIndexConfig, HoodieStorageConfig} import com.uber.hoodie.io.storage.{HoodieParquetConfig, HoodieParquetWriter} import org.apache.avro.Schema @@ -107,7 +107,7 @@ class SparkHelper(sqlContext: SQLContext, fs: FileSystem) { * @param file * @param sqlContext */ - def getKeyCount(file: String, sqlContext: org.apache.spark.sql.SQLContext) ={ + def getKeyCount(file: String, sqlContext: org.apache.spark.sql.SQLContext) = { println(getRowKeyDF(file).collect().size) } @@ -122,7 +122,7 @@ class SparkHelper(sqlContext: SQLContext, fs: FileSystem) { * @param file * @return */ - def fileKeysAgainstBF(conf: Configuration, sqlContext: SQLContext, file: String) : Boolean = { + def fileKeysAgainstBF(conf: Configuration, sqlContext: SQLContext, file: String): Boolean = { val bfStr = SparkHelpers.getBloomFilter(file, conf) val bf = new com.uber.hoodie.common.BloomFilter(bfStr) val foundCount = sqlContext.parquetFile(file) @@ -134,7 +134,7 @@ class SparkHelper(sqlContext: SQLContext, fs: FileSystem) { totalCount == foundCount } - def getDistinctKeyDF(paths: List[String]) : DataFrame = { - sqlContext.read.parquet(paths:_*).select(s"`${HoodieRecord.RECORD_KEY_METADATA_FIELD}`").distinct() + def 
getDistinctKeyDF(paths: List[String]): DataFrame = { + sqlContext.read.parquet(paths: _*).select(s"`${HoodieRecord.RECORD_KEY_METADATA_FIELD}`").distinct() } } diff --git a/hoodie-client/pom.xml b/hoodie-client/pom.xml index db6fdc3ae..a3b1ec640 100644 --- a/hoodie-client/pom.xml +++ b/hoodie-client/pom.xml @@ -15,180 +15,182 @@ ~ limitations under the License. --> - - - hoodie - com.uber.hoodie - 0.4.1-SNAPSHOT - - 4.0.0 + + + hoodie + com.uber.hoodie + 0.4.1-SNAPSHOT + + 4.0.0 - hoodie-client - jar - - - - org.jacoco - jacoco-maven-plugin - - - org.apache.maven.plugins - maven-jar-plugin - - - - test-jar - - - - - - org.apache.rat - apache-rat-plugin - - + hoodie-client + jar + + + + org.jacoco + jacoco-maven-plugin + + + org.apache.maven.plugins + maven-jar-plugin + + + + test-jar + + + + + + org.apache.rat + apache-rat-plugin + + - - - src/main/resources - - - src/test/resources - - - + + + src/main/resources + + + src/test/resources + + + - - - com.uber.hoodie - hoodie-common - ${project.version} - - - org.apache.hadoop - hadoop-hdfs - tests - - - - org.mortbay.jetty - * - - - javax.servlet.jsp - * - - - javax.servlet - * - - - - - org.apache.hadoop - hadoop-common - tests - - - org.mortbay.jetty - * - - - javax.servlet.jsp - * - - - javax.servlet - * - - - - - com.uber.hoodie - hoodie-common - ${project.version} - test-jar - test - - - io.dropwizard.metrics - metrics-graphite - - - io.dropwizard.metrics - metrics-core - - - com.beust - jcommander - 1.48 - + + + com.uber.hoodie + hoodie-common + ${project.version} + + + org.apache.hadoop + hadoop-hdfs + tests + + + + org.mortbay.jetty + * + + + javax.servlet.jsp + * + + + javax.servlet + * + + + + + org.apache.hadoop + hadoop-common + tests + + + org.mortbay.jetty + * + + + javax.servlet.jsp + * + + + javax.servlet + * + + + + + com.uber.hoodie + hoodie-common + ${project.version} + test-jar + test + + + io.dropwizard.metrics + metrics-graphite + + + io.dropwizard.metrics + metrics-core + + + com.beust + 
jcommander + 1.48 + - - - log4j - log4j - + + + log4j + log4j + - - org.apache.hadoop - hadoop-client - - - javax.servlet - * - - - + + org.apache.hadoop + hadoop-client + + + javax.servlet + * + + + - - org.apache.parquet - parquet-avro - + + org.apache.parquet + parquet-avro + - - org.apache.parquet - parquet-hadoop - + + org.apache.parquet + parquet-hadoop + - - com.google.guava - guava - + + com.google.guava + guava + - - org.apache.spark - spark-core_2.11 - + + org.apache.spark + spark-core_2.11 + - - org.apache.spark - spark-sql_2.11 - + + org.apache.spark + spark-sql_2.11 + - - org.apache.hbase - hbase-client - + + org.apache.hbase + hbase-client + - - org.mockito - mockito-all - 1.10.19 - test - - - com.uber.hoodie - hoodie-hadoop-mr - ${project.version} - test - - - org.apache.hive - hive-exec - test - + + org.mockito + mockito-all + 1.10.19 + test + + + com.uber.hoodie + hoodie-hadoop-mr + ${project.version} + test + + + org.apache.hive + hive-exec + test + - + diff --git a/hoodie-client/src/main/java/com/uber/hoodie/HoodieReadClient.java b/hoodie-client/src/main/java/com/uber/hoodie/HoodieReadClient.java index 0417aeaff..851947286 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/HoodieReadClient.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/HoodieReadClient.java @@ -17,25 +17,19 @@ package com.uber.hoodie; import com.google.common.base.Optional; - -import com.uber.hoodie.common.model.HoodieCommitMetadata; -import com.uber.hoodie.common.model.HoodieDataFile; import com.uber.hoodie.common.model.HoodieKey; import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.table.HoodieTimeline; -import com.uber.hoodie.common.table.TableFileSystemView; -import com.uber.hoodie.common.table.timeline.HoodieInstant; -import com.uber.hoodie.common.table.view.HoodieTableFileSystemView; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.config.HoodieWriteConfig; 
-import com.uber.hoodie.exception.HoodieException; import com.uber.hoodie.index.bloom.HoodieBloomIndex; - import com.uber.hoodie.table.HoodieTable; - +import java.io.Serializable; +import java.util.HashSet; +import java.util.List; +import java.util.Set; import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.spark.SparkConf; @@ -46,136 +40,126 @@ import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.SQLContext; import org.apache.spark.sql.types.StructType; - -import java.io.IOException; -import java.io.Serializable; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Set; -import java.util.stream.Collectors; - import scala.Tuple2; /** * Provides an RDD based API for accessing/filtering Hoodie tables, based on keys. - * */ public class HoodieReadClient implements Serializable { - private static Logger logger = LogManager.getLogger(HoodieReadClient.class); + private static Logger logger = LogManager.getLogger(HoodieReadClient.class); - private transient final JavaSparkContext jsc; + private transient final JavaSparkContext jsc; - private transient final FileSystem fs; - /** - * TODO: We need to persist the index type into hoodie.properties and be able to access the - * index just with a simple basepath pointing to the dataset. Until, then just always assume a - * BloomIndex - */ - private transient final HoodieBloomIndex index; - private final HoodieTimeline commitTimeline; - private HoodieTable hoodieTable; - private transient Optional sqlContextOpt; + private transient final FileSystem fs; + /** + * TODO: We need to persist the index type into hoodie.properties and be able to access the index + * just with a simple basepath pointing to the dataset. 
Until, then just always assume a + * BloomIndex + */ + private transient final HoodieBloomIndex index; + private final HoodieTimeline commitTimeline; + private HoodieTable hoodieTable; + private transient Optional sqlContextOpt; - /** - * @param basePath path to Hoodie dataset - */ - public HoodieReadClient(JavaSparkContext jsc, String basePath) { - this.jsc = jsc; - this.fs = FSUtils.getFs(); - // Create a Hoodie table which encapsulated the commits and files visible - this.hoodieTable = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(fs, basePath, true), null); - this.commitTimeline = hoodieTable.getCompletedCompactionCommitTimeline(); - this.index = - new HoodieBloomIndex(HoodieWriteConfig.newBuilder().withPath(basePath).build(), jsc); - this.sqlContextOpt = Optional.absent(); + /** + * @param basePath path to Hoodie dataset + */ + public HoodieReadClient(JavaSparkContext jsc, String basePath) { + this.jsc = jsc; + this.fs = FSUtils.getFs(); + // Create a Hoodie table which encapsulated the commits and files visible + this.hoodieTable = HoodieTable + .getHoodieTable(new HoodieTableMetaClient(fs, basePath, true), null); + this.commitTimeline = hoodieTable.getCompletedCompactionCommitTimeline(); + this.index = + new HoodieBloomIndex(HoodieWriteConfig.newBuilder().withPath(basePath).build(), jsc); + this.sqlContextOpt = Optional.absent(); + } + + /** + * + * @param jsc + * @param basePath + * @param sqlContext + */ + public HoodieReadClient(JavaSparkContext jsc, String basePath, SQLContext sqlContext) { + this(jsc, basePath); + this.sqlContextOpt = Optional.of(sqlContext); + } + + /** + * Adds support for accessing Hoodie built tables from SparkSQL, as you normally would. 
+ * + * @return SparkConf object to be used to construct the SparkContext by caller + */ + public static SparkConf addHoodieSupport(SparkConf conf) { + conf.set("spark.sql.hive.convertMetastoreParquet", "false"); + return conf; + } + + private void assertSqlContext() { + if (!sqlContextOpt.isPresent()) { + throw new IllegalStateException( + "SQLContext must be set, when performing dataframe operations"); } + } - /** - * - * @param jsc - * @param basePath - * @param sqlContext - */ - public HoodieReadClient(JavaSparkContext jsc, String basePath, SQLContext sqlContext) { - this(jsc, basePath); - this.sqlContextOpt = Optional.of(sqlContext); - } + /** + * Given a bunch of hoodie keys, fetches all the individual records out as a data frame + * + * @return a dataframe + */ + public Dataset read(JavaRDD hoodieKeys, int parallelism) + throws Exception { - /** - * Adds support for accessing Hoodie built tables from SparkSQL, as you normally would. - * - * @return SparkConf object to be used to construct the SparkContext by caller - */ - public static SparkConf addHoodieSupport(SparkConf conf) { - conf.set("spark.sql.hive.convertMetastoreParquet", "false"); - return conf; - } + assertSqlContext(); + JavaPairRDD> keyToFileRDD = + index.fetchRecordLocation(hoodieKeys, hoodieTable); + List paths = keyToFileRDD + .filter(keyFileTuple -> keyFileTuple._2().isPresent()) + .map(keyFileTuple -> keyFileTuple._2().get()) + .collect(); - private void assertSqlContext() { - if (!sqlContextOpt.isPresent()) { - throw new IllegalStateException("SQLContext must be set, when performing dataframe operations"); - } - } + // record locations might be same for multiple keys, so need a unique list + Set uniquePaths = new HashSet<>(paths); + Dataset originalDF = sqlContextOpt.get().read() + .parquet(uniquePaths.toArray(new String[uniquePaths.size()])); + StructType schema = originalDF.schema(); + JavaPairRDD keyRowRDD = originalDF.javaRDD() + .mapToPair(row -> { + HoodieKey key = new HoodieKey( + 
row.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD), + row.getAs(HoodieRecord.PARTITION_PATH_METADATA_FIELD)); + return new Tuple2<>(key, row); + }); - /** - * Given a bunch of hoodie keys, fetches all the individual records out as a data frame - * - * @return a dataframe - */ - public Dataset read(JavaRDD hoodieKeys, int parallelism) - throws Exception { + // Now, we need to further filter out, for only rows that match the supplied hoodie keys + JavaRDD rowRDD = keyRowRDD.join(keyToFileRDD, parallelism) + .map(tuple -> tuple._2()._1()); - assertSqlContext(); - JavaPairRDD> keyToFileRDD = - index.fetchRecordLocation(hoodieKeys, hoodieTable); - List paths = keyToFileRDD - .filter(keyFileTuple -> keyFileTuple._2().isPresent()) - .map(keyFileTuple -> keyFileTuple._2().get()) - .collect(); + return sqlContextOpt.get().createDataFrame(rowRDD, schema); + } - // record locations might be same for multiple keys, so need a unique list - Set uniquePaths = new HashSet<>(paths); - Dataset originalDF = sqlContextOpt.get().read() - .parquet(uniquePaths.toArray(new String[uniquePaths.size()])); - StructType schema = originalDF.schema(); - JavaPairRDD keyRowRDD = originalDF.javaRDD() - .mapToPair(row -> { - HoodieKey key = new HoodieKey( - row.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD), - row.getAs(HoodieRecord.PARTITION_PATH_METADATA_FIELD)); - return new Tuple2<>(key, row); - }); + /** + * Checks if the given [Keys] exists in the hoodie table and returns [Key, Optional[FullFilePath]] + * If the optional FullFilePath value is not present, then the key is not found. 
If the + * FullFilePath value is present, it is the path component (without scheme) of the URI underlying + * file + */ + public JavaPairRDD> checkExists(JavaRDD hoodieKeys) { + return index.fetchRecordLocation(hoodieKeys, hoodieTable); + } - // Now, we need to further filter out, for only rows that match the supplied hoodie keys - JavaRDD rowRDD = keyRowRDD.join(keyToFileRDD, parallelism) - .map(tuple -> tuple._2()._1()); - - return sqlContextOpt.get().createDataFrame(rowRDD, schema); - } - - /** - * Checks if the given [Keys] exists in the hoodie table and returns [Key, - * Optional[FullFilePath]] If the optional FullFilePath value is not present, then the key is - * not found. If the FullFilePath value is present, it is the path component (without scheme) of - * the URI underlying file - */ - public JavaPairRDD> checkExists(JavaRDD hoodieKeys) { - return index.fetchRecordLocation(hoodieKeys, hoodieTable); - } - - /** - * Filter out HoodieRecords that already exists in the output folder. This is useful in - * deduplication. - * - * @param hoodieRecords Input RDD of Hoodie records. - * @return A subset of hoodieRecords RDD, with existing records filtered out. - */ - public JavaRDD filterExists(JavaRDD hoodieRecords) { - JavaRDD recordsWithLocation = index.tagLocation(hoodieRecords, hoodieTable); - return recordsWithLocation.filter(v1 -> !v1.isCurrentLocationKnown()); - } + /** + * Filter out HoodieRecords that already exists in the output folder. This is useful in + * deduplication. + * + * @param hoodieRecords Input RDD of Hoodie records. + * @return A subset of hoodieRecords RDD, with existing records filtered out. 
+ */ + public JavaRDD filterExists(JavaRDD hoodieRecords) { + JavaRDD recordsWithLocation = index.tagLocation(hoodieRecords, hoodieTable); + return recordsWithLocation.filter(v1 -> !v1.isCurrentLocationKnown()); + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/HoodieWriteClient.java b/hoodie-client/src/main/java/com/uber/hoodie/HoodieWriteClient.java index f402825f7..b55fc5ae6 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/HoodieWriteClient.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/HoodieWriteClient.java @@ -50,10 +50,21 @@ import com.uber.hoodie.func.BulkInsertMapFunction; import com.uber.hoodie.index.HoodieIndex; import com.uber.hoodie.io.HoodieCommitArchiveLog; import com.uber.hoodie.metrics.HoodieMetrics; -import com.uber.hoodie.table.UserDefinedBulkInsertPartitioner; import com.uber.hoodie.table.HoodieTable; +import com.uber.hoodie.table.UserDefinedBulkInsertPartitioner; import com.uber.hoodie.table.WorkloadProfile; import com.uber.hoodie.table.WorkloadStat; +import java.io.IOException; +import java.io.Serializable; +import java.nio.charset.StandardCharsets; +import java.text.ParseException; +import java.util.Collections; +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.stream.Collectors; import org.apache.hadoop.fs.FileSystem; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -66,25 +77,12 @@ import org.apache.spark.storage.StorageLevel; import scala.Option; import scala.Tuple2; -import java.io.IOException; -import java.io.Serializable; -import java.nio.charset.StandardCharsets; -import java.text.ParseException; -import java.util.Collections; -import java.util.Date; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.stream.Collectors; - /** - * Hoodie Write Client helps you build datasets on HDFS [insert()] and then - * perform efficient 
mutations on a HDFS dataset [upsert()] - * - * Note that, at any given time, there can only be one Spark job performing - * these operatons on a Hoodie dataset. + * Hoodie Write Client helps you build datasets on HDFS [insert()] and then perform efficient + * mutations on a HDFS dataset [upsert()] * + * Note that, at any given time, there can only be one Spark job performing these operatons on a + * Hoodie dataset. */ public class HoodieWriteClient implements Serializable { @@ -102,7 +100,8 @@ public class HoodieWriteClient implements Seriali * @param clientConfig * @throws Exception */ - public HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig) throws Exception { + public HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig) + throws Exception { this(jsc, clientConfig, false); } @@ -111,7 +110,8 @@ public class HoodieWriteClient implements Seriali * @param clientConfig * @param rollbackInFlight */ - public HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig, boolean rollbackInFlight) { + public HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig, + boolean rollbackInFlight) { this.fs = FSUtils.getFs(); this.jsc = jsc; this.config = clientConfig; @@ -121,7 +121,7 @@ public class HoodieWriteClient implements Seriali if (rollbackInFlight) { rollbackInflightCommits(); - } + } } /** @@ -163,17 +163,17 @@ public class HoodieWriteClient implements Seriali throw (HoodieUpsertException) e; } throw new HoodieUpsertException("Failed to upsert for commit time " + commitTime, e); - } + } } /** * Inserts the given HoodieRecords, into the table. This API is intended to be used for normal * writes. 
* - * This implementation skips the index check and is able to leverage benefits such as - * small file handling/blocking alignment, as with upsert(), by profiling the workload + * This implementation skips the index check and is able to leverage benefits such as small file + * handling/blocking alignment, as with upsert(), by profiling the workload * - * @param records HoodieRecords to insert + * @param records HoodieRecords to insert * @param commitTime Commit Time handle * @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts */ @@ -194,7 +194,7 @@ public class HoodieWriteClient implements Seriali throw e; } throw new HoodieInsertException("Failed to insert for commit time " + commitTime, e); - } + } } /** @@ -206,11 +206,12 @@ public class HoodieWriteClient implements Seriali * attempts to control the numbers of files with less memory compared to the {@link * HoodieWriteClient#insert(JavaRDD, String)} * - * @param records HoodieRecords to insert + * @param records HoodieRecords to insert * @param commitTime Commit Time handle * @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts */ - public JavaRDD bulkInsert(JavaRDD> records, final String commitTime) { + public JavaRDD bulkInsert(JavaRDD> records, + final String commitTime) { return bulkInsert(records, commitTime, Option.empty()); } @@ -221,16 +222,18 @@ public class HoodieWriteClient implements Seriali * * This implementation uses sortBy (which does range partitioning based on reservoir sampling) and * attempts to control the numbers of files with less memory compared to the {@link - * HoodieWriteClient#insert(JavaRDD, String)}. Optionally it allows users to specify their own partitioner. If - * specified then it will be used for repartitioning records. See {@link UserDefinedBulkInsertPartitioner}. + * HoodieWriteClient#insert(JavaRDD, String)}. Optionally it allows users to specify their own + * partitioner. 
If specified then it will be used for repartitioning records. See {@link + * UserDefinedBulkInsertPartitioner}. * - * @param records HoodieRecords to insert + * @param records HoodieRecords to insert * @param commitTime Commit Time handle - * @param bulkInsertPartitioner If specified then it will be used to partition input records before they are - * inserted into hoodie. + * @param bulkInsertPartitioner If specified then it will be used to partition input records + * before they are inserted into hoodie. * @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts */ - public JavaRDD bulkInsert(JavaRDD> records, final String commitTime, + public JavaRDD bulkInsert(JavaRDD> records, + final String commitTime, Option bulkInsertPartitioner) { writeContext = metrics.getCommitCtx(); // Create a Hoodie table which encapsulated the commits and files visible @@ -240,7 +243,8 @@ public class HoodieWriteClient implements Seriali try { // De-dupe/merge if needed JavaRDD> dedupedRecords = - combineOnCondition(config.shouldCombineBeforeInsert(), records, config.getInsertShuffleParallelism()); + combineOnCondition(config.shouldCombineBeforeInsert(), records, + config.getInsertShuffleParallelism()); final JavaRDD> repartitionedRecords; if (bulkInsertPartitioner.isDefined()) { @@ -259,20 +263,22 @@ public class HoodieWriteClient implements Seriali }, true, config.getBulkInsertShuffleParallelism()); } JavaRDD writeStatusRDD = repartitionedRecords - .mapPartitionsWithIndex(new BulkInsertMapFunction(commitTime, config, table), true) - .flatMap(writeStatuses -> writeStatuses.iterator()); + .mapPartitionsWithIndex(new BulkInsertMapFunction(commitTime, config, table), + true) + .flatMap(writeStatuses -> writeStatuses.iterator()); return updateIndexAndCommitIfNeeded(writeStatusRDD, table, commitTime); } catch (Throwable e) { if (e instanceof HoodieInsertException) { throw e; } - throw new HoodieInsertException("Failed to bulk insert for commit time " + commitTime, 
e); - } + throw new HoodieInsertException("Failed to bulk insert for commit time " + commitTime, + e); + } } private void commitOnAutoCommit(String commitTime, JavaRDD resultRDD) { - if(config.shouldAutoCommit()) { + if (config.shouldAutoCommit()) { logger.info("Auto commit enabled: Committing " + commitTime); boolean commitResult = commit(commitTime, resultRDD); if (!commitResult) { @@ -280,30 +286,28 @@ public class HoodieWriteClient implements Seriali } } else { logger.info("Auto commit disabled for " + commitTime); - } + } } private JavaRDD> combineOnCondition(boolean condition, - JavaRDD> records, - int parallelism) { - if(condition) { + JavaRDD> records, + int parallelism) { + if (condition) { return deduplicateRecords(records, parallelism); } return records; } /** - * - * Save the workload profile in an intermediate file (here re-using commit files) - * This is useful when performing rollback for MOR datasets. Only updates are recorded - * in the workload profile metadata since updates to log blocks are unknown across batches - * Inserts (which are new parquet files) are rolled back based on commit time. - * // TODO : Create a new WorkloadProfile metadata file instead of using HoodieCommitMetadata - * @param profile - * @param commitTime - * @throws HoodieCommitException + * Save the workload profile in an intermediate file (here re-using commit files) This is useful + * when performing rollback for MOR datasets. Only updates are recorded in the workload profile + * metadata since updates to log blocks are unknown across batches Inserts (which are new parquet + * files) are rolled back based on commit time. 
// TODO : Create a new WorkloadProfile metadata + * file instead of using HoodieCommitMetadata */ - private void saveWorkloadProfileMetadataToInflight(WorkloadProfile profile, HoodieTable table, String commitTime) throws HoodieCommitException { + private void saveWorkloadProfileMetadataToInflight(WorkloadProfile profile, + HoodieTable table, + String commitTime) throws HoodieCommitException { try { HoodieCommitMetadata metadata = new HoodieCommitMetadata(); profile.getPartitionPaths().stream().forEach(path -> { @@ -319,16 +323,17 @@ public class HoodieWriteClient implements Seriali HoodieActiveTimeline activeTimeline = table.getActiveTimeline(); Optional instant = activeTimeline.filterInflights().lastInstant(); activeTimeline.saveToInflight(instant.get(), - Optional.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8))); - } catch(IOException io) { - throw new HoodieCommitException("Failed to commit " + commitTime + " unable to save inflight metadata ", io); - } + Optional.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8))); + } catch (IOException io) { + throw new HoodieCommitException( + "Failed to commit " + commitTime + " unable to save inflight metadata ", io); + } } private JavaRDD upsertRecordsInternal(JavaRDD> preppedRecords, - String commitTime, - HoodieTable hoodieTable, - final boolean isUpsert) { + String commitTime, + HoodieTable hoodieTable, + final boolean isUpsert) { // Cache the tagged records, so we don't end up computing both preppedRecords.persist(StorageLevel.MEMORY_AND_DISK_SER()); @@ -344,29 +349,31 @@ public class HoodieWriteClient implements Seriali final Partitioner partitioner = getPartitioner(hoodieTable, isUpsert, profile); JavaRDD> partitionedRecords = partition(preppedRecords, partitioner); JavaRDD writeStatusRDD = partitionedRecords - .mapPartitionsWithIndex((partition, recordItr) -> { - if (isUpsert) { - return hoodieTable - .handleUpsertPartition(commitTime, partition, recordItr, partitioner); - } else { - return 
hoodieTable - .handleInsertPartition(commitTime, partition, recordItr, partitioner); - } - }, true) - .flatMap(writeStatuses -> writeStatuses.iterator()); + .mapPartitionsWithIndex((partition, recordItr) -> { + if (isUpsert) { + return hoodieTable + .handleUpsertPartition(commitTime, partition, recordItr, partitioner); + } else { + return hoodieTable + .handleInsertPartition(commitTime, partition, recordItr, partitioner); + } + }, true) + .flatMap(writeStatuses -> writeStatuses.iterator()); return updateIndexAndCommitIfNeeded(writeStatusRDD, hoodieTable, commitTime); } - private Partitioner getPartitioner(HoodieTable table, boolean isUpsert, WorkloadProfile profile) { + private Partitioner getPartitioner(HoodieTable table, boolean isUpsert, + WorkloadProfile profile) { if (isUpsert) { return table.getUpsertPartitioner(profile); } else { return table.getInsertPartitioner(profile); - } + } } - private JavaRDD updateIndexAndCommitIfNeeded(JavaRDD writeStatusRDD, HoodieTable table, String commitTime) { + private JavaRDD updateIndexAndCommitIfNeeded(JavaRDD writeStatusRDD, + HoodieTable table, String commitTime) { // Update the index back JavaRDD statuses = index.updateLocation(writeStatusRDD, table); // Trigger the insert and collect statuses @@ -375,12 +382,15 @@ public class HoodieWriteClient implements Seriali return statuses; } - private JavaRDD> partition(JavaRDD> dedupedRecords, Partitioner partitioner) { + private JavaRDD> partition(JavaRDD> dedupedRecords, + Partitioner partitioner) { return dedupedRecords - .mapToPair(record -> - new Tuple2<>(new Tuple2<>(record.getKey(), Option.apply(record.getCurrentLocation())), record)) - .partitionBy(partitioner) - .map(tuple -> tuple._2()); + .mapToPair(record -> + new Tuple2<>( + new Tuple2<>(record.getKey(), Option.apply(record.getCurrentLocation())), + record)) + .partitionBy(partitioner) + .map(tuple -> tuple._2()); } /** @@ -394,8 +404,8 @@ public class HoodieWriteClient implements Seriali * Commit changes performed 
at the given commitTime marker */ public boolean commit(String commitTime, - JavaRDD writeStatuses, - Optional> extraMetadata) { + JavaRDD writeStatuses, + Optional> extraMetadata) { logger.info("Commiting " + commitTime); // Create a Hoodie table which encapsulated the commits and files visible @@ -405,9 +415,9 @@ public class HoodieWriteClient implements Seriali HoodieActiveTimeline activeTimeline = table.getActiveTimeline(); List> stats = writeStatuses - .mapToPair((PairFunction) writeStatus -> - new Tuple2<>(writeStatus.getPartitionPath(), writeStatus.getStat())) - .collect(); + .mapToPair((PairFunction) writeStatus -> + new Tuple2<>(writeStatus.getPartitionPath(), writeStatus.getStat())) + .collect(); HoodieCommitMetadata metadata = new HoodieCommitMetadata(); for (Tuple2 stat : stats) { @@ -438,7 +448,7 @@ public class HoodieWriteClient implements Seriali // We cannot have unbounded commit files. Archive commits if we have to archive archiveLog.archiveIfRequired(); - if(config.isAutoClean()) { + if (config.isAutoClean()) { // Call clean to cleanup if there is anything to cleanup after the commit, logger.info("Auto cleaning is enabled. Running cleaner now"); clean(commitTime); @@ -465,12 +475,12 @@ public class HoodieWriteClient implements Seriali } /** - * Savepoint a specific commit. Latest version of data files as of the passed in commitTime - * will be referenced in the savepoint and will never be cleaned. The savepointed commit - * will never be rolledback or archived. + * Savepoint a specific commit. Latest version of data files as of the passed in commitTime will + * be referenced in the savepoint and will never be cleaned. The savepointed commit will never be + * rolledback or archived. * - * This gives an option to rollback the state to the savepoint anytime. - * Savepoint needs to be manually created and deleted. + * This gives an option to rollback the state to the savepoint anytime. Savepoint needs to be + * manually created and deleted. 
* * Savepoint should be on a commit that could not have been cleaned. * @@ -491,12 +501,12 @@ public class HoodieWriteClient implements Seriali } /** - * Savepoint a specific commit. Latest version of data files as of the passed in commitTime - * will be referenced in the savepoint and will never be cleaned. The savepointed commit - * will never be rolledback or archived. + * Savepoint a specific commit. Latest version of data files as of the passed in commitTime will + * be referenced in the savepoint and will never be cleaned. The savepointed commit will never be + * rolledback or archived. * - * This gives an option to rollback the state to the savepoint anytime. - * Savepoint needs to be manually created and deleted. + * This gives an option to rollback the state to the savepoint anytime. Savepoint needs to be + * manually created and deleted. * * Savepoint should be on a commit that could not have been cleaned. * @@ -510,9 +520,11 @@ public class HoodieWriteClient implements Seriali .getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config); Optional cleanInstant = table.getCompletedCleanTimeline().lastInstant(); - HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime); - if(!table.getCompletedCommitTimeline().containsInstant(commitInstant)) { - throw new HoodieSavepointException("Could not savepoint non-existing commit " + commitInstant); + HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, + commitTime); + if (!table.getCompletedCommitTimeline().containsInstant(commitInstant)) { + throw new HoodieSavepointException( + "Could not savepoint non-existing commit " + commitInstant); } try { @@ -534,7 +546,8 @@ public class HoodieWriteClient implements Seriali + lastCommitRetained); Map> latestFilesMap = jsc.parallelize( - FSUtils.getAllPartitionPaths(fs, table.getMetaClient().getBasePath(), config.shouldAssumeDatePartitioning())) + FSUtils.getAllPartitionPaths(fs, 
table.getMetaClient().getBasePath(), + config.shouldAssumeDatePartitioning())) .mapToPair((PairFunction>) partitionPath -> { // Scan all partitions files with this commit time logger.info("Collecting latest files in partition path " + partitionPath); @@ -555,12 +568,12 @@ public class HoodieWriteClient implements Seriali return true; } catch (IOException e) { throw new HoodieSavepointException("Failed to savepoint " + commitTime, e); - } + } } /** - * Delete a savepoint that was created. Once the savepoint is deleted, the commit can be rolledback - * and cleaner may clean up data files. + * Delete a savepoint that was created. Once the savepoint is deleted, the commit can be + * rolledback and cleaner may clean up data files. * * @param savepointTime - delete the savepoint * @return true if the savepoint was deleted successfully @@ -586,9 +599,8 @@ public class HoodieWriteClient implements Seriali } /** - * Rollback the state to the savepoint. - * WARNING: This rollsback recent commits and deleted data files. Queries accessing the files - * will mostly fail. This should be done during a downtime. + * Rollback the state to the savepoint. WARNING: This rollsback recent commits and deleted data + * files. Queries accessing the files will mostly fail. This should be done during a downtime. 
* * @param savepointTime - savepoint time to rollback to * @return true if the savepoint was rollecback to successfully @@ -616,7 +628,8 @@ public class HoodieWriteClient implements Seriali // Make sure the rollback was successful Optional lastInstant = - activeTimeline.reload().getCommitsAndCompactionsTimeline().filterCompletedInstants().lastInstant(); + activeTimeline.reload().getCommitsAndCompactionsTimeline().filterCompletedInstants() + .lastInstant(); Preconditions.checkArgument(lastInstant.isPresent()); Preconditions.checkArgument(lastInstant.get().getTimestamp().equals(savepointTime), savepointTime + "is not the last commit after rolling back " + commitsToRollback @@ -625,12 +638,9 @@ public class HoodieWriteClient implements Seriali } /** - * Rollback the (inflight/committed) record changes with the given commit time. - * Three steps: - * (1) Atomically unpublish this commit - * (2) clean indexing data, - * (3) clean new generated parquet files. - * (4) Finally delete .commit or .inflight file, + * Rollback the (inflight/committed) record changes with the given commit time. Three steps: (1) + * Atomically unpublish this commit (2) clean indexing data, (3) clean new generated parquet + * files. 
(4) Finally delete .commit or .inflight file, */ public boolean rollback(final String commitTime) throws HoodieRollbackException { rollback(Lists.newArrayList(commitTime)); @@ -638,7 +648,7 @@ public class HoodieWriteClient implements Seriali } private void rollback(List commits) { - if(commits.isEmpty()) { + if (commits.isEmpty()) { logger.info("List of commits to rollback is empty"); return; } @@ -702,7 +712,9 @@ public class HoodieWriteClient implements Seriali Optional durationInMs = Optional.empty(); if (context != null) { durationInMs = Optional.of(metrics.getDurationInMs(context.stop())); - Long numFilesDeleted = stats.stream().mapToLong(stat -> stat.getSuccessDeleteFiles().size()).sum(); + Long numFilesDeleted = stats.stream() + .mapToLong(stat -> stat.getSuccessDeleteFiles().size()) + .sum(); metrics.updateRollbackMetrics(durationInMs.get(), numFilesDeleted); } HoodieRollbackMetadata rollbackMetadata = @@ -722,7 +734,7 @@ public class HoodieWriteClient implements Seriali } catch (IOException e) { throw new HoodieRollbackException("Failed to rollback " + config.getBasePath() + " commits " + commits, e); - } + } } /** @@ -733,9 +745,9 @@ public class HoodieWriteClient implements Seriali } /** - * Clean up any stale/old files/data lying around (either on file storage or index storage) - * based on the configurations and CleaningPolicy used. (typically files that no longer can be used - * by a running query can be cleaned) + * Clean up any stale/old files/data lying around (either on file storage or index storage) based + * on the configurations and CleaningPolicy used. 
(typically files that no longer can be used by a + * running query can be cleaned) */ public void clean() throws HoodieIOException { String startCleanTime = HoodieActiveTimeline.createNewCommitTime(); @@ -743,11 +755,11 @@ public class HoodieWriteClient implements Seriali } /** - * Clean up any stale/old files/data lying around (either on file storage or index storage) - * based on the configurations and CleaningPolicy used. (typically files that no longer can be used - * by a running query can be cleaned) + * Clean up any stale/old files/data lying around (either on file storage or index storage) based + * on the configurations and CleaningPolicy used. (typically files that no longer can be used by a + * running query can be cleaned) */ - private void clean(String startCleanTime) throws HoodieIOException { + private void clean(String startCleanTime) throws HoodieIOException { try { logger.info("Cleaner started"); final Timer.Context context = metrics.getCleanCtx(); @@ -788,7 +800,7 @@ public class HoodieWriteClient implements Seriali } } catch (IOException e) { throw new HoodieIOException("Failed to clean up after commit", e); - } + } } /** @@ -811,30 +823,30 @@ public class HoodieWriteClient implements Seriali } public static SparkConf registerClasses(SparkConf conf) { - conf.registerKryoClasses(new Class[]{HoodieWriteConfig.class, HoodieRecord.class, HoodieKey.class}); + conf.registerKryoClasses( + new Class[]{HoodieWriteConfig.class, HoodieRecord.class, HoodieKey.class}); return conf; } /** * Deduplicate Hoodie records, using the given deduplication funciton. 
*/ - private JavaRDD> deduplicateRecords(JavaRDD> records, int parallelism) { + private JavaRDD> deduplicateRecords(JavaRDD> records, + int parallelism) { return records - .mapToPair(record -> new Tuple2<>(record.getKey(), record)) - .reduceByKey((rec1, rec2) -> { - @SuppressWarnings("unchecked") - T reducedData = (T) rec1.getData().preCombine(rec2.getData()); - // we cannot allow the user to change the key or partitionPath, since that will affect everything - // so pick it from one of the records. - return new HoodieRecord(rec1.getKey(), reducedData); - }, parallelism) - .map(recordTuple -> recordTuple._2()); + .mapToPair(record -> new Tuple2<>(record.getKey(), record)) + .reduceByKey((rec1, rec2) -> { + @SuppressWarnings("unchecked") + T reducedData = (T) rec1.getData().preCombine(rec2.getData()); + // we cannot allow the user to change the key or partitionPath, since that will affect everything + // so pick it from one of the records. + return new HoodieRecord(rec1.getKey(), reducedData); + }, parallelism) + .map(recordTuple -> recordTuple._2()); } /** * Cleanup all inflight commits - * - * @throws IOException */ private void rollbackInflightCommits() { HoodieTable table = HoodieTable diff --git a/hoodie-client/src/main/java/com/uber/hoodie/WriteStatus.java b/hoodie-client/src/main/java/com/uber/hoodie/WriteStatus.java index 302a0fcd2..7854e128e 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/WriteStatus.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/WriteStatus.java @@ -19,7 +19,6 @@ package com.uber.hoodie; import com.uber.hoodie.common.model.HoodieKey; import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieWriteStat; - import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; @@ -32,124 +31,130 @@ import java.util.Optional; */ public class WriteStatus implements Serializable { - private final HashMap errors = new HashMap<>(); + private final HashMap errors = new HashMap<>(); - 
private final List writtenRecords = new ArrayList<>(); + private final List writtenRecords = new ArrayList<>(); - private final List failedRecords = new ArrayList<>(); + private final List failedRecords = new ArrayList<>(); - private Throwable globalError = null; + private Throwable globalError = null; - private String fileId = null; + private String fileId = null; - private String partitionPath = null; + private String partitionPath = null; - private HoodieWriteStat stat = null; + private HoodieWriteStat stat = null; - private long totalRecords = 0; - private long totalErrorRecords = 0; + private long totalRecords = 0; + private long totalErrorRecords = 0; - /** - * Mark write as success, optionally using given parameters for the purpose of calculating - * some aggregate metrics. This method is not meant to cache passed arguments, since WriteStatus - * objects are collected in Spark Driver. - * - * @param record deflated {@code HoodieRecord} containing information that uniquely identifies it. - * @param optionalRecordMetadata optional metadata related to data contained in {@link HoodieRecord} before deflation. - */ - public void markSuccess(HoodieRecord record, - Optional> optionalRecordMetadata) { - writtenRecords.add(record); - totalRecords++; - } + /** + * Mark write as success, optionally using given parameters for the purpose of calculating some + * aggregate metrics. This method is not meant to cache passed arguments, since WriteStatus + * objects are collected in Spark Driver. + * + * @param record deflated {@code HoodieRecord} containing information that uniquely identifies + * it. + * @param optionalRecordMetadata optional metadata related to data contained in {@link + * HoodieRecord} before deflation. 
+ */ + public void markSuccess(HoodieRecord record, + Optional> optionalRecordMetadata) { + writtenRecords.add(record); + totalRecords++; + } - /** - * Mark write as failed, optionally using given parameters for the purpose of calculating - * some aggregate metrics. This method is not meant to cache passed arguments, since WriteStatus - * objects are collected in Spark Driver. - * - * @param record deflated {@code HoodieRecord} containing information that uniquely identifies it. - * @param optionalRecordMetadata optional metadata related to data contained in {@link HoodieRecord} before deflation. - */ - public void markFailure(HoodieRecord record, Throwable t, - Optional> optionalRecordMetadata) { - failedRecords.add(record); - errors.put(record.getKey(), t); - totalRecords++; - totalErrorRecords++; - } + /** + * Mark write as failed, optionally using given parameters for the purpose of calculating some + * aggregate metrics. This method is not meant to cache passed arguments, since WriteStatus + * objects are collected in Spark Driver. + * + * @param record deflated {@code HoodieRecord} containing information that uniquely identifies + * it. + * @param optionalRecordMetadata optional metadata related to data contained in {@link + * HoodieRecord} before deflation. 
+ */ + public void markFailure(HoodieRecord record, Throwable t, + Optional> optionalRecordMetadata) { + failedRecords.add(record); + errors.put(record.getKey(), t); + totalRecords++; + totalErrorRecords++; + } - public String getFileId() { - return fileId; - } + public String getFileId() { + return fileId; + } - public void setFileId(String fileId) { - this.fileId = fileId; - } + public void setFileId(String fileId) { + this.fileId = fileId; + } - public boolean hasErrors() { - return totalErrorRecords > 0; - } + public boolean hasErrors() { + return totalErrorRecords > 0; + } - public boolean isErrored(HoodieKey key) { - return errors.containsKey(key); - } + public boolean isErrored(HoodieKey key) { + return errors.containsKey(key); + } - public HashMap getErrors() { - return errors; - } + public HashMap getErrors() { + return errors; + } - public boolean hasGlobalError() { - return globalError != null; - } + public boolean hasGlobalError() { + return globalError != null; + } - public void setGlobalError(Throwable t) { - this.globalError = t; - } + public void setGlobalError(Throwable t) { + this.globalError = t; + } - public Throwable getGlobalError() { - return this.globalError; - } + public Throwable getGlobalError() { + return this.globalError; + } - public List getWrittenRecords() { - return writtenRecords; - } + public List getWrittenRecords() { + return writtenRecords; + } - public List getFailedRecords() { - return failedRecords; - } + public List getFailedRecords() { + return failedRecords; + } - public HoodieWriteStat getStat() { - return stat; - } + public HoodieWriteStat getStat() { + return stat; + } - public void setStat(HoodieWriteStat stat) { - this.stat = stat; - } + public void setStat(HoodieWriteStat stat) { + this.stat = stat; + } - public String getPartitionPath() { - return partitionPath; - } + public String getPartitionPath() { + return partitionPath; + } - public void setPartitionPath(String partitionPath) { - this.partitionPath = 
partitionPath; - } + public void setPartitionPath(String partitionPath) { + this.partitionPath = partitionPath; + } - public long getTotalRecords() { - return totalRecords; - } + public long getTotalRecords() { + return totalRecords; + } - public long getTotalErrorRecords() { return totalErrorRecords; } + public long getTotalErrorRecords() { + return totalErrorRecords; + } - @Override - public String toString() { - final StringBuilder sb = new StringBuilder("WriteStatus {"); - sb.append("fileId=").append(fileId); - sb.append(", globalError='").append(globalError).append('\''); - sb.append(", hasErrors='").append(hasErrors()).append('\''); - sb.append(", errorCount='").append(totalErrorRecords).append('\''); - sb.append(", errorPct='").append((100.0 * totalErrorRecords) / totalRecords).append('\''); - sb.append('}'); - return sb.toString(); - } + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("WriteStatus {"); + sb.append("fileId=").append(fileId); + sb.append(", globalError='").append(globalError).append('\''); + sb.append(", hasErrors='").append(hasErrors()).append('\''); + sb.append(", errorCount='").append(totalErrorRecords).append('\''); + sb.append(", errorPct='").append((100.0 * totalErrorRecords) / totalRecords).append('\''); + sb.append('}'); + return sb.toString(); + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/config/DefaultHoodieConfig.java b/hoodie-client/src/main/java/com/uber/hoodie/config/DefaultHoodieConfig.java index bf363a38e..948a1e00e 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/config/DefaultHoodieConfig.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/config/DefaultHoodieConfig.java @@ -17,33 +17,35 @@ package com.uber.hoodie.config; import java.io.Serializable; -import java.util.Map; import java.util.Properties; /** * Default Way to load Hoodie config through a java.util.Properties */ public class DefaultHoodieConfig implements Serializable { - protected final Properties 
props; - public DefaultHoodieConfig(Properties props) { - this.props = props; - } - public Properties getProps() { - return props; - } + protected final Properties props; - public static void setDefaultOnCondition(Properties props, boolean condition, String propName, - String defaultValue) { - if (condition) { - props.setProperty(propName, defaultValue); - } - } + public DefaultHoodieConfig(Properties props) { + this.props = props; + } - public static void setDefaultOnCondition(Properties props, boolean condition, DefaultHoodieConfig config) { - if (condition) { - props.putAll(config.getProps()); - } + public Properties getProps() { + return props; + } + + public static void setDefaultOnCondition(Properties props, boolean condition, String propName, + String defaultValue) { + if (condition) { + props.setProperty(propName, defaultValue); } + } + + public static void setDefaultOnCondition(Properties props, boolean condition, + DefaultHoodieConfig config) { + if (condition) { + props.putAll(config.getProps()); + } + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/config/HoodieCompactionConfig.java b/hoodie-client/src/main/java/com/uber/hoodie/config/HoodieCompactionConfig.java index d47dd1d52..39f076e5a 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/config/HoodieCompactionConfig.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/config/HoodieCompactionConfig.java @@ -19,231 +19,239 @@ package com.uber.hoodie.config; import com.google.common.base.Preconditions; import com.uber.hoodie.common.model.HoodieAvroPayload; import com.uber.hoodie.common.model.HoodieCleaningPolicy; -import com.uber.hoodie.common.model.HoodieRecordPayload; import com.uber.hoodie.io.compact.strategy.CompactionStrategy; import com.uber.hoodie.io.compact.strategy.LogFileSizeBasedCompactionStrategy; - -import javax.annotation.concurrent.Immutable; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.util.Properties; +import 
javax.annotation.concurrent.Immutable; /** * Compaction related config */ @Immutable public class HoodieCompactionConfig extends DefaultHoodieConfig { - public static final String CLEANER_POLICY_PROP = "hoodie.cleaner.policy"; - private static final String DEFAULT_CLEANER_POLICY = - HoodieCleaningPolicy.KEEP_LATEST_COMMITS.name(); - public static final String AUTO_CLEAN_PROP = "hoodie.clean.automatic"; - private static final String DEFAULT_AUTO_CLEAN = "true"; + public static final String CLEANER_POLICY_PROP = "hoodie.cleaner.policy"; + private static final String DEFAULT_CLEANER_POLICY = + HoodieCleaningPolicy.KEEP_LATEST_COMMITS.name(); - // Turn on inline compaction - after fw delta commits a inline compaction will be run - public static final String INLINE_COMPACT_PROP = "hoodie.compact.inline"; - private static final String DEFAULT_INLINE_COMPACT = "true"; + public static final String AUTO_CLEAN_PROP = "hoodie.clean.automatic"; + private static final String DEFAULT_AUTO_CLEAN = "true"; - // Run a compaction every N delta commits - public static final String INLINE_COMPACT_NUM_DELTA_COMMITS_PROP = "hoodie.compact.inline.max.delta.commits"; - private static final String DEFAULT_INLINE_COMPACT_NUM_DELTA_COMMITS = "10"; + // Turn on inline compaction - after fw delta commits a inline compaction will be run + public static final String INLINE_COMPACT_PROP = "hoodie.compact.inline"; + private static final String DEFAULT_INLINE_COMPACT = "true"; - public static final String CLEANER_FILE_VERSIONS_RETAINED_PROP = - "hoodie.cleaner.fileversions.retained"; - private static final String DEFAULT_CLEANER_FILE_VERSIONS_RETAINED = "3"; + // Run a compaction every N delta commits + public static final String INLINE_COMPACT_NUM_DELTA_COMMITS_PROP = "hoodie.compact.inline.max.delta.commits"; + private static final String DEFAULT_INLINE_COMPACT_NUM_DELTA_COMMITS = "10"; - public static final String CLEANER_COMMITS_RETAINED_PROP = "hoodie.cleaner.commits.retained"; - private 
static final String DEFAULT_CLEANER_COMMITS_RETAINED = "24"; + public static final String CLEANER_FILE_VERSIONS_RETAINED_PROP = + "hoodie.cleaner.fileversions.retained"; + private static final String DEFAULT_CLEANER_FILE_VERSIONS_RETAINED = "3"; - public static final String MAX_COMMITS_TO_KEEP = "hoodie.keep.max.commits"; - private static final String DEFAULT_MAX_COMMITS_TO_KEEP = String.valueOf(128); - public static final String MIN_COMMITS_TO_KEEP = "hoodie.keep.min.commits"; - private static final String DEFAULT_MIN_COMMITS_TO_KEEP = String.valueOf(96); - // Upsert uses this file size to compact new data onto existing files.. - public static final String PARQUET_SMALL_FILE_LIMIT_BYTES = "hoodie.parquet.small.file.limit"; - // Turned off by default - public static final String DEFAULT_PARQUET_SMALL_FILE_LIMIT_BYTES = String.valueOf(0); + public static final String CLEANER_COMMITS_RETAINED_PROP = "hoodie.cleaner.commits.retained"; + private static final String DEFAULT_CLEANER_COMMITS_RETAINED = "24"; + + public static final String MAX_COMMITS_TO_KEEP = "hoodie.keep.max.commits"; + private static final String DEFAULT_MAX_COMMITS_TO_KEEP = String.valueOf(128); + public static final String MIN_COMMITS_TO_KEEP = "hoodie.keep.min.commits"; + private static final String DEFAULT_MIN_COMMITS_TO_KEEP = String.valueOf(96); + // Upsert uses this file size to compact new data onto existing files.. + public static final String PARQUET_SMALL_FILE_LIMIT_BYTES = "hoodie.parquet.small.file.limit"; + // Turned off by default + public static final String DEFAULT_PARQUET_SMALL_FILE_LIMIT_BYTES = String.valueOf(0); - /** Configs related to specific table types **/ - // Number of inserts, that will be put each partition/bucket for writing - public static final String COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE = "hoodie.copyonwrite.insert.split.size"; - // The rationale to pick the insert parallelism is the following. 
Writing out 100MB files, - // with atleast 1kb records, means 100K records per file. we just overprovision to 500K - public static final String DEFAULT_COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE = String.valueOf(500000); + /** + * Configs related to specific table types + **/ + // Number of inserts, that will be put each partition/bucket for writing + public static final String COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE = "hoodie.copyonwrite.insert.split.size"; + // The rationale to pick the insert parallelism is the following. Writing out 100MB files, + // with atleast 1kb records, means 100K records per file. we just overprovision to 500K + public static final String DEFAULT_COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE = String.valueOf(500000); - // Config to control whether we control insert split sizes automatically based on average record sizes - public static final String COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS = "hoodie.copyonwrite.insert.auto.split"; - // its off by default - public static final String DEFAULT_COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS = String.valueOf(false); + // Config to control whether we control insert split sizes automatically based on average record sizes + public static final String COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS = "hoodie.copyonwrite.insert.auto.split"; + // its off by default + public static final String DEFAULT_COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS = String.valueOf(false); - // This value is used as a guessimate for the record size, if we can't determine this from previous commits - public static final String COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = "hoodie.copyonwrite.record.size.estimate"; - // Used to determine how much more can be packed into a small file, before it exceeds the size limit. 
- public static final String DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = String.valueOf(1024); + // This value is used as a guessimate for the record size, if we can't determine this from previous commits + public static final String COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = "hoodie.copyonwrite.record.size.estimate"; + // Used to determine how much more can be packed into a small file, before it exceeds the size limit. + public static final String DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = String + .valueOf(1024); - public static final String CLEANER_PARALLELISM = "hoodie.cleaner.parallelism"; - public static final String DEFAULT_CLEANER_PARALLELISM = String.valueOf(200); + public static final String CLEANER_PARALLELISM = "hoodie.cleaner.parallelism"; + public static final String DEFAULT_CLEANER_PARALLELISM = String.valueOf(200); - public static final String TARGET_IO_PER_COMPACTION_IN_MB_PROP = "hoodie.compaction.target.io"; - // 500GB of target IO per compaction (both read and write) - public static final String DEFAULT_TARGET_IO_PER_COMPACTION_IN_MB = String.valueOf(500 * 1024); + public static final String TARGET_IO_PER_COMPACTION_IN_MB_PROP = "hoodie.compaction.target.io"; + // 500GB of target IO per compaction (both read and write) + public static final String DEFAULT_TARGET_IO_PER_COMPACTION_IN_MB = String.valueOf(500 * 1024); - public static final String COMPACTION_STRATEGY_PROP = "hoodie.compaction.strategy"; - // 200GB of target IO per compaction - public static final String DEFAULT_COMPACTION_STRATEGY = LogFileSizeBasedCompactionStrategy.class.getName(); + public static final String COMPACTION_STRATEGY_PROP = "hoodie.compaction.strategy"; + // 200GB of target IO per compaction + public static final String DEFAULT_COMPACTION_STRATEGY = LogFileSizeBasedCompactionStrategy.class + .getName(); - // used to merge records written to log file - public static final String DEFAULT_PAYLOAD_CLASS = HoodieAvroPayload.class.getName(); - public static 
final String PAYLOAD_CLASS = "hoodie.compaction.payload.class"; + // used to merge records written to log file + public static final String DEFAULT_PAYLOAD_CLASS = HoodieAvroPayload.class.getName(); + public static final String PAYLOAD_CLASS = "hoodie.compaction.payload.class"; - private HoodieCompactionConfig(Properties props) { - super(props); + private HoodieCompactionConfig(Properties props) { + super(props); + } + + public static HoodieCompactionConfig.Builder newBuilder() { + return new Builder(); + } + + public static class Builder { + + private final Properties props = new Properties(); + + public Builder fromFile(File propertiesFile) throws IOException { + FileReader reader = new FileReader(propertiesFile); + try { + this.props.load(reader); + return this; + } finally { + reader.close(); + } } - public static HoodieCompactionConfig.Builder newBuilder() { - return new Builder(); + public Builder fromProperties(Properties props) { + this.props.putAll(props); + return this; } - public static class Builder { - private final Properties props = new Properties(); - - public Builder fromFile(File propertiesFile) throws IOException { - FileReader reader = new FileReader(propertiesFile); - try { - this.props.load(reader); - return this; - } finally { - reader.close(); - } - } - - public Builder fromProperties(Properties props) { - this.props.putAll(props); - return this; - } - - - public Builder withAutoClean(Boolean autoClean) { - props.setProperty(AUTO_CLEAN_PROP, String.valueOf(autoClean)); - return this; - } - - public Builder withInlineCompaction(Boolean inlineCompaction) { - props.setProperty(INLINE_COMPACT_PROP, String.valueOf(inlineCompaction)); - return this; - } - - public Builder inlineCompactionEvery(int deltaCommits) { - props.setProperty(INLINE_COMPACT_PROP, String.valueOf(deltaCommits)); - return this; - } - - public Builder withCleanerPolicy(HoodieCleaningPolicy policy) { - props.setProperty(CLEANER_POLICY_PROP, policy.name()); - return this; - } - - 
public Builder retainFileVersions(int fileVersionsRetained) { - props.setProperty(CLEANER_FILE_VERSIONS_RETAINED_PROP, - String.valueOf(fileVersionsRetained)); - return this; - } - - public Builder retainCommits(int commitsRetained) { - props.setProperty(CLEANER_COMMITS_RETAINED_PROP, String.valueOf(commitsRetained)); - return this; - } - - public Builder archiveCommitsWith(int minToKeep, int maxToKeep) { - props.setProperty(MIN_COMMITS_TO_KEEP, String.valueOf(minToKeep)); - props.setProperty(MAX_COMMITS_TO_KEEP, String.valueOf(maxToKeep)); - return this; - } - - public Builder compactionSmallFileSize(long smallFileLimitBytes) { - props.setProperty(PARQUET_SMALL_FILE_LIMIT_BYTES, String.valueOf(smallFileLimitBytes)); - return this; - } - - public Builder insertSplitSize(int insertSplitSize) { - props.setProperty(COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE, String.valueOf(insertSplitSize)); - return this; - } - - public Builder autoTuneInsertSplits(boolean autoTuneInsertSplits) { - props.setProperty(COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS, String.valueOf(autoTuneInsertSplits)); - return this; - } - - public Builder approxRecordSize(int recordSizeEstimate) { - props.setProperty(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE, String.valueOf(recordSizeEstimate)); - return this; - } - - public Builder withCleanerParallelism(int cleanerParallelism) { - props.setProperty(CLEANER_PARALLELISM, String.valueOf(cleanerParallelism)); - return this; - } - - public Builder withCompactionStrategy(CompactionStrategy compactionStrategy) { - props.setProperty(COMPACTION_STRATEGY_PROP, compactionStrategy.getClass().getName()); - return this; - } - - public Builder withPayloadClass(String payloadClassName) { - props.setProperty(PAYLOAD_CLASS, payloadClassName); - return this; - } - - public Builder withTargetIOPerCompactionInMB(long targetIOPerCompactionInMB) { - props.setProperty(TARGET_IO_PER_COMPACTION_IN_MB_PROP, String.valueOf(targetIOPerCompactionInMB)); - return this; - } - - public 
HoodieCompactionConfig build() { - HoodieCompactionConfig config = new HoodieCompactionConfig(props); - setDefaultOnCondition(props, !props.containsKey(AUTO_CLEAN_PROP), - AUTO_CLEAN_PROP, DEFAULT_AUTO_CLEAN); - setDefaultOnCondition(props, !props.containsKey(INLINE_COMPACT_PROP), - INLINE_COMPACT_PROP, DEFAULT_INLINE_COMPACT); - setDefaultOnCondition(props, !props.containsKey(INLINE_COMPACT_NUM_DELTA_COMMITS_PROP), - INLINE_COMPACT_NUM_DELTA_COMMITS_PROP, DEFAULT_INLINE_COMPACT_NUM_DELTA_COMMITS); - setDefaultOnCondition(props, !props.containsKey(CLEANER_POLICY_PROP), - CLEANER_POLICY_PROP, DEFAULT_CLEANER_POLICY); - setDefaultOnCondition(props, !props.containsKey(CLEANER_FILE_VERSIONS_RETAINED_PROP), - CLEANER_FILE_VERSIONS_RETAINED_PROP, DEFAULT_CLEANER_FILE_VERSIONS_RETAINED); - setDefaultOnCondition(props, !props.containsKey(CLEANER_COMMITS_RETAINED_PROP), - CLEANER_COMMITS_RETAINED_PROP, DEFAULT_CLEANER_COMMITS_RETAINED); - setDefaultOnCondition(props, !props.containsKey(MAX_COMMITS_TO_KEEP), - MAX_COMMITS_TO_KEEP, DEFAULT_MAX_COMMITS_TO_KEEP); - setDefaultOnCondition(props, !props.containsKey(MIN_COMMITS_TO_KEEP), - MIN_COMMITS_TO_KEEP, DEFAULT_MIN_COMMITS_TO_KEEP); - setDefaultOnCondition(props, !props.containsKey(PARQUET_SMALL_FILE_LIMIT_BYTES), - PARQUET_SMALL_FILE_LIMIT_BYTES, DEFAULT_PARQUET_SMALL_FILE_LIMIT_BYTES); - setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE), - COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE, DEFAULT_COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE); - setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS), - COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS, DEFAULT_COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS); - setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE), - COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE, DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE); - setDefaultOnCondition(props, !props.containsKey(CLEANER_PARALLELISM), - CLEANER_PARALLELISM, 
DEFAULT_CLEANER_PARALLELISM); - setDefaultOnCondition(props, !props.containsKey(COMPACTION_STRATEGY_PROP), - COMPACTION_STRATEGY_PROP, DEFAULT_COMPACTION_STRATEGY); - setDefaultOnCondition(props, !props.containsKey(PAYLOAD_CLASS), - PAYLOAD_CLASS, DEFAULT_PAYLOAD_CLASS); - setDefaultOnCondition(props, !props.containsKey(TARGET_IO_PER_COMPACTION_IN_MB_PROP), - TARGET_IO_PER_COMPACTION_IN_MB_PROP, DEFAULT_TARGET_IO_PER_COMPACTION_IN_MB); - - HoodieCleaningPolicy.valueOf(props.getProperty(CLEANER_POLICY_PROP)); - Preconditions.checkArgument( - Integer.parseInt(props.getProperty(MAX_COMMITS_TO_KEEP)) > Integer - .parseInt(props.getProperty(MIN_COMMITS_TO_KEEP))); - return config; - } + public Builder withAutoClean(Boolean autoClean) { + props.setProperty(AUTO_CLEAN_PROP, String.valueOf(autoClean)); + return this; } + + public Builder withInlineCompaction(Boolean inlineCompaction) { + props.setProperty(INLINE_COMPACT_PROP, String.valueOf(inlineCompaction)); + return this; + } + + public Builder inlineCompactionEvery(int deltaCommits) { + props.setProperty(INLINE_COMPACT_PROP, String.valueOf(deltaCommits)); + return this; + } + + public Builder withCleanerPolicy(HoodieCleaningPolicy policy) { + props.setProperty(CLEANER_POLICY_PROP, policy.name()); + return this; + } + + public Builder retainFileVersions(int fileVersionsRetained) { + props.setProperty(CLEANER_FILE_VERSIONS_RETAINED_PROP, + String.valueOf(fileVersionsRetained)); + return this; + } + + public Builder retainCommits(int commitsRetained) { + props.setProperty(CLEANER_COMMITS_RETAINED_PROP, String.valueOf(commitsRetained)); + return this; + } + + public Builder archiveCommitsWith(int minToKeep, int maxToKeep) { + props.setProperty(MIN_COMMITS_TO_KEEP, String.valueOf(minToKeep)); + props.setProperty(MAX_COMMITS_TO_KEEP, String.valueOf(maxToKeep)); + return this; + } + + public Builder compactionSmallFileSize(long smallFileLimitBytes) { + props.setProperty(PARQUET_SMALL_FILE_LIMIT_BYTES, 
String.valueOf(smallFileLimitBytes)); + return this; + } + + public Builder insertSplitSize(int insertSplitSize) { + props.setProperty(COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE, String.valueOf(insertSplitSize)); + return this; + } + + public Builder autoTuneInsertSplits(boolean autoTuneInsertSplits) { + props.setProperty(COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS, + String.valueOf(autoTuneInsertSplits)); + return this; + } + + public Builder approxRecordSize(int recordSizeEstimate) { + props.setProperty(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE, + String.valueOf(recordSizeEstimate)); + return this; + } + + public Builder withCleanerParallelism(int cleanerParallelism) { + props.setProperty(CLEANER_PARALLELISM, String.valueOf(cleanerParallelism)); + return this; + } + + public Builder withCompactionStrategy(CompactionStrategy compactionStrategy) { + props.setProperty(COMPACTION_STRATEGY_PROP, compactionStrategy.getClass().getName()); + return this; + } + + public Builder withPayloadClass(String payloadClassName) { + props.setProperty(PAYLOAD_CLASS, payloadClassName); + return this; + } + + public Builder withTargetIOPerCompactionInMB(long targetIOPerCompactionInMB) { + props.setProperty(TARGET_IO_PER_COMPACTION_IN_MB_PROP, + String.valueOf(targetIOPerCompactionInMB)); + return this; + } + + public HoodieCompactionConfig build() { + HoodieCompactionConfig config = new HoodieCompactionConfig(props); + setDefaultOnCondition(props, !props.containsKey(AUTO_CLEAN_PROP), + AUTO_CLEAN_PROP, DEFAULT_AUTO_CLEAN); + setDefaultOnCondition(props, !props.containsKey(INLINE_COMPACT_PROP), + INLINE_COMPACT_PROP, DEFAULT_INLINE_COMPACT); + setDefaultOnCondition(props, !props.containsKey(INLINE_COMPACT_NUM_DELTA_COMMITS_PROP), + INLINE_COMPACT_NUM_DELTA_COMMITS_PROP, DEFAULT_INLINE_COMPACT_NUM_DELTA_COMMITS); + setDefaultOnCondition(props, !props.containsKey(CLEANER_POLICY_PROP), + CLEANER_POLICY_PROP, DEFAULT_CLEANER_POLICY); + setDefaultOnCondition(props, 
!props.containsKey(CLEANER_FILE_VERSIONS_RETAINED_PROP), + CLEANER_FILE_VERSIONS_RETAINED_PROP, DEFAULT_CLEANER_FILE_VERSIONS_RETAINED); + setDefaultOnCondition(props, !props.containsKey(CLEANER_COMMITS_RETAINED_PROP), + CLEANER_COMMITS_RETAINED_PROP, DEFAULT_CLEANER_COMMITS_RETAINED); + setDefaultOnCondition(props, !props.containsKey(MAX_COMMITS_TO_KEEP), + MAX_COMMITS_TO_KEEP, DEFAULT_MAX_COMMITS_TO_KEEP); + setDefaultOnCondition(props, !props.containsKey(MIN_COMMITS_TO_KEEP), + MIN_COMMITS_TO_KEEP, DEFAULT_MIN_COMMITS_TO_KEEP); + setDefaultOnCondition(props, !props.containsKey(PARQUET_SMALL_FILE_LIMIT_BYTES), + PARQUET_SMALL_FILE_LIMIT_BYTES, DEFAULT_PARQUET_SMALL_FILE_LIMIT_BYTES); + setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE), + COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE, DEFAULT_COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE); + setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS), + COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS, DEFAULT_COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS); + setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE), + COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE, + DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE); + setDefaultOnCondition(props, !props.containsKey(CLEANER_PARALLELISM), + CLEANER_PARALLELISM, DEFAULT_CLEANER_PARALLELISM); + setDefaultOnCondition(props, !props.containsKey(COMPACTION_STRATEGY_PROP), + COMPACTION_STRATEGY_PROP, DEFAULT_COMPACTION_STRATEGY); + setDefaultOnCondition(props, !props.containsKey(PAYLOAD_CLASS), + PAYLOAD_CLASS, DEFAULT_PAYLOAD_CLASS); + setDefaultOnCondition(props, !props.containsKey(TARGET_IO_PER_COMPACTION_IN_MB_PROP), + TARGET_IO_PER_COMPACTION_IN_MB_PROP, DEFAULT_TARGET_IO_PER_COMPACTION_IN_MB); + + HoodieCleaningPolicy.valueOf(props.getProperty(CLEANER_POLICY_PROP)); + Preconditions.checkArgument( + Integer.parseInt(props.getProperty(MAX_COMMITS_TO_KEEP)) > Integer + 
.parseInt(props.getProperty(MIN_COMMITS_TO_KEEP))); + return config; + } + + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/config/HoodieIndexConfig.java b/hoodie-client/src/main/java/com/uber/hoodie/config/HoodieIndexConfig.java index 9a5fefbbe..a7a722de1 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/config/HoodieIndexConfig.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/config/HoodieIndexConfig.java @@ -16,14 +16,12 @@ package com.uber.hoodie.config; -import com.google.common.base.Preconditions; import com.uber.hoodie.index.HoodieIndex; - -import javax.annotation.concurrent.Immutable; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.util.Properties; +import javax.annotation.concurrent.Immutable; /** * Indexing related config @@ -31,123 +29,124 @@ import java.util.Properties; @Immutable public class HoodieIndexConfig extends DefaultHoodieConfig { - public static final String INDEX_TYPE_PROP = "hoodie.index.type"; - public static final String DEFAULT_INDEX_TYPE = HoodieIndex.IndexType.BLOOM.name(); + public static final String INDEX_TYPE_PROP = "hoodie.index.type"; + public static final String DEFAULT_INDEX_TYPE = HoodieIndex.IndexType.BLOOM.name(); - // ***** Bloom Index configs ***** - public static final String BLOOM_FILTER_NUM_ENTRIES = "hoodie.index.bloom.num_entries"; - public static final String DEFAULT_BLOOM_FILTER_NUM_ENTRIES = "60000"; - public static final String BLOOM_FILTER_FPP = "hoodie.index.bloom.fpp"; - public static final String DEFAULT_BLOOM_FILTER_FPP = "0.000000001"; - public static final String BLOOM_INDEX_PARALLELISM_PROP = "hoodie.bloom.index.parallelism"; - // Disable explicit bloom index parallelism setting by default - hoodie auto computes - public static final String DEFAULT_BLOOM_INDEX_PARALLELISM = "0"; - public static final String BLOOM_INDEX_PRUNE_BY_RANGES_PROP = "hoodie.bloom.index.prune.by.ranges"; - public static final String 
DEFAULT_BLOOM_INDEX_PRUNE_BY_RANGES = "true"; - public static final String BLOOM_INDEX_USE_CACHING_PROP = "hoodie.bloom.index.use.caching"; - public static final String DEFAULT_BLOOM_INDEX_USE_CACHING = "true"; + // ***** Bloom Index configs ***** + public static final String BLOOM_FILTER_NUM_ENTRIES = "hoodie.index.bloom.num_entries"; + public static final String DEFAULT_BLOOM_FILTER_NUM_ENTRIES = "60000"; + public static final String BLOOM_FILTER_FPP = "hoodie.index.bloom.fpp"; + public static final String DEFAULT_BLOOM_FILTER_FPP = "0.000000001"; + public static final String BLOOM_INDEX_PARALLELISM_PROP = "hoodie.bloom.index.parallelism"; + // Disable explicit bloom index parallelism setting by default - hoodie auto computes + public static final String DEFAULT_BLOOM_INDEX_PARALLELISM = "0"; + public static final String BLOOM_INDEX_PRUNE_BY_RANGES_PROP = "hoodie.bloom.index.prune.by.ranges"; + public static final String DEFAULT_BLOOM_INDEX_PRUNE_BY_RANGES = "true"; + public static final String BLOOM_INDEX_USE_CACHING_PROP = "hoodie.bloom.index.use.caching"; + public static final String DEFAULT_BLOOM_INDEX_USE_CACHING = "true"; - // ***** HBase Index Configs ***** - public final static String HBASE_ZKQUORUM_PROP = "hoodie.index.hbase.zkquorum"; - public final static String HBASE_ZKPORT_PROP = "hoodie.index.hbase.zkport"; - public final static String HBASE_TABLENAME_PROP = "hoodie.index.hbase.table"; + // ***** HBase Index Configs ***** + public final static String HBASE_ZKQUORUM_PROP = "hoodie.index.hbase.zkquorum"; + public final static String HBASE_ZKPORT_PROP = "hoodie.index.hbase.zkport"; + public final static String HBASE_TABLENAME_PROP = "hoodie.index.hbase.table"; - // ***** Bucketed Index Configs ***** - public final static String BUCKETED_INDEX_NUM_BUCKETS_PROP = "hoodie.index.bucketed.numbuckets"; + // ***** Bucketed Index Configs ***** + public final static String BUCKETED_INDEX_NUM_BUCKETS_PROP = "hoodie.index.bucketed.numbuckets"; - private 
HoodieIndexConfig(Properties props) { - super(props); + private HoodieIndexConfig(Properties props) { + super(props); + } + + public static HoodieIndexConfig.Builder newBuilder() { + return new Builder(); + } + + public static class Builder { + + private final Properties props = new Properties(); + + public Builder fromFile(File propertiesFile) throws IOException { + FileReader reader = new FileReader(propertiesFile); + try { + this.props.load(reader); + return this; + } finally { + reader.close(); + } } - public static HoodieIndexConfig.Builder newBuilder() { - return new Builder(); + public Builder fromProperties(Properties props) { + this.props.putAll(props); + return this; } - public static class Builder { - private final Properties props = new Properties(); - - public Builder fromFile(File propertiesFile) throws IOException { - FileReader reader = new FileReader(propertiesFile); - try { - this.props.load(reader); - return this; - } finally { - reader.close(); - } - } - - public Builder fromProperties(Properties props) { - this.props.putAll(props); - return this; - } - - public Builder withIndexType(HoodieIndex.IndexType indexType) { - props.setProperty(INDEX_TYPE_PROP, indexType.name()); - return this; - } - - public Builder bloomFilterNumEntries(int numEntries) { - props.setProperty(BLOOM_FILTER_NUM_ENTRIES, String.valueOf(numEntries)); - return this; - } - - public Builder bloomFilterFPP(double fpp) { - props.setProperty(BLOOM_FILTER_FPP, String.valueOf(fpp)); - return this; - } - - public Builder hbaseZkQuorum(String zkString) { - props.setProperty(HBASE_ZKQUORUM_PROP, zkString); - return this; - } - - public Builder hbaseZkPort(int port) { - props.setProperty(HBASE_ZKPORT_PROP, String.valueOf(port)); - return this; - } - - public Builder hbaseTableName(String tableName) { - props.setProperty(HBASE_TABLENAME_PROP, tableName); - return this; - } - - public Builder bloomIndexParallelism(int parallelism) { - props.setProperty(BLOOM_INDEX_PARALLELISM_PROP, 
String.valueOf(parallelism)); - return this; - } - - public Builder bloomIndexPruneByRanges(boolean pruneRanges) { - props.setProperty(BLOOM_INDEX_PRUNE_BY_RANGES_PROP, String.valueOf(pruneRanges)); - return this; - } - - public Builder bloomIndexUseCaching(boolean useCaching) { - props.setProperty(BLOOM_INDEX_USE_CACHING_PROP, String.valueOf(useCaching)); - return this; - } - - public Builder numBucketsPerPartition(int numBuckets) { - props.setProperty(BUCKETED_INDEX_NUM_BUCKETS_PROP, String.valueOf(numBuckets)); - return this; - } - - public HoodieIndexConfig build() { - HoodieIndexConfig config = new HoodieIndexConfig(props); - setDefaultOnCondition(props, !props.containsKey(INDEX_TYPE_PROP), - INDEX_TYPE_PROP, DEFAULT_INDEX_TYPE); - setDefaultOnCondition(props, !props.containsKey(BLOOM_FILTER_NUM_ENTRIES), - BLOOM_FILTER_NUM_ENTRIES, DEFAULT_BLOOM_FILTER_NUM_ENTRIES); - setDefaultOnCondition(props, !props.containsKey(BLOOM_FILTER_FPP), - BLOOM_FILTER_FPP, DEFAULT_BLOOM_FILTER_FPP); - setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_PARALLELISM_PROP), - BLOOM_INDEX_PARALLELISM_PROP, DEFAULT_BLOOM_INDEX_PARALLELISM); - setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_PRUNE_BY_RANGES_PROP), - BLOOM_INDEX_PRUNE_BY_RANGES_PROP, DEFAULT_BLOOM_INDEX_PRUNE_BY_RANGES); - setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_USE_CACHING_PROP), - BLOOM_INDEX_USE_CACHING_PROP, DEFAULT_BLOOM_INDEX_USE_CACHING); - // Throws IllegalArgumentException if the value set is not a known Hoodie Index Type - HoodieIndex.IndexType.valueOf(props.getProperty(INDEX_TYPE_PROP)); - return config; - } + public Builder withIndexType(HoodieIndex.IndexType indexType) { + props.setProperty(INDEX_TYPE_PROP, indexType.name()); + return this; } + + public Builder bloomFilterNumEntries(int numEntries) { + props.setProperty(BLOOM_FILTER_NUM_ENTRIES, String.valueOf(numEntries)); + return this; + } + + public Builder bloomFilterFPP(double fpp) { + 
props.setProperty(BLOOM_FILTER_FPP, String.valueOf(fpp)); + return this; + } + + public Builder hbaseZkQuorum(String zkString) { + props.setProperty(HBASE_ZKQUORUM_PROP, zkString); + return this; + } + + public Builder hbaseZkPort(int port) { + props.setProperty(HBASE_ZKPORT_PROP, String.valueOf(port)); + return this; + } + + public Builder hbaseTableName(String tableName) { + props.setProperty(HBASE_TABLENAME_PROP, tableName); + return this; + } + + public Builder bloomIndexParallelism(int parallelism) { + props.setProperty(BLOOM_INDEX_PARALLELISM_PROP, String.valueOf(parallelism)); + return this; + } + + public Builder bloomIndexPruneByRanges(boolean pruneRanges) { + props.setProperty(BLOOM_INDEX_PRUNE_BY_RANGES_PROP, String.valueOf(pruneRanges)); + return this; + } + + public Builder bloomIndexUseCaching(boolean useCaching) { + props.setProperty(BLOOM_INDEX_USE_CACHING_PROP, String.valueOf(useCaching)); + return this; + } + + public Builder numBucketsPerPartition(int numBuckets) { + props.setProperty(BUCKETED_INDEX_NUM_BUCKETS_PROP, String.valueOf(numBuckets)); + return this; + } + + public HoodieIndexConfig build() { + HoodieIndexConfig config = new HoodieIndexConfig(props); + setDefaultOnCondition(props, !props.containsKey(INDEX_TYPE_PROP), + INDEX_TYPE_PROP, DEFAULT_INDEX_TYPE); + setDefaultOnCondition(props, !props.containsKey(BLOOM_FILTER_NUM_ENTRIES), + BLOOM_FILTER_NUM_ENTRIES, DEFAULT_BLOOM_FILTER_NUM_ENTRIES); + setDefaultOnCondition(props, !props.containsKey(BLOOM_FILTER_FPP), + BLOOM_FILTER_FPP, DEFAULT_BLOOM_FILTER_FPP); + setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_PARALLELISM_PROP), + BLOOM_INDEX_PARALLELISM_PROP, DEFAULT_BLOOM_INDEX_PARALLELISM); + setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_PRUNE_BY_RANGES_PROP), + BLOOM_INDEX_PRUNE_BY_RANGES_PROP, DEFAULT_BLOOM_INDEX_PRUNE_BY_RANGES); + setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_USE_CACHING_PROP), + BLOOM_INDEX_USE_CACHING_PROP, 
DEFAULT_BLOOM_INDEX_USE_CACHING); + // Throws IllegalArgumentException if the value set is not a known Hoodie Index Type + HoodieIndex.IndexType.valueOf(props.getProperty(INDEX_TYPE_PROP)); + return config; + } + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/config/HoodieMetricsConfig.java b/hoodie-client/src/main/java/com/uber/hoodie/config/HoodieMetricsConfig.java index 0ef107de5..f3ef71077 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/config/HoodieMetricsConfig.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/config/HoodieMetricsConfig.java @@ -17,12 +17,11 @@ package com.uber.hoodie.config; import com.uber.hoodie.metrics.MetricsReporterType; - -import javax.annotation.concurrent.Immutable; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.util.Properties; +import javax.annotation.concurrent.Immutable; /** * Fetch the configurations used by the Metrics system. @@ -30,89 +29,90 @@ import java.util.Properties; @Immutable public class HoodieMetricsConfig extends DefaultHoodieConfig { - public final static String METRIC_PREFIX = "hoodie.metrics"; - public final static String METRICS_ON = METRIC_PREFIX + ".on"; - public final static boolean DEFAULT_METRICS_ON = false; - public final static String METRICS_REPORTER_TYPE = METRIC_PREFIX + ".reporter.type"; - public final static MetricsReporterType DEFAULT_METRICS_REPORTER_TYPE = - MetricsReporterType.GRAPHITE; + public final static String METRIC_PREFIX = "hoodie.metrics"; + public final static String METRICS_ON = METRIC_PREFIX + ".on"; + public final static boolean DEFAULT_METRICS_ON = false; + public final static String METRICS_REPORTER_TYPE = METRIC_PREFIX + ".reporter.type"; + public final static MetricsReporterType DEFAULT_METRICS_REPORTER_TYPE = + MetricsReporterType.GRAPHITE; - // Graphite - public final static String GRAPHITE_PREFIX = METRIC_PREFIX + ".graphite"; - public final static String GRAPHITE_SERVER_HOST = GRAPHITE_PREFIX + ".host"; - 
public final static String DEFAULT_GRAPHITE_SERVER_HOST = "localhost"; + // Graphite + public final static String GRAPHITE_PREFIX = METRIC_PREFIX + ".graphite"; + public final static String GRAPHITE_SERVER_HOST = GRAPHITE_PREFIX + ".host"; + public final static String DEFAULT_GRAPHITE_SERVER_HOST = "localhost"; - public final static String GRAPHITE_SERVER_PORT = GRAPHITE_PREFIX + ".port"; - public final static int DEFAULT_GRAPHITE_SERVER_PORT = 4756; + public final static String GRAPHITE_SERVER_PORT = GRAPHITE_PREFIX + ".port"; + public final static int DEFAULT_GRAPHITE_SERVER_PORT = 4756; - public final static String GRAPHITE_METRIC_PREFIX = GRAPHITE_PREFIX + ".metric.prefix"; + public final static String GRAPHITE_METRIC_PREFIX = GRAPHITE_PREFIX + ".metric.prefix"; - private HoodieMetricsConfig(Properties props) { - super(props); + private HoodieMetricsConfig(Properties props) { + super(props); + } + + public static HoodieMetricsConfig.Builder newBuilder() { + return new Builder(); + } + + public static class Builder { + + private final Properties props = new Properties(); + + public Builder fromFile(File propertiesFile) throws IOException { + FileReader reader = new FileReader(propertiesFile); + try { + this.props.load(reader); + return this; + } finally { + reader.close(); + } } - public static HoodieMetricsConfig.Builder newBuilder() { - return new Builder(); + public Builder fromProperties(Properties props) { + this.props.putAll(props); + return this; } - public static class Builder { - private final Properties props = new Properties(); - public Builder fromFile(File propertiesFile) throws IOException { - FileReader reader = new FileReader(propertiesFile); - try { - this.props.load(reader); - return this; - } finally { - reader.close(); - } - } - - public Builder fromProperties(Properties props) { - this.props.putAll(props); - return this; - } - - - public Builder on(boolean metricsOn) { - props.setProperty(METRICS_ON, String.valueOf(metricsOn)); - return 
this; - } - - public Builder withReporterType(String reporterType) { - props.setProperty(METRICS_REPORTER_TYPE, reporterType); - return this; - } - - public Builder toGraphiteHost(String host) { - props.setProperty(GRAPHITE_SERVER_HOST, host); - return this; - } - - public Builder onGraphitePort(int port) { - props.setProperty(GRAPHITE_SERVER_PORT, String.valueOf(port)); - return this; - } - - public Builder usePrefix(String prefix) { - props.setProperty(GRAPHITE_METRIC_PREFIX, prefix); - return this; - } - - public HoodieMetricsConfig build() { - HoodieMetricsConfig config = new HoodieMetricsConfig(props); - setDefaultOnCondition(props, !props.containsKey(METRICS_ON), METRICS_ON, - String.valueOf(DEFAULT_METRICS_ON)); - setDefaultOnCondition(props, !props.containsKey(METRICS_REPORTER_TYPE), - METRICS_REPORTER_TYPE, DEFAULT_METRICS_REPORTER_TYPE.name()); - setDefaultOnCondition(props, !props.containsKey(GRAPHITE_SERVER_HOST), - GRAPHITE_SERVER_HOST, DEFAULT_GRAPHITE_SERVER_HOST); - setDefaultOnCondition(props, !props.containsKey(GRAPHITE_SERVER_PORT), - GRAPHITE_SERVER_PORT, String.valueOf(DEFAULT_GRAPHITE_SERVER_PORT)); - setDefaultOnCondition(props, !props.containsKey(GRAPHITE_SERVER_PORT), - GRAPHITE_SERVER_PORT, String.valueOf(DEFAULT_GRAPHITE_SERVER_PORT)); - return config; - } + public Builder on(boolean metricsOn) { + props.setProperty(METRICS_ON, String.valueOf(metricsOn)); + return this; } + public Builder withReporterType(String reporterType) { + props.setProperty(METRICS_REPORTER_TYPE, reporterType); + return this; + } + + public Builder toGraphiteHost(String host) { + props.setProperty(GRAPHITE_SERVER_HOST, host); + return this; + } + + public Builder onGraphitePort(int port) { + props.setProperty(GRAPHITE_SERVER_PORT, String.valueOf(port)); + return this; + } + + public Builder usePrefix(String prefix) { + props.setProperty(GRAPHITE_METRIC_PREFIX, prefix); + return this; + } + + public HoodieMetricsConfig build() { + HoodieMetricsConfig config = new 
HoodieMetricsConfig(props); + setDefaultOnCondition(props, !props.containsKey(METRICS_ON), METRICS_ON, + String.valueOf(DEFAULT_METRICS_ON)); + setDefaultOnCondition(props, !props.containsKey(METRICS_REPORTER_TYPE), + METRICS_REPORTER_TYPE, DEFAULT_METRICS_REPORTER_TYPE.name()); + setDefaultOnCondition(props, !props.containsKey(GRAPHITE_SERVER_HOST), + GRAPHITE_SERVER_HOST, DEFAULT_GRAPHITE_SERVER_HOST); + setDefaultOnCondition(props, !props.containsKey(GRAPHITE_SERVER_PORT), + GRAPHITE_SERVER_PORT, String.valueOf(DEFAULT_GRAPHITE_SERVER_PORT)); + setDefaultOnCondition(props, !props.containsKey(GRAPHITE_SERVER_PORT), + GRAPHITE_SERVER_PORT, String.valueOf(DEFAULT_GRAPHITE_SERVER_PORT)); + return config; + } + } + } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/config/HoodieStorageConfig.java b/hoodie-client/src/main/java/com/uber/hoodie/config/HoodieStorageConfig.java index b9ce48aca..cc4e6d465 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/config/HoodieStorageConfig.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/config/HoodieStorageConfig.java @@ -16,75 +16,77 @@ package com.uber.hoodie.config; -import javax.annotation.concurrent.Immutable; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.util.Properties; +import javax.annotation.concurrent.Immutable; /** * Storage related config */ @Immutable public class HoodieStorageConfig extends DefaultHoodieConfig { - public static final String PARQUET_FILE_MAX_BYTES = "hoodie.parquet.max.file.size"; - public static final String DEFAULT_PARQUET_FILE_MAX_BYTES = String.valueOf(120 * 1024 * 1024); - public static final String PARQUET_BLOCK_SIZE_BYTES = "hoodie.parquet.block.size"; - public static final String DEFAULT_PARQUET_BLOCK_SIZE_BYTES = DEFAULT_PARQUET_FILE_MAX_BYTES; - public static final String PARQUET_PAGE_SIZE_BYTES = "hoodie.parquet.page.size"; - public static final String DEFAULT_PARQUET_PAGE_SIZE_BYTES = String.valueOf(1 * 1024 * 1024); - private 
HoodieStorageConfig(Properties props) { - super(props); + public static final String PARQUET_FILE_MAX_BYTES = "hoodie.parquet.max.file.size"; + public static final String DEFAULT_PARQUET_FILE_MAX_BYTES = String.valueOf(120 * 1024 * 1024); + public static final String PARQUET_BLOCK_SIZE_BYTES = "hoodie.parquet.block.size"; + public static final String DEFAULT_PARQUET_BLOCK_SIZE_BYTES = DEFAULT_PARQUET_FILE_MAX_BYTES; + public static final String PARQUET_PAGE_SIZE_BYTES = "hoodie.parquet.page.size"; + public static final String DEFAULT_PARQUET_PAGE_SIZE_BYTES = String.valueOf(1 * 1024 * 1024); + + private HoodieStorageConfig(Properties props) { + super(props); + } + + public static HoodieStorageConfig.Builder newBuilder() { + return new Builder(); + } + + public static class Builder { + + private final Properties props = new Properties(); + + public Builder fromFile(File propertiesFile) throws IOException { + FileReader reader = new FileReader(propertiesFile); + try { + this.props.load(reader); + return this; + } finally { + reader.close(); + } } - public static HoodieStorageConfig.Builder newBuilder() { - return new Builder(); + public Builder fromProperties(Properties props) { + this.props.putAll(props); + return this; } - public static class Builder { - private final Properties props = new Properties(); - - public Builder fromFile(File propertiesFile) throws IOException { - FileReader reader = new FileReader(propertiesFile); - try { - this.props.load(reader); - return this; - } finally { - reader.close(); - } - } - - public Builder fromProperties(Properties props) { - this.props.putAll(props); - return this; - } - - public Builder limitFileSize(int maxFileSize) { - props.setProperty(PARQUET_FILE_MAX_BYTES, String.valueOf(maxFileSize)); - return this; - } - - public Builder parquetBlockSize(int blockSize) { - props.setProperty(PARQUET_BLOCK_SIZE_BYTES, String.valueOf(blockSize)); - return this; - } - - public Builder parquetPageSize(int pageSize) { - 
props.setProperty(PARQUET_PAGE_SIZE_BYTES, String.valueOf(pageSize)); - return this; - } - - public HoodieStorageConfig build() { - HoodieStorageConfig config = new HoodieStorageConfig(props); - setDefaultOnCondition(props, !props.containsKey(PARQUET_FILE_MAX_BYTES), - PARQUET_FILE_MAX_BYTES, DEFAULT_PARQUET_FILE_MAX_BYTES); - setDefaultOnCondition(props, !props.containsKey(PARQUET_BLOCK_SIZE_BYTES), - PARQUET_BLOCK_SIZE_BYTES, DEFAULT_PARQUET_BLOCK_SIZE_BYTES); - setDefaultOnCondition(props, !props.containsKey(PARQUET_PAGE_SIZE_BYTES), - PARQUET_PAGE_SIZE_BYTES, DEFAULT_PARQUET_PAGE_SIZE_BYTES); - return config; - } + public Builder limitFileSize(int maxFileSize) { + props.setProperty(PARQUET_FILE_MAX_BYTES, String.valueOf(maxFileSize)); + return this; } + public Builder parquetBlockSize(int blockSize) { + props.setProperty(PARQUET_BLOCK_SIZE_BYTES, String.valueOf(blockSize)); + return this; + } + + public Builder parquetPageSize(int pageSize) { + props.setProperty(PARQUET_PAGE_SIZE_BYTES, String.valueOf(pageSize)); + return this; + } + + public HoodieStorageConfig build() { + HoodieStorageConfig config = new HoodieStorageConfig(props); + setDefaultOnCondition(props, !props.containsKey(PARQUET_FILE_MAX_BYTES), + PARQUET_FILE_MAX_BYTES, DEFAULT_PARQUET_FILE_MAX_BYTES); + setDefaultOnCondition(props, !props.containsKey(PARQUET_BLOCK_SIZE_BYTES), + PARQUET_BLOCK_SIZE_BYTES, DEFAULT_PARQUET_BLOCK_SIZE_BYTES); + setDefaultOnCondition(props, !props.containsKey(PARQUET_PAGE_SIZE_BYTES), + PARQUET_PAGE_SIZE_BYTES, DEFAULT_PARQUET_PAGE_SIZE_BYTES); + return config; + } + } + } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/config/HoodieWriteConfig.java b/hoodie-client/src/main/java/com/uber/hoodie/config/HoodieWriteConfig.java index b954a7bbf..b2efc8254 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/config/HoodieWriteConfig.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/config/HoodieWriteConfig.java @@ -24,395 +24,401 @@ import 
com.uber.hoodie.common.util.ReflectionUtils; import com.uber.hoodie.index.HoodieIndex; import com.uber.hoodie.io.compact.strategy.CompactionStrategy; import com.uber.hoodie.metrics.MetricsReporterType; -import org.apache.spark.storage.StorageLevel; - -import javax.annotation.concurrent.Immutable; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.io.InputStream; import java.util.Map; import java.util.Properties; +import javax.annotation.concurrent.Immutable; +import org.apache.spark.storage.StorageLevel; /** * Class storing configs for the {@link com.uber.hoodie.HoodieWriteClient} */ @Immutable public class HoodieWriteConfig extends DefaultHoodieConfig { - private static final String BASE_PATH_PROP = "hoodie.base.path"; - private static final String AVRO_SCHEMA = "hoodie.avro.schema"; - public static final String TABLE_NAME = "hoodie.table.name"; - private static final String DEFAULT_PARALLELISM = "200"; - private static final String INSERT_PARALLELISM = "hoodie.insert.shuffle.parallelism"; - private static final String BULKINSERT_PARALLELISM = "hoodie.bulkinsert.shuffle.parallelism"; - private static final String UPSERT_PARALLELISM = "hoodie.upsert.shuffle.parallelism"; - private static final String COMBINE_BEFORE_INSERT_PROP = "hoodie.combine.before.insert"; - private static final String DEFAULT_COMBINE_BEFORE_INSERT = "false"; - private static final String COMBINE_BEFORE_UPSERT_PROP = "hoodie.combine.before.upsert"; - private static final String DEFAULT_COMBINE_BEFORE_UPSERT = "true"; - private static final String WRITE_STATUS_STORAGE_LEVEL = "hoodie.write.status.storage.level"; - private static final String DEFAULT_WRITE_STATUS_STORAGE_LEVEL = "MEMORY_AND_DISK_SER"; - private static final String HOODIE_AUTO_COMMIT_PROP = "hoodie.auto.commit"; - private static final String DEFAULT_HOODIE_AUTO_COMMIT = "true"; - private static final String HOODIE_ASSUME_DATE_PARTITIONING_PROP = "hoodie.assume.date.partitioning"; - private 
static final String DEFAULT_ASSUME_DATE_PARTITIONING = "false"; - private static final String HOODIE_WRITE_STATUS_CLASS_PROP = "hoodie.writestatus.class"; - private static final String DEFAULT_HOODIE_WRITE_STATUS_CLASS = WriteStatus.class.getName(); - private HoodieWriteConfig(Properties props) { - super(props); - } + private static final String BASE_PATH_PROP = "hoodie.base.path"; + private static final String AVRO_SCHEMA = "hoodie.avro.schema"; + public static final String TABLE_NAME = "hoodie.table.name"; + private static final String DEFAULT_PARALLELISM = "200"; + private static final String INSERT_PARALLELISM = "hoodie.insert.shuffle.parallelism"; + private static final String BULKINSERT_PARALLELISM = "hoodie.bulkinsert.shuffle.parallelism"; + private static final String UPSERT_PARALLELISM = "hoodie.upsert.shuffle.parallelism"; + private static final String COMBINE_BEFORE_INSERT_PROP = "hoodie.combine.before.insert"; + private static final String DEFAULT_COMBINE_BEFORE_INSERT = "false"; + private static final String COMBINE_BEFORE_UPSERT_PROP = "hoodie.combine.before.upsert"; + private static final String DEFAULT_COMBINE_BEFORE_UPSERT = "true"; + private static final String WRITE_STATUS_STORAGE_LEVEL = "hoodie.write.status.storage.level"; + private static final String DEFAULT_WRITE_STATUS_STORAGE_LEVEL = "MEMORY_AND_DISK_SER"; + private static final String HOODIE_AUTO_COMMIT_PROP = "hoodie.auto.commit"; + private static final String DEFAULT_HOODIE_AUTO_COMMIT = "true"; + private static final String HOODIE_ASSUME_DATE_PARTITIONING_PROP = "hoodie.assume.date.partitioning"; + private static final String DEFAULT_ASSUME_DATE_PARTITIONING = "false"; + private static final String HOODIE_WRITE_STATUS_CLASS_PROP = "hoodie.writestatus.class"; + private static final String DEFAULT_HOODIE_WRITE_STATUS_CLASS = WriteStatus.class.getName(); - /** - * base properties - **/ - public String getBasePath() { - return props.getProperty(BASE_PATH_PROP); - } + private 
HoodieWriteConfig(Properties props) { + super(props); + } - public String getSchema() { - return props.getProperty(AVRO_SCHEMA); - } + /** + * base properties + **/ + public String getBasePath() { + return props.getProperty(BASE_PATH_PROP); + } - public String getTableName() { - return props.getProperty(TABLE_NAME); - } + public String getSchema() { + return props.getProperty(AVRO_SCHEMA); + } - public Boolean shouldAutoCommit() { - return Boolean.parseBoolean(props.getProperty(HOODIE_AUTO_COMMIT_PROP)); - } + public String getTableName() { + return props.getProperty(TABLE_NAME); + } - public Boolean shouldAssumeDatePartitioning() { - return Boolean.parseBoolean(props.getProperty(HOODIE_ASSUME_DATE_PARTITIONING_PROP)); - } + public Boolean shouldAutoCommit() { + return Boolean.parseBoolean(props.getProperty(HOODIE_AUTO_COMMIT_PROP)); + } - public int getBulkInsertShuffleParallelism() { - return Integer.parseInt(props.getProperty(BULKINSERT_PARALLELISM)); - } + public Boolean shouldAssumeDatePartitioning() { + return Boolean.parseBoolean(props.getProperty(HOODIE_ASSUME_DATE_PARTITIONING_PROP)); + } - public int getInsertShuffleParallelism() { - return Integer.parseInt(props.getProperty(INSERT_PARALLELISM)); - } + public int getBulkInsertShuffleParallelism() { + return Integer.parseInt(props.getProperty(BULKINSERT_PARALLELISM)); + } - public int getUpsertShuffleParallelism() { - return Integer.parseInt(props.getProperty(UPSERT_PARALLELISM)); - } + public int getInsertShuffleParallelism() { + return Integer.parseInt(props.getProperty(INSERT_PARALLELISM)); + } - public boolean shouldCombineBeforeInsert() { - return Boolean.parseBoolean(props.getProperty(COMBINE_BEFORE_INSERT_PROP)); - } + public int getUpsertShuffleParallelism() { + return Integer.parseInt(props.getProperty(UPSERT_PARALLELISM)); + } - public boolean shouldCombineBeforeUpsert() { - return Boolean.parseBoolean(props.getProperty(COMBINE_BEFORE_UPSERT_PROP)); - } + public boolean 
shouldCombineBeforeInsert() { + return Boolean.parseBoolean(props.getProperty(COMBINE_BEFORE_INSERT_PROP)); + } - public StorageLevel getWriteStatusStorageLevel() { - return StorageLevel.fromString(props.getProperty(WRITE_STATUS_STORAGE_LEVEL)); - } + public boolean shouldCombineBeforeUpsert() { + return Boolean.parseBoolean(props.getProperty(COMBINE_BEFORE_UPSERT_PROP)); + } - public String getWriteStatusClassName() { - return props.getProperty(HOODIE_WRITE_STATUS_CLASS_PROP); - } + public StorageLevel getWriteStatusStorageLevel() { + return StorageLevel.fromString(props.getProperty(WRITE_STATUS_STORAGE_LEVEL)); + } - /** - * compaction properties - **/ - public HoodieCleaningPolicy getCleanerPolicy() { - return HoodieCleaningPolicy - .valueOf(props.getProperty(HoodieCompactionConfig.CLEANER_POLICY_PROP)); - } + public String getWriteStatusClassName() { + return props.getProperty(HOODIE_WRITE_STATUS_CLASS_PROP); + } - public int getCleanerFileVersionsRetained() { - return Integer.parseInt( - props.getProperty(HoodieCompactionConfig.CLEANER_FILE_VERSIONS_RETAINED_PROP)); - } + /** + * compaction properties + **/ + public HoodieCleaningPolicy getCleanerPolicy() { + return HoodieCleaningPolicy + .valueOf(props.getProperty(HoodieCompactionConfig.CLEANER_POLICY_PROP)); + } - public int getCleanerCommitsRetained() { - return Integer - .parseInt(props.getProperty(HoodieCompactionConfig.CLEANER_COMMITS_RETAINED_PROP)); - } + public int getCleanerFileVersionsRetained() { + return Integer.parseInt( + props.getProperty(HoodieCompactionConfig.CLEANER_FILE_VERSIONS_RETAINED_PROP)); + } - public int getMaxCommitsToKeep() { - return Integer.parseInt(props.getProperty(HoodieCompactionConfig.MAX_COMMITS_TO_KEEP)); - } + public int getCleanerCommitsRetained() { + return Integer + .parseInt(props.getProperty(HoodieCompactionConfig.CLEANER_COMMITS_RETAINED_PROP)); + } - public int getMinCommitsToKeep() { - return 
Integer.parseInt(props.getProperty(HoodieCompactionConfig.MIN_COMMITS_TO_KEEP)); - } + public int getMaxCommitsToKeep() { + return Integer.parseInt(props.getProperty(HoodieCompactionConfig.MAX_COMMITS_TO_KEEP)); + } - public int getParquetSmallFileLimit() { - return Integer.parseInt(props.getProperty(HoodieCompactionConfig.PARQUET_SMALL_FILE_LIMIT_BYTES)); - } + public int getMinCommitsToKeep() { + return Integer.parseInt(props.getProperty(HoodieCompactionConfig.MIN_COMMITS_TO_KEEP)); + } - public int getCopyOnWriteInsertSplitSize() { - return Integer.parseInt( - props.getProperty(HoodieCompactionConfig.COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE)); - } + public int getParquetSmallFileLimit() { + return Integer + .parseInt(props.getProperty(HoodieCompactionConfig.PARQUET_SMALL_FILE_LIMIT_BYTES)); + } - public int getCopyOnWriteRecordSizeEstimate() { - return Integer.parseInt( - props.getProperty(HoodieCompactionConfig.COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE)); - } + public int getCopyOnWriteInsertSplitSize() { + return Integer.parseInt( + props.getProperty(HoodieCompactionConfig.COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE)); + } - public boolean shouldAutoTuneInsertSplits() { - return Boolean.parseBoolean( - props.getProperty(HoodieCompactionConfig.COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS)); - } + public int getCopyOnWriteRecordSizeEstimate() { + return Integer.parseInt( + props.getProperty(HoodieCompactionConfig.COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE)); + } - public int getCleanerParallelism() { - return Integer.parseInt(props.getProperty(HoodieCompactionConfig.CLEANER_PARALLELISM)); - } + public boolean shouldAutoTuneInsertSplits() { + return Boolean.parseBoolean( + props.getProperty(HoodieCompactionConfig.COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS)); + } - public boolean isAutoClean() { - return Boolean.parseBoolean(props.getProperty(HoodieCompactionConfig.AUTO_CLEAN_PROP)); - } + public int getCleanerParallelism() { + return 
Integer.parseInt(props.getProperty(HoodieCompactionConfig.CLEANER_PARALLELISM)); + } - public boolean isInlineCompaction() { - return Boolean.parseBoolean(props.getProperty(HoodieCompactionConfig.INLINE_COMPACT_PROP)); - } + public boolean isAutoClean() { + return Boolean.parseBoolean(props.getProperty(HoodieCompactionConfig.AUTO_CLEAN_PROP)); + } - public int getInlineCompactDeltaCommitMax() { - return Integer.parseInt( - props.getProperty(HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS_PROP)); - } + public boolean isInlineCompaction() { + return Boolean.parseBoolean(props.getProperty(HoodieCompactionConfig.INLINE_COMPACT_PROP)); + } - public CompactionStrategy getCompactionStrategy() { - return ReflectionUtils.loadClass(props.getProperty(HoodieCompactionConfig.COMPACTION_STRATEGY_PROP)); - } + public int getInlineCompactDeltaCommitMax() { + return Integer.parseInt( + props.getProperty(HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS_PROP)); + } - public Long getTargetIOPerCompactionInMB() { - return Long.parseLong(props.getProperty(HoodieCompactionConfig.TARGET_IO_PER_COMPACTION_IN_MB_PROP)); - } + public CompactionStrategy getCompactionStrategy() { + return ReflectionUtils + .loadClass(props.getProperty(HoodieCompactionConfig.COMPACTION_STRATEGY_PROP)); + } - /** - * index properties - **/ - public HoodieIndex.IndexType getIndexType() { - return HoodieIndex.IndexType.valueOf(props.getProperty(HoodieIndexConfig.INDEX_TYPE_PROP)); - } + public Long getTargetIOPerCompactionInMB() { + return Long + .parseLong(props.getProperty(HoodieCompactionConfig.TARGET_IO_PER_COMPACTION_IN_MB_PROP)); + } - public int getBloomFilterNumEntries() { - return Integer.parseInt(props.getProperty(HoodieIndexConfig.BLOOM_FILTER_NUM_ENTRIES)); - } + /** + * index properties + **/ + public HoodieIndex.IndexType getIndexType() { + return HoodieIndex.IndexType.valueOf(props.getProperty(HoodieIndexConfig.INDEX_TYPE_PROP)); + } - public double getBloomFilterFPP() { - return 
Double.parseDouble(props.getProperty(HoodieIndexConfig.BLOOM_FILTER_FPP)); - } + public int getBloomFilterNumEntries() { + return Integer.parseInt(props.getProperty(HoodieIndexConfig.BLOOM_FILTER_NUM_ENTRIES)); + } - public String getHbaseZkQuorum() { - return props.getProperty(HoodieIndexConfig.HBASE_ZKQUORUM_PROP); - } + public double getBloomFilterFPP() { + return Double.parseDouble(props.getProperty(HoodieIndexConfig.BLOOM_FILTER_FPP)); + } - public int getHbaseZkPort() { - return Integer.parseInt(props.getProperty(HoodieIndexConfig.HBASE_ZKPORT_PROP)); - } + public String getHbaseZkQuorum() { + return props.getProperty(HoodieIndexConfig.HBASE_ZKQUORUM_PROP); + } - public String getHbaseTableName() { - return props.getProperty(HoodieIndexConfig.HBASE_TABLENAME_PROP); - } + public int getHbaseZkPort() { + return Integer.parseInt(props.getProperty(HoodieIndexConfig.HBASE_ZKPORT_PROP)); + } - public int getBloomIndexParallelism() { - return Integer.parseInt(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_PARALLELISM_PROP)); - } + public String getHbaseTableName() { + return props.getProperty(HoodieIndexConfig.HBASE_TABLENAME_PROP); + } - public boolean getBloomIndexPruneByRanges() { - return Boolean.parseBoolean(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_PRUNE_BY_RANGES_PROP)); - } + public int getBloomIndexParallelism() { + return Integer.parseInt(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_PARALLELISM_PROP)); + } - public boolean getBloomIndexUseCaching() { - return Boolean.parseBoolean(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_USE_CACHING_PROP)); - } + public boolean getBloomIndexPruneByRanges() { + return Boolean + .parseBoolean(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_PRUNE_BY_RANGES_PROP)); + } - public int getNumBucketsPerPartition() { - return Integer.parseInt(props.getProperty(HoodieIndexConfig.BUCKETED_INDEX_NUM_BUCKETS_PROP)); - } + public boolean getBloomIndexUseCaching() { + return 
Boolean.parseBoolean(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_USE_CACHING_PROP)); + } - /** - * storage properties - **/ - public int getParquetMaxFileSize() { - return Integer.parseInt(props.getProperty(HoodieStorageConfig.PARQUET_FILE_MAX_BYTES)); - } + public int getNumBucketsPerPartition() { + return Integer.parseInt(props.getProperty(HoodieIndexConfig.BUCKETED_INDEX_NUM_BUCKETS_PROP)); + } - public int getParquetBlockSize() { - return Integer.parseInt(props.getProperty(HoodieStorageConfig.PARQUET_BLOCK_SIZE_BYTES)); - } + /** + * storage properties + **/ + public int getParquetMaxFileSize() { + return Integer.parseInt(props.getProperty(HoodieStorageConfig.PARQUET_FILE_MAX_BYTES)); + } - public int getParquetPageSize() { - return Integer.parseInt(props.getProperty(HoodieStorageConfig.PARQUET_PAGE_SIZE_BYTES)); - } + public int getParquetBlockSize() { + return Integer.parseInt(props.getProperty(HoodieStorageConfig.PARQUET_BLOCK_SIZE_BYTES)); + } - /** - * metrics properties - **/ - public boolean isMetricsOn() { - return Boolean.parseBoolean(props.getProperty(HoodieMetricsConfig.METRICS_ON)); - } + public int getParquetPageSize() { + return Integer.parseInt(props.getProperty(HoodieStorageConfig.PARQUET_PAGE_SIZE_BYTES)); + } - public MetricsReporterType getMetricsReporterType() { - return MetricsReporterType - .valueOf(props.getProperty(HoodieMetricsConfig.METRICS_REPORTER_TYPE)); - } + /** + * metrics properties + **/ + public boolean isMetricsOn() { + return Boolean.parseBoolean(props.getProperty(HoodieMetricsConfig.METRICS_ON)); + } - public String getGraphiteServerHost() { - return props.getProperty(HoodieMetricsConfig.GRAPHITE_SERVER_HOST); - } + public MetricsReporterType getMetricsReporterType() { + return MetricsReporterType + .valueOf(props.getProperty(HoodieMetricsConfig.METRICS_REPORTER_TYPE)); + } - public int getGraphiteServerPort() { - return Integer.parseInt(props.getProperty(HoodieMetricsConfig.GRAPHITE_SERVER_PORT)); - } + public String 
getGraphiteServerHost() { + return props.getProperty(HoodieMetricsConfig.GRAPHITE_SERVER_HOST); + } - public String getGraphiteMetricPrefix() { - return props.getProperty(HoodieMetricsConfig.GRAPHITE_METRIC_PREFIX); - } + public int getGraphiteServerPort() { + return Integer.parseInt(props.getProperty(HoodieMetricsConfig.GRAPHITE_SERVER_PORT)); + } - public static HoodieWriteConfig.Builder newBuilder() { - return new Builder(); - } + public String getGraphiteMetricPrefix() { + return props.getProperty(HoodieMetricsConfig.GRAPHITE_METRIC_PREFIX); + } + public static HoodieWriteConfig.Builder newBuilder() { + return new Builder(); + } public static class Builder { - private final Properties props = new Properties(); - private boolean isIndexConfigSet = false; - private boolean isStorageConfigSet = false; - private boolean isCompactionConfigSet = false; - private boolean isMetricsConfigSet = false; - private boolean isAutoCommit = true; - public Builder fromFile(File propertiesFile) throws IOException { - FileReader reader = new FileReader(propertiesFile); - try { - this.props.load(reader); - return this; - } finally { - reader.close(); - } - } + private final Properties props = new Properties(); + private boolean isIndexConfigSet = false; + private boolean isStorageConfigSet = false; + private boolean isCompactionConfigSet = false; + private boolean isMetricsConfigSet = false; + private boolean isAutoCommit = true; - public Builder fromInputStream(InputStream inputStream) throws IOException { - try { - this.props.load(inputStream); - return this; - } finally { - inputStream.close(); - } - } - - public Builder withProps(Map kvprops) { - props.putAll(kvprops); - return this; - } - - public Builder withPath(String basePath) { - props.setProperty(BASE_PATH_PROP, basePath); - return this; - } - - public Builder withSchema(String schemaStr) { - props.setProperty(AVRO_SCHEMA, schemaStr); - return this; - } - - public Builder forTable(String tableName) { - 
props.setProperty(TABLE_NAME, tableName); - return this; - } - - public Builder withBulkInsertParallelism(int bulkInsertParallelism) { - props.setProperty(BULKINSERT_PARALLELISM, String.valueOf(bulkInsertParallelism)); - return this; - } - - public Builder withParallelism(int insertShuffleParallelism, int upsertShuffleParallelism) { - props.setProperty(INSERT_PARALLELISM, String.valueOf(insertShuffleParallelism)); - props.setProperty(UPSERT_PARALLELISM, String.valueOf(upsertShuffleParallelism)); - return this; - } - - public Builder combineInput(boolean onInsert, boolean onUpsert) { - props.setProperty(COMBINE_BEFORE_INSERT_PROP, String.valueOf(onInsert)); - props.setProperty(COMBINE_BEFORE_UPSERT_PROP, String.valueOf(onUpsert)); - return this; - } - - public Builder withWriteStatusStorageLevel(String level) { - props.setProperty(WRITE_STATUS_STORAGE_LEVEL, level); - return this; - } - - public Builder withIndexConfig(HoodieIndexConfig indexConfig) { - props.putAll(indexConfig.getProps()); - isIndexConfigSet = true; - return this; - } - - public Builder withStorageConfig(HoodieStorageConfig storageConfig) { - props.putAll(storageConfig.getProps()); - isStorageConfigSet = true; - return this; - } - - public Builder withCompactionConfig(HoodieCompactionConfig compactionConfig) { - props.putAll(compactionConfig.getProps()); - isCompactionConfigSet = true; - return this; - } - - public Builder withMetricsConfig(HoodieMetricsConfig metricsConfig) { - props.putAll(metricsConfig.getProps()); - isMetricsConfigSet = true; - return this; - } - - public Builder withAutoCommit(boolean autoCommit) { - props.setProperty(HOODIE_AUTO_COMMIT_PROP, String.valueOf(autoCommit)); - return this; - } - - public Builder withAssumeDatePartitioning(boolean assumeDatePartitioning) { - props.setProperty(HOODIE_ASSUME_DATE_PARTITIONING_PROP, String.valueOf(assumeDatePartitioning)); - return this; - } - - public Builder withWriteStatusClass(Class writeStatusClass) { - 
props.setProperty(HOODIE_WRITE_STATUS_CLASS_PROP, writeStatusClass.getName()); - return this; - } - - public HoodieWriteConfig build() { - HoodieWriteConfig config = new HoodieWriteConfig(props); - // Check for mandatory properties - Preconditions.checkArgument(config.getBasePath() != null); - setDefaultOnCondition(props, !props.containsKey(INSERT_PARALLELISM), INSERT_PARALLELISM, - DEFAULT_PARALLELISM); - setDefaultOnCondition(props, !props.containsKey(BULKINSERT_PARALLELISM), BULKINSERT_PARALLELISM, - DEFAULT_PARALLELISM); - setDefaultOnCondition(props, !props.containsKey(UPSERT_PARALLELISM), UPSERT_PARALLELISM, - DEFAULT_PARALLELISM); - setDefaultOnCondition(props, !props.containsKey(COMBINE_BEFORE_INSERT_PROP), - COMBINE_BEFORE_INSERT_PROP, DEFAULT_COMBINE_BEFORE_INSERT); - setDefaultOnCondition(props, !props.containsKey(COMBINE_BEFORE_UPSERT_PROP), - COMBINE_BEFORE_UPSERT_PROP, DEFAULT_COMBINE_BEFORE_UPSERT); - setDefaultOnCondition(props, !props.containsKey(WRITE_STATUS_STORAGE_LEVEL), - WRITE_STATUS_STORAGE_LEVEL, DEFAULT_WRITE_STATUS_STORAGE_LEVEL); - setDefaultOnCondition(props, !props.containsKey(HOODIE_AUTO_COMMIT_PROP), - HOODIE_AUTO_COMMIT_PROP, DEFAULT_HOODIE_AUTO_COMMIT); - setDefaultOnCondition(props, !props.containsKey(HOODIE_ASSUME_DATE_PARTITIONING_PROP), - HOODIE_ASSUME_DATE_PARTITIONING_PROP, DEFAULT_ASSUME_DATE_PARTITIONING); - setDefaultOnCondition(props, !props.containsKey(HOODIE_WRITE_STATUS_CLASS_PROP), - HOODIE_WRITE_STATUS_CLASS_PROP, DEFAULT_HOODIE_WRITE_STATUS_CLASS); - - // Make sure the props is propagated - setDefaultOnCondition(props, !isIndexConfigSet, - HoodieIndexConfig.newBuilder().fromProperties(props).build()); - setDefaultOnCondition(props, !isStorageConfigSet, - HoodieStorageConfig.newBuilder().fromProperties(props).build()); - setDefaultOnCondition(props, !isCompactionConfigSet, - HoodieCompactionConfig.newBuilder().fromProperties(props).build()); - setDefaultOnCondition(props, !isMetricsConfigSet, - 
HoodieMetricsConfig.newBuilder().fromProperties(props).build()); - return config; - } + public Builder fromFile(File propertiesFile) throws IOException { + FileReader reader = new FileReader(propertiesFile); + try { + this.props.load(reader); + return this; + } finally { + reader.close(); + } } + + public Builder fromInputStream(InputStream inputStream) throws IOException { + try { + this.props.load(inputStream); + return this; + } finally { + inputStream.close(); + } + } + + public Builder withProps(Map kvprops) { + props.putAll(kvprops); + return this; + } + + public Builder withPath(String basePath) { + props.setProperty(BASE_PATH_PROP, basePath); + return this; + } + + public Builder withSchema(String schemaStr) { + props.setProperty(AVRO_SCHEMA, schemaStr); + return this; + } + + public Builder forTable(String tableName) { + props.setProperty(TABLE_NAME, tableName); + return this; + } + + public Builder withBulkInsertParallelism(int bulkInsertParallelism) { + props.setProperty(BULKINSERT_PARALLELISM, String.valueOf(bulkInsertParallelism)); + return this; + } + + public Builder withParallelism(int insertShuffleParallelism, int upsertShuffleParallelism) { + props.setProperty(INSERT_PARALLELISM, String.valueOf(insertShuffleParallelism)); + props.setProperty(UPSERT_PARALLELISM, String.valueOf(upsertShuffleParallelism)); + return this; + } + + public Builder combineInput(boolean onInsert, boolean onUpsert) { + props.setProperty(COMBINE_BEFORE_INSERT_PROP, String.valueOf(onInsert)); + props.setProperty(COMBINE_BEFORE_UPSERT_PROP, String.valueOf(onUpsert)); + return this; + } + + public Builder withWriteStatusStorageLevel(String level) { + props.setProperty(WRITE_STATUS_STORAGE_LEVEL, level); + return this; + } + + public Builder withIndexConfig(HoodieIndexConfig indexConfig) { + props.putAll(indexConfig.getProps()); + isIndexConfigSet = true; + return this; + } + + public Builder withStorageConfig(HoodieStorageConfig storageConfig) { + 
props.putAll(storageConfig.getProps()); + isStorageConfigSet = true; + return this; + } + + public Builder withCompactionConfig(HoodieCompactionConfig compactionConfig) { + props.putAll(compactionConfig.getProps()); + isCompactionConfigSet = true; + return this; + } + + public Builder withMetricsConfig(HoodieMetricsConfig metricsConfig) { + props.putAll(metricsConfig.getProps()); + isMetricsConfigSet = true; + return this; + } + + public Builder withAutoCommit(boolean autoCommit) { + props.setProperty(HOODIE_AUTO_COMMIT_PROP, String.valueOf(autoCommit)); + return this; + } + + public Builder withAssumeDatePartitioning(boolean assumeDatePartitioning) { + props.setProperty(HOODIE_ASSUME_DATE_PARTITIONING_PROP, + String.valueOf(assumeDatePartitioning)); + return this; + } + + public Builder withWriteStatusClass(Class writeStatusClass) { + props.setProperty(HOODIE_WRITE_STATUS_CLASS_PROP, writeStatusClass.getName()); + return this; + } + + public HoodieWriteConfig build() { + HoodieWriteConfig config = new HoodieWriteConfig(props); + // Check for mandatory properties + Preconditions.checkArgument(config.getBasePath() != null); + setDefaultOnCondition(props, !props.containsKey(INSERT_PARALLELISM), INSERT_PARALLELISM, + DEFAULT_PARALLELISM); + setDefaultOnCondition(props, !props.containsKey(BULKINSERT_PARALLELISM), + BULKINSERT_PARALLELISM, + DEFAULT_PARALLELISM); + setDefaultOnCondition(props, !props.containsKey(UPSERT_PARALLELISM), UPSERT_PARALLELISM, + DEFAULT_PARALLELISM); + setDefaultOnCondition(props, !props.containsKey(COMBINE_BEFORE_INSERT_PROP), + COMBINE_BEFORE_INSERT_PROP, DEFAULT_COMBINE_BEFORE_INSERT); + setDefaultOnCondition(props, !props.containsKey(COMBINE_BEFORE_UPSERT_PROP), + COMBINE_BEFORE_UPSERT_PROP, DEFAULT_COMBINE_BEFORE_UPSERT); + setDefaultOnCondition(props, !props.containsKey(WRITE_STATUS_STORAGE_LEVEL), + WRITE_STATUS_STORAGE_LEVEL, DEFAULT_WRITE_STATUS_STORAGE_LEVEL); + setDefaultOnCondition(props, !props.containsKey(HOODIE_AUTO_COMMIT_PROP), 
+ HOODIE_AUTO_COMMIT_PROP, DEFAULT_HOODIE_AUTO_COMMIT); + setDefaultOnCondition(props, !props.containsKey(HOODIE_ASSUME_DATE_PARTITIONING_PROP), + HOODIE_ASSUME_DATE_PARTITIONING_PROP, DEFAULT_ASSUME_DATE_PARTITIONING); + setDefaultOnCondition(props, !props.containsKey(HOODIE_WRITE_STATUS_CLASS_PROP), + HOODIE_WRITE_STATUS_CLASS_PROP, DEFAULT_HOODIE_WRITE_STATUS_CLASS); + + // Make sure the props is propagated + setDefaultOnCondition(props, !isIndexConfigSet, + HoodieIndexConfig.newBuilder().fromProperties(props).build()); + setDefaultOnCondition(props, !isStorageConfigSet, + HoodieStorageConfig.newBuilder().fromProperties(props).build()); + setDefaultOnCondition(props, !isCompactionConfigSet, + HoodieCompactionConfig.newBuilder().fromProperties(props).build()); + setDefaultOnCondition(props, !isMetricsConfigSet, + HoodieMetricsConfig.newBuilder().fromProperties(props).build()); + return config; + } + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieAppendException.java b/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieAppendException.java index 0ba0eb50c..2f896c93a 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieAppendException.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieAppendException.java @@ -17,16 +17,16 @@ package com.uber.hoodie.exception; /** - *

- * Exception thrown for any higher level errors when HoodieClient is doing a delta commit - *

+ *

Exception thrown for any higher level errors when HoodieClient is doing a delta + * commit

*/ public class HoodieAppendException extends HoodieException { - public HoodieAppendException(String msg, Throwable e) { - super(msg, e); - } - public HoodieAppendException(String msg) { - super(msg); - } + public HoodieAppendException(String msg, Throwable e) { + super(msg, e); + } + + public HoodieAppendException(String msg) { + super(msg); + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieCommitException.java b/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieCommitException.java index bc4c139f5..a75b8153f 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieCommitException.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieCommitException.java @@ -17,16 +17,16 @@ package com.uber.hoodie.exception; /** - *

- * Exception thrown for any higher level errors when HoodieClient is doing a Commit + *

Exception thrown for any higher level errors when HoodieClient is doing a Commit *

*/ public class HoodieCommitException extends HoodieException { - public HoodieCommitException(String msg) { - super(msg); - } - public HoodieCommitException(String msg, Throwable e) { - super(msg, e); - } + public HoodieCommitException(String msg) { + super(msg); + } + + public HoodieCommitException(String msg, Throwable e) { + super(msg, e); + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieCompactionException.java b/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieCompactionException.java index 9d016ec6d..66104a657 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieCompactionException.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieCompactionException.java @@ -17,6 +17,7 @@ package com.uber.hoodie.exception; public class HoodieCompactionException extends HoodieException { + public HoodieCompactionException(String msg) { super(msg); } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieDependentSystemUnavailableException.java b/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieDependentSystemUnavailableException.java index 4f64d76ca..59ea7271e 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieDependentSystemUnavailableException.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieDependentSystemUnavailableException.java @@ -18,18 +18,17 @@ package com.uber.hoodie.exception; /** - *

- * Exception thrown when dependent system is not available - *

+ *

Exception thrown when dependent system is not available

*/ public class HoodieDependentSystemUnavailableException extends HoodieException { - public static final String HBASE = "HBASE"; - public HoodieDependentSystemUnavailableException(String system, String connectURL) { - super(getLogMessage(system, connectURL)); - } + public static final String HBASE = "HBASE"; - private static String getLogMessage(String system, String connectURL) { - return "System " + system + " unavailable. Tried to connect to " + connectURL; - } + public HoodieDependentSystemUnavailableException(String system, String connectURL) { + super(getLogMessage(system, connectURL)); + } + + private static String getLogMessage(String system, String connectURL) { + return "System " + system + " unavailable. Tried to connect to " + connectURL; + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieInsertException.java b/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieInsertException.java index a228541d3..3bcfa5434 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieInsertException.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieInsertException.java @@ -16,15 +16,13 @@ package com.uber.hoodie.exception; -import java.io.IOException; - /** - *

- * Exception thrown for any higher level errors when HoodieClient is doing a bulk insert - *

+ *

Exception thrown for any higher level errors when HoodieClient is doing a bulk + * insert

*/ public class HoodieInsertException extends HoodieException { - public HoodieInsertException(String msg, Throwable e) { - super(msg, e); - } + + public HoodieInsertException(String msg, Throwable e) { + super(msg, e); + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieRollbackException.java b/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieRollbackException.java index 67e4835a6..477364d99 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieRollbackException.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieRollbackException.java @@ -18,11 +18,11 @@ package com.uber.hoodie.exception; public class HoodieRollbackException extends HoodieException { - public HoodieRollbackException(String msg, Throwable e) { - super(msg, e); - } + public HoodieRollbackException(String msg, Throwable e) { + super(msg, e); + } - public HoodieRollbackException(String msg) { - super(msg); - } + public HoodieRollbackException(String msg) { + super(msg); + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieSavepointException.java b/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieSavepointException.java index 83e1bd134..0b3e221b1 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieSavepointException.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieSavepointException.java @@ -18,11 +18,11 @@ package com.uber.hoodie.exception; public class HoodieSavepointException extends HoodieException { - public HoodieSavepointException(String msg, Throwable e) { - super(msg, e); - } + public HoodieSavepointException(String msg, Throwable e) { + super(msg, e); + } - public HoodieSavepointException(String msg) { - super(msg); - } + public HoodieSavepointException(String msg) { + super(msg); + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieUpsertException.java 
b/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieUpsertException.java index 16779a92b..e4b0f4c5c 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieUpsertException.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieUpsertException.java @@ -17,16 +17,16 @@ package com.uber.hoodie.exception; /** - *

- * Exception thrown for any higher level errors when HoodieClient is doing a incremental upsert - *

+ *

Exception thrown for any higher level errors when HoodieClient is doing an + * incremental upsert

*/ -public class HoodieUpsertException extends HoodieException { - public HoodieUpsertException(String msg, Throwable e) { - super(msg, e); - } +public class HoodieUpsertException extends HoodieException { - public HoodieUpsertException(String msg) { - super(msg); - } + public HoodieUpsertException(String msg, Throwable e) { + super(msg, e); + } + + public HoodieUpsertException(String msg) { + super(msg); + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/func/BulkInsertMapFunction.java b/hoodie-client/src/main/java/com/uber/hoodie/func/BulkInsertMapFunction.java index ae130a62d..8d305d214 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/func/BulkInsertMapFunction.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/func/BulkInsertMapFunction.java @@ -16,16 +16,14 @@ package com.uber.hoodie.func; -import com.uber.hoodie.common.table.HoodieTableMetaClient; -import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.WriteStatus; import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecordPayload; +import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.table.HoodieTable; -import org.apache.spark.api.java.function.Function2; - import java.util.Iterator; import java.util.List; +import org.apache.spark.api.java.function.Function2; /** @@ -34,20 +32,21 @@ import java.util.List; public class BulkInsertMapFunction implements Function2>, Iterator>> { - private String commitTime; - private HoodieWriteConfig config; - private HoodieTable hoodieTable; + private String commitTime; + private HoodieWriteConfig config; + private HoodieTable hoodieTable; - public BulkInsertMapFunction(String commitTime, HoodieWriteConfig config, - HoodieTable hoodieTable) { - this.commitTime = commitTime; - this.config = config; - this.hoodieTable = hoodieTable; - } + public BulkInsertMapFunction(String commitTime, HoodieWriteConfig config, + HoodieTable hoodieTable) { + this.commitTime = commitTime; + 
this.config = config; + this.hoodieTable = hoodieTable; + } - @Override - public Iterator> call(Integer partition, Iterator> sortedRecordItr) - throws Exception { - return new LazyInsertIterable<>(sortedRecordItr, config, commitTime, hoodieTable); - } + @Override + public Iterator> call(Integer partition, + Iterator> sortedRecordItr) + throws Exception { + return new LazyInsertIterable<>(sortedRecordItr, config, commitTime, hoodieTable); + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/func/LazyInsertIterable.java b/hoodie-client/src/main/java/com/uber/hoodie/func/LazyInsertIterable.java index aa11e7efe..cc038f21a 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/func/LazyInsertIterable.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/func/LazyInsertIterable.java @@ -16,99 +16,101 @@ package com.uber.hoodie.func; -import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.WriteStatus; import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecordPayload; - -import com.uber.hoodie.io.HoodieIOHandle; +import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.io.HoodieCreateHandle; +import com.uber.hoodie.io.HoodieIOHandle; import com.uber.hoodie.table.HoodieTable; -import org.apache.spark.TaskContext; - import java.util.ArrayList; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Set; +import org.apache.spark.TaskContext; /** - * Lazy Iterable, that writes a stream of HoodieRecords sorted by the partitionPath, - * into new files. + * Lazy Iterable, that writes a stream of HoodieRecords sorted by the partitionPath, into new + * files. 
*/ -public class LazyInsertIterable extends LazyIterableIterator, List> { +public class LazyInsertIterable extends + LazyIterableIterator, List> { - private final HoodieWriteConfig hoodieConfig; - private final String commitTime; - private final HoodieTable hoodieTable; - private Set partitionsCleaned; - private HoodieCreateHandle handle; + private final HoodieWriteConfig hoodieConfig; + private final String commitTime; + private final HoodieTable hoodieTable; + private Set partitionsCleaned; + private HoodieCreateHandle handle; - public LazyInsertIterable(Iterator> sortedRecordItr, HoodieWriteConfig config, - String commitTime, HoodieTable hoodieTable) { - super(sortedRecordItr); - this.partitionsCleaned = new HashSet<>(); - this.hoodieConfig = config; - this.commitTime = commitTime; - this.hoodieTable = hoodieTable; + public LazyInsertIterable(Iterator> sortedRecordItr, HoodieWriteConfig config, + String commitTime, HoodieTable hoodieTable) { + super(sortedRecordItr); + this.partitionsCleaned = new HashSet<>(); + this.hoodieConfig = config; + this.commitTime = commitTime; + this.hoodieTable = hoodieTable; + } + + @Override + protected void start() { + } + + + @Override + protected List computeNext() { + List statuses = new ArrayList<>(); + + while (inputItr.hasNext()) { + HoodieRecord record = inputItr.next(); + + // clean up any partial failures + if (!partitionsCleaned.contains(record.getPartitionPath())) { + // This insert task could fail multiple times, but Spark will faithfully retry with + // the same data again. 
Thus, before we open any files under a given partition, we + // first delete any files in the same partitionPath written by same Spark partition + HoodieIOHandle.cleanupTmpFilesFromCurrentCommit(hoodieConfig, + commitTime, + record.getPartitionPath(), + TaskContext.getPartitionId()); + partitionsCleaned.add(record.getPartitionPath()); + } + + // lazily initialize the handle, for the first time + if (handle == null) { + handle = + new HoodieCreateHandle(hoodieConfig, commitTime, hoodieTable, + record.getPartitionPath()); + } + + if (handle.canWrite(record)) { + // write the record, if the handle has capacity + handle.write(record); + } else { + // handle is full. + statuses.add(handle.close()); + // Need to handle the rejected record & open new handle + handle = + new HoodieCreateHandle(hoodieConfig, commitTime, hoodieTable, + record.getPartitionPath()); + handle.write(record); // we should be able to write 1 record. + break; + } } - @Override protected void start() { + // If we exited out, because we ran out of records, just close the pending handle. + if (!inputItr.hasNext()) { + if (handle != null) { + statuses.add(handle.close()); + } } + assert statuses.size() > 0; // should never return empty statuses + return statuses; + } - @Override protected List computeNext() { - List statuses = new ArrayList<>(); + @Override + protected void end() { - while (inputItr.hasNext()) { - HoodieRecord record = inputItr.next(); - - // clean up any partial failures - if (!partitionsCleaned.contains(record.getPartitionPath())) { - // This insert task could fail multiple times, but Spark will faithfully retry with - // the same data again. 
Thus, before we open any files under a given partition, we - // first delete any files in the same partitionPath written by same Spark partition - HoodieIOHandle.cleanupTmpFilesFromCurrentCommit(hoodieConfig, - commitTime, - record.getPartitionPath(), - TaskContext.getPartitionId()); - partitionsCleaned.add(record.getPartitionPath()); - } - - // lazily initialize the handle, for the first time - if (handle == null) { - handle = - new HoodieCreateHandle(hoodieConfig, commitTime, hoodieTable, - record.getPartitionPath()); - } - - if (handle.canWrite(record)) { - // write the record, if the handle has capacity - handle.write(record); - } else { - // handle is full. - statuses.add(handle.close()); - // Need to handle the rejected record & open new handle - handle = - new HoodieCreateHandle(hoodieConfig, commitTime, hoodieTable, - record.getPartitionPath()); - handle.write(record); // we should be able to write 1 record. - break; - } - } - - // If we exited out, because we ran out of records, just close the pending handle. 
- if (!inputItr.hasNext()) { - if (handle != null) { - statuses.add(handle.close()); - } - } - - assert statuses.size() > 0; // should never return empty statuses - return statuses; - } - - @Override protected void end() { - - } + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/func/LazyIterableIterator.java b/hoodie-client/src/main/java/com/uber/hoodie/func/LazyIterableIterator.java index 195342f82..2720e001e 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/func/LazyIterableIterator.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/func/LazyIterableIterator.java @@ -31,98 +31,99 @@ import java.util.Iterator; * responsible for calling inputIterator.next() and doing the processing in computeNext() */ public abstract class LazyIterableIterator implements Iterable, Iterator { - protected Iterator inputItr = null; - private boolean consumed = false; - private boolean startCalled = false; - private boolean endCalled = false; - public LazyIterableIterator(Iterator in) { - inputItr = in; + protected Iterator inputItr = null; + private boolean consumed = false; + private boolean startCalled = false; + private boolean endCalled = false; + + public LazyIterableIterator(Iterator in) { + inputItr = in; + } + + /** + * Called once, before any elements are processed + */ + protected abstract void start(); + + /** + * Block computation to be overwritten by sub classes. + */ + protected abstract O computeNext(); + + + /** + * Called once, after all elements are processed. 
+ */ + protected abstract void end(); + + ////////////////// + // iterable implementation + + private void invokeStartIfNeeded() { + if (!startCalled) { + startCalled = true; + try { + start(); + } catch (Exception e) { + throw new RuntimeException("Error in start()"); + } + } + } + + private void invokeEndIfNeeded() { + // make the calls out to begin() & end() + if (!endCalled) { + endCalled = true; + // if we are out of elements, and end has not been called yet + try { + end(); + } catch (Exception e) { + throw new RuntimeException("Error in end()"); + } + } + } + + @Override + public Iterator iterator() { + //check for consumed inputItr + if (consumed) { + throw new RuntimeException("Invalid repeated inputItr consumption."); } - /** - * Called once, before any elements are processed - */ - protected abstract void start(); + //hand out self as inputItr exactly once (note: do not hand out the input + //inputItr since it is consumed by the self inputItr implementation) + consumed = true; + return this; + } - /** - * Block computation to be overwritten by sub classes. - */ - protected abstract O computeNext(); + ////////////////// + // inputItr implementation - - /** - * Called once, after all elements are processed. 
- */ - protected abstract void end(); - - - ////////////////// - // iterable implementation - - private void invokeStartIfNeeded() { - if (!startCalled) { - startCalled = true; - try { - start(); - } catch (Exception e) { - throw new RuntimeException("Error in start()"); - } - } + @Override + public boolean hasNext() { + boolean ret = inputItr.hasNext(); + // make sure, there is exactly one call to start() + invokeStartIfNeeded(); + if (!ret) { + // if we are out of elements, and end has not been called yet + invokeEndIfNeeded(); } - private void invokeEndIfNeeded() { - // make the calls out to begin() & end() - if (!endCalled) { - endCalled = true; - // if we are out of elements, and end has not been called yet - try { - end(); - } catch (Exception e) { - throw new RuntimeException("Error in end()"); - } - } + return ret; + } + + @Override + public O next() { + try { + return computeNext(); + } catch (Exception ex) { + throw new RuntimeException(ex); } + } - @Override - public Iterator iterator() { - //check for consumed inputItr - if (consumed) - throw new RuntimeException("Invalid repeated inputItr consumption."); - - //hand out self as inputItr exactly once (note: do not hand out the input - //inputItr since it is consumed by the self inputItr implementation) - consumed = true; - return this; - } - - ////////////////// - // inputItr implementation - - @Override - public boolean hasNext() { - boolean ret = inputItr.hasNext(); - // make sure, there is exactly one call to start() - invokeStartIfNeeded(); - if (!ret) { - // if we are out of elements, and end has not been called yet - invokeEndIfNeeded(); - } - - return ret; - } - - @Override - public O next() { - try { - return computeNext(); - } catch (Exception ex) { - throw new RuntimeException(ex); - } - } - - @Override - public void remove() { - throw new RuntimeException("Unsupported remove operation."); - } + @Override + public void remove() { + throw new RuntimeException("Unsupported remove operation."); + 
} } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/index/HoodieIndex.java b/hoodie-client/src/main/java/com/uber/hoodie/index/HoodieIndex.java index 91b23bc0c..642cb7d9b 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/index/HoodieIndex.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/index/HoodieIndex.java @@ -17,118 +17,108 @@ package com.uber.hoodie.index; import com.google.common.base.Optional; - -import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.WriteStatus; import com.uber.hoodie.common.model.HoodieKey; -import com.uber.hoodie.common.model.HoodieRecordPayload; import com.uber.hoodie.common.model.HoodieRecord; - +import com.uber.hoodie.common.model.HoodieRecordPayload; +import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.exception.HoodieIndexException; import com.uber.hoodie.index.bloom.HoodieBloomIndex; import com.uber.hoodie.index.bucketed.BucketedIndex; import com.uber.hoodie.index.hbase.HBaseIndex; import com.uber.hoodie.table.HoodieTable; +import java.io.Serializable; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; -import java.io.Serializable; - /** * Base class for different types of indexes to determine the mapping from uuid - * */ public abstract class HoodieIndex implements Serializable { - protected transient JavaSparkContext jsc = null; - public enum IndexType { - HBASE, - INMEMORY, - BLOOM, - BUCKETED - } - - protected final HoodieWriteConfig config; - - protected HoodieIndex(HoodieWriteConfig config, JavaSparkContext jsc) { - this.config = config; - this.jsc = jsc; - } - - /** - * Checks if the given [Keys] exists in the hoodie table and returns [Key, Optional[FullFilePath]] - * If the optional FullFilePath value is not present, then the key is not found. 
If the FullFilePath - * value is present, it is the path component (without scheme) of the URI underlying file - * - * @param hoodieKeys - * @param table - * @return - */ - public abstract JavaPairRDD> fetchRecordLocation( - JavaRDD hoodieKeys, final HoodieTable table); - - /** - * Looks up the index and tags each incoming record with a location of a file that contains the - * row (if it is actually present) - */ - public abstract JavaRDD> tagLocation(JavaRDD> recordRDD, - HoodieTable hoodieTable) throws HoodieIndexException; - - /** - * Extracts the location of written records, and updates the index. - * - * TODO(vc): We may need to propagate the record as well in a WriteStatus class - */ - public abstract JavaRDD updateLocation(JavaRDD writeStatusRDD, - HoodieTable hoodieTable) throws HoodieIndexException; - - /** - * Rollback the efffects of the commit made at commitTime. - */ - public abstract boolean rollbackCommit(String commitTime); - - /** - * An index is `global` if {@link HoodieKey} to fileID mapping, does not depend on the `partitionPath`. - * Such an implementation is able to obtain the same mapping, for two hoodie keys with same `recordKey` - * but different `partitionPath` - * - * @return whether or not, the index implementation is global in nature - */ - public abstract boolean isGlobal(); - - /** - * This is used by storage to determine, if its safe to send inserts, straight to the log, - * i.e having a {@link com.uber.hoodie.common.model.FileSlice}, with no data file. - * - * @return Returns true/false depending on whether the impl has this capability - */ - public abstract boolean canIndexLogFiles(); - - - /** - * - * An index is "implicit" with respect to storage, if just writing new data to a file slice, - * updates the index as well. This is used by storage, to save memory footprint in - * certain cases. 
- * - * @return - */ - public abstract boolean isImplicitWithStorage(); - - - public static HoodieIndex createIndex( - HoodieWriteConfig config, JavaSparkContext jsc) throws HoodieIndexException { - switch (config.getIndexType()) { - case HBASE: - return new HBaseIndex<>(config, jsc); - case INMEMORY: - return new InMemoryHashIndex<>(config, jsc); - case BLOOM: - return new HoodieBloomIndex<>(config, jsc); - case BUCKETED: - return new BucketedIndex<>(config, jsc); - } - throw new HoodieIndexException("Index type unspecified, set " + config.getIndexType()); + protected transient JavaSparkContext jsc = null; + + public enum IndexType { + HBASE, + INMEMORY, + BLOOM, + BUCKETED + } + + protected final HoodieWriteConfig config; + + protected HoodieIndex(HoodieWriteConfig config, JavaSparkContext jsc) { + this.config = config; + this.jsc = jsc; + } + + /** + * Checks if the given [Keys] exists in the hoodie table and returns [Key, Optional[FullFilePath]] + * If the optional FullFilePath value is not present, then the key is not found. If the + * FullFilePath value is present, it is the path component (without scheme) of the URI underlying + * file + */ + public abstract JavaPairRDD> fetchRecordLocation( + JavaRDD hoodieKeys, final HoodieTable table); + + /** + * Looks up the index and tags each incoming record with a location of a file that contains the + * row (if it is actually present) + */ + public abstract JavaRDD> tagLocation(JavaRDD> recordRDD, + HoodieTable hoodieTable) throws HoodieIndexException; + + /** + * Extracts the location of written records, and updates the index. + * + * TODO(vc): We may need to propagate the record as well in a WriteStatus class + */ + public abstract JavaRDD updateLocation(JavaRDD writeStatusRDD, + HoodieTable hoodieTable) throws HoodieIndexException; + + /** + * Rollback the efffects of the commit made at commitTime. 
+ */ + public abstract boolean rollbackCommit(String commitTime); + + /** + * An index is `global` if {@link HoodieKey} to fileID mapping, does not depend on the + * `partitionPath`. Such an implementation is able to obtain the same mapping, for two hoodie keys + * with same `recordKey` but different `partitionPath` + * + * @return whether or not, the index implementation is global in nature + */ + public abstract boolean isGlobal(); + + /** + * This is used by storage to determine, if its safe to send inserts, straight to the log, i.e + * having a {@link com.uber.hoodie.common.model.FileSlice}, with no data file. + * + * @return Returns true/false depending on whether the impl has this capability + */ + public abstract boolean canIndexLogFiles(); + + + /** + * An index is "implicit" with respect to storage, if just writing new data to a file slice, + * updates the index as well. This is used by storage, to save memory footprint in certain cases. + */ + public abstract boolean isImplicitWithStorage(); + + + public static HoodieIndex createIndex( + HoodieWriteConfig config, JavaSparkContext jsc) throws HoodieIndexException { + switch (config.getIndexType()) { + case HBASE: + return new HBaseIndex<>(config, jsc); + case INMEMORY: + return new InMemoryHashIndex<>(config, jsc); + case BLOOM: + return new HoodieBloomIndex<>(config, jsc); + case BUCKETED: + return new BucketedIndex<>(config, jsc); } + throw new HoodieIndexException("Index type unspecified, set " + config.getIndexType()); + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/index/InMemoryHashIndex.java b/hoodie-client/src/main/java/com/uber/hoodie/index/InMemoryHashIndex.java index 7f202f662..422d31983 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/index/InMemoryHashIndex.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/index/InMemoryHashIndex.java @@ -17,129 +17,119 @@ package com.uber.hoodie.index; import com.google.common.base.Optional; -import 
com.uber.hoodie.common.table.HoodieTableMetaClient; -import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.WriteStatus; import com.uber.hoodie.common.model.HoodieKey; import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecordLocation; import com.uber.hoodie.common.model.HoodieRecordPayload; - +import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.table.HoodieTable; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.Function; import org.apache.spark.api.java.function.Function2; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.ConcurrentMap; - /** - * Hoodie Index implementation backed by an in-memory Hash map. - *

- * ONLY USE FOR LOCAL TESTING + * Hoodie Index implementation backed by an in-memory Hash map.

ONLY USE FOR LOCAL TESTING */ public class InMemoryHashIndex extends HoodieIndex { - private static ConcurrentMap recordLocationMap; + private static ConcurrentMap recordLocationMap; - public InMemoryHashIndex(HoodieWriteConfig config, JavaSparkContext jsc) { - super(config, jsc); - recordLocationMap = new ConcurrentHashMap<>(); - } + public InMemoryHashIndex(HoodieWriteConfig config, JavaSparkContext jsc) { + super(config, jsc); + recordLocationMap = new ConcurrentHashMap<>(); + } + + @Override + public JavaPairRDD> fetchRecordLocation( + JavaRDD hoodieKeys, final HoodieTable table) { + throw new UnsupportedOperationException("InMemory index does not implement check exist yet"); + } + + /** + * Function that tags each HoodieRecord with an existing location, if known. + */ + class LocationTagFunction + implements Function2>, Iterator>> { @Override - public JavaPairRDD> fetchRecordLocation( - JavaRDD hoodieKeys, final HoodieTable table) { - throw new UnsupportedOperationException("InMemory index does not implement check exist yet"); - } - - /** - * Function that tags each HoodieRecord with an existing location, if known. 
- */ - class LocationTagFunction - implements Function2>, Iterator>> { - @Override - public Iterator> call(Integer partitionNum, - Iterator> hoodieRecordIterator) { - List> taggedRecords = new ArrayList<>(); - while (hoodieRecordIterator.hasNext()) { - HoodieRecord rec = hoodieRecordIterator.next(); - if (recordLocationMap.containsKey(rec.getKey())) { - rec.setCurrentLocation(recordLocationMap.get(rec.getKey())); - } - taggedRecords.add(rec); - } - return taggedRecords.iterator(); + public Iterator> call(Integer partitionNum, + Iterator> hoodieRecordIterator) { + List> taggedRecords = new ArrayList<>(); + while (hoodieRecordIterator.hasNext()) { + HoodieRecord rec = hoodieRecordIterator.next(); + if (recordLocationMap.containsKey(rec.getKey())) { + rec.setCurrentLocation(recordLocationMap.get(rec.getKey())); } + taggedRecords.add(rec); + } + return taggedRecords.iterator(); } + } - @Override - public JavaRDD> tagLocation(JavaRDD> recordRDD, - HoodieTable hoodieTable) { - return recordRDD.mapPartitionsWithIndex(this.new LocationTagFunction(), true); - } + @Override + public JavaRDD> tagLocation(JavaRDD> recordRDD, + HoodieTable hoodieTable) { + return recordRDD.mapPartitionsWithIndex(this.new LocationTagFunction(), true); + } - @Override - public JavaRDD updateLocation(JavaRDD writeStatusRDD, - HoodieTable hoodieTable) { - return writeStatusRDD.map(new Function() { - @Override - public WriteStatus call(WriteStatus writeStatus) { - for (HoodieRecord record : writeStatus.getWrittenRecords()) { - if (!writeStatus.isErrored(record.getKey())) { - HoodieKey key = record.getKey(); - java.util.Optional newLocation = record.getNewLocation(); - if (newLocation.isPresent()) { - recordLocationMap.put(key, newLocation.get()); - } else { - //Delete existing index for a deleted record - recordLocationMap.remove(key); - } - } - } - return writeStatus; + @Override + public JavaRDD updateLocation(JavaRDD writeStatusRDD, + HoodieTable hoodieTable) { + return writeStatusRDD.map(new 
Function() { + @Override + public WriteStatus call(WriteStatus writeStatus) { + for (HoodieRecord record : writeStatus.getWrittenRecords()) { + if (!writeStatus.isErrored(record.getKey())) { + HoodieKey key = record.getKey(); + java.util.Optional newLocation = record.getNewLocation(); + if (newLocation.isPresent()) { + recordLocationMap.put(key, newLocation.get()); + } else { + //Delete existing index for a deleted record + recordLocationMap.remove(key); } - }); - } + } + } + return writeStatus; + } + }); + } - @Override - public boolean rollbackCommit(String commitTime) { - return true; - } + @Override + public boolean rollbackCommit(String commitTime) { + return true; + } - /** - * Only looks up by recordKey - * - * @return - */ - @Override - public boolean isGlobal() { - return true; - } + /** + * Only looks up by recordKey + */ + @Override + public boolean isGlobal() { + return true; + } - /** - * Mapping is available in HBase already. - * - * @return - */ - @Override - public boolean canIndexLogFiles() { - return true; - } + /** + * Mapping is available in HBase already. + */ + @Override + public boolean canIndexLogFiles() { + return true; + } - /** - * Index needs to be explicitly updated after storage write. - * - * @return - */ - @Override - public boolean isImplicitWithStorage() { - return false; - } + /** + * Index needs to be explicitly updated after storage write. 
+ */ + @Override + public boolean isImplicitWithStorage() { + return false; + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/index/bloom/BloomIndexFileInfo.java b/hoodie-client/src/main/java/com/uber/hoodie/index/bloom/BloomIndexFileInfo.java index abe482094..0f0fb9908 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/index/bloom/BloomIndexFileInfo.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/index/bloom/BloomIndexFileInfo.java @@ -19,7 +19,6 @@ package com.uber.hoodie.index.bloom; import com.google.common.base.Objects; - import java.io.Serializable; /** @@ -27,73 +26,75 @@ import java.io.Serializable; */ public class BloomIndexFileInfo implements Serializable { - private final String fileName; + private final String fileName; - private final String minRecordKey; + private final String minRecordKey; - private final String maxRecordKey; + private final String maxRecordKey; - public BloomIndexFileInfo(String fileName, String minRecordKey, String maxRecordKey) { - this.fileName = fileName; - this.minRecordKey = minRecordKey; - this.maxRecordKey = maxRecordKey; + public BloomIndexFileInfo(String fileName, String minRecordKey, String maxRecordKey) { + this.fileName = fileName; + this.minRecordKey = minRecordKey; + this.maxRecordKey = maxRecordKey; + } + + public BloomIndexFileInfo(String fileName) { + this.fileName = fileName; + this.minRecordKey = null; + this.maxRecordKey = null; + } + + public String getFileName() { + return fileName; + } + + public String getMinRecordKey() { + return minRecordKey; + } + + public String getMaxRecordKey() { + return maxRecordKey; + } + + public boolean hasKeyRanges() { + return minRecordKey != null && maxRecordKey != null; + } + + /** + * Does the given key fall within the range (inclusive) + */ + public boolean isKeyInRange(String recordKey) { + return minRecordKey.compareTo(recordKey) <= 0 && + maxRecordKey.compareTo(recordKey) >= 0; + } + + @Override + public boolean equals(Object o) { + if (this == 
o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; } - public BloomIndexFileInfo(String fileName) { - this.fileName = fileName; - this.minRecordKey = null; - this.maxRecordKey = null; - } + BloomIndexFileInfo that = (BloomIndexFileInfo) o; + return Objects.equal(that.fileName, fileName) && + Objects.equal(that.minRecordKey, minRecordKey) && + Objects.equal(that.maxRecordKey, maxRecordKey); - public String getFileName() { - return fileName; - } + } - public String getMinRecordKey() { - return minRecordKey; - } + @Override + public int hashCode() { + return Objects.hashCode(fileName, minRecordKey, maxRecordKey); + } - public String getMaxRecordKey() { - return maxRecordKey; - } - - public boolean hasKeyRanges() { - return minRecordKey != null && maxRecordKey != null; - } - - /** - * Does the given key fall within the range (inclusive) - * @param recordKey - * @return - */ - public boolean isKeyInRange(String recordKey) { - return minRecordKey.compareTo(recordKey) <= 0 && - maxRecordKey.compareTo(recordKey) >= 0; - } - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - - BloomIndexFileInfo that = (BloomIndexFileInfo) o; - return Objects.equal(that.fileName, fileName) && - Objects.equal(that.minRecordKey, minRecordKey) && - Objects.equal(that.maxRecordKey, maxRecordKey); - - } - - @Override - public int hashCode() { - return Objects.hashCode(fileName, minRecordKey, maxRecordKey); - } - - public String toString() { - final StringBuilder sb = new StringBuilder("BloomIndexFileInfo {"); - sb.append(" fileName=").append(fileName); - sb.append(" minRecordKey=").append(minRecordKey); - sb.append(" maxRecordKey=").append(maxRecordKey); - sb.append('}'); - return sb.toString(); - } + public String toString() { + final StringBuilder sb = new StringBuilder("BloomIndexFileInfo {"); + sb.append(" fileName=").append(fileName); + sb.append(" 
minRecordKey=").append(minRecordKey); + sb.append(" maxRecordKey=").append(maxRecordKey); + sb.append('}'); + return sb.toString(); + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/index/bloom/HoodieBloomIndex.java b/hoodie-client/src/main/java/com/uber/hoodie/index/bloom/HoodieBloomIndex.java index 37e0bc719..44dc910c1 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/index/bloom/HoodieBloomIndex.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/index/bloom/HoodieBloomIndex.java @@ -18,9 +18,12 @@ package com.uber.hoodie.index.bloom; +import static java.util.stream.Collectors.groupingBy; +import static java.util.stream.Collectors.mapping; +import static java.util.stream.Collectors.toList; + import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Optional; - import com.uber.hoodie.WriteStatus; import com.uber.hoodie.common.model.HoodieDataFile; import com.uber.hoodie.common.model.HoodieKey; @@ -34,7 +37,10 @@ import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.exception.MetadataNotFoundException; import com.uber.hoodie.index.HoodieIndex; import com.uber.hoodie.table.HoodieTable; - +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -42,369 +48,370 @@ import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.storage.StorageLevel; - import scala.Tuple2; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; - -import static java.util.stream.Collectors.*; - /** * Indexing mechanism based on bloom filter. Each parquet file includes its row_key bloom filter in * its metadata. 
*/ public class HoodieBloomIndex extends HoodieIndex { - private static Logger logger = LogManager.getLogger(HoodieBloomIndex.class); + private static Logger logger = LogManager.getLogger(HoodieBloomIndex.class); - // we need to limit the join such that it stays within 1.5GB per Spark partition. (SPARK-1476) - private static final int SPARK_MAXIMUM_BYTES_PER_PARTITION = 1500 * 1024 * 1024; - // this is how much a triplet of (partitionPath, fileId, recordKey) costs. - private static final int BYTES_PER_PARTITION_FILE_KEY_TRIPLET = 300; - private static int MAX_ITEMS_PER_SHUFFLE_PARTITION = SPARK_MAXIMUM_BYTES_PER_PARTITION / BYTES_PER_PARTITION_FILE_KEY_TRIPLET; + // we need to limit the join such that it stays within 1.5GB per Spark partition. (SPARK-1476) + private static final int SPARK_MAXIMUM_BYTES_PER_PARTITION = 1500 * 1024 * 1024; + // this is how much a triplet of (partitionPath, fileId, recordKey) costs. + private static final int BYTES_PER_PARTITION_FILE_KEY_TRIPLET = 300; + private static int MAX_ITEMS_PER_SHUFFLE_PARTITION = + SPARK_MAXIMUM_BYTES_PER_PARTITION / BYTES_PER_PARTITION_FILE_KEY_TRIPLET; - public HoodieBloomIndex(HoodieWriteConfig config, JavaSparkContext jsc) { - super(config, jsc); + public HoodieBloomIndex(HoodieWriteConfig config, JavaSparkContext jsc) { + super(config, jsc); + } + + @Override + public JavaRDD> tagLocation(JavaRDD> recordRDD, + final HoodieTable hoodieTable) { + + // Step 0: cache the input record RDD + if (config.getBloomIndexUseCaching()) { + recordRDD.persist(StorageLevel.MEMORY_AND_DISK_SER()); } - @Override - public JavaRDD> tagLocation(JavaRDD> recordRDD, final HoodieTable hoodieTable) { + // Step 1: Extract out thinner JavaPairRDD of (partitionPath, recordKey) + JavaPairRDD partitionRecordKeyPairRDD = recordRDD + .mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey())); - // Step 0: cache the input record RDD - if (config.getBloomIndexUseCaching()) { - 
recordRDD.persist(StorageLevel.MEMORY_AND_DISK_SER()); - } + // Lookup indexes for all the partition/recordkey pair + JavaPairRDD rowKeyFilenamePairRDD = lookupIndex(partitionRecordKeyPairRDD, + hoodieTable); - // Step 1: Extract out thinner JavaPairRDD of (partitionPath, recordKey) - JavaPairRDD partitionRecordKeyPairRDD = recordRDD - .mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey())); - - // Lookup indexes for all the partition/recordkey pair - JavaPairRDD rowKeyFilenamePairRDD = lookupIndex(partitionRecordKeyPairRDD, hoodieTable); - - // Cache the result, for subsequent stages. - if (config.getBloomIndexUseCaching()) { - rowKeyFilenamePairRDD.persist(StorageLevel.MEMORY_AND_DISK_SER()); - } - if (logger.isDebugEnabled()) { - long totalTaggedRecords = rowKeyFilenamePairRDD.count(); - logger.debug("Number of update records (ones tagged with a fileID): " + totalTaggedRecords); - } - - // Step 4: Tag the incoming records, as inserts or updates, by joining with existing record keys - // Cost: 4 sec. - JavaRDD> taggedRecordRDD = tagLocationBacktoRecords(rowKeyFilenamePairRDD, recordRDD); - - if (config.getBloomIndexUseCaching()) { - recordRDD.unpersist(); // unpersist the input Record RDD - rowKeyFilenamePairRDD.unpersist(); - } - - return taggedRecordRDD; + // Cache the result, for subsequent stages. + if (config.getBloomIndexUseCaching()) { + rowKeyFilenamePairRDD.persist(StorageLevel.MEMORY_AND_DISK_SER()); + } + if (logger.isDebugEnabled()) { + long totalTaggedRecords = rowKeyFilenamePairRDD.count(); + logger.debug("Number of update records (ones tagged with a fileID): " + totalTaggedRecords); } - public JavaPairRDD> fetchRecordLocation( - JavaRDD hoodieKeys, final HoodieTable table) { - JavaPairRDD partitionRecordKeyPairRDD = - hoodieKeys.mapToPair(key -> new Tuple2<>(key.getPartitionPath(), key.getRecordKey())); + // Step 4: Tag the incoming records, as inserts or updates, by joining with existing record keys + // Cost: 4 sec. 
+ JavaRDD> taggedRecordRDD = tagLocationBacktoRecords(rowKeyFilenamePairRDD, + recordRDD); - // Lookup indexes for all the partition/recordkey pair - JavaPairRDD rowKeyFilenamePairRDD = - lookupIndex(partitionRecordKeyPairRDD, table); - - JavaPairRDD rowKeyHoodieKeyPairRDD = - hoodieKeys.mapToPair(key -> new Tuple2<>(key.getRecordKey(), key)); - - return rowKeyHoodieKeyPairRDD.leftOuterJoin(rowKeyFilenamePairRDD) - .mapToPair(keyPathTuple -> { - Optional recordLocationPath; - if (keyPathTuple._2._2.isPresent()) { - String fileName = keyPathTuple._2._2.get(); - String partitionPath = keyPathTuple._2._1.getPartitionPath(); - recordLocationPath = Optional.of(new Path( - new Path(table.getMetaClient().getBasePath(), partitionPath), - fileName).toUri().getPath()); - } else { - recordLocationPath = Optional.absent(); - } - return new Tuple2<>(keyPathTuple._2._1, recordLocationPath); - }); + if (config.getBloomIndexUseCaching()) { + recordRDD.unpersist(); // unpersist the input Record RDD + rowKeyFilenamePairRDD.unpersist(); } - /** - * Lookup the location for each record key and return the pair for all - * record keys already present and drop the record keys if not present - */ - private JavaPairRDD lookupIndex( - JavaPairRDD partitionRecordKeyPairRDD, final HoodieTable hoodieTable) { - // Obtain records per partition, in the incoming records - Map recordsPerPartition = partitionRecordKeyPairRDD.countByKey(); - List affectedPartitionPathList = new ArrayList<>(recordsPerPartition.keySet()); + return taggedRecordRDD; + } - // Step 2: Load all involved files as pairs - List> fileInfoList = loadInvolvedFiles(affectedPartitionPathList, hoodieTable); - final Map> partitionToFileInfo = fileInfoList.stream() - .collect(groupingBy(Tuple2::_1, mapping(Tuple2::_2, toList()))); + public JavaPairRDD> fetchRecordLocation( + JavaRDD hoodieKeys, final HoodieTable table) { + JavaPairRDD partitionRecordKeyPairRDD = + hoodieKeys.mapToPair(key -> new Tuple2<>(key.getPartitionPath(), 
key.getRecordKey())); - // Step 3: Obtain a RDD, for each incoming record, that already exists, with the file id, that contains it. - int parallelism = autoComputeParallelism(recordsPerPartition, partitionToFileInfo, partitionRecordKeyPairRDD); - return findMatchingFilesForRecordKeys(partitionToFileInfo, partitionRecordKeyPairRDD, parallelism); + // Lookup indexes for all the partition/recordkey pair + JavaPairRDD rowKeyFilenamePairRDD = + lookupIndex(partitionRecordKeyPairRDD, table); + + JavaPairRDD rowKeyHoodieKeyPairRDD = + hoodieKeys.mapToPair(key -> new Tuple2<>(key.getRecordKey(), key)); + + return rowKeyHoodieKeyPairRDD.leftOuterJoin(rowKeyFilenamePairRDD) + .mapToPair(keyPathTuple -> { + Optional recordLocationPath; + if (keyPathTuple._2._2.isPresent()) { + String fileName = keyPathTuple._2._2.get(); + String partitionPath = keyPathTuple._2._1.getPartitionPath(); + recordLocationPath = Optional.of(new Path( + new Path(table.getMetaClient().getBasePath(), partitionPath), + fileName).toUri().getPath()); + } else { + recordLocationPath = Optional.absent(); + } + return new Tuple2<>(keyPathTuple._2._1, recordLocationPath); + }); + } + + /** + * Lookup the location for each record key and return the pair for all record + * keys already present and drop the record keys if not present + */ + private JavaPairRDD lookupIndex( + JavaPairRDD partitionRecordKeyPairRDD, final HoodieTable hoodieTable) { + // Obtain records per partition, in the incoming records + Map recordsPerPartition = partitionRecordKeyPairRDD.countByKey(); + List affectedPartitionPathList = new ArrayList<>(recordsPerPartition.keySet()); + + // Step 2: Load all involved files as pairs + List> fileInfoList = loadInvolvedFiles( + affectedPartitionPathList, hoodieTable); + final Map> partitionToFileInfo = fileInfoList.stream() + .collect(groupingBy(Tuple2::_1, mapping(Tuple2::_2, toList()))); + + // Step 3: Obtain a RDD, for each incoming record, that already exists, with the file id, that contains it. 
+ int parallelism = autoComputeParallelism(recordsPerPartition, partitionToFileInfo, + partitionRecordKeyPairRDD); + return findMatchingFilesForRecordKeys(partitionToFileInfo, partitionRecordKeyPairRDD, + parallelism); + } + + /** + * The index lookup can be skewed in three dimensions : #files, #partitions, #records + * + * To be able to smoothly handle skews, we need to compute how to split each partitions into + * subpartitions. We do it here, in a way that keeps the amount of each Spark join partition to < + * 2GB. + * + * If {@link com.uber.hoodie.config.HoodieIndexConfig#BLOOM_INDEX_PARALLELISM_PROP} is specified + * as a NON-zero number, then that is used explicitly. + */ + private int autoComputeParallelism(final Map recordsPerPartition, + final Map> partitionToFileInfo, + JavaPairRDD partitionRecordKeyPairRDD) { + + long totalComparisons = 0; + if (config.getBloomIndexPruneByRanges()) { + // we will just try exploding the input and then count to determine comparisons + totalComparisons = explodeRecordRDDWithFileComparisons(partitionToFileInfo, + partitionRecordKeyPairRDD).count(); + } else { + // if not pruning by ranges, then each file in a partition needs to compared against all + // records for a partition. + Map filesPerPartition = partitionToFileInfo.entrySet().stream() + .collect(Collectors.toMap(Map.Entry::getKey, e -> Long.valueOf(e.getValue().size()))); + long totalFiles = 0, totalRecords = 0; + for (String partitionPath : recordsPerPartition.keySet()) { + long numRecords = recordsPerPartition.get(partitionPath); + long numFiles = + filesPerPartition.containsKey(partitionPath) ? filesPerPartition.get(partitionPath) + : 1L; + + totalComparisons += numFiles * numRecords; + totalFiles += + filesPerPartition.containsKey(partitionPath) ? 
filesPerPartition.get(partitionPath) + : 0L; + totalRecords += numRecords; + } + logger.info("TotalRecords: " + totalRecords + ", TotalFiles: " + totalFiles + + ", TotalAffectedPartitions:" + recordsPerPartition.size()); } - /** - * The index lookup can be skewed in three dimensions : #files, #partitions, #records - * - * To be able to smoothly handle skews, we need to compute how to split each partitions into - * subpartitions. We do it here, in a way that keeps the amount of each Spark join partition to - * < 2GB. - * - * If {@link com.uber.hoodie.config.HoodieIndexConfig#BLOOM_INDEX_PARALLELISM_PROP} is specified as a NON-zero number, - * then that is used explicitly. - * - */ - private int autoComputeParallelism(final Map recordsPerPartition, - final Map> partitionToFileInfo, - JavaPairRDD partitionRecordKeyPairRDD) { + // each partition will have an item per comparison. + int parallelism = (int) (totalComparisons / MAX_ITEMS_PER_SHUFFLE_PARTITION + 1); + logger.info( + "Auto computed parallelism :" + parallelism + ", totalComparisons: " + totalComparisons); + return parallelism; + } - long totalComparisons = 0; - if (config.getBloomIndexPruneByRanges()) { - // we will just try exploding the input and then count to determine comparisons - totalComparisons = explodeRecordRDDWithFileComparisons(partitionToFileInfo, partitionRecordKeyPairRDD).count(); - } else { - // if not pruning by ranges, then each file in a partition needs to compared against all - // records for a partition. - Map filesPerPartition = partitionToFileInfo.entrySet().stream() - .collect(Collectors.toMap(Map.Entry::getKey, e -> Long.valueOf(e.getValue().size()))); - long totalFiles = 0, totalRecords = 0; - for (String partitionPath : recordsPerPartition.keySet()) { - long numRecords = recordsPerPartition.get(partitionPath); - long numFiles = filesPerPartition.containsKey(partitionPath) ? filesPerPartition.get(partitionPath) : 1L; + /** + * Its crucial to pick the right parallelism. 
+ * + * totalSubPartitions : this is deemed safe limit, to be nice with Spark. inputParallelism : + * typically number of input file splits + * + * We pick the max such that, we are always safe, but go higher if say a there are a lot of input + * files. (otherwise, we will fallback to number of partitions in input and end up with slow + * performance) + */ + private int determineParallelism(int inputParallelism, int totalSubPartitions) { + // If bloom index parallelism is set, use it to to check against the input parallelism and take the max + int indexParallelism = Math.max(inputParallelism, config.getBloomIndexParallelism()); + int joinParallelism = Math.max(totalSubPartitions, indexParallelism); + logger.info("InputParallelism: ${" + inputParallelism + "}, " + + "IndexParallelism: ${" + config.getBloomIndexParallelism() + "}, " + + "TotalSubParts: ${" + totalSubPartitions + "}, " + + "Join Parallelism set to : " + joinParallelism); + return joinParallelism; + } - totalComparisons += numFiles * numRecords; - totalFiles += filesPerPartition.containsKey(partitionPath) ? filesPerPartition.get(partitionPath) : 0L; - totalRecords += numRecords; - } - logger.info("TotalRecords: " + totalRecords + ", TotalFiles: " + totalFiles + ", TotalAffectedPartitions:" + recordsPerPartition.size()); - } - - // each partition will have an item per comparison. - int parallelism = (int) (totalComparisons/ MAX_ITEMS_PER_SHUFFLE_PARTITION + 1); - logger.info("Auto computed parallelism :" + parallelism + ", totalComparisons: " + totalComparisons); - return parallelism; - } - - /** - * Its crucial to pick the right parallelism. - * - * totalSubPartitions : this is deemed safe limit, to be nice with Spark. - * inputParallelism : typically number of input file splits - * - * We pick the max such that, we are always safe, but go higher if say a there are a lot of - * input files. 
(otherwise, we will fallback to number of partitions in input and end up with - * slow performance) - */ - private int determineParallelism(int inputParallelism, int totalSubPartitions) { - // If bloom index parallelism is set, use it to to check against the input parallelism and take the max - int indexParallelism = Math.max(inputParallelism, config.getBloomIndexParallelism()); - int joinParallelism = Math.max(totalSubPartitions, indexParallelism); - logger.info("InputParallelism: ${" + inputParallelism + "}, " + - "IndexParallelism: ${" + config.getBloomIndexParallelism() + "}, " + - "TotalSubParts: ${" + totalSubPartitions + "}, " + - "Join Parallelism set to : " + joinParallelism); - return joinParallelism; - } - - /** - * Load all involved files as pair RDD. - */ - @VisibleForTesting - List> loadInvolvedFiles(List partitions, final HoodieTable hoodieTable) { - // Obtain the latest data files from all the partitions. - List> dataFilesList = jsc.parallelize(partitions, Math.max(partitions.size(), 1)) - .flatMapToPair(partitionPath -> { - java.util.Optional latestCommitTime = - hoodieTable.getCommitTimeline().filterCompletedInstants().lastInstant(); - List> filteredFiles = new ArrayList<>(); - if (latestCommitTime.isPresent()) { - filteredFiles = - hoodieTable.getROFileSystemView().getLatestDataFilesBeforeOrOn(partitionPath, - latestCommitTime.get().getTimestamp()) - .map(f -> new Tuple2<>(partitionPath, f)) - .collect(toList()); - } - return filteredFiles.iterator(); - }).collect(); - - if (config.getBloomIndexPruneByRanges()) { - // also obtain file ranges, if range pruning is enabled - return jsc.parallelize(dataFilesList, Math.max(dataFilesList.size(), 1)) - .mapToPair(ft -> { - try { - String[] minMaxKeys = ParquetUtils.readMinMaxRecordKeys(ft._2().getFileStatus().getPath()); - return new Tuple2<>(ft._1(), new BloomIndexFileInfo(ft._2().getFileName(), minMaxKeys[0], minMaxKeys[1])); - } catch (MetadataNotFoundException me) { - logger.warn("Unable to find 
range metadata in file :" + ft._2()); - return new Tuple2<>(ft._1(), new BloomIndexFileInfo(ft._2().getFileName())); - } - }).collect(); - } else { - return dataFilesList.stream() - .map(ft -> new Tuple2<>(ft._1(), new BloomIndexFileInfo(ft._2().getFileName()))) + /** + * Load all involved files as pair RDD. + */ + @VisibleForTesting + List> loadInvolvedFiles(List partitions, + final HoodieTable hoodieTable) { + // Obtain the latest data files from all the partitions. + List> dataFilesList = jsc + .parallelize(partitions, Math.max(partitions.size(), 1)) + .flatMapToPair(partitionPath -> { + java.util.Optional latestCommitTime = + hoodieTable.getCommitTimeline().filterCompletedInstants().lastInstant(); + List> filteredFiles = new ArrayList<>(); + if (latestCommitTime.isPresent()) { + filteredFiles = + hoodieTable.getROFileSystemView().getLatestDataFilesBeforeOrOn(partitionPath, + latestCommitTime.get().getTimestamp()) + .map(f -> new Tuple2<>(partitionPath, f)) .collect(toList()); + } + return filteredFiles.iterator(); + }).collect(); + + if (config.getBloomIndexPruneByRanges()) { + // also obtain file ranges, if range pruning is enabled + return jsc.parallelize(dataFilesList, Math.max(dataFilesList.size(), 1)) + .mapToPair(ft -> { + try { + String[] minMaxKeys = ParquetUtils + .readMinMaxRecordKeys(ft._2().getFileStatus().getPath()); + return new Tuple2<>(ft._1(), + new BloomIndexFileInfo(ft._2().getFileName(), minMaxKeys[0], minMaxKeys[1])); + } catch (MetadataNotFoundException me) { + logger.warn("Unable to find range metadata in file :" + ft._2()); + return new Tuple2<>(ft._1(), new BloomIndexFileInfo(ft._2().getFileName())); + } + }).collect(); + } else { + return dataFilesList.stream() + .map(ft -> new Tuple2<>(ft._1(), new BloomIndexFileInfo(ft._2().getFileName()))) + .collect(toList()); + } + } + + + @Override + public boolean rollbackCommit(String commitTime) { + // Nope, don't need to do anything. 
+ return true; + } + + /** + * This is not global, since we depend on the partitionPath to do the lookup + */ + @Override + public boolean isGlobal() { + return false; + } + + /** + * No indexes into log files yet. + */ + @Override + public boolean canIndexLogFiles() { + return false; + } + + /** + * Bloom filters are stored, into the same data files. + */ + @Override + public boolean isImplicitWithStorage() { + return true; + } + + /** + * if we dont have key ranges, then also we need to compare against the file. no other choice if + * we do, then only compare the file if the record key falls in range. + */ + private boolean shouldCompareWithFile(BloomIndexFileInfo indexInfo, String recordKey) { + return !indexInfo.hasKeyRanges() || indexInfo.isKeyInRange(recordKey); + } + + + /** + * For each incoming record, produce N output records, 1 each for each file against which the + * record's key needs to be checked. For datasets, where the keys have a definite insert order + * (e.g: timestamp as prefix), the number of files to be compared gets cut down a lot from range + * pruning. + */ + // sub-partition to ensure the records can be looked up against files & also prune file<=>record comparisons based on recordKey + // ranges in the index info. + @VisibleForTesting + JavaPairRDD> explodeRecordRDDWithFileComparisons( + final Map> partitionToFileIndexInfo, + JavaPairRDD partitionRecordKeyPairRDD) { + return partitionRecordKeyPairRDD + .map(partitionRecordKeyPair -> { + String recordKey = partitionRecordKeyPair._2(); + String partitionPath = partitionRecordKeyPair._1(); + + List indexInfos = partitionToFileIndexInfo.get(partitionPath); + List>> recordComparisons = new ArrayList<>(); + if (indexInfos + != null) { // could be null, if there are no files in a given partition yet. + // for each candidate file in partition, that needs to be compared. 
+ for (BloomIndexFileInfo indexInfo : indexInfos) { + if (shouldCompareWithFile(indexInfo, recordKey)) { + recordComparisons.add( + new Tuple2<>(String.format("%s#%s", indexInfo.getFileName(), recordKey), + new Tuple2<>(indexInfo.getFileName(), + new HoodieKey(recordKey, partitionPath)))); + } + } + } + return recordComparisons; + }) + .flatMapToPair(t -> t.iterator()); + } + + /** + * Find out pair. All workload grouped by file-level. + * + * Join PairRDD(PartitionPath, RecordKey) and PairRDD(PartitionPath, File) & then repartition such + * that each RDD partition is a file, then for each file, we do (1) load bloom filter, (2) load + * rowKeys, (3) Tag rowKey + * + * Make sure the parallelism is atleast the groupby parallelism for tagging location + */ + @VisibleForTesting + JavaPairRDD findMatchingFilesForRecordKeys( + final Map> partitionToFileIndexInfo, + JavaPairRDD partitionRecordKeyPairRDD, + int totalSubpartitions) { + + int joinParallelism = determineParallelism(partitionRecordKeyPairRDD.partitions().size(), + totalSubpartitions); + + JavaPairRDD> fileSortedTripletRDD = explodeRecordRDDWithFileComparisons( + partitionToFileIndexInfo, partitionRecordKeyPairRDD) + // sort further based on filename, such that all checking for the file can happen within a single partition, on-the-fly + .sortByKey(true, joinParallelism); + + return fileSortedTripletRDD + .mapPartitionsWithIndex(new HoodieBloomIndexCheckFunction(config.getBasePath()), true) + .flatMap(indexLookupResults -> indexLookupResults.iterator()) + .filter(lookupResult -> lookupResult.getMatchingRecordKeys().size() > 0) + .flatMapToPair(lookupResult -> { + List> vals = new ArrayList<>(); + for (String recordKey : lookupResult.getMatchingRecordKeys()) { + vals.add(new Tuple2<>(recordKey, lookupResult.getFileName())); + } + return vals.iterator(); + }); + } + + /** + * Tag the back to the original HoodieRecord RDD. 
+ */ + private JavaRDD> tagLocationBacktoRecords( + JavaPairRDD rowKeyFilenamePairRDD, + JavaRDD> recordRDD) { + JavaPairRDD> rowKeyRecordPairRDD = recordRDD + .mapToPair(record -> new Tuple2<>(record.getRecordKey(), record)); + + // Here as the recordRDD might have more data than rowKeyRDD (some rowKeys' fileId is null), so we do left outer join. + return rowKeyRecordPairRDD.leftOuterJoin(rowKeyFilenamePairRDD).values().map( + v1 -> { + HoodieRecord record = v1._1(); + if (v1._2().isPresent()) { + String filename = v1._2().get(); + if (filename != null && !filename.isEmpty()) { + record.setCurrentLocation(new HoodieRecordLocation(FSUtils.getCommitTime(filename), + FSUtils.getFileId(filename))); + } + } + return record; } - } + ); + } - - @Override - public boolean rollbackCommit(String commitTime) { - // Nope, don't need to do anything. - return true; - } - - /** - * This is not global, since we depend on the partitionPath to do the lookup - * - * @return - */ - @Override - public boolean isGlobal() { - return false; - } - - /** - * No indexes into log files yet. - * - * @return - */ - @Override - public boolean canIndexLogFiles() { - return false; - } - - /** - * Bloom filters are stored, into the same data files. - * - * @return - */ - @Override - public boolean isImplicitWithStorage() { - return true; - } - - /** - * if we dont have key ranges, then also we need to compare against the file. no other choice - * if we do, then only compare the file if the record key falls in range. - - * @param indexInfo - * @param recordKey - * @return - */ - private boolean shouldCompareWithFile(BloomIndexFileInfo indexInfo, String recordKey) { - return !indexInfo.hasKeyRanges() || indexInfo.isKeyInRange(recordKey); - } - - - /** - * For each incoming record, produce N output records, 1 each for each file against which the record's key - * needs to be checked. 
For datasets, where the keys have a definite insert order (e.g: timestamp as prefix), - * the number of files to be compared gets cut down a lot from range pruning. - * - * - * @param partitionToFileIndexInfo - * @param partitionRecordKeyPairRDD - * @return - */ - // sub-partition to ensure the records can be looked up against files & also prune file<=>record comparisons based on recordKey - // ranges in the index info. - @VisibleForTesting - JavaPairRDD> explodeRecordRDDWithFileComparisons(final Map> partitionToFileIndexInfo, - JavaPairRDD partitionRecordKeyPairRDD) { - return partitionRecordKeyPairRDD - .map(partitionRecordKeyPair -> { - String recordKey = partitionRecordKeyPair._2(); - String partitionPath = partitionRecordKeyPair._1(); - - List indexInfos = partitionToFileIndexInfo.get(partitionPath); - List>> recordComparisons = new ArrayList<>(); - if (indexInfos != null) { // could be null, if there are no files in a given partition yet. - // for each candidate file in partition, that needs to be compared. - for (BloomIndexFileInfo indexInfo : indexInfos) { - if (shouldCompareWithFile(indexInfo, recordKey)) { - recordComparisons.add( - new Tuple2<>(String.format("%s#%s", indexInfo.getFileName(), recordKey), - new Tuple2<>(indexInfo.getFileName(), new HoodieKey(recordKey, partitionPath)))); - } - } - } - return recordComparisons; - }) - .flatMapToPair(t -> t.iterator()); - } - - /** - * Find out pair. All workload grouped by file-level. 
- * - * Join PairRDD(PartitionPath, RecordKey) and PairRDD(PartitionPath, File) & then repartition - * such that each RDD partition is a file, then for each file, we do - * (1) load bloom filter, - * (2) load rowKeys, - * (3) Tag rowKey - * - * Make sure the parallelism is atleast the groupby parallelism for tagging location - */ - @VisibleForTesting - JavaPairRDD findMatchingFilesForRecordKeys(final Map> partitionToFileIndexInfo, - JavaPairRDD partitionRecordKeyPairRDD, - int totalSubpartitions) { - - int joinParallelism = determineParallelism(partitionRecordKeyPairRDD.partitions().size(), totalSubpartitions); - - JavaPairRDD> fileSortedTripletRDD = explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD) - // sort further based on filename, such that all checking for the file can happen within a single partition, on-the-fly - .sortByKey(true, joinParallelism); - - return fileSortedTripletRDD - .mapPartitionsWithIndex(new HoodieBloomIndexCheckFunction(config.getBasePath()), true) - .flatMap(indexLookupResults -> indexLookupResults.iterator()) - .filter(lookupResult -> lookupResult.getMatchingRecordKeys().size() > 0) - .flatMapToPair(lookupResult -> { - List> vals = new ArrayList<>(); - for (String recordKey : lookupResult.getMatchingRecordKeys()) { - vals.add(new Tuple2<>(recordKey, lookupResult.getFileName())); - } - return vals.iterator(); - }); - } - - /** - * Tag the back to the original HoodieRecord RDD. - */ - private JavaRDD> tagLocationBacktoRecords(JavaPairRDD rowKeyFilenamePairRDD, - JavaRDD> recordRDD) { - JavaPairRDD> rowKeyRecordPairRDD = recordRDD - .mapToPair(record -> new Tuple2<>(record.getRecordKey(), record)); - - // Here as the recordRDD might have more data than rowKeyRDD (some rowKeys' fileId is null), so we do left outer join. 
- return rowKeyRecordPairRDD.leftOuterJoin(rowKeyFilenamePairRDD).values().map( - v1 -> { - HoodieRecord record = v1._1(); - if (v1._2().isPresent()) { - String filename = v1._2().get(); - if (filename != null && !filename.isEmpty()) { - record.setCurrentLocation(new HoodieRecordLocation(FSUtils.getCommitTime(filename), - FSUtils.getFileId(filename))); - } - } - return record; - } - ); - } - - @Override - public JavaRDD updateLocation(JavaRDD writeStatusRDD, HoodieTable hoodieTable) { - return writeStatusRDD; - } + @Override + public JavaRDD updateLocation(JavaRDD writeStatusRDD, + HoodieTable hoodieTable) { + return writeStatusRDD; + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/index/bloom/HoodieBloomIndexCheckFunction.java b/hoodie-client/src/main/java/com/uber/hoodie/index/bloom/HoodieBloomIndexCheckFunction.java index 9eb3c8996..0d562ae86 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/index/bloom/HoodieBloomIndexCheckFunction.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/index/bloom/HoodieBloomIndexCheckFunction.java @@ -24,172 +24,182 @@ import com.uber.hoodie.common.util.ParquetUtils; import com.uber.hoodie.exception.HoodieException; import com.uber.hoodie.exception.HoodieIndexException; import com.uber.hoodie.func.LazyIterableIterator; - -import org.apache.hadoop.fs.Path; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; -import org.apache.spark.api.java.function.Function2; - import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Set; - +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.api.java.function.Function2; import scala.Tuple2; /** * Function performing actual checking of RDD parition containing (fileId, hoodieKeys) against the * actual files */ -public class HoodieBloomIndexCheckFunction implements Function2>>, Iterator>> { +public class HoodieBloomIndexCheckFunction implements 
+ Function2>>, Iterator>> { - private static Logger logger = LogManager.getLogger(HoodieBloomIndexCheckFunction.class); + private static Logger logger = LogManager.getLogger(HoodieBloomIndexCheckFunction.class); - private final String basePath; + private final String basePath; - public HoodieBloomIndexCheckFunction(String basePath) { - this.basePath = basePath; + public HoodieBloomIndexCheckFunction(String basePath) { + this.basePath = basePath; + } + + /** + * Given a list of row keys and one file, return only row keys existing in that file. + */ + public static List checkCandidatesAgainstFile(List candidateRecordKeys, + Path filePath) throws HoodieIndexException { + List foundRecordKeys = new ArrayList<>(); + try { + // Load all rowKeys from the file, to double-confirm + if (!candidateRecordKeys.isEmpty()) { + Set fileRowKeys = ParquetUtils.readRowKeysFromParquet(filePath); + logger.info("Loading " + fileRowKeys.size() + " row keys from " + filePath); + if (logger.isDebugEnabled()) { + logger.debug("Keys from " + filePath + " => " + fileRowKeys); + } + for (String rowKey : candidateRecordKeys) { + if (fileRowKeys.contains(rowKey)) { + foundRecordKeys.add(rowKey); + } + } + logger.info("After checking with row keys, we have " + foundRecordKeys.size() + + " results, for file " + filePath + " => " + foundRecordKeys); + if (logger.isDebugEnabled()) { + logger.debug("Keys matching for file " + filePath + " => " + foundRecordKeys); + } + } + } catch (Exception e) { + throw new HoodieIndexException("Error checking candidate keys against file.", e); } + return foundRecordKeys; + } - /** - * Given a list of row keys and one file, return only row keys existing in that file. 
- */ - public static List checkCandidatesAgainstFile(List candidateRecordKeys, Path filePath) throws HoodieIndexException { - List foundRecordKeys = new ArrayList<>(); - try { - // Load all rowKeys from the file, to double-confirm - if (!candidateRecordKeys.isEmpty()) { - Set fileRowKeys = ParquetUtils.readRowKeysFromParquet(filePath); - logger.info("Loading " + fileRowKeys.size() + " row keys from " + filePath); - if (logger.isDebugEnabled()) { - logger.debug("Keys from " + filePath + " => " + fileRowKeys); - } - for (String rowKey : candidateRecordKeys) { - if (fileRowKeys.contains(rowKey)) { - foundRecordKeys.add(rowKey); - } - } - logger.info("After checking with row keys, we have " + foundRecordKeys.size() + " results, for file " + filePath + " => " + foundRecordKeys); - if (logger.isDebugEnabled()) { - logger.debug("Keys matching for file " + filePath + " => " + foundRecordKeys); - } - } - } catch (Exception e){ - throw new HoodieIndexException("Error checking candidate keys against file.", e); - } - return foundRecordKeys; + class LazyKeyCheckIterator extends + LazyIterableIterator>, List> { + + private List candidateRecordKeys; + + private BloomFilter bloomFilter; + + private String currentFile; + + private String currentParitionPath; + + LazyKeyCheckIterator( + Iterator>> fileParitionRecordKeyTripletItr) { + super(fileParitionRecordKeyTripletItr); + currentFile = null; + candidateRecordKeys = new ArrayList<>(); + bloomFilter = null; + currentParitionPath = null; } - class LazyKeyCheckIterator extends LazyIterableIterator>, List> { - - private List candidateRecordKeys; - - private BloomFilter bloomFilter; - - private String currentFile; - - private String currentParitionPath; - - LazyKeyCheckIterator(Iterator>> fileParitionRecordKeyTripletItr) { - super(fileParitionRecordKeyTripletItr); - currentFile = null; - candidateRecordKeys = new ArrayList<>(); - bloomFilter = null; - currentParitionPath = null; - } - - @Override - protected void start() { - } - - 
private void initState(String fileName, String partitionPath) throws HoodieIndexException { - try { - Path filePath = new Path(basePath + "/" + partitionPath + "/" + fileName); - bloomFilter = ParquetUtils.readBloomFilterFromParquetMetadata(filePath); - candidateRecordKeys = new ArrayList<>(); - currentFile = fileName; - currentParitionPath = partitionPath; - } catch (Exception e) { - throw new HoodieIndexException("Error checking candidate keys against file.", e); - } - } - - @Override - protected List computeNext() { - - List ret = new ArrayList<>(); - try { - // process one file in each go. - while (inputItr.hasNext()) { - - Tuple2> currentTuple = inputItr.next(); - String fileName = currentTuple._2._1; - String partitionPath = currentTuple._2._2.getPartitionPath(); - String recordKey = currentTuple._2._2.getRecordKey(); - - // lazily init state - if (currentFile == null) { - initState(fileName, partitionPath); - } - - // if continue on current file) - if (fileName.equals(currentFile)) { - // check record key against bloom filter of current file & add to possible keys if needed - if (bloomFilter.mightContain(recordKey)) { - if (logger.isDebugEnabled()) { - logger.debug("#1 Adding " + recordKey + " as candidate for file " + fileName); - } - candidateRecordKeys.add(recordKey); - } - } else { - // do the actual checking of file & break out - Path filePath = new Path(basePath + "/" + currentParitionPath + "/" + currentFile); - logger.info("#1 After bloom filter, the candidate row keys is reduced to " + candidateRecordKeys.size() + " for " + filePath); - if (logger.isDebugEnabled()) { - logger.debug("#The candidate row keys for " + filePath + " => " + candidateRecordKeys); - } - ret.add(new IndexLookupResult(currentFile, checkCandidatesAgainstFile(candidateRecordKeys, filePath))); - - initState(fileName, partitionPath); - if (bloomFilter.mightContain(recordKey)) { - if (logger.isDebugEnabled()) { - logger.debug("#2 Adding " + recordKey + " as candidate for file " + 
fileName); - } - candidateRecordKeys.add(recordKey); - } - break; - } - } - - // handle case, where we ran out of input, finish pending work, update return val - if (!inputItr.hasNext()) { - Path filePath = new Path(basePath + "/" + currentParitionPath + "/" + currentFile); - logger.info("#2 After bloom filter, the candidate row keys is reduced to " + candidateRecordKeys.size() + " for " + filePath); - if (logger.isDebugEnabled()) { - logger.debug("#The candidate row keys for " + filePath + " => " + candidateRecordKeys); - } - ret.add(new IndexLookupResult(currentFile, checkCandidatesAgainstFile(candidateRecordKeys, filePath))); - } - - } catch (Throwable e) { - if (e instanceof HoodieException) { - throw e; - } - throw new HoodieIndexException("Error checking bloom filter index. ", e); - } - - return ret; - } - - @Override - protected void end() { - } - } - - @Override - public Iterator> call(Integer partition, - Iterator>> fileParitionRecordKeyTripletItr) throws Exception { - return new LazyKeyCheckIterator(fileParitionRecordKeyTripletItr); + protected void start() { } + + private void initState(String fileName, String partitionPath) throws HoodieIndexException { + try { + Path filePath = new Path(basePath + "/" + partitionPath + "/" + fileName); + bloomFilter = ParquetUtils.readBloomFilterFromParquetMetadata(filePath); + candidateRecordKeys = new ArrayList<>(); + currentFile = fileName; + currentParitionPath = partitionPath; + } catch (Exception e) { + throw new HoodieIndexException("Error checking candidate keys against file.", e); + } + } + + @Override + protected List computeNext() { + + List ret = new ArrayList<>(); + try { + // process one file in each go. 
+ while (inputItr.hasNext()) { + + Tuple2> currentTuple = inputItr.next(); + String fileName = currentTuple._2._1; + String partitionPath = currentTuple._2._2.getPartitionPath(); + String recordKey = currentTuple._2._2.getRecordKey(); + + // lazily init state + if (currentFile == null) { + initState(fileName, partitionPath); + } + + // if continue on current file) + if (fileName.equals(currentFile)) { + // check record key against bloom filter of current file & add to possible keys if needed + if (bloomFilter.mightContain(recordKey)) { + if (logger.isDebugEnabled()) { + logger.debug("#1 Adding " + recordKey + " as candidate for file " + fileName); + } + candidateRecordKeys.add(recordKey); + } + } else { + // do the actual checking of file & break out + Path filePath = new Path(basePath + "/" + currentParitionPath + "/" + currentFile); + logger.info( + "#1 After bloom filter, the candidate row keys is reduced to " + candidateRecordKeys + .size() + " for " + filePath); + if (logger.isDebugEnabled()) { + logger + .debug("#The candidate row keys for " + filePath + " => " + candidateRecordKeys); + } + ret.add(new IndexLookupResult(currentFile, + checkCandidatesAgainstFile(candidateRecordKeys, filePath))); + + initState(fileName, partitionPath); + if (bloomFilter.mightContain(recordKey)) { + if (logger.isDebugEnabled()) { + logger.debug("#2 Adding " + recordKey + " as candidate for file " + fileName); + } + candidateRecordKeys.add(recordKey); + } + break; + } + } + + // handle case, where we ran out of input, finish pending work, update return val + if (!inputItr.hasNext()) { + Path filePath = new Path(basePath + "/" + currentParitionPath + "/" + currentFile); + logger.info( + "#2 After bloom filter, the candidate row keys is reduced to " + candidateRecordKeys + .size() + " for " + filePath); + if (logger.isDebugEnabled()) { + logger.debug("#The candidate row keys for " + filePath + " => " + candidateRecordKeys); + } + ret.add(new IndexLookupResult(currentFile, + 
checkCandidatesAgainstFile(candidateRecordKeys, filePath))); + } + + } catch (Throwable e) { + if (e instanceof HoodieException) { + throw e; + } + throw new HoodieIndexException("Error checking bloom filter index. ", e); + } + + return ret; + } + + @Override + protected void end() { + } + } + + + @Override + public Iterator> call(Integer partition, + Iterator>> fileParitionRecordKeyTripletItr) + throws Exception { + return new LazyKeyCheckIterator(fileParitionRecordKeyTripletItr); + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/index/bloom/IndexLookupResult.java b/hoodie-client/src/main/java/com/uber/hoodie/index/bloom/IndexLookupResult.java index 23a89b945..37760646e 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/index/bloom/IndexLookupResult.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/index/bloom/IndexLookupResult.java @@ -25,21 +25,21 @@ import java.util.List; */ public class IndexLookupResult { - private String fileName; + private String fileName; - private List matchingRecordKeys; + private List matchingRecordKeys; - public IndexLookupResult(String fileName, List matchingRecordKeys) { - this.fileName = fileName; - this.matchingRecordKeys = matchingRecordKeys; - } + public IndexLookupResult(String fileName, List matchingRecordKeys) { + this.fileName = fileName; + this.matchingRecordKeys = matchingRecordKeys; + } - public String getFileName() { - return fileName; - } + public String getFileName() { + return fileName; + } - public List getMatchingRecordKeys() { - return matchingRecordKeys; - } + public List getMatchingRecordKeys() { + return matchingRecordKeys; + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/index/bucketed/BucketedIndex.java b/hoodie-client/src/main/java/com/uber/hoodie/index/bucketed/BucketedIndex.java index ba45bc666..a361a7b06 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/index/bucketed/BucketedIndex.java +++ 
b/hoodie-client/src/main/java/com/uber/hoodie/index/bucketed/BucketedIndex.java @@ -19,7 +19,6 @@ package com.uber.hoodie.index.bucketed; import com.google.common.base.Optional; - import com.uber.hoodie.WriteStatus; import com.uber.hoodie.common.model.HoodieKey; import com.uber.hoodie.common.model.HoodieRecord; @@ -29,96 +28,86 @@ import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.exception.HoodieIndexException; import com.uber.hoodie.index.HoodieIndex; import com.uber.hoodie.table.HoodieTable; - import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; - import scala.Tuple2; /** - * An `stateless` index implementation that will using a deterministic mapping function to - * determine the fileID for a given record. - * - * Pros: - * - Fast - * - * Cons : - * - Need to tune the number of buckets per partition path manually (FIXME: Need to autotune this) - * - Could increase write amplification on copy-on-write storage since inserts always rewrite files - * - Not global. - * + * An `stateless` index implementation that will using a deterministic mapping function to determine + * the fileID for a given record. * + * Pros: - Fast * + * Cons : - Need to tune the number of buckets per partition path manually (FIXME: Need to autotune + * this) - Could increase write amplification on copy-on-write storage since inserts always rewrite + * files - Not global. 
*/ public class BucketedIndex extends HoodieIndex { - private static Logger logger = LogManager.getLogger(BucketedIndex.class); + private static Logger logger = LogManager.getLogger(BucketedIndex.class); - public BucketedIndex(HoodieWriteConfig config, JavaSparkContext jsc) { - super(config, jsc); - } + public BucketedIndex(HoodieWriteConfig config, JavaSparkContext jsc) { + super(config, jsc); + } - private String getBucket(String recordKey) { - return String.valueOf(recordKey.hashCode() % config.getNumBucketsPerPartition()); - } + private String getBucket(String recordKey) { + return String.valueOf(recordKey.hashCode() % config.getNumBucketsPerPartition()); + } - @Override - public JavaPairRDD> fetchRecordLocation(JavaRDD hoodieKeys, HoodieTable table) { - return hoodieKeys.mapToPair(hk -> new Tuple2<>(hk, Optional.of(getBucket(hk.getRecordKey())))); - } + @Override + public JavaPairRDD> fetchRecordLocation(JavaRDD hoodieKeys, + HoodieTable table) { + return hoodieKeys.mapToPair(hk -> new Tuple2<>(hk, Optional.of(getBucket(hk.getRecordKey())))); + } - @Override - public JavaRDD> tagLocation(JavaRDD> recordRDD, HoodieTable hoodieTable) throws HoodieIndexException { - return recordRDD.map(record -> { - String bucket = getBucket(record.getRecordKey()); - //HACK(vc) a non-existent commit is provided here. - record.setCurrentLocation(new HoodieRecordLocation("000", bucket)); - return record; - }); - } + @Override + public JavaRDD> tagLocation(JavaRDD> recordRDD, + HoodieTable hoodieTable) throws HoodieIndexException { + return recordRDD.map(record -> { + String bucket = getBucket(record.getRecordKey()); + //HACK(vc) a non-existent commit is provided here. 
+ record.setCurrentLocation(new HoodieRecordLocation("000", bucket)); + return record; + }); + } - @Override - public JavaRDD updateLocation(JavaRDD writeStatusRDD, HoodieTable hoodieTable) throws HoodieIndexException { - return writeStatusRDD; - } + @Override + public JavaRDD updateLocation(JavaRDD writeStatusRDD, + HoodieTable hoodieTable) throws HoodieIndexException { + return writeStatusRDD; + } - @Override - public boolean rollbackCommit(String commitTime) { - // nothing to rollback in the index. - return true; - } + @Override + public boolean rollbackCommit(String commitTime) { + // nothing to rollback in the index. + return true; + } - /** - * Bucketing is still done within each partition. - * - * @return - */ - @Override - public boolean isGlobal() { - return false; - } + /** + * Bucketing is still done within each partition. + */ + @Override + public boolean isGlobal() { + return false; + } - /** - * Since indexing is just a deterministic hash, we can identify file group correctly even without an index - * on the actual log file. - * - * @return - */ - @Override - public boolean canIndexLogFiles() { - return true; - } + /** + * Since indexing is just a deterministic hash, we can identify file group correctly even without + * an index on the actual log file. + */ + @Override + public boolean canIndexLogFiles() { + return true; + } - /** - * Indexing is just a hash function. - * - * @return - */ - @Override - public boolean isImplicitWithStorage() { - return true; - } + /** + * Indexing is just a hash function. 
+ */ + @Override + public boolean isImplicitWithStorage() { + return true; + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/index/hbase/HBaseIndex.java b/hoodie-client/src/main/java/com/uber/hoodie/index/hbase/HBaseIndex.java index 39929876f..5d50ff646 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/index/hbase/HBaseIndex.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/index/hbase/HBaseIndex.java @@ -19,24 +19,33 @@ package com.uber.hoodie.index.hbase; import com.google.common.base.Optional; -import com.uber.hoodie.common.table.HoodieTimeline; -import com.uber.hoodie.common.table.timeline.HoodieInstant; -import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.WriteStatus; import com.uber.hoodie.common.model.HoodieKey; +import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecordLocation; import com.uber.hoodie.common.model.HoodieRecordPayload; -import com.uber.hoodie.common.model.HoodieRecord; - +import com.uber.hoodie.common.table.HoodieTimeline; +import com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.config.HoodieIndexConfig; +import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.exception.HoodieDependentSystemUnavailableException; import com.uber.hoodie.exception.HoodieIndexException; import com.uber.hoodie.index.HoodieIndex; import com.uber.hoodie.table.HoodieTable; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.TableName; -import org.apache.hadoop.hbase.client.*; +import org.apache.hadoop.hbase.client.Connection; +import org.apache.hadoop.hbase.client.ConnectionFactory; +import org.apache.hadoop.hbase.client.Delete; +import org.apache.hadoop.hbase.client.Get; +import org.apache.hadoop.hbase.client.HTable; +import 
org.apache.hadoop.hbase.client.Put; +import org.apache.hadoop.hbase.client.Result; import org.apache.hadoop.hbase.util.Bytes; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -45,230 +54,221 @@ import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.Function2; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; - /** * Hoodie Index implementation backed by HBase */ public class HBaseIndex extends HoodieIndex { - private final static byte[] SYSTEM_COLUMN_FAMILY = Bytes.toBytes("_s"); - private final static byte[] COMMIT_TS_COLUMN = Bytes.toBytes("commit_ts"); - private final static byte[] FILE_NAME_COLUMN = Bytes.toBytes("file_name"); - private final static byte[] PARTITION_PATH_COLUMN = Bytes.toBytes("partition_path"); - private static Logger logger = LogManager.getLogger(HBaseIndex.class); + private final static byte[] SYSTEM_COLUMN_FAMILY = Bytes.toBytes("_s"); + private final static byte[] COMMIT_TS_COLUMN = Bytes.toBytes("commit_ts"); + private final static byte[] FILE_NAME_COLUMN = Bytes.toBytes("file_name"); + private final static byte[] PARTITION_PATH_COLUMN = Bytes.toBytes("partition_path"); - private final String tableName; + private static Logger logger = LogManager.getLogger(HBaseIndex.class); - public HBaseIndex(HoodieWriteConfig config, JavaSparkContext jsc) { - super(config, jsc); - this.tableName = config.getProps().getProperty(HoodieIndexConfig.HBASE_TABLENAME_PROP); + private final String tableName; + + public HBaseIndex(HoodieWriteConfig config, JavaSparkContext jsc) { + super(config, jsc); + this.tableName = config.getProps().getProperty(HoodieIndexConfig.HBASE_TABLENAME_PROP); + } + + @Override + public JavaPairRDD> fetchRecordLocation( + JavaRDD hoodieKeys, HoodieTable table) { + throw new UnsupportedOperationException("HBase index does not implement check exist yet"); + } + + private 
static Connection hbaseConnection = null; + + private Connection getHBaseConnection() { + Configuration hbaseConfig = HBaseConfiguration.create(); + String quorum = config.getProps().getProperty(HoodieIndexConfig.HBASE_ZKQUORUM_PROP); + hbaseConfig.set("hbase.zookeeper.quorum", quorum); + String port = config.getProps().getProperty(HoodieIndexConfig.HBASE_ZKPORT_PROP); + hbaseConfig.set("hbase.zookeeper.property.clientPort", port); + try { + return ConnectionFactory.createConnection(hbaseConfig); + } catch (IOException e) { + throw new HoodieDependentSystemUnavailableException( + HoodieDependentSystemUnavailableException.HBASE, quorum + ":" + port); + } + } + + /** + * Function that tags each HoodieRecord with an existing location, if known. + */ + class LocationTagFunction + implements Function2>, Iterator>> { + + private final HoodieTable hoodieTable; + + LocationTagFunction(HoodieTable hoodieTable) { + this.hoodieTable = hoodieTable; } @Override - public JavaPairRDD> fetchRecordLocation( - JavaRDD hoodieKeys, HoodieTable table) { - throw new UnsupportedOperationException("HBase index does not implement check exist yet"); - } - - private static Connection hbaseConnection = null; - - private Connection getHBaseConnection() { - Configuration hbaseConfig = HBaseConfiguration.create(); - String quorum = config.getProps().getProperty(HoodieIndexConfig.HBASE_ZKQUORUM_PROP); - hbaseConfig.set("hbase.zookeeper.quorum", quorum); - String port = config.getProps().getProperty(HoodieIndexConfig.HBASE_ZKPORT_PROP); - hbaseConfig.set("hbase.zookeeper.property.clientPort", port); - try { - return ConnectionFactory.createConnection(hbaseConfig); - } catch (IOException e) { - throw new HoodieDependentSystemUnavailableException( - HoodieDependentSystemUnavailableException.HBASE, quorum + ":" + port); + public Iterator> call(Integer partitionNum, + Iterator> hoodieRecordIterator) { + // Grab the global HBase connection + synchronized (HBaseIndex.class) { + if (hbaseConnection == 
null) { + hbaseConnection = getHBaseConnection(); } - } + } + List> taggedRecords = new ArrayList<>(); + HTable hTable = null; + try { + hTable = (HTable) hbaseConnection.getTable(TableName.valueOf(tableName)); + // Do the tagging. + while (hoodieRecordIterator.hasNext()) { + HoodieRecord rec = hoodieRecordIterator.next(); + // TODO(vc): This may need to be a multi get. + Result result = hTable.get( + new Get(Bytes.toBytes(rec.getRecordKey())).setMaxVersions(1) + .addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN) + .addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN) + .addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN)); - /** - * Function that tags each HoodieRecord with an existing location, if known. - */ - class LocationTagFunction - implements Function2>, Iterator>> { + // first, attempt to grab location from HBase + if (result.getRow() != null) { + String commitTs = + Bytes.toString(result.getValue(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN)); + String fileId = + Bytes.toString(result.getValue(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN)); - private final HoodieTable hoodieTable; - - LocationTagFunction(HoodieTable hoodieTable) { - this.hoodieTable = hoodieTable; + HoodieTimeline commitTimeline = hoodieTable.getCompletedCommitTimeline(); + // if the last commit ts for this row is less than the system commit ts + if (!commitTimeline.empty() && commitTimeline.containsInstant( + new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTs))) { + rec.setCurrentLocation(new HoodieRecordLocation(commitTs, fileId)); + } + } + taggedRecords.add(rec); + } + } catch (IOException e) { + throw new HoodieIndexException( + "Failed to Tag indexed locations because of exception with HBase Client", e); + } finally { + if (hTable != null) { + try { + hTable.close(); + } catch (IOException e) { + // Ignore + } } - @Override - public Iterator> call(Integer partitionNum, - Iterator> hoodieRecordIterator) { - // Grab the global HBase connection - synchronized (HBaseIndex.class) { - 
if (hbaseConnection == null) { - hbaseConnection = getHBaseConnection(); - } - } - List> taggedRecords = new ArrayList<>(); - HTable hTable = null; - try { - hTable = (HTable) hbaseConnection.getTable(TableName.valueOf(tableName)); - // Do the tagging. - while (hoodieRecordIterator.hasNext()) { - HoodieRecord rec = hoodieRecordIterator.next(); - // TODO(vc): This may need to be a multi get. - Result result = hTable.get( - new Get(Bytes.toBytes(rec.getRecordKey())).setMaxVersions(1) - .addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN) - .addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN) - .addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN)); + } + return taggedRecords.iterator(); + } + } - // first, attempt to grab location from HBase - if (result.getRow() != null) { - String commitTs = - Bytes.toString(result.getValue(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN)); - String fileId = - Bytes.toString(result.getValue(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN)); + @Override + public JavaRDD> tagLocation(JavaRDD> recordRDD, + HoodieTable hoodieTable) { + return recordRDD.mapPartitionsWithIndex(this.new LocationTagFunction(hoodieTable), true); + } - HoodieTimeline commitTimeline = hoodieTable.getCompletedCommitTimeline(); - // if the last commit ts for this row is less than the system commit ts - if (!commitTimeline.empty() && commitTimeline.containsInstant( - new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTs))) { - rec.setCurrentLocation(new HoodieRecordLocation(commitTs, fileId)); - } - } - taggedRecords.add(rec); - } - } catch (IOException e) { - throw new HoodieIndexException( - "Failed to Tag indexed locations because of exception with HBase Client", e); - } + class UpdateLocationTask implements + Function2, Iterator> { - finally { - if (hTable != null) { - try { - hTable.close(); - } catch (IOException e) { - // Ignore - } - } + @Override + public Iterator call(Integer partition, Iterator statusIterator) { - } - return taggedRecords.iterator(); + List 
writeStatusList = new ArrayList<>(); + // Grab the global HBase connection + synchronized (HBaseIndex.class) { + if (hbaseConnection == null) { + hbaseConnection = getHBaseConnection(); } - } - - @Override - public JavaRDD> tagLocation(JavaRDD> recordRDD, HoodieTable hoodieTable) { - return recordRDD.mapPartitionsWithIndex(this.new LocationTagFunction(hoodieTable), true); - } - - class UpdateLocationTask implements Function2, Iterator> { - @Override - public Iterator call(Integer partition, Iterator statusIterator) { - - List writeStatusList = new ArrayList<>(); - // Grab the global HBase connection - synchronized (HBaseIndex.class) { - if (hbaseConnection == null) { - hbaseConnection = getHBaseConnection(); + } + HTable hTable = null; + try { + hTable = (HTable) hbaseConnection.getTable(TableName.valueOf(tableName)); + while (statusIterator.hasNext()) { + WriteStatus writeStatus = statusIterator.next(); + List puts = new ArrayList<>(); + List deletes = new ArrayList<>(); + try { + for (HoodieRecord rec : writeStatus.getWrittenRecords()) { + if (!writeStatus.isErrored(rec.getKey())) { + java.util.Optional loc = rec.getNewLocation(); + if (loc.isPresent()) { + Put put = new Put(Bytes.toBytes(rec.getRecordKey())); + put.addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN, + Bytes.toBytes(loc.get().getCommitTime())); + put.addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN, + Bytes.toBytes(loc.get().getFileId())); + put.addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN, + Bytes.toBytes(rec.getPartitionPath())); + puts.add(put); + } else { + //Delete existing index for a deleted record + Delete delete = new Delete(Bytes.toBytes(rec.getRecordKey())); + deletes.add(delete); } + } } - HTable hTable = null; - try { - hTable = (HTable) hbaseConnection.getTable(TableName.valueOf(tableName)); - while (statusIterator.hasNext()) { - WriteStatus writeStatus = statusIterator.next(); - List puts = new ArrayList<>(); - List deletes = new ArrayList<>(); - try { - for (HoodieRecord 
rec : writeStatus.getWrittenRecords()) { - if (!writeStatus.isErrored(rec.getKey())) { - java.util.Optional loc = rec.getNewLocation(); - if(loc.isPresent()) { - Put put = new Put(Bytes.toBytes(rec.getRecordKey())); - put.addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN, - Bytes.toBytes(loc.get().getCommitTime())); - put.addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN, - Bytes.toBytes(loc.get().getFileId())); - put.addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN, - Bytes.toBytes(rec.getPartitionPath())); - puts.add(put); - } else { - //Delete existing index for a deleted record - Delete delete = new Delete(Bytes.toBytes(rec.getRecordKey())); - deletes.add(delete); - } - } - } - hTable.put(puts); - hTable.delete(deletes); - hTable.flushCommits(); - } catch (Exception e) { - Exception we = new Exception("Error updating index for " + writeStatus, e); - logger.error(we); - writeStatus.setGlobalError(we); - } - writeStatusList.add(writeStatus); - } - } catch (IOException e) { - throw new HoodieIndexException( - "Failed to Update Index locations because of exception with HBase Client", e); - } finally { - if (hTable != null) { - try { - hTable.close(); - } catch (IOException e) { - // Ignore - } - } - } - return writeStatusList.iterator(); + hTable.put(puts); + hTable.delete(deletes); + hTable.flushCommits(); + } catch (Exception e) { + Exception we = new Exception("Error updating index for " + writeStatus, e); + logger.error(we); + writeStatus.setGlobalError(we); + } + writeStatusList.add(writeStatus); } + } catch (IOException e) { + throw new HoodieIndexException( + "Failed to Update Index locations because of exception with HBase Client", e); + } finally { + if (hTable != null) { + try { + hTable.close(); + } catch (IOException e) { + // Ignore + } + } + } + return writeStatusList.iterator(); } + } - @Override - public JavaRDD updateLocation(JavaRDD writeStatusRDD, - HoodieTable hoodieTable) { - return writeStatusRDD.mapPartitionsWithIndex(new 
UpdateLocationTask(), true); - } + @Override + public JavaRDD updateLocation(JavaRDD writeStatusRDD, + HoodieTable hoodieTable) { + return writeStatusRDD.mapPartitionsWithIndex(new UpdateLocationTask(), true); + } - @Override - public boolean rollbackCommit(String commitTime) { - // Can't really rollback here. HBase only can let you go from recordKey to fileID, - // not the other way around - return true; - } + @Override + public boolean rollbackCommit(String commitTime) { + // Can't really rollback here. HBase only can let you go from recordKey to fileID, + // not the other way around + return true; + } - /** - * Only looks up by recordKey - * - * @return - */ - @Override - public boolean isGlobal() { - return true; - } + /** + * Only looks up by recordKey + */ + @Override + public boolean isGlobal() { + return true; + } - /** - * Mapping is available in HBase already. - * - * @return - */ - @Override - public boolean canIndexLogFiles() { - return true; - } + /** + * Mapping is available in HBase already. + */ + @Override + public boolean canIndexLogFiles() { + return true; + } - /** - * Index needs to be explicitly updated after storage write. - * - * @return - */ - @Override - public boolean isImplicitWithStorage() { - return false; - } + /** + * Index needs to be explicitly updated after storage write. 
+ */ + @Override + public boolean isImplicitWithStorage() { + return false; + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieAppendHandle.java b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieAppendHandle.java index 7e4d106ec..683c6a75e 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieAppendHandle.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieAppendHandle.java @@ -36,13 +36,6 @@ import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.exception.HoodieAppendException; import com.uber.hoodie.exception.HoodieUpsertException; import com.uber.hoodie.table.HoodieTable; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.generic.IndexedRecord; -import org.apache.hadoop.fs.Path; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; -import org.apache.spark.TaskContext; - import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; @@ -50,155 +43,161 @@ import java.util.List; import java.util.Map; import java.util.Optional; import java.util.concurrent.atomic.AtomicLong; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.TaskContext; /** * IO Operation to append data onto an existing file. 
- * - * @param */ public class HoodieAppendHandle extends HoodieIOHandle { - private static Logger logger = LogManager.getLogger(HoodieMergeHandle.class); - private static AtomicLong recordIndex = new AtomicLong(1); - private final WriteStatus writeStatus; - private final String fileId; - private String partitionPath; - private List> records; - private long recordsWritten = 0; - private long recordsDeleted = 0; - private HoodieLogFile currentLogFile; - private Writer writer; + private static Logger logger = LogManager.getLogger(HoodieMergeHandle.class); + private static AtomicLong recordIndex = new AtomicLong(1); - public HoodieAppendHandle(HoodieWriteConfig config, - String commitTime, - HoodieTable hoodieTable, - String fileId, - Iterator> recordItr) { - super(config, commitTime, hoodieTable); - WriteStatus writeStatus = ReflectionUtils.loadClass(config.getWriteStatusClassName()); - writeStatus.setStat(new HoodieDeltaWriteStat()); - this.writeStatus = writeStatus; - this.fileId = fileId; - init(recordItr); - } + private final WriteStatus writeStatus; + private final String fileId; + private String partitionPath; + private List> records; + private long recordsWritten = 0; + private long recordsDeleted = 0; + private HoodieLogFile currentLogFile; + private Writer writer; - private void init(Iterator> recordItr) { - List> records = Lists.newArrayList(); - recordItr.forEachRemaining(record -> { - records.add(record); - // extract some information from the first record - if (partitionPath == null) { - partitionPath = record.getPartitionPath(); - // HACK(vc) This also assumes a base file. It will break, if appending without one. 
- String latestValidFilePath = - fileSystemView.getLatestDataFiles(record.getPartitionPath()) - .filter(dataFile -> dataFile.getFileId().equals(fileId)) - .findFirst().get().getFileName(); - String baseCommitTime = FSUtils.getCommitTime(latestValidFilePath); - writeStatus.getStat().setPrevCommit(baseCommitTime); - writeStatus.setFileId(fileId); - writeStatus.setPartitionPath(record.getPartitionPath()); - writeStatus.getStat().setFileId(fileId); + public HoodieAppendHandle(HoodieWriteConfig config, + String commitTime, + HoodieTable hoodieTable, + String fileId, + Iterator> recordItr) { + super(config, commitTime, hoodieTable); + WriteStatus writeStatus = ReflectionUtils.loadClass(config.getWriteStatusClassName()); + writeStatus.setStat(new HoodieDeltaWriteStat()); + this.writeStatus = writeStatus; + this.fileId = fileId; + init(recordItr); + } - try { - this.writer = HoodieLogFormat.newWriterBuilder() - .onParentPath(new Path(hoodieTable.getMetaClient().getBasePath(), partitionPath)) - .withFileId(fileId).overBaseCommit(baseCommitTime) - .withFs(fs).withFileExtension(HoodieLogFile.DELTA_EXTENSION).build(); - this.currentLogFile = writer.getLogFile(); - ((HoodieDeltaWriteStat) writeStatus.getStat()) - .setLogVersion(currentLogFile.getLogVersion()); - ((HoodieDeltaWriteStat) writeStatus.getStat()) - .setLogOffset(writer.getCurrentSize()); - } catch (Exception e) { - logger.error("Error in update task at commit " + commitTime, e); - writeStatus.setGlobalError(e); - throw new HoodieUpsertException( - "Failed to initialize HoodieUpdateHandle for FileId: " + fileId - + " on commit " + commitTime + " on HDFS path " + hoodieTable - .getMetaClient().getBasePath() + partitionPath, e); - } - Path path = new Path(record.getPartitionPath(), - FSUtils.makeDataFileName(commitTime, TaskContext.getPartitionId(), fileId)); - writeStatus.getStat().setPath(path.toString()); - } - // update the new location of the record, so we know where to find it next - record.setNewLocation(new 
HoodieRecordLocation(commitTime, fileId)); - }); - this.records = records; - } + private void init(Iterator> recordItr) { + List> records = Lists.newArrayList(); + recordItr.forEachRemaining(record -> { + records.add(record); + // extract some information from the first record + if (partitionPath == null) { + partitionPath = record.getPartitionPath(); + // HACK(vc) This also assumes a base file. It will break, if appending without one. + String latestValidFilePath = + fileSystemView.getLatestDataFiles(record.getPartitionPath()) + .filter(dataFile -> dataFile.getFileId().equals(fileId)) + .findFirst().get().getFileName(); + String baseCommitTime = FSUtils.getCommitTime(latestValidFilePath); + writeStatus.getStat().setPrevCommit(baseCommitTime); + writeStatus.setFileId(fileId); + writeStatus.setPartitionPath(record.getPartitionPath()); + writeStatus.getStat().setFileId(fileId); - private Optional getIndexedRecord(HoodieRecord hoodieRecord) { - Optional recordMetadata = hoodieRecord.getData().getMetadata(); try { - Optional avroRecord = hoodieRecord.getData().getInsertValue(schema); - - if(avroRecord.isPresent()) { - String seqId = HoodieRecord.generateSequenceId(commitTime, TaskContext.getPartitionId(), - recordIndex.getAndIncrement()); - HoodieAvroUtils - .addHoodieKeyToRecord((GenericRecord) avroRecord.get(), hoodieRecord.getRecordKey(), - hoodieRecord.getPartitionPath(), fileId); - HoodieAvroUtils - .addCommitMetadataToRecord((GenericRecord) avroRecord.get(), commitTime, seqId); - recordsWritten++; - } else { - recordsDeleted++; - } - - hoodieRecord.deflate(); - writeStatus.markSuccess(hoodieRecord, recordMetadata); - return avroRecord; + this.writer = HoodieLogFormat.newWriterBuilder() + .onParentPath(new Path(hoodieTable.getMetaClient().getBasePath(), partitionPath)) + .withFileId(fileId).overBaseCommit(baseCommitTime) + .withFs(fs).withFileExtension(HoodieLogFile.DELTA_EXTENSION).build(); + this.currentLogFile = writer.getLogFile(); + ((HoodieDeltaWriteStat) 
writeStatus.getStat()) + .setLogVersion(currentLogFile.getLogVersion()); + ((HoodieDeltaWriteStat) writeStatus.getStat()) + .setLogOffset(writer.getCurrentSize()); } catch (Exception e) { - logger.error("Error writing record " + hoodieRecord, e); - writeStatus.markFailure(hoodieRecord, e, recordMetadata); + logger.error("Error in update task at commit " + commitTime, e); + writeStatus.setGlobalError(e); + throw new HoodieUpsertException( + "Failed to initialize HoodieUpdateHandle for FileId: " + fileId + + " on commit " + commitTime + " on HDFS path " + hoodieTable + .getMetaClient().getBasePath() + partitionPath, e); } - return Optional.empty(); - } + Path path = new Path(record.getPartitionPath(), + FSUtils.makeDataFileName(commitTime, TaskContext.getPartitionId(), fileId)); + writeStatus.getStat().setPath(path.toString()); + } + // update the new location of the record, so we know where to find it next + record.setNewLocation(new HoodieRecordLocation(commitTime, fileId)); + }); + this.records = records; + } - public void doAppend() { + private Optional getIndexedRecord(HoodieRecord hoodieRecord) { + Optional recordMetadata = hoodieRecord.getData().getMetadata(); + try { + Optional avroRecord = hoodieRecord.getData().getInsertValue(schema); - List recordList = new ArrayList<>(); - List keysToDelete = new ArrayList<>(); - Map metadata = Maps.newHashMap(); - metadata.put(HoodieLogBlock.LogMetadataType.INSTANT_TIME, commitTime); - records.stream().forEach(record -> { - Optional indexedRecord = getIndexedRecord(record); - if(indexedRecord.isPresent()) { - recordList.add(indexedRecord.get()); - } else { - keysToDelete.add(record.getRecordKey()); - } - }); - try { - if(recordList.size() > 0) { - writer = writer.appendBlock(new HoodieAvroDataBlock(recordList, schema, metadata)); - } - if(keysToDelete.size() > 0) { - writer = writer.appendBlock(new HoodieDeleteBlock(keysToDelete.stream().toArray(String[]::new), metadata)); - } - } catch (Exception e) { - throw new 
HoodieAppendException( - "Failed while appeding records to " + currentLogFile.getPath(), e); - } - } + if (avroRecord.isPresent()) { + String seqId = HoodieRecord.generateSequenceId(commitTime, TaskContext.getPartitionId(), + recordIndex.getAndIncrement()); + HoodieAvroUtils + .addHoodieKeyToRecord((GenericRecord) avroRecord.get(), hoodieRecord.getRecordKey(), + hoodieRecord.getPartitionPath(), fileId); + HoodieAvroUtils + .addCommitMetadataToRecord((GenericRecord) avroRecord.get(), commitTime, seqId); + recordsWritten++; + } else { + recordsDeleted++; + } - public void close() { - try { - if (writer != null) { - writer.close(); - } - writeStatus.getStat().setNumWrites(recordsWritten); - writeStatus.getStat().setNumDeletes(recordsDeleted); - writeStatus.getStat().setTotalWriteErrors(writeStatus.getFailedRecords().size()); - } catch (IOException e) { - throw new HoodieUpsertException("Failed to close UpdateHandle", e); - } + hoodieRecord.deflate(); + writeStatus.markSuccess(hoodieRecord, recordMetadata); + return avroRecord; + } catch (Exception e) { + logger.error("Error writing record " + hoodieRecord, e); + writeStatus.markFailure(hoodieRecord, e, recordMetadata); } + return Optional.empty(); + } - public WriteStatus getWriteStatus() { - return writeStatus; + public void doAppend() { + + List recordList = new ArrayList<>(); + List keysToDelete = new ArrayList<>(); + Map metadata = Maps.newHashMap(); + metadata.put(HoodieLogBlock.LogMetadataType.INSTANT_TIME, commitTime); + records.stream().forEach(record -> { + Optional indexedRecord = getIndexedRecord(record); + if (indexedRecord.isPresent()) { + recordList.add(indexedRecord.get()); + } else { + keysToDelete.add(record.getRecordKey()); + } + }); + try { + if (recordList.size() > 0) { + writer = writer.appendBlock(new HoodieAvroDataBlock(recordList, schema, metadata)); + } + if (keysToDelete.size() > 0) { + writer = writer.appendBlock( + new HoodieDeleteBlock(keysToDelete.stream().toArray(String[]::new), 
metadata)); + } + } catch (Exception e) { + throw new HoodieAppendException( + "Failed while appeding records to " + currentLogFile.getPath(), e); } + } + + public void close() { + try { + if (writer != null) { + writer.close(); + } + writeStatus.getStat().setNumWrites(recordsWritten); + writeStatus.getStat().setNumDeletes(recordsDeleted); + writeStatus.getStat().setTotalWriteErrors(writeStatus.getFailedRecords().size()); + } catch (IOException e) { + throw new HoodieUpsertException("Failed to close UpdateHandle", e); + } + } + + public WriteStatus getWriteStatus() { + return writeStatus; + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCleanHelper.java b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCleanHelper.java index 410dea8e3..086b87f89 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCleanHelper.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCleanHelper.java @@ -27,226 +27,212 @@ import com.uber.hoodie.common.table.TableFileSystemView; import com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.table.HoodieTable; -import org.apache.hadoop.fs.FileSystem; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Optional; import java.util.stream.Collectors; +import org.apache.hadoop.fs.FileSystem; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; /** * Cleaner is responsible for garbage collecting older files in a given partition path, such that - *

- * 1) It provides sufficient time for existing queries running on older versions, to finish - *

- * 2) It bounds the growth of the files in the file system - *

- * TODO: Should all cleaning be done based on {@link com.uber.hoodie.common.model.HoodieCommitMetadata} + *

1) It provides sufficient time for existing queries running on older versions, to finish

+ * 2) It bounds the growth of the files in the file system

TODO: Should all cleaning be done + * based on {@link com.uber.hoodie.common.model.HoodieCommitMetadata} */ public class HoodieCleanHelper> { - private static Logger logger = LogManager.getLogger(HoodieCleanHelper.class); - private final TableFileSystemView fileSystemView; - private final HoodieTimeline commitTimeline; - private HoodieTable hoodieTable; - private HoodieWriteConfig config; - private FileSystem fs; + private static Logger logger = LogManager.getLogger(HoodieCleanHelper.class); - public HoodieCleanHelper(HoodieTable hoodieTable, HoodieWriteConfig config) { - this.hoodieTable = hoodieTable; - this.fileSystemView = hoodieTable.getCompletedFileSystemView(); - this.commitTimeline = hoodieTable.getCompletedCommitTimeline(); - this.config = config; - this.fs = hoodieTable.getFs(); + private final TableFileSystemView fileSystemView; + private final HoodieTimeline commitTimeline; + private HoodieTable hoodieTable; + private HoodieWriteConfig config; + private FileSystem fs; + + public HoodieCleanHelper(HoodieTable hoodieTable, HoodieWriteConfig config) { + this.hoodieTable = hoodieTable; + this.fileSystemView = hoodieTable.getCompletedFileSystemView(); + this.commitTimeline = hoodieTable.getCompletedCommitTimeline(); + this.config = config; + this.fs = hoodieTable.getFs(); + } + + + /** + * Selects the older versions of files for cleaning, such that it bounds the number of versions of + * each file. This policy is useful, if you are simply interested in querying the table, and you + * don't want too many versions for a single file (i.e run it with versionsRetained = 1) + */ + private List getFilesToCleanKeepingLatestVersions(String partitionPath) + throws IOException { + logger.info("Cleaning " + partitionPath + ", retaining latest " + config + .getCleanerFileVersionsRetained() + " file versions. 
"); + List fileGroups = + fileSystemView.getAllFileGroups(partitionPath) + .collect(Collectors.toList()); + List deletePaths = new ArrayList<>(); + // Collect all the datafiles savepointed by all the savepoints + List savepointedFiles = hoodieTable.getSavepoints().stream() + .flatMap(s -> hoodieTable.getSavepointedDataFiles(s)).collect(Collectors.toList()); + + for (HoodieFileGroup fileGroup : fileGroups) { + int keepVersions = config.getCleanerFileVersionsRetained(); + Iterator fileSliceIterator = fileGroup.getAllFileSlices().iterator(); + while (fileSliceIterator.hasNext() && keepVersions > 0) { + // Skip this most recent version + FileSlice nextSlice = fileSliceIterator.next(); + HoodieDataFile dataFile = nextSlice.getDataFile().get(); + if (savepointedFiles.contains(dataFile.getFileName())) { + // do not clean up a savepoint data file + continue; + } + keepVersions--; + } + // Delete the remaining files + while (fileSliceIterator.hasNext()) { + FileSlice nextSlice = fileSliceIterator.next(); + HoodieDataFile dataFile = nextSlice.getDataFile().get(); + deletePaths.add(dataFile.getFileStatus().getPath().toString()); + if (hoodieTable.getMetaClient().getTableType() + == HoodieTableType.MERGE_ON_READ) { + // If merge on read, then clean the log files for the commits as well + deletePaths.addAll(nextSlice.getLogFiles() + .map(file -> file.getPath().toString()) + .collect(Collectors.toList())); + } + } } + return deletePaths; + } - /** - * Selects the older versions of files for cleaning, such that it bounds the number of versions of each file. 
- * This policy is useful, if you are simply interested in querying the table, and you don't want too many - * versions for a single file (i.e run it with versionsRetained = 1) - * - * @param partitionPath - * @return - * @throws IOException - */ - private List getFilesToCleanKeepingLatestVersions(String partitionPath) - throws IOException { - logger.info("Cleaning " + partitionPath + ", retaining latest " + config - .getCleanerFileVersionsRetained() + " file versions. "); - List fileGroups = - fileSystemView.getAllFileGroups(partitionPath) - .collect(Collectors.toList()); - List deletePaths = new ArrayList<>(); - // Collect all the datafiles savepointed by all the savepoints - List savepointedFiles = hoodieTable.getSavepoints().stream() - .flatMap(s -> hoodieTable.getSavepointedDataFiles(s)).collect(Collectors.toList()); + /** + * Selects the versions for file for cleaning, such that it

- Leaves the latest version of the + * file untouched - For older versions, - It leaves all the commits untouched which have occurred in + * last config.getCleanerCommitsRetained() commits - It leaves ONE commit before this + * window. We assume that the max(query execution time) == commit_batch_time * + * config.getCleanerCommitsRetained(). This is 12 hours by default. This is essential to leave the + * file used by the query that's running for the max time.

This provides the effect of having + * lookback into all changes that happened in the last X commits. (eg: if you retain 24 commits, + * and commit batch time is 30 mins, then you have 12 hrs of lookback)

This policy is the + * default. + */ + private List getFilesToCleanKeepingLatestCommits(String partitionPath) + throws IOException { + int commitsRetained = config.getCleanerCommitsRetained(); + logger.info( + "Cleaning " + partitionPath + ", retaining latest " + commitsRetained + " commits. "); + List deletePaths = new ArrayList<>(); - for (HoodieFileGroup fileGroup : fileGroups) { - int keepVersions = config.getCleanerFileVersionsRetained(); - Iterator fileSliceIterator = fileGroup.getAllFileSlices().iterator(); - while (fileSliceIterator.hasNext() && keepVersions > 0) { - // Skip this most recent version - FileSlice nextSlice = fileSliceIterator.next(); - HoodieDataFile dataFile = nextSlice.getDataFile().get(); - if(savepointedFiles.contains(dataFile.getFileName())) { - // do not clean up a savepoint data file - continue; - } - keepVersions--; - } - // Delete the remaining files - while (fileSliceIterator.hasNext()) { - FileSlice nextSlice = fileSliceIterator.next(); - HoodieDataFile dataFile = nextSlice.getDataFile().get(); - deletePaths.add(dataFile.getFileStatus().getPath().toString()); - if (hoodieTable.getMetaClient().getTableType() - == HoodieTableType.MERGE_ON_READ) { - // If merge on read, then clean the log files for the commits as well - deletePaths.addAll(nextSlice.getLogFiles() - .map(file -> file.getPath().toString()) - .collect(Collectors.toList())); - } + // Collect all the datafiles savepointed by all the savepoints + List savepointedFiles = hoodieTable.getSavepoints().stream() + .flatMap(s -> hoodieTable.getSavepointedDataFiles(s)).collect(Collectors.toList()); + + // determine if we have enough commits, to start cleaning. 
+ if (commitTimeline.countInstants() > commitsRetained) { + HoodieInstant earliestCommitToRetain = getEarliestCommitToRetain().get(); + List fileGroups = + fileSystemView.getAllFileGroups(partitionPath) + .collect(Collectors.toList()); + for (HoodieFileGroup fileGroup : fileGroups) { + List fileSliceList = fileGroup.getAllFileSlices().collect(Collectors.toList()); + HoodieDataFile dataFile = fileSliceList.get(0).getDataFile().get(); + String lastVersion = dataFile.getCommitTime(); + String lastVersionBeforeEarliestCommitToRetain = + getLatestVersionBeforeCommit(fileSliceList, earliestCommitToRetain); + + // Ensure there are more than 1 version of the file (we only clean old files from updates) + // i.e always spare the last commit. + for (FileSlice aSlice : fileSliceList) { + HoodieDataFile aFile = aSlice.getDataFile().get(); + String fileCommitTime = aFile.getCommitTime(); + if (savepointedFiles.contains(aFile.getFileName())) { + // do not clean up a savepoint data file + continue; + } + // Dont delete the latest commit and also the last commit before the earliest commit we are retaining + // The window of commit retain == max query run time. So a query could be running which still + // uses this file. + if (fileCommitTime.equals(lastVersion) || ( + lastVersionBeforeEarliestCommitToRetain != null && fileCommitTime + .equals(lastVersionBeforeEarliestCommitToRetain))) { + // move on to the next file + continue; + } + + // Always keep the last commit + if (HoodieTimeline.compareTimestamps( + earliestCommitToRetain.getTimestamp(), + fileCommitTime, + HoodieTimeline.GREATER)) { + // this is a commit, that should be cleaned. 
+ deletePaths.add(aFile.getFileStatus().getPath().toString()); + if (hoodieTable.getMetaClient().getTableType() + == HoodieTableType.MERGE_ON_READ) { + // If merge on read, then clean the log files for the commits as well + deletePaths.addAll(aSlice.getLogFiles() + .map(file -> file.getPath().toString()) + .collect(Collectors.toList())); } + } } - return deletePaths; + } } + return deletePaths; + } - /** - * Selects the versions for file for cleaning, such that it - *

- * - Leaves the latest version of the file untouched - * - For older versions, - * - It leaves all the commits untouched which has occured in last config.getCleanerCommitsRetained() commits - * - It leaves ONE commit before this window. We assume that the max(query execution time) == commit_batch_time * config.getCleanerCommitsRetained(). This is 12 hours by default. - * This is essential to leave the file used by the query thats running for the max time. - *

- * This provides the effect of having lookback into all changes that happened in the last X - * commits. (eg: if you retain 24 commits, and commit batch time is 30 mins, then you have 12 hrs of lookback) - *

- * This policy is the default. - * - * @param partitionPath - * @return - * @throws IOException - */ - private List getFilesToCleanKeepingLatestCommits(String partitionPath) - throws IOException { - int commitsRetained = config.getCleanerCommitsRetained(); - logger.info( - "Cleaning " + partitionPath + ", retaining latest " + commitsRetained + " commits. "); - List deletePaths = new ArrayList<>(); - - // Collect all the datafiles savepointed by all the savepoints - List savepointedFiles = hoodieTable.getSavepoints().stream() - .flatMap(s -> hoodieTable.getSavepointedDataFiles(s)).collect(Collectors.toList()); - - // determine if we have enough commits, to start cleaning. - if (commitTimeline.countInstants() > commitsRetained) { - HoodieInstant earliestCommitToRetain = getEarliestCommitToRetain().get(); - List fileGroups = - fileSystemView.getAllFileGroups(partitionPath) - .collect(Collectors.toList()); - for (HoodieFileGroup fileGroup : fileGroups) { - List fileSliceList = fileGroup.getAllFileSlices().collect(Collectors.toList()); - HoodieDataFile dataFile = fileSliceList.get(0).getDataFile().get(); - String lastVersion = dataFile.getCommitTime(); - String lastVersionBeforeEarliestCommitToRetain = - getLatestVersionBeforeCommit(fileSliceList, earliestCommitToRetain); - - // Ensure there are more than 1 version of the file (we only clean old files from updates) - // i.e always spare the last commit. - for (FileSlice aSlice : fileSliceList) { - HoodieDataFile aFile = aSlice.getDataFile().get(); - String fileCommitTime = aFile.getCommitTime(); - if(savepointedFiles.contains(aFile.getFileName())) { - // do not clean up a savepoint data file - continue; - } - // Dont delete the latest commit and also the last commit before the earliest commit we are retaining - // The window of commit retain == max query run time. So a query could be running which still - // uses this file. 
- if (fileCommitTime.equals(lastVersion) || ( - lastVersionBeforeEarliestCommitToRetain != null && fileCommitTime - .equals(lastVersionBeforeEarliestCommitToRetain))) { - // move on to the next file - continue; - } - - // Always keep the last commit - if (HoodieTimeline.compareTimestamps( - earliestCommitToRetain.getTimestamp(), - fileCommitTime, - HoodieTimeline.GREATER)) { - // this is a commit, that should be cleaned. - deletePaths.add(aFile.getFileStatus().getPath().toString()); - if (hoodieTable.getMetaClient().getTableType() - == HoodieTableType.MERGE_ON_READ) { - // If merge on read, then clean the log files for the commits as well - deletePaths.addAll(aSlice.getLogFiles() - .map(file -> file.getPath().toString()) - .collect(Collectors.toList())); - } - } - } - } - } - - return deletePaths; + /** + * Gets the latest version < commitTime. This version file could still be used by queries. + */ + private String getLatestVersionBeforeCommit(List fileSliceList, + HoodieInstant commitTime) { + for (FileSlice file : fileSliceList) { + String fileCommitTime = file.getDataFile().get().getCommitTime(); + if (HoodieTimeline.compareTimestamps(commitTime.getTimestamp(), fileCommitTime, + HoodieTimeline.GREATER)) { + // fileList is sorted on the reverse, so the first commit we find <= commitTime is the one we want + return fileCommitTime; + } } + // There is no version of this file which is <= commitTime + return null; + } - /** - * Gets the latest version < commitTime. This version file could still be used by queries. 
- */ - private String getLatestVersionBeforeCommit(List fileSliceList, - HoodieInstant commitTime) { - for (FileSlice file : fileSliceList) { - String fileCommitTime = file.getDataFile().get().getCommitTime(); - if (HoodieTimeline.compareTimestamps(commitTime.getTimestamp(), fileCommitTime, - HoodieTimeline.GREATER)) { - // fileList is sorted on the reverse, so the first commit we find <= commitTime is the one we want - return fileCommitTime; - } - } - // There is no version of this file which is <= commitTime - return null; + /** + * Returns files to be cleaned for the given partitionPath based on cleaning policy. + */ + public List getDeletePaths(String partitionPath) throws IOException { + HoodieCleaningPolicy policy = config.getCleanerPolicy(); + List deletePaths; + if (policy == HoodieCleaningPolicy.KEEP_LATEST_COMMITS) { + deletePaths = getFilesToCleanKeepingLatestCommits(partitionPath); + } else if (policy == HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS) { + deletePaths = getFilesToCleanKeepingLatestVersions(partitionPath); + } else { + throw new IllegalArgumentException("Unknown cleaning policy : " + policy.name()); } + logger.info( + deletePaths.size() + " patterns used to delete in partition path:" + partitionPath); - /** - * Returns files to be cleaned for the given partitionPath based on cleaning policy. 
- */ - public List getDeletePaths(String partitionPath) throws IOException { - HoodieCleaningPolicy policy = config.getCleanerPolicy(); - List deletePaths; - if (policy == HoodieCleaningPolicy.KEEP_LATEST_COMMITS) { - deletePaths = getFilesToCleanKeepingLatestCommits(partitionPath); - } else if (policy == HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS) { - deletePaths = getFilesToCleanKeepingLatestVersions(partitionPath); - } else { - throw new IllegalArgumentException("Unknown cleaning policy : " + policy.name()); - } - logger.info( - deletePaths.size() + " patterns used to delete in partition path:" + partitionPath); + return deletePaths; + } - return deletePaths; - } - - /** - * Returns earliest commit to retain based on cleaning policy. - */ - public Optional getEarliestCommitToRetain() { - Optional earliestCommitToRetain = Optional.empty(); - int commitsRetained = config.getCleanerCommitsRetained(); - if (config.getCleanerPolicy() == HoodieCleaningPolicy.KEEP_LATEST_COMMITS - && commitTimeline.countInstants() > commitsRetained) { - earliestCommitToRetain = - commitTimeline.nthInstant(commitTimeline.countInstants() - commitsRetained); - } - return earliestCommitToRetain; + /** + * Returns earliest commit to retain based on cleaning policy. 
+ */ + public Optional getEarliestCommitToRetain() { + Optional earliestCommitToRetain = Optional.empty(); + int commitsRetained = config.getCleanerCommitsRetained(); + if (config.getCleanerPolicy() == HoodieCleaningPolicy.KEEP_LATEST_COMMITS + && commitTimeline.countInstants() > commitsRetained) { + earliestCommitToRetain = + commitTimeline.nthInstant(commitTimeline.countInstants() - commitsRetained); } + return earliestCommitToRetain; + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCommitArchiveLog.java b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCommitArchiveLog.java index 18279a498..a707590c4 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCommitArchiveLog.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCommitArchiveLog.java @@ -39,6 +39,12 @@ import com.uber.hoodie.exception.HoodieCommitException; import com.uber.hoodie.exception.HoodieException; import com.uber.hoodie.exception.HoodieIOException; import com.uber.hoodie.table.HoodieTable; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; +import java.util.stream.Collectors; +import java.util.stream.Stream; import org.apache.avro.Schema; import org.apache.avro.generic.IndexedRecord; import org.apache.hadoop.fs.FileSystem; @@ -46,225 +52,233 @@ import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import java.util.Optional; -import java.util.stream.Collectors; -import java.util.stream.Stream; - /** * Archiver to bound the growth of .commit files */ public class HoodieCommitArchiveLog { - private static Logger log = LogManager.getLogger(HoodieCommitArchiveLog.class); - private final Path archiveFilePath; - private final FileSystem fs; - private final HoodieWriteConfig config; - private HoodieLogFormat.Writer writer; + private static Logger log = 
LogManager.getLogger(HoodieCommitArchiveLog.class); - public HoodieCommitArchiveLog(HoodieWriteConfig config, FileSystem fs) { - this.fs = fs; - this.config = config; - this.archiveFilePath = HoodieArchivedTimeline - .getArchiveLogPath(config.getBasePath() + "/" + HoodieTableMetaClient.METAFOLDER_NAME); + private final Path archiveFilePath; + private final FileSystem fs; + private final HoodieWriteConfig config; + private HoodieLogFormat.Writer writer; + + public HoodieCommitArchiveLog(HoodieWriteConfig config, FileSystem fs) { + this.fs = fs; + this.config = config; + this.archiveFilePath = HoodieArchivedTimeline + .getArchiveLogPath(config.getBasePath() + "/" + HoodieTableMetaClient.METAFOLDER_NAME); + } + + private HoodieLogFormat.Writer openWriter() { + try { + if (this.writer == null) { + return HoodieLogFormat.newWriterBuilder() + .onParentPath(archiveFilePath.getParent()) + .withFileId(archiveFilePath.getName()) + .withFileExtension(HoodieArchivedLogFile.ARCHIVE_EXTENSION) + .withFs(fs) + .overBaseCommit("").build(); + } else { + return this.writer; + } + } catch (InterruptedException | IOException e) { + throw new HoodieException("Unable to initialize HoodieLogFormat writer", e); + } + } + + private void close() { + try { + if (this.writer != null) { + this.writer.close(); + } + } catch (IOException e) { + throw new HoodieException("Unable to close HoodieLogFormat writer", e); + } + } + + /** + * Check if commits need to be archived. If yes, archive commits. 
+ */ + public boolean archiveIfRequired() { + try { + List instantsToArchive = getInstantsToArchive().collect(Collectors.toList()); + boolean success = true; + if (instantsToArchive.iterator().hasNext()) { + this.writer = openWriter(); + log.info("Archiving instants " + instantsToArchive); + archive(instantsToArchive); + success = deleteArchivedInstants(instantsToArchive); + } else { + log.info("No Instants to archive"); + } + return success; + } finally { + close(); + } + } + + private Stream getInstantsToArchive() { + + // TODO : rename to max/minInstantsToKeep + int maxCommitsToKeep = config.getMaxCommitsToKeep(); + int minCommitsToKeep = config.getMinCommitsToKeep(); + + HoodieTable table = HoodieTable + .getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config); + + // GroupBy each action and limit each action timeline to maxCommitsToKeep + HoodieTimeline cleanAndRollbackTimeline = table.getActiveTimeline() + .getTimelineOfActions(Sets.newHashSet(HoodieTimeline.CLEAN_ACTION, + HoodieTimeline.ROLLBACK_ACTION)); + Stream instants = cleanAndRollbackTimeline.getInstants() + .collect(Collectors.groupingBy(s -> s.getAction())) + .entrySet() + .stream() + .map(i -> { + if (i.getValue().size() > maxCommitsToKeep) { + return i.getValue().subList(0, i.getValue().size() - minCommitsToKeep); + } else { + return new ArrayList(); + } + }) + .flatMap(i -> i.stream()); + + //TODO (na) : Add a way to return actions associated with a timeline and then merge/unify with logic above to avoid Stream.concats + HoodieTimeline commitTimeline = table.getCompletedCommitTimeline(); + // We cannot have any holes in the commit timeline. We cannot archive any commits which are made after the first savepoint present. 
+ Optional firstSavepoint = table.getCompletedSavepointTimeline().firstInstant(); + if (!commitTimeline.empty() && commitTimeline.countInstants() > maxCommitsToKeep) { + // Actually do the commits + instants = Stream.concat(instants, commitTimeline.getInstants().filter(s -> { + // if no savepoint present, then dont filter + return !(firstSavepoint.isPresent() && HoodieTimeline + .compareTimestamps(firstSavepoint.get().getTimestamp(), s.getTimestamp(), + HoodieTimeline.LESSER_OR_EQUAL)); + }).limit(commitTimeline.countInstants() - minCommitsToKeep)); } - private HoodieLogFormat.Writer openWriter() { - try { - if(this.writer == null) { - return HoodieLogFormat.newWriterBuilder() - .onParentPath(archiveFilePath.getParent()) - .withFileId(archiveFilePath.getName()) - .withFileExtension(HoodieArchivedLogFile.ARCHIVE_EXTENSION) - .withFs(fs) - .overBaseCommit("").build(); - } else { - return this.writer; - } - } catch(InterruptedException | IOException e) { - throw new HoodieException("Unable to initialize HoodieLogFormat writer", e); + return instants; + } + + private boolean deleteArchivedInstants(List archivedInstants) { + log.info("Deleting instants " + archivedInstants); + HoodieTableMetaClient metaClient = + new HoodieTableMetaClient(fs, config.getBasePath(), true); + + boolean success = true; + for (HoodieInstant archivedInstant : archivedInstants) { + Path commitFile = + new Path(metaClient.getMetaPath(), archivedInstant.getFileName()); + try { + if (fs.exists(commitFile)) { + success &= fs.delete(commitFile, false); + log.info("Archived and deleted instant file " + commitFile); } + } catch (IOException e) { + throw new HoodieIOException("Failed to delete archived instant " + archivedInstant, + e); + } } + return success; + } - private void close() { - try { - if(this.writer != null) { - this.writer.close(); - } - } catch(IOException e) { - throw new HoodieException("Unable to close HoodieLogFormat writer", e); - } + public void archive(List instants) throws 
HoodieCommitException { + + try { + HoodieTableMetaClient metaClient = + new HoodieTableMetaClient(fs, config.getBasePath(), true); + HoodieTimeline commitTimeline = + metaClient.getActiveTimeline().getAllCommitsTimeline().filterCompletedInstants(); + + Schema wrapperSchema = HoodieArchivedMetaEntry.getClassSchema(); + log.info("Wrapper schema " + wrapperSchema.toString()); + List records = new ArrayList<>(); + for (HoodieInstant hoodieInstant : instants) { + records.add(convertToAvroRecord(commitTimeline, hoodieInstant)); + } + HoodieAvroDataBlock block = new HoodieAvroDataBlock(records, wrapperSchema); + this.writer = writer.appendBlock(block); + } catch (Exception e) { + throw new HoodieCommitException("Failed to archive commits", e); } + } - /** - * Check if commits need to be archived. If yes, archive commits. - */ - public boolean archiveIfRequired() { - try { - List instantsToArchive = getInstantsToArchive().collect(Collectors.toList()); - boolean success = true; - if (instantsToArchive.iterator().hasNext()) { - this.writer = openWriter(); - log.info("Archiving instants " + instantsToArchive); - archive(instantsToArchive); - success = deleteArchivedInstants(instantsToArchive); - } else { - log.info("No Instants to archive"); - } - return success; - } finally { - close(); - } + public Path getArchiveFilePath() { + return archiveFilePath; + } + + private IndexedRecord convertToAvroRecord(HoodieTimeline commitTimeline, + HoodieInstant hoodieInstant) throws IOException { + HoodieArchivedMetaEntry archivedMetaWrapper = new HoodieArchivedMetaEntry(); + archivedMetaWrapper.setCommitTime(hoodieInstant.getTimestamp()); + switch (hoodieInstant.getAction()) { + case HoodieTimeline.CLEAN_ACTION: { + archivedMetaWrapper.setHoodieCleanMetadata(AvroUtils + .deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(), + HoodieCleanMetadata.class)); + archivedMetaWrapper.setActionType(ActionType.clean.name()); + break; + } + case 
HoodieTimeline.COMMIT_ACTION: { + HoodieCommitMetadata commitMetadata = HoodieCommitMetadata + .fromBytes(commitTimeline.getInstantDetails(hoodieInstant).get()); + archivedMetaWrapper.setHoodieCommitMetadata(commitMetadataConverter(commitMetadata)); + archivedMetaWrapper.setActionType(ActionType.commit.name()); + break; + } + case HoodieTimeline.COMPACTION_ACTION: { + com.uber.hoodie.common.model.HoodieCompactionMetadata compactionMetadata = com.uber.hoodie.common.model.HoodieCompactionMetadata + .fromBytes(commitTimeline.getInstantDetails(hoodieInstant).get()); + archivedMetaWrapper + .setHoodieCompactionMetadata(compactionMetadataConverter(compactionMetadata)); + archivedMetaWrapper.setActionType(ActionType.compaction.name()); + break; + } + case HoodieTimeline.ROLLBACK_ACTION: { + archivedMetaWrapper.setHoodieRollbackMetadata(AvroUtils + .deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(), + HoodieRollbackMetadata.class)); + archivedMetaWrapper.setActionType(ActionType.rollback.name()); + break; + } + case HoodieTimeline.SAVEPOINT_ACTION: { + archivedMetaWrapper.setHoodieSavePointMetadata(AvroUtils + .deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(), + HoodieSavepointMetadata.class)); + archivedMetaWrapper.setActionType(ActionType.savepoint.name()); + break; + } + case HoodieTimeline.DELTA_COMMIT_ACTION: { + HoodieCommitMetadata commitMetadata = HoodieCommitMetadata + .fromBytes(commitTimeline.getInstantDetails(hoodieInstant).get()); + archivedMetaWrapper.setHoodieCommitMetadata(commitMetadataConverter(commitMetadata)); + archivedMetaWrapper.setActionType(ActionType.commit.name()); + break; + } } + return archivedMetaWrapper; + } - private Stream getInstantsToArchive() { + private com.uber.hoodie.avro.model.HoodieCommitMetadata commitMetadataConverter( + HoodieCommitMetadata hoodieCommitMetadata) { + ObjectMapper mapper = new ObjectMapper(); + //Need this to ignore other public get() methods + 
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + com.uber.hoodie.avro.model.HoodieCommitMetadata avroMetaData = + mapper.convertValue(hoodieCommitMetadata, + com.uber.hoodie.avro.model.HoodieCommitMetadata.class); + return avroMetaData; + } - // TODO : rename to max/minInstantsToKeep - int maxCommitsToKeep = config.getMaxCommitsToKeep(); - int minCommitsToKeep = config.getMinCommitsToKeep(); - - HoodieTable table = HoodieTable.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config); - - // GroupBy each action and limit each action timeline to maxCommitsToKeep - HoodieTimeline cleanAndRollbackTimeline = table.getActiveTimeline().getTimelineOfActions(Sets.newHashSet(HoodieTimeline.CLEAN_ACTION, - HoodieTimeline.ROLLBACK_ACTION)); - Stream instants = cleanAndRollbackTimeline.getInstants() - .collect(Collectors.groupingBy(s -> s.getAction())) - .entrySet() - .stream() - .map(i -> { - if (i.getValue().size() > maxCommitsToKeep) { - return i.getValue().subList(0, i.getValue().size() - minCommitsToKeep); - } else { - return new ArrayList(); - } - }) - .flatMap(i -> i.stream()); - - //TODO (na) : Add a way to return actions associated with a timeline and then merge/unify with logic above to avoid Stream.concats - HoodieTimeline commitTimeline = table.getCompletedCommitTimeline(); - // We cannot have any holes in the commit timeline. We cannot archive any commits which are made after the first savepoint present. 
- Optional firstSavepoint = table.getCompletedSavepointTimeline().firstInstant(); - if (!commitTimeline.empty() && commitTimeline.countInstants() > maxCommitsToKeep) { - // Actually do the commits - instants = Stream.concat(instants, commitTimeline.getInstants().filter(s -> { - // if no savepoint present, then dont filter - return !(firstSavepoint.isPresent() && HoodieTimeline - .compareTimestamps(firstSavepoint.get().getTimestamp(), s.getTimestamp(), - HoodieTimeline.LESSER_OR_EQUAL)); - }).limit(commitTimeline.countInstants() - minCommitsToKeep)); - } - - return instants; - } - - private boolean deleteArchivedInstants(List archivedInstants) { - log.info("Deleting instants " + archivedInstants); - HoodieTableMetaClient metaClient = - new HoodieTableMetaClient(fs, config.getBasePath(), true); - - boolean success = true; - for (HoodieInstant archivedInstant : archivedInstants) { - Path commitFile = - new Path(metaClient.getMetaPath(), archivedInstant.getFileName()); - try { - if (fs.exists(commitFile)) { - success &= fs.delete(commitFile, false); - log.info("Archived and deleted instant file " + commitFile); - } - } catch (IOException e) { - throw new HoodieIOException("Failed to delete archived instant " + archivedInstant, - e); - } - } - return success; - } - - public void archive(List instants) throws HoodieCommitException { - - try { - HoodieTableMetaClient metaClient = - new HoodieTableMetaClient(fs, config.getBasePath(), true); - HoodieTimeline commitTimeline = - metaClient.getActiveTimeline().getAllCommitsTimeline().filterCompletedInstants(); - - Schema wrapperSchema = HoodieArchivedMetaEntry.getClassSchema(); - log.info("Wrapper schema " + wrapperSchema.toString()); - List records = new ArrayList<>(); - for (HoodieInstant hoodieInstant : instants) { - records.add(convertToAvroRecord(commitTimeline, hoodieInstant)); - } - HoodieAvroDataBlock block = new HoodieAvroDataBlock(records, wrapperSchema); - this.writer = writer.appendBlock(block); - } catch(Exception 
e) { - throw new HoodieCommitException("Failed to archive commits", e); - } - } - - public Path getArchiveFilePath() { - return archiveFilePath; - } - - private IndexedRecord convertToAvroRecord(HoodieTimeline commitTimeline, HoodieInstant hoodieInstant) throws IOException { - HoodieArchivedMetaEntry archivedMetaWrapper = new HoodieArchivedMetaEntry(); - archivedMetaWrapper.setCommitTime(hoodieInstant.getTimestamp()); - switch(hoodieInstant.getAction()) { - case HoodieTimeline.CLEAN_ACTION:{ - archivedMetaWrapper.setHoodieCleanMetadata(AvroUtils.deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(), HoodieCleanMetadata.class)); - archivedMetaWrapper.setActionType(ActionType.clean.name()); - break; - } - case HoodieTimeline.COMMIT_ACTION:{ - HoodieCommitMetadata commitMetadata = HoodieCommitMetadata - .fromBytes(commitTimeline.getInstantDetails(hoodieInstant).get()); - archivedMetaWrapper.setHoodieCommitMetadata(commitMetadataConverter(commitMetadata)); - archivedMetaWrapper.setActionType(ActionType.commit.name()); - break; - } - case HoodieTimeline.COMPACTION_ACTION:{ - com.uber.hoodie.common.model.HoodieCompactionMetadata compactionMetadata = com.uber.hoodie.common.model.HoodieCompactionMetadata - .fromBytes(commitTimeline.getInstantDetails(hoodieInstant).get()); - archivedMetaWrapper.setHoodieCompactionMetadata(compactionMetadataConverter(compactionMetadata)); - archivedMetaWrapper.setActionType(ActionType.compaction.name()); - break; - } - case HoodieTimeline.ROLLBACK_ACTION:{ - archivedMetaWrapper.setHoodieRollbackMetadata(AvroUtils.deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(), HoodieRollbackMetadata.class)); - archivedMetaWrapper.setActionType(ActionType.rollback.name()); - break; - } - case HoodieTimeline.SAVEPOINT_ACTION:{ - archivedMetaWrapper.setHoodieSavePointMetadata(AvroUtils.deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(), HoodieSavepointMetadata.class)); - 
archivedMetaWrapper.setActionType(ActionType.savepoint.name()); - break; - } - case HoodieTimeline.DELTA_COMMIT_ACTION:{ - HoodieCommitMetadata commitMetadata = HoodieCommitMetadata - .fromBytes(commitTimeline.getInstantDetails(hoodieInstant).get()); - archivedMetaWrapper.setHoodieCommitMetadata(commitMetadataConverter(commitMetadata)); - archivedMetaWrapper.setActionType(ActionType.commit.name()); - break; - } - } - return archivedMetaWrapper; - } - - private com.uber.hoodie.avro.model.HoodieCommitMetadata commitMetadataConverter(HoodieCommitMetadata hoodieCommitMetadata) { - ObjectMapper mapper = new ObjectMapper(); - //Need this to ignore other public get() methods - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - com.uber.hoodie.avro.model.HoodieCommitMetadata avroMetaData = - mapper.convertValue(hoodieCommitMetadata, com.uber.hoodie.avro.model.HoodieCommitMetadata.class); - return avroMetaData; - } - - private com.uber.hoodie.avro.model.HoodieCompactionMetadata compactionMetadataConverter(HoodieCompactionMetadata hoodieCompactionMetadata) { - ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - com.uber.hoodie.avro.model.HoodieCompactionMetadata avroMetaData = mapper.convertValue(hoodieCompactionMetadata, - com.uber.hoodie.avro.model.HoodieCompactionMetadata.class); - return avroMetaData; - } + private com.uber.hoodie.avro.model.HoodieCompactionMetadata compactionMetadataConverter( + HoodieCompactionMetadata hoodieCompactionMetadata) { + ObjectMapper mapper = new ObjectMapper(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + com.uber.hoodie.avro.model.HoodieCompactionMetadata avroMetaData = mapper + .convertValue(hoodieCompactionMetadata, + com.uber.hoodie.avro.model.HoodieCompactionMetadata.class); + return avroMetaData; + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCreateHandle.java 
b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCreateHandle.java index c9680c8f2..629869e18 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCreateHandle.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCreateHandle.java @@ -29,116 +29,111 @@ import com.uber.hoodie.exception.HoodieInsertException; import com.uber.hoodie.io.storage.HoodieStorageWriter; import com.uber.hoodie.io.storage.HoodieStorageWriterFactory; import com.uber.hoodie.table.HoodieTable; +import java.io.IOException; +import java.util.Optional; +import java.util.UUID; import org.apache.avro.generic.IndexedRecord; import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.spark.TaskContext; -import java.io.IOException; -import java.util.Optional; -import java.util.UUID; - public class HoodieCreateHandle extends HoodieIOHandle { - private static Logger logger = LogManager.getLogger(HoodieCreateHandle.class); - private final WriteStatus status; - private final HoodieStorageWriter storageWriter; - private final Path path; - private long recordsWritten = 0; - private long recordsDeleted = 0; + private static Logger logger = LogManager.getLogger(HoodieCreateHandle.class); - public HoodieCreateHandle(HoodieWriteConfig config, String commitTime, - HoodieTable hoodieTable, String partitionPath) { - super(config, commitTime, hoodieTable); - this.status = ReflectionUtils.loadClass(config.getWriteStatusClassName()); - status.setFileId(UUID.randomUUID().toString()); - status.setPartitionPath(partitionPath); + private final WriteStatus status; + private final HoodieStorageWriter storageWriter; + private final Path path; + private long recordsWritten = 0; + private long recordsDeleted = 0; - this.path = makeNewPath(partitionPath, TaskContext.getPartitionId(), status.getFileId()); - try { - HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(fs, - commitTime, - new Path(config.getBasePath()), - 
new Path(config.getBasePath(), partitionPath)); - partitionMetadata.trySave(TaskContext.getPartitionId()); - this.storageWriter = - HoodieStorageWriterFactory.getStorageWriter(commitTime, path, hoodieTable, config, schema); - } catch (IOException e) { - throw new HoodieInsertException( - "Failed to initialize HoodieStorageWriter for path " + path, e); - } - logger.info("New InsertHandle for partition :" + partitionPath); + public HoodieCreateHandle(HoodieWriteConfig config, String commitTime, + HoodieTable hoodieTable, String partitionPath) { + super(config, commitTime, hoodieTable); + this.status = ReflectionUtils.loadClass(config.getWriteStatusClassName()); + status.setFileId(UUID.randomUUID().toString()); + status.setPartitionPath(partitionPath); + + this.path = makeNewPath(partitionPath, TaskContext.getPartitionId(), status.getFileId()); + try { + HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(fs, + commitTime, + new Path(config.getBasePath()), + new Path(config.getBasePath(), partitionPath)); + partitionMetadata.trySave(TaskContext.getPartitionId()); + this.storageWriter = + HoodieStorageWriterFactory + .getStorageWriter(commitTime, path, hoodieTable, config, schema); + } catch (IOException e) { + throw new HoodieInsertException( + "Failed to initialize HoodieStorageWriter for path " + path, e); } + logger.info("New InsertHandle for partition :" + partitionPath); + } - /** - * Determines whether we can accept the incoming records, into the current file, depending on - * - * - Whether it belongs to the same partitionPath as existing records - * - Whether the current file written bytes lt max file size - * - * @return - */ - public boolean canWrite(HoodieRecord record) { - return storageWriter.canWrite() && record.getPartitionPath() - .equals(status.getPartitionPath()); + /** + * Determines whether we can accept the incoming records, into the current file, depending on + * + * - Whether it belongs to the same partitionPath as existing 
records - Whether the current file + * written bytes lt max file size + */ + public boolean canWrite(HoodieRecord record) { + return storageWriter.canWrite() && record.getPartitionPath() + .equals(status.getPartitionPath()); + } + + /** + * Perform the actual writing of the given record into the backing file. + */ + public void write(HoodieRecord record) { + Optional recordMetadata = record.getData().getMetadata(); + try { + Optional avroRecord = record.getData().getInsertValue(schema); + + if (avroRecord.isPresent()) { + storageWriter.writeAvroWithMetadata(avroRecord.get(), record); + // update the new location of record, so we know where to find it next + record.setNewLocation(new HoodieRecordLocation(commitTime, status.getFileId())); + recordsWritten++; + } else { + recordsDeleted++; + } + record.deflate(); + status.markSuccess(record, recordMetadata); + } catch (Throwable t) { + // Not throwing exception from here, since we don't want to fail the entire job + // for a single record + status.markFailure(record, t, recordMetadata); + logger.error("Error writing record " + record, t); } + } - /** - * Perform the actual writing of the given record into the backing file. 
- * - * @param record - */ - public void write(HoodieRecord record) { - Optional recordMetadata = record.getData().getMetadata(); - try { - Optional avroRecord = record.getData().getInsertValue(schema); + /** + * Performs actions to durably, persist the current changes and returns a WriteStatus object + */ + public WriteStatus close() { + logger.info( + "Closing the file " + status.getFileId() + " as we are done with all the records " + + recordsWritten); + try { + storageWriter.close(); - if(avroRecord.isPresent()) { - storageWriter.writeAvroWithMetadata(avroRecord.get(), record); - // update the new location of record, so we know where to find it next - record.setNewLocation(new HoodieRecordLocation(commitTime, status.getFileId())); - recordsWritten++; - } else { - recordsDeleted++; - } - record.deflate(); - status.markSuccess(record, recordMetadata); - } catch (Throwable t) { - // Not throwing exception from here, since we don't want to fail the entire job - // for a single record - status.markFailure(record, t, recordMetadata); - logger.error("Error writing record " + record, t); - } - } - - /** - * Performs actions to durably, persist the current changes and returns a WriteStatus object - * - * @return - */ - public WriteStatus close() { - logger.info( - "Closing the file " + status.getFileId() + " as we are done with all the records " - + recordsWritten); - try { - storageWriter.close(); - - HoodieWriteStat stat = new HoodieWriteStat(); - stat.setNumWrites(recordsWritten); - stat.setNumDeletes(recordsDeleted); - stat.setPrevCommit(HoodieWriteStat.NULL_COMMIT); - stat.setFileId(status.getFileId()); - String relativePath = path.toString().replace(new Path(config.getBasePath()) + "/", ""); - stat.setPath(relativePath); - stat.setTotalWriteBytes(FSUtils.getFileSize(fs, path)); - stat.setTotalWriteErrors(status.getFailedRecords().size()); - status.setStat(stat); - - return status; - } catch (IOException e) { - throw new HoodieInsertException("Failed to close the 
Insert Handle for path " + path, - e); - } + HoodieWriteStat stat = new HoodieWriteStat(); + stat.setNumWrites(recordsWritten); + stat.setNumDeletes(recordsDeleted); + stat.setPrevCommit(HoodieWriteStat.NULL_COMMIT); + stat.setFileId(status.getFileId()); + String relativePath = path.toString().replace(new Path(config.getBasePath()) + "/", ""); + stat.setPath(relativePath); + stat.setTotalWriteBytes(FSUtils.getFileSize(fs, path)); + stat.setTotalWriteErrors(status.getFailedRecords().size()); + status.setStat(stat); + + return status; + } catch (IOException e) { + throw new HoodieInsertException("Failed to close the Insert Handle for path " + path, + e); } + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieIOHandle.java b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieIOHandle.java index 8ba4068ad..53ec545fc 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieIOHandle.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieIOHandle.java @@ -24,6 +24,7 @@ import com.uber.hoodie.common.util.HoodieAvroUtils; import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.exception.HoodieIOException; import com.uber.hoodie.table.HoodieTable; +import java.io.IOException; import org.apache.avro.Schema; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; @@ -31,68 +32,67 @@ import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; -import java.io.IOException; - public abstract class HoodieIOHandle { - private static Logger logger = LogManager.getLogger(HoodieIOHandle.class); - protected final String commitTime; - protected final HoodieWriteConfig config; - protected final FileSystem fs; - protected final HoodieTable hoodieTable; - protected HoodieTimeline hoodieTimeline; - protected TableFileSystemView.ReadOptimizedView fileSystemView; - protected final Schema schema; - public HoodieIOHandle(HoodieWriteConfig config, String commitTime, - 
HoodieTable hoodieTable) { - this.commitTime = commitTime; - this.config = config; - this.fs = FSUtils.getFs(); - this.hoodieTable = hoodieTable; - this.hoodieTimeline = hoodieTable.getCompletedCommitTimeline(); - this.fileSystemView = hoodieTable.getROFileSystemView(); - this.schema = - HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema())); + private static Logger logger = LogManager.getLogger(HoodieIOHandle.class); + protected final String commitTime; + protected final HoodieWriteConfig config; + protected final FileSystem fs; + protected final HoodieTable hoodieTable; + protected HoodieTimeline hoodieTimeline; + protected TableFileSystemView.ReadOptimizedView fileSystemView; + protected final Schema schema; + + public HoodieIOHandle(HoodieWriteConfig config, String commitTime, + HoodieTable hoodieTable) { + this.commitTime = commitTime; + this.config = config; + this.fs = FSUtils.getFs(); + this.hoodieTable = hoodieTable; + this.hoodieTimeline = hoodieTable.getCompletedCommitTimeline(); + this.fileSystemView = hoodieTable.getROFileSystemView(); + this.schema = + HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema())); + } + + public Path makeNewPath(String partitionPath, int taskPartitionId, String fileName) { + Path path = new Path(config.getBasePath(), partitionPath); + try { + fs.mkdirs(path); // create a new partition as needed. + } catch (IOException e) { + throw new HoodieIOException("Failed to make dir " + path, e); } - public Path makeNewPath(String partitionPath, int taskPartitionId, String fileName) { - Path path = new Path(config.getBasePath(), partitionPath); - try { - fs.mkdirs(path); // create a new partition as needed. 
- } catch (IOException e) { - throw new HoodieIOException("Failed to make dir " + path, e); + return new Path(path.toString(), + FSUtils.makeDataFileName(commitTime, taskPartitionId, fileName)); + } + + /** + * Deletes any new tmp files written during the current commit, into the partition + */ + public static void cleanupTmpFilesFromCurrentCommit(HoodieWriteConfig config, + String commitTime, + String partitionPath, + int taskPartitionId) { + FileSystem fs = FSUtils.getFs(); + try { + FileStatus[] prevFailedFiles = fs.globStatus(new Path(String + .format("%s/%s/%s", config.getBasePath(), partitionPath, + FSUtils.maskWithoutFileId(commitTime, taskPartitionId)))); + if (prevFailedFiles != null) { + logger.info("Deleting " + prevFailedFiles.length + + " files generated by previous failed attempts."); + for (FileStatus status : prevFailedFiles) { + fs.delete(status.getPath(), false); } - - return new Path(path.toString(), - FSUtils.makeDataFileName(commitTime, taskPartitionId, fileName)); + } + } catch (IOException e) { + throw new HoodieIOException("Failed to cleanup Temp files from commit " + commitTime, + e); } + } - /** - * Deletes any new tmp files written during the current commit, into the partition - */ - public static void cleanupTmpFilesFromCurrentCommit(HoodieWriteConfig config, - String commitTime, - String partitionPath, - int taskPartitionId) { - FileSystem fs = FSUtils.getFs(); - try { - FileStatus[] prevFailedFiles = fs.globStatus(new Path(String - .format("%s/%s/%s", config.getBasePath(), partitionPath, - FSUtils.maskWithoutFileId(commitTime, taskPartitionId)))); - if (prevFailedFiles != null) { - logger.info("Deleting " + prevFailedFiles.length - + " files generated by previous failed attempts."); - for (FileStatus status : prevFailedFiles) { - fs.delete(status.getPath(), false); - } - } - } catch (IOException e) { - throw new HoodieIOException("Failed to cleanup Temp files from commit " + commitTime, - e); - } - } - - public Schema getSchema() { - 
return schema; - } + public Schema getSchema() { + return schema; + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieMergeHandle.java b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieMergeHandle.java index 262da60ca..b61b9d9e8 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieMergeHandle.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieMergeHandle.java @@ -16,19 +16,23 @@ package com.uber.hoodie.io; -import com.uber.hoodie.common.model.HoodiePartitionMetadata; -import com.uber.hoodie.common.util.ReflectionUtils; -import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.WriteStatus; +import com.uber.hoodie.common.model.HoodiePartitionMetadata; import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecordLocation; import com.uber.hoodie.common.model.HoodieRecordPayload; import com.uber.hoodie.common.model.HoodieWriteStat; import com.uber.hoodie.common.util.FSUtils; +import com.uber.hoodie.common.util.ReflectionUtils; +import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.exception.HoodieUpsertException; import com.uber.hoodie.io.storage.HoodieStorageWriter; import com.uber.hoodie.io.storage.HoodieStorageWriterFactory; import com.uber.hoodie.table.HoodieTable; +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Optional; import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; import org.apache.hadoop.fs.Path; @@ -36,197 +40,197 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.spark.TaskContext; -import java.io.IOException; -import java.util.HashMap; -import java.util.Iterator; -import java.util.Optional; - @SuppressWarnings("Duplicates") public class HoodieMergeHandle extends HoodieIOHandle { - private static Logger logger = LogManager.getLogger(HoodieMergeHandle.class); - private WriteStatus writeStatus; - private 
HashMap> keyToNewRecords; - private HoodieStorageWriter storageWriter; - private Path newFilePath; - private Path oldFilePath; - private long recordsWritten = 0; - private long recordsDeleted = 0; - private long updatedRecordsWritten = 0; + private static Logger logger = LogManager.getLogger(HoodieMergeHandle.class); - public HoodieMergeHandle(HoodieWriteConfig config, - String commitTime, - HoodieTable hoodieTable, - Iterator> recordItr, - String fileId) { - super(config, commitTime, hoodieTable); - init(fileId, recordItr); - } + private WriteStatus writeStatus; + private HashMap> keyToNewRecords; + private HoodieStorageWriter storageWriter; + private Path newFilePath; + private Path oldFilePath; + private long recordsWritten = 0; + private long recordsDeleted = 0; + private long updatedRecordsWritten = 0; - /** - * Load the new incoming records in a map, and extract the old file path. - */ - private void init(String fileId, Iterator> newRecordsItr) { - WriteStatus writeStatus = ReflectionUtils.loadClass(config.getWriteStatusClassName()); - writeStatus.setStat(new HoodieWriteStat()); - this.writeStatus = writeStatus; - this.keyToNewRecords = new HashMap<>(); + public HoodieMergeHandle(HoodieWriteConfig config, + String commitTime, + HoodieTable hoodieTable, + Iterator> recordItr, + String fileId) { + super(config, commitTime, hoodieTable); + init(fileId, recordItr); + } - try { - // Load the new records in a map - while (newRecordsItr.hasNext()) { - HoodieRecord record = newRecordsItr.next(); - // If the first record, we need to extract some info out - if (oldFilePath == null) { - String latestValidFilePath = fileSystemView - .getLatestDataFiles(record.getPartitionPath()) - .filter(dataFile -> dataFile.getFileId().equals(fileId)) - .findFirst() - .get().getFileName(); - writeStatus.getStat().setPrevCommit(FSUtils.getCommitTime(latestValidFilePath)); + /** + * Load the new incoming records in a map, and extract the old file path. 
+ */ + private void init(String fileId, Iterator> newRecordsItr) { + WriteStatus writeStatus = ReflectionUtils.loadClass(config.getWriteStatusClassName()); + writeStatus.setStat(new HoodieWriteStat()); + this.writeStatus = writeStatus; + this.keyToNewRecords = new HashMap<>(); - HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(fs, - commitTime, - new Path(config.getBasePath()), - new Path(config.getBasePath(), record.getPartitionPath())); - partitionMetadata.trySave(TaskContext.getPartitionId()); + try { + // Load the new records in a map + while (newRecordsItr.hasNext()) { + HoodieRecord record = newRecordsItr.next(); + // If the first record, we need to extract some info out + if (oldFilePath == null) { + String latestValidFilePath = fileSystemView + .getLatestDataFiles(record.getPartitionPath()) + .filter(dataFile -> dataFile.getFileId().equals(fileId)) + .findFirst() + .get().getFileName(); + writeStatus.getStat().setPrevCommit(FSUtils.getCommitTime(latestValidFilePath)); - oldFilePath = new Path( - config.getBasePath() + "/" + record.getPartitionPath() + "/" - + latestValidFilePath); - String relativePath = new Path( record.getPartitionPath() + "/" + FSUtils - .makeDataFileName(commitTime, TaskContext.getPartitionId(), fileId)).toString(); - newFilePath = new Path(config.getBasePath(), relativePath); + HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(fs, + commitTime, + new Path(config.getBasePath()), + new Path(config.getBasePath(), record.getPartitionPath())); + partitionMetadata.trySave(TaskContext.getPartitionId()); - // handle cases of partial failures, for update task - if (fs.exists(newFilePath)) { - fs.delete(newFilePath, false); - } + oldFilePath = new Path( + config.getBasePath() + "/" + record.getPartitionPath() + "/" + + latestValidFilePath); + String relativePath = new Path(record.getPartitionPath() + "/" + FSUtils + .makeDataFileName(commitTime, TaskContext.getPartitionId(), fileId)).toString(); + 
newFilePath = new Path(config.getBasePath(), relativePath); - logger.info(String.format("Merging new data into oldPath %s, as newPath %s", - oldFilePath.toString(), newFilePath.toString())); - // file name is same for all records, in this bunch - writeStatus.setFileId(fileId); - writeStatus.setPartitionPath(record.getPartitionPath()); - writeStatus.getStat().setFileId(fileId); - writeStatus.getStat().setPath(relativePath); - } - keyToNewRecords.put(record.getRecordKey(), record); - // update the new location of the record, so we know where to find it next - record.setNewLocation(new HoodieRecordLocation(commitTime, fileId)); - } - // Create the writer for writing the new version file - storageWriter = HoodieStorageWriterFactory - .getStorageWriter(commitTime, newFilePath, hoodieTable, config, schema); + // handle cases of partial failures, for update task + if (fs.exists(newFilePath)) { + fs.delete(newFilePath, false); + } - } catch (Exception e) { - logger.error("Error in update task at commit " + commitTime, e); - writeStatus.setGlobalError(e); - throw new HoodieUpsertException( - "Failed to initialize HoodieUpdateHandle for FileId: " + fileId + " on commit " - + commitTime + " on path " + hoodieTable.getMetaClient().getBasePath(), e); + logger.info(String.format("Merging new data into oldPath %s, as newPath %s", + oldFilePath.toString(), newFilePath.toString())); + // file name is same for all records, in this bunch + writeStatus.setFileId(fileId); + writeStatus.setPartitionPath(record.getPartitionPath()); + writeStatus.getStat().setFileId(fileId); + writeStatus.getStat().setPath(relativePath); } + keyToNewRecords.put(record.getRecordKey(), record); + // update the new location of the record, so we know where to find it next + record.setNewLocation(new HoodieRecordLocation(commitTime, fileId)); + } + // Create the writer for writing the new version file + storageWriter = HoodieStorageWriterFactory + .getStorageWriter(commitTime, newFilePath, hoodieTable, config, 
schema); + + } catch (Exception e) { + logger.error("Error in update task at commit " + commitTime, e); + writeStatus.setGlobalError(e); + throw new HoodieUpsertException( + "Failed to initialize HoodieUpdateHandle for FileId: " + fileId + " on commit " + + commitTime + " on path " + hoodieTable.getMetaClient().getBasePath(), e); } + } - private boolean writeUpdateRecord(HoodieRecord hoodieRecord, Optional indexedRecord) { - Optional recordMetadata = hoodieRecord.getData().getMetadata(); - try { - if(indexedRecord.isPresent()) { - storageWriter.writeAvroWithMetadata(indexedRecord.get(), hoodieRecord); - recordsWritten++; - updatedRecordsWritten++; - } else { - recordsDeleted++; - } + private boolean writeUpdateRecord(HoodieRecord hoodieRecord, + Optional indexedRecord) { + Optional recordMetadata = hoodieRecord.getData().getMetadata(); + try { + if (indexedRecord.isPresent()) { + storageWriter.writeAvroWithMetadata(indexedRecord.get(), hoodieRecord); + recordsWritten++; + updatedRecordsWritten++; + } else { + recordsDeleted++; + } - hoodieRecord.deflate(); - writeStatus.markSuccess(hoodieRecord, recordMetadata); - return true; - } catch (Exception e) { - logger.error("Error writing record "+ hoodieRecord, e); - writeStatus.markFailure(hoodieRecord, e, recordMetadata); - } - return false; + hoodieRecord.deflate(); + writeStatus.markSuccess(hoodieRecord, recordMetadata); + return true; + } catch (Exception e) { + logger.error("Error writing record " + hoodieRecord, e); + writeStatus.markFailure(hoodieRecord, e, recordMetadata); } + return false; + } - /** - * Go through an old record. Here if we detect a newer version shows up, we write the new one to the file. 
- */ - public void write(GenericRecord oldRecord) { - String key = oldRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); - HoodieRecord hoodieRecord = keyToNewRecords.get(key); - boolean copyOldRecord = true; - if (keyToNewRecords.containsKey(key)) { - try { - Optional combinedAvroRecord = hoodieRecord.getData().combineAndGetUpdateValue(oldRecord, schema); - if (writeUpdateRecord(hoodieRecord, combinedAvroRecord)) { + /** + * Go through an old record. Here if we detect a newer version shows up, we write the new one to + * the file. + */ + public void write(GenericRecord oldRecord) { + String key = oldRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); + HoodieRecord hoodieRecord = keyToNewRecords.get(key); + boolean copyOldRecord = true; + if (keyToNewRecords.containsKey(key)) { + try { + Optional combinedAvroRecord = hoodieRecord.getData() + .combineAndGetUpdateValue(oldRecord, schema); + if (writeUpdateRecord(hoodieRecord, combinedAvroRecord)) { /* ONLY WHEN * 1) we have an update for this key AND * 2) We are able to successfully write the the combined new value * * We no longer need to copy the old record over. 
*/ - copyOldRecord = false; - } - keyToNewRecords.remove(key); - } catch (Exception e) { - throw new HoodieUpsertException("Failed to combine/merge new record with old value in storage, for new record {" - + keyToNewRecords.get(key) + "}, old value {" + oldRecord + "}", e); - } - } - - if (copyOldRecord) { - // this should work as it is, since this is an existing record - String errMsg = "Failed to merge old record into new file for key " + key + " from old file " - + getOldFilePath() + " to new file " + newFilePath; - try { - storageWriter.writeAvro(key, oldRecord); - } catch (ClassCastException e) { - logger.error( - "Schema mismatch when rewriting old record " + oldRecord + " from file " - + getOldFilePath() + " to file " + newFilePath + " with schema " + schema - .toString(true)); - throw new HoodieUpsertException(errMsg, e); - } catch (IOException e) { - logger.error("Failed to merge old record into new file for key " + key + " from old file " - + getOldFilePath() + " to new file " + newFilePath, e); - throw new HoodieUpsertException(errMsg, e); - } - recordsWritten ++; + copyOldRecord = false; } + keyToNewRecords.remove(key); + } catch (Exception e) { + throw new HoodieUpsertException( + "Failed to combine/merge new record with old value in storage, for new record {" + + keyToNewRecords.get(key) + "}, old value {" + oldRecord + "}", e); + } } - public void close() { - try { - // write out any pending records (this can happen when inserts are turned into updates) - Iterator pendingRecordsItr = keyToNewRecords.keySet().iterator(); - while (pendingRecordsItr.hasNext()) { - String key = pendingRecordsItr.next(); - HoodieRecord hoodieRecord = keyToNewRecords.get(key); - writeUpdateRecord(hoodieRecord, hoodieRecord.getData().getInsertValue(schema)); - } - keyToNewRecords.clear(); - - if (storageWriter != null) { - storageWriter.close(); - } - - writeStatus.getStat().setTotalWriteBytes(FSUtils.getFileSize(fs, newFilePath)); - 
writeStatus.getStat().setNumWrites(recordsWritten); - writeStatus.getStat().setNumDeletes(recordsDeleted); - writeStatus.getStat().setNumUpdateWrites(updatedRecordsWritten); - writeStatus.getStat().setTotalWriteErrors(writeStatus.getFailedRecords().size()); - } catch (IOException e) { - throw new HoodieUpsertException("Failed to close UpdateHandle", e); - } + if (copyOldRecord) { + // this should work as it is, since this is an existing record + String errMsg = "Failed to merge old record into new file for key " + key + " from old file " + + getOldFilePath() + " to new file " + newFilePath; + try { + storageWriter.writeAvro(key, oldRecord); + } catch (ClassCastException e) { + logger.error( + "Schema mismatch when rewriting old record " + oldRecord + " from file " + + getOldFilePath() + " to file " + newFilePath + " with schema " + schema + .toString(true)); + throw new HoodieUpsertException(errMsg, e); + } catch (IOException e) { + logger.error("Failed to merge old record into new file for key " + key + " from old file " + + getOldFilePath() + " to new file " + newFilePath, e); + throw new HoodieUpsertException(errMsg, e); + } + recordsWritten++; } + } - public Path getOldFilePath() { - return oldFilePath; - } + public void close() { + try { + // write out any pending records (this can happen when inserts are turned into updates) + Iterator pendingRecordsItr = keyToNewRecords.keySet().iterator(); + while (pendingRecordsItr.hasNext()) { + String key = pendingRecordsItr.next(); + HoodieRecord hoodieRecord = keyToNewRecords.get(key); + writeUpdateRecord(hoodieRecord, hoodieRecord.getData().getInsertValue(schema)); + } + keyToNewRecords.clear(); - public WriteStatus getWriteStatus() { - return writeStatus; + if (storageWriter != null) { + storageWriter.close(); + } + + writeStatus.getStat().setTotalWriteBytes(FSUtils.getFileSize(fs, newFilePath)); + writeStatus.getStat().setNumWrites(recordsWritten); + writeStatus.getStat().setNumDeletes(recordsDeleted); + 
writeStatus.getStat().setNumUpdateWrites(updatedRecordsWritten); + writeStatus.getStat().setTotalWriteErrors(writeStatus.getFailedRecords().size()); + } catch (IOException e) { + throw new HoodieUpsertException("Failed to close UpdateHandle", e); } + } + + public Path getOldFilePath() { + return oldFilePath; + } + + public WriteStatus getWriteStatus() { + return writeStatus; + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/compact/CompactionOperation.java b/hoodie-client/src/main/java/com/uber/hoodie/io/compact/CompactionOperation.java index 55eb4c364..7304c11d8 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/io/compact/CompactionOperation.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/compact/CompactionOperation.java @@ -18,7 +18,6 @@ package com.uber.hoodie.io.compact; import com.uber.hoodie.common.model.HoodieDataFile; import com.uber.hoodie.common.model.HoodieLogFile; - import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.io.compact.strategy.CompactionStrategy; import java.io.Serializable; @@ -27,8 +26,8 @@ import java.util.Map; import java.util.stream.Collectors; /** - * Encapsulates all the needed information about a compaction - * and make a decision whether this compaction is effective or not + * Encapsulates all the needed information about a compaction and make a decision whether this + * compaction is effective or not * * @see CompactionStrategy */ diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/compact/HoodieCompactor.java b/hoodie-client/src/main/java/com/uber/hoodie/io/compact/HoodieCompactor.java index 8032cec41..e192a2416 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/io/compact/HoodieCompactor.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/compact/HoodieCompactor.java @@ -22,29 +22,28 @@ import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline; import com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.config.HoodieWriteConfig; 
import com.uber.hoodie.table.HoodieTable; -import org.apache.spark.api.java.JavaSparkContext; - import java.io.Serializable; import java.util.Date; +import org.apache.spark.api.java.JavaSparkContext; /** * A HoodieCompactor runs compaction on a hoodie table */ public interface HoodieCompactor extends Serializable { - /** - * Compact the delta files with the data files - * @throws Exception - */ - HoodieCompactionMetadata compact(JavaSparkContext jsc, final HoodieWriteConfig config, - HoodieTable hoodieTable) throws Exception; + + /** + * Compact the delta files with the data files + */ + HoodieCompactionMetadata compact(JavaSparkContext jsc, final HoodieWriteConfig config, + HoodieTable hoodieTable) throws Exception; - // Helper methods - default String startCompactionCommit(HoodieTable hoodieTable) { - String commitTime = HoodieActiveTimeline.COMMIT_FORMATTER.format(new Date()); - HoodieActiveTimeline activeTimeline = hoodieTable.getActiveTimeline(); - activeTimeline - .createInflight(new HoodieInstant(true, HoodieTimeline.COMPACTION_ACTION, commitTime)); - return commitTime; - } + // Helper methods + default String startCompactionCommit(HoodieTable hoodieTable) { + String commitTime = HoodieActiveTimeline.COMMIT_FORMATTER.format(new Date()); + HoodieActiveTimeline activeTimeline = hoodieTable.getActiveTimeline(); + activeTimeline + .createInflight(new HoodieInstant(true, HoodieTimeline.COMPACTION_ACTION, commitTime)); + return commitTime; + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/compact/HoodieRealtimeTableCompactor.java b/hoodie-client/src/main/java/com/uber/hoodie/io/compact/HoodieRealtimeTableCompactor.java index be039570d..7db2da2f7 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/io/compact/HoodieRealtimeTableCompactor.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/compact/HoodieRealtimeTableCompactor.java @@ -16,14 +16,14 @@ package com.uber.hoodie.io.compact; +import static java.util.stream.Collectors.toList; + 
import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import com.google.common.collect.Sets; import com.uber.hoodie.WriteStatus; import com.uber.hoodie.common.model.CompactionWriteStat; -import com.uber.hoodie.common.model.HoodieAvroPayload; import com.uber.hoodie.common.model.HoodieCompactionMetadata; -import com.uber.hoodie.common.model.HoodieRecordPayload; import com.uber.hoodie.common.model.HoodieTableType; import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.table.HoodieTimeline; @@ -36,7 +36,12 @@ import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.exception.HoodieCompactionException; import com.uber.hoodie.table.HoodieCopyOnWriteTable; import com.uber.hoodie.table.HoodieTable; +import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.util.Collection; +import java.util.Iterator; +import java.util.List; +import java.util.Optional; import java.util.stream.Collectors; import java.util.stream.StreamSupport; import org.apache.avro.Schema; @@ -46,18 +51,10 @@ import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.FlatMapFunction; -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.util.Iterator; -import java.util.List; -import java.util.Optional; - -import static java.util.stream.Collectors.*; - /** - * HoodieRealtimeTableCompactor compacts a hoodie table with merge on read storage. - * Computes all possible compactions, passes it through a CompactionFilter and executes - * all the compactions and writes a new version of base files and make a normal commit + * HoodieRealtimeTableCompactor compacts a hoodie table with merge on read storage. 
Computes all + * possible compactions, passes it through a CompactionFilter and executes all the compactions and + * writes a new version of base files and make a normal commit * * @see HoodieCompactor */ @@ -80,7 +77,8 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor { String compactionCommit = startCompactionCommit(hoodieTable); log.info("Compacting " + metaClient.getBasePath() + " with commit " + compactionCommit); List partitionPaths = - FSUtils.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(), config.shouldAssumeDatePartitioning()); + FSUtils.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(), + config.shouldAssumeDatePartitioning()); log.info("Compaction looking for files to compact in " + partitionPaths + " partitions"); List operations = @@ -89,7 +87,7 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor { .getRTFileSystemView() .getLatestFileSlices(partitionPath) .map(s -> new CompactionOperation(s.getDataFile().get(), - partitionPath, s.getLogFiles().collect(Collectors.toList()), config)) + partitionPath, s.getLogFiles().collect(Collectors.toList()), config)) .collect(toList()).iterator()).collect(); log.info("Total of " + operations.size() + " compactions are retrieved"); @@ -150,14 +148,15 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor { // Since a DeltaCommit is not defined yet, reading all the records. revisit this soon. 
String maxInstantTime = metaClient.getActiveTimeline() - .getTimelineOfActions( - Sets.newHashSet(HoodieTimeline.COMMIT_ACTION, - HoodieTimeline.COMPACTION_ACTION, - HoodieTimeline.DELTA_COMMIT_ACTION)) - .filterCompletedInstants().lastInstant().get().getTimestamp(); + .getTimelineOfActions( + Sets.newHashSet(HoodieTimeline.COMMIT_ACTION, + HoodieTimeline.COMPACTION_ACTION, + HoodieTimeline.DELTA_COMMIT_ACTION)) + .filterCompletedInstants().lastInstant().get().getTimestamp(); - HoodieCompactedLogRecordScanner scanner = new HoodieCompactedLogRecordScanner(fs, metaClient.getBasePath(), - operation.getDeltaFilePaths(), readerSchema, maxInstantTime); + HoodieCompactedLogRecordScanner scanner = new HoodieCompactedLogRecordScanner(fs, + metaClient.getBasePath(), + operation.getDeltaFilePaths(), readerSchema, maxInstantTime); if (!scanner.iterator().hasNext()) { return Lists.newArrayList(); } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/compact/strategy/BoundedIOCompactionStrategy.java b/hoodie-client/src/main/java/com/uber/hoodie/io/compact/strategy/BoundedIOCompactionStrategy.java index 697062616..676b3e10e 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/io/compact/strategy/BoundedIOCompactionStrategy.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/compact/strategy/BoundedIOCompactionStrategy.java @@ -28,8 +28,8 @@ import java.util.Map; import java.util.Optional; /** - * CompactionStrategy which looks at total IO to be done for the compaction (read + write) - * and limits the list of compactions to be under a configured limit on the IO + * CompactionStrategy which looks at total IO to be done for the compaction (read + write) and + * limits the list of compactions to be under a configured limit on the IO * * @see CompactionStrategy */ @@ -46,7 +46,7 @@ public class BoundedIOCompactionStrategy implements CompactionStrategy { // Total size of all the log files Long totalLogFileSize = 
logFiles.stream().map(HoodieLogFile::getFileSize).filter( Optional::isPresent).map(Optional::get).reduce( - (size1, size2) -> size1 + size2).orElse(0L); + (size1, size2) -> size1 + size2).orElse(0L); // Total read will be the base file + all the log files Long totalIORead = FSUtils.getSizeInMB(dataFile.getFileSize() + totalLogFileSize); // Total write will be similar to the size of the base file @@ -62,7 +62,8 @@ public class BoundedIOCompactionStrategy implements CompactionStrategy { } @Override - public List orderAndFilter(HoodieWriteConfig writeConfig, List operations) { + public List orderAndFilter(HoodieWriteConfig writeConfig, + List operations) { // Iterate through the operations in order and accept operations as long as we are within the IO limit // Preserves the original ordering of compactions List finalOperations = Lists.newArrayList(); diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/compact/strategy/CompactionStrategy.java b/hoodie-client/src/main/java/com/uber/hoodie/io/compact/strategy/CompactionStrategy.java index bb452d326..8486774d7 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/io/compact/strategy/CompactionStrategy.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/compact/strategy/CompactionStrategy.java @@ -25,12 +25,12 @@ import java.util.List; import java.util.Map; /** - * Strategy for compaction. Pluggable implementation of define how compaction should be done. - * The implementations of this interface can capture the relevant metrics to order and filter - * the final list of compaction operation to run in a single compaction. + * Strategy for compaction. Pluggable implementation of define how compaction should be done. The + * implementations of this interface can capture the relevant metrics to order and filter the final + * list of compaction operation to run in a single compaction. * - * Implementation of CompactionStrategy cannot hold any state. 
- * Difference instantiations can be passed in every time + * Implementation of CompactionStrategy cannot hold any state. Difference instantiations can be + * passed in every time * * @see com.uber.hoodie.io.compact.HoodieRealtimeTableCompactor * @see CompactionOperation @@ -38,8 +38,8 @@ import java.util.Map; public interface CompactionStrategy extends Serializable { /** - * Callback hook when a CompactionOperation is created. Individual strategies can - * capture the metrics they need to decide on the priority. + * Callback hook when a CompactionOperation is created. Individual strategies can capture the + * metrics they need to decide on the priority. * * @param dataFile - Base file to compact * @param partitionPath - Partition path @@ -50,8 +50,8 @@ public interface CompactionStrategy extends Serializable { List logFiles); /** - * Order and Filter the list of compactions. Use the metrics captured with the - * captureMetrics to order and filter out compactions + * Order and Filter the list of compactions. 
Use the metrics captured with the captureMetrics to + * order and filter out compactions * * @param writeConfig - HoodieWriteConfig - config for this compaction is passed in * @param operations - list of compactions collected diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/compact/strategy/LogFileSizeBasedCompactionStrategy.java b/hoodie-client/src/main/java/com/uber/hoodie/io/compact/strategy/LogFileSizeBasedCompactionStrategy.java index 60fcf2f7f..2fabc6596 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/io/compact/strategy/LogFileSizeBasedCompactionStrategy.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/compact/strategy/LogFileSizeBasedCompactionStrategy.java @@ -27,8 +27,8 @@ import java.util.Optional; import java.util.stream.Collectors; /** - * LogFileSizeBasedCompactionStrategy orders the compactions based on the total log files size - * and limits the compactions within a configured IO bound + * LogFileSizeBasedCompactionStrategy orders the compactions based on the total log files size and + * limits the compactions within a configured IO bound * * @see BoundedIOCompactionStrategy * @see CompactionStrategy diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/compact/strategy/UnBoundedCompactionStrategy.java b/hoodie-client/src/main/java/com/uber/hoodie/io/compact/strategy/UnBoundedCompactionStrategy.java index c3b145e11..08f46019f 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/io/compact/strategy/UnBoundedCompactionStrategy.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/compact/strategy/UnBoundedCompactionStrategy.java @@ -25,9 +25,9 @@ import java.util.List; import java.util.Map; /** - * UnBoundedCompactionStrategy will not change ordering or filter any compaction. - * It is a pass-through and will compact all the base files which has a log file. - * This usually means no-intelligence on compaction. + * UnBoundedCompactionStrategy will not change ordering or filter any compaction. 
It is a + * pass-through and will compact all the base files which has a log file. This usually means + * no-intelligence on compaction. * * @see CompactionStrategy */ diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieParquetConfig.java b/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieParquetConfig.java index 363bbb78f..d2af03047 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieParquetConfig.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieParquetConfig.java @@ -17,50 +17,50 @@ package com.uber.hoodie.io.storage; import com.uber.hoodie.avro.HoodieAvroWriteSupport; -import org.apache.avro.Schema; import org.apache.hadoop.conf.Configuration; import org.apache.parquet.hadoop.metadata.CompressionCodecName; public class HoodieParquetConfig { - private HoodieAvroWriteSupport writeSupport; - private CompressionCodecName compressionCodecName; - private int blockSize; - private int pageSize; - private int maxFileSize; - private Configuration hadoopConf; - public HoodieParquetConfig(HoodieAvroWriteSupport writeSupport, - CompressionCodecName compressionCodecName, int blockSize, int pageSize, int maxFileSize, - Configuration hadoopConf) { - this.writeSupport = writeSupport; - this.compressionCodecName = compressionCodecName; - this.blockSize = blockSize; - this.pageSize = pageSize; - this.maxFileSize = maxFileSize; - this.hadoopConf = hadoopConf; - } + private HoodieAvroWriteSupport writeSupport; + private CompressionCodecName compressionCodecName; + private int blockSize; + private int pageSize; + private int maxFileSize; + private Configuration hadoopConf; - public HoodieAvroWriteSupport getWriteSupport() { - return writeSupport; - } + public HoodieParquetConfig(HoodieAvroWriteSupport writeSupport, + CompressionCodecName compressionCodecName, int blockSize, int pageSize, int maxFileSize, + Configuration hadoopConf) { + this.writeSupport = writeSupport; + this.compressionCodecName = 
compressionCodecName; + this.blockSize = blockSize; + this.pageSize = pageSize; + this.maxFileSize = maxFileSize; + this.hadoopConf = hadoopConf; + } - public CompressionCodecName getCompressionCodecName() { - return compressionCodecName; - } + public HoodieAvroWriteSupport getWriteSupport() { + return writeSupport; + } - public int getBlockSize() { - return blockSize; - } + public CompressionCodecName getCompressionCodecName() { + return compressionCodecName; + } - public int getPageSize() { - return pageSize; - } + public int getBlockSize() { + return blockSize; + } - public int getMaxFileSize() { - return maxFileSize; - } + public int getPageSize() { + return pageSize; + } - public Configuration getHadoopConf() { - return hadoopConf; - } + public int getMaxFileSize() { + return maxFileSize; + } + + public Configuration getHadoopConf() { + return hadoopConf; + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieParquetWriter.java b/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieParquetWriter.java index ca3ed5709..2e207fd7f 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieParquetWriter.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieParquetWriter.java @@ -20,6 +20,8 @@ import com.uber.hoodie.avro.HoodieAvroWriteSupport; import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecordPayload; import com.uber.hoodie.common.util.HoodieAvroUtils; +import java.io.IOException; +import java.util.concurrent.atomic.AtomicLong; import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; @@ -30,79 +32,76 @@ import org.apache.parquet.hadoop.ParquetFileWriter; import org.apache.parquet.hadoop.ParquetWriter; import org.apache.spark.TaskContext; -import java.io.IOException; -import java.util.concurrent.atomic.AtomicLong; - /** - * HoodieParquetWriter extends the ParquetWriter to help limit the 
size of underlying file. - * Provides a way to check if the current file can take more records with the canWrite() - * - * @param + * HoodieParquetWriter extends the ParquetWriter to help limit the size of underlying file. Provides + * a way to check if the current file can take more records with the canWrite() */ public class HoodieParquetWriter extends ParquetWriter implements HoodieStorageWriter { - private static double STREAM_COMPRESSION_RATIO = 0.1; - private static AtomicLong recordIndex = new AtomicLong(1); + + private static double STREAM_COMPRESSION_RATIO = 0.1; + private static AtomicLong recordIndex = new AtomicLong(1); - private final Path file; - private final HoodieWrapperFileSystem fs; - private final long maxFileSize; - private final HoodieAvroWriteSupport writeSupport; - private final String commitTime; - private final Schema schema; + private final Path file; + private final HoodieWrapperFileSystem fs; + private final long maxFileSize; + private final HoodieAvroWriteSupport writeSupport; + private final String commitTime; + private final Schema schema; - private static Configuration registerFileSystem(Configuration conf) { - Configuration returnConf = new Configuration(conf); - String scheme = FileSystem.getDefaultUri(conf).getScheme(); - returnConf.set("fs." + HoodieWrapperFileSystem.getHoodieScheme(scheme) + ".impl", - HoodieWrapperFileSystem.class.getName()); - return returnConf; - } + private static Configuration registerFileSystem(Configuration conf) { + Configuration returnConf = new Configuration(conf); + String scheme = FileSystem.getDefaultUri(conf).getScheme(); + returnConf.set("fs." 
+ HoodieWrapperFileSystem.getHoodieScheme(scheme) + ".impl", + HoodieWrapperFileSystem.class.getName()); + return returnConf; + } - public HoodieParquetWriter(String commitTime, Path file, - HoodieParquetConfig parquetConfig, Schema schema) throws IOException { - super(HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()), - ParquetFileWriter.Mode.CREATE, parquetConfig.getWriteSupport(), - parquetConfig.getCompressionCodecName(), parquetConfig.getBlockSize(), - parquetConfig.getPageSize(), parquetConfig.getPageSize(), - ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED, - ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED, ParquetWriter.DEFAULT_WRITER_VERSION, - registerFileSystem(parquetConfig.getHadoopConf())); - this.file = - HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()); - this.fs = (HoodieWrapperFileSystem) this.file - .getFileSystem(registerFileSystem(parquetConfig.getHadoopConf())); - // We cannot accurately measure the snappy compressed output file size. 
We are choosing a conservative 10% - // TODO - compute this compression ratio dynamically by looking at the bytes written to the stream and the actual file size reported by HDFS - this.maxFileSize = parquetConfig.getMaxFileSize() + Math - .round(parquetConfig.getMaxFileSize() * STREAM_COMPRESSION_RATIO); - this.writeSupport = parquetConfig.getWriteSupport(); - this.commitTime = commitTime; - this.schema = schema; - } + public HoodieParquetWriter(String commitTime, Path file, + HoodieParquetConfig parquetConfig, Schema schema) throws IOException { + super(HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()), + ParquetFileWriter.Mode.CREATE, parquetConfig.getWriteSupport(), + parquetConfig.getCompressionCodecName(), parquetConfig.getBlockSize(), + parquetConfig.getPageSize(), parquetConfig.getPageSize(), + ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED, + ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED, ParquetWriter.DEFAULT_WRITER_VERSION, + registerFileSystem(parquetConfig.getHadoopConf())); + this.file = + HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()); + this.fs = (HoodieWrapperFileSystem) this.file + .getFileSystem(registerFileSystem(parquetConfig.getHadoopConf())); + // We cannot accurately measure the snappy compressed output file size. 
We are choosing a conservative 10% + // TODO - compute this compression ratio dynamically by looking at the bytes written to the stream and the actual file size reported by HDFS + this.maxFileSize = parquetConfig.getMaxFileSize() + Math + .round(parquetConfig.getMaxFileSize() * STREAM_COMPRESSION_RATIO); + this.writeSupport = parquetConfig.getWriteSupport(); + this.commitTime = commitTime; + this.schema = schema; + } - @Override - public void writeAvroWithMetadata(R avroRecord, HoodieRecord record) throws IOException { - String seqId = HoodieRecord.generateSequenceId(commitTime, TaskContext.getPartitionId(), - recordIndex.getAndIncrement()); - HoodieAvroUtils.addHoodieKeyToRecord((GenericRecord) avroRecord, - record.getRecordKey(), - record.getPartitionPath(), - file.getName()); - HoodieAvroUtils.addCommitMetadataToRecord((GenericRecord) avroRecord, commitTime, seqId); - super.write(avroRecord); - writeSupport.add(record.getRecordKey()); - } + @Override + public void writeAvroWithMetadata(R avroRecord, HoodieRecord record) throws IOException { + String seqId = HoodieRecord.generateSequenceId(commitTime, TaskContext.getPartitionId(), + recordIndex.getAndIncrement()); + HoodieAvroUtils.addHoodieKeyToRecord((GenericRecord) avroRecord, + record.getRecordKey(), + record.getPartitionPath(), + file.getName()); + HoodieAvroUtils.addCommitMetadataToRecord((GenericRecord) avroRecord, commitTime, seqId); + super.write(avroRecord); + writeSupport.add(record.getRecordKey()); + } - public boolean canWrite() { - return fs.getBytesWritten(file) < maxFileSize; - } + public boolean canWrite() { + return fs.getBytesWritten(file) < maxFileSize; + } - @Override public void writeAvro(String key, IndexedRecord object) throws IOException { - super.write(object); - writeSupport.add(key); - } + @Override + public void writeAvro(String key, IndexedRecord object) throws IOException { + super.write(object); + writeSupport.add(key); + } } diff --git 
a/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieStorageWriter.java b/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieStorageWriter.java index e4fcdc335..f3a39ffb5 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieStorageWriter.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieStorageWriter.java @@ -17,13 +17,16 @@ package com.uber.hoodie.io.storage; import com.uber.hoodie.common.model.HoodieRecord; +import java.io.IOException; import org.apache.avro.generic.IndexedRecord; -import java.io.IOException; - public interface HoodieStorageWriter { - void writeAvroWithMetadata(R newRecord, HoodieRecord record) throws IOException; - boolean canWrite(); - void close() throws IOException; - void writeAvro(String key, R oldRecord) throws IOException; + + void writeAvroWithMetadata(R newRecord, HoodieRecord record) throws IOException; + + boolean canWrite(); + + void close() throws IOException; + + void writeAvro(String key, R oldRecord) throws IOException; } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieStorageWriterFactory.java b/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieStorageWriterFactory.java index b9084dc61..2803fc8f9 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieStorageWriterFactory.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieStorageWriterFactory.java @@ -16,42 +16,42 @@ package com.uber.hoodie.io.storage; -import com.uber.hoodie.common.table.HoodieTableMetaClient; -import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.avro.HoodieAvroWriteSupport; import com.uber.hoodie.common.BloomFilter; import com.uber.hoodie.common.model.HoodieRecordPayload; import com.uber.hoodie.common.util.FSUtils; +import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.table.HoodieTable; +import java.io.IOException; import org.apache.avro.Schema; import 
org.apache.avro.generic.IndexedRecord; import org.apache.hadoop.fs.Path; import org.apache.parquet.avro.AvroSchemaConverter; import org.apache.parquet.hadoop.metadata.CompressionCodecName; -import java.io.IOException; - public class HoodieStorageWriterFactory { - public static HoodieStorageWriter getStorageWriter( - String commitTime, Path path, HoodieTable hoodieTable, HoodieWriteConfig config, Schema schema) - throws IOException { - //TODO - based on the metadata choose the implementation of HoodieStorageWriter - // Currently only parquet is supported - return newParquetStorageWriter(commitTime, path, config, schema); - } - private static HoodieStorageWriter newParquetStorageWriter( - String commitTime, Path path, HoodieWriteConfig config, Schema schema) throws IOException { - BloomFilter filter = - new BloomFilter(config.getBloomFilterNumEntries(), config.getBloomFilterFPP()); - HoodieAvroWriteSupport writeSupport = - new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, filter); + public static HoodieStorageWriter getStorageWriter( + String commitTime, Path path, HoodieTable hoodieTable, HoodieWriteConfig config, + Schema schema) + throws IOException { + //TODO - based on the metadata choose the implementation of HoodieStorageWriter + // Currently only parquet is supported + return newParquetStorageWriter(commitTime, path, config, schema); + } - HoodieParquetConfig parquetConfig = - new HoodieParquetConfig(writeSupport, CompressionCodecName.GZIP, - config.getParquetBlockSize(), config.getParquetPageSize(), - config.getParquetMaxFileSize(), FSUtils.getFs().getConf()); + private static HoodieStorageWriter newParquetStorageWriter( + String commitTime, Path path, HoodieWriteConfig config, Schema schema) throws IOException { + BloomFilter filter = + new BloomFilter(config.getBloomFilterNumEntries(), config.getBloomFilterFPP()); + HoodieAvroWriteSupport writeSupport = + new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), 
schema, filter); - return new HoodieParquetWriter<>(commitTime, path, parquetConfig, schema); - } + HoodieParquetConfig parquetConfig = + new HoodieParquetConfig(writeSupport, CompressionCodecName.GZIP, + config.getParquetBlockSize(), config.getParquetPageSize(), + config.getParquetMaxFileSize(), FSUtils.getFs().getConf()); + + return new HoodieParquetWriter<>(commitTime, path, parquetConfig, schema); + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieWrapperFileSystem.java b/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieWrapperFileSystem.java index 7cd2b8377..87a628447 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieWrapperFileSystem.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieWrapperFileSystem.java @@ -16,17 +16,6 @@ package com.uber.hoodie.io.storage; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.*; -import org.apache.hadoop.fs.permission.AclEntry; -import org.apache.hadoop.fs.permission.AclStatus; -import org.apache.hadoop.fs.permission.FsAction; -import org.apache.hadoop.fs.permission.FsPermission; -import org.apache.hadoop.security.AccessControlException; -import org.apache.hadoop.security.Credentials; -import org.apache.hadoop.security.token.Token; -import org.apache.hadoop.util.Progressable; - import java.io.FileNotFoundException; import java.io.IOException; import java.net.URI; @@ -38,646 +27,784 @@ import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.BlockLocation; +import org.apache.hadoop.fs.ContentSummary; +import org.apache.hadoop.fs.CreateFlag; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileAlreadyExistsException; +import org.apache.hadoop.fs.FileChecksum; +import 
org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FsServerDefaults; +import org.apache.hadoop.fs.FsStatus; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Options; +import org.apache.hadoop.fs.ParentNotDirectoryException; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathFilter; +import org.apache.hadoop.fs.RemoteIterator; +import org.apache.hadoop.fs.UnsupportedFileSystemException; +import org.apache.hadoop.fs.XAttrSetFlag; +import org.apache.hadoop.fs.permission.AclEntry; +import org.apache.hadoop.fs.permission.AclStatus; +import org.apache.hadoop.fs.permission.FsAction; +import org.apache.hadoop.fs.permission.FsPermission; +import org.apache.hadoop.security.AccessControlException; +import org.apache.hadoop.security.Credentials; +import org.apache.hadoop.security.token.Token; +import org.apache.hadoop.util.Progressable; /** - * HoodieWrapperFileSystem wraps the default file system. - * It holds state about the open streams in the file system to support getting the - * written size to each of the open streams. + * HoodieWrapperFileSystem wraps the default file system. It holds state about the open streams in + * the file system to support getting the written size to each of the open streams. */ public class HoodieWrapperFileSystem extends FileSystem { - private static final Set SUPPORT_SCHEMES; - public static final String HOODIE_SCHEME_PREFIX = "hoodie-"; - - static { - SUPPORT_SCHEMES = new HashSet<>(); - SUPPORT_SCHEMES.add("file"); - SUPPORT_SCHEMES.add("hdfs"); - SUPPORT_SCHEMES.add("s3"); - - // Hoodie currently relies on underlying object store being fully - // consistent so only regional buckets should be used. 
- SUPPORT_SCHEMES.add("gs"); - SUPPORT_SCHEMES.add("viewfs"); - } - - private ConcurrentMap openStreams = - new ConcurrentHashMap<>(); - private FileSystem fileSystem; - private URI uri; - - @Override public void initialize(URI uri, Configuration conf) throws IOException { - // Get the default filesystem to decorate - fileSystem = FileSystem.get(conf); - // Do not need to explicitly initialize the default filesystem, its done already in the above FileSystem.get - // fileSystem.initialize(FileSystem.getDefaultUri(conf), conf); - // fileSystem.setConf(conf); - this.uri = uri; - } - - @Override public URI getUri() { - return uri; - } - - @Override public FSDataInputStream open(Path f, int bufferSize) throws IOException { - return fileSystem.open(convertToDefaultPath(f), bufferSize); - } - - @Override public FSDataOutputStream create(Path f, FsPermission permission, boolean overwrite, - int bufferSize, short replication, long blockSize, Progressable progress) - throws IOException { - final Path translatedPath = convertToDefaultPath(f); - return wrapOutputStream(f, fileSystem - .create(translatedPath, permission, overwrite, bufferSize, replication, blockSize, - progress)); - } - - private FSDataOutputStream wrapOutputStream(final Path path, - FSDataOutputStream fsDataOutputStream) throws IOException { - if (fsDataOutputStream instanceof SizeAwareFSDataOutputStream) { - return fsDataOutputStream; - } - - SizeAwareFSDataOutputStream os = - new SizeAwareFSDataOutputStream(fsDataOutputStream, new Runnable() { - @Override public void run() { - openStreams.remove(path.getName()); - } - }); - openStreams.put(path.getName(), os); - return os; - } - - @Override public FSDataOutputStream create(Path f, boolean overwrite) throws IOException { - return wrapOutputStream(f, fileSystem.create(convertToDefaultPath(f), overwrite)); - } - - @Override public FSDataOutputStream create(Path f) throws IOException { - return wrapOutputStream(f, fileSystem.create(convertToDefaultPath(f))); - } 
- - @Override public FSDataOutputStream create(Path f, Progressable progress) throws IOException { - return fileSystem.create(convertToDefaultPath(f), progress); - } - - @Override public FSDataOutputStream create(Path f, short replication) throws IOException { - return fileSystem.create(convertToDefaultPath(f), replication); - } - - @Override public FSDataOutputStream create(Path f, short replication, Progressable progress) - throws IOException { - return fileSystem.create(convertToDefaultPath(f), replication, progress); - } - - @Override public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize) - throws IOException { - return fileSystem.create(convertToDefaultPath(f), overwrite, bufferSize); - } - - @Override public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize, - Progressable progress) throws IOException { - return fileSystem.create(convertToDefaultPath(f), overwrite, bufferSize, progress); - } - - @Override - public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize, short replication, - long blockSize, Progressable progress) throws IOException { - return fileSystem - .create(convertToDefaultPath(f), overwrite, bufferSize, replication, blockSize, - progress); - } - - @Override - public FSDataOutputStream create(Path f, FsPermission permission, EnumSet flags, - int bufferSize, short replication, long blockSize, Progressable progress) - throws IOException { - return fileSystem - .create(convertToDefaultPath(f), permission, flags, bufferSize, replication, blockSize, - progress); - } - - @Override - public FSDataOutputStream create(Path f, FsPermission permission, EnumSet flags, - int bufferSize, short replication, long blockSize, Progressable progress, - Options.ChecksumOpt checksumOpt) throws IOException { - return fileSystem - .create(convertToDefaultPath(f), permission, flags, bufferSize, replication, blockSize, - progress, checksumOpt); - } - - - @Override - public FSDataOutputStream create(Path f, 
boolean overwrite, int bufferSize, short replication, - long blockSize) throws IOException { - return fileSystem - .create(convertToDefaultPath(f), overwrite, bufferSize, replication, blockSize); - } - - - @Override public FSDataOutputStream append(Path f, int bufferSize, Progressable progress) - throws IOException { - return fileSystem.append(convertToDefaultPath(f), bufferSize, progress); - } - - @Override public boolean rename(Path src, Path dst) throws IOException { - return fileSystem.rename(convertToDefaultPath(src), convertToDefaultPath(dst)); - } - - @Override public boolean delete(Path f, boolean recursive) throws IOException { - return fileSystem.delete(convertToDefaultPath(f), recursive); - } - - @Override public FileStatus[] listStatus(Path f) throws FileNotFoundException, IOException { - return fileSystem.listStatus(convertToDefaultPath(f)); - } - - @Override public void setWorkingDirectory(Path new_dir) { - fileSystem.setWorkingDirectory(convertToDefaultPath(new_dir)); - } - - @Override public Path getWorkingDirectory() { - return convertToHoodiePath(fileSystem.getWorkingDirectory()); - } - - @Override public boolean mkdirs(Path f, FsPermission permission) throws IOException { - return fileSystem.mkdirs(convertToDefaultPath(f), permission); - } - - @Override public FileStatus getFileStatus(Path f) throws IOException { - return fileSystem.getFileStatus(convertToDefaultPath(f)); - } - - @Override public String getScheme() { - return uri.getScheme(); - } - - @Override public String getCanonicalServiceName() { - return fileSystem.getCanonicalServiceName(); - } - - @Override public String getName() { - return fileSystem.getName(); - } - - @Override public Path makeQualified(Path path) { - return convertToHoodiePath(fileSystem.makeQualified(convertToDefaultPath(path))); - } - - @Override public Token getDelegationToken(String renewer) throws IOException { - return fileSystem.getDelegationToken(renewer); - } - - @Override public Token[] 
addDelegationTokens(String renewer, Credentials credentials) - throws IOException { - return fileSystem.addDelegationTokens(renewer, credentials); - } - - @Override public FileSystem[] getChildFileSystems() { - return fileSystem.getChildFileSystems(); - } - - @Override public BlockLocation[] getFileBlockLocations(FileStatus file, long start, long len) - throws IOException { - return fileSystem.getFileBlockLocations(file, start, len); - } - - @Override public BlockLocation[] getFileBlockLocations(Path p, long start, long len) - throws IOException { - return fileSystem.getFileBlockLocations(convertToDefaultPath(p), start, len); - } - - @Override public FsServerDefaults getServerDefaults() throws IOException { - return fileSystem.getServerDefaults(); - } - - @Override public FsServerDefaults getServerDefaults(Path p) throws IOException { - return fileSystem.getServerDefaults(convertToDefaultPath(p)); - } - - @Override public Path resolvePath(Path p) throws IOException { - return convertToHoodiePath(fileSystem.resolvePath(convertToDefaultPath(p))); - } - - @Override public FSDataInputStream open(Path f) throws IOException { - return fileSystem.open(convertToDefaultPath(f)); - } - - @Override - public FSDataOutputStream createNonRecursive(Path f, boolean overwrite, int bufferSize, - short replication, long blockSize, Progressable progress) throws IOException { - return fileSystem - .createNonRecursive(convertToDefaultPath(f), overwrite, bufferSize, replication, - blockSize, progress); - } - - @Override - public FSDataOutputStream createNonRecursive(Path f, FsPermission permission, boolean overwrite, - int bufferSize, short replication, long blockSize, Progressable progress) - throws IOException { - return fileSystem - .createNonRecursive(convertToDefaultPath(f), permission, overwrite, bufferSize, - replication, blockSize, progress); - } - - @Override public FSDataOutputStream createNonRecursive(Path f, FsPermission permission, - EnumSet flags, int bufferSize, short 
replication, long blockSize, - Progressable progress) throws IOException { - return fileSystem - .createNonRecursive(convertToDefaultPath(f), permission, flags, bufferSize, replication, - blockSize, progress); - } - - @Override public boolean createNewFile(Path f) throws IOException { - return fileSystem.createNewFile(convertToDefaultPath(f)); - } - - @Override public FSDataOutputStream append(Path f) throws IOException { - return fileSystem.append(convertToDefaultPath(f)); - } - - @Override public FSDataOutputStream append(Path f, int bufferSize) throws IOException { - return fileSystem.append(convertToDefaultPath(f), bufferSize); - } - - @Override public void concat(Path trg, Path[] psrcs) throws IOException { - Path[] psrcsNew = convertDefaults(psrcs); - fileSystem.concat(convertToDefaultPath(trg), psrcsNew); - } - - @Override public short getReplication(Path src) throws IOException { - return fileSystem.getReplication(convertToDefaultPath(src)); - } - - @Override public boolean setReplication(Path src, short replication) throws IOException { - return fileSystem.setReplication(convertToDefaultPath(src), replication); - } - - @Override public boolean delete(Path f) throws IOException { - return fileSystem.delete(convertToDefaultPath(f)); - } - - @Override public boolean deleteOnExit(Path f) throws IOException { - return fileSystem.deleteOnExit(convertToDefaultPath(f)); - } - - @Override public boolean cancelDeleteOnExit(Path f) { - return fileSystem.cancelDeleteOnExit(convertToDefaultPath(f)); - } - - @Override public boolean exists(Path f) throws IOException { - return fileSystem.exists(convertToDefaultPath(f)); - } - - @Override public boolean isDirectory(Path f) throws IOException { - return fileSystem.isDirectory(convertToDefaultPath(f)); - } - - @Override public boolean isFile(Path f) throws IOException { - return fileSystem.isFile(convertToDefaultPath(f)); - } - - @Override public long getLength(Path f) throws IOException { - return 
fileSystem.getLength(convertToDefaultPath(f)); - } - - @Override public ContentSummary getContentSummary(Path f) throws IOException { - return fileSystem.getContentSummary(convertToDefaultPath(f)); - } - - @Override public RemoteIterator listCorruptFileBlocks(Path path) throws IOException { - return fileSystem.listCorruptFileBlocks(convertToDefaultPath(path)); - } - - @Override public FileStatus[] listStatus(Path f, PathFilter filter) - throws FileNotFoundException, IOException { - return fileSystem.listStatus(convertToDefaultPath(f), filter); - } - - @Override public FileStatus[] listStatus(Path[] files) - throws FileNotFoundException, IOException { - return fileSystem.listStatus(convertDefaults(files)); - } - - @Override public FileStatus[] listStatus(Path[] files, PathFilter filter) - throws FileNotFoundException, IOException { - return fileSystem.listStatus(convertDefaults(files), filter); - } - - @Override public FileStatus[] globStatus(Path pathPattern) throws IOException { - return fileSystem.globStatus(convertToDefaultPath(pathPattern)); - } - - @Override public FileStatus[] globStatus(Path pathPattern, PathFilter filter) - throws IOException { - return fileSystem.globStatus(convertToDefaultPath(pathPattern), filter); - } - - @Override public RemoteIterator listLocatedStatus(Path f) - throws FileNotFoundException, IOException { - return fileSystem.listLocatedStatus(convertToDefaultPath(f)); - } - - @Override public RemoteIterator listFiles(Path f, boolean recursive) - throws FileNotFoundException, IOException { - return fileSystem.listFiles(convertToDefaultPath(f), recursive); - } - - @Override public Path getHomeDirectory() { - return convertToHoodiePath(fileSystem.getHomeDirectory()); - } - - @Override public boolean mkdirs(Path f) throws IOException { - return fileSystem.mkdirs(convertToDefaultPath(f)); - } - - @Override public void copyFromLocalFile(Path src, Path dst) throws IOException { - fileSystem.copyFromLocalFile(convertToDefaultPath(src), 
convertToDefaultPath(dst)); - } - - @Override public void moveFromLocalFile(Path[] srcs, Path dst) throws IOException { - fileSystem.moveFromLocalFile(convertDefaults(srcs), convertToDefaultPath(dst)); - } - - @Override public void moveFromLocalFile(Path src, Path dst) throws IOException { - fileSystem.moveFromLocalFile(convertToDefaultPath(src), convertToDefaultPath(dst)); - } - - @Override public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws IOException { - fileSystem.copyFromLocalFile(delSrc, convertToDefaultPath(src), convertToDefaultPath(dst)); - } - - @Override - public void copyFromLocalFile(boolean delSrc, boolean overwrite, Path[] srcs, Path dst) - throws IOException { - fileSystem - .copyFromLocalFile(delSrc, overwrite, convertDefaults(srcs), convertToDefaultPath(dst)); - } - - @Override public void copyFromLocalFile(boolean delSrc, boolean overwrite, Path src, Path dst) - throws IOException { - fileSystem.copyFromLocalFile(delSrc, overwrite, convertToDefaultPath(src), - convertToDefaultPath(dst)); - } - - @Override public void copyToLocalFile(Path src, Path dst) throws IOException { - fileSystem.copyToLocalFile(convertToDefaultPath(src), convertToDefaultPath(dst)); - } - - @Override public void moveToLocalFile(Path src, Path dst) throws IOException { - fileSystem.moveToLocalFile(convertToDefaultPath(src), convertToDefaultPath(dst)); - } - - @Override public void copyToLocalFile(boolean delSrc, Path src, Path dst) throws IOException { - fileSystem.copyToLocalFile(delSrc, convertToDefaultPath(src), convertToDefaultPath(dst)); - } - - @Override - public void copyToLocalFile(boolean delSrc, Path src, Path dst, boolean useRawLocalFileSystem) - throws IOException { - fileSystem.copyToLocalFile(delSrc, convertToDefaultPath(src), convertToDefaultPath(dst), - useRawLocalFileSystem); - } - - @Override public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile) - throws IOException { - return 
convertToHoodiePath(fileSystem.startLocalOutput(convertToDefaultPath(fsOutputFile), - convertToDefaultPath(tmpLocalFile))); - } - - @Override public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile) - throws IOException { - fileSystem.completeLocalOutput(convertToDefaultPath(fsOutputFile), - convertToDefaultPath(tmpLocalFile)); - } - - @Override public void close() throws IOException { - fileSystem.close(); - } - - @Override public long getUsed() throws IOException { - return fileSystem.getUsed(); - } - - @Override public long getBlockSize(Path f) throws IOException { - return fileSystem.getBlockSize(convertToDefaultPath(f)); - } - - @Override public long getDefaultBlockSize() { - return fileSystem.getDefaultBlockSize(); - } - - @Override public long getDefaultBlockSize(Path f) { - return fileSystem.getDefaultBlockSize(convertToDefaultPath(f)); - } - - @Override public short getDefaultReplication() { - return fileSystem.getDefaultReplication(); - } - - @Override public short getDefaultReplication(Path path) { - return fileSystem.getDefaultReplication(convertToDefaultPath(path)); - } - - @Override public void access(Path path, FsAction mode) - throws AccessControlException, FileNotFoundException, IOException { - fileSystem.access(convertToDefaultPath(path), mode); - } - - @Override public void createSymlink(Path target, Path link, boolean createParent) - throws AccessControlException, FileAlreadyExistsException, FileNotFoundException, - ParentNotDirectoryException, UnsupportedFileSystemException, IOException { - fileSystem - .createSymlink(convertToDefaultPath(target), convertToDefaultPath(link), createParent); - } - - @Override public FileStatus getFileLinkStatus(Path f) - throws AccessControlException, FileNotFoundException, UnsupportedFileSystemException, - IOException { - return fileSystem.getFileLinkStatus(convertToDefaultPath(f)); - } - - @Override public boolean supportsSymlinks() { - return fileSystem.supportsSymlinks(); - } - - @Override public 
Path getLinkTarget(Path f) throws IOException { - return convertToHoodiePath(fileSystem.getLinkTarget(convertToDefaultPath(f))); - } - - @Override public FileChecksum getFileChecksum(Path f) throws IOException { - return fileSystem.getFileChecksum(convertToDefaultPath(f)); - } - - @Override public FileChecksum getFileChecksum(Path f, long length) throws IOException { - return fileSystem.getFileChecksum(convertToDefaultPath(f), length); - } - - @Override public void setVerifyChecksum(boolean verifyChecksum) { - fileSystem.setVerifyChecksum(verifyChecksum); - } - - @Override public void setWriteChecksum(boolean writeChecksum) { - fileSystem.setWriteChecksum(writeChecksum); - } - - @Override public FsStatus getStatus() throws IOException { - return fileSystem.getStatus(); - } - @Override public FsStatus getStatus(Path p) throws IOException { - return fileSystem.getStatus(convertToDefaultPath(p)); - } - - @Override public void setPermission(Path p, FsPermission permission) throws IOException { - fileSystem.setPermission(convertToDefaultPath(p), permission); - } - - @Override public void setOwner(Path p, String username, String groupname) throws IOException { - fileSystem.setOwner(convertToDefaultPath(p), username, groupname); - } - - @Override public void setTimes(Path p, long mtime, long atime) throws IOException { - fileSystem.setTimes(convertToDefaultPath(p), mtime, atime); - } - - @Override public Path createSnapshot(Path path, String snapshotName) throws IOException { - return convertToHoodiePath( - fileSystem.createSnapshot(convertToDefaultPath(path), snapshotName)); - } - - @Override public void renameSnapshot(Path path, String snapshotOldName, String snapshotNewName) - throws IOException { - fileSystem.renameSnapshot(convertToDefaultPath(path), snapshotOldName, snapshotNewName); - } - - @Override public void deleteSnapshot(Path path, String snapshotName) throws IOException { - fileSystem.deleteSnapshot(convertToDefaultPath(path), snapshotName); - } - - 
@Override public void modifyAclEntries(Path path, List aclSpec) throws IOException { - fileSystem.modifyAclEntries(convertToDefaultPath(path), aclSpec); - } - - @Override public void removeAclEntries(Path path, List aclSpec) throws IOException { - fileSystem.removeAclEntries(convertToDefaultPath(path), aclSpec); - } - - @Override public void removeDefaultAcl(Path path) throws IOException { - fileSystem.removeDefaultAcl(convertToDefaultPath(path)); - } - - @Override public void removeAcl(Path path) throws IOException { - fileSystem.removeAcl(convertToDefaultPath(path)); - } - - @Override public void setAcl(Path path, List aclSpec) throws IOException { - fileSystem.setAcl(convertToDefaultPath(path), aclSpec); - } - - @Override public AclStatus getAclStatus(Path path) throws IOException { - return fileSystem.getAclStatus(convertToDefaultPath(path)); - } - - @Override public void setXAttr(Path path, String name, byte[] value) throws IOException { - fileSystem.setXAttr(convertToDefaultPath(path), name, value); - } - - @Override public void setXAttr(Path path, String name, byte[] value, EnumSet flag) - throws IOException { - fileSystem.setXAttr(convertToDefaultPath(path), name, value, flag); - } - - @Override public byte[] getXAttr(Path path, String name) throws IOException { - return fileSystem.getXAttr(convertToDefaultPath(path), name); - } - - @Override public Map getXAttrs(Path path) throws IOException { - return fileSystem.getXAttrs(convertToDefaultPath(path)); - } - - @Override public Map getXAttrs(Path path, List names) - throws IOException { - return fileSystem.getXAttrs(convertToDefaultPath(path), names); - } - - @Override public List listXAttrs(Path path) throws IOException { - return fileSystem.listXAttrs(convertToDefaultPath(path)); - } - - @Override public void removeXAttr(Path path, String name) throws IOException { - fileSystem.removeXAttr(convertToDefaultPath(path), name); - } - - @Override public void setConf(Configuration conf) { - // ignore this. 
we will set conf on init - } - - @Override public Configuration getConf() { - return fileSystem.getConf(); - } - - @Override public int hashCode() { - return fileSystem.hashCode(); - } - - @Override public boolean equals(Object obj) { - return fileSystem.equals(obj); - } - - @Override public String toString() { - return fileSystem.toString(); - } - - public Path convertToHoodiePath(Path oldPath) { - return convertPathWithScheme(oldPath, getHoodieScheme(fileSystem.getScheme())); - } - - public static Path convertToHoodiePath(Path file, Configuration conf) { - String scheme = FileSystem.getDefaultUri(conf).getScheme(); - return convertPathWithScheme(file, getHoodieScheme(scheme)); - } - - private Path convertToDefaultPath(Path oldPath) { - return convertPathWithScheme(oldPath, fileSystem.getScheme()); - } - - private Path[] convertDefaults(Path[] psrcs) { - Path[] psrcsNew = new Path[psrcs.length]; - for (int i = 0; i < psrcs.length; i++) { - psrcsNew[i] = convertToDefaultPath(psrcs[i]); - } - return psrcsNew; - } - - private static Path convertPathWithScheme(Path oldPath, String newScheme) { - URI oldURI = oldPath.toUri(); - URI newURI; - try { - newURI = new URI(newScheme, oldURI.getUserInfo(), oldURI.getHost(), oldURI.getPort(), - oldURI.getPath(), oldURI.getQuery(), oldURI.getFragment()); - return new Path(newURI); - } catch (URISyntaxException e) { - // TODO - Better Exception handling - throw new RuntimeException(e); - } - } - - public static String getHoodieScheme(String scheme) { - String newScheme; - if (SUPPORT_SCHEMES.contains(scheme)) { - newScheme = HOODIE_SCHEME_PREFIX + scheme; - } else { - throw new IllegalArgumentException( - "BlockAlignedAvroParquetWriter does not support scheme " + scheme); - } - return newScheme; - } - - public long getBytesWritten(Path file) { - if (openStreams.containsKey(file.getName())) { - return openStreams.get(file.getName()).getBytesWritten(); - } - // When the file is first written, we do not have a track of it - throw 
new IllegalArgumentException(file.toString() - + " does not have a open stream. Cannot get the bytes written on the stream"); - } + private static final Set SUPPORT_SCHEMES; + public static final String HOODIE_SCHEME_PREFIX = "hoodie-"; + + static { + SUPPORT_SCHEMES = new HashSet<>(); + SUPPORT_SCHEMES.add("file"); + SUPPORT_SCHEMES.add("hdfs"); + SUPPORT_SCHEMES.add("s3"); + + // Hoodie currently relies on underlying object store being fully + // consistent so only regional buckets should be used. + SUPPORT_SCHEMES.add("gs"); + SUPPORT_SCHEMES.add("viewfs"); + } + + private ConcurrentMap openStreams = + new ConcurrentHashMap<>(); + private FileSystem fileSystem; + private URI uri; + + @Override + public void initialize(URI uri, Configuration conf) throws IOException { + // Get the default filesystem to decorate + fileSystem = FileSystem.get(conf); + // Do not need to explicitly initialize the default filesystem, its done already in the above FileSystem.get + // fileSystem.initialize(FileSystem.getDefaultUri(conf), conf); + // fileSystem.setConf(conf); + this.uri = uri; + } + + @Override + public URI getUri() { + return uri; + } + + @Override + public FSDataInputStream open(Path f, int bufferSize) throws IOException { + return fileSystem.open(convertToDefaultPath(f), bufferSize); + } + + @Override + public FSDataOutputStream create(Path f, FsPermission permission, boolean overwrite, + int bufferSize, short replication, long blockSize, Progressable progress) + throws IOException { + final Path translatedPath = convertToDefaultPath(f); + return wrapOutputStream(f, fileSystem + .create(translatedPath, permission, overwrite, bufferSize, replication, blockSize, + progress)); + } + + private FSDataOutputStream wrapOutputStream(final Path path, + FSDataOutputStream fsDataOutputStream) throws IOException { + if (fsDataOutputStream instanceof SizeAwareFSDataOutputStream) { + return fsDataOutputStream; + } + + SizeAwareFSDataOutputStream os = + new 
SizeAwareFSDataOutputStream(fsDataOutputStream, new Runnable() { + @Override + public void run() { + openStreams.remove(path.getName()); + } + }); + openStreams.put(path.getName(), os); + return os; + } + + @Override + public FSDataOutputStream create(Path f, boolean overwrite) throws IOException { + return wrapOutputStream(f, fileSystem.create(convertToDefaultPath(f), overwrite)); + } + + @Override + public FSDataOutputStream create(Path f) throws IOException { + return wrapOutputStream(f, fileSystem.create(convertToDefaultPath(f))); + } + + @Override + public FSDataOutputStream create(Path f, Progressable progress) throws IOException { + return fileSystem.create(convertToDefaultPath(f), progress); + } + + @Override + public FSDataOutputStream create(Path f, short replication) throws IOException { + return fileSystem.create(convertToDefaultPath(f), replication); + } + + @Override + public FSDataOutputStream create(Path f, short replication, Progressable progress) + throws IOException { + return fileSystem.create(convertToDefaultPath(f), replication, progress); + } + + @Override + public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize) + throws IOException { + return fileSystem.create(convertToDefaultPath(f), overwrite, bufferSize); + } + + @Override + public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize, + Progressable progress) throws IOException { + return fileSystem.create(convertToDefaultPath(f), overwrite, bufferSize, progress); + } + + @Override + public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize, short replication, + long blockSize, Progressable progress) throws IOException { + return fileSystem + .create(convertToDefaultPath(f), overwrite, bufferSize, replication, blockSize, + progress); + } + + @Override + public FSDataOutputStream create(Path f, FsPermission permission, EnumSet flags, + int bufferSize, short replication, long blockSize, Progressable progress) + throws IOException { + 
return fileSystem + .create(convertToDefaultPath(f), permission, flags, bufferSize, replication, blockSize, + progress); + } + + @Override + public FSDataOutputStream create(Path f, FsPermission permission, EnumSet flags, + int bufferSize, short replication, long blockSize, Progressable progress, + Options.ChecksumOpt checksumOpt) throws IOException { + return fileSystem + .create(convertToDefaultPath(f), permission, flags, bufferSize, replication, blockSize, + progress, checksumOpt); + } + + + @Override + public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize, short replication, + long blockSize) throws IOException { + return fileSystem + .create(convertToDefaultPath(f), overwrite, bufferSize, replication, blockSize); + } + + + @Override + public FSDataOutputStream append(Path f, int bufferSize, Progressable progress) + throws IOException { + return fileSystem.append(convertToDefaultPath(f), bufferSize, progress); + } + + @Override + public boolean rename(Path src, Path dst) throws IOException { + return fileSystem.rename(convertToDefaultPath(src), convertToDefaultPath(dst)); + } + + @Override + public boolean delete(Path f, boolean recursive) throws IOException { + return fileSystem.delete(convertToDefaultPath(f), recursive); + } + + @Override + public FileStatus[] listStatus(Path f) throws FileNotFoundException, IOException { + return fileSystem.listStatus(convertToDefaultPath(f)); + } + + @Override + public void setWorkingDirectory(Path new_dir) { + fileSystem.setWorkingDirectory(convertToDefaultPath(new_dir)); + } + + @Override + public Path getWorkingDirectory() { + return convertToHoodiePath(fileSystem.getWorkingDirectory()); + } + + @Override + public boolean mkdirs(Path f, FsPermission permission) throws IOException { + return fileSystem.mkdirs(convertToDefaultPath(f), permission); + } + + @Override + public FileStatus getFileStatus(Path f) throws IOException { + return fileSystem.getFileStatus(convertToDefaultPath(f)); + } + + 
@Override + public String getScheme() { + return uri.getScheme(); + } + + @Override + public String getCanonicalServiceName() { + return fileSystem.getCanonicalServiceName(); + } + + @Override + public String getName() { + return fileSystem.getName(); + } + + @Override + public Path makeQualified(Path path) { + return convertToHoodiePath(fileSystem.makeQualified(convertToDefaultPath(path))); + } + + @Override + public Token getDelegationToken(String renewer) throws IOException { + return fileSystem.getDelegationToken(renewer); + } + + @Override + public Token[] addDelegationTokens(String renewer, Credentials credentials) + throws IOException { + return fileSystem.addDelegationTokens(renewer, credentials); + } + + @Override + public FileSystem[] getChildFileSystems() { + return fileSystem.getChildFileSystems(); + } + + @Override + public BlockLocation[] getFileBlockLocations(FileStatus file, long start, long len) + throws IOException { + return fileSystem.getFileBlockLocations(file, start, len); + } + + @Override + public BlockLocation[] getFileBlockLocations(Path p, long start, long len) + throws IOException { + return fileSystem.getFileBlockLocations(convertToDefaultPath(p), start, len); + } + + @Override + public FsServerDefaults getServerDefaults() throws IOException { + return fileSystem.getServerDefaults(); + } + + @Override + public FsServerDefaults getServerDefaults(Path p) throws IOException { + return fileSystem.getServerDefaults(convertToDefaultPath(p)); + } + + @Override + public Path resolvePath(Path p) throws IOException { + return convertToHoodiePath(fileSystem.resolvePath(convertToDefaultPath(p))); + } + + @Override + public FSDataInputStream open(Path f) throws IOException { + return fileSystem.open(convertToDefaultPath(f)); + } + + @Override + public FSDataOutputStream createNonRecursive(Path f, boolean overwrite, int bufferSize, + short replication, long blockSize, Progressable progress) throws IOException { + return fileSystem + 
.createNonRecursive(convertToDefaultPath(f), overwrite, bufferSize, replication, + blockSize, progress); + } + + @Override + public FSDataOutputStream createNonRecursive(Path f, FsPermission permission, boolean overwrite, + int bufferSize, short replication, long blockSize, Progressable progress) + throws IOException { + return fileSystem + .createNonRecursive(convertToDefaultPath(f), permission, overwrite, bufferSize, + replication, blockSize, progress); + } + + @Override + public FSDataOutputStream createNonRecursive(Path f, FsPermission permission, + EnumSet flags, int bufferSize, short replication, long blockSize, + Progressable progress) throws IOException { + return fileSystem + .createNonRecursive(convertToDefaultPath(f), permission, flags, bufferSize, replication, + blockSize, progress); + } + + @Override + public boolean createNewFile(Path f) throws IOException { + return fileSystem.createNewFile(convertToDefaultPath(f)); + } + + @Override + public FSDataOutputStream append(Path f) throws IOException { + return fileSystem.append(convertToDefaultPath(f)); + } + + @Override + public FSDataOutputStream append(Path f, int bufferSize) throws IOException { + return fileSystem.append(convertToDefaultPath(f), bufferSize); + } + + @Override + public void concat(Path trg, Path[] psrcs) throws IOException { + Path[] psrcsNew = convertDefaults(psrcs); + fileSystem.concat(convertToDefaultPath(trg), psrcsNew); + } + + @Override + public short getReplication(Path src) throws IOException { + return fileSystem.getReplication(convertToDefaultPath(src)); + } + + @Override + public boolean setReplication(Path src, short replication) throws IOException { + return fileSystem.setReplication(convertToDefaultPath(src), replication); + } + + @Override + public boolean delete(Path f) throws IOException { + return fileSystem.delete(convertToDefaultPath(f)); + } + + @Override + public boolean deleteOnExit(Path f) throws IOException { + return 
fileSystem.deleteOnExit(convertToDefaultPath(f)); + } + + @Override + public boolean cancelDeleteOnExit(Path f) { + return fileSystem.cancelDeleteOnExit(convertToDefaultPath(f)); + } + + @Override + public boolean exists(Path f) throws IOException { + return fileSystem.exists(convertToDefaultPath(f)); + } + + @Override + public boolean isDirectory(Path f) throws IOException { + return fileSystem.isDirectory(convertToDefaultPath(f)); + } + + @Override + public boolean isFile(Path f) throws IOException { + return fileSystem.isFile(convertToDefaultPath(f)); + } + + @Override + public long getLength(Path f) throws IOException { + return fileSystem.getLength(convertToDefaultPath(f)); + } + + @Override + public ContentSummary getContentSummary(Path f) throws IOException { + return fileSystem.getContentSummary(convertToDefaultPath(f)); + } + + @Override + public RemoteIterator listCorruptFileBlocks(Path path) throws IOException { + return fileSystem.listCorruptFileBlocks(convertToDefaultPath(path)); + } + + @Override + public FileStatus[] listStatus(Path f, PathFilter filter) + throws FileNotFoundException, IOException { + return fileSystem.listStatus(convertToDefaultPath(f), filter); + } + + @Override + public FileStatus[] listStatus(Path[] files) + throws FileNotFoundException, IOException { + return fileSystem.listStatus(convertDefaults(files)); + } + + @Override + public FileStatus[] listStatus(Path[] files, PathFilter filter) + throws FileNotFoundException, IOException { + return fileSystem.listStatus(convertDefaults(files), filter); + } + + @Override + public FileStatus[] globStatus(Path pathPattern) throws IOException { + return fileSystem.globStatus(convertToDefaultPath(pathPattern)); + } + + @Override + public FileStatus[] globStatus(Path pathPattern, PathFilter filter) + throws IOException { + return fileSystem.globStatus(convertToDefaultPath(pathPattern), filter); + } + + @Override + public RemoteIterator listLocatedStatus(Path f) + throws 
FileNotFoundException, IOException { + return fileSystem.listLocatedStatus(convertToDefaultPath(f)); + } + + @Override + public RemoteIterator listFiles(Path f, boolean recursive) + throws FileNotFoundException, IOException { + return fileSystem.listFiles(convertToDefaultPath(f), recursive); + } + + @Override + public Path getHomeDirectory() { + return convertToHoodiePath(fileSystem.getHomeDirectory()); + } + + @Override + public boolean mkdirs(Path f) throws IOException { + return fileSystem.mkdirs(convertToDefaultPath(f)); + } + + @Override + public void copyFromLocalFile(Path src, Path dst) throws IOException { + fileSystem.copyFromLocalFile(convertToDefaultPath(src), convertToDefaultPath(dst)); + } + + @Override + public void moveFromLocalFile(Path[] srcs, Path dst) throws IOException { + fileSystem.moveFromLocalFile(convertDefaults(srcs), convertToDefaultPath(dst)); + } + + @Override + public void moveFromLocalFile(Path src, Path dst) throws IOException { + fileSystem.moveFromLocalFile(convertToDefaultPath(src), convertToDefaultPath(dst)); + } + + @Override + public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws IOException { + fileSystem.copyFromLocalFile(delSrc, convertToDefaultPath(src), convertToDefaultPath(dst)); + } + + @Override + public void copyFromLocalFile(boolean delSrc, boolean overwrite, Path[] srcs, Path dst) + throws IOException { + fileSystem + .copyFromLocalFile(delSrc, overwrite, convertDefaults(srcs), convertToDefaultPath(dst)); + } + + @Override + public void copyFromLocalFile(boolean delSrc, boolean overwrite, Path src, Path dst) + throws IOException { + fileSystem.copyFromLocalFile(delSrc, overwrite, convertToDefaultPath(src), + convertToDefaultPath(dst)); + } + + @Override + public void copyToLocalFile(Path src, Path dst) throws IOException { + fileSystem.copyToLocalFile(convertToDefaultPath(src), convertToDefaultPath(dst)); + } + + @Override + public void moveToLocalFile(Path src, Path dst) throws IOException { + 
fileSystem.moveToLocalFile(convertToDefaultPath(src), convertToDefaultPath(dst)); + } + + @Override + public void copyToLocalFile(boolean delSrc, Path src, Path dst) throws IOException { + fileSystem.copyToLocalFile(delSrc, convertToDefaultPath(src), convertToDefaultPath(dst)); + } + + @Override + public void copyToLocalFile(boolean delSrc, Path src, Path dst, boolean useRawLocalFileSystem) + throws IOException { + fileSystem.copyToLocalFile(delSrc, convertToDefaultPath(src), convertToDefaultPath(dst), + useRawLocalFileSystem); + } + + @Override + public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile) + throws IOException { + return convertToHoodiePath(fileSystem.startLocalOutput(convertToDefaultPath(fsOutputFile), + convertToDefaultPath(tmpLocalFile))); + } + + @Override + public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile) + throws IOException { + fileSystem.completeLocalOutput(convertToDefaultPath(fsOutputFile), + convertToDefaultPath(tmpLocalFile)); + } + + @Override + public void close() throws IOException { + fileSystem.close(); + } + + @Override + public long getUsed() throws IOException { + return fileSystem.getUsed(); + } + + @Override + public long getBlockSize(Path f) throws IOException { + return fileSystem.getBlockSize(convertToDefaultPath(f)); + } + + @Override + public long getDefaultBlockSize() { + return fileSystem.getDefaultBlockSize(); + } + + @Override + public long getDefaultBlockSize(Path f) { + return fileSystem.getDefaultBlockSize(convertToDefaultPath(f)); + } + + @Override + public short getDefaultReplication() { + return fileSystem.getDefaultReplication(); + } + + @Override + public short getDefaultReplication(Path path) { + return fileSystem.getDefaultReplication(convertToDefaultPath(path)); + } + + @Override + public void access(Path path, FsAction mode) + throws AccessControlException, FileNotFoundException, IOException { + fileSystem.access(convertToDefaultPath(path), mode); + } + + @Override + public void 
createSymlink(Path target, Path link, boolean createParent) + throws AccessControlException, FileAlreadyExistsException, FileNotFoundException, + ParentNotDirectoryException, UnsupportedFileSystemException, IOException { + fileSystem + .createSymlink(convertToDefaultPath(target), convertToDefaultPath(link), createParent); + } + + @Override + public FileStatus getFileLinkStatus(Path f) + throws AccessControlException, FileNotFoundException, UnsupportedFileSystemException, + IOException { + return fileSystem.getFileLinkStatus(convertToDefaultPath(f)); + } + + @Override + public boolean supportsSymlinks() { + return fileSystem.supportsSymlinks(); + } + + @Override + public Path getLinkTarget(Path f) throws IOException { + return convertToHoodiePath(fileSystem.getLinkTarget(convertToDefaultPath(f))); + } + + @Override + public FileChecksum getFileChecksum(Path f) throws IOException { + return fileSystem.getFileChecksum(convertToDefaultPath(f)); + } + + @Override + public FileChecksum getFileChecksum(Path f, long length) throws IOException { + return fileSystem.getFileChecksum(convertToDefaultPath(f), length); + } + + @Override + public void setVerifyChecksum(boolean verifyChecksum) { + fileSystem.setVerifyChecksum(verifyChecksum); + } + + @Override + public void setWriteChecksum(boolean writeChecksum) { + fileSystem.setWriteChecksum(writeChecksum); + } + + @Override + public FsStatus getStatus() throws IOException { + return fileSystem.getStatus(); + } + + @Override + public FsStatus getStatus(Path p) throws IOException { + return fileSystem.getStatus(convertToDefaultPath(p)); + } + + @Override + public void setPermission(Path p, FsPermission permission) throws IOException { + fileSystem.setPermission(convertToDefaultPath(p), permission); + } + + @Override + public void setOwner(Path p, String username, String groupname) throws IOException { + fileSystem.setOwner(convertToDefaultPath(p), username, groupname); + } + + @Override + public void setTimes(Path p, long mtime, 
long atime) throws IOException { + fileSystem.setTimes(convertToDefaultPath(p), mtime, atime); + } + + @Override + public Path createSnapshot(Path path, String snapshotName) throws IOException { + return convertToHoodiePath( + fileSystem.createSnapshot(convertToDefaultPath(path), snapshotName)); + } + + @Override + public void renameSnapshot(Path path, String snapshotOldName, String snapshotNewName) + throws IOException { + fileSystem.renameSnapshot(convertToDefaultPath(path), snapshotOldName, snapshotNewName); + } + + @Override + public void deleteSnapshot(Path path, String snapshotName) throws IOException { + fileSystem.deleteSnapshot(convertToDefaultPath(path), snapshotName); + } + + @Override + public void modifyAclEntries(Path path, List aclSpec) throws IOException { + fileSystem.modifyAclEntries(convertToDefaultPath(path), aclSpec); + } + + @Override + public void removeAclEntries(Path path, List aclSpec) throws IOException { + fileSystem.removeAclEntries(convertToDefaultPath(path), aclSpec); + } + + @Override + public void removeDefaultAcl(Path path) throws IOException { + fileSystem.removeDefaultAcl(convertToDefaultPath(path)); + } + + @Override + public void removeAcl(Path path) throws IOException { + fileSystem.removeAcl(convertToDefaultPath(path)); + } + + @Override + public void setAcl(Path path, List aclSpec) throws IOException { + fileSystem.setAcl(convertToDefaultPath(path), aclSpec); + } + + @Override + public AclStatus getAclStatus(Path path) throws IOException { + return fileSystem.getAclStatus(convertToDefaultPath(path)); + } + + @Override + public void setXAttr(Path path, String name, byte[] value) throws IOException { + fileSystem.setXAttr(convertToDefaultPath(path), name, value); + } + + @Override + public void setXAttr(Path path, String name, byte[] value, EnumSet flag) + throws IOException { + fileSystem.setXAttr(convertToDefaultPath(path), name, value, flag); + } + + @Override + public byte[] getXAttr(Path path, String name) throws 
IOException { + return fileSystem.getXAttr(convertToDefaultPath(path), name); + } + + @Override + public Map getXAttrs(Path path) throws IOException { + return fileSystem.getXAttrs(convertToDefaultPath(path)); + } + + @Override + public Map getXAttrs(Path path, List names) + throws IOException { + return fileSystem.getXAttrs(convertToDefaultPath(path), names); + } + + @Override + public List listXAttrs(Path path) throws IOException { + return fileSystem.listXAttrs(convertToDefaultPath(path)); + } + + @Override + public void removeXAttr(Path path, String name) throws IOException { + fileSystem.removeXAttr(convertToDefaultPath(path), name); + } + + @Override + public void setConf(Configuration conf) { + // ignore this. we will set conf on init + } + + @Override + public Configuration getConf() { + return fileSystem.getConf(); + } + + @Override + public int hashCode() { + return fileSystem.hashCode(); + } + + @Override + public boolean equals(Object obj) { + return fileSystem.equals(obj); + } + + @Override + public String toString() { + return fileSystem.toString(); + } + + public Path convertToHoodiePath(Path oldPath) { + return convertPathWithScheme(oldPath, getHoodieScheme(fileSystem.getScheme())); + } + + public static Path convertToHoodiePath(Path file, Configuration conf) { + String scheme = FileSystem.getDefaultUri(conf).getScheme(); + return convertPathWithScheme(file, getHoodieScheme(scheme)); + } + + private Path convertToDefaultPath(Path oldPath) { + return convertPathWithScheme(oldPath, fileSystem.getScheme()); + } + + private Path[] convertDefaults(Path[] psrcs) { + Path[] psrcsNew = new Path[psrcs.length]; + for (int i = 0; i < psrcs.length; i++) { + psrcsNew[i] = convertToDefaultPath(psrcs[i]); + } + return psrcsNew; + } + + private static Path convertPathWithScheme(Path oldPath, String newScheme) { + URI oldURI = oldPath.toUri(); + URI newURI; + try { + newURI = new URI(newScheme, oldURI.getUserInfo(), oldURI.getHost(), oldURI.getPort(), + 
oldURI.getPath(), oldURI.getQuery(), oldURI.getFragment()); + return new Path(newURI); + } catch (URISyntaxException e) { + // TODO - Better Exception handling + throw new RuntimeException(e); + } + } + + public static String getHoodieScheme(String scheme) { + String newScheme; + if (SUPPORT_SCHEMES.contains(scheme)) { + newScheme = HOODIE_SCHEME_PREFIX + scheme; + } else { + throw new IllegalArgumentException( + "BlockAlignedAvroParquetWriter does not support scheme " + scheme); + } + return newScheme; + } + + public long getBytesWritten(Path file) { + if (openStreams.containsKey(file.getName())) { + return openStreams.get(file.getName()).getBytesWritten(); + } + // When the file is first written, we do not have a track of it + throw new IllegalArgumentException(file.toString() + + " does not have a open stream. Cannot get the bytes written on the stream"); + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/storage/SizeAwareFSDataOutputStream.java b/hoodie-client/src/main/java/com/uber/hoodie/io/storage/SizeAwareFSDataOutputStream.java index 1c4dd9ae5..3f966d6ac 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/io/storage/SizeAwareFSDataOutputStream.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/storage/SizeAwareFSDataOutputStream.java @@ -16,44 +16,47 @@ package com.uber.hoodie.io.storage; -import org.apache.hadoop.fs.FSDataOutputStream; - import java.io.IOException; import java.util.concurrent.atomic.AtomicLong; +import org.apache.hadoop.fs.FSDataOutputStream; /** - * Wrapper over FSDataOutputStream to keep track of the size of the written bytes. - * This gives a cheap way to check on the underlying file size. + * Wrapper over FSDataOutputStream to keep track of the size of the written bytes. This + * gives a cheap way to check on the underlying file size. */ public class SizeAwareFSDataOutputStream extends FSDataOutputStream { - // A callback to call when the output stream is closed. 
- private final Runnable closeCallback; - // Keep track of the bytes written - private final AtomicLong bytesWritten = new AtomicLong(0L); - public SizeAwareFSDataOutputStream(FSDataOutputStream out, Runnable closeCallback) - throws IOException { - super(out); - this.closeCallback = closeCallback; - } + // A callback to call when the output stream is closed. + private final Runnable closeCallback; + // Keep track of the bytes written + private final AtomicLong bytesWritten = new AtomicLong(0L); - @Override public synchronized void write(byte[] b, int off, int len) throws IOException { - bytesWritten.addAndGet(len); - super.write(b, off, len); - } + public SizeAwareFSDataOutputStream(FSDataOutputStream out, Runnable closeCallback) + throws IOException { + super(out); + this.closeCallback = closeCallback; + } - @Override public void write(byte[] b) throws IOException { - bytesWritten.addAndGet(b.length); - super.write(b); - } + @Override + public synchronized void write(byte[] b, int off, int len) throws IOException { + bytesWritten.addAndGet(len); + super.write(b, off, len); + } - @Override public void close() throws IOException { - super.close(); - closeCallback.run(); - } + @Override + public void write(byte[] b) throws IOException { + bytesWritten.addAndGet(b.length); + super.write(b); + } - public long getBytesWritten() { - return bytesWritten.get(); - } + @Override + public void close() throws IOException { + super.close(); + closeCallback.run(); + } + + public long getBytesWritten() { + return bytesWritten.get(); + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/metrics/HoodieMetrics.java b/hoodie-client/src/main/java/com/uber/hoodie/metrics/HoodieMetrics.java index c0dee102d..f6c79bb10 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/metrics/HoodieMetrics.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/metrics/HoodieMetrics.java @@ -22,7 +22,6 @@ import com.codahale.metrics.Timer; import 
com.google.common.annotations.VisibleForTesting; import com.uber.hoodie.common.model.HoodieCommitMetadata; import com.uber.hoodie.config.HoodieWriteConfig; - import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -30,119 +29,122 @@ import org.apache.log4j.Logger; * Wrapper for metrics-related operations. */ public class HoodieMetrics { - private HoodieWriteConfig config = null; - private String tableName = null; - private static Logger logger = LogManager.getLogger(HoodieMetrics.class); - // Some timers - public String rollbackTimerName = null; - public String cleanTimerName = null; - public String commitTimerName = null; - private Timer rollbackTimer = null; - private Timer cleanTimer = null; - private Timer commitTimer = null; - public HoodieMetrics(HoodieWriteConfig config, String tableName) { - this.config = config; - this.tableName = tableName; - if (config.isMetricsOn()) { - Metrics.init(config); - this.rollbackTimerName = getMetricsName("timer", "rollback"); - this.cleanTimerName = getMetricsName("timer", "clean"); - this.commitTimerName = getMetricsName("timer", "commit"); + private HoodieWriteConfig config = null; + private String tableName = null; + private static Logger logger = LogManager.getLogger(HoodieMetrics.class); + // Some timers + public String rollbackTimerName = null; + public String cleanTimerName = null; + public String commitTimerName = null; + private Timer rollbackTimer = null; + private Timer cleanTimer = null; + private Timer commitTimer = null; + + public HoodieMetrics(HoodieWriteConfig config, String tableName) { + this.config = config; + this.tableName = tableName; + if (config.isMetricsOn()) { + Metrics.init(config); + this.rollbackTimerName = getMetricsName("timer", "rollback"); + this.cleanTimerName = getMetricsName("timer", "clean"); + this.commitTimerName = getMetricsName("timer", "commit"); + } + } + + private Timer createTimer(String name) { + return config.isMetricsOn() ? 
Metrics.getInstance().getRegistry().timer(name) : null; + } + + public Timer.Context getRollbackCtx() { + if (config.isMetricsOn() && rollbackTimer == null) { + rollbackTimer = createTimer(rollbackTimerName); + } + return rollbackTimer == null ? null : rollbackTimer.time(); + } + + public Timer.Context getCleanCtx() { + if (config.isMetricsOn() && cleanTimer == null) { + cleanTimer = createTimer(cleanTimerName); + } + return cleanTimer == null ? null : cleanTimer.time(); + } + + public Timer.Context getCommitCtx() { + if (config.isMetricsOn() && commitTimer == null) { + commitTimer = createTimer(commitTimerName); + } + return commitTimer == null ? null : commitTimer.time(); + } + + public void updateCommitMetrics(long commitEpochTimeInMs, long durationInMs, + HoodieCommitMetadata metadata) { + if (config.isMetricsOn()) { + long totalPartitionsWritten = metadata.fetchTotalPartitionsWritten(); + long totalFilesInsert = metadata.fetchTotalFilesInsert(); + long totalFilesUpdate = metadata.fetchTotalFilesUpdated(); + long totalRecordsWritten = metadata.fetchTotalRecordsWritten(); + long totalUpdateRecordsWritten = metadata.fetchTotalUpdateRecordsWritten(); + long totalInsertRecordsWritten = metadata.fetchTotalInsertRecordsWritten(); + long totalBytesWritten = metadata.fetchTotalBytesWritten(); + registerGauge(getMetricsName("commit", "duration"), durationInMs); + registerGauge(getMetricsName("commit", "totalPartitionsWritten"), totalPartitionsWritten); + registerGauge(getMetricsName("commit", "totalFilesInsert"), totalFilesInsert); + registerGauge(getMetricsName("commit", "totalFilesUpdate"), totalFilesUpdate); + registerGauge(getMetricsName("commit", "totalRecordsWritten"), totalRecordsWritten); + registerGauge(getMetricsName("commit", "totalUpdateRecordsWritten"), + totalUpdateRecordsWritten); + registerGauge(getMetricsName("commit", "totalInsertRecordsWritten"), + totalInsertRecordsWritten); + registerGauge(getMetricsName("commit", "totalBytesWritten"), 
totalBytesWritten); + registerGauge(getMetricsName("commit", "commitTime"), commitEpochTimeInMs); + } + } + + public void updateRollbackMetrics(long durationInMs, long numFilesDeleted) { + if (config.isMetricsOn()) { + logger.info(String.format("Sending rollback metrics (duration=%d, numFilesDeleted=%d)", + durationInMs, numFilesDeleted)); + registerGauge(getMetricsName("rollback", "duration"), durationInMs); + registerGauge(getMetricsName("rollback", "numFilesDeleted"), numFilesDeleted); + } + } + + public void updateCleanMetrics(long durationInMs, int numFilesDeleted) { + if (config.isMetricsOn()) { + logger.info(String.format("Sending clean metrics (duration=%d, numFilesDeleted=%d)", + durationInMs, numFilesDeleted)); + registerGauge(getMetricsName("clean", "duration"), durationInMs); + registerGauge(getMetricsName("clean", "numFilesDeleted"), numFilesDeleted); + } + } + + @VisibleForTesting + String getMetricsName(String action, String metric) { + return config == null ? null : + String.format("%s.%s.%s", tableName, action, metric); + } + + void registerGauge(String metricName, final long value) { + try { + MetricRegistry registry = Metrics.getInstance().getRegistry(); + registry.register(metricName, new Gauge() { + @Override + public Long getValue() { + return value; } + }); + } catch (Exception e) { + // Here we catch all exception, so the major upsert pipeline will not be affected if the metrics system + // has some issues. + logger.error("Failed to send metrics: ", e); } + } - private Timer createTimer(String name) { - return config.isMetricsOn() ? Metrics.getInstance().getRegistry().timer(name) : null; - } - - public Timer.Context getRollbackCtx() { - if (config.isMetricsOn() && rollbackTimer == null) { - rollbackTimer = createTimer(rollbackTimerName); - } - return rollbackTimer == null ?
null : rollbackTimer.time(); - } - - public Timer.Context getCleanCtx() { - if (config.isMetricsOn() && cleanTimer == null) { - cleanTimer = createTimer(cleanTimerName); - } - return cleanTimer == null ? null : cleanTimer.time(); - } - - public Timer.Context getCommitCtx() { - if (config.isMetricsOn() && commitTimer == null) { - commitTimer = createTimer(commitTimerName); - } - return commitTimer == null ? null : commitTimer.time(); - } - - public void updateCommitMetrics(long commitEpochTimeInMs, long durationInMs, HoodieCommitMetadata metadata) { - if (config.isMetricsOn()) { - long totalPartitionsWritten = metadata.fetchTotalPartitionsWritten(); - long totalFilesInsert = metadata.fetchTotalFilesInsert(); - long totalFilesUpdate = metadata.fetchTotalFilesUpdated(); - long totalRecordsWritten = metadata.fetchTotalRecordsWritten(); - long totalUpdateRecordsWritten = metadata.fetchTotalUpdateRecordsWritten(); - long totalInsertRecordsWritten = metadata.fetchTotalInsertRecordsWritten(); - long totalBytesWritten = metadata.fetchTotalBytesWritten(); - registerGauge(getMetricsName("commit", "duration"), durationInMs); - registerGauge(getMetricsName("commit", "totalPartitionsWritten"), totalPartitionsWritten); - registerGauge(getMetricsName("commit", "totalFilesInsert"), totalFilesInsert); - registerGauge(getMetricsName("commit", "totalFilesUpdate"), totalFilesUpdate); - registerGauge(getMetricsName("commit", "totalRecordsWritten"), totalRecordsWritten); - registerGauge(getMetricsName("commit", "totalUpdateRecordsWritten"), totalUpdateRecordsWritten); - registerGauge(getMetricsName("commit", "totalInsertRecordsWritten"), totalInsertRecordsWritten); - registerGauge(getMetricsName("commit", "totalBytesWritten"), totalBytesWritten); - registerGauge(getMetricsName("commit", "commitTime"), commitEpochTimeInMs); - } - } - - public void updateRollbackMetrics(long durationInMs, long numFilesDeleted) { - if (config.isMetricsOn()) { - logger.info(String.format("Sending rollback 
metrics (duration=%d, numFilesDeleted=$d)", - durationInMs, numFilesDeleted)); - registerGauge(getMetricsName("rollback", "duration"), durationInMs); - registerGauge(getMetricsName("rollback", "numFilesDeleted"), numFilesDeleted); - } - } - - public void updateCleanMetrics(long durationInMs, int numFilesDeleted) { - if (config.isMetricsOn()) { - logger.info(String.format("Sending clean metrics (duration=%d, numFilesDeleted=%d)", - durationInMs, numFilesDeleted)); - registerGauge(getMetricsName("clean", "duration"), durationInMs); - registerGauge(getMetricsName("clean", "numFilesDeleted"), numFilesDeleted); - } - } - - @VisibleForTesting - String getMetricsName(String action, String metric) { - return config == null ? null : - String.format("%s.%s.%s", tableName, action, metric); - } - - void registerGauge(String metricName, final long value) { - try { - MetricRegistry registry = Metrics.getInstance().getRegistry(); - registry.register(metricName, new Gauge() { - @Override - public Long getValue() { - return value; - } - }); - } catch (Exception e) { - // Here we catch all exception, so the major upsert pipeline will not be affected if the metrics system - // has some issues. - logger.error("Failed to send metrics: ", e); - } - } - - /** - * By default, the timer context returns duration with nano seconds. - * Convert it to millisecond. - */ - public long getDurationInMs(long ctxDuration) { - return ctxDuration / 1000000; - } + /** + * By default, the timer context returns duration with nano seconds. Convert it to millisecond. 
+ */ + public long getDurationInMs(long ctxDuration) { + return ctxDuration / 1000000; + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/metrics/InMemoryMetricsReporter.java b/hoodie-client/src/main/java/com/uber/hoodie/metrics/InMemoryMetricsReporter.java index e3511b523..282c592cc 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/metrics/InMemoryMetricsReporter.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/metrics/InMemoryMetricsReporter.java @@ -22,16 +22,17 @@ import java.io.Closeable; * Used for testing. */ public class InMemoryMetricsReporter extends MetricsReporter { - @Override - public void start() { - } - @Override - public void report() { - } + @Override + public void start() { + } - @Override - public Closeable getReporter() { - return null; - } + @Override + public void report() { + } + + @Override + public Closeable getReporter() { + return null; + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/metrics/Metrics.java b/hoodie-client/src/main/java/com/uber/hoodie/metrics/Metrics.java index 337d21c16..c5a357317 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/metrics/Metrics.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/metrics/Metrics.java @@ -19,65 +19,64 @@ package com.uber.hoodie.metrics; import com.codahale.metrics.MetricRegistry; import com.google.common.io.Closeables; import com.uber.hoodie.config.HoodieWriteConfig; -import com.uber.hoodie.config.HoodieMetricsConfig; import com.uber.hoodie.exception.HoodieException; -import org.apache.commons.configuration.ConfigurationException; - import java.io.Closeable; +import org.apache.commons.configuration.ConfigurationException; /** * This is the main class of the metrics system. 
*/ public class Metrics { - private static volatile boolean initialized = false; - private static Metrics metrics = null; - private final MetricRegistry registry; - private MetricsReporter reporter = null; - private Metrics(HoodieWriteConfig metricConfig) throws ConfigurationException { - registry = new MetricRegistry(); + private static volatile boolean initialized = false; + private static Metrics metrics = null; + private final MetricRegistry registry; + private MetricsReporter reporter = null; - reporter = MetricsReporterFactory.createReporter(metricConfig, registry); - if (reporter == null) { - throw new RuntimeException("Cannot initialize Reporter."); - } + private Metrics(HoodieWriteConfig metricConfig) throws ConfigurationException { + registry = new MetricRegistry(); + + reporter = MetricsReporterFactory.createReporter(metricConfig, registry); + if (reporter == null) { + throw new RuntimeException("Cannot initialize Reporter."); + } // reporter.start(); - Runtime.getRuntime().addShutdownHook(new Thread() { - @Override - public void run() { - try { - reporter.report(); - Closeables.close(reporter.getReporter(), true); - } catch (Exception e) { - e.printStackTrace(); - } - } - }); - } - - public static Metrics getInstance() { - assert initialized; - return metrics; - } - - public static synchronized void init(HoodieWriteConfig metricConfig) { - if (initialized) { - return; - } + Runtime.getRuntime().addShutdownHook(new Thread() { + @Override + public void run() { try { - metrics = new Metrics(metricConfig); - } catch (ConfigurationException e) { - throw new HoodieException(e); + reporter.report(); + Closeables.close(reporter.getReporter(), true); + } catch (Exception e) { + e.printStackTrace(); } - initialized = true; - } + } + }); + } - public MetricRegistry getRegistry() { - return registry; - } + public static Metrics getInstance() { + assert initialized; + return metrics; + } - public Closeable getReporter() { - return reporter.getReporter(); + public 
static synchronized void init(HoodieWriteConfig metricConfig) { + if (initialized) { + return; } + try { + metrics = new Metrics(metricConfig); + } catch (ConfigurationException e) { + throw new HoodieException(e); + } + initialized = true; + } + + public MetricRegistry getRegistry() { + return registry; + } + + public Closeable getReporter() { + return reporter.getReporter(); + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/metrics/MetricsGraphiteReporter.java b/hoodie-client/src/main/java/com/uber/hoodie/metrics/MetricsGraphiteReporter.java index aeb5464d4..bc7d024a5 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/metrics/MetricsGraphiteReporter.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/metrics/MetricsGraphiteReporter.java @@ -21,75 +21,74 @@ import com.codahale.metrics.MetricRegistry; import com.codahale.metrics.graphite.Graphite; import com.codahale.metrics.graphite.GraphiteReporter; import com.uber.hoodie.config.HoodieWriteConfig; - -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - import java.io.Closeable; import java.net.InetSocketAddress; import java.util.concurrent.TimeUnit; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; /** - * Implementation of Graphite reporter, which connects to the Graphite server, - * and send metrics to that server. + * Implementation of Graphite reporter, which connects to the Graphite server, and send metrics to + * that server. 
*/ public class MetricsGraphiteReporter extends MetricsReporter { - private final MetricRegistry registry; - private final GraphiteReporter graphiteReporter; - private final HoodieWriteConfig config; - private String serverHost; - private int serverPort; - private static Logger logger = LogManager.getLogger(MetricsGraphiteReporter.class); + private final MetricRegistry registry; + private final GraphiteReporter graphiteReporter; + private final HoodieWriteConfig config; + private String serverHost; + private int serverPort; - public MetricsGraphiteReporter(HoodieWriteConfig config, MetricRegistry registry) { - this.registry = registry; - this.config = config; + private static Logger logger = LogManager.getLogger(MetricsGraphiteReporter.class); - // Check the serverHost and serverPort here - this.serverHost = config.getGraphiteServerHost(); - this.serverPort = config.getGraphiteServerPort(); - if (serverHost == null || serverPort == 0) { - throw new RuntimeException( - String.format("Graphite cannot be initialized with serverHost[%s] and serverPort[%s].", - serverHost, serverPort)); - } + public MetricsGraphiteReporter(HoodieWriteConfig config, MetricRegistry registry) { + this.registry = registry; + this.config = config; - this.graphiteReporter = createGraphiteReport(); + // Check the serverHost and serverPort here + this.serverHost = config.getGraphiteServerHost(); + this.serverPort = config.getGraphiteServerPort(); + if (serverHost == null || serverPort == 0) { + throw new RuntimeException( + String.format("Graphite cannot be initialized with serverHost[%s] and serverPort[%s].", + serverHost, serverPort)); } - @Override - public void start() { - if (graphiteReporter != null) { - graphiteReporter.start(30, TimeUnit.SECONDS); - } else { - logger.error("Cannot start as the graphiteReporter is null."); - } - } + this.graphiteReporter = createGraphiteReport(); + } - @Override - public void report() { - if (graphiteReporter != null) { - graphiteReporter.report(); - } 
else { - logger.error("Cannot report metrics as the graphiteReporter is null."); - } + @Override + public void start() { + if (graphiteReporter != null) { + graphiteReporter.start(30, TimeUnit.SECONDS); + } else { + logger.error("Cannot start as the graphiteReporter is null."); } + } - @Override - public Closeable getReporter() { - return graphiteReporter; + @Override + public void report() { + if (graphiteReporter != null) { + graphiteReporter.report(); + } else { + logger.error("Cannot report metrics as the graphiteReporter is null."); } + } - private GraphiteReporter createGraphiteReport() { - Graphite graphite = new Graphite( - new InetSocketAddress(serverHost, serverPort)); - String reporterPrefix = config.getGraphiteMetricPrefix(); - return GraphiteReporter.forRegistry(registry) - .prefixedWith(reporterPrefix) - .convertRatesTo(TimeUnit.SECONDS) - .convertDurationsTo(TimeUnit.MILLISECONDS) - .filter(MetricFilter.ALL) - .build(graphite); - } + @Override + public Closeable getReporter() { + return graphiteReporter; + } + + private GraphiteReporter createGraphiteReport() { + Graphite graphite = new Graphite( + new InetSocketAddress(serverHost, serverPort)); + String reporterPrefix = config.getGraphiteMetricPrefix(); + return GraphiteReporter.forRegistry(registry) + .prefixedWith(reporterPrefix) + .convertRatesTo(TimeUnit.SECONDS) + .convertDurationsTo(TimeUnit.MILLISECONDS) + .filter(MetricFilter.ALL) + .build(graphite); + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/metrics/MetricsReporter.java b/hoodie-client/src/main/java/com/uber/hoodie/metrics/MetricsReporter.java index 719e7c6a3..22c0a076a 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/metrics/MetricsReporter.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/metrics/MetricsReporter.java @@ -22,15 +22,16 @@ import java.io.Closeable; * Interface for implementing a Reporter. 
*/ public abstract class MetricsReporter { - /** - * Push out metrics at scheduled intervals - */ - public abstract void start(); - /** - * Deterministically push out metrics - */ - public abstract void report(); + /** + * Push out metrics at scheduled intervals + */ + public abstract void start(); - public abstract Closeable getReporter(); + /** + * Deterministically push out metrics + */ + public abstract void report(); + + public abstract Closeable getReporter(); } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/metrics/MetricsReporterFactory.java b/hoodie-client/src/main/java/com/uber/hoodie/metrics/MetricsReporterFactory.java index 3c0d9e667..31bc555c7 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/metrics/MetricsReporterFactory.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/metrics/MetricsReporterFactory.java @@ -18,7 +18,6 @@ package com.uber.hoodie.metrics; import com.codahale.metrics.MetricRegistry; import com.uber.hoodie.config.HoodieWriteConfig; - import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -26,23 +25,24 @@ import org.apache.log4j.Logger; * Factory class for creating MetricsReporter. 
*/ public class MetricsReporterFactory { - private static Logger logger = LogManager.getLogger(MetricsReporterFactory.class); - public static MetricsReporter createReporter(HoodieWriteConfig config, - MetricRegistry registry) { - MetricsReporterType type = config.getMetricsReporterType(); - MetricsReporter reporter = null; - switch (type) { - case GRAPHITE: - reporter = new MetricsGraphiteReporter(config, registry); - break; - case INMEMORY: - reporter = new InMemoryMetricsReporter(); - break; - default: - logger.error("Reporter type[" + type + "] is not supported."); - break; - } - return reporter; + private static Logger logger = LogManager.getLogger(MetricsReporterFactory.class); + + public static MetricsReporter createReporter(HoodieWriteConfig config, + MetricRegistry registry) { + MetricsReporterType type = config.getMetricsReporterType(); + MetricsReporter reporter = null; + switch (type) { + case GRAPHITE: + reporter = new MetricsGraphiteReporter(config, registry); + break; + case INMEMORY: + reporter = new InMemoryMetricsReporter(); + break; + default: + logger.error("Reporter type[" + type + "] is not supported."); + break; } + return reporter; + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/metrics/MetricsReporterType.java b/hoodie-client/src/main/java/com/uber/hoodie/metrics/MetricsReporterType.java index cac162cec..46e128ddb 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/metrics/MetricsReporterType.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/metrics/MetricsReporterType.java @@ -17,10 +17,10 @@ package com.uber.hoodie.metrics; /** - * Types of the reporter. Right now we only support Graphite. - * We can include JMX and CSV in the future. + * Types of the reporter. Right now we only support Graphite. We can include JMX and CSV in the + * future. 
*/ public enum MetricsReporterType { - GRAPHITE, - INMEMORY + GRAPHITE, + INMEMORY } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieCopyOnWriteTable.java b/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieCopyOnWriteTable.java index 9c2a80e9b..3bcce373b 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieCopyOnWriteTable.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieCopyOnWriteTable.java @@ -70,626 +70,626 @@ import org.apache.spark.api.java.function.PairFlatMapFunction; import scala.Option; import scala.Tuple2; -import java.io.IOException; -import java.io.Serializable; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.Set; -import java.util.stream.Collectors; - /** * Implementation of a very heavily read-optimized Hoodie Table where * - * INSERTS - Produce new files, block aligned to desired size (or) - * Merge with the smallest existing file, to expand it + * INSERTS - Produce new files, block aligned to desired size (or) Merge with the smallest existing + * file, to expand it * * UPDATES - Produce a new version of the file, just replacing the updated records with new values - * */ public class HoodieCopyOnWriteTable extends HoodieTable { - public HoodieCopyOnWriteTable(HoodieWriteConfig config, HoodieTableMetaClient metaClient) { - super(config, metaClient); - } - private static Logger logger = LogManager.getLogger(HoodieCopyOnWriteTable.class); + public HoodieCopyOnWriteTable(HoodieWriteConfig config, HoodieTableMetaClient metaClient) { + super(config, metaClient); + } - enum BucketType { - UPDATE, - INSERT + private static Logger logger = LogManager.getLogger(HoodieCopyOnWriteTable.class); + + enum BucketType { + UPDATE, + INSERT + } + + /** + * Helper class for a small file's location and its actual size on disk + */ + class SmallFile 
implements Serializable { + + HoodieRecordLocation location; + long sizeBytes; + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("SmallFile {"); + sb.append("location=").append(location).append(", "); + sb.append("sizeBytes=").append(sizeBytes); + sb.append('}'); + return sb.toString(); } + } + + /** + * Helper class for an insert bucket along with the weight [0.0, 0.1] that defines the amount of + * incoming inserts that should be allocated to the bucket + */ + class InsertBucket implements Serializable { + + int bucketNumber; + // fraction of total inserts, that should go into this bucket + double weight; + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("WorkloadStat {"); + sb.append("bucketNumber=").append(bucketNumber).append(", "); + sb.append("weight=").append(weight); + sb.append('}'); + return sb.toString(); + } + } + + /** + * Helper class for a bucket's type (INSERT and UPDATE) and its file location + */ + class BucketInfo implements Serializable { + + BucketType bucketType; + String fileLoc; + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("BucketInfo {"); + sb.append("bucketType=").append(bucketType).append(", "); + sb.append("fileLoc=").append(fileLoc); + sb.append('}'); + return sb.toString(); + } + } + + + /** + * Packs incoming records to be upserted, into buckets (1 bucket = 1 RDD partition) + */ + class UpsertPartitioner extends Partitioner { /** - * Helper class for a small file's location and its actual size on disk + * Total number of RDD partitions, is determined by total buckets we want to pack the incoming + * workload into */ - class SmallFile implements Serializable { - HoodieRecordLocation location; - long sizeBytes; - - @Override - public String toString() { - final StringBuilder sb = new StringBuilder("SmallFile {"); - sb.append("location=").append(location).append(", "); - sb.append("sizeBytes=").append(sizeBytes); - 
sb.append('}'); - return sb.toString(); - } - } + private int totalBuckets = 0; /** - * Helper class for an insert bucket along with the weight [0.0, 0.1] - * that defines the amount of incoming inserts that should be allocated to - * the bucket + * Stat for the current workload. Helps in determining total inserts, upserts etc. */ - class InsertBucket implements Serializable { - int bucketNumber; - // fraction of total inserts, that should go into this bucket - double weight; - - @Override - public String toString() { - final StringBuilder sb = new StringBuilder("WorkloadStat {"); - sb.append("bucketNumber=").append(bucketNumber).append(", "); - sb.append("weight=").append(weight); - sb.append('}'); - return sb.toString(); - } - } + private WorkloadStat globalStat; /** - * Helper class for a bucket's type (INSERT and UPDATE) and its file location + * Helps decide which bucket an incoming update should go to. */ - class BucketInfo implements Serializable { - BucketType bucketType; - String fileLoc; + private HashMap updateLocationToBucket; - @Override - public String toString() { - final StringBuilder sb = new StringBuilder("BucketInfo {"); - sb.append("bucketType=").append(bucketType).append(", "); - sb.append("fileLoc=").append(fileLoc); - sb.append('}'); - return sb.toString(); + + /** + * Helps us pack inserts into 1 or more buckets depending on number of incoming records. + */ + private HashMap> partitionPathToInsertBuckets; + + + /** + * Remembers what type each bucket is for later. 
+ */ + private HashMap bucketInfoMap; + + UpsertPartitioner(WorkloadProfile profile) { + updateLocationToBucket = new HashMap<>(); + partitionPathToInsertBuckets = new HashMap<>(); + bucketInfoMap = new HashMap<>(); + globalStat = profile.getGlobalStat(); + + assignUpdates(profile); + assignInserts(profile); + + logger.info("Total Buckets :" + totalBuckets + ", " + + "buckets info => " + bucketInfoMap + ", \n" + + "Partition to insert buckets => " + partitionPathToInsertBuckets + ", \n" + + "UpdateLocations mapped to buckets =>" + updateLocationToBucket); + } + + private void assignUpdates(WorkloadProfile profile) { + // each update location gets a partition + WorkloadStat gStat = profile.getGlobalStat(); + for (Map.Entry updateLocEntry : gStat.getUpdateLocationToCount().entrySet()) { + addUpdateBucket(updateLocEntry.getKey()); + } + } + + private int addUpdateBucket(String fileLoc) { + int bucket = totalBuckets; + updateLocationToBucket.put(fileLoc, bucket); + BucketInfo bucketInfo = new BucketInfo(); + bucketInfo.bucketType = BucketType.UPDATE; + bucketInfo.fileLoc = fileLoc; + bucketInfoMap.put(totalBuckets, bucketInfo); + totalBuckets++; + return bucket; + } + + private void assignInserts(WorkloadProfile profile) { + // for new inserts, compute buckets depending on how many records we have for each partition + Set partitionPaths = profile.getPartitionPaths(); + long averageRecordSize = averageBytesPerRecord(); + logger.info("AvgRecordSize => " + averageRecordSize); + for (String partitionPath : partitionPaths) { + WorkloadStat pStat = profile.getWorkloadStat(partitionPath); + if (pStat.getNumInserts() > 0) { + + List smallFiles = getSmallFiles(partitionPath); + logger.info("For partitionPath : " + partitionPath + " Small Files => " + smallFiles); + + long totalUnassignedInserts = pStat.getNumInserts(); + List bucketNumbers = new ArrayList<>(); + List recordsPerBucket = new ArrayList<>(); + + // first try packing this into one of the smallFiles + for (SmallFile 
smallFile : smallFiles) { + long recordsToAppend = Math + .min((config.getParquetMaxFileSize() - smallFile.sizeBytes) / averageRecordSize, + totalUnassignedInserts); + if (recordsToAppend > 0 && totalUnassignedInserts > 0) { + // create a new bucket or re-use an existing bucket + int bucket; + if (updateLocationToBucket.containsKey(smallFile.location.getFileId())) { + bucket = updateLocationToBucket.get(smallFile.location.getFileId()); + logger.info("Assigning " + recordsToAppend + " inserts to existing update bucket " + + bucket); + } else { + bucket = addUpdateBucket(smallFile.location.getFileId()); + logger.info( + "Assigning " + recordsToAppend + " inserts to new update bucket " + bucket); + } + bucketNumbers.add(bucket); + recordsPerBucket.add(recordsToAppend); + totalUnassignedInserts -= recordsToAppend; + } + } + + // if we have anything more, create new insert buckets, like normal + if (totalUnassignedInserts > 0) { + long insertRecordsPerBucket = config.getCopyOnWriteInsertSplitSize(); + if (config.shouldAutoTuneInsertSplits()) { + insertRecordsPerBucket = config.getParquetMaxFileSize() / averageRecordSize; + } + + int insertBuckets = (int) Math.max(totalUnassignedInserts / insertRecordsPerBucket, 1L); + logger + .info("After small file assignment: unassignedInserts => " + totalUnassignedInserts + + ", totalInsertBuckets => " + insertBuckets + + ", recordsPerBucket => " + insertRecordsPerBucket); + for (int b = 0; b < insertBuckets; b++) { + bucketNumbers.add(totalBuckets); + recordsPerBucket.add(totalUnassignedInserts / insertBuckets); + BucketInfo bucketInfo = new BucketInfo(); + bucketInfo.bucketType = BucketType.INSERT; + bucketInfoMap.put(totalBuckets, bucketInfo); + totalBuckets++; + } + } + + // Go over all such buckets, and assign weights as per amount of incoming inserts. 
+ List insertBuckets = new ArrayList<>(); + for (int i = 0; i < bucketNumbers.size(); i++) { + InsertBucket bkt = new InsertBucket(); + bkt.bucketNumber = bucketNumbers.get(i); + bkt.weight = (1.0 * recordsPerBucket.get(i)) / pStat.getNumInserts(); + insertBuckets.add(bkt); + } + logger.info( + "Total insert buckets for partition path " + partitionPath + " => " + insertBuckets); + partitionPathToInsertBuckets.put(partitionPath, insertBuckets); } + } } /** - * Packs incoming records to be upserted, into buckets (1 bucket = 1 RDD partition) + * Returns a list of small files in the given partition path */ - class UpsertPartitioner extends Partitioner { + private List getSmallFiles(String partitionPath) { + List smallFileLocations = new ArrayList<>(); - /** - * Total number of RDD partitions, is determined by total buckets we want to - * pack the incoming workload into - */ - private int totalBuckets = 0; + HoodieTimeline commitTimeline = getCompletedCommitTimeline(); - /** - * Stat for the current workload. Helps in determining total inserts, upserts etc. - */ - private WorkloadStat globalStat; + if (!commitTimeline.empty()) { // if we have some commits + HoodieInstant latestCommitTime = commitTimeline.lastInstant().get(); + List allFiles = getROFileSystemView() + .getLatestDataFilesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp()) + .collect(Collectors.toList()); - /** - * Helps decide which bucket an incoming update should go to. - */ - private HashMap updateLocationToBucket; - - - /** - * Helps us pack inserts into 1 or more buckets depending on number of - * incoming records. - */ - private HashMap> partitionPathToInsertBuckets; - - - /** - * Remembers what type each bucket is for later. 
- */ - private HashMap bucketInfoMap; - - UpsertPartitioner(WorkloadProfile profile) { - updateLocationToBucket = new HashMap<>(); - partitionPathToInsertBuckets = new HashMap<>(); - bucketInfoMap = new HashMap<>(); - globalStat = profile.getGlobalStat(); - - assignUpdates(profile); - assignInserts(profile); - - logger.info("Total Buckets :" + totalBuckets + ", " + - "buckets info => " + bucketInfoMap + ", \n" + - "Partition to insert buckets => " + partitionPathToInsertBuckets + ", \n" + - "UpdateLocations mapped to buckets =>" + updateLocationToBucket); + for (HoodieDataFile file : allFiles) { + if (file.getFileSize() < config.getParquetSmallFileLimit()) { + String filename = file.getFileName(); + SmallFile sf = new SmallFile(); + sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename), + FSUtils.getFileId(filename)); + sf.sizeBytes = file.getFileSize(); + smallFileLocations.add(sf); + } } + } - private void assignUpdates(WorkloadProfile profile) { - // each update location gets a partition - WorkloadStat gStat = profile.getGlobalStat(); - for (Map.Entry updateLocEntry : gStat.getUpdateLocationToCount().entrySet()) { - addUpdateBucket(updateLocEntry.getKey()); - } - } - - private int addUpdateBucket(String fileLoc) { - int bucket = totalBuckets; - updateLocationToBucket.put(fileLoc, bucket); - BucketInfo bucketInfo = new BucketInfo(); - bucketInfo.bucketType = BucketType.UPDATE; - bucketInfo.fileLoc = fileLoc; - bucketInfoMap.put(totalBuckets, bucketInfo); - totalBuckets++; - return bucket; - } - - private void assignInserts(WorkloadProfile profile) { - // for new inserts, compute buckets depending on how many records we have for each partition - Set partitionPaths = profile.getPartitionPaths(); - long averageRecordSize = averageBytesPerRecord(); - logger.info("AvgRecordSize => " + averageRecordSize); - for (String partitionPath : partitionPaths) { - WorkloadStat pStat = profile.getWorkloadStat(partitionPath); - if (pStat.getNumInserts() > 0) { - - 
List smallFiles = getSmallFiles(partitionPath); - logger.info("For partitionPath : "+ partitionPath + " Small Files => " + smallFiles); - - long totalUnassignedInserts = pStat.getNumInserts(); - List bucketNumbers = new ArrayList<>(); - List recordsPerBucket = new ArrayList<>(); - - // first try packing this into one of the smallFiles - for (SmallFile smallFile: smallFiles) { - long recordsToAppend = Math.min((config.getParquetMaxFileSize() - smallFile.sizeBytes)/ averageRecordSize, totalUnassignedInserts); - if (recordsToAppend > 0 && totalUnassignedInserts > 0){ - // create a new bucket or re-use an existing bucket - int bucket; - if (updateLocationToBucket.containsKey(smallFile.location.getFileId())) { - bucket = updateLocationToBucket.get(smallFile.location.getFileId()); - logger.info("Assigning " + recordsToAppend + " inserts to existing update bucket "+ bucket); - } else { - bucket = addUpdateBucket(smallFile.location.getFileId()); - logger.info("Assigning " + recordsToAppend + " inserts to new update bucket "+ bucket); - } - bucketNumbers.add(bucket); - recordsPerBucket.add(recordsToAppend); - totalUnassignedInserts -= recordsToAppend; - } - } - - // if we have anything more, create new insert buckets, like normal - if (totalUnassignedInserts > 0) { - long insertRecordsPerBucket = config.getCopyOnWriteInsertSplitSize(); - if (config.shouldAutoTuneInsertSplits()) { - insertRecordsPerBucket = config.getParquetMaxFileSize()/averageRecordSize; - } - - int insertBuckets = (int) Math.max(totalUnassignedInserts / insertRecordsPerBucket, 1L); - logger.info("After small file assignment: unassignedInserts => " + totalUnassignedInserts - + ", totalInsertBuckets => " + insertBuckets - + ", recordsPerBucket => " + insertRecordsPerBucket); - for (int b = 0; b < insertBuckets; b++) { - bucketNumbers.add(totalBuckets); - recordsPerBucket.add(totalUnassignedInserts/insertBuckets); - BucketInfo bucketInfo = new BucketInfo(); - bucketInfo.bucketType = BucketType.INSERT; - 
bucketInfoMap.put(totalBuckets, bucketInfo); - totalBuckets++; - } - } - - // Go over all such buckets, and assign weights as per amount of incoming inserts. - List insertBuckets = new ArrayList<>(); - for (int i = 0; i < bucketNumbers.size(); i++) { - InsertBucket bkt = new InsertBucket(); - bkt.bucketNumber = bucketNumbers.get(i); - bkt.weight = (1.0 * recordsPerBucket.get(i))/pStat.getNumInserts(); - insertBuckets.add(bkt); - } - logger.info("Total insert buckets for partition path "+ partitionPath + " => " + insertBuckets); - partitionPathToInsertBuckets.put(partitionPath, insertBuckets); - } - } - } - - - /** - * Returns a list of small files in the given partition path - * - * @param partitionPath - * @return - */ - private List getSmallFiles(String partitionPath) { - List smallFileLocations = new ArrayList<>(); - - HoodieTimeline commitTimeline = getCompletedCommitTimeline(); - - if (!commitTimeline.empty()) { // if we have some commits - HoodieInstant latestCommitTime = commitTimeline.lastInstant().get(); - List allFiles = getROFileSystemView() - .getLatestDataFilesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp()) - .collect(Collectors.toList()); - - for (HoodieDataFile file : allFiles) { - if (file.getFileSize() < config.getParquetSmallFileLimit()) { - String filename = file.getFileName(); - SmallFile sf = new SmallFile(); - sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename), - FSUtils.getFileId(filename)); - sf.sizeBytes = file.getFileSize(); - smallFileLocations.add(sf); - } - } - } - - return smallFileLocations; - } - - /** - * Obtains the average record size based on records written during last commit. - * Used for estimating how many records pack into one file. 
- * - * @return - */ - private long averageBytesPerRecord() { - long avgSize = 0L; - HoodieTimeline commitTimeline = - metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(); - try { - if (!commitTimeline.empty()) { - HoodieInstant latestCommitTime = commitTimeline.lastInstant().get(); - HoodieCommitMetadata commitMetadata = HoodieCommitMetadata - .fromBytes(commitTimeline.getInstantDetails(latestCommitTime).get()); - avgSize = (long) Math.ceil( - (1.0 * commitMetadata.fetchTotalBytesWritten()) / commitMetadata - .fetchTotalRecordsWritten()); - } - } catch (Throwable t) { - // make this fail safe. - logger.error("Error trying to compute average bytes/record ", t); - } - return avgSize <= 0L ? config.getCopyOnWriteRecordSizeEstimate() : avgSize; - } - - public BucketInfo getBucketInfo(int bucketNumber) { - return bucketInfoMap.get(bucketNumber); - } - - public List getInsertBuckets(String partitionPath) { - return partitionPathToInsertBuckets.get(partitionPath); - } - - @Override - public int numPartitions() { - return totalBuckets; - } - - @Override - public int getPartition(Object key) { - Tuple2> keyLocation = (Tuple2>) key; - if (keyLocation._2().isDefined()) { - HoodieRecordLocation location = keyLocation._2().get(); - return updateLocationToBucket.get(location.getFileId()); - } else { - List targetBuckets = partitionPathToInsertBuckets.get(keyLocation._1().getPartitionPath()); - // pick the target bucket to use based on the weights. 
- double totalWeight = 0.0; - final long totalInserts = Math.max(1, globalStat.getNumInserts()); - final long hashOfKey = Hashing.md5().hashString(keyLocation._1().getRecordKey(), StandardCharsets.UTF_8).asLong(); - final double r = 1.0 * Math.floorMod(hashOfKey, totalInserts) / totalInserts; - for (InsertBucket insertBucket: targetBuckets) { - totalWeight += insertBucket.weight; - if (r <= totalWeight) { - return insertBucket.bucketNumber; - } - } - // return first one, by default - return targetBuckets.get(0).bucketNumber; - } - } - } - - - @Override - public Partitioner getUpsertPartitioner(WorkloadProfile profile) { - if (profile == null) { - throw new HoodieUpsertException("Need workload profile to construct the upsert partitioner."); - } - return new UpsertPartitioner(profile); - } - - @Override - public Partitioner getInsertPartitioner(WorkloadProfile profile) { - return getUpsertPartitioner(profile); - } - - @Override - public boolean isWorkloadProfileNeeded() { - return true; - } - - - - public Iterator> handleUpdate(String commitTime, String fileLoc, Iterator> recordItr) - throws IOException { - // these are updates - HoodieMergeHandle upsertHandle = getUpdateHandle(commitTime, fileLoc, recordItr); - if (upsertHandle.getOldFilePath() == null) { - throw new HoodieUpsertException("Error in finding the old file path at commit " + - commitTime +" at fileLoc: " + fileLoc); - } else { - Configuration conf = FSUtils.getFs().getConf(); - AvroReadSupport.setAvroReadSchema(conf, upsertHandle.getSchema()); - ParquetReader reader = - AvroParquetReader.builder(upsertHandle.getOldFilePath()).withConf(conf).build(); - try { - IndexedRecord record; - while ((record = reader.read()) != null) { - // Two types of writes here (new record, and old record). - // We have already catch the exception during writing new records. - // But for old records, we should fail if any exception happens. 
- upsertHandle.write((GenericRecord) record); - } - } catch (IOException e) { - throw new HoodieUpsertException( - "Failed to read record from " + upsertHandle.getOldFilePath() - + " with new Schema " + upsertHandle.getSchema(), e); - } finally { - reader.close(); - upsertHandle.close(); - } - } - //TODO(vc): This needs to be revisited - if (upsertHandle.getWriteStatus().getPartitionPath() == null) { - logger.info("Upsert Handle has partition path as null " + upsertHandle.getOldFilePath() - + ", " + upsertHandle.getWriteStatus()); - } - return Collections.singletonList(Collections.singletonList(upsertHandle.getWriteStatus())).iterator(); - } - - protected HoodieMergeHandle getUpdateHandle(String commitTime, String fileLoc, Iterator> recordItr) { - return new HoodieMergeHandle<>(config, commitTime, this, recordItr, fileLoc); - } - - public Iterator> handleInsert(String commitTime, Iterator> recordItr) throws Exception { - return new LazyInsertIterable<>(recordItr, config, commitTime, this); - } - - - @SuppressWarnings("unchecked") - @Override - public Iterator> handleUpsertPartition(String commitTime, Integer partition, - Iterator recordItr, Partitioner partitioner) { - UpsertPartitioner upsertPartitioner = (UpsertPartitioner) partitioner; - BucketInfo binfo = upsertPartitioner.getBucketInfo(partition); - BucketType btype = binfo.bucketType; - try { - if (btype.equals(BucketType.INSERT)) { - return handleInsert(commitTime, recordItr); - } else if (btype.equals(BucketType.UPDATE)) { - return handleUpdate(commitTime, binfo.fileLoc, recordItr); - } else { - throw new HoodieUpsertException("Unknown bucketType " + btype + " for partition :" + partition); - } - } catch (Throwable t) { - String msg = "Error upserting bucketType " + btype + " for partition :" + partition; - logger.error(msg, t); - throw new HoodieUpsertException(msg, t); - } - } - - @Override - public Iterator> handleInsertPartition(String commitTime, Integer partition, - Iterator recordItr, - Partitioner 
partitioner) { - return handleUpsertPartition(commitTime, partition, recordItr, partitioner); - } - - @Override - public Optional compact(JavaSparkContext jsc) { - logger.info("Nothing to compact in COW storage format"); - return Optional.empty(); + return smallFileLocations; } /** - * Performs cleaning of partition paths according to cleaning policy and returns the number - * of files cleaned. Handles skews in partitions to clean by making files to clean as the - * unit of task distribution. - * - * @throws IllegalArgumentException if unknown cleaning policy is provided + * Obtains the average record size based on records written during last commit. Used for + * estimating how many records pack into one file. */ - @Override - public List clean(JavaSparkContext jsc) { - try { - List partitionsToClean = - FSUtils.getAllPartitionPaths(getFs(), getMetaClient().getBasePath(), config.shouldAssumeDatePartitioning()); - logger.info("Partitions to clean up : " + partitionsToClean + ", with policy " + config - .getCleanerPolicy()); - if (partitionsToClean.isEmpty()) { - logger.info("Nothing to clean here mom. It is already clean"); - return Collections.emptyList(); - } - return cleanPartitionPaths(partitionsToClean, jsc); - } catch (IOException e) { - throw new HoodieIOException("Failed to clean up after commit", e); + private long averageBytesPerRecord() { + long avgSize = 0L; + HoodieTimeline commitTimeline = + metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(); + try { + if (!commitTimeline.empty()) { + HoodieInstant latestCommitTime = commitTimeline.lastInstant().get(); + HoodieCommitMetadata commitMetadata = HoodieCommitMetadata + .fromBytes(commitTimeline.getInstantDetails(latestCommitTime).get()); + avgSize = (long) Math.ceil( + (1.0 * commitMetadata.fetchTotalBytesWritten()) / commitMetadata + .fetchTotalRecordsWritten()); } + } catch (Throwable t) { + // make this fail safe. 
+ logger.error("Error trying to compute average bytes/record ", t); + } + return avgSize <= 0L ? config.getCopyOnWriteRecordSizeEstimate() : avgSize; } - /** - * - * Common method used for cleaning out parquet files under a partition path during rollback of a set of commits - * @param partitionPath - * @param commits - * @return - * @throws IOException - */ - protected Map deleteCleanedFiles(String partitionPath, List commits) throws IOException { - logger.info("Cleaning path " + partitionPath); - FileSystem fs = FSUtils.getFs(); - FileStatus[] toBeDeleted = - fs.listStatus(new Path(config.getBasePath(), partitionPath), path -> { - if(!path.toString().contains(".parquet")) { - return false; - } - String fileCommitTime = FSUtils.getCommitTime(path.getName()); - return commits.contains(fileCommitTime); - }); - Map results = Maps.newHashMap(); - for (FileStatus file : toBeDeleted) { - boolean success = fs.delete(file.getPath(), false); - results.put(file, success); - logger.info("Delete file " + file.getPath() + "\t" + success); - } - return results; + public BucketInfo getBucketInfo(int bucketNumber) { + return bucketInfoMap.get(bucketNumber); + } + + public List getInsertBuckets(String partitionPath) { + return partitionPathToInsertBuckets.get(partitionPath); } @Override - public List rollback(JavaSparkContext jsc, List commits) throws IOException { - String actionType = this.getCompactedCommitActionType(); - HoodieActiveTimeline activeTimeline = this.getActiveTimeline(); - List inflights = this.getInflightCommitTimeline().getInstants().map(HoodieInstant::getTimestamp) - .collect(Collectors.toList()); - - // Atomically unpublish all the commits - commits.stream().filter(s -> !inflights.contains(s)) - .map(s -> new HoodieInstant(false, actionType, s)) - .forEach(activeTimeline::revertToInflight); - logger.info("Unpublished " + commits); - - // delete all the data files for all these commits - logger.info("Clean out all parquet files generated for commits: " + 
commits); - List stats = jsc.parallelize( - FSUtils.getAllPartitionPaths(FSUtils.getFs(), this.getMetaClient().getBasePath(), config.shouldAssumeDatePartitioning())) - .map((Function) partitionPath -> { - // Scan all partitions files with this commit time - Map results = deleteCleanedFiles(partitionPath, commits); - return HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath) - .withDeletedFileResults(results).build(); - }).collect(); - - // Remove the rolled back inflight commits - commits.stream().map(s -> new HoodieInstant(true, actionType, s)) - .forEach(activeTimeline::deleteInflight); - logger.info("Deleted inflight commits " + commits); - return stats; + public int numPartitions() { + return totalBuckets; } - private static class PartitionCleanStat implements Serializable { - private final String partitionPath; - private final List deletePathPatterns = new ArrayList<>(); - private final List successDeleteFiles = new ArrayList<>(); - private final List failedDeleteFiles = new ArrayList<>(); - - private PartitionCleanStat(String partitionPath) { - this.partitionPath = partitionPath; + @Override + public int getPartition(Object key) { + Tuple2> keyLocation = (Tuple2>) key; + if (keyLocation._2().isDefined()) { + HoodieRecordLocation location = keyLocation._2().get(); + return updateLocationToBucket.get(location.getFileId()); + } else { + List targetBuckets = partitionPathToInsertBuckets + .get(keyLocation._1().getPartitionPath()); + // pick the target bucket to use based on the weights. 
+ double totalWeight = 0.0; + final long totalInserts = Math.max(1, globalStat.getNumInserts()); + final long hashOfKey = Hashing.md5() + .hashString(keyLocation._1().getRecordKey(), StandardCharsets.UTF_8).asLong(); + final double r = 1.0 * Math.floorMod(hashOfKey, totalInserts) / totalInserts; + for (InsertBucket insertBucket : targetBuckets) { + totalWeight += insertBucket.weight; + if (r <= totalWeight) { + return insertBucket.bucketNumber; + } } + // return first one, by default + return targetBuckets.get(0).bucketNumber; + } + } + } - private void addDeletedFileResult(String deletePathStr, Boolean deletedFileResult) { - if (deletedFileResult) { - successDeleteFiles.add(deletePathStr); - } else { - failedDeleteFiles.add(deletePathStr); - } + + @Override + public Partitioner getUpsertPartitioner(WorkloadProfile profile) { + if (profile == null) { + throw new HoodieUpsertException("Need workload profile to construct the upsert partitioner."); + } + return new UpsertPartitioner(profile); + } + + @Override + public Partitioner getInsertPartitioner(WorkloadProfile profile) { + return getUpsertPartitioner(profile); + } + + @Override + public boolean isWorkloadProfileNeeded() { + return true; + } + + + public Iterator> handleUpdate(String commitTime, String fileLoc, + Iterator> recordItr) + throws IOException { + // these are updates + HoodieMergeHandle upsertHandle = getUpdateHandle(commitTime, fileLoc, recordItr); + if (upsertHandle.getOldFilePath() == null) { + throw new HoodieUpsertException("Error in finding the old file path at commit " + + commitTime + " at fileLoc: " + fileLoc); + } else { + Configuration conf = FSUtils.getFs().getConf(); + AvroReadSupport.setAvroReadSchema(conf, upsertHandle.getSchema()); + ParquetReader reader = + AvroParquetReader.builder(upsertHandle.getOldFilePath()).withConf(conf).build(); + try { + IndexedRecord record; + while ((record = reader.read()) != null) { + // Two types of writes here (new record, and old record). 
+ // We have already catch the exception during writing new records. + // But for old records, we should fail if any exception happens. + upsertHandle.write((GenericRecord) record); } + } catch (IOException e) { + throw new HoodieUpsertException( + "Failed to read record from " + upsertHandle.getOldFilePath() + + " with new Schema " + upsertHandle.getSchema(), e); + } finally { + reader.close(); + upsertHandle.close(); + } + } + //TODO(vc): This needs to be revisited + if (upsertHandle.getWriteStatus().getPartitionPath() == null) { + logger.info("Upsert Handle has partition path as null " + upsertHandle.getOldFilePath() + + ", " + upsertHandle.getWriteStatus()); + } + return Collections.singletonList(Collections.singletonList(upsertHandle.getWriteStatus())) + .iterator(); + } - private void addDeleteFilePatterns(String deletePathStr) { - deletePathPatterns.add(deletePathStr); + protected HoodieMergeHandle getUpdateHandle(String commitTime, String fileLoc, + Iterator> recordItr) { + return new HoodieMergeHandle<>(config, commitTime, this, recordItr, fileLoc); + } + + public Iterator> handleInsert(String commitTime, + Iterator> recordItr) throws Exception { + return new LazyInsertIterable<>(recordItr, config, commitTime, this); + } + + + @SuppressWarnings("unchecked") + @Override + public Iterator> handleUpsertPartition(String commitTime, Integer partition, + Iterator recordItr, Partitioner partitioner) { + UpsertPartitioner upsertPartitioner = (UpsertPartitioner) partitioner; + BucketInfo binfo = upsertPartitioner.getBucketInfo(partition); + BucketType btype = binfo.bucketType; + try { + if (btype.equals(BucketType.INSERT)) { + return handleInsert(commitTime, recordItr); + } else if (btype.equals(BucketType.UPDATE)) { + return handleUpdate(commitTime, binfo.fileLoc, recordItr); + } else { + throw new HoodieUpsertException( + "Unknown bucketType " + btype + " for partition :" + partition); + } + } catch (Throwable t) { + String msg = "Error upserting bucketType " + 
btype + " for partition :" + partition; + logger.error(msg, t); + throw new HoodieUpsertException(msg, t); + } + } + + @Override + public Iterator> handleInsertPartition(String commitTime, Integer partition, + Iterator recordItr, + Partitioner partitioner) { + return handleUpsertPartition(commitTime, partition, recordItr, partitioner); + } + + @Override + public Optional compact(JavaSparkContext jsc) { + logger.info("Nothing to compact in COW storage format"); + return Optional.empty(); + } + + /** + * Performs cleaning of partition paths according to cleaning policy and returns the number of + * files cleaned. Handles skews in partitions to clean by making files to clean as the unit of + * task distribution. + * + * @throws IllegalArgumentException if unknown cleaning policy is provided + */ + @Override + public List clean(JavaSparkContext jsc) { + try { + List partitionsToClean = + FSUtils.getAllPartitionPaths(getFs(), getMetaClient().getBasePath(), + config.shouldAssumeDatePartitioning()); + logger.info("Partitions to clean up : " + partitionsToClean + ", with policy " + config + .getCleanerPolicy()); + if (partitionsToClean.isEmpty()) { + logger.info("Nothing to clean here mom. 
It is already clean"); + return Collections.emptyList(); + } + return cleanPartitionPaths(partitionsToClean, jsc); + } catch (IOException e) { + throw new HoodieIOException("Failed to clean up after commit", e); + } + } + + /** + * Common method used for cleaning out parquet files under a partition path during rollback of a + * set of commits + */ + protected Map deleteCleanedFiles(String partitionPath, List commits) + throws IOException { + logger.info("Cleaning path " + partitionPath); + FileSystem fs = FSUtils.getFs(); + FileStatus[] toBeDeleted = + fs.listStatus(new Path(config.getBasePath(), partitionPath), path -> { + if (!path.toString().contains(".parquet")) { + return false; + } + String fileCommitTime = FSUtils.getCommitTime(path.getName()); + return commits.contains(fileCommitTime); + }); + Map results = Maps.newHashMap(); + for (FileStatus file : toBeDeleted) { + boolean success = fs.delete(file.getPath(), false); + results.put(file, success); + logger.info("Delete file " + file.getPath() + "\t" + success); + } + return results; + } + + @Override + public List rollback(JavaSparkContext jsc, List commits) + throws IOException { + String actionType = this.getCompactedCommitActionType(); + HoodieActiveTimeline activeTimeline = this.getActiveTimeline(); + List inflights = this.getInflightCommitTimeline().getInstants() + .map(HoodieInstant::getTimestamp) + .collect(Collectors.toList()); + + // Atomically unpublish all the commits + commits.stream().filter(s -> !inflights.contains(s)) + .map(s -> new HoodieInstant(false, actionType, s)) + .forEach(activeTimeline::revertToInflight); + logger.info("Unpublished " + commits); + + // delete all the data files for all these commits + logger.info("Clean out all parquet files generated for commits: " + commits); + List stats = jsc.parallelize( + FSUtils.getAllPartitionPaths(FSUtils.getFs(), this.getMetaClient().getBasePath(), + config.shouldAssumeDatePartitioning())) + .map((Function) partitionPath -> { + // Scan all 
partitions files with this commit time + Map results = deleteCleanedFiles(partitionPath, commits); + return HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath) + .withDeletedFileResults(results).build(); + }).collect(); + + // Remove the rolled back inflight commits + commits.stream().map(s -> new HoodieInstant(true, actionType, s)) + .forEach(activeTimeline::deleteInflight); + logger.info("Deleted inflight commits " + commits); + return stats; + } + + private static class PartitionCleanStat implements Serializable { + + private final String partitionPath; + private final List deletePathPatterns = new ArrayList<>(); + private final List successDeleteFiles = new ArrayList<>(); + private final List failedDeleteFiles = new ArrayList<>(); + + private PartitionCleanStat(String partitionPath) { + this.partitionPath = partitionPath; + } + + private void addDeletedFileResult(String deletePathStr, Boolean deletedFileResult) { + if (deletedFileResult) { + successDeleteFiles.add(deletePathStr); + } else { + failedDeleteFiles.add(deletePathStr); + } + } + + private void addDeleteFilePatterns(String deletePathStr) { + deletePathPatterns.add(deletePathStr); + } + + private PartitionCleanStat merge(PartitionCleanStat other) { + if (!this.partitionPath.equals(other.partitionPath)) { + throw new RuntimeException(String.format( + "partitionPath is not a match: (%s, %s)", + partitionPath, other.partitionPath)); + } + successDeleteFiles.addAll(other.successDeleteFiles); + deletePathPatterns.addAll(other.deletePathPatterns); + failedDeleteFiles.addAll(other.failedDeleteFiles); + return this; + } + } + + private List cleanPartitionPaths(List partitionsToClean, + JavaSparkContext jsc) { + int cleanerParallelism = Math.min(partitionsToClean.size(), config.getCleanerParallelism()); + logger.info("Using cleanerParallelism: " + cleanerParallelism); + List> partitionCleanStats = jsc + .parallelize(partitionsToClean, cleanerParallelism) + .flatMapToPair(getFilesToDeleteFunc(this, 
config)) + .repartition(cleanerParallelism) // repartition to remove skews + .mapPartitionsToPair(deleteFilesFunc(this, config)) + .reduceByKey( + // merge partition level clean stats below + (Function2) (e1, e2) -> e1 + .merge(e2)) + .collect(); + + Map partitionCleanStatsMap = partitionCleanStats + .stream().collect(Collectors.toMap(e -> e._1(), e -> e._2())); + + HoodieCleanHelper cleaner = new HoodieCleanHelper(this, config); + // Return PartitionCleanStat for each partition passed. + return partitionsToClean.stream().map(partitionPath -> { + PartitionCleanStat partitionCleanStat = + (partitionCleanStatsMap.containsKey(partitionPath)) ? + partitionCleanStatsMap.get(partitionPath) + : new PartitionCleanStat(partitionPath); + return HoodieCleanStat.newBuilder() + .withPolicy(config.getCleanerPolicy()) + .withPartitionPath(partitionPath) + .withEarliestCommitRetained(cleaner.getEarliestCommitToRetain()) + .withDeletePathPattern(partitionCleanStat.deletePathPatterns) + .withSuccessfulDeletes(partitionCleanStat.successDeleteFiles) + .withFailedDeletes(partitionCleanStat.failedDeleteFiles) + .build(); + }).collect(Collectors.toList()); + } + + private PairFlatMapFunction>, String, PartitionCleanStat> deleteFilesFunc( + HoodieTable table, HoodieWriteConfig config) { + return (PairFlatMapFunction>, String, PartitionCleanStat>) iter -> { + HoodieCleanHelper cleaner = new HoodieCleanHelper(table, config); + Map partitionCleanStatMap = new HashMap<>(); + + while (iter.hasNext()) { + Tuple2 partitionDelFileTuple = iter.next(); + String partitionPath = partitionDelFileTuple._1(); + String deletePathStr = partitionDelFileTuple._2(); + Boolean deletedFileResult = deleteFileAndGetResult(deletePathStr); + if (!partitionCleanStatMap.containsKey(partitionPath)) { + partitionCleanStatMap.put(partitionPath, + new PartitionCleanStat(partitionPath)); } + PartitionCleanStat partitionCleanStat = partitionCleanStatMap.get(partitionPath); + 
partitionCleanStat.addDeleteFilePatterns(deletePathStr); + partitionCleanStat.addDeletedFileResult(deletePathStr, deletedFileResult); + } - private PartitionCleanStat merge(PartitionCleanStat other) { - if (!this.partitionPath.equals(other.partitionPath)) { - throw new RuntimeException(String.format( - "partitionPath is not a match: (%s, %s)", - partitionPath, other.partitionPath)); - } - successDeleteFiles.addAll(other.successDeleteFiles); - deletePathPatterns.addAll(other.deletePathPatterns); - failedDeleteFiles.addAll(other.failedDeleteFiles); - return this; - } - } - - private List cleanPartitionPaths(List partitionsToClean, JavaSparkContext jsc) { - int cleanerParallelism = Math.min(partitionsToClean.size(), config.getCleanerParallelism()); - logger.info("Using cleanerParallelism: " + cleanerParallelism); - List> partitionCleanStats = jsc - .parallelize(partitionsToClean, cleanerParallelism) - .flatMapToPair(getFilesToDeleteFunc(this, config)) - .repartition(cleanerParallelism) // repartition to remove skews - .mapPartitionsToPair(deleteFilesFunc(this, config)) - .reduceByKey( // merge partition level clean stats below - (Function2) (e1, e2) -> e1 - .merge(e2)) - .collect(); - - Map partitionCleanStatsMap = partitionCleanStats - .stream().collect(Collectors.toMap(e -> e._1(), e -> e._2())); - - HoodieCleanHelper cleaner = new HoodieCleanHelper(this, config); - // Return PartitionCleanStat for each partition passed. - return partitionsToClean.stream().map(partitionPath -> { - PartitionCleanStat partitionCleanStat = - (partitionCleanStatsMap.containsKey(partitionPath)) ? 
- partitionCleanStatsMap.get(partitionPath) - : new PartitionCleanStat(partitionPath); - return HoodieCleanStat.newBuilder() - .withPolicy(config.getCleanerPolicy()) - .withPartitionPath(partitionPath) - .withEarliestCommitRetained(cleaner.getEarliestCommitToRetain()) - .withDeletePathPattern(partitionCleanStat.deletePathPatterns) - .withSuccessfulDeletes(partitionCleanStat.successDeleteFiles) - .withFailedDeletes(partitionCleanStat.failedDeleteFiles) - .build(); - }).collect(Collectors.toList()); - } - - private PairFlatMapFunction>, String, PartitionCleanStat> deleteFilesFunc( - HoodieTable table, HoodieWriteConfig config) { - return (PairFlatMapFunction>, String, PartitionCleanStat>) iter -> { - HoodieCleanHelper cleaner = new HoodieCleanHelper(table, config); - Map partitionCleanStatMap = new HashMap<>(); - - while (iter.hasNext()) { - Tuple2 partitionDelFileTuple = iter.next(); - String partitionPath = partitionDelFileTuple._1(); - String deletePathStr = partitionDelFileTuple._2(); - Boolean deletedFileResult = deleteFileAndGetResult(deletePathStr); - if (!partitionCleanStatMap.containsKey(partitionPath)) { - partitionCleanStatMap.put(partitionPath, - new PartitionCleanStat(partitionPath)); - } - PartitionCleanStat partitionCleanStat = partitionCleanStatMap.get(partitionPath); - partitionCleanStat.addDeleteFilePatterns(deletePathStr); - partitionCleanStat.addDeletedFileResult(deletePathStr, deletedFileResult); - } - - return partitionCleanStatMap.entrySet().stream() - .map(e -> new Tuple2<>(e.getKey(), e.getValue())) - .collect(Collectors.toList()).iterator(); - }; - } - - private static PairFlatMapFunction getFilesToDeleteFunc( - HoodieTable table, HoodieWriteConfig config) { - return (PairFlatMapFunction) partitionPathToClean -> { - HoodieCleanHelper cleaner = new HoodieCleanHelper(table, config); - return cleaner.getDeletePaths(partitionPathToClean).stream() - .map(deleteFile -> new Tuple2<>(partitionPathToClean, deleteFile.toString())) - .iterator(); - }; 
- } - - private Boolean deleteFileAndGetResult(String deletePathStr) throws IOException { - Path deletePath = new Path(deletePathStr); - logger.debug("Working on delete path :" + deletePath); - boolean deleteResult = getFs().delete(deletePath, false); - if (deleteResult) { - logger.debug("Cleaned file at path :" + deletePath); - } - return deleteResult; + return partitionCleanStatMap.entrySet().stream() + .map(e -> new Tuple2<>(e.getKey(), e.getValue())) + .collect(Collectors.toList()).iterator(); + }; + } + + private static PairFlatMapFunction getFilesToDeleteFunc( + HoodieTable table, HoodieWriteConfig config) { + return (PairFlatMapFunction) partitionPathToClean -> { + HoodieCleanHelper cleaner = new HoodieCleanHelper(table, config); + return cleaner.getDeletePaths(partitionPathToClean).stream() + .map(deleteFile -> new Tuple2<>(partitionPathToClean, deleteFile.toString())) + .iterator(); + }; + } + + private Boolean deleteFileAndGetResult(String deletePathStr) throws IOException { + Path deletePath = new Path(deletePathStr); + logger.debug("Working on delete path :" + deletePath); + boolean deleteResult = getFs().delete(deletePath, false); + if (deleteResult) { + logger.debug("Cleaned file at path :" + deletePath); } + return deleteResult; + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieMergeOnReadTable.java b/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieMergeOnReadTable.java index fe84238e3..1a2cfa1c0 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieMergeOnReadTable.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieMergeOnReadTable.java @@ -39,13 +39,6 @@ import com.uber.hoodie.exception.HoodieCompactionException; import com.uber.hoodie.exception.HoodieRollbackException; import com.uber.hoodie.io.HoodieAppendHandle; import com.uber.hoodie.io.compact.HoodieRealtimeTableCompactor; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.Path; -import 
org.apache.log4j.LogManager; -import org.apache.log4j.Logger; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.Function; - import java.io.IOException; import java.io.UncheckedIOException; import java.util.Arrays; @@ -56,179 +49,209 @@ import java.util.List; import java.util.Map; import java.util.Optional; import java.util.stream.Collectors; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.Function; /** * Implementation of a more real-time read-optimized Hoodie Table where * * INSERTS - Same as HoodieCopyOnWriteTable - Produce new files, block aligned to desired size (or) - * Merge with the smallest existing file, to expand it + * Merge with the smallest existing file, to expand it * - * UPDATES - Appends the changes to a rolling log file maintained per file Id. - * Compaction merges the log file into the base file. + * UPDATES - Appends the changes to a rolling log file maintained per file Id. Compaction merges the + * log file into the base file. 
* - * WARNING - MOR table type does not support nested rollbacks, every rollback - * must be followed by an attempted commit action + * WARNING - MOR table type does not support nested rollbacks, every rollback must be followed by an + * attempted commit action */ -public class HoodieMergeOnReadTable extends HoodieCopyOnWriteTable { - private static Logger logger = LogManager.getLogger(HoodieMergeOnReadTable.class); +public class HoodieMergeOnReadTable extends + HoodieCopyOnWriteTable { - public HoodieMergeOnReadTable(HoodieWriteConfig config, - HoodieTableMetaClient metaClient) { - super(config, metaClient); + private static Logger logger = LogManager.getLogger(HoodieMergeOnReadTable.class); + + public HoodieMergeOnReadTable(HoodieWriteConfig config, + HoodieTableMetaClient metaClient) { + super(config, metaClient); + } + + @Override + public Iterator> handleUpdate(String commitTime, String fileId, + Iterator> recordItr) throws IOException { + logger.info("Merging updates for commit " + commitTime + " for file " + fileId); + HoodieAppendHandle appendHandle = + new HoodieAppendHandle<>(config, commitTime, this, fileId, recordItr); + appendHandle.doAppend(); + appendHandle.close(); + return Collections.singletonList(Collections.singletonList(appendHandle.getWriteStatus())) + .iterator(); + } + + @Override + public Optional compact(JavaSparkContext jsc) { + logger.info("Checking if compaction needs to be run on " + config.getBasePath()); + Optional lastCompaction = getActiveTimeline().getCompactionTimeline() + .filterCompletedInstants().lastInstant(); + String deltaCommitsSinceTs = "0"; + if (lastCompaction.isPresent()) { + deltaCommitsSinceTs = lastCompaction.get().getTimestamp(); } - @Override - public Iterator> handleUpdate(String commitTime, String fileId, - Iterator> recordItr) throws IOException { - logger.info("Merging updates for commit " + commitTime + " for file " + fileId); - HoodieAppendHandle appendHandle = - new HoodieAppendHandle<>(config, commitTime, 
this, fileId, recordItr); - appendHandle.doAppend(); - appendHandle.close(); - return Collections.singletonList(Collections.singletonList(appendHandle.getWriteStatus())) - .iterator(); + int deltaCommitsSinceLastCompaction = getActiveTimeline().getDeltaCommitTimeline() + .findInstantsAfter(deltaCommitsSinceTs, Integer.MAX_VALUE).countInstants(); + if (config.getInlineCompactDeltaCommitMax() > deltaCommitsSinceLastCompaction) { + logger.info("Not running compaction as only " + deltaCommitsSinceLastCompaction + + " delta commits was found since last compaction " + deltaCommitsSinceTs + + ". Waiting for " + config.getInlineCompactDeltaCommitMax()); + return Optional.empty(); } - @Override - public Optional compact(JavaSparkContext jsc) { - logger.info("Checking if compaction needs to be run on " + config.getBasePath()); - Optional lastCompaction = getActiveTimeline().getCompactionTimeline() - .filterCompletedInstants().lastInstant(); - String deltaCommitsSinceTs = "0"; - if (lastCompaction.isPresent()) { - deltaCommitsSinceTs = lastCompaction.get().getTimestamp(); - } - - int deltaCommitsSinceLastCompaction = getActiveTimeline().getDeltaCommitTimeline() - .findInstantsAfter(deltaCommitsSinceTs, Integer.MAX_VALUE).countInstants(); - if (config.getInlineCompactDeltaCommitMax() > deltaCommitsSinceLastCompaction) { - logger.info("Not running compaction as only " + deltaCommitsSinceLastCompaction - + " delta commits was found since last compaction " + deltaCommitsSinceTs - + ". 
Waiting for " + config.getInlineCompactDeltaCommitMax()); - return Optional.empty(); - } - - logger.info("Compacting merge on read table " + config.getBasePath()); - HoodieRealtimeTableCompactor compactor = new HoodieRealtimeTableCompactor(); - try { - return Optional.of(compactor.compact(jsc, config, this)); - } catch (IOException e) { - throw new HoodieCompactionException("Could not compact " + config.getBasePath(), e); - } + logger.info("Compacting merge on read table " + config.getBasePath()); + HoodieRealtimeTableCompactor compactor = new HoodieRealtimeTableCompactor(); + try { + return Optional.of(compactor.compact(jsc, config, this)); + } catch (IOException e) { + throw new HoodieCompactionException("Could not compact " + config.getBasePath(), e); } + } - @Override - public List rollback(JavaSparkContext jsc, List commits) throws IOException { + @Override + public List rollback(JavaSparkContext jsc, List commits) + throws IOException { - //At the moment, MOR table type does not support nested rollbacks - if(commits.size() > 1) { - throw new UnsupportedOperationException("Nested Rollbacks are not supported"); - } - Map commitsAndCompactions = - this.getActiveTimeline() - .getTimelineOfActions(Sets.newHashSet(HoodieActiveTimeline.COMMIT_ACTION, HoodieActiveTimeline.COMPACTION_ACTION, HoodieActiveTimeline.DELTA_COMMIT_ACTION)) - .getInstants() - .filter(i -> commits.contains(i.getTimestamp())) - .collect(Collectors.toMap(i -> i.getTimestamp(), i -> i)); - - // Atomically un-publish all non-inflight commits - commitsAndCompactions.entrySet().stream().map(entry -> entry.getValue()) - .filter(i -> !i.isInflight()).forEach(this.getActiveTimeline()::revertToInflight); - - logger.info("Unpublished " + commits); - - Long startTime = System.currentTimeMillis(); - - List allRollbackStats = commits.stream().map(commit -> { - HoodieInstant instant = commitsAndCompactions.get(commit); - List stats = null; - switch (instant.getAction()) { - case 
HoodieTimeline.COMMIT_ACTION: - case HoodieTimeline.COMPACTION_ACTION: - try { - logger.info("Starting to rollback Commit/Compaction " + instant); - HoodieCommitMetadata commitMetadata = HoodieCommitMetadata - .fromBytes(this.getCommitTimeline().getInstantDetails(new HoodieInstant(true, instant.getAction(), instant.getTimestamp())).get()); - - stats = jsc.parallelize(commitMetadata.getPartitionToWriteStats().keySet().stream().collect(Collectors.toList())) - .map((Function) partitionPath -> { - Map results = super.deleteCleanedFiles(partitionPath, Arrays.asList(commit)); - return HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath) - .withDeletedFileResults(results).build(); - }).collect(); - logger.info("Finished rollback of Commit/Compaction " + instant); - break; - } catch (IOException io) { - throw new UncheckedIOException("Failed to rollback for commit " + commit, io); - } - case HoodieTimeline.DELTA_COMMIT_ACTION: - try { - logger.info("Starting to rollback delta commit " + instant); - - HoodieCommitMetadata commitMetadata = HoodieCommitMetadata - .fromBytes(this.getCommitTimeline().getInstantDetails(new HoodieInstant(true, instant.getAction(), instant.getTimestamp())).get()); - - stats = jsc.parallelize(commitMetadata.getPartitionToWriteStats().keySet().stream().collect(Collectors.toList())) - .map((Function) partitionPath -> { - // read commit file and (either append delete blocks or delete file) - Map filesToDeletedStatus = new HashMap<>(); - Map filesToNumBlocksRollback = new HashMap<>(); - - // we do not know fileIds for inserts (first inserts are parquet files), delete all parquet files for the corresponding failed commit, if present (same as COW) - filesToDeletedStatus = super.deleteCleanedFiles(partitionPath, Arrays.asList(commit)); - - // append rollback blocks for updates - commitMetadata.getPartitionToWriteStats().get(partitionPath).stream().filter(wStat -> wStat.getPrevCommit() != HoodieWriteStat.NULL_COMMIT).forEach(wStat -> { - 
HoodieLogFormat.Writer writer = null; - try { - writer = HoodieLogFormat.newWriterBuilder() - .onParentPath(new Path(this.getMetaClient().getBasePath(), partitionPath)) - .withFileId(wStat.getFileId()).overBaseCommit(wStat.getPrevCommit()) - .withFs(FSUtils.getFs()).withFileExtension(HoodieLogFile.DELTA_EXTENSION).build(); - Long numRollbackBlocks = 0L; - // generate metadata - Map metadata = Maps.newHashMap(); - metadata.put(HoodieLogBlock.LogMetadataType.INSTANT_TIME, metaClient.getActiveTimeline().lastInstant().get().getTimestamp()); - metadata.put(HoodieLogBlock.LogMetadataType.TARGET_INSTANT_TIME, commit); - // if update belongs to an existing log file - writer.appendBlock(new HoodieCommandBlock(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK, metadata)); - numRollbackBlocks++; - if(wStat.getNumDeletes() > 0) { - writer.appendBlock(new HoodieCommandBlock(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK, metadata)); - numRollbackBlocks++; - } - filesToNumBlocksRollback.put(FSUtils.getFs().getFileStatus(writer.getLogFile().getPath()), numRollbackBlocks); - } catch (IOException | InterruptedException io) { - throw new HoodieRollbackException("Failed to rollback for commit " + commit, io); - } finally { - try { - writer.close(); - } catch (IOException io) { - throw new UncheckedIOException(io); - } - } - }); - return HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath) - .withDeletedFileResults(filesToDeletedStatus) - .withRollbackBlockAppendResults(filesToNumBlocksRollback).build(); - }).collect(); - logger.info("Fnished rollback of delta commit " + instant); - break; - } catch (IOException io) { - throw new UncheckedIOException("Failed to rollback for commit " + commit, io); - } - } - return stats; - }).flatMap(x -> x.stream()).collect(Collectors.toList()); - - commitsAndCompactions.entrySet().stream() - .map(entry -> new HoodieInstant(true, entry.getValue().getAction(), entry.getValue().getTimestamp())) - 
.forEach(this.getActiveTimeline()::deleteInflight); - - logger.debug("Time(in ms) taken to finish rollback " + (System.currentTimeMillis() - startTime)); - - return allRollbackStats; + //At the moment, MOR table type does not support nested rollbacks + if (commits.size() > 1) { + throw new UnsupportedOperationException("Nested Rollbacks are not supported"); } + Map commitsAndCompactions = + this.getActiveTimeline() + .getTimelineOfActions(Sets.newHashSet(HoodieActiveTimeline.COMMIT_ACTION, + HoodieActiveTimeline.COMPACTION_ACTION, HoodieActiveTimeline.DELTA_COMMIT_ACTION)) + .getInstants() + .filter(i -> commits.contains(i.getTimestamp())) + .collect(Collectors.toMap(i -> i.getTimestamp(), i -> i)); + + // Atomically un-publish all non-inflight commits + commitsAndCompactions.entrySet().stream().map(entry -> entry.getValue()) + .filter(i -> !i.isInflight()).forEach(this.getActiveTimeline()::revertToInflight); + + logger.info("Unpublished " + commits); + + Long startTime = System.currentTimeMillis(); + + List allRollbackStats = commits.stream().map(commit -> { + HoodieInstant instant = commitsAndCompactions.get(commit); + List stats = null; + switch (instant.getAction()) { + case HoodieTimeline.COMMIT_ACTION: + case HoodieTimeline.COMPACTION_ACTION: + try { + logger.info("Starting to rollback Commit/Compaction " + instant); + HoodieCommitMetadata commitMetadata = HoodieCommitMetadata + .fromBytes(this.getCommitTimeline().getInstantDetails( + new HoodieInstant(true, instant.getAction(), instant.getTimestamp())).get()); + + stats = jsc.parallelize(commitMetadata.getPartitionToWriteStats().keySet().stream() + .collect(Collectors.toList())) + .map((Function) partitionPath -> { + Map results = super + .deleteCleanedFiles(partitionPath, Arrays.asList(commit)); + return HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath) + .withDeletedFileResults(results).build(); + }).collect(); + logger.info("Finished rollback of Commit/Compaction " + instant); + break; + } 
catch (IOException io) { + throw new UncheckedIOException("Failed to rollback for commit " + commit, io); + } + case HoodieTimeline.DELTA_COMMIT_ACTION: + try { + logger.info("Starting to rollback delta commit " + instant); + + HoodieCommitMetadata commitMetadata = HoodieCommitMetadata + .fromBytes(this.getCommitTimeline().getInstantDetails( + new HoodieInstant(true, instant.getAction(), instant.getTimestamp())).get()); + + stats = jsc.parallelize(commitMetadata.getPartitionToWriteStats().keySet().stream() + .collect(Collectors.toList())) + .map((Function) partitionPath -> { + // read commit file and (either append delete blocks or delete file) + Map filesToDeletedStatus = new HashMap<>(); + Map filesToNumBlocksRollback = new HashMap<>(); + + // we do not know fileIds for inserts (first inserts are parquet files), delete all parquet files for the corresponding failed commit, if present (same as COW) + filesToDeletedStatus = super + .deleteCleanedFiles(partitionPath, Arrays.asList(commit)); + + // append rollback blocks for updates + commitMetadata.getPartitionToWriteStats().get(partitionPath).stream() + .filter(wStat -> wStat.getPrevCommit() != HoodieWriteStat.NULL_COMMIT) + .forEach(wStat -> { + HoodieLogFormat.Writer writer = null; + try { + writer = HoodieLogFormat.newWriterBuilder() + .onParentPath( + new Path(this.getMetaClient().getBasePath(), partitionPath)) + .withFileId(wStat.getFileId()).overBaseCommit(wStat.getPrevCommit()) + .withFs(FSUtils.getFs()) + .withFileExtension(HoodieLogFile.DELTA_EXTENSION).build(); + Long numRollbackBlocks = 0L; + // generate metadata + Map metadata = Maps.newHashMap(); + metadata.put(HoodieLogBlock.LogMetadataType.INSTANT_TIME, + metaClient.getActiveTimeline().lastInstant().get().getTimestamp()); + metadata.put(HoodieLogBlock.LogMetadataType.TARGET_INSTANT_TIME, commit); + // if update belongs to an existing log file + writer.appendBlock(new HoodieCommandBlock( + 
HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK, + metadata)); + numRollbackBlocks++; + if (wStat.getNumDeletes() > 0) { + writer.appendBlock(new HoodieCommandBlock( + HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK, + metadata)); + numRollbackBlocks++; + } + filesToNumBlocksRollback + .put(FSUtils.getFs().getFileStatus(writer.getLogFile().getPath()), + numRollbackBlocks); + } catch (IOException | InterruptedException io) { + throw new HoodieRollbackException( + "Failed to rollback for commit " + commit, io); + } finally { + try { + writer.close(); + } catch (IOException io) { + throw new UncheckedIOException(io); + } + } + }); + return HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath) + .withDeletedFileResults(filesToDeletedStatus) + .withRollbackBlockAppendResults(filesToNumBlocksRollback).build(); + }).collect(); + logger.info("Finished rollback of delta commit " + instant); + break; + } catch (IOException io) { + throw new UncheckedIOException("Failed to rollback for commit " + commit, io); + } + } + return stats; + }).flatMap(x -> x.stream()).collect(Collectors.toList()); + + commitsAndCompactions.entrySet().stream() + .map(entry -> new HoodieInstant(true, entry.getValue().getAction(), + entry.getValue().getTimestamp())) + .forEach(this.getActiveTimeline()::deleteInflight); + + logger + .debug("Time(in ms) taken to finish rollback " + (System.currentTimeMillis() - startTime)); + + return allRollbackStats; + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieTable.java b/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieTable.java index 8ed494f79..88f7f9b4b 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieTable.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieTable.java @@ -34,7 +34,6 @@ import com.uber.hoodie.common.util.AvroUtils; import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.exception.HoodieCommitException; 
import com.uber.hoodie.exception.HoodieException; -import com.uber.hoodie.exception.HoodieRollbackException; import com.uber.hoodie.exception.HoodieSavepointException; import java.io.IOException; import java.io.Serializable; @@ -43,8 +42,6 @@ import java.util.List; import java.util.Optional; import java.util.stream.Collectors; import java.util.stream.Stream; - -import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -55,291 +52,245 @@ import org.apache.spark.api.java.JavaSparkContext; * Abstract implementation of a HoodieTable */ public abstract class HoodieTable implements Serializable { - protected final HoodieWriteConfig config; - protected final HoodieTableMetaClient metaClient; - private static Logger logger = LogManager.getLogger(HoodieTable.class); - protected HoodieTable(HoodieWriteConfig config, HoodieTableMetaClient metaClient) { - this.config = config; - this.metaClient = metaClient; + protected final HoodieWriteConfig config; + protected final HoodieTableMetaClient metaClient; + private static Logger logger = LogManager.getLogger(HoodieTable.class); + + protected HoodieTable(HoodieWriteConfig config, HoodieTableMetaClient metaClient) { + this.config = config; + this.metaClient = metaClient; + } + + /** + * Provides a partitioner to perform the upsert operation, based on the workload profile + */ + public abstract Partitioner getUpsertPartitioner(WorkloadProfile profile); + + + /** + * Provides a partitioner to perform the insert operation, based on the workload profile + */ + public abstract Partitioner getInsertPartitioner(WorkloadProfile profile); + + + /** + * Return whether this HoodieTable implementation can benefit from workload profiling + */ + public abstract boolean isWorkloadProfileNeeded(); + + public HoodieWriteConfig getConfig() { + return config; + } + + public HoodieTableMetaClient getMetaClient() { + return metaClient; + } + + public FileSystem 
getFs() { + return metaClient.getFs(); + } + + /** + * Get the view of the file system for this table + */ + public TableFileSystemView getFileSystemView() { + return new HoodieTableFileSystemView(metaClient, getCompletedCommitTimeline()); + } + + /** + * Get the read optimized view of the file system for this table + */ + public TableFileSystemView.ReadOptimizedView getROFileSystemView() { + return new HoodieTableFileSystemView(metaClient, getCompletedCommitTimeline()); + } + + /** + * Get the real time view of the file system for this table + */ + public TableFileSystemView.RealtimeView getRTFileSystemView() { + return new HoodieTableFileSystemView(metaClient, getCompletedCommitTimeline()); + } + + /** + * Get the completed (commit + compaction) view of the file system for this table + */ + public TableFileSystemView getCompletedFileSystemView() { + return new HoodieTableFileSystemView(metaClient, getCommitTimeline()); + } + + /** + * Get only the completed (no-inflights) commit timeline + */ + public HoodieTimeline getCompletedCommitTimeline() { + return getCommitTimeline().filterCompletedInstants(); + } + + /** + * Get only the inflights (no-completed) commit timeline + */ + public HoodieTimeline getInflightCommitTimeline() { + return getCommitTimeline().filterInflights(); + } + + + /** + * Get only the completed (no-inflights) clean timeline + */ + public HoodieTimeline getCompletedCleanTimeline() { + return getActiveTimeline().getCleanerTimeline().filterCompletedInstants(); + } + + /** + * Get only the completed (no-inflights) savepoint timeline + */ + public HoodieTimeline getCompletedSavepointTimeline() { + return getActiveTimeline().getSavePointTimeline().filterCompletedInstants(); + } + + /** + * Get the list of savepoints in this table + */ + public List getSavepoints() { + return getCompletedSavepointTimeline().getInstants().map(HoodieInstant::getTimestamp) + .collect(Collectors.toList()); + } + + /** + * Get the list of data file names savepointed + */ 
+ public Stream getSavepointedDataFiles(String savepointTime) { + if (!getSavepoints().contains(savepointTime)) { + throw new HoodieSavepointException( + "Could not get data files for savepoint " + savepointTime + ". No such savepoint."); } - - /** - * Provides a partitioner to perform the upsert operation, based on the - * workload profile - * - * @return - */ - public abstract Partitioner getUpsertPartitioner(WorkloadProfile profile); - - - /** - * Provides a partitioner to perform the insert operation, based on the workload profile - * - * @return - */ - public abstract Partitioner getInsertPartitioner(WorkloadProfile profile); - - - /** - * Return whether this HoodieTable implementation can benefit from workload - * profiling - * - * @return - */ - public abstract boolean isWorkloadProfileNeeded(); - - public HoodieWriteConfig getConfig() { - return config; + HoodieInstant instant = + new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION, savepointTime); + HoodieSavepointMetadata metadata = null; + try { + metadata = AvroUtils.deserializeHoodieSavepointMetadata( + getActiveTimeline().getInstantDetails(instant).get()); + } catch (IOException e) { + throw new HoodieSavepointException( + "Could not get savepointed data files for savepoint " + savepointTime, e); } + return metadata.getPartitionMetadata().values().stream() + .flatMap(s -> s.getSavepointDataFile().stream()); + } - public HoodieTableMetaClient getMetaClient() { - return metaClient; + public HoodieActiveTimeline getActiveTimeline() { + return metaClient.getActiveTimeline(); + } + + /** + * Get the commit timeline visible for this table + */ + public HoodieTimeline getCommitTimeline() { + switch (metaClient.getTableType()) { + case COPY_ON_WRITE: + return getActiveTimeline().getCommitTimeline(); + case MERGE_ON_READ: + // We need to include the parquet files written out in delta commits + // Include commit action to be able to start doing a MOR over a COW dataset - no migration required + return 
getActiveTimeline().getCommitsAndCompactionsTimeline(); + default: + throw new HoodieException("Unsupported table type :" + metaClient.getTableType()); } + } - public FileSystem getFs() { - return metaClient.getFs(); + /** + * Get only the completed (no-inflights) compaction commit timeline + */ + public HoodieTimeline getCompletedCompactionCommitTimeline() { + return getCompactionCommitTimeline().filterCompletedInstants(); + } + + + /** + * Get the compacted commit timeline visible for this table + */ + public HoodieTimeline getCompactionCommitTimeline() { + switch (metaClient.getTableType()) { + case COPY_ON_WRITE: + return getActiveTimeline().getCommitsAndCompactionsTimeline(); + case MERGE_ON_READ: + // We need to include the parquet files written out in delta commits in tagging + return getActiveTimeline().getTimelineOfActions( + Sets.newHashSet(HoodieActiveTimeline.COMPACTION_ACTION)); + default: + throw new HoodieException("Unsupported table type :" + metaClient.getTableType()); } + } - /** - * Get the view of the file system for this table - * - * @return - */ - public TableFileSystemView getFileSystemView() { - return new HoodieTableFileSystemView(metaClient, getCompletedCommitTimeline()); + /** + * Gets the commit action type + */ + public String getCommitActionType() { + switch (metaClient.getTableType()) { + case COPY_ON_WRITE: + return HoodieActiveTimeline.COMMIT_ACTION; + case MERGE_ON_READ: + return HoodieActiveTimeline.DELTA_COMMIT_ACTION; } + throw new HoodieCommitException( + "Could not commit on unknown storage type " + metaClient.getTableType()); + } - /** - * Get the read optimized view of the file system for this table - * - * @return - */ - public TableFileSystemView.ReadOptimizedView getROFileSystemView() { - return new HoodieTableFileSystemView(metaClient, getCompletedCommitTimeline()); + /** + * Gets the action type for a compaction commit + */ + public String getCompactedCommitActionType() { + switch (metaClient.getTableType()) { + case 
COPY_ON_WRITE: + return HoodieTimeline.COMMIT_ACTION; + case MERGE_ON_READ: + return HoodieTimeline.COMPACTION_ACTION; } + throw new HoodieException("Unsupported table type :" + metaClient.getTableType()); + } - /** - * Get the real time view of the file system for this table - * - * @return - */ - public TableFileSystemView.RealtimeView getRTFileSystemView() { - return new HoodieTableFileSystemView(metaClient, getCompletedCommitTimeline()); + + /** + * Perform the ultimate IO for a given upserted (RDD) partition + */ + public abstract Iterator> handleUpsertPartition(String commitTime, + Integer partition, Iterator> recordIterator, Partitioner partitioner); + + /** + * Perform the ultimate IO for a given inserted (RDD) partition + */ + public abstract Iterator> handleInsertPartition(String commitTime, + Integer partition, Iterator> recordIterator, Partitioner partitioner); + + + public static HoodieTable getHoodieTable( + HoodieTableMetaClient metaClient, HoodieWriteConfig config) { + switch (metaClient.getTableType()) { + case COPY_ON_WRITE: + return new HoodieCopyOnWriteTable<>(config, metaClient); + case MERGE_ON_READ: + return new HoodieMergeOnReadTable<>(config, metaClient); + default: + throw new HoodieException("Unsupported table type :" + metaClient.getTableType()); } + } - /** - * Get the completed (commit + compaction) view of the file system for this table - * - * @return - */ - public TableFileSystemView getCompletedFileSystemView() { - return new HoodieTableFileSystemView(metaClient, getCommitTimeline()); - } + /** + * Run Compaction on the table. 
Compaction arranges the data so that it is optimized for data + * access + */ + public abstract Optional compact(JavaSparkContext jsc); - /** - * Get only the completed (no-inflights) commit timeline - * @return - */ - public HoodieTimeline getCompletedCommitTimeline() { - return getCommitTimeline().filterCompletedInstants(); - } + /** + * Clean partition paths according to cleaning policy and returns the number of files cleaned. + */ + public abstract List clean(JavaSparkContext jsc); - /** - * Get only the inflights (no-completed) commit timeline - * @return - */ - public HoodieTimeline getInflightCommitTimeline() { - return getCommitTimeline().filterInflights(); - } - - - /** - * Get only the completed (no-inflights) clean timeline - * @return - */ - public HoodieTimeline getCompletedCleanTimeline() { - return getActiveTimeline().getCleanerTimeline().filterCompletedInstants(); - } - - /** - * Get only the completed (no-inflights) savepoint timeline - * @return - */ - public HoodieTimeline getCompletedSavepointTimeline() { - return getActiveTimeline().getSavePointTimeline().filterCompletedInstants(); - } - - /** - * Get the list of savepoints in this table - * @return - */ - public List getSavepoints() { - return getCompletedSavepointTimeline().getInstants().map(HoodieInstant::getTimestamp) - .collect(Collectors.toList()); - } - - /** - * Get the list of data file names savepointed - * - * @param savepointTime - * @return - * @throws IOException - */ - public Stream getSavepointedDataFiles(String savepointTime) { - if (!getSavepoints().contains(savepointTime)) { - throw new HoodieSavepointException( - "Could not get data files for savepoint " + savepointTime + ". 
No such savepoint."); - } - HoodieInstant instant = - new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION, savepointTime); - HoodieSavepointMetadata metadata = null; - try { - metadata = AvroUtils.deserializeHoodieSavepointMetadata( - getActiveTimeline().getInstantDetails(instant).get()); - } catch (IOException e) { - throw new HoodieSavepointException( - "Could not get savepointed data files for savepoint " + savepointTime, e); - } - return metadata.getPartitionMetadata().values().stream() - .flatMap(s -> s.getSavepointDataFile().stream()); - } - - public HoodieActiveTimeline getActiveTimeline() { - return metaClient.getActiveTimeline(); - } - - /** - * Get the commit timeline visible for this table - * - * @return - */ - public HoodieTimeline getCommitTimeline() { - switch (metaClient.getTableType()) { - case COPY_ON_WRITE: - return getActiveTimeline().getCommitTimeline(); - case MERGE_ON_READ: - // We need to include the parquet files written out in delta commits - // Include commit action to be able to start doing a MOR over a COW dataset - no migration required - return getActiveTimeline().getCommitsAndCompactionsTimeline(); - default: - throw new HoodieException("Unsupported table type :"+ metaClient.getTableType()); - } - } - - /** - * Get only the completed (no-inflights) compaction commit timeline - * @return - */ - public HoodieTimeline getCompletedCompactionCommitTimeline() { - return getCompactionCommitTimeline().filterCompletedInstants(); - } - - - /** - * Get the compacted commit timeline visible for this table - * - * @return - */ - public HoodieTimeline getCompactionCommitTimeline() { - switch (metaClient.getTableType()) { - case COPY_ON_WRITE: - return getActiveTimeline().getCommitsAndCompactionsTimeline(); - case MERGE_ON_READ: - // We need to include the parquet files written out in delta commits in tagging - return getActiveTimeline().getTimelineOfActions( - Sets.newHashSet(HoodieActiveTimeline.COMPACTION_ACTION)); - default: - throw new 
HoodieException("Unsupported table type :"+ metaClient.getTableType()); - } - } - - /** - * Gets the commit action type - * @return - */ - public String getCommitActionType() { - switch (metaClient.getTableType()) { - case COPY_ON_WRITE: - return HoodieActiveTimeline.COMMIT_ACTION; - case MERGE_ON_READ: - return HoodieActiveTimeline.DELTA_COMMIT_ACTION; - } - throw new HoodieCommitException( - "Could not commit on unknown storage type " + metaClient.getTableType()); - } - - /** - * Gets the action type for a compaction commit - * @return - */ - public String getCompactedCommitActionType() { - switch (metaClient.getTableType()) { - case COPY_ON_WRITE: - return HoodieTimeline.COMMIT_ACTION; - case MERGE_ON_READ: - return HoodieTimeline.COMPACTION_ACTION; - } - throw new HoodieException("Unsupported table type :"+ metaClient.getTableType()); - } - - - - /** - * Perform the ultimate IO for a given upserted (RDD) partition - * - * @param partition - * @param recordIterator - * @param partitioner - */ - public abstract Iterator> handleUpsertPartition(String commitTime, - Integer partition, Iterator> recordIterator, Partitioner partitioner); - - /** - * Perform the ultimate IO for a given inserted (RDD) partition - * - * @param partition - * @param recordIterator - * @param partitioner - */ - public abstract Iterator> handleInsertPartition(String commitTime, - Integer partition, Iterator> recordIterator, Partitioner partitioner); - - - public static HoodieTable getHoodieTable( - HoodieTableMetaClient metaClient, HoodieWriteConfig config) { - switch (metaClient.getTableType()) { - case COPY_ON_WRITE: - return new HoodieCopyOnWriteTable<>(config, metaClient); - case MERGE_ON_READ: - return new HoodieMergeOnReadTable<>(config, metaClient); - default: - throw new HoodieException("Unsupported table type :" + metaClient.getTableType()); - } - } - - /** - * Run Compaction on the table. 
- * Compaction arranges the data so that it is optimized for data access - */ - public abstract Optional compact(JavaSparkContext jsc); - - /** - * Clean partition paths according to cleaning policy and returns the number - * of files cleaned. - */ - public abstract List clean(JavaSparkContext jsc); - - /** - * Rollback the (inflight/committed) record changes with the given commit time. - * Four steps: - * (1) Atomically unpublish this commit - * (2) clean indexing data - * (3) clean new generated parquet files / log blocks - * (4) Finally, delete ..commit or ..inflight file - * @param commits - * @return - * @throws HoodieRollbackException - */ - public abstract List rollback(JavaSparkContext jsc, List commits) throws IOException; + /** + * Rollback the (inflight/committed) record changes with the given commit time. Four steps: (1) + * Atomically unpublish this commit (2) clean indexing data (3) clean new generated parquet files + * / log blocks (4) Finally, delete ..commit or ..inflight file + */ + public abstract List rollback(JavaSparkContext jsc, List commits) + throws IOException; } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/table/UserDefinedBulkInsertPartitioner.java b/hoodie-client/src/main/java/com/uber/hoodie/table/UserDefinedBulkInsertPartitioner.java index 9a676f4e7..2ca51a31f 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/table/UserDefinedBulkInsertPartitioner.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/table/UserDefinedBulkInsertPartitioner.java @@ -20,13 +20,13 @@ import com.uber.hoodie.common.model.HoodieRecordPayload; import org.apache.spark.api.java.JavaRDD; /** - * Repartition input records into at least expected number of output spark partitions. It should give - * below guarantees - * - Output spark partition will have records from only one hoodie partition. - * - Average records per output spark partitions should be almost equal to (#inputRecords / #outputSparkPartitions) - * to avoid possible skews. 
+ * Repartition input records into at least expected number of output spark partitions. It should + * give below guarantees - Output spark partition will have records from only one hoodie partition. + * - Average records per output spark partitions should be almost equal to (#inputRecords / + * #outputSparkPartitions) to avoid possible skews. */ public interface UserDefinedBulkInsertPartitioner { - JavaRDD> repartitionRecords(JavaRDD> records, int outputSparkPartitions); + JavaRDD> repartitionRecords(JavaRDD> records, + int outputSparkPartitions); } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/table/WorkloadProfile.java b/hoodie-client/src/main/java/com/uber/hoodie/table/WorkloadProfile.java index 1d1332ae8..07e863690 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/table/WorkloadProfile.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/table/WorkloadProfile.java @@ -20,15 +20,11 @@ package com.uber.hoodie.table; import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecordLocation; import com.uber.hoodie.common.model.HoodieRecordPayload; - -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.function.PairFunction; - import java.io.Serializable; import java.util.HashMap; import java.util.Map; import java.util.Set; - +import org.apache.spark.api.java.JavaRDD; import scala.Option; import scala.Tuple2; @@ -40,73 +36,76 @@ import scala.Tuple2; */ public class WorkloadProfile implements Serializable { - /** - * Input workload - */ - private final JavaRDD> taggedRecords; + /** + * Input workload + */ + private final JavaRDD> taggedRecords; - /** - * Computed workload profile - */ - private final HashMap partitionPathStatMap; + /** + * Computed workload profile + */ + private final HashMap partitionPathStatMap; - private final WorkloadStat globalStat; + private final WorkloadStat globalStat; - public WorkloadProfile(JavaRDD> taggedRecords) { - this.taggedRecords = taggedRecords; - 
this.partitionPathStatMap = new HashMap<>(); - this.globalStat = new WorkloadStat(); - buildProfile(); + public WorkloadProfile(JavaRDD> taggedRecords) { + this.taggedRecords = taggedRecords; + this.partitionPathStatMap = new HashMap<>(); + this.globalStat = new WorkloadStat(); + buildProfile(); + } + + private void buildProfile() { + + Map>, Long> partitionLocationCounts = taggedRecords + .mapToPair(record -> + new Tuple2<>( + new Tuple2<>(record.getPartitionPath(), Option.apply(record.getCurrentLocation())), + record)) + .countByKey(); + + for (Map.Entry>, Long> e : partitionLocationCounts + .entrySet()) { + String partitionPath = e.getKey()._1(); + Long count = e.getValue(); + Option locOption = e.getKey()._2(); + + if (!partitionPathStatMap.containsKey(partitionPath)) { + partitionPathStatMap.put(partitionPath, new WorkloadStat()); + } + + if (locOption.isDefined()) { + // update + partitionPathStatMap.get(partitionPath).addUpdates(locOption.get(), count); + globalStat.addUpdates(locOption.get(), count); + } else { + // insert + partitionPathStatMap.get(partitionPath).addInserts(count); + globalStat.addInserts(count); + } } + } - private void buildProfile() { + public WorkloadStat getGlobalStat() { + return globalStat; + } - Map>, Long> partitionLocationCounts = taggedRecords - .mapToPair(record -> - new Tuple2<>(new Tuple2<>(record.getPartitionPath(), Option.apply(record.getCurrentLocation())), record)) - .countByKey(); + public Set getPartitionPaths() { + return partitionPathStatMap.keySet(); + } - for (Map.Entry>, Long> e: partitionLocationCounts.entrySet()) { - String partitionPath = e.getKey()._1(); - Long count = e.getValue(); - Option locOption = e.getKey()._2(); + public WorkloadStat getWorkloadStat(String partitionPath) { + return partitionPathStatMap.get(partitionPath); + } - if (!partitionPathStatMap.containsKey(partitionPath)){ - partitionPathStatMap.put(partitionPath, new WorkloadStat()); - } - - if (locOption.isDefined()) { - // update - 
partitionPathStatMap.get(partitionPath).addUpdates(locOption.get(), count); - globalStat.addUpdates(locOption.get(), count); - } else { - // insert - partitionPathStatMap.get(partitionPath).addInserts(count); - globalStat.addInserts(count); - } - } - } - - public WorkloadStat getGlobalStat() { - return globalStat; - } - - public Set getPartitionPaths() { - return partitionPathStatMap.keySet(); - } - - public WorkloadStat getWorkloadStat(String partitionPath){ - return partitionPathStatMap.get(partitionPath); - } - - @Override - public String toString() { - final StringBuilder sb = new StringBuilder("WorkloadProfile {"); - sb.append("globalStat=").append(globalStat).append(", "); - sb.append("partitionStat=").append(partitionPathStatMap); - sb.append('}'); - return sb.toString(); - } + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("WorkloadProfile {"); + sb.append("globalStat=").append(globalStat).append(", "); + sb.append("partitionStat=").append(partitionPathStatMap); + sb.append('}'); + return sb.toString(); + } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/table/WorkloadStat.java b/hoodie-client/src/main/java/com/uber/hoodie/table/WorkloadStat.java index a0eea477a..10bf6735a 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/table/WorkloadStat.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/table/WorkloadStat.java @@ -17,7 +17,6 @@ package com.uber.hoodie.table; import com.uber.hoodie.common.model.HoodieRecordLocation; - import java.io.Serializable; import java.util.HashMap; @@ -25,43 +24,44 @@ import java.util.HashMap; * Wraps stats about a single partition path. 
*/ public class WorkloadStat implements Serializable { - private long numInserts = 0L; - private long numUpdates = 0L; + private long numInserts = 0L; - private HashMap updateLocationToCount; + private long numUpdates = 0L; - public WorkloadStat() { - updateLocationToCount = new HashMap<>(); - } + private HashMap updateLocationToCount; - long addInserts(long numInserts) { - return this.numInserts += numInserts; - } + public WorkloadStat() { + updateLocationToCount = new HashMap<>(); + } - long addUpdates(HoodieRecordLocation location, long numUpdates) { - updateLocationToCount.put(location.getFileId(), numUpdates); - return this.numUpdates += numUpdates; - } + long addInserts(long numInserts) { + return this.numInserts += numInserts; + } - public long getNumUpdates() { - return numUpdates; - } + long addUpdates(HoodieRecordLocation location, long numUpdates) { + updateLocationToCount.put(location.getFileId(), numUpdates); + return this.numUpdates += numUpdates; + } - public long getNumInserts() { - return numInserts; - } + public long getNumUpdates() { + return numUpdates; + } - public HashMap getUpdateLocationToCount() { - return updateLocationToCount; - } + public long getNumInserts() { + return numInserts; + } - @Override - public String toString() { - final StringBuilder sb = new StringBuilder("WorkloadStat {"); - sb.append("numInserts=").append(numInserts).append(", "); - sb.append("numUpdates=").append(numUpdates); - sb.append('}'); - return sb.toString(); - } + public HashMap getUpdateLocationToCount() { + return updateLocationToCount; + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("WorkloadStat {"); + sb.append("numInserts=").append(numInserts).append(", "); + sb.append("numUpdates=").append(numUpdates); + sb.append('}'); + return sb.toString(); + } } diff --git a/hoodie-client/src/main/resources/log4j.properties b/hoodie-client/src/main/resources/log4j.properties index 5a8b643fd..ab922d18a 100644 --- 
a/hoodie-client/src/main/resources/log4j.properties +++ b/hoodie-client/src/main/resources/log4j.properties @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # - # Set root logger level to DEBUG and its only appender to A1. log4j.rootLogger=INFO, A1 # A1 is set to be a ConsoleAppender. diff --git a/hoodie-client/src/test/java/HoodieClientExample.java b/hoodie-client/src/test/java/HoodieClientExample.java index 26f097a93..ef31fea1c 100644 --- a/hoodie-client/src/test/java/HoodieClientExample.java +++ b/hoodie-client/src/test/java/HoodieClientExample.java @@ -22,13 +22,12 @@ import com.uber.hoodie.common.HoodieTestDataGenerator; import com.uber.hoodie.common.model.HoodieAvroPayload; import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieTableType; -import com.uber.hoodie.common.table.HoodieTableConfig; import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.config.HoodieIndexConfig; import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.index.HoodieIndex; - +import java.util.List; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; @@ -36,7 +35,6 @@ import org.apache.log4j.Logger; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; -import java.util.List; /** * Driver program that uses the Hoodie client with synthetic workload, and performs basic @@ -44,75 +42,77 @@ import java.util.List; */ public class HoodieClientExample { - @Parameter(names={"--table-path", "-p"}, description = "path for Hoodie sample table") - private String tablePath = "file:///tmp/hoodie/sample-table"; + @Parameter(names = {"--table-path", "-p"}, description = "path for Hoodie sample table") + private String tablePath = "file:///tmp/hoodie/sample-table"; - 
@Parameter(names={"--table-name", "-n"}, description = "table name for Hoodie sample table") - private String tableName = "hoodie_rt"; + @Parameter(names = {"--table-name", "-n"}, description = "table name for Hoodie sample table") + private String tableName = "hoodie_rt"; - @Parameter(names={"--table-type", "-t"}, description = "One of COPY_ON_WRITE or MERGE_ON_READ") - private String tableType = HoodieTableType.COPY_ON_WRITE.name(); + @Parameter(names = {"--table-type", "-t"}, description = "One of COPY_ON_WRITE or MERGE_ON_READ") + private String tableType = HoodieTableType.COPY_ON_WRITE.name(); - @Parameter(names = {"--help", "-h"}, help = true) - public Boolean help = false; + @Parameter(names = {"--help", "-h"}, help = true) + public Boolean help = false; - private static Logger logger = LogManager.getLogger(HoodieClientExample.class); + private static Logger logger = LogManager.getLogger(HoodieClientExample.class); - public static void main(String[] args) throws Exception { - HoodieClientExample cli = new HoodieClientExample(); - JCommander cmd = new JCommander(cli, args); + public static void main(String[] args) throws Exception { + HoodieClientExample cli = new HoodieClientExample(); + JCommander cmd = new JCommander(cli, args); - if (cli.help) { - cmd.usage(); - System.exit(1); - } - cli.run(); + if (cli.help) { + cmd.usage(); + System.exit(1); + } + cli.run(); + } + + + public void run() throws Exception { + + SparkConf sparkConf = new SparkConf().setAppName("hoodie-client-example"); + sparkConf.setMaster("local[1]"); + sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + sparkConf.set("spark.kryoserializer.buffer.max", "512m"); + JavaSparkContext jsc = new JavaSparkContext(sparkConf); + + // Generator of some records to be loaded in. 
+ HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); + + // initialize the table, if not done already + Path path = new Path(tablePath); + FileSystem fs = FSUtils.getFs(); + if (!fs.exists(path)) { + HoodieTableMetaClient + .initTableType(fs, tablePath, HoodieTableType.valueOf(tableType), tableName, + HoodieAvroPayload.class.getName()); } + // Create the write client to write some records in + HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(tablePath) + .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) + .forTable(tableName).withIndexConfig( + HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) + .build(); + HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); - public void run() throws Exception { + /** + * Write 1 (only inserts) + */ + String newCommitTime = client.startCommit(); + logger.info("Starting commit " + newCommitTime); - SparkConf sparkConf = new SparkConf().setAppName("hoodie-client-example"); - sparkConf.setMaster("local[1]"); - sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - sparkConf.set("spark.kryoserializer.buffer.max", "512m"); - JavaSparkContext jsc = new JavaSparkContext(sparkConf); + List records = dataGen.generateInserts(newCommitTime, 100); + JavaRDD writeRecords = jsc.parallelize(records, 1); + client.upsert(writeRecords, newCommitTime); - // Generator of some records to be loaded in. 
- HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); - - // initialize the table, if not done already - Path path = new Path(tablePath); - FileSystem fs = FSUtils.getFs(); - if (!fs.exists(path)) { - HoodieTableMetaClient.initTableType(fs, tablePath, HoodieTableType.valueOf(tableType), tableName, HoodieAvroPayload.class.getName()); - } - - // Create the write client to write some records in - HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(tablePath) - .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) - .forTable(tableName).withIndexConfig( - HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) - .build(); - HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); - - /** - * Write 1 (only inserts) - */ - String newCommitTime = client.startCommit(); - logger.info("Starting commit " + newCommitTime); - - List records = dataGen.generateInserts(newCommitTime, 100); - JavaRDD writeRecords = jsc.parallelize(records, 1); - client.upsert(writeRecords, newCommitTime); - - /** - * Write 2 (updates) - */ - newCommitTime = client.startCommit(); - logger.info("Starting commit " + newCommitTime); - records.addAll(dataGen.generateUpdates(newCommitTime, 100)); - writeRecords = jsc.parallelize(records, 1); - client.upsert(writeRecords, newCommitTime); - } + /** + * Write 2 (updates) + */ + newCommitTime = client.startCommit(); + logger.info("Starting commit " + newCommitTime); + records.addAll(dataGen.generateUpdates(newCommitTime, 100)); + writeRecords = jsc.parallelize(records, 1); + client.upsert(writeRecords, newCommitTime); + } } diff --git a/hoodie-client/src/test/java/com/uber/hoodie/TestHoodieClientOnCopyOnWriteStorage.java b/hoodie-client/src/test/java/com/uber/hoodie/TestHoodieClientOnCopyOnWriteStorage.java index be4022dc1..ccdd12839 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/TestHoodieClientOnCopyOnWriteStorage.java +++ 
b/hoodie-client/src/test/java/com/uber/hoodie/TestHoodieClientOnCopyOnWriteStorage.java @@ -16,8 +16,12 @@ package com.uber.hoodie; -import com.google.common.collect.Iterables; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; +import com.google.common.collect.Iterables; import com.uber.hoodie.common.HoodieCleanStat; import com.uber.hoodie.common.HoodieClientTestUtils; import com.uber.hoodie.common.HoodieTestDataGenerator; @@ -45,22 +49,6 @@ import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.exception.HoodieRollbackException; import com.uber.hoodie.index.HoodieIndex; import com.uber.hoodie.table.HoodieTable; - -import org.apache.avro.generic.GenericRecord; -import org.apache.commons.io.IOUtils; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.scheduler.SparkListener; -import org.apache.spark.scheduler.SparkListenerTaskEnd; -import org.apache.spark.sql.SQLContext; -import org.apache.spark.util.AccumulatorV2; -import org.junit.After; -import org.junit.Before; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - import java.io.File; import java.io.FileInputStream; import java.io.IOException; @@ -76,1399 +64,1488 @@ import java.util.Optional; import java.util.Set; import java.util.TreeSet; import java.util.stream.Collectors; - +import org.apache.avro.generic.GenericRecord; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.scheduler.SparkListener; +import org.apache.spark.scheduler.SparkListenerTaskEnd; +import org.apache.spark.sql.SQLContext; +import org.apache.spark.util.AccumulatorV2; 
+import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; import scala.collection.Iterator; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; -import static org.junit.Assert.fail; - public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { - private transient JavaSparkContext jsc = null; - private transient SQLContext sqlContext; - private String basePath = null; - private transient HoodieTestDataGenerator dataGen = null; - private String[] partitionPaths = {"2016/01/01", "2016/02/02", "2016/06/02"}; - @Before - public void init() throws IOException { - // Initialize a local spark env - jsc = new JavaSparkContext(HoodieClientTestUtils.getSparkConfForTest("TestHoodieClient")); + private transient JavaSparkContext jsc = null; + private transient SQLContext sqlContext; + private String basePath = null; + private transient HoodieTestDataGenerator dataGen = null; + private String[] partitionPaths = {"2016/01/01", "2016/02/02", "2016/06/02"}; - //SQLContext stuff - sqlContext = new SQLContext(jsc); + @Before + public void init() throws IOException { + // Initialize a local spark env + jsc = new JavaSparkContext(HoodieClientTestUtils.getSparkConfForTest("TestHoodieClient")); - // Create a temp folder as the base path - TemporaryFolder folder = new TemporaryFolder(); - folder.create(); - basePath = folder.getRoot().getAbsolutePath(); - HoodieTestUtils.init(basePath); - dataGen = new HoodieTestDataGenerator(); + //SQLContext stuff + sqlContext = new SQLContext(jsc); + + // Create a temp folder as the base path + TemporaryFolder folder = new TemporaryFolder(); + folder.create(); + basePath = folder.getRoot().getAbsolutePath(); + HoodieTestUtils.init(basePath); + dataGen = new HoodieTestDataGenerator(); + } + + + private HoodieWriteConfig getConfig() { + return getConfigBuilder().build(); + } + + private 
HoodieWriteConfig.Builder getConfigBuilder() { + return HoodieWriteConfig.newBuilder().withPath(basePath) + .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) + .withCompactionConfig( + HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024).build()) + .withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024).build()) + .forTable("test-trip-table").withIndexConfig( + HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()); + } + + private void assertNoWriteErrors(List statuses) { + // Verify there are no errors + for (WriteStatus status : statuses) { + assertFalse("Errors found in write of " + status.getFileId(), status.hasErrors()); + } + } + + private void assertPartitionMetadata(String[] partitionPaths, FileSystem fs) throws IOException { + for (String partitionPath : partitionPaths) { + assertTrue( + HoodiePartitionMetadata.hasPartitionMetadata(fs, new Path(basePath, partitionPath))); + HoodiePartitionMetadata pmeta = new HoodiePartitionMetadata(fs, + new Path(basePath, partitionPath)); + pmeta.readFromFS(); + assertEquals(3, pmeta.getPartitionDepth()); + } + } + + private void checkTaggedRecords(List taggedRecords, String commitTime) { + for (HoodieRecord rec : taggedRecords) { + assertTrue("Record " + rec + " found with no location.", rec.isCurrentLocationKnown()); + assertEquals( + "All records should have commit time " + commitTime + ", since updates were made", + rec.getCurrentLocation().getCommitTime(), commitTime); + } + } + + + @Test + public void testFilterExist() throws Exception { + HoodieWriteConfig config = getConfig(); + HoodieWriteClient writeClient = new HoodieWriteClient(jsc, config); + String newCommitTime = writeClient.startCommit(); + List records = dataGen.generateInserts(newCommitTime, 100); + JavaRDD recordsRDD = jsc.parallelize(records, 1); + + HoodieReadClient readClient = new HoodieReadClient(jsc, config.getBasePath()); + JavaRDD filteredRDD = 
readClient.filterExists(recordsRDD); + + // Should not find any files + assertTrue(filteredRDD.collect().size() == 100); + + JavaRDD smallRecordsRDD = jsc.parallelize(records.subList(0, 75), 1); + // We create three parquet file, each having one record. (two different partitions) + List statuses = writeClient.bulkInsert(smallRecordsRDD, newCommitTime).collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); + + readClient = new HoodieReadClient(jsc, config.getBasePath()); + filteredRDD = readClient.filterExists(recordsRDD); + List result = filteredRDD.collect(); + // Check results + assertTrue(result.size() == 25); + } + + @Test + public void testAutoCommit() throws Exception { + // Set autoCommit false + HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).build(); + HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); + + String newCommitTime = "001"; + client.startCommitWithTime(newCommitTime); + + List records = dataGen.generateInserts(newCommitTime, 200); + JavaRDD writeRecords = jsc.parallelize(records, 1); + + JavaRDD result = client.bulkInsert(writeRecords, newCommitTime); + + assertFalse("If Autocommit is false, then commit should not be made automatically", + HoodieTestUtils.doesCommitExist(basePath, newCommitTime)); + assertTrue("Commit should succeed", client.commit(newCommitTime, result)); + assertTrue("After explicit commit, commit file should be created", + HoodieTestUtils.doesCommitExist(basePath, newCommitTime)); + + newCommitTime = "002"; + client.startCommitWithTime(newCommitTime); + + records = dataGen.generateUpdates(newCommitTime, 100); + JavaRDD updateRecords = jsc.parallelize(records, 1); + result = client.upsert(updateRecords, newCommitTime); + assertFalse("If Autocommit is false, then commit should not be made automatically", + HoodieTestUtils.doesCommitExist(basePath, newCommitTime)); + assertTrue("Commit should succeed", client.commit(newCommitTime, result)); + assertTrue("After explicit commit, 
commit file should be created", + HoodieTestUtils.doesCommitExist(basePath, newCommitTime)); + } + + @Test + public void testUpserts() throws Exception { + HoodieWriteConfig cfg = getConfig(); + HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); + HoodieIndex index = HoodieIndex.createIndex(cfg, jsc); + FileSystem fs = FSUtils.getFs(); + + /** + * Write 1 (only inserts) + */ + String newCommitTime = "001"; + client.startCommitWithTime(newCommitTime); + + List records = dataGen.generateInserts(newCommitTime, 200); + JavaRDD writeRecords = jsc.parallelize(records, 1); + + List statuses = client.upsert(writeRecords, newCommitTime).collect(); + assertNoWriteErrors(statuses); + + // check the partition metadata is written out + assertPartitionMetadata(HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, fs); + + // verify that there is a commit + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath); + HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()) + .getCommitTimeline(); + + assertEquals("Expecting a single commit.", 1, + timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants()); + assertEquals("Latest commit should be 001", newCommitTime, + timeline.lastInstant().get().getTimestamp()); + assertEquals("Must contain 200 records", + records.size(), + HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count()); + // Should have 100 records in table (check using Index), all in locations marked at commit + HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig()); + + List taggedRecords = index.tagLocation(jsc.parallelize(records, 1), table) + .collect(); + checkTaggedRecords(taggedRecords, "001"); + + /** + * Write 2 (updates) + */ + newCommitTime = "004"; + client.startCommitWithTime(newCommitTime); + + records = dataGen.generateUpdates(newCommitTime, 100); + LinkedHashMap recordsMap = new LinkedHashMap<>(); + for (HoodieRecord rec : records) { + if 
(!recordsMap.containsKey(rec.getKey())) { + recordsMap.put(rec.getKey(), rec); + } + } + List dedupedRecords = new ArrayList<>(recordsMap.values()); + + statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); + + // verify there are now 2 commits + timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()).getCommitTimeline(); + assertEquals("Expecting two commits.", + timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), 2); + assertEquals("Latest commit should be 004", timeline.lastInstant().get().getTimestamp(), + newCommitTime); + + metaClient = new HoodieTableMetaClient(fs, basePath); + table = HoodieTable.getHoodieTable(metaClient, getConfig()); + + // Index should be able to locate all updates in correct locations. + taggedRecords = index.tagLocation(jsc.parallelize(dedupedRecords, 1), table).collect(); + checkTaggedRecords(taggedRecords, "004"); + + // Check the entire dataset has 100 records still + String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length]; + for (int i = 0; i < fullPartitionPaths.length; i++) { + fullPartitionPaths[i] = String.format("%s/%s/*", basePath, dataGen.getPartitionPaths()[i]); + } + assertEquals("Must contain 200 records", + 200, + HoodieClientTestUtils.read(basePath, sqlContext, fs, fullPartitionPaths).count()); + + // Check that the incremental consumption from time 000 + assertEquals("Incremental consumption from time 002, should give all records in commit 004", + HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(), + HoodieClientTestUtils.readSince(basePath, sqlContext, timeline, "002").count()); + assertEquals("Incremental consumption from time 001, should give all records in commit 004", + HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(), + HoodieClientTestUtils.readSince(basePath, sqlContext, timeline, "001").count()); 
+ } + + @Test + public void testDeletes() throws Exception { + + HoodieWriteConfig cfg = getConfig(); + HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); + HoodieIndex index = HoodieIndex.createIndex(cfg, jsc); + FileSystem fs = FSUtils.getFs(); + + /** + * Write 1 (inserts and deletes) + * Write actual 200 insert records and ignore 100 delete records + */ + String newCommitTime = "001"; + client.startCommitWithTime(newCommitTime); + + List fewRecordsForInsert = dataGen.generateInserts(newCommitTime, 200); + List fewRecordsForDelete = dataGen.generateDeletes(newCommitTime, 100); + + List records = new ArrayList(fewRecordsForInsert); + records.addAll(fewRecordsForDelete); + + JavaRDD writeRecords = jsc.parallelize(records, 1); + + List statuses = client.upsert(writeRecords, newCommitTime).collect(); + assertNoWriteErrors(statuses); + + // verify that there is a commit + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath); + HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()) + .getCommitTimeline(); + assertEquals("Expecting a single commit.", 1, + timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants()); + assertEquals("Latest commit should be 001", newCommitTime, + timeline.lastInstant().get().getTimestamp()); + assertEquals("Must contain 200 records", fewRecordsForInsert.size(), + HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count()); + // Should have 100 records in table (check using Index), all in locations marked at commit + HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig()); + + List taggedRecords = index + .tagLocation(jsc.parallelize(fewRecordsForInsert, 1), table).collect(); + checkTaggedRecords(taggedRecords, "001"); + + /** + * Write 2 (deletes+writes) + */ + newCommitTime = "004"; + client.startCommitWithTime(newCommitTime); + + fewRecordsForDelete = records.subList(0, 50); + List fewRecordsForUpdate = records.subList(50, 
100); + records = dataGen.generateDeletesFromExistingRecords(fewRecordsForDelete); + + records.addAll(fewRecordsForUpdate); + + statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); + + // verify there are now 2 commits + timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()).getCommitTimeline(); + assertEquals("Expecting two commits.", + timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), 2); + assertEquals("Latest commit should be 004", timeline.lastInstant().get().getTimestamp(), + newCommitTime); + + // Check the entire dataset has 150 records(200-50) still + String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length]; + for (int i = 0; i < fullPartitionPaths.length; i++) { + fullPartitionPaths[i] = String.format("%s/%s/*", basePath, dataGen.getPartitionPaths()[i]); + } + assertEquals("Must contain 150 records", 150, + HoodieClientTestUtils.read(basePath, sqlContext, fs, fullPartitionPaths).count()); + + // Check that the incremental consumption from time 000 + assertEquals("Incremental consumption from latest commit, should give 50 updated records", + 50, + HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count()); + assertEquals("Incremental consumption from time 001, should give 50 updated records", + 50, + HoodieClientTestUtils.readSince(basePath, sqlContext, timeline, "001").count()); + assertEquals("Incremental consumption from time 000, should give 150", + 150, + HoodieClientTestUtils.readSince(basePath, sqlContext, timeline, "000").count()); + } + + + @Test + public void testCreateSavepoint() throws Exception { + HoodieWriteConfig cfg = getConfigBuilder().withCompactionConfig( + HoodieCompactionConfig.newBuilder() + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(1) + .build()).build(); + HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); + FileSystem 
fs = FSUtils.getFs(); + HoodieTestDataGenerator + .writePartitionMetadata(fs, HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, basePath); + + /** + * Write 1 (only inserts) + */ + String newCommitTime = "001"; + client.startCommitWithTime(newCommitTime); + + List records = dataGen.generateInserts(newCommitTime, 200); + List statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime) + .collect(); + assertNoWriteErrors(statuses); + + /** + * Write 2 (updates) + */ + newCommitTime = "002"; + client.startCommitWithTime(newCommitTime); + + records = dataGen.generateUpdates(newCommitTime, records); + statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); + + client.savepoint("hoodie-unit-test", "test"); + try { + client.rollback(newCommitTime); + fail("Rollback of a savepoint was allowed " + newCommitTime); + } catch (HoodieRollbackException e) { + // this is good } + /** + * Write 3 (updates) + */ + newCommitTime = "003"; + client.startCommitWithTime(newCommitTime); - private HoodieWriteConfig getConfig() { - return getConfigBuilder().build(); + records = dataGen.generateUpdates(newCommitTime, records); + statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); + + /** + * Write 4 (updates) + */ + newCommitTime = "004"; + client.startCommitWithTime(newCommitTime); + + records = dataGen.generateUpdates(newCommitTime, records); + statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); + + List partitionPaths = FSUtils + .getAllPartitionPaths(fs, cfg.getBasePath(), getConfig().shouldAssumeDatePartitioning()); + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath); + HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig()); + final 
TableFileSystemView.ReadOptimizedView view = table.getROFileSystemView(); + List dataFiles = partitionPaths.stream().flatMap(s -> { + return view.getAllDataFiles(s).filter(f -> f.getCommitTime().equals("002")); + }).collect(Collectors.toList()); + + assertEquals("The data files for commit 002 should not be cleaned", 3, dataFiles.size()); + + // Delete savepoint + assertFalse(table.getCompletedSavepointTimeline().empty()); + client.deleteSavepoint( + table.getCompletedSavepointTimeline().getInstants().findFirst().get().getTimestamp()); + // rollback and reupsert 004 + client.rollback(newCommitTime); + + client.startCommitWithTime(newCommitTime); + statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); + + metaClient = new HoodieTableMetaClient(fs, basePath); + table = HoodieTable.getHoodieTable(metaClient, getConfig()); + final TableFileSystemView.ReadOptimizedView view1 = table.getROFileSystemView(); + dataFiles = partitionPaths.stream().flatMap(s -> { + return view1.getAllDataFiles(s).filter(f -> f.getCommitTime().equals("002")); + }).collect(Collectors.toList()); + + assertEquals("The data files for commit 002 should be cleaned now", 0, dataFiles.size()); + } + + + @Test + public void testRollbackToSavepoint() throws Exception { + HoodieWriteConfig cfg = getConfigBuilder().withCompactionConfig( + HoodieCompactionConfig.newBuilder() + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(1) + .build()).build(); + HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); + FileSystem fs = FSUtils.getFs(); + HoodieTestDataGenerator + .writePartitionMetadata(fs, HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, basePath); + + /** + * Write 1 (only inserts) + */ + String newCommitTime = "001"; + client.startCommitWithTime(newCommitTime); + + List records = dataGen.generateInserts(newCommitTime, 200); + JavaRDD writeRecords = jsc.parallelize(records, 1); + + 
List statuses = client.upsert(writeRecords, newCommitTime).collect(); + assertNoWriteErrors(statuses); + + /** + * Write 2 (updates) + */ + newCommitTime = "002"; + client.startCommitWithTime(newCommitTime); + + records = dataGen.generateUpdates(newCommitTime, records); + statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); + + client.savepoint("hoodie-unit-test", "test"); + + /** + * Write 3 (updates) + */ + newCommitTime = "003"; + client.startCommitWithTime(newCommitTime); + + records = dataGen.generateUpdates(newCommitTime, records); + statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); + List partitionPaths = FSUtils + .getAllPartitionPaths(fs, cfg.getBasePath(), getConfig().shouldAssumeDatePartitioning()); + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath); + HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig()); + final TableFileSystemView.ReadOptimizedView view1 = table.getROFileSystemView(); + + List dataFiles = partitionPaths.stream().flatMap(s -> { + return view1.getAllDataFiles(s).filter(f -> f.getCommitTime().equals("003")); + }).collect(Collectors.toList()); + assertEquals("The data files for commit 003 should be present", 3, dataFiles.size()); + + /** + * Write 4 (updates) + */ + newCommitTime = "004"; + client.startCommitWithTime(newCommitTime); + + records = dataGen.generateUpdates(newCommitTime, records); + statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); + + metaClient = new HoodieTableMetaClient(fs, basePath); + table = HoodieTable.getHoodieTable(metaClient, getConfig()); + final TableFileSystemView.ReadOptimizedView view2 = table.getROFileSystemView(); + + dataFiles = partitionPaths.stream().flatMap(s -> { + return 
view2.getAllDataFiles(s).filter(f -> f.getCommitTime().equals("004")); + }).collect(Collectors.toList()); + assertEquals("The data files for commit 004 should be present", 3, dataFiles.size()); + + // rolling back to a non existent savepoint must not succeed + try { + client.rollbackToSavepoint("001"); + fail("Rolling back to non-existent savepoint should not be allowed"); + } catch (HoodieRollbackException e) { + // this is good } - private HoodieWriteConfig.Builder getConfigBuilder() { - return HoodieWriteConfig.newBuilder().withPath(basePath) - .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) - .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024).build()) - .withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024).build()) - .forTable("test-trip-table").withIndexConfig( - HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()); - } + // rollback to savepoint 002 + HoodieInstant savepoint = + table.getCompletedSavepointTimeline().getInstants().findFirst().get(); + client.rollbackToSavepoint(savepoint.getTimestamp()); - private void assertNoWriteErrors(List statuses) { - // Verify there are no errors - for (WriteStatus status : statuses) { - assertFalse("Errors found in write of " + status.getFileId(), status.hasErrors()); - } - } + metaClient = new HoodieTableMetaClient(fs, basePath); + table = HoodieTable.getHoodieTable(metaClient, getConfig()); + final TableFileSystemView.ReadOptimizedView view3 = table.getROFileSystemView(); + dataFiles = partitionPaths.stream().flatMap(s -> { + return view3.getAllDataFiles(s).filter(f -> f.getCommitTime().equals("002")); + }).collect(Collectors.toList()); + assertEquals("The data files for commit 002 be available", 3, dataFiles.size()); - private void assertPartitionMetadata(String[] partitionPaths, FileSystem fs) throws IOException { - for (String partitionPath: partitionPaths) { - 
assertTrue(HoodiePartitionMetadata.hasPartitionMetadata(fs, new Path(basePath, partitionPath))); - HoodiePartitionMetadata pmeta = new HoodiePartitionMetadata(fs, new Path(basePath, partitionPath)); - pmeta.readFromFS(); - assertEquals(3, pmeta.getPartitionDepth()); - } - } + dataFiles = partitionPaths.stream().flatMap(s -> { + return view3.getAllDataFiles(s).filter(f -> f.getCommitTime().equals("003")); + }).collect(Collectors.toList()); + assertEquals("The data files for commit 003 should be rolled back", 0, dataFiles.size()); - private void checkTaggedRecords(List taggedRecords, String commitTime) { - for (HoodieRecord rec : taggedRecords) { - assertTrue("Record " + rec + " found with no location.", rec.isCurrentLocationKnown()); - assertEquals("All records should have commit time "+ commitTime+", since updates were made", - rec.getCurrentLocation().getCommitTime(), commitTime); - } - } + dataFiles = partitionPaths.stream().flatMap(s -> { + return view3.getAllDataFiles(s).filter(f -> f.getCommitTime().equals("004")); + }).collect(Collectors.toList()); + assertEquals("The data files for commit 004 should be rolled back", 0, dataFiles.size()); + } + @Test + public void testInsertAndCleanByVersions() throws Exception { + int maxVersions = 2; // keep upto 2 versions for each file + HoodieWriteConfig cfg = getConfigBuilder().withCompactionConfig( + HoodieCompactionConfig.newBuilder() + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS) + .retainFileVersions(maxVersions).build()).build(); + HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); + HoodieIndex index = HoodieIndex.createIndex(cfg, jsc); + FileSystem fs = FSUtils.getFs(); - @Test - public void testFilterExist() throws Exception { - HoodieWriteConfig config = getConfig(); - HoodieWriteClient writeClient = new HoodieWriteClient(jsc, config); - String newCommitTime = writeClient.startCommit(); - List records = dataGen.generateInserts(newCommitTime, 100); - JavaRDD recordsRDD = 
jsc.parallelize(records, 1); + /** + * do a big insert + * (this is basically same as insert part of upsert, just adding it here so we can + * catch breakages in insert(), if the implementation diverges.) + */ + String newCommitTime = client.startCommit(); + List records = dataGen.generateInserts(newCommitTime, 500); + JavaRDD writeRecords = jsc.parallelize(records, 5); - HoodieReadClient readClient = new HoodieReadClient(jsc, config.getBasePath()); - JavaRDD filteredRDD = readClient.filterExists(recordsRDD); + List statuses = client.insert(writeRecords, newCommitTime).collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); - // Should not find any files - assertTrue(filteredRDD.collect().size() == 100); + // verify that there is a commit + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath); + HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()) + .getCommitTimeline(); + assertEquals("Expecting a single commit.", 1, + timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants()); + // Should have 100 records in table (check using Index), all in locations marked at commit + HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig()); + assertFalse(table.getCompletedCommitTimeline().empty()); + String commitTime = + table.getCompletedCommitTimeline().getInstants().findFirst().get().getTimestamp(); + assertFalse(table.getCompletedCleanTimeline().empty()); + assertEquals("The clean instant should be the same as the commit instant", commitTime, + table.getCompletedCleanTimeline().getInstants().findFirst().get().getTimestamp()); - JavaRDD smallRecordsRDD = jsc.parallelize(records.subList(0, 75), 1); - // We create three parquet file, each having one record. 
(two different partitions) - List statuses = writeClient.bulkInsert(smallRecordsRDD, newCommitTime).collect(); - // Verify there are no errors - assertNoWriteErrors(statuses); + List taggedRecords = index.tagLocation(jsc.parallelize(records, 1), table) + .collect(); + checkTaggedRecords(taggedRecords, newCommitTime); - readClient = new HoodieReadClient(jsc, config.getBasePath()); - filteredRDD = readClient.filterExists(recordsRDD); - List result = filteredRDD.collect(); - // Check results - assertTrue(result.size() == 25); - } + // Keep doing some writes and clean inline. Make sure we have expected number of files remaining. + for (int writeCnt = 2; writeCnt < 10; writeCnt++) { - @Test - public void testAutoCommit() throws Exception { - // Set autoCommit false - HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).build(); - HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); + Thread.sleep(1100); // make sure commits are unique + newCommitTime = client.startCommit(); + records = dataGen.generateUpdates(newCommitTime, 100); - String newCommitTime = "001"; - client.startCommitWithTime(newCommitTime); + statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); - List records = dataGen.generateInserts(newCommitTime, 200); - JavaRDD writeRecords = jsc.parallelize(records, 1); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); + table = HoodieTable.getHoodieTable(metadata, getConfig()); + timeline = table.getCommitTimeline(); - JavaRDD result = client.bulkInsert(writeRecords, newCommitTime); + TableFileSystemView fsView = table.getFileSystemView(); + // Need to ensure the following + for (String partitionPath : dataGen.getPartitionPaths()) { + // compute all the versions of all files, from time 0 + HashMap> fileIdToVersions = new HashMap<>(); + for (HoodieInstant entry : timeline.getInstants().collect(Collectors.toList())) { + 
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata + .fromBytes(timeline.getInstantDetails(entry).get()); - assertFalse("If Autocommit is false, then commit should not be made automatically", - HoodieTestUtils.doesCommitExist(basePath, newCommitTime)); - assertTrue("Commit should succeed", client.commit(newCommitTime, result)); - assertTrue("After explicit commit, commit file should be created", - HoodieTestUtils.doesCommitExist(basePath, newCommitTime)); - - newCommitTime = "002"; - client.startCommitWithTime(newCommitTime); - - records = dataGen.generateUpdates(newCommitTime, 100); - JavaRDD updateRecords = jsc.parallelize(records, 1); - result = client.upsert(updateRecords, newCommitTime); - assertFalse("If Autocommit is false, then commit should not be made automatically", - HoodieTestUtils.doesCommitExist(basePath, newCommitTime)); - assertTrue("Commit should succeed", client.commit(newCommitTime, result)); - assertTrue("After explicit commit, commit file should be created", - HoodieTestUtils.doesCommitExist(basePath, newCommitTime)); - } - - @Test - public void testUpserts() throws Exception { - HoodieWriteConfig cfg = getConfig(); - HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); - HoodieIndex index = HoodieIndex.createIndex(cfg, jsc); - FileSystem fs = FSUtils.getFs(); - - /** - * Write 1 (only inserts) - */ - String newCommitTime = "001"; - client.startCommitWithTime(newCommitTime); - - List records = dataGen.generateInserts(newCommitTime, 200); - JavaRDD writeRecords = jsc.parallelize(records, 1); - - List statuses = client.upsert(writeRecords, newCommitTime).collect(); - assertNoWriteErrors(statuses); - - // check the partition metadata is written out - assertPartitionMetadata(HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, fs); - - // verify that there is a commit - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath); - HoodieTimeline timeline = new HoodieActiveTimeline(fs, 
metaClient.getMetaPath()).getCommitTimeline(); - - assertEquals("Expecting a single commit.", 1, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants()); - assertEquals("Latest commit should be 001", newCommitTime, timeline.lastInstant().get().getTimestamp()); - assertEquals("Must contain 200 records", - records.size(), - HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count()); - // Should have 100 records in table (check using Index), all in locations marked at commit - HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig()); - - List taggedRecords = index.tagLocation(jsc.parallelize(records, 1), table).collect(); - checkTaggedRecords(taggedRecords, "001"); - - /** - * Write 2 (updates) - */ - newCommitTime = "004"; - client.startCommitWithTime(newCommitTime); - - records = dataGen.generateUpdates(newCommitTime, 100); - LinkedHashMap recordsMap = new LinkedHashMap<>(); - for (HoodieRecord rec : records) { - if (!recordsMap.containsKey(rec.getKey())) { - recordsMap.put(rec.getKey(), rec); + for (HoodieWriteStat wstat : commitMetadata.getWriteStats(partitionPath)) { + if (!fileIdToVersions.containsKey(wstat.getFileId())) { + fileIdToVersions.put(wstat.getFileId(), new TreeSet<>()); } + fileIdToVersions.get(wstat.getFileId()) + .add(FSUtils.getCommitTime(new Path(wstat.getPath()).getName())); + } } - List dedupedRecords = new ArrayList<>(recordsMap.values()); - statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); - // Verify there are no errors - assertNoWriteErrors(statuses); + List fileGroups = fsView.getAllFileGroups(partitionPath) + .collect(Collectors.toList()); - // verify there are now 2 commits - timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()).getCommitTimeline(); - assertEquals("Expecting two commits.", timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), 2); - assertEquals("Latest commit should be 004", 
timeline.lastInstant().get().getTimestamp(), newCommitTime); + for (HoodieFileGroup fileGroup : fileGroups) { + // No file has no more than max versions + String fileId = fileGroup.getId(); + List dataFiles = fileGroup.getAllDataFiles().collect(Collectors.toList()); - metaClient = new HoodieTableMetaClient(fs, basePath); - table = HoodieTable.getHoodieTable(metaClient, getConfig()); + assertTrue("fileId " + fileId + " has more than " + maxVersions + " versions", + dataFiles.size() <= maxVersions); - // Index should be able to locate all updates in correct locations. - taggedRecords = index.tagLocation(jsc.parallelize(dedupedRecords, 1), table).collect(); - checkTaggedRecords(taggedRecords, "004"); - - // Check the entire dataset has 100 records still - String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length]; - for (int i=0; i < fullPartitionPaths.length; i++) { - fullPartitionPaths[i] = String.format("%s/%s/*", basePath, dataGen.getPartitionPaths()[i]); + // Each file, has the latest N versions (i.e cleaning gets rid of older versions) + List commitedVersions = new ArrayList<>(fileIdToVersions.get(fileId)); + for (int i = 0; i < dataFiles.size(); i++) { + assertEquals( + "File " + fileId + " does not have latest versions on commits" + commitedVersions, + Iterables.get(dataFiles, i).getCommitTime(), + commitedVersions.get(commitedVersions.size() - 1 - i)); + } } - assertEquals("Must contain 200 records", - 200, - HoodieClientTestUtils.read(basePath, sqlContext, fs, fullPartitionPaths).count()); - - - // Check that the incremental consumption from time 000 - assertEquals("Incremental consumption from time 002, should give all records in commit 004", - HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(), - HoodieClientTestUtils.readSince(basePath, sqlContext, timeline, "002").count()); - assertEquals("Incremental consumption from time 001, should give all records in commit 004", - 
HoodieClientTestUtils.readCommit(basePath, sqlContext,timeline, newCommitTime).count(), - HoodieClientTestUtils.readSince(basePath, sqlContext,timeline, "001").count()); + } } + } - @Test - public void testDeletes() throws Exception { + @Test + public void testInsertAndCleanByCommits() throws Exception { + int maxCommits = 3; // keep upto 3 commits from the past + HoodieWriteConfig cfg = getConfigBuilder().withCompactionConfig( + HoodieCompactionConfig.newBuilder() + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS) + .retainCommits(maxCommits).build()).build(); + HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); + HoodieIndex index = HoodieIndex.createIndex(cfg, jsc); + FileSystem fs = FSUtils.getFs(); - HoodieWriteConfig cfg = getConfig(); - HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); - HoodieIndex index = HoodieIndex.createIndex(cfg, jsc); - FileSystem fs = FSUtils.getFs(); + /** + * do a big insert + * (this is basically same as insert part of upsert, just adding it here so we can + * catch breakages in insert(), if the implementation diverges.) 
+ */ + String newCommitTime = client.startCommit(); + List records = dataGen.generateInserts(newCommitTime, 500); + JavaRDD writeRecords = jsc.parallelize(records, 5); - /** - * Write 1 (inserts and deletes) - * Write actual 200 insert records and ignore 100 delete records - */ - String newCommitTime = "001"; - client.startCommitWithTime(newCommitTime); + List statuses = client.insert(writeRecords, newCommitTime).collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); - List fewRecordsForInsert = dataGen.generateInserts(newCommitTime, 200); - List fewRecordsForDelete = dataGen.generateDeletes(newCommitTime, 100); + // verify that there is a commit + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath); + HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()) + .getCommitTimeline(); + assertEquals("Expecting a single commit.", 1, + timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants()); + // Should have 100 records in table (check using Index), all in locations marked at commit + HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig()); - List records = new ArrayList(fewRecordsForInsert); - records.addAll(fewRecordsForDelete); + assertFalse(table.getCompletedCommitTimeline().empty()); + String commitTime = + table.getCompletedCommitTimeline().getInstants().findFirst().get().getTimestamp(); + assertFalse(table.getCompletedCleanTimeline().empty()); + assertEquals("The clean instant should be the same as the commit instant", commitTime, + table.getCompletedCleanTimeline().getInstants().findFirst().get().getTimestamp()); - JavaRDD writeRecords = jsc.parallelize(records, 1); + List taggedRecords = index.tagLocation(jsc.parallelize(records, 1), table) + .collect(); + checkTaggedRecords(taggedRecords, newCommitTime); - List statuses = client.upsert(writeRecords, newCommitTime).collect(); - assertNoWriteErrors(statuses); + // Keep doing some writes and clean inline. 
Make sure we have expected number of files remaining. + for (int writeCnt = 2; writeCnt < 10; writeCnt++) { + Thread.sleep(1100); // make sure commits are unique + newCommitTime = client.startCommit(); + records = dataGen.generateUpdates(newCommitTime, 100); - // verify that there is a commit - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath); - HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()).getCommitTimeline(); - assertEquals("Expecting a single commit.", 1, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants()); - assertEquals("Latest commit should be 001", newCommitTime, timeline.lastInstant().get().getTimestamp()); - assertEquals("Must contain 200 records", fewRecordsForInsert.size(), - HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count()); - // Should have 100 records in table (check using Index), all in locations marked at commit - HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig()); + statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); - List taggedRecords = index.tagLocation(jsc.parallelize(fewRecordsForInsert, 1), table).collect(); - checkTaggedRecords(taggedRecords, "001"); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); + HoodieTable table1 = HoodieTable.getHoodieTable(metadata, cfg); + HoodieTimeline activeTimeline = table1.getCompletedCommitTimeline(); + Optional + earliestRetainedCommit = activeTimeline.nthFromLastInstant(maxCommits - 1); + Set acceptableCommits = + activeTimeline.getInstants().collect(Collectors.toSet()); + if (earliestRetainedCommit.isPresent()) { + acceptableCommits.removeAll( + activeTimeline.findInstantsInRange("000", earliestRetainedCommit.get().getTimestamp()) + .getInstants() + .collect(Collectors.toSet())); + acceptableCommits.add(earliestRetainedCommit.get()); + } - /** - * 
Write 2 (deletes+writes) - */ - newCommitTime = "004"; - client.startCommitWithTime(newCommitTime); - - fewRecordsForDelete = records.subList(0,50); - List fewRecordsForUpdate = records.subList(50,100); - records = dataGen.generateDeletesFromExistingRecords(fewRecordsForDelete); - - records.addAll(fewRecordsForUpdate); - - statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); - // Verify there are no errors - assertNoWriteErrors(statuses); - - // verify there are now 2 commits - timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()).getCommitTimeline(); - assertEquals("Expecting two commits.", timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), 2); - assertEquals("Latest commit should be 004", timeline.lastInstant().get().getTimestamp(), newCommitTime); - - // Check the entire dataset has 150 records(200-50) still - String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length]; - for (int i=0; i < fullPartitionPaths.length; i++) { - fullPartitionPaths[i] = String.format("%s/%s/*", basePath, dataGen.getPartitionPaths()[i]); + TableFileSystemView fsView = table1.getFileSystemView(); + // Need to ensure the following + for (String partitionPath : dataGen.getPartitionPaths()) { + List fileGroups = fsView.getAllFileGroups(partitionPath) + .collect(Collectors.toList()); + for (HoodieFileGroup fileGroup : fileGroups) { + Set commitTimes = new HashSet<>(); + fileGroup.getAllDataFiles().forEach(value -> { + System.out.println("Data File - " + value); + commitTimes.add(value.getCommitTime()); + }); + assertEquals("Only contain acceptable versions of file should be present", + acceptableCommits.stream().map(HoodieInstant::getTimestamp) + .collect(Collectors.toSet()), commitTimes); } - assertEquals("Must contain 150 records", 150, - HoodieClientTestUtils.read(basePath, sqlContext, fs, fullPartitionPaths).count()); - - - // Check that the incremental consumption from time 000 - assertEquals("Incremental 
consumption from latest commit, should give 50 updated records", - 50, - HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count()); - assertEquals("Incremental consumption from time 001, should give 50 updated records", - 50, - HoodieClientTestUtils.readSince(basePath, sqlContext, timeline, "001").count()); - assertEquals("Incremental consumption from time 000, should give 150", - 150, - HoodieClientTestUtils.readSince(basePath, sqlContext, timeline, "000").count()); + } } + } + @Test + public void testRollbackCommit() throws Exception { + // Let's create some commit files and parquet files + String commitTime1 = "20160501010101"; + String commitTime2 = "20160502020601"; + String commitTime3 = "20160506030611"; + new File(basePath + "/.hoodie").mkdirs(); + HoodieTestDataGenerator.writePartitionMetadata(FSUtils.getFs(), + new String[]{"2016/05/01", "2016/05/02", "2016/05/06"}, + basePath); - @Test - public void testCreateSavepoint() throws Exception { - HoodieWriteConfig cfg = getConfigBuilder().withCompactionConfig( - HoodieCompactionConfig.newBuilder() - .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(1) + // Only first two have commit files + HoodieTestUtils.createCommitFiles(basePath, commitTime1, commitTime2); + // Third one has a .inflight intermediate commit file + HoodieTestUtils.createInflightCommitFiles(basePath, commitTime3); + + // Make commit1 + String file11 = HoodieTestUtils.createDataFile(basePath, "2016/05/01", commitTime1, "id11"); + String file12 = HoodieTestUtils.createDataFile(basePath, "2016/05/02", commitTime1, "id12"); + String file13 = HoodieTestUtils.createDataFile(basePath, "2016/05/06", commitTime1, "id13"); + + // Make commit2 + String file21 = HoodieTestUtils.createDataFile(basePath, "2016/05/01", commitTime2, "id21"); + String file22 = HoodieTestUtils.createDataFile(basePath, "2016/05/02", commitTime2, "id22"); + String file23 = HoodieTestUtils.createDataFile(basePath, 
"2016/05/06", commitTime2, "id23"); + + // Make commit3 + String file31 = HoodieTestUtils.createDataFile(basePath, "2016/05/01", commitTime3, "id31"); + String file32 = HoodieTestUtils.createDataFile(basePath, "2016/05/02", commitTime3, "id32"); + String file33 = HoodieTestUtils.createDataFile(basePath, "2016/05/06", commitTime3, "id33"); + + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) + .withIndexConfig( + HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.INMEMORY) .build()).build(); - HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); - FileSystem fs = FSUtils.getFs(); - HoodieTestDataGenerator.writePartitionMetadata(fs, HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, basePath); - /** - * Write 1 (only inserts) - */ - String newCommitTime = "001"; - client.startCommitWithTime(newCommitTime); + HoodieWriteClient client = new HoodieWriteClient(jsc, config, false); - List records = dataGen.generateInserts(newCommitTime, 200); - List statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); - assertNoWriteErrors(statuses); - - /** - * Write 2 (updates) - */ - newCommitTime = "002"; - client.startCommitWithTime(newCommitTime); - - records = dataGen.generateUpdates(newCommitTime, records); - statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); - // Verify there are no errors - assertNoWriteErrors(statuses); - - client.savepoint("hoodie-unit-test", "test"); - try { - client.rollback(newCommitTime); - fail("Rollback of a savepoint was allowed " + newCommitTime); - } catch (HoodieRollbackException e) { - // this is good - } - - /** - * Write 3 (updates) - */ - newCommitTime = "003"; - client.startCommitWithTime(newCommitTime); - - records = dataGen.generateUpdates(newCommitTime, records); - statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); - // Verify there are no errors - assertNoWriteErrors(statuses); - - /** - * Write 4 (updates) - 
*/ - newCommitTime = "004"; - client.startCommitWithTime(newCommitTime); - - records = dataGen.generateUpdates(newCommitTime, records); - statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); - // Verify there are no errors - assertNoWriteErrors(statuses); - - List partitionPaths = FSUtils.getAllPartitionPaths(fs, cfg.getBasePath(), getConfig().shouldAssumeDatePartitioning()); - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath); - HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig()); - final TableFileSystemView.ReadOptimizedView view = table.getROFileSystemView(); - List dataFiles = partitionPaths.stream().flatMap(s -> { - return view.getAllDataFiles(s).filter(f -> f.getCommitTime().equals("002")); - }).collect(Collectors.toList()); - - assertEquals("The data files for commit 002 should not be cleaned", 3, dataFiles.size()); - - // Delete savepoint - assertFalse(table.getCompletedSavepointTimeline().empty()); - client.deleteSavepoint( - table.getCompletedSavepointTimeline().getInstants().findFirst().get().getTimestamp()); - // rollback and reupsert 004 - client.rollback(newCommitTime); - - client.startCommitWithTime(newCommitTime); - statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); - // Verify there are no errors - assertNoWriteErrors(statuses); - - metaClient = new HoodieTableMetaClient(fs, basePath); - table = HoodieTable.getHoodieTable(metaClient, getConfig()); - final TableFileSystemView.ReadOptimizedView view1 = table.getROFileSystemView(); - dataFiles = partitionPaths.stream().flatMap(s -> { - return view1.getAllDataFiles(s).filter(f -> f.getCommitTime().equals("002")); - }).collect(Collectors.toList()); - - assertEquals("The data files for commit 002 should be cleaned now", 0, dataFiles.size()); + // Rollback commit 1 (this should fail, since commit2 is still around) + try { + client.rollback(commitTime1); + assertTrue("Should have thrown an exception ", 
false); + } catch (HoodieRollbackException hrbe) { + // should get here } + // Rollback commit3 + client.rollback(commitTime3); + assertFalse(HoodieTestUtils.doesInflightExist(basePath, commitTime3)); + assertFalse(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime3, file31) || + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime3, file32) || + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime3, file33)); - @Test - public void testRollbackToSavepoint() throws Exception { - HoodieWriteConfig cfg = getConfigBuilder().withCompactionConfig( - HoodieCompactionConfig.newBuilder() - .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(1) + // simulate partial failure, where .inflight was not deleted, but data files were. + HoodieTestUtils.createInflightCommitFiles(basePath, commitTime3); + client.rollback(commitTime3); + assertFalse(HoodieTestUtils.doesInflightExist(basePath, commitTime3)); + + // Rollback commit2 + client.rollback(commitTime2); + assertFalse(HoodieTestUtils.doesCommitExist(basePath, commitTime2)); + assertFalse(HoodieTestUtils.doesInflightExist(basePath, commitTime2)); + assertFalse(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime2, file21) || + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime2, file22) || + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime2, file23)); + + // simulate partial failure, where only .commit => .inflight renaming succeeded, leaving a + // .inflight commit and a bunch of data files around. 
+ HoodieTestUtils.createInflightCommitFiles(basePath, commitTime2); + file21 = HoodieTestUtils.createDataFile(basePath, "2016/05/01", commitTime2, "id21"); + file22 = HoodieTestUtils.createDataFile(basePath, "2016/05/02", commitTime2, "id22"); + file23 = HoodieTestUtils.createDataFile(basePath, "2016/05/06", commitTime2, "id23"); + + client.rollback(commitTime2); + assertFalse(HoodieTestUtils.doesCommitExist(basePath, commitTime2)); + assertFalse(HoodieTestUtils.doesInflightExist(basePath, commitTime2)); + assertFalse(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime2, file21) || + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime2, file22) || + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime2, file23)); + + // Let's rollback commit1, Check results + client.rollback(commitTime1); + assertFalse(HoodieTestUtils.doesCommitExist(basePath, commitTime1)); + assertFalse(HoodieTestUtils.doesInflightExist(basePath, commitTime1)); + assertFalse(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime1, file11) || + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime1, file12) || + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime1, file13)); + } + + + @Test + public void testAutoRollbackCommit() throws Exception { + // Let's create some commit files and parquet files + String commitTime1 = "20160501010101"; + String commitTime2 = "20160502020601"; + String commitTime3 = "20160506030611"; + new File(basePath + "/.hoodie").mkdirs(); + HoodieTestDataGenerator.writePartitionMetadata(FSUtils.getFs(), + new String[]{"2016/05/01", "2016/05/02", "2016/05/06"}, + basePath); + + // One good commit + HoodieTestUtils.createCommitFiles(basePath, commitTime1); + // Two inflight commits + HoodieTestUtils.createInflightCommitFiles(basePath, commitTime2, commitTime3); + + // Make commit1 + String file11 = HoodieTestUtils.createDataFile(basePath, "2016/05/01", commitTime1, "id11"); + 
String file12 = HoodieTestUtils.createDataFile(basePath, "2016/05/02", commitTime1, "id12"); + String file13 = HoodieTestUtils.createDataFile(basePath, "2016/05/06", commitTime1, "id13"); + + // Make commit2 + String file21 = HoodieTestUtils.createDataFile(basePath, "2016/05/01", commitTime2, "id21"); + String file22 = HoodieTestUtils.createDataFile(basePath, "2016/05/02", commitTime2, "id22"); + String file23 = HoodieTestUtils.createDataFile(basePath, "2016/05/06", commitTime2, "id23"); + + // Make commit3 + String file31 = HoodieTestUtils.createDataFile(basePath, "2016/05/01", commitTime3, "id31"); + String file32 = HoodieTestUtils.createDataFile(basePath, "2016/05/02", commitTime3, "id32"); + String file33 = HoodieTestUtils.createDataFile(basePath, "2016/05/06", commitTime3, "id33"); + + // Turn auto rollback off + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) + .withIndexConfig( + HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.INMEMORY) .build()).build(); - HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); - FileSystem fs = FSUtils.getFs(); - HoodieTestDataGenerator.writePartitionMetadata(fs, HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, basePath); - /** - * Write 1 (only inserts) - */ - String newCommitTime = "001"; - client.startCommitWithTime(newCommitTime); + new HoodieWriteClient(jsc, config, false); - List records = dataGen.generateInserts(newCommitTime, 200); - JavaRDD writeRecords = jsc.parallelize(records, 1); + // Check results, nothing changed + assertTrue(HoodieTestUtils.doesCommitExist(basePath, commitTime1)); + assertTrue(HoodieTestUtils.doesInflightExist(basePath, commitTime2)); + assertTrue(HoodieTestUtils.doesInflightExist(basePath, commitTime3)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime3, file31) && + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime3, file32) && + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", 
commitTime3, file33)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime2, file21) && + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime2, file22) && + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime2, file23)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime1, file11) && + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime1, file12) && + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime1, file13)); - List statuses = client.upsert(writeRecords, newCommitTime).collect(); - assertNoWriteErrors(statuses); - - /** - * Write 2 (updates) - */ - newCommitTime = "002"; - client.startCommitWithTime(newCommitTime); - - records = dataGen.generateUpdates(newCommitTime, records); - statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); - // Verify there are no errors - assertNoWriteErrors(statuses); - - client.savepoint("hoodie-unit-test", "test"); - - /** - * Write 3 (updates) - */ - newCommitTime = "003"; - client.startCommitWithTime(newCommitTime); - - records = dataGen.generateUpdates(newCommitTime, records); - statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); - // Verify there are no errors - assertNoWriteErrors(statuses); - List partitionPaths = FSUtils.getAllPartitionPaths(fs, cfg.getBasePath(), getConfig().shouldAssumeDatePartitioning()); - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath); - HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig()); - final TableFileSystemView.ReadOptimizedView view1 = table.getROFileSystemView(); - - List dataFiles = partitionPaths.stream().flatMap(s -> { - return view1.getAllDataFiles(s).filter(f -> f.getCommitTime().equals("003")); - }).collect(Collectors.toList()); - assertEquals("The data files for commit 003 should be present", 3, dataFiles.size()); + // Turn auto rollback on + new 
HoodieWriteClient(jsc, config, true); + assertTrue(HoodieTestUtils.doesCommitExist(basePath, commitTime1)); + assertFalse(HoodieTestUtils.doesInflightExist(basePath, commitTime2)); + assertFalse(HoodieTestUtils.doesInflightExist(basePath, commitTime3)); + assertFalse(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime3, file31) || + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime3, file32) || + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime3, file33)); + assertFalse(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime2, file21) || + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime2, file22) || + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime2, file23)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime1, file11) && + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime1, file12) && + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime1, file13)); + } - /** - * Write 4 (updates) - */ - newCommitTime = "004"; - client.startCommitWithTime(newCommitTime); - - records = dataGen.generateUpdates(newCommitTime, records); - statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); - // Verify there are no errors - assertNoWriteErrors(statuses); - - metaClient = new HoodieTableMetaClient(fs, basePath); - table = HoodieTable.getHoodieTable(metaClient, getConfig()); - final TableFileSystemView.ReadOptimizedView view2 = table.getROFileSystemView(); - - dataFiles = partitionPaths.stream().flatMap(s -> { - return view2.getAllDataFiles(s).filter(f -> f.getCommitTime().equals("004")); - }).collect(Collectors.toList()); - assertEquals("The data files for commit 004 should be present", 3, dataFiles.size()); + private HoodieWriteConfig getSmallInsertWriteConfig(int insertSplitSize) { + HoodieWriteConfig.Builder builder = getConfigBuilder(); + return builder.withCompactionConfig( + 
HoodieCompactionConfig.newBuilder() + .compactionSmallFileSize(HoodieTestDataGenerator.SIZE_PER_RECORD * 15) + .insertSplitSize(insertSplitSize).build()) // tolerate upto 15 records + .withStorageConfig(HoodieStorageConfig.newBuilder() + .limitFileSize(HoodieTestDataGenerator.SIZE_PER_RECORD * 20) + .build()) + .build(); + } - // rolling back to a non existent savepoint must not succeed - try { - client.rollbackToSavepoint("001"); - fail("Rolling back to non-existent savepoint should not be allowed"); - } catch (HoodieRollbackException e) { - // this is good - } + @Test + public void testSmallInsertHandlingForUpserts() throws Exception { - // rollback to savepoint 002 - HoodieInstant savepoint = - table.getCompletedSavepointTimeline().getInstants().findFirst().get(); - client.rollbackToSavepoint(savepoint.getTimestamp()); + FileSystem fs = FSUtils.getFs(); + final String TEST_PARTITION_PATH = "2016/09/26"; + final int INSERT_SPLIT_LIMIT = 100; + // setup the small file handling params + HoodieWriteConfig config = getSmallInsertWriteConfig( + INSERT_SPLIT_LIMIT); // hold upto 200 records max + dataGen = new HoodieTestDataGenerator(new String[]{TEST_PARTITION_PATH}); - metaClient = new HoodieTableMetaClient(fs, basePath); - table = HoodieTable.getHoodieTable(metaClient, getConfig()); - final TableFileSystemView.ReadOptimizedView view3 = table.getROFileSystemView(); - dataFiles = partitionPaths.stream().flatMap(s -> { - return view3.getAllDataFiles(s).filter(f -> f.getCommitTime().equals("002")); - }).collect(Collectors.toList()); - assertEquals("The data files for commit 002 be available", 3, dataFiles.size()); + HoodieWriteClient client = new HoodieWriteClient(jsc, config); - dataFiles = partitionPaths.stream().flatMap(s -> { - return view3.getAllDataFiles(s).filter(f -> f.getCommitTime().equals("003")); - }).collect(Collectors.toList()); - assertEquals("The data files for commit 003 should be rolled back", 0, dataFiles.size()); + // Inserts => will write file1 + 
String commitTime1 = "001"; + client.startCommitWithTime(commitTime1); + List inserts1 = dataGen + .generateInserts(commitTime1, INSERT_SPLIT_LIMIT); // this writes ~500kb + Set keys1 = HoodieClientTestUtils.getRecordKeys(inserts1); - dataFiles = partitionPaths.stream().flatMap(s -> { - return view3.getAllDataFiles(s).filter(f -> f.getCommitTime().equals("004")); - }).collect(Collectors.toList()); - assertEquals("The data files for commit 004 should be rolled back", 0, dataFiles.size()); + JavaRDD insertRecordsRDD1 = jsc.parallelize(inserts1, 1); + List statuses = client.upsert(insertRecordsRDD1, commitTime1).collect(); + + assertNoWriteErrors(statuses); + + assertEquals("Just 1 file needs to be added.", 1, statuses.size()); + String file1 = statuses.get(0).getFileId(); + assertEquals("file should contain 100 records", + ParquetUtils.readRowKeysFromParquet(new Path(basePath, + TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime1, 0, file1))).size(), + 100); + + // Update + Inserts such that they just expand file1 + String commitTime2 = "002"; + client.startCommitWithTime(commitTime2); + List inserts2 = dataGen.generateInserts(commitTime2, 40); + Set keys2 = HoodieClientTestUtils.getRecordKeys(inserts2); + List insertsAndUpdates2 = new ArrayList<>(); + insertsAndUpdates2.addAll(inserts2); + insertsAndUpdates2.addAll(dataGen.generateUpdates(commitTime2, inserts1)); + + JavaRDD insertAndUpdatesRDD2 = jsc.parallelize(insertsAndUpdates2, 1); + statuses = client.upsert(insertAndUpdatesRDD2, commitTime2).collect(); + assertNoWriteErrors(statuses); + + assertEquals("Just 1 file needs to be updated.", 1, statuses.size()); + assertEquals("Existing file should be expanded", file1, statuses.get(0).getFileId()); + assertEquals("Existing file should be expanded", commitTime1, + statuses.get(0).getStat().getPrevCommit()); + Path newFile = new Path(basePath, + TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime2, 0, file1)); + assertEquals("file should 
contain 140 records", + ParquetUtils.readRowKeysFromParquet(newFile).size(), 140); + + List records = ParquetUtils.readAvroRecords(newFile); + for (GenericRecord record : records) { + String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); + assertEquals("only expect commit2", commitTime2, + record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString()); + assertTrue("key expected to be part of commit2", + keys2.contains(recordKey) || keys1.contains(recordKey)); } - - @Test - public void testInsertAndCleanByVersions() throws Exception { - int maxVersions = 2; // keep upto 2 versions for each file - HoodieWriteConfig cfg = getConfigBuilder().withCompactionConfig( - HoodieCompactionConfig.newBuilder() - .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS) - .retainFileVersions(maxVersions).build()).build(); - HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); - HoodieIndex index = HoodieIndex.createIndex(cfg, jsc); - FileSystem fs = FSUtils.getFs(); - - /** - * do a big insert - * (this is basically same as insert part of upsert, just adding it here so we can - * catch breakages in insert(), if the implementation diverges.) 
- */ - String newCommitTime = client.startCommit(); - List records = dataGen.generateInserts(newCommitTime, 500); - JavaRDD writeRecords = jsc.parallelize(records, 5); - - List statuses = client.insert(writeRecords, newCommitTime).collect(); - // Verify there are no errors - assertNoWriteErrors(statuses); - - // verify that there is a commit - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath); - HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()).getCommitTimeline(); - assertEquals("Expecting a single commit.", 1, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants()); - // Should have 100 records in table (check using Index), all in locations marked at commit - HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig()); - assertFalse(table.getCompletedCommitTimeline().empty()); - String commitTime = - table.getCompletedCommitTimeline().getInstants().findFirst().get().getTimestamp(); - assertFalse(table.getCompletedCleanTimeline().empty()); - assertEquals("The clean instant should be the same as the commit instant", commitTime, - table.getCompletedCleanTimeline().getInstants().findFirst().get().getTimestamp()); - - List taggedRecords = index.tagLocation(jsc.parallelize(records, 1), table).collect(); - checkTaggedRecords(taggedRecords, newCommitTime); - - // Keep doing some writes and clean inline. Make sure we have expected number of files remaining. 
- for (int writeCnt = 2; writeCnt < 10; writeCnt++) { - - Thread.sleep(1100); // make sure commits are unique - newCommitTime = client.startCommit(); - records = dataGen.generateUpdates(newCommitTime, 100); - - statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); - // Verify there are no errors - assertNoWriteErrors(statuses); - - HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); - table = HoodieTable.getHoodieTable(metadata, getConfig()); - timeline = table.getCommitTimeline(); - - TableFileSystemView fsView = table.getFileSystemView(); - // Need to ensure the following - for (String partitionPath : dataGen.getPartitionPaths()) { - // compute all the versions of all files, from time 0 - HashMap> fileIdToVersions = new HashMap<>(); - for (HoodieInstant entry : timeline.getInstants().collect(Collectors.toList())) { - HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(entry).get()); - - for (HoodieWriteStat wstat : commitMetadata.getWriteStats(partitionPath)) { - if (!fileIdToVersions.containsKey(wstat.getFileId())) { - fileIdToVersions.put(wstat.getFileId(), new TreeSet<>()); - } - fileIdToVersions.get(wstat.getFileId()).add(FSUtils.getCommitTime(new Path(wstat.getPath()).getName())); - } - } - - - List fileGroups = fsView.getAllFileGroups(partitionPath).collect(Collectors.toList()); - - for (HoodieFileGroup fileGroup : fileGroups) { - // No file has no more than max versions - String fileId = fileGroup.getId(); - List dataFiles = fileGroup.getAllDataFiles().collect(Collectors.toList()); - - assertTrue("fileId " + fileId + " has more than " + maxVersions + " versions", - dataFiles.size() <= maxVersions); - - // Each file, has the latest N versions (i.e cleaning gets rid of older versions) - List commitedVersions = new ArrayList<>(fileIdToVersions.get(fileId)); - for (int i = 0; i < dataFiles.size(); i++) { - assertEquals("File " + fileId + " does not have latest versions 
on commits" + commitedVersions, - Iterables.get(dataFiles, i).getCommitTime(), - commitedVersions.get(commitedVersions.size() - 1 - i)); - } - } - } - } - } - - @Test - public void testInsertAndCleanByCommits() throws Exception { - int maxCommits = 3; // keep upto 3 commits from the past - HoodieWriteConfig cfg = getConfigBuilder().withCompactionConfig( - HoodieCompactionConfig.newBuilder() - .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS) - .retainCommits(maxCommits).build()).build(); - HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); - HoodieIndex index = HoodieIndex.createIndex(cfg, jsc); - FileSystem fs = FSUtils.getFs(); - - /** - * do a big insert - * (this is basically same as insert part of upsert, just adding it here so we can - * catch breakages in insert(), if the implementation diverges.) - */ - String newCommitTime = client.startCommit(); - List records = dataGen.generateInserts(newCommitTime, 500); - JavaRDD writeRecords = jsc.parallelize(records, 5); - - List statuses = client.insert(writeRecords, newCommitTime).collect(); - // Verify there are no errors - assertNoWriteErrors(statuses); - - // verify that there is a commit - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath); - HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()).getCommitTimeline(); - assertEquals("Expecting a single commit.", 1, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants()); - // Should have 100 records in table (check using Index), all in locations marked at commit - HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig()); - - assertFalse(table.getCompletedCommitTimeline().empty()); - String commitTime = - table.getCompletedCommitTimeline().getInstants().findFirst().get().getTimestamp(); - assertFalse(table.getCompletedCleanTimeline().empty()); - assertEquals("The clean instant should be the same as the commit instant", commitTime, - 
table.getCompletedCleanTimeline().getInstants().findFirst().get().getTimestamp()); - - List taggedRecords = index.tagLocation(jsc.parallelize(records, 1), table).collect(); - checkTaggedRecords(taggedRecords, newCommitTime); - - // Keep doing some writes and clean inline. Make sure we have expected number of files remaining. - for (int writeCnt = 2; writeCnt < 10; writeCnt++) { - Thread.sleep(1100); // make sure commits are unique - newCommitTime = client.startCommit(); - records = dataGen.generateUpdates(newCommitTime, 100); - - statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); - // Verify there are no errors - assertNoWriteErrors(statuses); - - HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); - HoodieTable table1 = HoodieTable.getHoodieTable(metadata, cfg); - HoodieTimeline activeTimeline = table1.getCompletedCommitTimeline(); - Optional - earliestRetainedCommit = activeTimeline.nthFromLastInstant(maxCommits - 1); - Set acceptableCommits = - activeTimeline.getInstants().collect(Collectors.toSet()); - if (earliestRetainedCommit.isPresent()) { - acceptableCommits.removeAll( - activeTimeline.findInstantsInRange("000", earliestRetainedCommit.get().getTimestamp()).getInstants() - .collect(Collectors.toSet())); - acceptableCommits.add(earliestRetainedCommit.get()); - } - - TableFileSystemView fsView = table1.getFileSystemView(); - // Need to ensure the following - for (String partitionPath : dataGen.getPartitionPaths()) { - List fileGroups = fsView.getAllFileGroups(partitionPath).collect(Collectors.toList()); - for (HoodieFileGroup fileGroup : fileGroups) { - Set commitTimes = new HashSet<>(); - fileGroup.getAllDataFiles().forEach(value -> { - System.out.println("Data File - " + value); - commitTimes.add(value.getCommitTime()); - }); - assertEquals("Only contain acceptable versions of file should be present", - acceptableCommits.stream().map(HoodieInstant::getTimestamp) - .collect(Collectors.toSet()), commitTimes); 
- } - } - } - } - - @Test - public void testRollbackCommit() throws Exception { - // Let's create some commit files and parquet files - String commitTime1 = "20160501010101"; - String commitTime2 = "20160502020601"; - String commitTime3 = "20160506030611"; - new File(basePath + "/.hoodie").mkdirs(); - HoodieTestDataGenerator.writePartitionMetadata(FSUtils.getFs(), - new String[] {"2016/05/01", "2016/05/02", "2016/05/06"}, - basePath); - - - // Only first two have commit files - HoodieTestUtils.createCommitFiles(basePath, commitTime1, commitTime2); - // Third one has a .inflight intermediate commit file - HoodieTestUtils.createInflightCommitFiles(basePath, commitTime3); - - // Make commit1 - String file11 = HoodieTestUtils.createDataFile(basePath, "2016/05/01", commitTime1, "id11"); - String file12 = HoodieTestUtils.createDataFile(basePath, "2016/05/02", commitTime1, "id12"); - String file13 = HoodieTestUtils.createDataFile(basePath, "2016/05/06", commitTime1, "id13"); - - // Make commit2 - String file21 = HoodieTestUtils.createDataFile(basePath, "2016/05/01", commitTime2, "id21"); - String file22 = HoodieTestUtils.createDataFile(basePath, "2016/05/02", commitTime2, "id22"); - String file23 = HoodieTestUtils.createDataFile(basePath, "2016/05/06", commitTime2, "id23"); - - // Make commit3 - String file31 = HoodieTestUtils.createDataFile(basePath, "2016/05/01", commitTime3, "id31"); - String file32 = HoodieTestUtils.createDataFile(basePath, "2016/05/02", commitTime3, "id32"); - String file33 = HoodieTestUtils.createDataFile(basePath, "2016/05/06", commitTime3, "id33"); - - HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) - .withIndexConfig( - HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.INMEMORY) - .build()).build(); - - HoodieWriteClient client = new HoodieWriteClient(jsc, config, false); - - // Rollback commit 1 (this should fail, since commit2 is still around) - try { - client.rollback(commitTime1); - 
assertTrue("Should have thrown an exception ", false); - } catch (HoodieRollbackException hrbe) { - // should get here - } - - // Rollback commit3 - client.rollback(commitTime3); - assertFalse(HoodieTestUtils.doesInflightExist(basePath, commitTime3)); - assertFalse(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime3, file31) || - HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime3, file32) || - HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime3, file33)); - - // simulate partial failure, where .inflight was not deleted, but data files were. - HoodieTestUtils.createInflightCommitFiles(basePath, commitTime3); - client.rollback(commitTime3); - assertFalse(HoodieTestUtils.doesInflightExist(basePath, commitTime3)); - - - // Rollback commit2 - client.rollback(commitTime2); - assertFalse(HoodieTestUtils.doesCommitExist(basePath, commitTime2)); - assertFalse(HoodieTestUtils.doesInflightExist(basePath, commitTime2)); - assertFalse(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime2, file21) || - HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime2, file22) || - HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime2, file23)); - - // simulate partial failure, where only .commit => .inflight renaming succeeded, leaving a - // .inflight commit and a bunch of data files around. 
- HoodieTestUtils.createInflightCommitFiles(basePath, commitTime2); - file21 = HoodieTestUtils.createDataFile(basePath, "2016/05/01", commitTime2, "id21"); - file22 = HoodieTestUtils.createDataFile(basePath, "2016/05/02", commitTime2, "id22"); - file23 = HoodieTestUtils.createDataFile(basePath, "2016/05/06", commitTime2, "id23"); - - client.rollback(commitTime2); - assertFalse(HoodieTestUtils.doesCommitExist(basePath, commitTime2)); - assertFalse(HoodieTestUtils.doesInflightExist(basePath, commitTime2)); - assertFalse(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime2, file21) || - HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime2, file22) || - HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime2, file23)); - - - // Let's rollback commit1, Check results - client.rollback(commitTime1); - assertFalse(HoodieTestUtils.doesCommitExist(basePath, commitTime1)); - assertFalse(HoodieTestUtils.doesInflightExist(basePath, commitTime1)); - assertFalse(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime1, file11) || - HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime1, file12) || - HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime1, file13)); - } - - - @Test - public void testAutoRollbackCommit() throws Exception { - // Let's create some commit files and parquet files - String commitTime1 = "20160501010101"; - String commitTime2 = "20160502020601"; - String commitTime3 = "20160506030611"; - new File(basePath + "/.hoodie").mkdirs(); - HoodieTestDataGenerator.writePartitionMetadata(FSUtils.getFs(), - new String[] {"2016/05/01", "2016/05/02", "2016/05/06"}, - basePath); - - // One good commit - HoodieTestUtils.createCommitFiles(basePath, commitTime1); - // Two inflight commits - HoodieTestUtils.createInflightCommitFiles(basePath, commitTime2, commitTime3); - - // Make commit1 - String file11 = HoodieTestUtils.createDataFile(basePath, "2016/05/01", commitTime1, "id11"); - 
String file12 = HoodieTestUtils.createDataFile(basePath, "2016/05/02", commitTime1, "id12"); - String file13 = HoodieTestUtils.createDataFile(basePath, "2016/05/06", commitTime1, "id13"); - - // Make commit2 - String file21 = HoodieTestUtils.createDataFile(basePath, "2016/05/01", commitTime2, "id21"); - String file22 = HoodieTestUtils.createDataFile(basePath, "2016/05/02", commitTime2, "id22"); - String file23 = HoodieTestUtils.createDataFile(basePath, "2016/05/06", commitTime2, "id23"); - - // Make commit3 - String file31 = HoodieTestUtils.createDataFile(basePath, "2016/05/01", commitTime3, "id31"); - String file32 = HoodieTestUtils.createDataFile(basePath, "2016/05/02", commitTime3, "id32"); - String file33 = HoodieTestUtils.createDataFile(basePath, "2016/05/06", commitTime3, "id33"); - - // Turn auto rollback off - HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) - .withIndexConfig( - HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.INMEMORY) - .build()).build(); - - new HoodieWriteClient(jsc, config, false); - - // Check results, nothing changed - assertTrue(HoodieTestUtils.doesCommitExist(basePath, commitTime1)); - assertTrue(HoodieTestUtils.doesInflightExist(basePath, commitTime2)); - assertTrue(HoodieTestUtils.doesInflightExist(basePath, commitTime3)); - assertTrue(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime3, file31) && - HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime3, file32) && - HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime3, file33)); - assertTrue(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime2, file21) && - HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime2, file22) && - HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime2, file23)); - assertTrue(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime1, file11) && - HoodieTestUtils.doesDataFileExist(basePath, 
"2016/05/02", commitTime1, file12) && - HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime1, file13)); - - // Turn auto rollback on - new HoodieWriteClient(jsc, config, true); - assertTrue(HoodieTestUtils.doesCommitExist(basePath, commitTime1)); - assertFalse(HoodieTestUtils.doesInflightExist(basePath, commitTime2)); - assertFalse(HoodieTestUtils.doesInflightExist(basePath, commitTime3)); - assertFalse(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime3, file31) || - HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime3, file32) || - HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime3, file33)); - assertFalse(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime2, file21) || - HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime2, file22) || - HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime2, file23)); - assertTrue(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime1, file11) && - HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime1, file12) && - HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime1, file13)); - } - - - private HoodieWriteConfig getSmallInsertWriteConfig(int insertSplitSize) { - HoodieWriteConfig.Builder builder = getConfigBuilder(); - return builder.withCompactionConfig( - HoodieCompactionConfig.newBuilder() - .compactionSmallFileSize(HoodieTestDataGenerator.SIZE_PER_RECORD * 15) - .insertSplitSize(insertSplitSize).build()) // tolerate upto 15 records - .withStorageConfig(HoodieStorageConfig.newBuilder() - .limitFileSize(HoodieTestDataGenerator.SIZE_PER_RECORD * 20) - .build()) - .build(); - } - - - @Test - public void testSmallInsertHandlingForUpserts() throws Exception { - - FileSystem fs = FSUtils.getFs(); - final String TEST_PARTITION_PATH = "2016/09/26"; - final int INSERT_SPLIT_LIMIT = 100; - // setup the small file handling params - HoodieWriteConfig config = 
getSmallInsertWriteConfig(INSERT_SPLIT_LIMIT); // hold upto 200 records max - dataGen = new HoodieTestDataGenerator(new String[] {TEST_PARTITION_PATH}); - - HoodieWriteClient client = new HoodieWriteClient(jsc, config); - - // Inserts => will write file1 - String commitTime1 = "001"; - client.startCommitWithTime(commitTime1); - List inserts1 = dataGen.generateInserts(commitTime1, INSERT_SPLIT_LIMIT); // this writes ~500kb - Set keys1 = HoodieClientTestUtils.getRecordKeys(inserts1); - - JavaRDD insertRecordsRDD1 = jsc.parallelize(inserts1, 1); - List statuses= client.upsert(insertRecordsRDD1, commitTime1).collect(); - - assertNoWriteErrors(statuses); - - assertEquals("Just 1 file needs to be added.", 1, statuses.size()); - String file1 = statuses.get(0).getFileId(); - assertEquals("file should contain 100 records", - ParquetUtils.readRowKeysFromParquet(new Path(basePath, TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime1, 0, file1))).size(), - 100); - - // Update + Inserts such that they just expand file1 - String commitTime2 = "002"; - client.startCommitWithTime(commitTime2); - List inserts2 = dataGen.generateInserts(commitTime2, 40); - Set keys2 = HoodieClientTestUtils.getRecordKeys(inserts2); - List insertsAndUpdates2 = new ArrayList<>(); - insertsAndUpdates2.addAll(inserts2); - insertsAndUpdates2.addAll(dataGen.generateUpdates(commitTime2, inserts1)); - - JavaRDD insertAndUpdatesRDD2 = jsc.parallelize(insertsAndUpdates2, 1); - statuses = client.upsert(insertAndUpdatesRDD2, commitTime2).collect(); - assertNoWriteErrors(statuses); - - assertEquals("Just 1 file needs to be updated.", 1, statuses.size()); - assertEquals("Existing file should be expanded", file1, statuses.get(0).getFileId()); - assertEquals("Existing file should be expanded", commitTime1, statuses.get(0).getStat().getPrevCommit()); - Path newFile = new Path(basePath, TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime2, 0, file1)); - assertEquals("file should contain 140 
records", ParquetUtils.readRowKeysFromParquet(newFile).size(), 140); - - List records = ParquetUtils.readAvroRecords(newFile); - for (GenericRecord record: records) { - String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); - assertEquals("only expect commit2", commitTime2, record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString()); - assertTrue("key expected to be part of commit2", keys2.contains(recordKey) || keys1.contains(recordKey)); - } - - // update + inserts such that file1 is updated and expanded, a new file2 is created. - String commitTime3 = "003"; - client.startCommitWithTime(commitTime3); - List insertsAndUpdates3 = dataGen.generateInserts(commitTime3, 200); - Set keys3 = HoodieClientTestUtils.getRecordKeys(insertsAndUpdates3); - List updates3 = dataGen.generateUpdates(commitTime3, inserts2); - insertsAndUpdates3.addAll(updates3); - - JavaRDD insertAndUpdatesRDD3 = jsc.parallelize(insertsAndUpdates3, 1); - statuses = client.upsert(insertAndUpdatesRDD3, commitTime3).collect(); - assertNoWriteErrors(statuses); - - assertEquals("2 files needs to be committed.", 2, statuses.size()); - HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); - HoodieTable table = HoodieTable.getHoodieTable(metadata, config); - TableFileSystemView.ReadOptimizedView fileSystemView = table.getROFileSystemView(); - List files = fileSystemView.getLatestDataFilesBeforeOrOn(TEST_PARTITION_PATH, commitTime3).collect( + // update + inserts such that file1 is updated and expanded, a new file2 is created. 
+ String commitTime3 = "003"; + client.startCommitWithTime(commitTime3); + List insertsAndUpdates3 = dataGen.generateInserts(commitTime3, 200); + Set keys3 = HoodieClientTestUtils.getRecordKeys(insertsAndUpdates3); + List updates3 = dataGen.generateUpdates(commitTime3, inserts2); + insertsAndUpdates3.addAll(updates3); + + JavaRDD insertAndUpdatesRDD3 = jsc.parallelize(insertsAndUpdates3, 1); + statuses = client.upsert(insertAndUpdatesRDD3, commitTime3).collect(); + assertNoWriteErrors(statuses); + + assertEquals("2 files needs to be committed.", 2, statuses.size()); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); + HoodieTable table = HoodieTable.getHoodieTable(metadata, config); + TableFileSystemView.ReadOptimizedView fileSystemView = table.getROFileSystemView(); + List files = fileSystemView + .getLatestDataFilesBeforeOrOn(TEST_PARTITION_PATH, commitTime3).collect( Collectors.toList()); - int numTotalInsertsInCommit3 = 0; - for (HoodieDataFile file: files) { - if (file.getFileName().contains(file1)) { - assertEquals("Existing file should be expanded", commitTime3, file.getCommitTime()); - records = ParquetUtils.readAvroRecords(new Path(file.getPath())); - for (GenericRecord record: records) { - String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); - String recordCommitTime = record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(); - if (recordCommitTime.equals(commitTime3)) { - if (keys2.contains(recordKey)) { - assertEquals("only expect commit3", commitTime3, recordCommitTime); - keys2.remove(recordKey); - } else { - numTotalInsertsInCommit3++; - } - } - } - assertEquals("All keys added in commit 2 must be updated in commit3 correctly", 0, keys2.size()); + int numTotalInsertsInCommit3 = 0; + for (HoodieDataFile file : files) { + if (file.getFileName().contains(file1)) { + assertEquals("Existing file should be expanded", commitTime3, file.getCommitTime()); + records = ParquetUtils.readAvroRecords(new 
Path(file.getPath())); + for (GenericRecord record : records) { + String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); + String recordCommitTime = record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(); + if (recordCommitTime.equals(commitTime3)) { + if (keys2.contains(recordKey)) { + assertEquals("only expect commit3", commitTime3, recordCommitTime); + keys2.remove(recordKey); } else { - assertEquals("New file must be written for commit 3", commitTime3, file.getCommitTime()); - records = ParquetUtils.readAvroRecords(new Path(file.getPath())); - for (GenericRecord record: records) { - String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); - assertEquals("only expect commit3", commitTime3, record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString()); - assertTrue("key expected to be part of commit3", keys3.contains(recordKey)); - } - numTotalInsertsInCommit3 += records.size(); + numTotalInsertsInCommit3++; } + } } - assertEquals("Total inserts in commit3 must add up", keys3.size(), numTotalInsertsInCommit3); - } - - @Test - public void testSmallInsertHandlingForInserts() throws Exception { - - final String TEST_PARTITION_PATH = "2016/09/26"; - final int INSERT_SPLIT_LIMIT = 100; - // setup the small file handling params - HoodieWriteConfig config = getSmallInsertWriteConfig(INSERT_SPLIT_LIMIT); // hold upto 200 records max - dataGen = new HoodieTestDataGenerator(new String[] {TEST_PARTITION_PATH}); - HoodieWriteClient client = new HoodieWriteClient(jsc, config); - - // Inserts => will write file1 - String commitTime1 = "001"; - client.startCommitWithTime(commitTime1); - List inserts1 = dataGen.generateInserts(commitTime1, INSERT_SPLIT_LIMIT); // this writes ~500kb - Set keys1 = HoodieClientTestUtils.getRecordKeys(inserts1); - JavaRDD insertRecordsRDD1 = jsc.parallelize(inserts1, 1); - List statuses= client.insert(insertRecordsRDD1, commitTime1).collect(); - - assertNoWriteErrors(statuses); - 
assertPartitionMetadata(new String[]{TEST_PARTITION_PATH}, FSUtils.getFs()); - - assertEquals("Just 1 file needs to be added.", 1, statuses.size()); - String file1 = statuses.get(0).getFileId(); - assertEquals("file should contain 100 records", - ParquetUtils.readRowKeysFromParquet(new Path(basePath, TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime1, 0, file1))).size(), - 100); - - // Second, set of Inserts should just expand file1 - String commitTime2 = "002"; - client.startCommitWithTime(commitTime2); - List inserts2 = dataGen.generateInserts(commitTime2, 40); - Set keys2 = HoodieClientTestUtils.getRecordKeys(inserts2); - JavaRDD insertRecordsRDD2 = jsc.parallelize(inserts2, 1); - statuses = client.insert(insertRecordsRDD2, commitTime2).collect(); - assertNoWriteErrors(statuses); - - assertEquals("Just 1 file needs to be updated.", 1, statuses.size()); - assertEquals("Existing file should be expanded", file1, statuses.get(0).getFileId()); - assertEquals("Existing file should be expanded", commitTime1, statuses.get(0).getStat().getPrevCommit()); - Path newFile = new Path(basePath, TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime2, 0, file1)); - assertEquals("file should contain 140 records", ParquetUtils.readRowKeysFromParquet(newFile).size(), 140); - - List records = ParquetUtils.readAvroRecords(newFile); - for (GenericRecord record: records) { - String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); - String recCommitTime = record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(); - assertTrue("Record expected to be part of commit 1 or commit2", commitTime1.equals(recCommitTime) || commitTime2.equals(recCommitTime)); - assertTrue("key expected to be part of commit 1 or commit2", keys2.contains(recordKey) || keys1.contains(recordKey)); + assertEquals("All keys added in commit 2 must be updated in commit3 correctly", 0, + keys2.size()); + } else { + assertEquals("New file must be written for commit 3", 
commitTime3, file.getCommitTime()); + records = ParquetUtils.readAvroRecords(new Path(file.getPath())); + for (GenericRecord record : records) { + String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); + assertEquals("only expect commit3", commitTime3, + record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString()); + assertTrue("key expected to be part of commit3", keys3.contains(recordKey)); } + numTotalInsertsInCommit3 += records.size(); + } + } + assertEquals("Total inserts in commit3 must add up", keys3.size(), numTotalInsertsInCommit3); + } - // Lots of inserts such that file1 is updated and expanded, a new file2 is created. - String commitTime3 = "003"; - client.startCommitWithTime(commitTime3); - List insert3 = dataGen.generateInserts(commitTime3, 200); - JavaRDD insertRecordsRDD3 = jsc.parallelize(insert3, 1); - statuses = client.insert(insertRecordsRDD3, commitTime3).collect(); - assertNoWriteErrors(statuses); - assertEquals("2 files needs to be committed.", 2, statuses.size()); + @Test + public void testSmallInsertHandlingForInserts() throws Exception { + final String TEST_PARTITION_PATH = "2016/09/26"; + final int INSERT_SPLIT_LIMIT = 100; + // setup the small file handling params + HoodieWriteConfig config = getSmallInsertWriteConfig( + INSERT_SPLIT_LIMIT); // hold upto 200 records max + dataGen = new HoodieTestDataGenerator(new String[]{TEST_PARTITION_PATH}); + HoodieWriteClient client = new HoodieWriteClient(jsc, config); - FileSystem fs = FSUtils.getFs(); - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath); - HoodieTable table = HoodieTable.getHoodieTable(metaClient, config); - List files = - table.getROFileSystemView().getLatestDataFilesBeforeOrOn(TEST_PARTITION_PATH, commitTime3) - .collect(Collectors.toList()); - assertEquals("Total of 2 valid data files", 2, files.size()); + // Inserts => will write file1 + String commitTime1 = "001"; + client.startCommitWithTime(commitTime1); + List inserts1 
= dataGen + .generateInserts(commitTime1, INSERT_SPLIT_LIMIT); // this writes ~500kb + Set keys1 = HoodieClientTestUtils.getRecordKeys(inserts1); + JavaRDD insertRecordsRDD1 = jsc.parallelize(inserts1, 1); + List statuses = client.insert(insertRecordsRDD1, commitTime1).collect(); + assertNoWriteErrors(statuses); + assertPartitionMetadata(new String[]{TEST_PARTITION_PATH}, FSUtils.getFs()); - int totalInserts = 0; - for (HoodieDataFile file: files) { - assertEquals("All files must be at commit 3", commitTime3, file.getCommitTime()); - records = ParquetUtils.readAvroRecords(new Path(file.getPath())); - totalInserts += records.size(); + assertEquals("Just 1 file needs to be added.", 1, statuses.size()); + String file1 = statuses.get(0).getFileId(); + assertEquals("file should contain 100 records", + ParquetUtils.readRowKeysFromParquet(new Path(basePath, + TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime1, 0, file1))).size(), + 100); + + // Second, set of Inserts should just expand file1 + String commitTime2 = "002"; + client.startCommitWithTime(commitTime2); + List inserts2 = dataGen.generateInserts(commitTime2, 40); + Set keys2 = HoodieClientTestUtils.getRecordKeys(inserts2); + JavaRDD insertRecordsRDD2 = jsc.parallelize(inserts2, 1); + statuses = client.insert(insertRecordsRDD2, commitTime2).collect(); + assertNoWriteErrors(statuses); + + assertEquals("Just 1 file needs to be updated.", 1, statuses.size()); + assertEquals("Existing file should be expanded", file1, statuses.get(0).getFileId()); + assertEquals("Existing file should be expanded", commitTime1, + statuses.get(0).getStat().getPrevCommit()); + Path newFile = new Path(basePath, + TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime2, 0, file1)); + assertEquals("file should contain 140 records", + ParquetUtils.readRowKeysFromParquet(newFile).size(), 140); + + List records = ParquetUtils.readAvroRecords(newFile); + for (GenericRecord record : records) { + String recordKey = 
record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); + String recCommitTime = record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(); + assertTrue("Record expected to be part of commit 1 or commit2", + commitTime1.equals(recCommitTime) || commitTime2.equals(recCommitTime)); + assertTrue("key expected to be part of commit 1 or commit2", + keys2.contains(recordKey) || keys1.contains(recordKey)); + } + + // Lots of inserts such that file1 is updated and expanded, a new file2 is created. + String commitTime3 = "003"; + client.startCommitWithTime(commitTime3); + List insert3 = dataGen.generateInserts(commitTime3, 200); + JavaRDD insertRecordsRDD3 = jsc.parallelize(insert3, 1); + statuses = client.insert(insertRecordsRDD3, commitTime3).collect(); + assertNoWriteErrors(statuses); + assertEquals("2 files needs to be committed.", 2, statuses.size()); + + FileSystem fs = FSUtils.getFs(); + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath); + HoodieTable table = HoodieTable.getHoodieTable(metaClient, config); + List files = + table.getROFileSystemView().getLatestDataFilesBeforeOrOn(TEST_PARTITION_PATH, commitTime3) + .collect(Collectors.toList()); + assertEquals("Total of 2 valid data files", 2, files.size()); + + int totalInserts = 0; + for (HoodieDataFile file : files) { + assertEquals("All files must be at commit 3", commitTime3, file.getCommitTime()); + records = ParquetUtils.readAvroRecords(new Path(file.getPath())); + totalInserts += records.size(); + } + assertEquals("Total number of records must add up", totalInserts, + inserts1.size() + inserts2.size() + insert3.size()); + } + + @Test + public void testKeepLatestFileVersions() throws IOException { + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) + .withAssumeDatePartitioning(true) + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS) + 
.retainFileVersions(1).build()).build(); + + // make 1 commit, with 1 file per partition + HoodieTestUtils.createCommitFiles(basePath, "000"); + + String file1P0C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "000"); + String file1P1C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "000"); + HoodieTable table = HoodieTable + .getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), + config); + + List hoodieCleanStatsOne = table.clean(jsc); + assertEquals("Must not clean any files", 0, + getCleanStat(hoodieCleanStatsOne, partitionPaths[0]).getSuccessDeleteFiles().size()); + assertEquals("Must not clean any files", 0, + getCleanStat(hoodieCleanStatsOne, partitionPaths[1]).getSuccessDeleteFiles().size()); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "000", file1P1C0)); + + // make next commit, with 1 insert & 1 update per partition + HoodieTestUtils.createCommitFiles(basePath, "001"); + table = HoodieTable + .getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), + config); + + String file2P0C1 = HoodieTestUtils + .createNewDataFile(basePath, partitionPaths[0], "001"); // insert + String file2P1C1 = HoodieTestUtils + .createNewDataFile(basePath, partitionPaths[1], "001"); // insert + HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "001", file1P0C0); // update + HoodieTestUtils.createDataFile(basePath, partitionPaths[1], "001", file1P1C0); // update + + List hoodieCleanStatsTwo = table.clean(jsc); + assertEquals("Must clean 1 file", 1, + getCleanStat(hoodieCleanStatsTwo, partitionPaths[0]).getSuccessDeleteFiles().size()); + assertEquals("Must clean 1 file", 1, + getCleanStat(hoodieCleanStatsTwo, partitionPaths[1]).getSuccessDeleteFiles().size()); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", 
file2P0C1)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "001", file2P1C1)); + assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0)); + assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "000", file1P1C0)); + + // make next commit, with 2 updates to existing files, and 1 insert + HoodieTestUtils.createCommitFiles(basePath, "002"); + table = HoodieTable + .getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), + config); + + HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file1P0C0); // update + HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file2P0C1); // update + String file3P0C2 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "002"); + + List hoodieCleanStatsThree = table.clean(jsc); + assertEquals("Must clean two files", 2, + getCleanStat(hoodieCleanStatsThree, partitionPaths[0]).getSuccessDeleteFiles().size()); + assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file1P0C0)); + assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "002", file3P0C2)); + + // No cleaning on partially written file, with no commit. 
+ HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "003", file3P0C2); // update + List hoodieCleanStatsFour = table.clean(jsc); + assertEquals("Must not clean any files", 0, + getCleanStat(hoodieCleanStatsFour, partitionPaths[0]).getSuccessDeleteFiles().size()); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "002", file3P0C2)); + } + + @Test + public void testKeepLatestFileVersionsMOR() throws IOException { + + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) + .withAssumeDatePartitioning(true) + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS) + .retainFileVersions(1).build()).build(); + + HoodieTableMetaClient metaClient = HoodieTestUtils + .initTableType(basePath, HoodieTableType.MERGE_ON_READ); + + // Make 3 files, one base file and 2 log files associated with base file + String file1P0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "000"); + String file2P0L0 = HoodieTestUtils + .createNewLogFile(basePath, partitionPaths[0], "000", file1P0, Optional.empty()); + String file2P0L1 = HoodieTestUtils + .createNewLogFile(basePath, partitionPaths[0], "000", file1P0, Optional.of(2)); + // make 1 compaction commit + HoodieTestUtils.createCompactionCommitFiles(basePath, "000"); + + // Make 4 files, one base file and 3 log files associated with base file + HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "001", file1P0); + file2P0L0 = HoodieTestUtils + .createNewLogFile(basePath, partitionPaths[0], "001", file1P0, Optional.empty()); + file2P0L0 = HoodieTestUtils + .createNewLogFile(basePath, partitionPaths[0], "001", file1P0, Optional.of(2)); + file2P0L0 = HoodieTestUtils + .createNewLogFile(basePath, partitionPaths[0], "001", file1P0, Optional.of(3)); + // make 1 compaction commit + HoodieTestUtils.createCompactionCommitFiles(basePath, "001"); + + HoodieTable table = HoodieTable + 
.getHoodieTable(metaClient, config); + List hoodieCleanStats = table.clean(jsc); + assertEquals("Must clean three files, one parquet and 2 log files", 3, + getCleanStat(hoodieCleanStats, partitionPaths[0]).getSuccessDeleteFiles().size()); + assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0)); + assertFalse(HoodieTestUtils + .doesLogFileExist(basePath, partitionPaths[0], "000", file2P0L0, Optional.empty())); + assertFalse(HoodieTestUtils + .doesLogFileExist(basePath, partitionPaths[0], "000", file2P0L0, Optional.of(2))); + } + + @Test + public void testKeepLatestCommits() throws IOException { + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) + .withAssumeDatePartitioning(true) + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS) + .retainCommits(2).build()).build(); + + // make 1 commit, with 1 file per partition + HoodieTestUtils.createCommitFiles(basePath, "000"); + + String file1P0C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "000"); + String file1P1C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "000"); + + HoodieTable table = HoodieTable + .getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), + config); + + List hoodieCleanStatsOne = table.clean(jsc); + assertEquals("Must not clean any files", 0, + getCleanStat(hoodieCleanStatsOne, partitionPaths[0]).getSuccessDeleteFiles().size()); + assertEquals("Must not clean any files", 0, + getCleanStat(hoodieCleanStatsOne, partitionPaths[1]).getSuccessDeleteFiles().size()); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "000", file1P1C0)); + + // make next commit, with 1 insert & 1 update per partition + HoodieTestUtils.createCommitFiles(basePath, "001"); + table = HoodieTable + 
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), + config); + + String file2P0C1 = HoodieTestUtils + .createNewDataFile(basePath, partitionPaths[0], "001"); // insert + String file2P1C1 = HoodieTestUtils + .createNewDataFile(basePath, partitionPaths[1], "001"); // insert + HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "001", file1P0C0); // update + HoodieTestUtils.createDataFile(basePath, partitionPaths[1], "001", file1P1C0); // update + + List hoodieCleanStatsTwo = table.clean(jsc); + assertEquals("Must not clean any files", 0, + getCleanStat(hoodieCleanStatsTwo, partitionPaths[0]).getSuccessDeleteFiles().size()); + assertEquals("Must not clean any files", 0, + getCleanStat(hoodieCleanStatsTwo, partitionPaths[1]).getSuccessDeleteFiles().size()); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "001", file2P1C1)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "000", file1P1C0)); + + // make next commit, with 2 updates to existing files, and 1 insert + HoodieTestUtils.createCommitFiles(basePath, "002"); + table = HoodieTable + .getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), + config); + + HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file1P0C0); // update + HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file2P0C1); // update + String file3P0C2 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "002"); + + List hoodieCleanStatsThree = table.clean(jsc); + assertEquals( + "Must not clean any file. 
We have to keep 1 version before the latest commit time to keep", + 0, getCleanStat(hoodieCleanStatsThree, partitionPaths[0]).getSuccessDeleteFiles().size()); + + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0)); + + // make next commit, with 2 updates to existing files, and 1 insert + HoodieTestUtils.createCommitFiles(basePath, "003"); + table = HoodieTable + .getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), + config); + + HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "003", file1P0C0); // update + HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "003", file2P0C1); // update + String file4P0C3 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "003"); + + List hoodieCleanStatsFour = table.clean(jsc); + assertEquals( + "Must not clean one old file", 1, + getCleanStat(hoodieCleanStatsFour, partitionPaths[0]).getSuccessDeleteFiles().size()); + + assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file1P0C0)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "002", file1P0C0)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "002", file2P0C1)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "002", file3P0C2)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "003", file4P0C3)); + + // No cleaning on partially written file, with no commit. 
+ HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "004", file3P0C2); // update + List hoodieCleanStatsFive = table.clean(jsc); + assertEquals("Must not clean any files", 0, + getCleanStat(hoodieCleanStatsFive, partitionPaths[0]).getSuccessDeleteFiles().size()); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file1P0C0)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1)); + } + + @Test + public void testCleaningWithZeroPartitonPaths() throws IOException { + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) + .withAssumeDatePartitioning(true) + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS) + .retainCommits(2).build()).build(); + + // Make a commit, although there are no partitionPaths. + // Example use-case of this is when a client wants to create a table + // with just some commit metadata, but no data/partitionPaths. + HoodieTestUtils.createCommitFiles(basePath, "000"); + + HoodieTable table = HoodieTable + .getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), + config); + + List hoodieCleanStatsOne = table.clean(jsc); + assertTrue("HoodieCleanStats should be empty for a table with empty partitionPaths", + hoodieCleanStatsOne.isEmpty()); + } + + @Test + public void testCleaningSkewedPartitons() throws IOException { + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) + .withAssumeDatePartitioning(true) + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS) + .retainCommits(2).build()).build(); + Map stageOneShuffleReadTaskRecordsCountMap = new HashMap<>(); + + // Since clean involves repartition in order to uniformly distribute data, + // we can inspect the number of records read by various tasks in stage 1. 
+ // There should not be skew in the number of records read in the task. + + // SparkListener below listens to the stage end events and captures number of + // records read by various tasks in stage-1. + jsc.sc().addSparkListener(new SparkListener() { + + @Override + public void onTaskEnd(SparkListenerTaskEnd taskEnd) { + + Iterator> iterator = taskEnd.taskMetrics().accumulators() + .iterator(); + while (iterator.hasNext()) { + AccumulatorV2 accumulator = iterator.next(); + if (taskEnd.stageId() == 1 && + accumulator.isRegistered() && + accumulator.name().isDefined() && + accumulator.name().get().equals("internal.metrics.shuffle.read.recordsRead")) { + stageOneShuffleReadTaskRecordsCountMap + .put(taskEnd.taskInfo().taskId(), (Long) accumulator.value()); + } } - assertEquals("Total number of records must add up", totalInserts, inserts1.size() + inserts2.size() + insert3.size()); + } + }); + + // make 1 commit, with 100 files in one partition and 10 in other two + HoodieTestUtils.createCommitFiles(basePath, "000"); + List filesP0C0 = createFilesInPartition(partitionPaths[0], "000", 100); + List filesP1C0 = createFilesInPartition(partitionPaths[1], "000", 10); + List filesP2C0 = createFilesInPartition(partitionPaths[2], "000", 10); + + HoodieTestUtils.createCommitFiles(basePath, "001"); + updateAllFilesInPartition(filesP0C0, partitionPaths[0], "001"); + updateAllFilesInPartition(filesP1C0, partitionPaths[1], "001"); + updateAllFilesInPartition(filesP2C0, partitionPaths[2], "001"); + + HoodieTestUtils.createCommitFiles(basePath, "002"); + updateAllFilesInPartition(filesP0C0, partitionPaths[0], "002"); + updateAllFilesInPartition(filesP1C0, partitionPaths[1], "002"); + updateAllFilesInPartition(filesP2C0, partitionPaths[2], "002"); + + HoodieTestUtils.createCommitFiles(basePath, "003"); + updateAllFilesInPartition(filesP0C0, partitionPaths[0], "003"); + updateAllFilesInPartition(filesP1C0, partitionPaths[1], "003"); + updateAllFilesInPartition(filesP2C0, 
partitionPaths[2], "003"); + + HoodieTable table = HoodieTable + .getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), + config); + List hoodieCleanStats = table.clean(jsc); + + assertEquals(100, + getCleanStat(hoodieCleanStats, partitionPaths[0]).getSuccessDeleteFiles().size()); + assertEquals(10, + getCleanStat(hoodieCleanStats, partitionPaths[1]).getSuccessDeleteFiles().size()); + assertEquals(10, + getCleanStat(hoodieCleanStats, partitionPaths[2]).getSuccessDeleteFiles().size()); + + // 3 tasks are expected since the number of partitions is 3 + assertEquals(3, stageOneShuffleReadTaskRecordsCountMap.keySet().size()); + // Sum of all records processed = total number of files to clean + assertEquals(120, stageOneShuffleReadTaskRecordsCountMap + .values().stream().reduce((a, b) -> a + b).get().intValue()); + assertTrue("The skew in handling files to clean is not removed. " + + "Each task should handle more records than the partitionPath with least files " + + "and less records than the partitionPath with most files.", + stageOneShuffleReadTaskRecordsCountMap.values().stream().filter(a -> a > 10 && a < 100) + .count() == 3); + } + + public void testCommitWritesRelativePaths() throws Exception { + + HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).build(); + HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); + FileSystem fs = FSUtils.getFs(); + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath); + HoodieTable table = HoodieTable.getHoodieTable(metaClient, cfg); + + String commitTime = "000"; + client.startCommitWithTime(commitTime); + + List records = dataGen.generateInserts(commitTime, 200); + JavaRDD writeRecords = jsc.parallelize(records, 1); + + JavaRDD result = client.bulkInsert(writeRecords, commitTime); + + assertTrue("Commit should succeed", client.commit(commitTime, result)); + assertTrue("After explicit commit, commit file should be created", + 
HoodieTestUtils.doesCommitExist(basePath, commitTime)); + + // Get parquet file paths from commit metadata + String actionType = table.getCompactedCommitActionType(); + HoodieInstant commitInstant = + new HoodieInstant(false, actionType, commitTime); + HoodieTimeline commitTimeline = table.getCompletedCompactionCommitTimeline(); + HoodieCommitMetadata commitMetadata = + HoodieCommitMetadata.fromBytes(commitTimeline.getInstantDetails(commitInstant).get()); + String basePath = table.getMetaClient().getBasePath(); + Collection commitPathNames = commitMetadata.getFileIdAndFullPaths(basePath).values(); + + // Read from commit file + String filename = HoodieTestUtils.getCommitFilePath(basePath, commitTime); + FileInputStream inputStream = new FileInputStream(filename); + String everything = IOUtils.toString(inputStream); + HoodieCommitMetadata metadata = HoodieCommitMetadata.fromJsonString(everything.toString()); + HashMap paths = metadata.getFileIdAndFullPaths(basePath); + inputStream.close(); + + // Compare values in both to make sure they are equal. 
+ for (String pathName : paths.values()) { + assertTrue(commitPathNames.contains(pathName)); } + } - @Test - public void testKeepLatestFileVersions() throws IOException { - HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) - .withAssumeDatePartitioning(true) - .withCompactionConfig(HoodieCompactionConfig.newBuilder() - .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS) - .retainFileVersions(1).build()).build(); + private HoodieCleanStat getCleanStat(List hoodieCleanStatsTwo, + String partitionPath) { + return hoodieCleanStatsTwo.stream() + .filter(e -> e.getPartitionPath().equals(partitionPath)) + .findFirst().get(); + } - // make 1 commit, with 1 file per partition - HoodieTestUtils.createCommitFiles(basePath, "000"); - - String file1P0C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "000"); - String file1P1C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "000"); - HoodieTable table = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config); - - List hoodieCleanStatsOne = table.clean(jsc); - assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsOne, partitionPaths[0]).getSuccessDeleteFiles().size()); - assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsOne, partitionPaths[1]).getSuccessDeleteFiles().size()); - assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0)); - assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "000", file1P1C0)); - - // make next commit, with 1 insert & 1 update per partition - HoodieTestUtils.createCommitFiles(basePath, "001"); - table = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config); - - String file2P0C1 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "001"); // insert - String file2P1C1 = 
HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "001"); // insert - HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "001", file1P0C0); // update - HoodieTestUtils.createDataFile(basePath, partitionPaths[1], "001", file1P1C0); // update - - List hoodieCleanStatsTwo = table.clean(jsc); - assertEquals("Must clean 1 file" , 1, getCleanStat(hoodieCleanStatsTwo, partitionPaths[0]).getSuccessDeleteFiles().size()); - assertEquals("Must clean 1 file" , 1, getCleanStat(hoodieCleanStatsTwo, partitionPaths[1]).getSuccessDeleteFiles().size()); - assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1)); - assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "001", file2P1C1)); - assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0)); - assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "000", file1P1C0)); - - // make next commit, with 2 updates to existing files, and 1 insert - HoodieTestUtils.createCommitFiles(basePath, "002"); - table = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config); - - HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file1P0C0); // update - HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file2P0C1); // update - String file3P0C2 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "002"); - - List hoodieCleanStatsThree = table.clean(jsc); - assertEquals("Must clean two files" , 2, getCleanStat(hoodieCleanStatsThree, partitionPaths[0]).getSuccessDeleteFiles().size()); - assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file1P0C0)); - assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1)); - assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "002", file3P0C2)); - - // No cleaning on partially written file, with no 
commit. - HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "003", file3P0C2); // update - List hoodieCleanStatsFour = table.clean(jsc); - assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsFour, partitionPaths[0]).getSuccessDeleteFiles().size()); - assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "002", file3P0C2)); + private void updateAllFilesInPartition(List files, String partitionPath, + String commitTime) throws IOException { + for (String fileId : files) { + HoodieTestUtils.createDataFile(basePath, partitionPath, commitTime, fileId); } + } - @Test - public void testKeepLatestFileVersionsMOR() throws IOException { - - HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) - .withAssumeDatePartitioning(true) - .withCompactionConfig(HoodieCompactionConfig.newBuilder() - .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS) - .retainFileVersions(1).build()).build(); - - - HoodieTableMetaClient metaClient = HoodieTestUtils.initTableType(basePath, HoodieTableType.MERGE_ON_READ); - - // Make 3 files, one base file and 2 log files associated with base file - String file1P0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "000"); - String file2P0L0 = HoodieTestUtils.createNewLogFile(basePath, partitionPaths[0], "000", file1P0, Optional.empty()); - String file2P0L1 = HoodieTestUtils.createNewLogFile(basePath, partitionPaths[0], "000", file1P0, Optional.of(2)); - // make 1 compaction commit - HoodieTestUtils.createCompactionCommitFiles(basePath, "000"); - - // Make 4 files, one base file and 3 log files associated with base file - HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "001", file1P0); - file2P0L0 = HoodieTestUtils.createNewLogFile(basePath, partitionPaths[0], "001", file1P0, Optional.empty()); - file2P0L0 = HoodieTestUtils.createNewLogFile(basePath, partitionPaths[0], "001", file1P0, Optional.of(2)); - file2P0L0 = 
HoodieTestUtils.createNewLogFile(basePath, partitionPaths[0], "001", file1P0, Optional.of(3)); - // make 1 compaction commit - HoodieTestUtils.createCompactionCommitFiles(basePath, "001"); - - HoodieTable table = HoodieTable - .getHoodieTable(metaClient, config); - List hoodieCleanStats = table.clean(jsc); - assertEquals("Must clean three files, one parquet and 2 log files" , 3, getCleanStat(hoodieCleanStats, partitionPaths[0]).getSuccessDeleteFiles().size()); - assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0)); - assertFalse(HoodieTestUtils.doesLogFileExist(basePath, partitionPaths[0], "000", file2P0L0, Optional.empty())); - assertFalse(HoodieTestUtils.doesLogFileExist(basePath, partitionPaths[0], "000", file2P0L0, Optional.of(2))); + private List createFilesInPartition(String partitionPath, String commitTime, int numFiles) + throws IOException { + List files = new ArrayList<>(); + for (int i = 0; i < numFiles; i++) { + files.add(HoodieTestUtils.createNewDataFile(basePath, partitionPath, commitTime)); } + return files; + } - @Test - public void testKeepLatestCommits() throws IOException { - HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) - .withAssumeDatePartitioning(true) - .withCompactionConfig(HoodieCompactionConfig.newBuilder() - .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS) - .retainCommits(2).build()).build(); - - // make 1 commit, with 1 file per partition - HoodieTestUtils.createCommitFiles(basePath, "000"); - - String file1P0C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "000"); - String file1P1C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "000"); - - HoodieTable table = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config); - - List hoodieCleanStatsOne = table.clean(jsc); - assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsOne, 
partitionPaths[0]).getSuccessDeleteFiles().size()); - assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsOne, partitionPaths[1]).getSuccessDeleteFiles().size()); - assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0)); - assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "000", file1P1C0)); - - // make next commit, with 1 insert & 1 update per partition - HoodieTestUtils.createCommitFiles(basePath, "001"); - table = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config); - - String file2P0C1 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "001"); // insert - String file2P1C1 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "001"); // insert - HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "001", file1P0C0); // update - HoodieTestUtils.createDataFile(basePath, partitionPaths[1], "001", file1P1C0); // update - - List hoodieCleanStatsTwo = table.clean(jsc); - assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsTwo, partitionPaths[0]).getSuccessDeleteFiles().size()); - assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsTwo, partitionPaths[1]).getSuccessDeleteFiles().size()); - assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1)); - assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "001", file2P1C1)); - assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0)); - assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "000", file1P1C0)); - - // make next commit, with 2 updates to existing files, and 1 insert - HoodieTestUtils.createCommitFiles(basePath, "002"); - table = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config); - - HoodieTestUtils.createDataFile(basePath, 
partitionPaths[0], "002", file1P0C0); // update - HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file2P0C1); // update - String file3P0C2 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "002"); - - List hoodieCleanStatsThree = table.clean(jsc); - assertEquals( - "Must not clean any file. We have to keep 1 version before the latest commit time to keep", - 0, getCleanStat(hoodieCleanStatsThree, partitionPaths[0]).getSuccessDeleteFiles().size()); - - assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0)); - - // make next commit, with 2 updates to existing files, and 1 insert - HoodieTestUtils.createCommitFiles(basePath, "003"); - table = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config); - - HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "003", file1P0C0); // update - HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "003", file2P0C1); // update - String file4P0C3 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "003"); - - List hoodieCleanStatsFour = table.clean(jsc); - assertEquals( - "Must not clean one old file", 1, getCleanStat(hoodieCleanStatsFour, partitionPaths[0]).getSuccessDeleteFiles().size()); - - assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0)); - assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file1P0C0)); - assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "002", file1P0C0)); - assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1)); - assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "002", file2P0C1)); - assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "002", file3P0C2)); - assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "003", file4P0C3)); - - // No cleaning on 
partially written file, with no commit. - HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "004", file3P0C2); // update - List hoodieCleanStatsFive = table.clean(jsc); - assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsFive, partitionPaths[0]).getSuccessDeleteFiles().size()); - assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file1P0C0)); - assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1)); + @After + public void clean() { + if (basePath != null) { + new File(basePath).delete(); } - - @Test - public void testCleaningWithZeroPartitonPaths() throws IOException { - HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) - .withAssumeDatePartitioning(true) - .withCompactionConfig(HoodieCompactionConfig.newBuilder() - .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS) - .retainCommits(2).build()).build(); - - // Make a commit, although there are no partitionPaths. - // Example use-case of this is when a client wants to create a table - // with just some commit metadata, but no data/partitionPaths. 
- HoodieTestUtils.createCommitFiles(basePath, "000"); - - HoodieTable table = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), - config); - - List hoodieCleanStatsOne = table.clean(jsc); - assertTrue("HoodieCleanStats should be empty for a table with empty partitionPaths", - hoodieCleanStatsOne.isEmpty()); - } - - @Test - public void testCleaningSkewedPartitons() throws IOException { - HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) - .withAssumeDatePartitioning(true) - .withCompactionConfig(HoodieCompactionConfig.newBuilder() - .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS) - .retainCommits(2).build()).build(); - Map stageOneShuffleReadTaskRecordsCountMap = new HashMap<>(); - - // Since clean involves repartition in order to uniformly distribute data, - // we can inspect the number of records read by various tasks in stage 1. - // There should not be skew in the number of records read in the task. - - // SparkListener below listens to the stage end events and captures number of - // records read by various tasks in stage-1. 
- jsc.sc().addSparkListener(new SparkListener() { - - @Override - public void onTaskEnd(SparkListenerTaskEnd taskEnd) { - - Iterator> iterator = taskEnd.taskMetrics().accumulators() - .iterator(); - while(iterator.hasNext()) { - AccumulatorV2 accumulator = iterator.next(); - if (taskEnd.stageId() == 1 && - accumulator.isRegistered() && - accumulator.name().isDefined() && - accumulator.name().get().equals("internal.metrics.shuffle.read.recordsRead")) { - stageOneShuffleReadTaskRecordsCountMap.put(taskEnd.taskInfo().taskId(), (Long) accumulator.value()); - } - } - } - }); - - // make 1 commit, with 100 files in one partition and 10 in other two - HoodieTestUtils.createCommitFiles(basePath, "000"); - List filesP0C0 = createFilesInPartition(partitionPaths[0], "000", 100); - List filesP1C0 = createFilesInPartition(partitionPaths[1], "000", 10); - List filesP2C0 = createFilesInPartition(partitionPaths[2], "000", 10); - - HoodieTestUtils.createCommitFiles(basePath, "001"); - updateAllFilesInPartition(filesP0C0, partitionPaths[0], "001"); - updateAllFilesInPartition(filesP1C0, partitionPaths[1], "001"); - updateAllFilesInPartition(filesP2C0, partitionPaths[2], "001"); - - HoodieTestUtils.createCommitFiles(basePath, "002"); - updateAllFilesInPartition(filesP0C0, partitionPaths[0], "002"); - updateAllFilesInPartition(filesP1C0, partitionPaths[1], "002"); - updateAllFilesInPartition(filesP2C0, partitionPaths[2], "002"); - - HoodieTestUtils.createCommitFiles(basePath, "003"); - updateAllFilesInPartition(filesP0C0, partitionPaths[0], "003"); - updateAllFilesInPartition(filesP1C0, partitionPaths[1], "003"); - updateAllFilesInPartition(filesP2C0, partitionPaths[2], "003"); - - HoodieTable table = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config); - List hoodieCleanStats = table.clean(jsc); - - assertEquals(100, getCleanStat(hoodieCleanStats, partitionPaths[0]).getSuccessDeleteFiles().size()); - assertEquals(10, 
getCleanStat(hoodieCleanStats, partitionPaths[1]).getSuccessDeleteFiles().size()); - assertEquals(10, getCleanStat(hoodieCleanStats, partitionPaths[2]).getSuccessDeleteFiles().size()); - - // 3 tasks are expected since the number of partitions is 3 - assertEquals(3, stageOneShuffleReadTaskRecordsCountMap.keySet().size()); - // Sum of all records processed = total number of files to clean - assertEquals(120, stageOneShuffleReadTaskRecordsCountMap - .values().stream().reduce((a,b) -> a + b).get().intValue()); - assertTrue("The skew in handling files to clean is not removed. " - + "Each task should handle more records than the partitionPath with least files " - + "and less records than the partitionPath with most files.", - stageOneShuffleReadTaskRecordsCountMap.values().stream().filter(a -> a > 10 && a < 100).count() == 3); - } - - public void testCommitWritesRelativePaths() throws Exception { - - HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).build(); - HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); - FileSystem fs = FSUtils.getFs(); - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath); - HoodieTable table = HoodieTable.getHoodieTable(metaClient, cfg); - - String commitTime = "000"; - client.startCommitWithTime(commitTime); - - List records = dataGen.generateInserts(commitTime, 200); - JavaRDD writeRecords = jsc.parallelize(records, 1); - - JavaRDD result = client.bulkInsert(writeRecords, commitTime); - - assertTrue("Commit should succeed", client.commit(commitTime, result)); - assertTrue("After explicit commit, commit file should be created", - HoodieTestUtils.doesCommitExist(basePath, commitTime)); - - // Get parquet file paths from commit metadata - String actionType = table.getCompactedCommitActionType(); - HoodieInstant commitInstant = - new HoodieInstant(false, actionType, commitTime); - HoodieTimeline commitTimeline = table.getCompletedCompactionCommitTimeline(); - HoodieCommitMetadata commitMetadata = - 
HoodieCommitMetadata.fromBytes(commitTimeline.getInstantDetails(commitInstant).get()); - String basePath = table.getMetaClient().getBasePath(); - Collection commitPathNames = commitMetadata.getFileIdAndFullPaths(basePath).values(); - - // Read from commit file - String filename = HoodieTestUtils.getCommitFilePath(basePath, commitTime); - FileInputStream inputStream = new FileInputStream(filename); - String everything = IOUtils.toString(inputStream); - HoodieCommitMetadata metadata = HoodieCommitMetadata.fromJsonString(everything.toString()); - HashMap paths = metadata.getFileIdAndFullPaths(basePath); - inputStream.close(); - - // Compare values in both to make sure they are equal. - for (String pathName : paths.values()) { - assertTrue(commitPathNames.contains(pathName)); - } - } - - private HoodieCleanStat getCleanStat(List hoodieCleanStatsTwo, - String partitionPath) { - return hoodieCleanStatsTwo.stream() - .filter(e -> e.getPartitionPath().equals(partitionPath)) - .findFirst().get(); - } - - private void updateAllFilesInPartition(List files, String partitionPath, - String commitTime) throws IOException { - for (String fileId : files) { - HoodieTestUtils.createDataFile(basePath, partitionPath, commitTime, fileId); - } - } - - private List createFilesInPartition(String partitionPath, String commitTime, int numFiles) throws IOException { - List files = new ArrayList<>(); - for (int i = 0; i < numFiles; i++) { - files.add(HoodieTestUtils.createNewDataFile(basePath, partitionPath, commitTime)); - } - return files; - } - - @After - public void clean() { - if (basePath != null) { - new File(basePath).delete(); - } - if (jsc != null) { - jsc.stop(); - } + if (jsc != null) { + jsc.stop(); } + } } diff --git a/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieClientTestUtils.java b/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieClientTestUtils.java index 002b6cd20..c2db12d5e 100644 --- 
a/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieClientTestUtils.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieClientTestUtils.java @@ -29,15 +29,6 @@ import com.uber.hoodie.common.table.view.HoodieTableFileSystemView; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.exception.HoodieException; import com.uber.hoodie.table.HoodieTable; - -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SQLContext; - -import org.apache.spark.SparkConf; - import java.io.File; import java.io.IOException; import java.io.RandomAccessFile; @@ -49,6 +40,12 @@ import java.util.Iterator; import java.util.List; import java.util.Set; import java.util.stream.Collectors; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SQLContext; /** * Utility methods to aid testing inside the HoodieClient module. 
@@ -56,133 +53,142 @@ import java.util.stream.Collectors; public class HoodieClientTestUtils { - public static List collectStatuses(Iterator> statusListItr) { - List statuses = new ArrayList<>(); - while (statusListItr.hasNext()) { - statuses.addAll(statusListItr.next()); - } - return statuses; - } - - public static Set getRecordKeys(List hoodieRecords) { - Set keys = new HashSet<>(); - for (HoodieRecord rec: hoodieRecords) { - keys.add(rec.getRecordKey()); - } - return keys; - } - - private static void fakeMetaFile(String basePath, String commitTime, String suffix) throws IOException { - String parentPath = basePath + "/"+ HoodieTableMetaClient.METAFOLDER_NAME; - new File(parentPath).mkdirs(); - new File(parentPath + "/" + commitTime + suffix).createNewFile(); - } - - - public static void fakeCommitFile(String basePath, String commitTime) throws IOException { - fakeMetaFile(basePath, commitTime, HoodieTimeline.COMMIT_EXTENSION); - } - - public static void fakeInFlightFile(String basePath, String commitTime) throws IOException { - fakeMetaFile(basePath, commitTime, HoodieTimeline.INFLIGHT_EXTENSION); - } - - public static void fakeDataFile(String basePath, String partitionPath, String commitTime, String fileId) throws Exception { - fakeDataFile(basePath, partitionPath, commitTime, fileId, 0); - } - - public static void fakeDataFile(String basePath, String partitionPath, String commitTime, String fileId, long length) throws Exception { - String parentPath = String.format("%s/%s", basePath, partitionPath); - new File(parentPath).mkdirs(); - String path = String.format("%s/%s", parentPath, FSUtils.makeDataFileName(commitTime, 0, fileId)); - new File(path).createNewFile(); - new RandomAccessFile(path, "rw").setLength(length); - } - - public static SparkConf getSparkConfForTest(String appName) { - SparkConf sparkConf = new SparkConf() - .setAppName(appName) - .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") - .setMaster("local[1]"); - return 
HoodieReadClient.addHoodieSupport(sparkConf); - } - - public static HashMap getLatestFileIDsToFullPath(String basePath, - HoodieTimeline commitTimeline, - List commitsToReturn) throws IOException { - HashMap fileIdToFullPath = new HashMap<>(); - for (HoodieInstant commit : commitsToReturn) { - HoodieCommitMetadata metadata = - HoodieCommitMetadata.fromBytes(commitTimeline.getInstantDetails(commit).get()); - fileIdToFullPath.putAll(metadata.getFileIdAndFullPaths(basePath)); - } - return fileIdToFullPath; - } - - public static Dataset readCommit(String basePath, - SQLContext sqlContext, - HoodieTimeline commitTimeline, - String commitTime) { - HoodieInstant commitInstant = - new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime); - if (!commitTimeline.containsInstant(commitInstant)) { - new HoodieException("No commit exists at " + commitTime); - } - try { - HashMap paths = getLatestFileIDsToFullPath(basePath, commitTimeline, Arrays.asList(commitInstant)); - return sqlContext.read() - .parquet(paths.values().toArray(new String[paths.size()])) - .filter(String.format("%s ='%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitTime)); - } catch (Exception e) { - throw new HoodieException("Error reading commit " + commitTime, e); - } - } - - /** - * Obtain all new data written into the Hoodie dataset since the given timestamp. - */ - public static Dataset readSince(String basePath, - SQLContext sqlContext, - HoodieTimeline commitTimeline, - String lastCommitTime) { - List commitsToReturn = - commitTimeline.findInstantsAfter(lastCommitTime, Integer.MAX_VALUE) - .getInstants().collect(Collectors.toList()); - try { - // Go over the commit metadata, and obtain the new files that need to be read. 
- HashMap fileIdToFullPath = getLatestFileIDsToFullPath(basePath, commitTimeline, commitsToReturn); - return sqlContext.read() - .parquet(fileIdToFullPath.values().toArray(new String[fileIdToFullPath.size()])) - .filter(String.format("%s >'%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, lastCommitTime)); - } catch (IOException e) { - throw new HoodieException("Error pulling data incrementally from commitTimestamp :" + lastCommitTime, e); - } - } - - /** - * Reads the paths under the a hoodie dataset out as a DataFrame - */ - public static Dataset read(String basePath, - SQLContext sqlContext, - FileSystem fs, - String... paths) { - List filteredPaths = new ArrayList<>(); - try { - HoodieTable hoodieTable = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(fs, basePath, true), null); - for (String path : paths) { - TableFileSystemView.ReadOptimizedView fileSystemView = new HoodieTableFileSystemView(hoodieTable.getMetaClient(), - hoodieTable.getCompletedCommitTimeline(), fs.globStatus(new Path(path))); - List latestFiles = fileSystemView.getLatestDataFiles().collect( - Collectors.toList()); - for (HoodieDataFile file : latestFiles) { - filteredPaths.add(file.getPath()); - } - } - return sqlContext.read() - .parquet(filteredPaths.toArray(new String[filteredPaths.size()])); - } catch (Exception e) { - throw new HoodieException("Error reading hoodie dataset as a dataframe", e); + public static List collectStatuses(Iterator> statusListItr) { + List statuses = new ArrayList<>(); + while (statusListItr.hasNext()) { + statuses.addAll(statusListItr.next()); + } + return statuses; + } + + public static Set getRecordKeys(List hoodieRecords) { + Set keys = new HashSet<>(); + for (HoodieRecord rec : hoodieRecords) { + keys.add(rec.getRecordKey()); + } + return keys; + } + + private static void fakeMetaFile(String basePath, String commitTime, String suffix) + throws IOException { + String parentPath = basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME; + new 
File(parentPath).mkdirs(); + new File(parentPath + "/" + commitTime + suffix).createNewFile(); + } + + + public static void fakeCommitFile(String basePath, String commitTime) throws IOException { + fakeMetaFile(basePath, commitTime, HoodieTimeline.COMMIT_EXTENSION); + } + + public static void fakeInFlightFile(String basePath, String commitTime) throws IOException { + fakeMetaFile(basePath, commitTime, HoodieTimeline.INFLIGHT_EXTENSION); + } + + public static void fakeDataFile(String basePath, String partitionPath, String commitTime, + String fileId) throws Exception { + fakeDataFile(basePath, partitionPath, commitTime, fileId, 0); + } + + public static void fakeDataFile(String basePath, String partitionPath, String commitTime, + String fileId, long length) throws Exception { + String parentPath = String.format("%s/%s", basePath, partitionPath); + new File(parentPath).mkdirs(); + String path = String + .format("%s/%s", parentPath, FSUtils.makeDataFileName(commitTime, 0, fileId)); + new File(path).createNewFile(); + new RandomAccessFile(path, "rw").setLength(length); + } + + public static SparkConf getSparkConfForTest(String appName) { + SparkConf sparkConf = new SparkConf() + .setAppName(appName) + .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") + .setMaster("local[1]"); + return HoodieReadClient.addHoodieSupport(sparkConf); + } + + public static HashMap getLatestFileIDsToFullPath(String basePath, + HoodieTimeline commitTimeline, + List commitsToReturn) throws IOException { + HashMap fileIdToFullPath = new HashMap<>(); + for (HoodieInstant commit : commitsToReturn) { + HoodieCommitMetadata metadata = + HoodieCommitMetadata.fromBytes(commitTimeline.getInstantDetails(commit).get()); + fileIdToFullPath.putAll(metadata.getFileIdAndFullPaths(basePath)); + } + return fileIdToFullPath; + } + + public static Dataset readCommit(String basePath, + SQLContext sqlContext, + HoodieTimeline commitTimeline, + String commitTime) { + HoodieInstant 
commitInstant = + new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime); + if (!commitTimeline.containsInstant(commitInstant)) { + new HoodieException("No commit exists at " + commitTime); + } + try { + HashMap paths = getLatestFileIDsToFullPath(basePath, commitTimeline, + Arrays.asList(commitInstant)); + return sqlContext.read() + .parquet(paths.values().toArray(new String[paths.size()])) + .filter(String.format("%s ='%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitTime)); + } catch (Exception e) { + throw new HoodieException("Error reading commit " + commitTime, e); + } + } + + /** + * Obtain all new data written into the Hoodie dataset since the given timestamp. + */ + public static Dataset readSince(String basePath, + SQLContext sqlContext, + HoodieTimeline commitTimeline, + String lastCommitTime) { + List commitsToReturn = + commitTimeline.findInstantsAfter(lastCommitTime, Integer.MAX_VALUE) + .getInstants().collect(Collectors.toList()); + try { + // Go over the commit metadata, and obtain the new files that need to be read. + HashMap fileIdToFullPath = getLatestFileIDsToFullPath(basePath, + commitTimeline, commitsToReturn); + return sqlContext.read() + .parquet(fileIdToFullPath.values().toArray(new String[fileIdToFullPath.size()])) + .filter( + String.format("%s >'%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, lastCommitTime)); + } catch (IOException e) { + throw new HoodieException( + "Error pulling data incrementally from commitTimestamp :" + lastCommitTime, e); + } + } + + /** + * Reads the paths under the a hoodie dataset out as a DataFrame + */ + public static Dataset read(String basePath, + SQLContext sqlContext, + FileSystem fs, + String... 
paths) { + List filteredPaths = new ArrayList<>(); + try { + HoodieTable hoodieTable = HoodieTable + .getHoodieTable(new HoodieTableMetaClient(fs, basePath, true), null); + for (String path : paths) { + TableFileSystemView.ReadOptimizedView fileSystemView = new HoodieTableFileSystemView( + hoodieTable.getMetaClient(), + hoodieTable.getCompletedCommitTimeline(), fs.globStatus(new Path(path))); + List latestFiles = fileSystemView.getLatestDataFiles().collect( + Collectors.toList()); + for (HoodieDataFile file : latestFiles) { + filteredPaths.add(file.getPath()); } + } + return sqlContext.read() + .parquet(filteredPaths.toArray(new String[filteredPaths.size()])); + } catch (Exception e) { + throw new HoodieException("Error reading hoodie dataset as a dataframe", e); } + } } diff --git a/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieMergeOnReadTestUtils.java b/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieMergeOnReadTestUtils.java index 4b2424eb7..ec3d5728c 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieMergeOnReadTestUtils.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieMergeOnReadTestUtils.java @@ -16,9 +16,16 @@ package com.uber.hoodie.common; +import static com.uber.hoodie.common.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA; + import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.common.util.HoodieAvroUtils; import com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.GenericRecordBuilder; @@ -30,69 +37,64 @@ import org.apache.hadoop.mapred.InputSplit; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.RecordReader; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import 
java.util.List; -import java.util.stream.Collectors; - -import static com.uber.hoodie.common.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA; - /** * Utility methods to aid in testing MergeOnRead (workaround for HoodieReadClient for MOR) */ public class HoodieMergeOnReadTestUtils { - public static List getRecordsUsingInputFormat(List inputPaths) throws IOException { - JobConf jobConf = new JobConf(); - Schema schema = HoodieAvroUtils.addMetadataFields(Schema.parse(TRIP_EXAMPLE_SCHEMA)); - HoodieRealtimeInputFormat inputFormat = new HoodieRealtimeInputFormat(); - setPropsForInputFormat(inputFormat, jobConf, schema); - return inputPaths.stream().map(path -> { - setInputPath(jobConf, path); - List records = new ArrayList<>(); - try { - List splits = Arrays.asList(inputFormat.getSplits(jobConf, 1)); - RecordReader recordReader = inputFormat.getRecordReader(splits.get(0), jobConf, null); - Void key = (Void) recordReader.createKey(); - ArrayWritable writable = (ArrayWritable) recordReader.createValue(); - while (recordReader.next(key, writable)) { - GenericRecordBuilder newRecord = new GenericRecordBuilder(schema); - // writable returns an array with [field1, field2, _hoodie_commit_time, _hoodie_commit_seqno] - Writable[] values = writable.get(); - schema.getFields().forEach(field -> { - newRecord.set(field, values[2]); - }); - records.add(newRecord.build()); - } - } catch (IOException ie) { - ie.printStackTrace(); - } - return records; - }).reduce((a, b) -> { - a.addAll(b); - return a; - }).get(); - } + public static List getRecordsUsingInputFormat(List inputPaths) + throws IOException { + JobConf jobConf = new JobConf(); + Schema schema = HoodieAvroUtils.addMetadataFields(Schema.parse(TRIP_EXAMPLE_SCHEMA)); + HoodieRealtimeInputFormat inputFormat = new HoodieRealtimeInputFormat(); + setPropsForInputFormat(inputFormat, jobConf, schema); + return inputPaths.stream().map(path -> { + setInputPath(jobConf, path); + List records = new ArrayList<>(); + try { + List splits = 
Arrays.asList(inputFormat.getSplits(jobConf, 1)); + RecordReader recordReader = inputFormat.getRecordReader(splits.get(0), jobConf, null); + Void key = (Void) recordReader.createKey(); + ArrayWritable writable = (ArrayWritable) recordReader.createValue(); + while (recordReader.next(key, writable)) { + GenericRecordBuilder newRecord = new GenericRecordBuilder(schema); + // writable returns an array with [field1, field2, _hoodie_commit_time, _hoodie_commit_seqno] + Writable[] values = writable.get(); + schema.getFields().forEach(field -> { + newRecord.set(field, values[2]); + }); + records.add(newRecord.build()); + } + } catch (IOException ie) { + ie.printStackTrace(); + } + return records; + }).reduce((a, b) -> { + a.addAll(b); + return a; + }).get(); + } - private static void setPropsForInputFormat(HoodieRealtimeInputFormat inputFormat, JobConf jobConf, Schema schema) { - List fields = schema.getFields(); - String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(",")); - String postions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(",")); - Configuration conf = FSUtils.getFs().getConf(); - jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names); - jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions); - jobConf.set("partition_columns", "datestr"); - conf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names); - conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions); - conf.set("partition_columns", "datestr"); - inputFormat.setConf(conf); - jobConf.addResource(conf); - } + private static void setPropsForInputFormat(HoodieRealtimeInputFormat inputFormat, JobConf jobConf, + Schema schema) { + List fields = schema.getFields(); + String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(",")); + String postions = fields.stream().map(f -> String.valueOf(f.pos())) + .collect(Collectors.joining(",")); + Configuration conf = 
FSUtils.getFs().getConf(); + jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names); + jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions); + jobConf.set("partition_columns", "datestr"); + conf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names); + conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions); + conf.set("partition_columns", "datestr"); + inputFormat.setConf(conf); + jobConf.addResource(conf); + } - private static void setInputPath(JobConf jobConf, String inputPath) { - jobConf.set("mapreduce.input.fileinputformat.inputdir", inputPath); - jobConf.set("mapreduce.input.fileinputformat.inputdir", inputPath); - jobConf.set("map.input.dir", inputPath); - } + private static void setInputPath(JobConf jobConf, String inputPath) { + jobConf.set("mapreduce.input.fileinputformat.inputdir", inputPath); + jobConf.set("mapreduce.input.fileinputformat.inputdir", inputPath); + jobConf.set("map.input.dir", inputPath); + } } diff --git a/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieTestDataGenerator.java b/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieTestDataGenerator.java index fe9c9fd49..c197e6b51 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieTestDataGenerator.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieTestDataGenerator.java @@ -16,17 +16,21 @@ package com.uber.hoodie.common; -import com.uber.hoodie.avro.model.HoodieCleanMetadata; -import com.uber.hoodie.common.model.HoodieCleaningPolicy; import com.uber.hoodie.common.model.HoodieCommitMetadata; import com.uber.hoodie.common.model.HoodieKey; import com.uber.hoodie.common.model.HoodiePartitionMetadata; import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.table.HoodieTimeline; -import com.uber.hoodie.common.util.AvroUtils; import com.uber.hoodie.common.util.FSUtils; import 
com.uber.hoodie.common.util.HoodieAvroUtils; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; +import java.util.Random; +import java.util.UUID; import org.apache.avro.Schema; import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericRecord; @@ -34,15 +38,6 @@ import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Optional; -import java.util.Random; -import java.util.UUID; - /** * Class to be used in tests to keep generating test inserts and updates against a corpus. * @@ -51,153 +46,164 @@ import java.util.UUID; public class HoodieTestDataGenerator { static class KeyPartition { - HoodieKey key; - String partitionPath; + + HoodieKey key; + String partitionPath; + } + + public static String TRIP_EXAMPLE_SCHEMA = "{\"type\": \"record\"," + + "\"name\": \"triprec\"," + + "\"fields\": [ " + + "{\"name\": \"timestamp\",\"type\": \"double\"}," + + "{\"name\": \"_row_key\", \"type\": \"string\"}," + + "{\"name\": \"rider\", \"type\": \"string\"}," + + "{\"name\": \"driver\", \"type\": \"string\"}," + + "{\"name\": \"begin_lat\", \"type\": \"double\"}," + + "{\"name\": \"begin_lon\", \"type\": \"double\"}," + + "{\"name\": \"end_lat\", \"type\": \"double\"}," + + "{\"name\": \"end_lon\", \"type\": \"double\"}," + + "{\"name\":\"fare\",\"type\": \"double\"}]}"; + + // based on examination of sample file, the schema produces the following per record size + public static final int SIZE_PER_RECORD = 50 * 1024; + + public static final String[] DEFAULT_PARTITION_PATHS = {"2016/03/15", "2015/03/16", "2015/03/17"}; + + + public static void writePartitionMetadata(FileSystem fs, String[] partitionPaths, + String basePath) { + 
for (String partitionPath : partitionPaths) { + new HoodiePartitionMetadata(fs, "000", new Path(basePath), new Path(basePath, partitionPath)) + .trySave(0); } + } - public static String TRIP_EXAMPLE_SCHEMA = "{\"type\": \"record\"," - + "\"name\": \"triprec\"," - + "\"fields\": [ " - + "{\"name\": \"timestamp\",\"type\": \"double\"}," - + "{\"name\": \"_row_key\", \"type\": \"string\"}," - + "{\"name\": \"rider\", \"type\": \"string\"}," - + "{\"name\": \"driver\", \"type\": \"string\"}," - + "{\"name\": \"begin_lat\", \"type\": \"double\"}," - + "{\"name\": \"begin_lon\", \"type\": \"double\"}," - + "{\"name\": \"end_lat\", \"type\": \"double\"}," - + "{\"name\": \"end_lon\", \"type\": \"double\"}," - + "{\"name\":\"fare\",\"type\": \"double\"}]}"; + private List existingKeysList = new ArrayList<>(); + public static Schema avroSchema = HoodieAvroUtils + .addMetadataFields(new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA)); + private static Random rand = new Random(46474747); + private String[] partitionPaths = DEFAULT_PARTITION_PATHS; - // based on examination of sample file, the schema produces the following per record size - public static final int SIZE_PER_RECORD = 50 * 1024; + public HoodieTestDataGenerator(String[] partitionPaths) { + this.partitionPaths = partitionPaths; + } - public static final String[] DEFAULT_PARTITION_PATHS = {"2016/03/15", "2015/03/16", "2015/03/17"}; + public HoodieTestDataGenerator() { + this(new String[]{"2016/03/15", "2015/03/16", "2015/03/17"}); + } - public static void writePartitionMetadata(FileSystem fs, String[] partitionPaths, String basePath) { - for (String partitionPath: partitionPaths) { - new HoodiePartitionMetadata(fs, "000", new Path(basePath), new Path(basePath, partitionPath)).trySave(0); - } + /** + * Generates new inserts, uniformly across the partition paths above. It also updates the list of + * existing keys. 
+ */ + public List generateInserts(String commitTime, int n) throws IOException { + List inserts = new ArrayList<>(); + for (int i = 0; i < n; i++) { + String partitionPath = partitionPaths[rand.nextInt(partitionPaths.length)]; + HoodieKey key = new HoodieKey(UUID.randomUUID().toString(), partitionPath); + HoodieRecord record = new HoodieRecord(key, generateRandomValue(key, commitTime)); + inserts.add(record); + + KeyPartition kp = new KeyPartition(); + kp.key = key; + kp.partitionPath = partitionPath; + existingKeysList.add(kp); } + return inserts; + } - private List existingKeysList = new ArrayList<>(); - public static Schema avroSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA)); - private static Random rand = new Random(46474747); - private String[] partitionPaths = DEFAULT_PARTITION_PATHS; + public List generateDeletes(String commitTime, int n) throws IOException { + List inserts = generateInserts(commitTime, n); + return generateDeletesFromExistingRecords(inserts); + } + + public List generateDeletesFromExistingRecords(List existingRecords) + throws IOException { + List deletes = new ArrayList<>(); + for (HoodieRecord existingRecord : existingRecords) { + HoodieRecord record = generateDeleteRecord(existingRecord); + deletes.add(record); - public HoodieTestDataGenerator(String[] partitionPaths) { - this.partitionPaths = partitionPaths; } + return deletes; + } - public HoodieTestDataGenerator() { - this(new String[]{"2016/03/15", "2015/03/16", "2015/03/17"}); + public HoodieRecord generateDeleteRecord(HoodieRecord existingRecord) throws IOException { + HoodieKey key = existingRecord.getKey(); + TestRawTripPayload payload = new TestRawTripPayload(Optional.empty(), key.getRecordKey(), + key.getPartitionPath(), null, true); + return new HoodieRecord(key, payload); + } + + public List generateUpdates(String commitTime, List baseRecords) + throws IOException { + List updates = new ArrayList<>(); + for (HoodieRecord baseRecord 
: baseRecords) { + HoodieRecord record = new HoodieRecord(baseRecord.getKey(), + generateRandomValue(baseRecord.getKey(), commitTime)); + updates.add(record); } + return updates; + } - - /** - * Generates new inserts, uniformly across the partition paths above. It also updates the list - * of existing keys. - */ - public List generateInserts(String commitTime, int n) throws IOException { - List inserts = new ArrayList<>(); - for (int i = 0; i < n; i++) { - String partitionPath = partitionPaths[rand.nextInt(partitionPaths.length)]; - HoodieKey key = new HoodieKey(UUID.randomUUID().toString(), partitionPath); - HoodieRecord record = new HoodieRecord(key, generateRandomValue(key, commitTime)); - inserts.add(record); - - KeyPartition kp = new KeyPartition(); - kp.key = key; - kp.partitionPath = partitionPath; - existingKeysList.add(kp); - } - return inserts; + /** + * Generates new updates, randomly distributed across the keys above. + */ + public List generateUpdates(String commitTime, int n) throws IOException { + List updates = new ArrayList<>(); + for (int i = 0; i < n; i++) { + KeyPartition kp = existingKeysList.get(rand.nextInt(existingKeysList.size() - 1)); + HoodieRecord record = new HoodieRecord(kp.key, generateRandomValue(kp.key, commitTime)); + updates.add(record); } + return updates; + } - public List generateDeletes(String commitTime, int n) throws IOException { - List inserts = generateInserts(commitTime, n); - return generateDeletesFromExistingRecords(inserts); - } - - public List generateDeletesFromExistingRecords(List existingRecords) throws IOException { - List deletes = new ArrayList<>(); - for (HoodieRecord existingRecord: existingRecords) { - HoodieRecord record = generateDeleteRecord(existingRecord); - deletes.add(record); - - } - return deletes; - } - - public HoodieRecord generateDeleteRecord(HoodieRecord existingRecord) throws IOException { - HoodieKey key = existingRecord.getKey(); - TestRawTripPayload payload = new 
TestRawTripPayload(Optional.empty(), key.getRecordKey(), key.getPartitionPath(), null, true); - return new HoodieRecord(key, payload); - } - - public List generateUpdates(String commitTime, List baseRecords) throws IOException { - List updates = new ArrayList<>(); - for (HoodieRecord baseRecord: baseRecords) { - HoodieRecord record = new HoodieRecord(baseRecord.getKey(), generateRandomValue(baseRecord.getKey(), commitTime)); - updates.add(record); - } - return updates; - } - - /** - * Generates new updates, randomly distributed across the keys above. - */ - public List generateUpdates(String commitTime, int n) throws IOException { - List updates = new ArrayList<>(); - for (int i = 0; i < n; i++) { - KeyPartition kp = existingKeysList.get(rand.nextInt(existingKeysList.size() - 1)); - HoodieRecord record = new HoodieRecord(kp.key, generateRandomValue(kp.key, commitTime)); - updates.add(record); - } - return updates; - } - - - /** - * Generates a new avro record of the above schema format, retaining the key if optionally - * provided. 
- */ - public static TestRawTripPayload generateRandomValue(HoodieKey key, String commitTime) throws IOException { - GenericRecord rec = generateGenericRecord(key.getRecordKey(), "rider-" + commitTime, - "driver-" + commitTime, 0.0); - HoodieAvroUtils.addCommitMetadataToRecord(rec, commitTime, "-1"); - return new TestRawTripPayload(rec.toString(), key.getRecordKey(), key.getPartitionPath(), TRIP_EXAMPLE_SCHEMA); - } - - public static GenericRecord generateGenericRecord(String rowKey, String riderName, - String driverName, double timestamp) { - GenericRecord rec = new GenericData.Record(avroSchema); - rec.put("_row_key", rowKey); - rec.put("timestamp", timestamp); - rec.put("rider", riderName); - rec.put("driver", driverName); - rec.put("begin_lat", rand.nextDouble()); - rec.put("begin_lon", rand.nextDouble()); - rec.put("end_lat", rand.nextDouble()); - rec.put("end_lon", rand.nextDouble()); - rec.put("fare", rand.nextDouble() * 100); - return rec; - } - - public static void createCommitFile(String basePath, String commitTime) throws IOException { - Path commitFile = - new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline.makeCommitFileName(commitTime)); - FileSystem fs = FSUtils.getFs(); - FSDataOutputStream os = fs.create(commitFile, true); - HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata(); - try { - // Write empty commit metadata - os.writeBytes(new String(commitMetadata.toJsonString().getBytes( - StandardCharsets.UTF_8))); - } finally { - os.close(); - } + + /** + * Generates a new avro record of the above schema format, retaining the key if optionally + * provided. 
+ */ + public static TestRawTripPayload generateRandomValue(HoodieKey key, String commitTime) + throws IOException { + GenericRecord rec = generateGenericRecord(key.getRecordKey(), "rider-" + commitTime, + "driver-" + commitTime, 0.0); + HoodieAvroUtils.addCommitMetadataToRecord(rec, commitTime, "-1"); + return new TestRawTripPayload(rec.toString(), key.getRecordKey(), key.getPartitionPath(), + TRIP_EXAMPLE_SCHEMA); + } + + public static GenericRecord generateGenericRecord(String rowKey, String riderName, + String driverName, double timestamp) { + GenericRecord rec = new GenericData.Record(avroSchema); + rec.put("_row_key", rowKey); + rec.put("timestamp", timestamp); + rec.put("rider", riderName); + rec.put("driver", driverName); + rec.put("begin_lat", rand.nextDouble()); + rec.put("begin_lon", rand.nextDouble()); + rec.put("end_lat", rand.nextDouble()); + rec.put("end_lon", rand.nextDouble()); + rec.put("fare", rand.nextDouble() * 100); + return rec; + } + + public static void createCommitFile(String basePath, String commitTime) throws IOException { + Path commitFile = + new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline + .makeCommitFileName(commitTime)); + FileSystem fs = FSUtils.getFs(); + FSDataOutputStream os = fs.create(commitFile, true); + HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata(); + try { + // Write empty commit metadata + os.writeBytes(new String(commitMetadata.toJsonString().getBytes( + StandardCharsets.UTF_8))); + } finally { + os.close(); } + } public static void createSavepointFile(String basePath, String commitTime) throws IOException { Path commitFile = @@ -215,7 +221,7 @@ public class HoodieTestDataGenerator { } } - public String[] getPartitionPaths() { - return partitionPaths; - } + public String[] getPartitionPaths() { + return partitionPaths; + } } diff --git a/hoodie-client/src/test/java/com/uber/hoodie/common/TestRawTripPayload.java 
b/hoodie-client/src/test/java/com/uber/hoodie/common/TestRawTripPayload.java index 572792495..11c790f66 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/common/TestRawTripPayload.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/common/TestRawTripPayload.java @@ -17,174 +17,182 @@ package com.uber.hoodie.common; import com.fasterxml.jackson.databind.ObjectMapper; - import com.uber.hoodie.WriteStatus; import com.uber.hoodie.avro.MercifulJsonConverter; import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecordPayload; - +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.StringWriter; import java.util.HashMap; import java.util.List; -import java.util.Map.Entry; -import org.apache.avro.Schema; -import org.apache.avro.generic.IndexedRecord; -import org.apache.commons.io.IOUtils; - -import java.io.*; import java.util.Map; +import java.util.Map.Entry; import java.util.Optional; import java.util.zip.Deflater; import java.util.zip.DeflaterOutputStream; import java.util.zip.InflaterInputStream; +import org.apache.avro.Schema; +import org.apache.avro.generic.IndexedRecord; +import org.apache.commons.io.IOUtils; /** * Example row change event based on some example data used by testcases. The data avro schema is * src/test/resources/schema1. 
*/ public class TestRawTripPayload implements HoodieRecordPayload { - private transient static final ObjectMapper mapper = new ObjectMapper(); - private String partitionPath; - private String rowKey; - private byte[] jsonDataCompressed; - private int dataSize; - private boolean isDeleted; - public TestRawTripPayload(Optional jsonData, String rowKey, String partitionPath, - String schemaStr, Boolean isDeleted) throws IOException { - if(jsonData.isPresent()) { - this.jsonDataCompressed = compressData(jsonData.get()); - this.dataSize = jsonData.get().length(); - } - this.rowKey = rowKey; - this.partitionPath = partitionPath; - this.isDeleted = isDeleted; + private transient static final ObjectMapper mapper = new ObjectMapper(); + private String partitionPath; + private String rowKey; + private byte[] jsonDataCompressed; + private int dataSize; + private boolean isDeleted; + + public TestRawTripPayload(Optional jsonData, String rowKey, String partitionPath, + String schemaStr, Boolean isDeleted) throws IOException { + if (jsonData.isPresent()) { + this.jsonDataCompressed = compressData(jsonData.get()); + this.dataSize = jsonData.get().length(); } + this.rowKey = rowKey; + this.partitionPath = partitionPath; + this.isDeleted = isDeleted; + } - public TestRawTripPayload(String jsonData, String rowKey, String partitionPath, - String schemaStr)throws IOException { - this(Optional.of(jsonData), rowKey, partitionPath, schemaStr, false); + public TestRawTripPayload(String jsonData, String rowKey, String partitionPath, + String schemaStr) throws IOException { + this(Optional.of(jsonData), rowKey, partitionPath, schemaStr, false); + } + + public TestRawTripPayload(String jsonData) throws IOException { + this.jsonDataCompressed = compressData(jsonData); + this.dataSize = jsonData.length(); + Map jsonRecordMap = mapper.readValue(jsonData, Map.class); + this.rowKey = jsonRecordMap.get("_row_key").toString(); + this.partitionPath = 
jsonRecordMap.get("time").toString().split("T")[0].replace("-", "/"); + this.isDeleted = false; + } + + public String getPartitionPath() { + return partitionPath; + } + + + @Override + public TestRawTripPayload preCombine(TestRawTripPayload another) { + return another; + } + + @Override + public Optional combineAndGetUpdateValue(IndexedRecord oldRec, Schema schema) + throws IOException { + return this.getInsertValue(schema); + } + + @Override + public Optional getInsertValue(Schema schema) throws IOException { + if (isDeleted) { + return Optional.empty(); + } else { + MercifulJsonConverter jsonConverter = new MercifulJsonConverter(schema); + return Optional.of(jsonConverter.convert(getJsonData())); } + } - public TestRawTripPayload(String jsonData) throws IOException { - this.jsonDataCompressed = compressData(jsonData); - this.dataSize = jsonData.length(); - Map jsonRecordMap = mapper.readValue(jsonData, Map.class); - this.rowKey = jsonRecordMap.get("_row_key").toString(); - this.partitionPath = jsonRecordMap.get("time").toString().split("T")[0].replace("-", "/"); - this.isDeleted = false; - } - - public String getPartitionPath() { - return partitionPath; + @Override + public Optional> getMetadata() { + // Let's assume we want to count the number of input row change events + // that are processed. Let the time-bucket for this row change event be 1506582000. 
+ Map metadataMap = new HashMap<>(); + metadataMap.put("InputRecordCount_1506582000", "2"); + return Optional.of(metadataMap); + } + + public String getRowKey() { + return rowKey; + } + + public String getJsonData() throws IOException { + return unCompressData(jsonDataCompressed); + } + + private byte[] compressData(String jsonData) throws IOException { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + DeflaterOutputStream dos = + new DeflaterOutputStream(baos, new Deflater(Deflater.BEST_COMPRESSION), true); + try { + dos.write(jsonData.getBytes()); + } finally { + dos.flush(); + dos.close(); } + return baos.toByteArray(); + } - @Override public TestRawTripPayload preCombine(TestRawTripPayload another) { - return another; - } + private String unCompressData(byte[] data) throws IOException { + InflaterInputStream iis = new InflaterInputStream(new ByteArrayInputStream(data)); + StringWriter sw = new StringWriter(dataSize); + IOUtils.copy(iis, sw); + return sw.toString(); + } - @Override public Optional combineAndGetUpdateValue(IndexedRecord oldRec, Schema schema) throws IOException { - return this.getInsertValue(schema); - } + /** + * A custom {@link WriteStatus} that merges passed metadata key value map to {@code + * WriteStatus.markSuccess()} and {@code WriteStatus.markFailure()}. 
+ */ + public static class MetadataMergeWriteStatus extends WriteStatus { - @Override public Optional getInsertValue(Schema schema) throws IOException { - if(isDeleted){ - return Optional.empty(); - } else { - MercifulJsonConverter jsonConverter = new MercifulJsonConverter(schema); - return Optional.of(jsonConverter.convert(getJsonData())); - } + private Map mergedMetadataMap = new HashMap<>(); + + @Override + public void markSuccess(HoodieRecord record, Optional> recordMetadata) { + super.markSuccess(record, recordMetadata); + if (recordMetadata.isPresent()) { + mergeMetadataMaps(recordMetadata.get(), mergedMetadataMap); + } } @Override - public Optional> getMetadata() { - // Let's assume we want to count the number of input row change events - // that are processed. Let the time-bucket for this row change event be 1506582000. - Map metadataMap = new HashMap<>(); - metadataMap.put("InputRecordCount_1506582000", "2"); - return Optional.of(metadataMap); + public void markFailure(HoodieRecord record, Throwable t, + Optional> recordMetadata) { + super.markFailure(record, t, recordMetadata); + if (recordMetadata.isPresent()) { + mergeMetadataMaps(recordMetadata.get(), mergedMetadataMap); + } } - public String getRowKey() { - return rowKey; + public static Map mergeMetadataForWriteStatuses( + List writeStatuses) { + Map allWriteStatusMergedMetadataMap = new HashMap<>(); + for (WriteStatus writeStatus : writeStatuses) { + MetadataMergeWriteStatus.mergeMetadataMaps( + ((MetadataMergeWriteStatus) writeStatus).getMergedMetadataMap(), + allWriteStatusMergedMetadataMap); + } + return allWriteStatusMergedMetadataMap; } - public String getJsonData() throws IOException { - return unCompressData(jsonDataCompressed); + private static void mergeMetadataMaps(Map mergeFromMap, + Map mergeToMap) { + for (Entry entry : mergeFromMap.entrySet()) { + String key = entry.getKey(); + if (!mergeToMap.containsKey(key)) { + mergeToMap.put(key, "0"); + } + mergeToMap + .put(key, 
addStrsAsInt(entry.getValue(), mergeToMap.get(key))); + } } - private byte[] compressData(String jsonData) throws IOException { - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - DeflaterOutputStream dos = - new DeflaterOutputStream(baos, new Deflater(Deflater.BEST_COMPRESSION), true); - try { - dos.write(jsonData.getBytes()); - } finally { - dos.flush(); - dos.close(); - } - return baos.toByteArray(); + private Map getMergedMetadataMap() { + return mergedMetadataMap; } - - private String unCompressData(byte[] data) throws IOException { - InflaterInputStream iis = new InflaterInputStream(new ByteArrayInputStream(data)); - StringWriter sw = new StringWriter(dataSize); - IOUtils.copy(iis, sw); - return sw.toString(); - } - - /** - * A custom {@link WriteStatus} that merges passed metadata key value map - * to {@code WriteStatus.markSuccess()} and {@code WriteStatus.markFailure()}. - */ - public static class MetadataMergeWriteStatus extends WriteStatus { - private Map mergedMetadataMap = new HashMap<>(); - - @Override - public void markSuccess(HoodieRecord record, Optional> recordMetadata) { - super.markSuccess(record, recordMetadata); - if(recordMetadata.isPresent()) { - mergeMetadataMaps(recordMetadata.get(), mergedMetadataMap); - } - } - - @Override - public void markFailure(HoodieRecord record, Throwable t, - Optional> recordMetadata) { - super.markFailure(record, t, recordMetadata); - if(recordMetadata.isPresent()) { - mergeMetadataMaps(recordMetadata.get(), mergedMetadataMap); - } - } - - public static Map mergeMetadataForWriteStatuses(List writeStatuses) { - Map allWriteStatusMergedMetadataMap = new HashMap<>(); - for (WriteStatus writeStatus : writeStatuses) { - MetadataMergeWriteStatus.mergeMetadataMaps( - ((MetadataMergeWriteStatus)writeStatus).getMergedMetadataMap(), - allWriteStatusMergedMetadataMap); - } - return allWriteStatusMergedMetadataMap; - } - - private static void mergeMetadataMaps(Map mergeFromMap, Map mergeToMap) { - for (Entry entry 
: mergeFromMap.entrySet()) { - String key = entry.getKey(); - if(!mergeToMap.containsKey(key)) { - mergeToMap.put(key, "0"); - } - mergeToMap - .put(key, addStrsAsInt(entry.getValue(), mergeToMap.get(key))); - } - } - - private Map getMergedMetadataMap() { - return mergedMetadataMap; - } - - private static String addStrsAsInt(String a, String b) { - return String.valueOf(Integer.parseInt(a) + Integer.parseInt(b)); - } + private static String addStrsAsInt(String a, String b) { + return String.valueOf(Integer.parseInt(a) + Integer.parseInt(b)); } + } } diff --git a/hoodie-client/src/test/java/com/uber/hoodie/config/HoodieWriteConfigTest.java b/hoodie-client/src/test/java/com/uber/hoodie/config/HoodieWriteConfigTest.java index 957b02c6b..395197bf9 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/config/HoodieWriteConfigTest.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/config/HoodieWriteConfigTest.java @@ -16,7 +16,7 @@ package com.uber.hoodie.config; -import static org.junit.Assert.*; +import static org.junit.Assert.assertEquals; import com.google.common.collect.Maps; import com.uber.hoodie.config.HoodieWriteConfig.Builder; @@ -29,6 +29,7 @@ import java.util.Properties; import org.junit.Test; public class HoodieWriteConfigTest { + @Test public void testPropertyLoading() throws IOException { Builder builder = HoodieWriteConfig.newBuilder().withPath("/tmp"); @@ -46,13 +47,14 @@ public class HoodieWriteConfigTest { HoodieWriteConfig config = builder.build(); assertEquals(config.getMaxCommitsToKeep(), 5); assertEquals(config.getMinCommitsToKeep(), 2); -} + } - private ByteArrayOutputStream saveParamsIntoOutputStream(Map params) throws IOException { + private ByteArrayOutputStream saveParamsIntoOutputStream(Map params) + throws IOException { Properties properties = new Properties(); properties.putAll(params); ByteArrayOutputStream outStream = new ByteArrayOutputStream(); properties.store(outStream, "Saved on " + new Date(System.currentTimeMillis())); 
return outStream; } -} \ No newline at end of file +} diff --git a/hoodie-client/src/test/java/com/uber/hoodie/func/TestUpdateMapFunction.java b/hoodie-client/src/test/java/com/uber/hoodie/func/TestUpdateMapFunction.java index 955865e1f..8433c0366 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/func/TestUpdateMapFunction.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/func/TestUpdateMapFunction.java @@ -16,105 +16,103 @@ package com.uber.hoodie.func; -import com.uber.hoodie.common.table.HoodieTableMetaClient; -import com.uber.hoodie.common.table.HoodieTimeline; -import com.uber.hoodie.config.HoodieWriteConfig; +import static org.junit.Assert.fail; + import com.uber.hoodie.WriteStatus; import com.uber.hoodie.common.TestRawTripPayload; import com.uber.hoodie.common.model.HoodieKey; import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecordLocation; import com.uber.hoodie.common.model.HoodieTestUtils; +import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.util.FSUtils; +import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.table.HoodieCopyOnWriteTable; - +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; import org.apache.commons.io.IOUtils; import org.apache.hadoop.fs.Path; import org.junit.Before; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; - -import static org.junit.Assert.fail; - public class TestUpdateMapFunction { - private String basePath = null; - @Before - public void init() throws Exception { - // Create a temp folder as the base path - TemporaryFolder folder = new TemporaryFolder(); - folder.create(); - this.basePath = folder.getRoot().getAbsolutePath(); - HoodieTestUtils.init(basePath); + private String basePath = null; + + @Before + public void init() throws Exception 
{ + // Create a temp folder as the base path + TemporaryFolder folder = new TemporaryFolder(); + folder.create(); + this.basePath = folder.getRoot().getAbsolutePath(); + HoodieTestUtils.init(basePath); + } + + @Test + public void testSchemaEvolutionOnUpdate() throws Exception { + // Create a bunch of records with a old version of schema + HoodieWriteConfig config = makeHoodieClientConfig("/exampleSchema.txt"); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath); + HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata); + + String recordStr1 = + "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; + String recordStr2 = + "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; + String recordStr3 = + "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; + List records = new ArrayList<>(); + TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1); + records.add( + new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), + rowChange1)); + TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2); + records.add( + new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), + rowChange2)); + TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3); + records.add( + new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), + rowChange3)); + Iterator> insertResult = table.handleInsert("100", records.iterator()); + Path commitFile = + new Path(config.getBasePath() + "/.hoodie/" + HoodieTimeline.makeCommitFileName("100")); + FSUtils.getFs().create(commitFile); + + // Now try an update with an evolved schema + // Evolved schema does not have guarantee on preserving the original field ordering + config = 
makeHoodieClientConfig("/exampleEvolvedSchema.txt"); + metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath); + String fileId = insertResult.next().get(0).getFileId(); + System.out.println(fileId); + + table = new HoodieCopyOnWriteTable(config, metadata); + // New content with values for the newly added field + recordStr1 = + "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12,\"added_field\":1}"; + records = new ArrayList<>(); + rowChange1 = new TestRawTripPayload(recordStr1); + HoodieRecord record1 = + new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), + rowChange1); + record1.setCurrentLocation(new HoodieRecordLocation("100", fileId)); + records.add(record1); + + try { + table.handleUpdate("101", fileId, records.iterator()); + } catch (ClassCastException e) { + fail( + "UpdateFunction could not read records written with exampleSchema.txt using the exampleEvolvedSchema.txt"); } + } - @Test - public void testSchemaEvolutionOnUpdate() throws Exception { - // Create a bunch of records with a old version of schema - HoodieWriteConfig config = makeHoodieClientConfig("/exampleSchema.txt"); - HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath); - HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata); - - String recordStr1 = - "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; - String recordStr2 = - "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; - String recordStr3 = - "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; - List records = new ArrayList<>(); - TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1); - records.add( - new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), - 
rowChange1)); - TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2); - records.add( - new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), - rowChange2)); - TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3); - records.add( - new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), - rowChange3)); - Iterator> insertResult = table.handleInsert("100", records.iterator()); - Path commitFile = - new Path(config.getBasePath() + "/.hoodie/" + HoodieTimeline.makeCommitFileName("100")); - FSUtils.getFs().create(commitFile); - - // Now try an update with an evolved schema - // Evolved schema does not have guarantee on preserving the original field ordering - config = makeHoodieClientConfig("/exampleEvolvedSchema.txt"); - metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath); - String fileId = insertResult.next().get(0).getFileId(); - System.out.println(fileId); - - - table = new HoodieCopyOnWriteTable(config, metadata); - // New content with values for the newly added field - recordStr1 = - "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12,\"added_field\":1}"; - records = new ArrayList<>(); - rowChange1 = new TestRawTripPayload(recordStr1); - HoodieRecord record1 = - new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), - rowChange1); - record1.setCurrentLocation(new HoodieRecordLocation("100", fileId)); - records.add(record1); - - try { - table.handleUpdate("101", fileId, records.iterator()); - } catch (ClassCastException e) { - fail( - "UpdateFunction could not read records written with exampleSchema.txt using the exampleEvolvedSchema.txt"); - } - } - - private HoodieWriteConfig makeHoodieClientConfig(String schema) throws Exception { - // Prepare the AvroParquetIO - String schemaStr = IOUtils.toString(getClass().getResourceAsStream(schema), "UTF-8"); - return 
HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(schemaStr).build(); - } + private HoodieWriteConfig makeHoodieClientConfig(String schema) throws Exception { + // Prepare the AvroParquetIO + String schemaStr = IOUtils.toString(getClass().getResourceAsStream(schema), "UTF-8"); + return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(schemaStr).build(); + } } diff --git a/hoodie-client/src/test/java/com/uber/hoodie/index/TestHoodieIndex.java b/hoodie-client/src/test/java/com/uber/hoodie/index/TestHoodieIndex.java index de9c2d368..2fba00693 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/index/TestHoodieIndex.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/index/TestHoodieIndex.java @@ -16,32 +16,31 @@ package com.uber.hoodie.index; -import com.uber.hoodie.config.HoodieWriteConfig; +import static org.junit.Assert.assertTrue; import com.uber.hoodie.config.HoodieIndexConfig; +import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.index.bloom.HoodieBloomIndex; import com.uber.hoodie.index.hbase.HBaseIndex; - import org.junit.Test; -import static org.junit.Assert.*; - public class TestHoodieIndex { - @Test - public void testCreateIndex() throws Exception { - HoodieWriteConfig.Builder clientConfigBuilder = HoodieWriteConfig.newBuilder(); - HoodieIndexConfig.Builder indexConfigBuilder = HoodieIndexConfig.newBuilder(); - // Different types - HoodieWriteConfig config = clientConfigBuilder.withPath("") - .withIndexConfig(indexConfigBuilder.withIndexType(HoodieIndex.IndexType.HBASE).build()) - .build(); - assertTrue(HoodieIndex.createIndex(config, null) instanceof HBaseIndex); - config = clientConfigBuilder.withPath("").withIndexConfig( - indexConfigBuilder.withIndexType(HoodieIndex.IndexType.INMEMORY).build()).build(); - assertTrue(HoodieIndex.createIndex(config, null) instanceof InMemoryHashIndex); - config = clientConfigBuilder.withPath("") - 
.withIndexConfig(indexConfigBuilder.withIndexType(HoodieIndex.IndexType.BLOOM).build()) - .build(); - assertTrue(HoodieIndex.createIndex(config, null) instanceof HoodieBloomIndex); - } + + @Test + public void testCreateIndex() throws Exception { + HoodieWriteConfig.Builder clientConfigBuilder = HoodieWriteConfig.newBuilder(); + HoodieIndexConfig.Builder indexConfigBuilder = HoodieIndexConfig.newBuilder(); + // Different types + HoodieWriteConfig config = clientConfigBuilder.withPath("") + .withIndexConfig(indexConfigBuilder.withIndexType(HoodieIndex.IndexType.HBASE).build()) + .build(); + assertTrue(HoodieIndex.createIndex(config, null) instanceof HBaseIndex); + config = clientConfigBuilder.withPath("").withIndexConfig( + indexConfigBuilder.withIndexType(HoodieIndex.IndexType.INMEMORY).build()).build(); + assertTrue(HoodieIndex.createIndex(config, null) instanceof InMemoryHashIndex); + config = clientConfigBuilder.withPath("") + .withIndexConfig(indexConfigBuilder.withIndexType(HoodieIndex.IndexType.BLOOM).build()) + .build(); + assertTrue(HoodieIndex.createIndex(config, null) instanceof HoodieBloomIndex); + } } diff --git a/hoodie-client/src/test/java/com/uber/hoodie/index/bloom/TestHoodieBloomIndex.java b/hoodie-client/src/test/java/com/uber/hoodie/index/bloom/TestHoodieBloomIndex.java index 1a49b5953..8d4cc2558 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/index/bloom/TestHoodieBloomIndex.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/index/bloom/TestHoodieBloomIndex.java @@ -18,28 +18,39 @@ package com.uber.hoodie.index.bloom; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + import com.google.common.base.Optional; import com.google.common.collect.Lists; - -import com.uber.hoodie.common.HoodieClientTestUtils; -import 
com.uber.hoodie.common.table.HoodieTableMetaClient; -import com.uber.hoodie.config.HoodieIndexConfig; -import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.avro.HoodieAvroWriteSupport; import com.uber.hoodie.common.BloomFilter; +import com.uber.hoodie.common.HoodieClientTestUtils; import com.uber.hoodie.common.TestRawTripPayload; import com.uber.hoodie.common.model.HoodieKey; import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieTestUtils; +import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.common.util.HoodieAvroUtils; - -import com.uber.hoodie.index.bloom.BloomIndexFileInfo; -import com.uber.hoodie.index.bloom.HoodieBloomIndex; -import com.uber.hoodie.index.bloom.HoodieBloomIndexCheckFunction; +import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.io.storage.HoodieParquetConfig; import com.uber.hoodie.io.storage.HoodieParquetWriter; import com.uber.hoodie.table.HoodieTable; +import java.io.File; +import java.io.IOException; +import java.text.SimpleDateFormat; +import java.util.Arrays; +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.UUID; +import java.util.stream.Collectors; import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.commons.io.IOUtils; @@ -47,11 +58,8 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.parquet.avro.AvroSchemaConverter; -import org.apache.parquet.avro.AvroWriteSupport; import org.apache.parquet.hadoop.ParquetWriter; -import org.apache.parquet.hadoop.api.WriteSupport; import org.apache.parquet.hadoop.metadata.CompressionCodecName; -import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import 
org.apache.spark.api.java.JavaSparkContext; @@ -59,464 +67,489 @@ import org.junit.After; import org.junit.Before; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import org.mockito.Mockito; - import scala.Tuple2; -import java.io.File; -import java.io.IOException; -import java.text.SimpleDateFormat; -import java.util.*; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -import static org.junit.Assert.*; - public class TestHoodieBloomIndex { - private JavaSparkContext jsc = null; - private String basePath = null; - private transient final FileSystem fs; - private String schemaStr; - private Schema schema; - public TestHoodieBloomIndex() throws Exception { - fs = FSUtils.getFs(); - } + private JavaSparkContext jsc = null; + private String basePath = null; + private transient final FileSystem fs; + private String schemaStr; + private Schema schema; - @Before - public void init() throws IOException { - // Initialize a local spark env - jsc = new JavaSparkContext(HoodieClientTestUtils.getSparkConfForTest("TestHoodieBloomIndex")); - // Create a temp folder as the base path - TemporaryFolder folder = new TemporaryFolder(); - folder.create(); - basePath = folder.getRoot().getAbsolutePath(); - HoodieTestUtils.init(basePath); - // We have some records to be tagged (two different partitions) - schemaStr = IOUtils.toString(getClass().getResourceAsStream("/exampleSchema.txt"), "UTF-8"); - schema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(schemaStr)); - } + public TestHoodieBloomIndex() throws Exception { + fs = FSUtils.getFs(); + } - @Test - public void testLoadUUIDsInMemory() throws IOException { - // Create one RDD of hoodie record - String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; - String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; - String recordStr3 = 
"{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; - String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}"; + @Before + public void init() throws IOException { + // Initialize a local spark env + jsc = new JavaSparkContext(HoodieClientTestUtils.getSparkConfForTest("TestHoodieBloomIndex")); + // Create a temp folder as the base path + TemporaryFolder folder = new TemporaryFolder(); + folder.create(); + basePath = folder.getRoot().getAbsolutePath(); + HoodieTestUtils.init(basePath); + // We have some records to be tagged (two different partitions) + schemaStr = IOUtils.toString(getClass().getResourceAsStream("/exampleSchema.txt"), "UTF-8"); + schema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(schemaStr)); + } - TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1); - HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); - TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2); - HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); - TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3); - HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); - TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4); - HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); + @Test + public void testLoadUUIDsInMemory() throws IOException { + // Create one RDD of hoodie record + String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; + String recordStr2 = 
"{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; + String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; + String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}"; - JavaRDD recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4)); + TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1); + HoodieRecord record1 = new HoodieRecord( + new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); + TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2); + HoodieRecord record2 = new HoodieRecord( + new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); + TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3); + HoodieRecord record3 = new HoodieRecord( + new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); + TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4); + HoodieRecord record4 = new HoodieRecord( + new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); - // Load to memory - HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); + JavaRDD recordRDD = jsc + .parallelize(Arrays.asList(record1, record2, record3, record4)); - Map> map = recordRDD - .mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey())) - .groupByKey().collectAsMap(); - assertEquals(map.size(), 2); - List list1 = Lists.newArrayList(map.get("2016/01/31")); - List list2 = Lists.newArrayList(map.get("2015/01/31")); - assertEquals(list1.size(), 3); - assertEquals(list2.size(), 1); - } + // Load to memory + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); - @Test - public void testLoadInvolvedFiles() throws IOException { - 
HoodieWriteConfig config = HoodieWriteConfig.newBuilder() - .withPath(basePath) - .build(); - HoodieBloomIndex index = new HoodieBloomIndex(config, jsc); + Map> map = recordRDD + .mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey())) + .groupByKey().collectAsMap(); + assertEquals(map.size(), 2); + List list1 = Lists.newArrayList(map.get("2016/01/31")); + List list2 = Lists.newArrayList(map.get("2015/01/31")); + assertEquals(list1.size(), 3); + assertEquals(list2.size(), 1); + } - // Create some partitions, and put some files - // "2016/01/21": 0 file - // "2016/04/01": 1 file (2_0_20160401010101.parquet) - // "2015/03/12": 3 files (1_0_20150312101010.parquet, 3_0_20150312101010.parquet, 4_0_20150312101010.parquet) - new File(basePath + "/2016/01/21").mkdirs(); - new File(basePath + "/2016/04/01").mkdirs(); - new File(basePath + "/2015/03/12").mkdirs(); + @Test + public void testLoadInvolvedFiles() throws IOException { + HoodieWriteConfig config = HoodieWriteConfig.newBuilder() + .withPath(basePath) + .build(); + HoodieBloomIndex index = new HoodieBloomIndex(config, jsc); - TestRawTripPayload rowChange1 = new TestRawTripPayload("{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); - HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); - TestRawTripPayload rowChange2 = new TestRawTripPayload("{\"_row_key\":\"001\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); - HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); - TestRawTripPayload rowChange3 = new TestRawTripPayload("{\"_row_key\":\"002\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); - HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); - TestRawTripPayload rowChange4 = new 
TestRawTripPayload("{\"_row_key\":\"003\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); - HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); + // Create some partitions, and put some files + // "2016/01/21": 0 file + // "2016/04/01": 1 file (2_0_20160401010101.parquet) + // "2015/03/12": 3 files (1_0_20150312101010.parquet, 3_0_20150312101010.parquet, 4_0_20150312101010.parquet) + new File(basePath + "/2016/01/21").mkdirs(); + new File(basePath + "/2016/04/01").mkdirs(); + new File(basePath + "/2015/03/12").mkdirs(); + TestRawTripPayload rowChange1 = new TestRawTripPayload( + "{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); + HoodieRecord record1 = new HoodieRecord( + new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); + TestRawTripPayload rowChange2 = new TestRawTripPayload( + "{\"_row_key\":\"001\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); + HoodieRecord record2 = new HoodieRecord( + new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); + TestRawTripPayload rowChange3 = new TestRawTripPayload( + "{\"_row_key\":\"002\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); + HoodieRecord record3 = new HoodieRecord( + new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); + TestRawTripPayload rowChange4 = new TestRawTripPayload( + "{\"_row_key\":\"003\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); + HoodieRecord record4 = new HoodieRecord( + new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); - writeParquetFile("2016/04/01","2_0_20160401010101.parquet", Lists.newArrayList(), schema, null, false); - writeParquetFile("2015/03/12","1_0_20150312101010.parquet", Lists.newArrayList(), schema, null, false); - writeParquetFile("2015/03/12","3_0_20150312101010.parquet", Arrays.asList(record1), schema, null, false); - 
writeParquetFile("2015/03/12","4_0_20150312101010.parquet", Arrays.asList(record2, record3, record4), schema, null, false); + writeParquetFile("2016/04/01", "2_0_20160401010101.parquet", Lists.newArrayList(), schema, null, + false); + writeParquetFile("2015/03/12", "1_0_20150312101010.parquet", Lists.newArrayList(), schema, null, + false); + writeParquetFile("2015/03/12", "3_0_20150312101010.parquet", Arrays.asList(record1), schema, + null, false); + writeParquetFile("2015/03/12", "4_0_20150312101010.parquet", + Arrays.asList(record2, record3, record4), schema, null, false); - List partitions = Arrays.asList("2016/01/21", "2016/04/01", "2015/03/12"); - HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); - HoodieTable table = HoodieTable.getHoodieTable(metadata, config); - List> filesList = index.loadInvolvedFiles(partitions, table); - // Still 0, as no valid commit - assertEquals(filesList.size(), 0); + List partitions = Arrays.asList("2016/01/21", "2016/04/01", "2015/03/12"); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); + HoodieTable table = HoodieTable.getHoodieTable(metadata, config); + List> filesList = index.loadInvolvedFiles(partitions, table); + // Still 0, as no valid commit + assertEquals(filesList.size(), 0); - // Add some commits - new File(basePath + "/.hoodie").mkdirs(); - new File(basePath + "/.hoodie/20160401010101.commit").createNewFile(); - new File(basePath + "/.hoodie/20150312101010.commit").createNewFile(); + // Add some commits + new File(basePath + "/.hoodie").mkdirs(); + new File(basePath + "/.hoodie/20160401010101.commit").createNewFile(); + new File(basePath + "/.hoodie/20150312101010.commit").createNewFile(); - filesList = index.loadInvolvedFiles(partitions, table); - assertEquals(filesList.size(), 4); - // these files will not have the key ranges - assertNull(filesList.get(0)._2().getMaxRecordKey()); - assertNull(filesList.get(0)._2().getMinRecordKey()); - 
assertFalse(filesList.get(1)._2().hasKeyRanges()); - assertNotNull(filesList.get(2)._2().getMaxRecordKey()); - assertNotNull(filesList.get(2)._2().getMinRecordKey()); - assertTrue(filesList.get(3)._2().hasKeyRanges()); + filesList = index.loadInvolvedFiles(partitions, table); + assertEquals(filesList.size(), 4); + // these files will not have the key ranges + assertNull(filesList.get(0)._2().getMaxRecordKey()); + assertNull(filesList.get(0)._2().getMinRecordKey()); + assertFalse(filesList.get(1)._2().hasKeyRanges()); + assertNotNull(filesList.get(2)._2().getMaxRecordKey()); + assertNotNull(filesList.get(2)._2().getMinRecordKey()); + assertTrue(filesList.get(3)._2().hasKeyRanges()); - // no longer sorted, but should have same files. + // no longer sorted, but should have same files. - List> expected = Arrays.asList( - new Tuple2<>("2016/04/01", new BloomIndexFileInfo("2_0_20160401010101.parquet")), - new Tuple2<>("2015/03/12",new BloomIndexFileInfo("1_0_20150312101010.parquet")), - new Tuple2<>("2015/03/12",new BloomIndexFileInfo("3_0_20150312101010.parquet", "000", "000")), - new Tuple2<>("2015/03/12",new BloomIndexFileInfo("4_0_20150312101010.parquet", "001", "003")) - ); - assertEquals(expected, filesList); - } + List> expected = Arrays.asList( + new Tuple2<>("2016/04/01", new BloomIndexFileInfo("2_0_20160401010101.parquet")), + new Tuple2<>("2015/03/12", new BloomIndexFileInfo("1_0_20150312101010.parquet")), + new Tuple2<>("2015/03/12", + new BloomIndexFileInfo("3_0_20150312101010.parquet", "000", "000")), + new Tuple2<>("2015/03/12", + new BloomIndexFileInfo("4_0_20150312101010.parquet", "001", "003")) + ); + assertEquals(expected, filesList); + } - @Test - public void testRangePruning() { + @Test + public void testRangePruning() { - HoodieWriteConfig config = HoodieWriteConfig.newBuilder() - .withPath(basePath) - .build(); - HoodieBloomIndex index = new HoodieBloomIndex(config, jsc); + HoodieWriteConfig config = HoodieWriteConfig.newBuilder() + 
.withPath(basePath) + .build(); + HoodieBloomIndex index = new HoodieBloomIndex(config, jsc); + final Map> partitionToFileIndexInfo = new HashMap<>(); + partitionToFileIndexInfo.put("2017/10/22", Arrays.asList( + new BloomIndexFileInfo("f1"), + new BloomIndexFileInfo("f2", "000", "000"), + new BloomIndexFileInfo("f3", "001", "003"), + new BloomIndexFileInfo("f4", "002", "007"), + new BloomIndexFileInfo("f5", "009", "010") + )); - final Map> partitionToFileIndexInfo = new HashMap<>(); - partitionToFileIndexInfo.put("2017/10/22", Arrays.asList( - new BloomIndexFileInfo("f1"), - new BloomIndexFileInfo("f2", "000", "000"), - new BloomIndexFileInfo("f3", "001", "003"), - new BloomIndexFileInfo("f4", "002", "007"), - new BloomIndexFileInfo("f5", "009", "010") + JavaPairRDD partitionRecordKeyPairRDD = jsc + .parallelize(Arrays.asList( + new Tuple2<>("2017/10/22", "003"), + new Tuple2<>("2017/10/22", "002"), + new Tuple2<>("2017/10/22", "005"), + new Tuple2<>("2017/10/22", "004") + )) + .mapToPair(t -> t); + + List>> comparisonKeyList = index + .explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD) + .collect(); + + assertEquals(10, comparisonKeyList.size()); + Map> recordKeyToFileComps = comparisonKeyList.stream() + .collect(Collectors.groupingBy( + t -> t._2()._2().getRecordKey(), + Collectors.mapping(t -> t._2()._1().split("#")[0], Collectors.toList() + ) )); - JavaPairRDD partitionRecordKeyPairRDD = jsc - .parallelize(Arrays.asList( - new Tuple2<>("2017/10/22","003"), - new Tuple2<>("2017/10/22","002"), - new Tuple2<>("2017/10/22","005"), - new Tuple2<>("2017/10/22","004") - )) - .mapToPair(t -> t); + assertEquals(4, recordKeyToFileComps.size()); + assertEquals(Arrays.asList("f1", "f3", "f4"), recordKeyToFileComps.get("002")); + assertEquals(Arrays.asList("f1", "f3", "f4"), recordKeyToFileComps.get("003")); + assertEquals(Arrays.asList("f1", "f4"), recordKeyToFileComps.get("004")); + assertEquals(Arrays.asList("f1", "f4"), 
recordKeyToFileComps.get("005")); + } + + @Test + public void testCheckUUIDsAgainstOneFile() + throws IOException, InterruptedException, ClassNotFoundException { + + // Create some records to use + String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; + String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; + String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; + String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":32}"; + TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1); + HoodieRecord record1 = new HoodieRecord( + new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); + TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2); + HoodieRecord record2 = new HoodieRecord( + new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); + TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3); + HoodieRecord record3 = new HoodieRecord( + new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); + TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4); + HoodieRecord record4 = new HoodieRecord( + new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); + + // We write record1, record2 to a parquet file, but the bloom filter contains (record1, record2, record3). 
+ BloomFilter filter = new BloomFilter(10000, 0.0000001); + filter.add(record3.getRecordKey()); + String filename = writeParquetFile("2016/01/31", Arrays.asList(record1, record2), schema, + filter, true); + + // The bloom filter contains 3 records + assertTrue(filter.mightContain(record1.getRecordKey())); + assertTrue(filter.mightContain(record2.getRecordKey())); + assertTrue(filter.mightContain(record3.getRecordKey())); + assertFalse(filter.mightContain(record4.getRecordKey())); + + // Compare with file + List uuids = Arrays.asList(record1.getRecordKey(), record2.getRecordKey(), + record3.getRecordKey(), record4.getRecordKey()); + + List results = HoodieBloomIndexCheckFunction.checkCandidatesAgainstFile(uuids, + new Path(basePath + "/2016/01/31/" + filename)); + assertEquals(results.size(), 2); + assertTrue(results.get(0).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0") + || results.get(1).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")); + assertTrue(results.get(0).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0") + || results.get(1).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")); + // TODO(vc): Need more coverage on actual filenames + //assertTrue(results.get(0)._2().equals(filename)); + //assertTrue(results.get(1)._2().equals(filename)); + } + + @Test + public void testTagLocationWithEmptyRDD() throws Exception { + // We have some records to be tagged (two different partitions) + JavaRDD recordRDD = jsc.emptyRDD(); + // Also create the metadata and config + HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); + HoodieTable table = HoodieTable.getHoodieTable(metadata, config); + + // Let's tag + HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, jsc); + + try { + bloomIndex.tagLocation(recordRDD, table); + } catch (IllegalArgumentException e) { + fail( + "EmptyRDD should not result in IllegalArgumentException: Positive number of slices required"); + } 
+ } - List>> comparisonKeyList = index - .explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD) - .collect(); + @Test + public void testTagLocation() throws Exception { + // We have some records to be tagged (two different partitions) - assertEquals(10, comparisonKeyList.size()); - Map> recordKeyToFileComps = comparisonKeyList.stream() - .collect(Collectors.groupingBy( - t -> t._2()._2().getRecordKey(), - Collectors.mapping(t -> t._2()._1().split("#")[0], Collectors.toList() - ) - )); + String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; + String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; + String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; + String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}"; + TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1); + HoodieRecord record1 = new HoodieRecord( + new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); + TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2); + HoodieRecord record2 = new HoodieRecord( + new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); + TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3); + HoodieRecord record3 = new HoodieRecord( + new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); + TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4); + HoodieRecord record4 = new HoodieRecord( + new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); + JavaRDD recordRDD = jsc + .parallelize(Arrays.asList(record1, record2, record3, record4)); - assertEquals(4, recordKeyToFileComps.size()); - 
assertEquals(Arrays.asList("f1", "f3", "f4"), recordKeyToFileComps.get("002")); - assertEquals(Arrays.asList("f1", "f3", "f4"), recordKeyToFileComps.get("003")); - assertEquals(Arrays.asList("f1", "f4"), recordKeyToFileComps.get("004")); - assertEquals(Arrays.asList("f1", "f4"), recordKeyToFileComps.get("005")); + // Also create the metadata and config + HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); + HoodieTable table = HoodieTable.getHoodieTable(metadata, config); + + // Let's tag + HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, jsc); + JavaRDD taggedRecordRDD = bloomIndex.tagLocation(recordRDD, table); + + // Should not find any files + for (HoodieRecord record : taggedRecordRDD.collect()) { + assertTrue(!record.isCurrentLocationKnown()); } - @Test - public void testCheckUUIDsAgainstOneFile() throws IOException, InterruptedException, ClassNotFoundException { + // We create three parquet file, each having one record. 
(two different partitions) + String filename1 = writeParquetFile("2016/01/31", Arrays.asList(record1), schema, null, true); + String filename2 = writeParquetFile("2016/01/31", Arrays.asList(record2), schema, null, true); + String filename3 = writeParquetFile("2015/01/31", Arrays.asList(record4), schema, null, true); - // Create some records to use - String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; - String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; - String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; - String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":32}"; - TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1); - HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); - TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2); - HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); - TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3); - HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); - TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4); - HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); + // We do the tag again + metadata = new HoodieTableMetaClient(fs, basePath); + table = HoodieTable.getHoodieTable(metadata, config); + taggedRecordRDD = bloomIndex.tagLocation(recordRDD, table); - // We write record1, record2 to a parquet file, but the bloom filter contains (record1, record2, record3). 
- BloomFilter filter = new BloomFilter(10000, 0.0000001); - filter.add(record3.getRecordKey()); - String filename = writeParquetFile("2016/01/31", Arrays.asList(record1, record2), schema, filter, true); + // Check results + for (HoodieRecord record : taggedRecordRDD.collect()) { + if (record.getRecordKey().equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")) { + assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename1))); + } else if (record.getRecordKey().equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")) { + assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename2))); + } else if (record.getRecordKey().equals("3eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) { + assertTrue(!record.isCurrentLocationKnown()); + } else if (record.getRecordKey().equals("4eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) { + assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename3))); + } + } + } - // The bloom filter contains 3 records - assertTrue(filter.mightContain(record1.getRecordKey())); - assertTrue(filter.mightContain(record2.getRecordKey())); - assertTrue(filter.mightContain(record3.getRecordKey())); - assertFalse(filter.mightContain(record4.getRecordKey())); + @Test + public void testCheckExists() throws Exception { + // We have some records to be tagged (two different partitions) - // Compare with file - List uuids = Arrays.asList(record1.getRecordKey(), record2.getRecordKey(), - record3.getRecordKey(), record4.getRecordKey()); + String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; + String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; + String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; + String recordStr4 = 
"{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}"; + TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1); + HoodieKey key1 = new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()); + HoodieRecord record1 = new HoodieRecord(key1, rowChange1); + TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2); + HoodieKey key2 = new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()); + HoodieRecord record2 = new HoodieRecord(key2, rowChange2); + TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3); + HoodieKey key3 = new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()); + HoodieRecord record3 = new HoodieRecord(key3, rowChange3); + TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4); + HoodieKey key4 = new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()); + HoodieRecord record4 = new HoodieRecord(key4, rowChange4); + JavaRDD keysRDD = jsc.parallelize(Arrays.asList(key1, key2, key3, key4)); - List results = HoodieBloomIndexCheckFunction.checkCandidatesAgainstFile(uuids, - new Path(basePath + "/2016/01/31/" + filename)); - assertEquals(results.size(), 2); - assertTrue(results.get(0).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0") - || results.get(1).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")); - assertTrue(results.get(0).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0") - || results.get(1).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")); - // TODO(vc): Need more coverage on actual filenames - //assertTrue(results.get(0)._2().equals(filename)); - //assertTrue(results.get(1)._2().equals(filename)); + // Also create the metadata and config + HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); + HoodieTable table = HoodieTable.getHoodieTable(metadata, config); + + // Let's tag + HoodieBloomIndex 
bloomIndex = new HoodieBloomIndex(config, jsc); + JavaPairRDD> taggedRecordRDD = bloomIndex + .fetchRecordLocation(keysRDD, table); + + // Should not find any files + for (Tuple2> record : taggedRecordRDD.collect()) { + assertTrue(!record._2.isPresent()); } - @Test - public void testTagLocationWithEmptyRDD() throws Exception { - // We have some records to be tagged (two different partitions) - JavaRDD recordRDD = jsc.emptyRDD(); - // Also create the metadata and config - HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); - HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); - HoodieTable table = HoodieTable.getHoodieTable(metadata, config); + // We create three parquet file, each having one record. (two different partitions) + String filename1 = writeParquetFile("2016/01/31", Arrays.asList(record1), schema, null, true); + String filename2 = writeParquetFile("2016/01/31", Arrays.asList(record2), schema, null, true); + String filename3 = writeParquetFile("2015/01/31", Arrays.asList(record4), schema, null, true); - // Let's tag - HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, jsc); + // We do the tag again + metadata = new HoodieTableMetaClient(fs, basePath); + table = HoodieTable.getHoodieTable(metadata, config); + taggedRecordRDD = bloomIndex.fetchRecordLocation(keysRDD, table); - try { - bloomIndex.tagLocation(recordRDD, table); - } catch (IllegalArgumentException e) { - fail("EmptyRDD should not result in IllegalArgumentException: Positive number of slices required"); - } + // Check results + for (Tuple2> record : taggedRecordRDD.collect()) { + if (record._1.getRecordKey().equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")) { + assertTrue(record._2.isPresent()); + Path path1 = new Path(record._2.get()); + assertEquals(FSUtils.getFileId(filename1), FSUtils.getFileId(path1.getName())); + } else if (record._1.getRecordKey().equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")) { + 
assertTrue(record._2.isPresent()); + Path path2 = new Path(record._2.get()); + assertEquals(FSUtils.getFileId(filename2), FSUtils.getFileId(path2.getName())); + } else if (record._1.getRecordKey().equals("3eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) { + assertTrue(!record._2.isPresent()); + } else if (record._1.getRecordKey().equals("4eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) { + assertTrue(record._2.isPresent()); + Path path3 = new Path(record._2.get()); + assertEquals(FSUtils.getFileId(filename3), FSUtils.getFileId(path3.getName())); + } } + } - @Test - public void testTagLocation() throws Exception { - // We have some records to be tagged (two different partitions) + @Test + public void testBloomFilterFalseError() throws IOException, InterruptedException { + // We have two hoodie records + String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; + String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; - String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; - String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; - String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; - String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}"; - TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1); - HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); - TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2); - HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); - 
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3); - HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); - TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4); - HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); - JavaRDD recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4)); + // We write record1 to a parquet file, using a bloom filter having both records + TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1); + HoodieRecord record1 = new HoodieRecord( + new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); + TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2); + HoodieRecord record2 = new HoodieRecord( + new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); - // Also create the metadata and config - HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); - HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); - HoodieTable table = HoodieTable.getHoodieTable(metadata, config); + BloomFilter filter = new BloomFilter(10000, 0.0000001); + filter.add(record2.getRecordKey()); + String filename = writeParquetFile("2016/01/31", Arrays.asList(record1), schema, filter, true); + assertTrue(filter.mightContain(record1.getRecordKey())); + assertTrue(filter.mightContain(record2.getRecordKey())); - // Let's tag - HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, jsc); - JavaRDD taggedRecordRDD = bloomIndex.tagLocation(recordRDD, table); + // We do the tag + JavaRDD recordRDD = jsc.parallelize(Arrays.asList(record1, record2)); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); + HoodieTable table = 
HoodieTable.getHoodieTable(metadata, config); - // Should not find any files - for (HoodieRecord record : taggedRecordRDD.collect()) { - assertTrue(!record.isCurrentLocationKnown()); - } + HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, jsc); + JavaRDD taggedRecordRDD = bloomIndex.tagLocation(recordRDD, table); - // We create three parquet file, each having one record. (two different partitions) - String filename1 = writeParquetFile("2016/01/31", Arrays.asList(record1), schema, null, true); - String filename2 = writeParquetFile("2016/01/31", Arrays.asList(record2), schema, null, true); - String filename3 = writeParquetFile("2015/01/31", Arrays.asList(record4), schema, null, true); - - // We do the tag again - metadata = new HoodieTableMetaClient(fs, basePath); - table = HoodieTable.getHoodieTable(metadata, config); - - taggedRecordRDD = bloomIndex.tagLocation(recordRDD, table); - - // Check results - for (HoodieRecord record : taggedRecordRDD.collect()) { - if (record.getRecordKey().equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")) { - assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename1))); - } else if (record.getRecordKey().equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")) { - assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename2))); - } else if (record.getRecordKey().equals("3eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) { - assertTrue(!record.isCurrentLocationKnown()); - } else if (record.getRecordKey().equals("4eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) { - assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename3))); - } - } + // Check results + for (HoodieRecord record : taggedRecordRDD.collect()) { + if (record.getKey().equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")) { + assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename))); + } else if (record.getRecordKey().equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")) { + 
assertFalse(record.isCurrentLocationKnown()); + } } + } - @Test - public void testCheckExists() throws Exception { - // We have some records to be tagged (two different partitions) + private String writeParquetFile(String partitionPath, List records, Schema schema, + BloomFilter filter, boolean createCommitTime) throws IOException, InterruptedException { + Thread.sleep(1000); + String commitTime = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date()); + String fileId = UUID.randomUUID().toString(); + String filename = FSUtils.makeDataFileName(commitTime, 1, fileId); - String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; - String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; - String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; - String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}"; - TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1); - HoodieKey key1 = new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()); - HoodieRecord record1 = new HoodieRecord(key1, rowChange1); - TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2); - HoodieKey key2 = new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()); - HoodieRecord record2 = new HoodieRecord(key2, rowChange2); - TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3); - HoodieKey key3 = new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()); - HoodieRecord record3 = new HoodieRecord(key3, rowChange3); - TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4); - HoodieKey key4 = new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()); - HoodieRecord record4 = new HoodieRecord(key4, rowChange4); - JavaRDD 
keysRDD = jsc.parallelize(Arrays.asList(key1, key2, key3, key4)); + return writeParquetFile(partitionPath, filename, records, schema, filter, createCommitTime); + } - // Also create the metadata and config - HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); - HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); - HoodieTable table = HoodieTable.getHoodieTable(metadata, config); + private String writeParquetFile(String partitionPath, String filename, List records, + Schema schema, + BloomFilter filter, boolean createCommitTime) throws IOException { - // Let's tag - HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, jsc); - JavaPairRDD> taggedRecordRDD = bloomIndex.fetchRecordLocation(keysRDD, table); - - // Should not find any files - for (Tuple2> record : taggedRecordRDD.collect()) { - assertTrue(!record._2.isPresent()); - } - - // We create three parquet file, each having one record. (two different partitions) - String filename1 = writeParquetFile("2016/01/31", Arrays.asList(record1), schema, null, true); - String filename2 = writeParquetFile("2016/01/31", Arrays.asList(record2), schema, null, true); - String filename3 = writeParquetFile("2015/01/31", Arrays.asList(record4), schema, null, true); - - // We do the tag again - metadata = new HoodieTableMetaClient(fs, basePath); - table = HoodieTable.getHoodieTable(metadata, config); - taggedRecordRDD = bloomIndex.fetchRecordLocation(keysRDD, table); - - // Check results - for (Tuple2> record : taggedRecordRDD.collect()) { - if (record._1.getRecordKey().equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")) { - assertTrue(record._2.isPresent()); - Path path1 = new Path(record._2.get()); - assertEquals(FSUtils.getFileId(filename1), FSUtils.getFileId(path1.getName())); - } else if (record._1.getRecordKey().equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")) { - assertTrue(record._2.isPresent()); - Path path2 = new Path(record._2.get()); - 
assertEquals(FSUtils.getFileId(filename2), FSUtils.getFileId(path2.getName())); - } else if (record._1.getRecordKey().equals("3eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) { - assertTrue(!record._2.isPresent()); - } else if (record._1.getRecordKey().equals("4eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) { - assertTrue(record._2.isPresent()); - Path path3 = new Path(record._2.get()); - assertEquals(FSUtils.getFileId(filename3), FSUtils.getFileId(path3.getName())); - } - } + if (filter == null) { + filter = new BloomFilter(10000, 0.0000001); } - - - @Test - public void testBloomFilterFalseError() throws IOException, InterruptedException { - // We have two hoodie records - String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; - String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; - - // We write record1 to a parquet file, using a bloom filter having both records - TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1); - HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); - TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2); - HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); - - BloomFilter filter = new BloomFilter(10000, 0.0000001); - filter.add(record2.getRecordKey()); - String filename = writeParquetFile("2016/01/31", Arrays.asList(record1), schema, filter, true); - assertTrue(filter.mightContain(record1.getRecordKey())); - assertTrue(filter.mightContain(record2.getRecordKey())); - - // We do the tag - JavaRDD recordRDD = jsc.parallelize(Arrays.asList(record1, record2)); - HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); - HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); - HoodieTable table = 
HoodieTable.getHoodieTable(metadata, config); - - HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, jsc); - JavaRDD taggedRecordRDD = bloomIndex.tagLocation(recordRDD, table); - - // Check results - for (HoodieRecord record : taggedRecordRDD.collect()) { - if (record.getKey().equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")) { - assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename))); - } else if (record.getRecordKey().equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")) { - assertFalse(record.isCurrentLocationKnown()); - } - } + HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport( + new AvroSchemaConverter().convert(schema), schema, filter); + String commitTime = FSUtils.getCommitTime(filename); + HoodieParquetConfig config = new HoodieParquetConfig(writeSupport, CompressionCodecName.GZIP, + ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, 120 * 1024 * 1024, + new Configuration()); + HoodieParquetWriter writer = new HoodieParquetWriter( + commitTime, + new Path(basePath + "/" + partitionPath + "/" + filename), + config, + schema); + int seqId = 1; + for (HoodieRecord record : records) { + GenericRecord avroRecord = (GenericRecord) record.getData().getInsertValue(schema).get(); + HoodieAvroUtils.addCommitMetadataToRecord(avroRecord, commitTime, "" + seqId++); + HoodieAvroUtils + .addHoodieKeyToRecord(avroRecord, record.getRecordKey(), record.getPartitionPath(), + filename); + writer.writeAvro(record.getRecordKey(), avroRecord); + filter.add(record.getRecordKey()); } + writer.close(); - private String writeParquetFile(String partitionPath, List records, Schema schema, - BloomFilter filter, boolean createCommitTime) throws IOException, InterruptedException { - Thread.sleep(1000); - String commitTime = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date()); - String fileId = UUID.randomUUID().toString(); - String filename = FSUtils.makeDataFileName(commitTime, 1, fileId); - - - return 
writeParquetFile(partitionPath, filename, records, schema, filter, createCommitTime); + if (createCommitTime) { + // Also make sure the commit is valid + new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME).mkdirs(); + new File( + basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitTime + ".commit") + .createNewFile(); } + return filename; + } - private String writeParquetFile(String partitionPath, String filename, List records, Schema schema, - BloomFilter filter, boolean createCommitTime) throws IOException { - - - if (filter == null) { - filter = new BloomFilter(10000, 0.0000001); - } - HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, filter); - String commitTime = FSUtils.getCommitTime(filename); - HoodieParquetConfig config = new HoodieParquetConfig(writeSupport, CompressionCodecName.GZIP, - ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, 120 * 1024 * 1024, new Configuration()); - HoodieParquetWriter writer = new HoodieParquetWriter( - commitTime, - new Path(basePath + "/" + partitionPath + "/" + filename), - config, - schema); - int seqId = 1; - for (HoodieRecord record : records) { - GenericRecord avroRecord = (GenericRecord) record.getData().getInsertValue(schema).get(); - HoodieAvroUtils.addCommitMetadataToRecord(avroRecord, commitTime, "" + seqId++); - HoodieAvroUtils.addHoodieKeyToRecord(avroRecord, record.getRecordKey(), record.getPartitionPath(), filename); - writer.writeAvro(record.getRecordKey(), avroRecord); - filter.add(record.getRecordKey()); - } - writer.close(); - - if (createCommitTime) { - // Also make sure the commit is valid - new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME).mkdirs(); - new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitTime + ".commit").createNewFile(); - } - return filename; + @After + public void clean() { + if (jsc != null) { + jsc.stop(); } - - @After - public void 
clean() { - if (jsc != null) { - jsc.stop(); - } - if (basePath != null) { - new File(basePath).delete(); - } + if (basePath != null) { + new File(basePath).delete(); } + } } diff --git a/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCommitArchiveLog.java b/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCommitArchiveLog.java index 690f1d89d..fb19bf7e8 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCommitArchiveLog.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCommitArchiveLog.java @@ -16,9 +16,11 @@ package com.uber.hoodie.io; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + import com.uber.hoodie.avro.model.HoodieArchivedMetaEntry; import com.uber.hoodie.common.HoodieTestDataGenerator; -import com.uber.hoodie.common.model.HoodieArchivedLogFile; import com.uber.hoodie.common.model.HoodieLogFile; import com.uber.hoodie.common.model.HoodieTestUtils; import com.uber.hoodie.common.table.HoodieTableMetaClient; @@ -29,6 +31,11 @@ import com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.config.HoodieCompactionConfig; import com.uber.hoodie.config.HoodieWriteConfig; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; import org.apache.hadoop.fs.FileSystem; @@ -37,197 +44,196 @@ import org.junit.Before; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.stream.Collectors; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - public class TestHoodieCommitArchiveLog { - private String basePath; - private FileSystem fs; - @Before - 
public void init() throws Exception { - TemporaryFolder folder = new TemporaryFolder(); - folder.create(); - basePath = folder.getRoot().getAbsolutePath(); - HoodieTestUtils.init(basePath); - fs = FSUtils.getFs(); + private String basePath; + private FileSystem fs; + + @Before + public void init() throws Exception { + TemporaryFolder folder = new TemporaryFolder(); + folder.create(); + basePath = folder.getRoot().getAbsolutePath(); + HoodieTestUtils.init(basePath); + fs = FSUtils.getFs(); + } + + @Test + public void testArchiveEmptyDataset() throws IOException { + HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) + .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) + .forTable("test-trip-table").build(); + HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg, fs); + boolean result = archiveLog.archiveIfRequired(); + assertTrue(result); + } + + @Test + public void testArchiveDatasetWithArchival() throws IOException { + HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) + .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) + .withCompactionConfig(HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 4).build()) + .forTable("test-trip-table").build(); + HoodieTestUtils.init(basePath); + HoodieTestDataGenerator.createCommitFile(basePath, "100"); + HoodieTestDataGenerator.createCommitFile(basePath, "101"); + HoodieTestDataGenerator.createCommitFile(basePath, "102"); + HoodieTestDataGenerator.createCommitFile(basePath, "103"); + HoodieTestDataGenerator.createCommitFile(basePath, "104"); + HoodieTestDataGenerator.createCommitFile(basePath, "105"); + + HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); + HoodieTimeline timeline = + metadata.getActiveTimeline().getCommitsAndCompactionsTimeline().filterCompletedInstants(); + + assertEquals("Loaded 6 commits and the count should match", 6, timeline.countInstants()); + + 
HoodieTestUtils.createCleanFiles(basePath, "100"); + HoodieTestUtils.createCleanFiles(basePath, "101"); + HoodieTestUtils.createCleanFiles(basePath, "102"); + HoodieTestUtils.createCleanFiles(basePath, "103"); + HoodieTestUtils.createCleanFiles(basePath, "104"); + HoodieTestUtils.createCleanFiles(basePath, "105"); + + //reload the timeline and get all the commmits before archive + timeline = metadata.getActiveTimeline().reload().getAllCommitsTimeline() + .filterCompletedInstants(); + List originalCommits = timeline.getInstants().collect(Collectors.toList()); + + assertEquals("Loaded 6 commits and the count should match", 12, timeline.countInstants()); + + HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg, fs); + + assertTrue(archiveLog.archiveIfRequired()); + + //reload the timeline and remove the remaining commits + timeline = metadata.getActiveTimeline().reload().getAllCommitsTimeline() + .filterCompletedInstants(); + originalCommits.removeAll(timeline.getInstants().collect(Collectors.toList())); + + //read the file + HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(), + new HoodieLogFile(new Path(basePath + "/.hoodie/.commits_.archive.1")), + HoodieArchivedMetaEntry.getClassSchema(), false); + + int archivedRecordsCount = 0; + List readRecords = new ArrayList<>(); + //read the avro blocks and validate the number of records written in each avro block + while (reader.hasNext()) { + HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next(); + List records = blk.getRecords(); + readRecords.addAll(records); + assertEquals("Archived and read records for each block are same", 8, records.size()); + archivedRecordsCount += records.size(); } + assertEquals("Total archived records and total read records are the same count", 8, + archivedRecordsCount); - @Test - public void testArchiveEmptyDataset() throws IOException { - HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) - 
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) - .forTable("test-trip-table").build(); - HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg, fs); - boolean result = archiveLog.archiveIfRequired(); - assertTrue(result); - } + //make sure the archived commits are the same as the (originalcommits - commitsleft) + List readCommits = readRecords.stream().map(r -> (GenericRecord) r).map(r -> { + return r.get("commitTime").toString(); + }).collect(Collectors.toList()); + Collections.sort(readCommits); - @Test - public void testArchiveDatasetWithArchival() throws IOException { - HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) - .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) - .withCompactionConfig(HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 4).build()) - .forTable("test-trip-table").build(); - HoodieTestUtils.init(basePath); - HoodieTestDataGenerator.createCommitFile(basePath, "100"); - HoodieTestDataGenerator.createCommitFile(basePath, "101"); - HoodieTestDataGenerator.createCommitFile(basePath, "102"); - HoodieTestDataGenerator.createCommitFile(basePath, "103"); - HoodieTestDataGenerator.createCommitFile(basePath, "104"); - HoodieTestDataGenerator.createCommitFile(basePath, "105"); + assertEquals( + "Read commits map should match the originalCommits - commitsLoadedFromArchival", + originalCommits.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toList()), + readCommits); + } - HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); - HoodieTimeline timeline = - metadata.getActiveTimeline().getCommitsAndCompactionsTimeline().filterCompletedInstants(); + @Test + public void testArchiveDatasetWithNoArchival() throws IOException { + HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) + .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) + 
.forTable("test-trip-table").withCompactionConfig( + HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 5).build()).build(); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); + HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg, fs); + HoodieTestDataGenerator.createCommitFile(basePath, "100"); + HoodieTestDataGenerator.createCommitFile(basePath, "101"); + HoodieTestDataGenerator.createCommitFile(basePath, "102"); + HoodieTestDataGenerator.createCommitFile(basePath, "103"); - assertEquals("Loaded 6 commits and the count should match", 6, timeline.countInstants()); + HoodieTimeline timeline = + metadata.getActiveTimeline().getCommitsAndCompactionsTimeline().filterCompletedInstants(); - HoodieTestUtils.createCleanFiles(basePath, "100"); - HoodieTestUtils.createCleanFiles(basePath, "101"); - HoodieTestUtils.createCleanFiles(basePath, "102"); - HoodieTestUtils.createCleanFiles(basePath, "103"); - HoodieTestUtils.createCleanFiles(basePath, "104"); - HoodieTestUtils.createCleanFiles(basePath, "105"); + assertEquals("Loaded 4 commits and the count should match", 4, timeline.countInstants()); + boolean result = archiveLog.archiveIfRequired(); + assertTrue(result); + timeline = + metadata.getActiveTimeline().reload().getCommitsAndCompactionsTimeline() + .filterCompletedInstants(); + assertEquals("Should not archive commits when maxCommitsToKeep is 5", 4, + timeline.countInstants()); + } - //reload the timeline and get all the commmits before archive - timeline = metadata.getActiveTimeline().reload().getAllCommitsTimeline().filterCompletedInstants(); - List originalCommits = timeline.getInstants().collect(Collectors.toList()); + @Test + public void testArchiveCommitSafety() throws IOException { + HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) + .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) + .forTable("test-trip-table").withCompactionConfig( + 
HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 5).build()).build(); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); + HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg, fs); + HoodieTestDataGenerator.createCommitFile(basePath, "100"); + HoodieTestDataGenerator.createCommitFile(basePath, "101"); + HoodieTestDataGenerator.createCommitFile(basePath, "102"); + HoodieTestDataGenerator.createCommitFile(basePath, "103"); + HoodieTestDataGenerator.createCommitFile(basePath, "104"); + HoodieTestDataGenerator.createCommitFile(basePath, "105"); - assertEquals("Loaded 6 commits and the count should match", 12, timeline.countInstants()); + HoodieTimeline timeline = + metadata.getActiveTimeline().getCommitsAndCompactionsTimeline().filterCompletedInstants(); + assertEquals("Loaded 6 commits and the count should match", 6, timeline.countInstants()); + boolean result = archiveLog.archiveIfRequired(); + assertTrue(result); + timeline = + metadata.getActiveTimeline().reload().getCommitsAndCompactionsTimeline() + .filterCompletedInstants(); + assertTrue("Archived commits should always be safe", + timeline.containsOrBeforeTimelineStarts("100")); + assertTrue("Archived commits should always be safe", + timeline.containsOrBeforeTimelineStarts("101")); + assertTrue("Archived commits should always be safe", + timeline.containsOrBeforeTimelineStarts("102")); + assertTrue("Archived commits should always be safe", + timeline.containsOrBeforeTimelineStarts("103")); + } - HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg, fs); + @Test + public void testArchiveCommitSavepointNoHole() throws IOException { + HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) + .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) + .forTable("test-trip-table").withCompactionConfig( + HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 5).build()).build(); + HoodieTableMetaClient metadata = 
new HoodieTableMetaClient(fs, basePath); + HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg, fs); + HoodieTestDataGenerator.createCommitFile(basePath, "100"); + HoodieTestDataGenerator.createCommitFile(basePath, "101"); + HoodieTestDataGenerator.createSavepointFile(basePath, "101"); + HoodieTestDataGenerator.createCommitFile(basePath, "102"); + HoodieTestDataGenerator.createCommitFile(basePath, "103"); + HoodieTestDataGenerator.createCommitFile(basePath, "104"); + HoodieTestDataGenerator.createCommitFile(basePath, "105"); - assertTrue(archiveLog.archiveIfRequired()); - - //reload the timeline and remove the remaining commits - timeline = metadata.getActiveTimeline().reload().getAllCommitsTimeline().filterCompletedInstants(); - originalCommits.removeAll(timeline.getInstants().collect(Collectors.toList())); - - //read the file - HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(), - new HoodieLogFile(new Path(basePath + "/.hoodie/.commits_.archive.1")), HoodieArchivedMetaEntry.getClassSchema(), false); - - int archivedRecordsCount = 0; - List readRecords = new ArrayList<>(); - //read the avro blocks and validate the number of records written in each avro block - while(reader.hasNext()) { - HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next(); - List records = blk.getRecords(); - readRecords.addAll(records); - assertEquals("Archived and read records for each block are same", 8, records.size()); - archivedRecordsCount += records.size(); - } - assertEquals("Total archived records and total read records are the same count", 8, archivedRecordsCount); - - //make sure the archived commits are the same as the (originalcommits - commitsleft) - List readCommits = readRecords.stream().map(r -> (GenericRecord)r).map(r -> { - return r.get("commitTime").toString(); - }).collect(Collectors.toList()); - Collections.sort(readCommits); - - assertEquals( - "Read commits map should match the originalCommits - commitsLoadedFromArchival", - 
originalCommits.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toList()), - readCommits); - } - - @Test - public void testArchiveDatasetWithNoArchival() throws IOException { - HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) - .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) - .forTable("test-trip-table").withCompactionConfig( - HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 5).build()).build(); - HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); - HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg, fs); - HoodieTestDataGenerator.createCommitFile(basePath, "100"); - HoodieTestDataGenerator.createCommitFile(basePath, "101"); - HoodieTestDataGenerator.createCommitFile(basePath, "102"); - HoodieTestDataGenerator.createCommitFile(basePath, "103"); - - HoodieTimeline timeline = - metadata.getActiveTimeline().getCommitsAndCompactionsTimeline().filterCompletedInstants(); - - assertEquals("Loaded 4 commits and the count should match", 4, timeline.countInstants()); - boolean result = archiveLog.archiveIfRequired(); - assertTrue(result); - timeline = - metadata.getActiveTimeline().reload().getCommitsAndCompactionsTimeline().filterCompletedInstants(); - assertEquals("Should not archive commits when maxCommitsToKeep is 5", 4, - timeline.countInstants()); - } - - @Test - public void testArchiveCommitSafety() throws IOException { - HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) - .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) - .forTable("test-trip-table").withCompactionConfig( - HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 5).build()).build(); - HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); - HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg, fs); - HoodieTestDataGenerator.createCommitFile(basePath, "100"); - 
HoodieTestDataGenerator.createCommitFile(basePath, "101"); - HoodieTestDataGenerator.createCommitFile(basePath, "102"); - HoodieTestDataGenerator.createCommitFile(basePath, "103"); - HoodieTestDataGenerator.createCommitFile(basePath, "104"); - HoodieTestDataGenerator.createCommitFile(basePath, "105"); - - HoodieTimeline timeline = - metadata.getActiveTimeline().getCommitsAndCompactionsTimeline().filterCompletedInstants(); - assertEquals("Loaded 6 commits and the count should match", 6, timeline.countInstants()); - boolean result = archiveLog.archiveIfRequired(); - assertTrue(result); - timeline = - metadata.getActiveTimeline().reload().getCommitsAndCompactionsTimeline().filterCompletedInstants(); - assertTrue("Archived commits should always be safe", - timeline.containsOrBeforeTimelineStarts("100")); - assertTrue("Archived commits should always be safe", - timeline.containsOrBeforeTimelineStarts("101")); - assertTrue("Archived commits should always be safe", - timeline.containsOrBeforeTimelineStarts("102")); - assertTrue("Archived commits should always be safe", - timeline.containsOrBeforeTimelineStarts("103")); - } - - @Test - public void testArchiveCommitSavepointNoHole() throws IOException { - HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) - .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) - .forTable("test-trip-table").withCompactionConfig( - HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 5).build()).build(); - HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); - HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg, fs); - HoodieTestDataGenerator.createCommitFile(basePath, "100"); - HoodieTestDataGenerator.createCommitFile(basePath, "101"); - HoodieTestDataGenerator.createSavepointFile(basePath, "101"); - HoodieTestDataGenerator.createCommitFile(basePath, "102"); - HoodieTestDataGenerator.createCommitFile(basePath, "103"); - 
HoodieTestDataGenerator.createCommitFile(basePath, "104"); - HoodieTestDataGenerator.createCommitFile(basePath, "105"); - - HoodieTimeline timeline = - metadata.getActiveTimeline().getCommitsAndCompactionsTimeline().filterCompletedInstants(); - assertEquals("Loaded 6 commits and the count should match", 6, timeline.countInstants()); - boolean result = archiveLog.archiveIfRequired(); - assertTrue(result); - timeline = - metadata.getActiveTimeline().reload().getCommitsAndCompactionsTimeline().filterCompletedInstants(); - assertEquals( - "Since we have a savepoint at 101, we should never archive any commit after 101 (we only archive 100)", - 5, timeline.countInstants()); - assertTrue("Archived commits should always be safe", - timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "101"))); - assertTrue("Archived commits should always be safe", - timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "102"))); - assertTrue("Archived commits should always be safe", - timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "103"))); - } + HoodieTimeline timeline = + metadata.getActiveTimeline().getCommitsAndCompactionsTimeline().filterCompletedInstants(); + assertEquals("Loaded 6 commits and the count should match", 6, timeline.countInstants()); + boolean result = archiveLog.archiveIfRequired(); + assertTrue(result); + timeline = + metadata.getActiveTimeline().reload().getCommitsAndCompactionsTimeline() + .filterCompletedInstants(); + assertEquals( + "Since we have a savepoint at 101, we should never archive any commit after 101 (we only archive 100)", + 5, timeline.countInstants()); + assertTrue("Archived commits should always be safe", + timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "101"))); + assertTrue("Archived commits should always be safe", + timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "102"))); + assertTrue("Archived 
commits should always be safe", + timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "103"))); + } } diff --git a/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCompactor.java b/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCompactor.java index 784e35c37..40ebc1829 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCompactor.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCompactor.java @@ -16,7 +16,9 @@ package com.uber.hoodie.io; -import com.uber.hoodie.HoodieReadClient; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + import com.uber.hoodie.HoodieWriteClient; import com.uber.hoodie.WriteStatus; import com.uber.hoodie.common.HoodieClientTestUtils; @@ -34,13 +36,16 @@ import com.uber.hoodie.config.HoodieCompactionConfig; import com.uber.hoodie.config.HoodieIndexConfig; import com.uber.hoodie.config.HoodieStorageConfig; import com.uber.hoodie.config.HoodieWriteConfig; -import com.uber.hoodie.index.bloom.HoodieBloomIndex; import com.uber.hoodie.index.HoodieIndex; +import com.uber.hoodie.index.bloom.HoodieBloomIndex; import com.uber.hoodie.io.compact.HoodieCompactor; import com.uber.hoodie.io.compact.HoodieRealtimeTableCompactor; import com.uber.hoodie.table.HoodieTable; +import java.io.File; +import java.io.IOException; +import java.util.List; +import java.util.stream.Collectors; import org.apache.hadoop.fs.FileSystem; -import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.junit.After; @@ -48,161 +53,154 @@ import org.junit.Before; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import java.io.File; -import java.io.IOException; -import java.util.List; -import java.util.stream.Collectors; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - public class TestHoodieCompactor { - private transient 
JavaSparkContext jsc = null; - private String basePath = null; - private HoodieCompactor compactor; - private transient HoodieTestDataGenerator dataGen = null; - @Before - public void init() throws IOException { - // Initialize a local spark env - jsc = new JavaSparkContext(HoodieClientTestUtils.getSparkConfForTest("TestHoodieCompactor")); + private transient JavaSparkContext jsc = null; + private String basePath = null; + private HoodieCompactor compactor; + private transient HoodieTestDataGenerator dataGen = null; - // Create a temp folder as the base path - TemporaryFolder folder = new TemporaryFolder(); - folder.create(); - basePath = folder.getRoot().getAbsolutePath(); - HoodieTestUtils.initTableType(basePath, HoodieTableType.MERGE_ON_READ); + @Before + public void init() throws IOException { + // Initialize a local spark env + jsc = new JavaSparkContext(HoodieClientTestUtils.getSparkConfForTest("TestHoodieCompactor")); - dataGen = new HoodieTestDataGenerator(); - compactor = new HoodieRealtimeTableCompactor(); + // Create a temp folder as the base path + TemporaryFolder folder = new TemporaryFolder(); + folder.create(); + basePath = folder.getRoot().getAbsolutePath(); + HoodieTestUtils.initTableType(basePath, HoodieTableType.MERGE_ON_READ); + + dataGen = new HoodieTestDataGenerator(); + compactor = new HoodieRealtimeTableCompactor(); + } + + @After + public void clean() { + if (basePath != null) { + new File(basePath).delete(); } - - @After - public void clean() { - if (basePath != null) { - new File(basePath).delete(); - } - if (jsc != null) { - jsc.stop(); - } + if (jsc != null) { + jsc.stop(); } + } - private HoodieWriteConfig getConfig() { - return getConfigBuilder().build(); - } + private HoodieWriteConfig getConfig() { + return getConfigBuilder().build(); + } - private HoodieWriteConfig.Builder getConfigBuilder() { - return HoodieWriteConfig.newBuilder().withPath(basePath) - .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) 
- .withCompactionConfig( - HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024) - .withInlineCompaction(false).build()) - .withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024).build()) - .forTable("test-trip-table").withIndexConfig( - HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()); - } + private HoodieWriteConfig.Builder getConfigBuilder() { + return HoodieWriteConfig.newBuilder().withPath(basePath) + .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) + .withCompactionConfig( + HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024) + .withInlineCompaction(false).build()) + .withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024).build()) + .forTable("test-trip-table").withIndexConfig( + HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()); + } - @Test(expected = IllegalArgumentException.class) - public void testCompactionOnCopyOnWriteFail() throws Exception { - HoodieTestUtils.initTableType(basePath, HoodieTableType.COPY_ON_WRITE); + @Test(expected = IllegalArgumentException.class) + public void testCompactionOnCopyOnWriteFail() throws Exception { + HoodieTestUtils.initTableType(basePath, HoodieTableType.COPY_ON_WRITE); - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(FSUtils.getFs(), basePath); - HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig()); + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(FSUtils.getFs(), basePath); + HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig()); + compactor.compact(jsc, getConfig(), table); + } + + @Test + public void testCompactionEmpty() throws Exception { + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(FSUtils.getFs(), basePath); + HoodieWriteConfig config = getConfig(); + HoodieTable table = HoodieTable.getHoodieTable(metaClient, config); + HoodieWriteClient writeClient 
= new HoodieWriteClient(jsc, config); + + String newCommitTime = writeClient.startCommit(); + List records = dataGen.generateInserts(newCommitTime, 100); + JavaRDD recordsRDD = jsc.parallelize(records, 1); + writeClient.insert(recordsRDD, newCommitTime).collect(); + + HoodieCompactionMetadata result = compactor.compact(jsc, getConfig(), table); + String basePath = table.getMetaClient().getBasePath(); + assertTrue("If there is nothing to compact, result will be empty", + result.getFileIdAndFullPaths(basePath).isEmpty()); + } + + @Test + public void testLogFileCountsAfterCompaction() throws Exception { + FileSystem fs = FSUtils.getFs(); + // insert 100 records + HoodieWriteConfig config = getConfig(); + HoodieWriteClient writeClient = new HoodieWriteClient(jsc, config); + String newCommitTime = "100"; + writeClient.startCommitWithTime(newCommitTime); + + List records = dataGen.generateInserts(newCommitTime, 100); + JavaRDD recordsRDD = jsc.parallelize(records, 1); + List statuses = writeClient.insert(recordsRDD, newCommitTime).collect(); + + // Update all the 100 records + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath); + HoodieTable table = HoodieTable.getHoodieTable(metaClient, config); + + newCommitTime = "101"; + writeClient.startCommitWithTime(newCommitTime); + + List updatedRecords = dataGen.generateUpdates(newCommitTime, records); + JavaRDD updatedRecordsRDD = jsc.parallelize(updatedRecords, 1); + HoodieIndex index = new HoodieBloomIndex<>(config, jsc); + updatedRecords = index.tagLocation(updatedRecordsRDD, table).collect(); + + // Write them to corresponding avro logfiles + HoodieTestUtils + .writeRecordsToLogFiles(metaClient.getBasePath(), HoodieTestDataGenerator.avroSchema, + updatedRecords); + + // Verify that all data file has one log file + metaClient = new HoodieTableMetaClient(fs, basePath); + table = HoodieTable.getHoodieTable(metaClient, config); + for (String partitionPath : dataGen.getPartitionPaths()) { + List 
groupedLogFiles = + table.getRTFileSystemView().getLatestFileSlices(partitionPath) + .collect(Collectors.toList()); + for (FileSlice fileSlice : groupedLogFiles) { + assertEquals("There should be 1 log file written for every data file", 1, + fileSlice.getLogFiles().count()); + } } - @Test - public void testCompactionEmpty() throws Exception { - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(FSUtils.getFs(), basePath); - HoodieWriteConfig config = getConfig(); - HoodieTable table = HoodieTable.getHoodieTable(metaClient, config); - HoodieWriteClient writeClient = new HoodieWriteClient(jsc, config); + // Do a compaction + metaClient = new HoodieTableMetaClient(fs, basePath); + table = HoodieTable.getHoodieTable(metaClient, config); - String newCommitTime = writeClient.startCommit(); - List records = dataGen.generateInserts(newCommitTime, 100); - JavaRDD recordsRDD = jsc.parallelize(records, 1); - writeClient.insert(recordsRDD, newCommitTime).collect(); + HoodieCompactionMetadata result = + compactor.compact(jsc, getConfig(), table); - HoodieCompactionMetadata result = - compactor.compact(jsc, getConfig(), table); - String basePath = table.getMetaClient().getBasePath(); - assertTrue("If there is nothing to compact, result will be empty", - result.getFileIdAndFullPaths(basePath).isEmpty()); + // Verify that recently written compacted data file has no log file + metaClient = new HoodieTableMetaClient(fs, basePath); + table = HoodieTable.getHoodieTable(metaClient, config); + HoodieActiveTimeline timeline = metaClient.getActiveTimeline(); + + assertTrue("Compaction commit should be > than last insert", + HoodieTimeline.compareTimestamps(timeline.lastInstant().get().getTimestamp(), newCommitTime, + HoodieTimeline.GREATER)); + + for (String partitionPath : dataGen.getPartitionPaths()) { + List groupedLogFiles = table.getRTFileSystemView() + .getLatestFileSlices(partitionPath) + .collect(Collectors.toList()); + for (FileSlice slice : groupedLogFiles) { + 
assertTrue( + "After compaction there should be no log files visiable on a Realtime view", + slice.getLogFiles().collect(Collectors.toList()).isEmpty()); + } + assertTrue(result.getPartitionToCompactionWriteStats().containsKey(partitionPath)); } + } - @Test - public void testLogFileCountsAfterCompaction() throws Exception { - FileSystem fs = FSUtils.getFs(); - // insert 100 records - HoodieWriteConfig config = getConfig(); - HoodieWriteClient writeClient = new HoodieWriteClient(jsc, config); - String newCommitTime = "100"; - writeClient.startCommitWithTime(newCommitTime); - - List records = dataGen.generateInserts(newCommitTime, 100); - JavaRDD recordsRDD = jsc.parallelize(records, 1); - List statuses = writeClient.insert(recordsRDD, newCommitTime).collect(); - - // Update all the 100 records - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath); - HoodieTable table = HoodieTable.getHoodieTable(metaClient, config); - - newCommitTime = "101"; - writeClient.startCommitWithTime(newCommitTime); - - List updatedRecords = dataGen.generateUpdates(newCommitTime, records); - JavaRDD updatedRecordsRDD = jsc.parallelize(updatedRecords, 1); - HoodieIndex index = new HoodieBloomIndex<>(config, jsc); - updatedRecords = index.tagLocation(updatedRecordsRDD, table).collect(); - - // Write them to corresponding avro logfiles - HoodieTestUtils - .writeRecordsToLogFiles(metaClient.getBasePath(), HoodieTestDataGenerator.avroSchema, - updatedRecords); - - // Verify that all data file has one log file - metaClient = new HoodieTableMetaClient(fs, basePath); - table = HoodieTable.getHoodieTable(metaClient, config); - for (String partitionPath : dataGen.getPartitionPaths()) { - List groupedLogFiles = - table.getRTFileSystemView().getLatestFileSlices(partitionPath) - .collect(Collectors.toList()); - for (FileSlice fileSlice : groupedLogFiles) { - assertEquals("There should be 1 log file written for every data file", 1, - fileSlice.getLogFiles().count()); - } - } - - // 
Do a compaction - metaClient = new HoodieTableMetaClient(fs, basePath); - table = HoodieTable.getHoodieTable(metaClient, config); - - HoodieCompactionMetadata result = - compactor.compact(jsc, getConfig(), table); - - // Verify that recently written compacted data file has no log file - metaClient = new HoodieTableMetaClient(fs, basePath); - table = HoodieTable.getHoodieTable(metaClient, config); - HoodieActiveTimeline timeline = metaClient.getActiveTimeline(); - - assertTrue("Compaction commit should be > than last insert", - HoodieTimeline.compareTimestamps(timeline.lastInstant().get().getTimestamp(), newCommitTime, - HoodieTimeline.GREATER)); - - for (String partitionPath : dataGen.getPartitionPaths()) { - List groupedLogFiles = table.getRTFileSystemView() - .getLatestFileSlices(partitionPath) - .collect(Collectors.toList()); - for (FileSlice slice: groupedLogFiles) { - assertTrue( - "After compaction there should be no log files visiable on a Realtime view", - slice.getLogFiles().collect(Collectors.toList()).isEmpty()); - } - assertTrue(result.getPartitionToCompactionWriteStats().containsKey(partitionPath)); - } - } - - // TODO - after modifying HoodieReadClient to support realtime tables - add more tests to make sure the data read is the updated data (compaction correctness) - // TODO - add more test cases for compactions after a failed commit/compaction + // TODO - after modifying HoodieReadClient to support realtime tables - add more tests to make sure the data read is the updated data (compaction correctness) + // TODO - add more test cases for compactions after a failed commit/compaction } diff --git a/hoodie-client/src/test/java/com/uber/hoodie/io/strategy/TestHoodieCompactionStrategy.java b/hoodie-client/src/test/java/com/uber/hoodie/io/strategy/TestHoodieCompactionStrategy.java index cc1a1219c..c01e21522 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/io/strategy/TestHoodieCompactionStrategy.java +++ 
b/hoodie-client/src/test/java/com/uber/hoodie/io/strategy/TestHoodieCompactionStrategy.java @@ -17,12 +17,10 @@ package com.uber.hoodie.io.strategy; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertThat; import static org.junit.Assert.assertTrue; import com.beust.jcommander.internal.Lists; import com.google.common.collect.Maps; - import com.uber.hoodie.config.HoodieCompactionConfig; import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.io.compact.CompactionOperation; diff --git a/hoodie-client/src/test/java/com/uber/hoodie/io/strategy/TestHoodieDataFile.java b/hoodie-client/src/test/java/com/uber/hoodie/io/strategy/TestHoodieDataFile.java index 6d6219ff6..564d95218 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/io/strategy/TestHoodieDataFile.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/io/strategy/TestHoodieDataFile.java @@ -17,9 +17,7 @@ package com.uber.hoodie.io.strategy; import com.uber.hoodie.common.model.HoodieDataFile; -import com.uber.hoodie.common.util.FSUtils; import java.util.UUID; -import org.apache.hadoop.fs.FileStatus; public class TestHoodieDataFile extends HoodieDataFile { diff --git a/hoodie-client/src/test/java/com/uber/hoodie/io/strategy/TestHoodieLogFile.java b/hoodie-client/src/test/java/com/uber/hoodie/io/strategy/TestHoodieLogFile.java index 0d2a2bd68..d23cbf27c 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/io/strategy/TestHoodieLogFile.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/io/strategy/TestHoodieLogFile.java @@ -18,7 +18,6 @@ package com.uber.hoodie.io.strategy; import com.uber.hoodie.common.model.HoodieLogFile; import java.util.Optional; - import org.apache.hadoop.fs.Path; public class TestHoodieLogFile extends HoodieLogFile { diff --git a/hoodie-client/src/test/java/com/uber/hoodie/metrics/TestHoodieMetrics.java b/hoodie-client/src/test/java/com/uber/hoodie/metrics/TestHoodieMetrics.java index 7e33ad579..911f97417 100644 --- 
a/hoodie-client/src/test/java/com/uber/hoodie/metrics/TestHoodieMetrics.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/metrics/TestHoodieMetrics.java @@ -16,30 +16,31 @@ package com.uber.hoodie.metrics; -import com.uber.hoodie.config.HoodieWriteConfig; - -import org.apache.commons.configuration.ConfigurationException; -import org.junit.Before; -import org.junit.Test; - import static org.junit.Assert.assertTrue; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; +import com.uber.hoodie.config.HoodieWriteConfig; +import org.apache.commons.configuration.ConfigurationException; +import org.junit.Before; +import org.junit.Test; + public class TestHoodieMetrics { - private HoodieMetrics metrics = null; - @Before - public void start() throws ConfigurationException { - HoodieWriteConfig config = mock(HoodieWriteConfig.class); - when(config.isMetricsOn()).thenReturn(true); - when(config.getMetricsReporterType()).thenReturn(MetricsReporterType.INMEMORY); - metrics = new HoodieMetrics(config, "raw_table"); - } + private HoodieMetrics metrics = null; - @Test - public void testRegisterGauge() { - metrics.registerGauge("metric1", 123L); - assertTrue(Metrics.getInstance().getRegistry().getGauges().get("metric1").getValue().toString().equals("123")); - } + @Before + public void start() throws ConfigurationException { + HoodieWriteConfig config = mock(HoodieWriteConfig.class); + when(config.isMetricsOn()).thenReturn(true); + when(config.getMetricsReporterType()).thenReturn(MetricsReporterType.INMEMORY); + metrics = new HoodieMetrics(config, "raw_table"); + } + + @Test + public void testRegisterGauge() { + metrics.registerGauge("metric1", 123L); + assertTrue(Metrics.getInstance().getRegistry().getGauges().get("metric1").getValue().toString() + .equals("123")); + } } diff --git a/hoodie-client/src/test/java/com/uber/hoodie/table/TestCopyOnWriteTable.java b/hoodie-client/src/test/java/com/uber/hoodie/table/TestCopyOnWriteTable.java index 
020166d5a..a98b76838 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/table/TestCopyOnWriteTable.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/table/TestCopyOnWriteTable.java @@ -16,26 +16,37 @@ package com.uber.hoodie.table; -import com.uber.hoodie.common.TestRawTripPayload.MetadataMergeWriteStatus; -import com.uber.hoodie.common.table.HoodieTableMetaClient; -import com.uber.hoodie.common.table.HoodieTimeline; -import com.uber.hoodie.config.HoodieWriteConfig; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + import com.uber.hoodie.WriteStatus; import com.uber.hoodie.common.BloomFilter; import com.uber.hoodie.common.HoodieClientTestUtils; import com.uber.hoodie.common.HoodieTestDataGenerator; import com.uber.hoodie.common.TestRawTripPayload; +import com.uber.hoodie.common.TestRawTripPayload.MetadataMergeWriteStatus; import com.uber.hoodie.common.model.HoodieKey; import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecordLocation; import com.uber.hoodie.common.model.HoodieTestUtils; +import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.common.util.ParquetUtils; - import com.uber.hoodie.config.HoodieCompactionConfig; -import com.uber.hoodie.io.HoodieCreateHandle; import com.uber.hoodie.config.HoodieStorageConfig; +import com.uber.hoodie.config.HoodieWriteConfig; +import com.uber.hoodie.io.HoodieCreateHandle; +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; import java.util.Map; +import java.util.UUID; import org.apache.avro.generic.GenericRecord; import org.apache.commons.io.IOUtils; import org.apache.hadoop.fs.FileSystem; @@ -47,424 +58,452 @@ 
import org.junit.After; import org.junit.Before; import org.junit.Test; import org.junit.rules.TemporaryFolder; - -import java.io.File; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Iterator; -import java.util.List; -import java.util.UUID; - import scala.Option; import scala.Tuple2; -import static org.junit.Assert.*; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; - public class TestCopyOnWriteTable { - private String basePath = null; - private transient JavaSparkContext jsc = null; - @Before - public void init() throws Exception { + private String basePath = null; + private transient JavaSparkContext jsc = null; - // Initialize a local spark env - jsc = new JavaSparkContext(HoodieClientTestUtils.getSparkConfForTest("TestCopyOnWriteTable")); + @Before + public void init() throws Exception { - // Create a temp folder as the base path - TemporaryFolder folder = new TemporaryFolder(); - folder.create(); - this.basePath = folder.getRoot().getAbsolutePath(); - HoodieTestUtils.init(basePath); + // Initialize a local spark env + jsc = new JavaSparkContext(HoodieClientTestUtils.getSparkConfForTest("TestCopyOnWriteTable")); + + // Create a temp folder as the base path + TemporaryFolder folder = new TemporaryFolder(); + folder.create(); + this.basePath = folder.getRoot().getAbsolutePath(); + HoodieTestUtils.init(basePath); + } + + @Test + public void testMakeNewPath() throws Exception { + String fileName = UUID.randomUUID().toString(); + String partitionPath = "2016/05/04"; + int unitNumber = (int) (Math.random() * 10); + HoodieRecord record = mock(HoodieRecord.class); + when(record.getPartitionPath()).thenReturn(partitionPath); + + String commitTime = HoodieTestUtils.makeNewCommitTime(); + HoodieWriteConfig config = makeHoodieClientConfig(); + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(FSUtils.getFs(), basePath); + HoodieTable table = HoodieTable.getHoodieTable(metaClient, config); + + 
HoodieCreateHandle io = new HoodieCreateHandle(config, commitTime, table, partitionPath); + Path newPath = io.makeNewPath(record.getPartitionPath(), unitNumber, fileName); + assertTrue(newPath.toString().equals(this.basePath + "/" + partitionPath + "/" + FSUtils + .makeDataFileName(commitTime, unitNumber, fileName))); + } + + private HoodieWriteConfig makeHoodieClientConfig() throws Exception { + return makeHoodieClientConfigBuilder().build(); + } + + private HoodieWriteConfig.Builder makeHoodieClientConfigBuilder() throws Exception { + // Prepare the AvroParquetIO + String schemaStr = IOUtils + .toString(getClass().getResourceAsStream("/exampleSchema.txt"), "UTF-8"); + return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(schemaStr); + } + + // TODO (weiy): Add testcases for crossing file writing. + @Test + public void testUpdateRecords() throws Exception { + // Prepare the AvroParquetIO + HoodieWriteConfig config = makeHoodieClientConfig(); + String firstCommitTime = HoodieTestUtils.makeNewCommitTime(); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath); + + String partitionPath = "/2016/01/31"; + HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata); + + // Get some records belong to the same partition (2016/01/31) + String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; + String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; + String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; + String recordStr4 = "{\"_row_key\":\"8eb5b87d-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":51}"; + + List records = new ArrayList<>(); + TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1); + records.add( + new HoodieRecord(new 
HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), + rowChange1)); + TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2); + records.add( + new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), + rowChange2)); + TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3); + records.add( + new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), + rowChange3)); + + // Insert new records + HoodieClientTestUtils.collectStatuses(table.handleInsert(firstCommitTime, records.iterator())); + // We should have a parquet file generated (TODO: better control # files after we revise AvroParquetIO) + File parquetFile = null; + for (File file : new File(this.basePath + partitionPath).listFiles()) { + if (file.getName().endsWith(".parquet")) { + parquetFile = file; + break; + } + } + assertTrue(parquetFile != null); + + // Read out the bloom filter and make sure filter can answer record exist or not + Path parquetFilePath = new Path(parquetFile.getAbsolutePath()); + BloomFilter filter = ParquetUtils.readBloomFilterFromParquetMetadata(parquetFilePath); + for (HoodieRecord record : records) { + assertTrue(filter.mightContain(record.getRecordKey())); + } + // Create a commit file + new File(this.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + + FSUtils.getCommitTime(parquetFile.getName()) + ".commit").createNewFile(); + + // Read the parquet file, check the record content + List fileRecords = ParquetUtils.readAvroRecords(parquetFilePath); + GenericRecord newRecord; + int index = 0; + for (GenericRecord record : fileRecords) { + assertTrue(record.get("_row_key").toString().equals(records.get(index).getRecordKey())); + index++; } - @Test - public void testMakeNewPath() throws Exception { - String fileName = UUID.randomUUID().toString(); - String partitionPath = "2016/05/04"; - int unitNumber = (int) (Math.random() * 10); - HoodieRecord record = mock(HoodieRecord.class); - 
when(record.getPartitionPath()).thenReturn(partitionPath); + // We update the 1st record & add a new record + String updateRecordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; + TestRawTripPayload updateRowChanges1 = new TestRawTripPayload(updateRecordStr1); + HoodieRecord updatedRecord1 = new HoodieRecord( + new HoodieKey(updateRowChanges1.getRowKey(), updateRowChanges1.getPartitionPath()), + updateRowChanges1); + updatedRecord1.setCurrentLocation( + new HoodieRecordLocation(null, FSUtils.getFileId(parquetFile.getName()))); - String commitTime = HoodieTestUtils.makeNewCommitTime(); - HoodieWriteConfig config = makeHoodieClientConfig(); - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(FSUtils.getFs(), basePath); - HoodieTable table = HoodieTable.getHoodieTable(metaClient, config); + TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4); + HoodieRecord insertedRecord1 = new HoodieRecord( + new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); - HoodieCreateHandle io = new HoodieCreateHandle(config, commitTime, table, partitionPath); - Path newPath = io.makeNewPath(record.getPartitionPath(), unitNumber, fileName); - assertTrue(newPath.toString().equals(this.basePath + "/" + partitionPath + "/" + FSUtils - .makeDataFileName(commitTime, unitNumber, fileName))); - } + List updatedRecords = Arrays.asList(updatedRecord1, insertedRecord1); - private HoodieWriteConfig makeHoodieClientConfig() throws Exception { - return makeHoodieClientConfigBuilder().build(); - } + Thread.sleep(1000); + String newCommitTime = HoodieTestUtils.makeNewCommitTime(); + metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath); + table = new HoodieCopyOnWriteTable(config, metadata); + Iterator> iter = table + .handleUpdate(newCommitTime, updatedRecord1.getCurrentLocation().getFileId(), + updatedRecords.iterator()); - private HoodieWriteConfig.Builder 
makeHoodieClientConfigBuilder() throws Exception { - // Prepare the AvroParquetIO - String schemaStr = IOUtils.toString(getClass().getResourceAsStream("/exampleSchema.txt"), "UTF-8"); - return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(schemaStr); - } - - // TODO (weiy): Add testcases for crossing file writing. - @Test - public void testUpdateRecords() throws Exception { - // Prepare the AvroParquetIO - HoodieWriteConfig config = makeHoodieClientConfig(); - String firstCommitTime = HoodieTestUtils.makeNewCommitTime(); - HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath); - - String partitionPath = "/2016/01/31"; - HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata); - - // Get some records belong to the same partition (2016/01/31) - String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; - String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; - String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; - String recordStr4 = "{\"_row_key\":\"8eb5b87d-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":51}"; - - List records = new ArrayList<>(); - TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1); - records.add(new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1)); - TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2); - records.add(new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2)); - TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3); - records.add(new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3)); - - // Insert new records - 
HoodieClientTestUtils.collectStatuses(table.handleInsert(firstCommitTime, records.iterator())); - // We should have a parquet file generated (TODO: better control # files after we revise AvroParquetIO) - File parquetFile = null; - for (File file : new File(this.basePath + partitionPath).listFiles()) { - if (file.getName().endsWith(".parquet")) { - parquetFile = file; - break; - } + // Check the updated file + File updatedParquetFile = null; + for (File file : new File(basePath + "/2016/01/31").listFiles()) { + if (file.getName().endsWith(".parquet")) { + if (FSUtils.getFileId(file.getName()) + .equals(FSUtils.getFileId(parquetFile.getName())) && + HoodieTimeline.compareTimestamps(FSUtils.getCommitTime(file.getName()), + FSUtils.getCommitTime(parquetFile.getName()), HoodieTimeline.GREATER)) { + updatedParquetFile = file; + break; } - assertTrue(parquetFile != null); - - // Read out the bloom filter and make sure filter can answer record exist or not - Path parquetFilePath = new Path(parquetFile.getAbsolutePath()); - BloomFilter filter = ParquetUtils.readBloomFilterFromParquetMetadata(parquetFilePath); - for (HoodieRecord record : records) { - assertTrue(filter.mightContain(record.getRecordKey())); - } - // Create a commit file - new File(this.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" - + FSUtils.getCommitTime(parquetFile.getName()) + ".commit").createNewFile(); - - // Read the parquet file, check the record content - List fileRecords = ParquetUtils.readAvroRecords(parquetFilePath); - GenericRecord newRecord; - int index = 0; - for (GenericRecord record: fileRecords) { - assertTrue(record.get("_row_key").toString().equals(records.get(index).getRecordKey())); - index++; - } - - // We update the 1st record & add a new record - String updateRecordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; - TestRawTripPayload updateRowChanges1 = new TestRawTripPayload(updateRecordStr1); - 
HoodieRecord updatedRecord1 = new HoodieRecord(new HoodieKey(updateRowChanges1.getRowKey(), updateRowChanges1.getPartitionPath()), updateRowChanges1); - updatedRecord1.setCurrentLocation(new HoodieRecordLocation(null, FSUtils.getFileId(parquetFile.getName()))); - - TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4); - HoodieRecord insertedRecord1 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); - - List updatedRecords = Arrays.asList(updatedRecord1, insertedRecord1); - - Thread.sleep(1000); - String newCommitTime = HoodieTestUtils.makeNewCommitTime(); - metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath); - table = new HoodieCopyOnWriteTable(config, metadata); - Iterator> iter = table.handleUpdate(newCommitTime, updatedRecord1.getCurrentLocation().getFileId(), updatedRecords.iterator()); - - // Check the updated file - File updatedParquetFile = null; - for (File file : new File(basePath + "/2016/01/31").listFiles()) { - if (file.getName().endsWith(".parquet")) { - if (FSUtils.getFileId(file.getName()) - .equals(FSUtils.getFileId(parquetFile.getName())) && - HoodieTimeline.compareTimestamps(FSUtils.getCommitTime(file.getName()), - FSUtils.getCommitTime(parquetFile.getName()), HoodieTimeline.GREATER)) { - updatedParquetFile = file; - break; - } - } - } - assertTrue(updatedParquetFile != null); - // Check whether the record has been updated - Path updatedParquetFilePath = new Path(updatedParquetFile.getAbsolutePath()); - BloomFilter updatedFilter = ParquetUtils.readBloomFilterFromParquetMetadata(updatedParquetFilePath); - for (HoodieRecord record : records) { - // No change to the _row_key - assertTrue(updatedFilter.mightContain(record.getRecordKey())); - } - - assertTrue(updatedFilter.mightContain(insertedRecord1.getRecordKey())); - records.add(insertedRecord1);// add this so it can further check below - - ParquetReader updatedReader = ParquetReader.builder(new AvroReadSupport<>(), 
updatedParquetFilePath).build(); - index = 0; - while ((newRecord = (GenericRecord) updatedReader.read()) != null) { - assertTrue(newRecord.get("_row_key").toString().equals(records.get(index).getRecordKey())); - if (index == 0) { - assertTrue(newRecord.get("number").toString().equals("15")); - } - index++; - } - updatedReader.close(); - // Also check the numRecordsWritten - List statuses = HoodieClientTestUtils.collectStatuses(iter); - WriteStatus writeStatus = statuses.get(0); - assertTrue("Should be only one file generated", statuses.size() == 1); - assertEquals(4, writeStatus.getStat().getNumWrites());//3 rewritten records + 1 new record + } + } + assertTrue(updatedParquetFile != null); + // Check whether the record has been updated + Path updatedParquetFilePath = new Path(updatedParquetFile.getAbsolutePath()); + BloomFilter updatedFilter = ParquetUtils + .readBloomFilterFromParquetMetadata(updatedParquetFilePath); + for (HoodieRecord record : records) { + // No change to the _row_key + assertTrue(updatedFilter.mightContain(record.getRecordKey())); } + assertTrue(updatedFilter.mightContain(insertedRecord1.getRecordKey())); + records.add(insertedRecord1);// add this so it can further check below - private List newHoodieRecords(int n, String time) throws Exception { - List records = new ArrayList<>(); - for (int i = 0; i < n; i++) { - String recordStr = String.format("{\"_row_key\":\"%s\",\"time\":\"%s\",\"number\":%d}", - UUID.randomUUID().toString(), - time, - i); - TestRawTripPayload rowChange = new TestRawTripPayload(recordStr); - records.add(new HoodieRecord( - new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), - rowChange)); - } - return records; + ParquetReader updatedReader = ParquetReader + .builder(new AvroReadSupport<>(), updatedParquetFilePath).build(); + index = 0; + while ((newRecord = (GenericRecord) updatedReader.read()) != null) { + assertTrue(newRecord.get("_row_key").toString().equals(records.get(index).getRecordKey())); + if 
(index == 0) { + assertTrue(newRecord.get("number").toString().equals("15")); + } + index++; + } + updatedReader.close(); + // Also check the numRecordsWritten + List statuses = HoodieClientTestUtils.collectStatuses(iter); + WriteStatus writeStatus = statuses.get(0); + assertTrue("Should be only one file generated", statuses.size() == 1); + assertEquals(4, writeStatus.getStat().getNumWrites());//3 rewritten records + 1 new record + } + + + private List newHoodieRecords(int n, String time) throws Exception { + List records = new ArrayList<>(); + for (int i = 0; i < n; i++) { + String recordStr = String.format("{\"_row_key\":\"%s\",\"time\":\"%s\",\"number\":%d}", + UUID.randomUUID().toString(), + time, + i); + TestRawTripPayload rowChange = new TestRawTripPayload(recordStr); + records.add(new HoodieRecord( + new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), + rowChange)); + } + return records; + } + + // Check if record level metadata is aggregated properly at the end of write. 
+ @Test + public void testMetadataAggregateFromWriteStatus() throws Exception { + // Prepare the AvroParquetIO + HoodieWriteConfig config = makeHoodieClientConfigBuilder() + .withWriteStatusClass(MetadataMergeWriteStatus.class).build(); + String firstCommitTime = HoodieTestUtils.makeNewCommitTime(); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath); + + HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata); + + // Get some records belong to the same partition (2016/01/31) + String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; + String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; + String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; + + List records = new ArrayList<>(); + TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1); + records.add( + new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), + rowChange1)); + TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2); + records.add( + new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), + rowChange2)); + TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3); + records.add( + new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), + rowChange3)); + + // Insert new records + List writeStatuses = HoodieClientTestUtils + .collectStatuses(table.handleInsert(firstCommitTime, records.iterator())); + Map allWriteStatusMergedMetadataMap = MetadataMergeWriteStatus + .mergeMetadataForWriteStatuses(writeStatuses); + assertTrue(allWriteStatusMergedMetadataMap.containsKey("InputRecordCount_1506582000")); + // For metadata key InputRecordCount_1506582000, value is 2 for each record. 
So sum of this should be 2 * 3 + assertEquals("6", allWriteStatusMergedMetadataMap.get("InputRecordCount_1506582000")); + } + + @Test + public void testInsertWithPartialFailures() throws Exception { + HoodieWriteConfig config = makeHoodieClientConfig(); + String commitTime = HoodieTestUtils.makeNewCommitTime(); + FileSystem fs = FSUtils.getFs(); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); + HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata); + + // Write a few records, and get atleast one file + // 10 records for partition 1, 1 record for partition 2. + List records = newHoodieRecords(10, "2016-01-31T03:16:41.415Z"); + records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z")); + + // Simulate crash after first file + List statuses = HoodieClientTestUtils + .collectStatuses(table.handleInsert(commitTime, records.iterator())); + WriteStatus status = statuses.get(0); + Path partialFile = new Path(String.format("%s/%s/%s", + basePath, + status.getPartitionPath(), + FSUtils.makeDataFileName(commitTime, 0, status.getFileId())) + ); + assertTrue(fs.exists(partialFile)); + + // When we retry + records = newHoodieRecords(10, "2016-01-31T03:16:41.415Z"); + records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z")); + + statuses = HoodieClientTestUtils + .collectStatuses(table.handleInsert(commitTime, records.iterator())); + status = statuses.get(0); + + Path retriedFIle = new Path(String.format("%s/%s/%s", + basePath, + status.getPartitionPath(), + FSUtils.makeDataFileName(commitTime, 0, status.getFileId())) + ); + assertTrue(fs.exists(retriedFIle)); + assertFalse(fs.exists(partialFile)); + } + + + @Test + public void testInsertRecords() throws Exception { + HoodieWriteConfig config = makeHoodieClientConfig(); + String commitTime = HoodieTestUtils.makeNewCommitTime(); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath); + HoodieCopyOnWriteTable table = new 
HoodieCopyOnWriteTable(config, metadata); + + // Case 1: + // 10 records for partition 1, 1 record for partition 2. + List records = newHoodieRecords(10, "2016-01-31T03:16:41.415Z"); + records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z")); + + // Insert new records + List returnedStatuses = HoodieClientTestUtils + .collectStatuses(table.handleInsert(commitTime, records.iterator())); + + // TODO: check the actual files and make sure 11 records, total were written. + assertEquals(2, returnedStatuses.size()); + assertEquals("2016/01/31", returnedStatuses.get(0).getPartitionPath()); + assertEquals(0, returnedStatuses.get(0).getFailedRecords().size()); + assertEquals(10, returnedStatuses.get(0).getWrittenRecords().size()); + assertEquals("2016/02/01", returnedStatuses.get(1).getPartitionPath()); + assertEquals(0, returnedStatuses.get(0).getFailedRecords().size()); + assertEquals(1, returnedStatuses.get(1).getWrittenRecords().size()); + + // Case 2: + // 1 record for partition 1, 5 record for partition 2, 1 records for partition 3. 
+ records = newHoodieRecords(1, "2016-01-31T03:16:41.415Z"); + records.addAll(newHoodieRecords(5, "2016-02-01T03:16:41.415Z")); + records.addAll(newHoodieRecords(1, "2016-02-02T03:16:41.415Z")); + + // Insert new records + returnedStatuses = HoodieClientTestUtils + .collectStatuses(table.handleInsert(commitTime, records.iterator())); + + assertEquals(3, returnedStatuses.size()); + assertEquals("2016/01/31", returnedStatuses.get(0).getPartitionPath()); + assertEquals(1, returnedStatuses.get(0).getWrittenRecords().size()); + + assertEquals("2016/02/01", returnedStatuses.get(1).getPartitionPath()); + assertEquals(5, returnedStatuses.get(1).getWrittenRecords().size()); + + assertEquals("2016/02/02", returnedStatuses.get(2).getPartitionPath()); + assertEquals(1, returnedStatuses.get(2).getWrittenRecords().size()); + + } + + @Test + public void testFileSizeUpsertRecords() throws Exception { + HoodieWriteConfig config = makeHoodieClientConfigBuilder().withStorageConfig( + HoodieStorageConfig.newBuilder().limitFileSize(64 * 1024).parquetBlockSize(64 * 1024) + .parquetPageSize(64 * 1024).build()).build(); + String commitTime = HoodieTestUtils.makeNewCommitTime(); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath); + HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata); + + List records = new ArrayList<>(); + // Approx 1150 records are written for block size of 64KB + for (int i = 0; i < 2000; i++) { + String recordStr = "{\"_row_key\":\"" + UUID.randomUUID().toString() + + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":" + i + "}"; + TestRawTripPayload rowChange = new TestRawTripPayload(recordStr); + records + .add(new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), + rowChange)); } - // Check if record level metadata is aggregated properly at the end of write. 
- @Test - public void testMetadataAggregateFromWriteStatus() throws Exception { - // Prepare the AvroParquetIO - HoodieWriteConfig config = makeHoodieClientConfigBuilder().withWriteStatusClass(MetadataMergeWriteStatus.class).build(); - String firstCommitTime = HoodieTestUtils.makeNewCommitTime(); - HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath); + // Insert new records + HoodieClientTestUtils.collectStatuses(table.handleInsert(commitTime, records.iterator())); - HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata); - - // Get some records belong to the same partition (2016/01/31) - String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; - String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; - String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; - - List records = new ArrayList<>(); - TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1); - records.add(new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1)); - TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2); - records.add(new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2)); - TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3); - records.add(new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3)); - - // Insert new records - List writeStatuses = HoodieClientTestUtils - .collectStatuses(table.handleInsert(firstCommitTime, records.iterator())); - Map allWriteStatusMergedMetadataMap = MetadataMergeWriteStatus - .mergeMetadataForWriteStatuses(writeStatuses); - assertTrue(allWriteStatusMergedMetadataMap.containsKey("InputRecordCount_1506582000")); - // 
For metadata key InputRecordCount_1506582000, value is 2 for each record. So sum of this should be 2 * 3 - assertEquals("6", allWriteStatusMergedMetadataMap.get("InputRecordCount_1506582000")); + // Check the updated file + int counts = 0; + for (File file : new File(basePath + "/2016/01/31").listFiles()) { + if (file.getName().endsWith(".parquet") && FSUtils.getCommitTime(file.getName()) + .equals(commitTime)) { + System.out.println(file.getName() + "-" + file.length()); + counts++; + } } + assertEquals( + "If the number of records are more than 1150, then there should be a new file", 3, + counts); + } - @Test - public void testInsertWithPartialFailures() throws Exception { - HoodieWriteConfig config = makeHoodieClientConfig(); - String commitTime = HoodieTestUtils.makeNewCommitTime(); - FileSystem fs = FSUtils.getFs(); - HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); - HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata); - // Write a few records, and get atleast one file - // 10 records for partition 1, 1 record for partition 2. 
- List records = newHoodieRecords(10, "2016-01-31T03:16:41.415Z"); - records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z")); + private List testUpsertPartitioner(int smallFileSize, + int numInserts, + int numUpdates, + int fileSize, + boolean autoSplitInserts) throws Exception { + final String TEST_PARTITION_PATH = "2016/09/26"; + HoodieWriteConfig config = makeHoodieClientConfigBuilder() + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .compactionSmallFileSize(smallFileSize).insertSplitSize(100) + .autoTuneInsertSplits(autoSplitInserts).build()) + .withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1000 * 1024).build()) + .build(); - // Simulate crash after first file - List statuses = HoodieClientTestUtils.collectStatuses(table.handleInsert(commitTime, records.iterator())); - WriteStatus status = statuses.get(0); - Path partialFile = new Path(String.format("%s/%s/%s", - basePath, - status.getPartitionPath(), - FSUtils.makeDataFileName(commitTime, 0, status.getFileId())) - ); - assertTrue(fs.exists(partialFile)); + HoodieClientTestUtils.fakeCommitFile(basePath, "001"); + HoodieClientTestUtils.fakeDataFile(basePath, TEST_PARTITION_PATH, "001", "file1", fileSize); - // When we retry - records = newHoodieRecords(10, "2016-01-31T03:16:41.415Z"); - records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z")); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath); + HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata); - statuses = HoodieClientTestUtils.collectStatuses(table.handleInsert(commitTime, records.iterator())); - status = statuses.get(0); - - Path retriedFIle = new Path(String.format("%s/%s/%s", - basePath, - status.getPartitionPath(), - FSUtils.makeDataFileName(commitTime, 0, status.getFileId())) - ); - assertTrue(fs.exists(retriedFIle)); - assertFalse(fs.exists(partialFile)); + HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator( + new 
String[]{TEST_PARTITION_PATH}); + List insertRecords = dataGenerator.generateInserts("001", numInserts); + List updateRecords = dataGenerator.generateUpdates("001", numUpdates); + for (HoodieRecord updateRec : updateRecords) { + updateRec.setCurrentLocation(new HoodieRecordLocation("001", "file1")); } + List records = new ArrayList<>(); + records.addAll(insertRecords); + records.addAll(updateRecords); + WorkloadProfile profile = new WorkloadProfile(jsc.parallelize(records)); + HoodieCopyOnWriteTable.UpsertPartitioner partitioner = (HoodieCopyOnWriteTable.UpsertPartitioner) + table.getUpsertPartitioner(profile); + + assertEquals("Should have 3 partitions", 3, partitioner.numPartitions()); + assertEquals("Bucket 0 is UPDATE", HoodieCopyOnWriteTable.BucketType.UPDATE, + partitioner.getBucketInfo(0).bucketType); + assertEquals("Bucket 1 is INSERT", HoodieCopyOnWriteTable.BucketType.INSERT, + partitioner.getBucketInfo(1).bucketType); + assertEquals("Bucket 2 is INSERT", HoodieCopyOnWriteTable.BucketType.INSERT, + partitioner.getBucketInfo(2).bucketType); + assertEquals("Update record should have gone to the 1 update partiton", 0, + partitioner.getPartition(new Tuple2<>(updateRecords.get(0).getKey(), + Option.apply(updateRecords.get(0).getCurrentLocation())))); + return partitioner.getInsertBuckets(TEST_PARTITION_PATH); + } - @Test public void testInsertRecords() throws Exception { - HoodieWriteConfig config = makeHoodieClientConfig(); - String commitTime = HoodieTestUtils.makeNewCommitTime(); - HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath); - HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata); - - // Case 1: - // 10 records for partition 1, 1 record for partition 2. 
- List records = newHoodieRecords(10, "2016-01-31T03:16:41.415Z"); - records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z")); - - // Insert new records - List returnedStatuses = HoodieClientTestUtils.collectStatuses(table.handleInsert(commitTime, records.iterator())); + @Test + public void testUpsertPartitioner() throws Exception { + // Inserts + Updates... Check all updates go together & inserts subsplit + List insertBuckets = testUpsertPartitioner(0, 200, 100, + 1024, false); + assertEquals("Total of 2 insert buckets", 2, insertBuckets.size()); + } - // TODO: check the actual files and make sure 11 records, total were written. - assertEquals(2, returnedStatuses.size()); - assertEquals("2016/01/31", returnedStatuses.get(0).getPartitionPath()); - assertEquals(0, returnedStatuses.get(0).getFailedRecords().size()); - assertEquals(10, returnedStatuses.get(0).getWrittenRecords().size()); - assertEquals("2016/02/01", returnedStatuses.get(1).getPartitionPath()); - assertEquals(0, returnedStatuses.get(0).getFailedRecords().size()); - assertEquals(1, returnedStatuses.get(1).getWrittenRecords().size()); + @Test + public void testUpsertPartitionerWithSmallInsertHandling() throws Exception { + // Inserts + Updates .. Check updates go together & inserts subsplit, after expanding smallest file + List insertBuckets = testUpsertPartitioner(1000 * 1024, + 400, 100, 800 * 1024, false); + assertEquals("Total of 3 insert buckets", 3, insertBuckets.size()); + assertEquals("First insert bucket must be same as update bucket", 0, + insertBuckets.get(0).bucketNumber); + assertEquals("First insert bucket should have weight 0.5", 0.5, insertBuckets.get(0).weight, + 0.01); - // Case 2: - // 1 record for partition 1, 5 record for partition 2, 1 records for partition 3. 
- records = newHoodieRecords(1, "2016-01-31T03:16:41.415Z"); - records.addAll(newHoodieRecords(5, "2016-02-01T03:16:41.415Z")); - records.addAll(newHoodieRecords(1, "2016-02-02T03:16:41.415Z")); - - // Insert new records - returnedStatuses = HoodieClientTestUtils.collectStatuses(table.handleInsert(commitTime, records.iterator())); - - assertEquals(3, returnedStatuses.size()); - assertEquals("2016/01/31", returnedStatuses.get(0).getPartitionPath()); - assertEquals(1, returnedStatuses.get(0).getWrittenRecords().size()); - - assertEquals("2016/02/01", returnedStatuses.get(1).getPartitionPath()); - assertEquals(5, returnedStatuses.get(1).getWrittenRecords().size()); - - assertEquals("2016/02/02", returnedStatuses.get(2).getPartitionPath()); - assertEquals(1, returnedStatuses.get(2).getWrittenRecords().size()); + // Now with insert split size auto tuned + insertBuckets = testUpsertPartitioner(1000 * 1024, 2400, 100, 800 * 1024, true); + assertEquals("Total of 3 insert buckets", 3, insertBuckets.size()); + assertEquals("First insert bucket must be same as update bucket", 0, + insertBuckets.get(0).bucketNumber); + assertEquals("First insert bucket should have weight 0.5", 200.0 / 2400, + insertBuckets.get(0).weight, 0.01); + } + @After + public void cleanup() { + if (basePath != null) { + new File(basePath).delete(); } - - @Test public void testFileSizeUpsertRecords() throws Exception { - HoodieWriteConfig config = makeHoodieClientConfigBuilder().withStorageConfig( - HoodieStorageConfig.newBuilder().limitFileSize(64 * 1024).parquetBlockSize(64 * 1024) - .parquetPageSize(64 * 1024).build()).build(); - String commitTime = HoodieTestUtils.makeNewCommitTime(); - HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath); - HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata); - - List records = new ArrayList<>(); - // Approx 1150 records are written for block size of 64KB - for (int i = 0; i < 2000; i++) { - String recordStr = 
"{\"_row_key\":\"" + UUID.randomUUID().toString() + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":" + i + "}"; - TestRawTripPayload rowChange = new TestRawTripPayload(recordStr); - records.add(new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), - rowChange)); - } - - // Insert new records - HoodieClientTestUtils.collectStatuses(table.handleInsert(commitTime, records.iterator())); - - // Check the updated file - int counts = 0; - for (File file : new File(basePath + "/2016/01/31").listFiles()) { - if (file.getName().endsWith(".parquet") && FSUtils.getCommitTime(file.getName()).equals(commitTime)) { - System.out.println(file.getName() + "-" + file.length()); - counts++; - } - } - assertEquals( - "If the number of records are more than 1150, then there should be a new file", 3, - counts); - } - - - - private List testUpsertPartitioner(int smallFileSize, - int numInserts, - int numUpdates, - int fileSize, - boolean autoSplitInserts) throws Exception { - final String TEST_PARTITION_PATH = "2016/09/26"; - HoodieWriteConfig config = makeHoodieClientConfigBuilder() - .withCompactionConfig(HoodieCompactionConfig.newBuilder() - .compactionSmallFileSize(smallFileSize).insertSplitSize(100).autoTuneInsertSplits(autoSplitInserts).build()) - .withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1000 * 1024).build()).build(); - - HoodieClientTestUtils.fakeCommitFile(basePath, "001"); - HoodieClientTestUtils.fakeDataFile(basePath, TEST_PARTITION_PATH, "001", "file1", fileSize); - - HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath); - HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata); - - HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[]{TEST_PARTITION_PATH}); - List insertRecords = dataGenerator.generateInserts("001", numInserts); - List updateRecords = dataGenerator.generateUpdates("001", numUpdates); - for (HoodieRecord updateRec: 
updateRecords) { - updateRec.setCurrentLocation(new HoodieRecordLocation("001", "file1")); - } - List records = new ArrayList<>(); - records.addAll(insertRecords); - records.addAll(updateRecords); - WorkloadProfile profile = new WorkloadProfile(jsc.parallelize(records)); - HoodieCopyOnWriteTable.UpsertPartitioner partitioner = (HoodieCopyOnWriteTable.UpsertPartitioner) - table.getUpsertPartitioner(profile); - - assertEquals("Should have 3 partitions", 3, partitioner.numPartitions()); - assertEquals("Bucket 0 is UPDATE", HoodieCopyOnWriteTable.BucketType.UPDATE, - partitioner.getBucketInfo(0).bucketType); - assertEquals("Bucket 1 is INSERT", HoodieCopyOnWriteTable.BucketType.INSERT, - partitioner.getBucketInfo(1).bucketType); - assertEquals("Bucket 2 is INSERT", HoodieCopyOnWriteTable.BucketType.INSERT, - partitioner.getBucketInfo(2).bucketType); - assertEquals("Update record should have gone to the 1 update partiton", 0, - partitioner.getPartition(new Tuple2<>(updateRecords.get(0).getKey(), Option.apply(updateRecords.get(0).getCurrentLocation())))); - return partitioner.getInsertBuckets(TEST_PARTITION_PATH); - } - - - @Test - public void testUpsertPartitioner() throws Exception { - // Inserts + Updates... Check all updates go together & inserts subsplit - List insertBuckets = testUpsertPartitioner(0, 200, 100, 1024, false); - assertEquals("Total of 2 insert buckets", 2, insertBuckets.size()); - } - - - @Test - public void testUpsertPartitionerWithSmallInsertHandling() throws Exception { - // Inserts + Updates .. 
Check updates go together & inserts subsplit, after expanding smallest file - List insertBuckets = testUpsertPartitioner(1000 * 1024, 400, 100, 800 * 1024, false); - assertEquals("Total of 3 insert buckets", 3, insertBuckets.size()); - assertEquals("First insert bucket must be same as update bucket", 0, insertBuckets.get(0).bucketNumber); - assertEquals("First insert bucket should have weight 0.5", 0.5, insertBuckets.get(0).weight, 0.01); - - // Now with insert split size auto tuned - insertBuckets = testUpsertPartitioner(1000 * 1024, 2400, 100, 800 * 1024, true); - assertEquals("Total of 3 insert buckets", 3, insertBuckets.size()); - assertEquals("First insert bucket must be same as update bucket", 0, insertBuckets.get(0).bucketNumber); - assertEquals("First insert bucket should have weight 0.5", 200.0/2400, insertBuckets.get(0).weight, 0.01); - } - - @After - public void cleanup() { - if (basePath != null) { - new File(basePath).delete(); - } - if (jsc != null) { - jsc.stop(); - } + if (jsc != null) { + jsc.stop(); } + } } diff --git a/hoodie-client/src/test/java/com/uber/hoodie/table/TestMergeOnReadTable.java b/hoodie-client/src/test/java/com/uber/hoodie/table/TestMergeOnReadTable.java index 69cd2ce4b..9602a02ff 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/table/TestMergeOnReadTable.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/table/TestMergeOnReadTable.java @@ -19,6 +19,11 @@ package com.uber.hoodie.table; +import static com.uber.hoodie.common.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + import com.uber.hoodie.HoodieWriteClient; import com.uber.hoodie.WriteStatus; import com.uber.hoodie.common.HoodieClientTestUtils; @@ -44,6 +49,14 @@ import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.index.HoodieIndex; import com.uber.hoodie.io.compact.HoodieCompactor; import 
com.uber.hoodie.io.compact.HoodieRealtimeTableCompactor; +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.stream.Collectors; +import java.util.stream.Stream; import org.apache.avro.generic.GenericRecord; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; @@ -60,494 +73,506 @@ import org.junit.BeforeClass; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import java.io.File; -import java.io.IOException; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -import static com.uber.hoodie.common.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; - public class TestMergeOnReadTable { - private transient JavaSparkContext jsc = null; - private transient SQLContext sqlContext; - private String basePath = null; - private HoodieCompactor compactor; - private FileSystem fs; - //NOTE : Be careful in using DFS (FileSystem.class) vs LocalFs(RawLocalFileSystem.class) - //The implementation and gurantees of many API's differ, for example check rename(src,dst) - private static MiniDFSCluster dfsCluster; - private static DistributedFileSystem dfs; - private static HdfsTestService hdfsTestService; + private transient JavaSparkContext jsc = null; + private transient SQLContext sqlContext; + private String basePath = null; + private HoodieCompactor compactor; + private FileSystem fs; - @AfterClass - public static void cleanUp() throws Exception { - if (hdfsTestService != null) { - hdfsTestService.stop(); - dfsCluster.shutdown();; - } - FSUtils.setFs(null); - // Need to closeAll to clear FileSystem.Cache, required because DFS and LocalFS used in the same JVM - 
FileSystem.closeAll(); - HoodieTestUtils.resetFS(); + //NOTE : Be careful in using DFS (FileSystem.class) vs LocalFs(RawLocalFileSystem.class) + //The implementation and gurantees of many API's differ, for example check rename(src,dst) + private static MiniDFSCluster dfsCluster; + private static DistributedFileSystem dfs; + private static HdfsTestService hdfsTestService; + + @AfterClass + public static void cleanUp() throws Exception { + if (hdfsTestService != null) { + hdfsTestService.stop(); + dfsCluster.shutdown(); + ; + } + FSUtils.setFs(null); + // Need to closeAll to clear FileSystem.Cache, required because DFS and LocalFS used in the same JVM + FileSystem.closeAll(); + HoodieTestUtils.resetFS(); + } + + @BeforeClass + public static void setUpDFS() throws IOException { + // Need to closeAll to clear FileSystem.Cache, required because DFS and LocalFS used in the same JVM + FileSystem.closeAll(); + if (hdfsTestService == null) { + hdfsTestService = new HdfsTestService(); + dfsCluster = hdfsTestService.start(true); + // Create a temp folder as the base path + dfs = dfsCluster.getFileSystem(); + } + FSUtils.setFs(dfs); + HoodieTestUtils.resetFS(); + } + + @Before + public void init() throws IOException { + this.fs = FSUtils.getFs(); + + // Initialize a local spark env + jsc = new JavaSparkContext( + HoodieClientTestUtils.getSparkConfForTest("TestHoodieMergeOnReadTable")); + jsc.hadoopConfiguration().addResource(FSUtils.getFs().getConf()); + + // Create a temp folder as the base path + TemporaryFolder folder = new TemporaryFolder(); + folder.create(); + basePath = folder.getRoot().getAbsolutePath(); + dfs.mkdirs(new Path(basePath)); + FSUtils.setFs(dfs); + HoodieTestUtils.initTableType(basePath, HoodieTableType.MERGE_ON_READ); + + compactor = new HoodieRealtimeTableCompactor(); + + //SQLContext stuff + sqlContext = new SQLContext(jsc); + } + + @After + public void clean() { + if (basePath != null) { + new File(basePath).delete(); + } + if (jsc != null) { + 
jsc.stop(); + } + } + + @Test + public void testSimpleInsertAndUpdate() throws Exception { + HoodieWriteConfig cfg = getConfig(true); + HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); + + /** + * Write 1 (only inserts) + */ + String newCommitTime = "001"; + client.startCommitWithTime(newCommitTime); + + HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); + List records = dataGen.generateInserts(newCommitTime, 200); + JavaRDD writeRecords = jsc.parallelize(records, 1); + + List statuses = client.upsert(writeRecords, newCommitTime).collect(); + assertNoWriteErrors(statuses); + + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath()); + HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg); + + Optional deltaCommit = + metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant(); + assertTrue(deltaCommit.isPresent()); + assertEquals("Delta commit should be 001", "001", deltaCommit.get().getTimestamp()); + + Optional commit = + metaClient.getActiveTimeline().getCommitTimeline().firstInstant(); + assertFalse(commit.isPresent()); + + FileStatus[] allFiles = HoodieTestUtils + .listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath()); + TableFileSystemView.ReadOptimizedView roView = new HoodieTableFileSystemView(metaClient, + hoodieTable.getCompletedCompactionCommitTimeline(), allFiles); + Stream dataFilesToRead = roView.getLatestDataFiles(); + assertTrue(!dataFilesToRead.findAny().isPresent()); + + roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), + allFiles); + dataFilesToRead = roView.getLatestDataFiles(); + assertTrue("RealtimeTableView should list the parquet files we wrote in the delta commit", + dataFilesToRead.findAny().isPresent()); + + /** + * Write 2 (updates) + */ + newCommitTime = "004"; + client.startCommitWithTime(newCommitTime); + + records = dataGen.generateUpdates(newCommitTime, 100); + Map recordsMap = new HashMap<>(); + for 
(HoodieRecord rec : records) { + if (!recordsMap.containsKey(rec.getKey())) { + recordsMap.put(rec.getKey(), rec); + } } - @BeforeClass - public static void setUpDFS() throws IOException { - // Need to closeAll to clear FileSystem.Cache, required because DFS and LocalFS used in the same JVM - FileSystem.closeAll(); - if (hdfsTestService == null) { - hdfsTestService = new HdfsTestService(); - dfsCluster = hdfsTestService.start(true); - // Create a temp folder as the base path - dfs = dfsCluster.getFileSystem(); - } - FSUtils.setFs(dfs); - HoodieTestUtils.resetFS(); + statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); + metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath()); + deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().lastInstant(); + assertTrue(deltaCommit.isPresent()); + assertEquals("Latest Delta commit should be 004", "004", deltaCommit.get().getTimestamp()); + + commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant(); + assertFalse(commit.isPresent()); + + HoodieCompactor compactor = new HoodieRealtimeTableCompactor(); + HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig(true)); + + compactor.compact(jsc, getConfig(true), table); + + allFiles = HoodieTestUtils.listAllDataFilesInPath(fs, cfg.getBasePath()); + roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), + allFiles); + dataFilesToRead = roView.getLatestDataFiles(); + assertTrue(dataFilesToRead.findAny().isPresent()); + + // verify that there is a commit + table = HoodieTable + .getHoodieTable(new HoodieTableMetaClient(fs, cfg.getBasePath(), true), getConfig(false)); + HoodieTimeline timeline = table.getCompletedCompactionCommitTimeline(); + assertEquals("Expecting a single commit.", 1, + timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants()); + String latestCompactionCommitTime = 
timeline.lastInstant().get().getTimestamp(); + assertTrue(HoodieTimeline + .compareTimestamps("000", latestCompactionCommitTime, HoodieTimeline.LESSER)); + + assertEquals("Must contain 200 records", 200, + HoodieClientTestUtils.readSince(basePath, sqlContext, timeline, "000").count()); + } + + // Check if record level metadata is aggregated properly at the end of write. + @Test + public void testMetadataAggregateFromWriteStatus() throws Exception { + HoodieWriteConfig cfg = getConfigBuilder(false) + .withWriteStatusClass(MetadataMergeWriteStatus.class).build(); + HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); + + String newCommitTime = "001"; + HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); + List records = dataGen.generateInserts(newCommitTime, 200); + JavaRDD writeRecords = jsc.parallelize(records, 1); + + client.startCommit(); + + List statuses = client.upsert(writeRecords, newCommitTime).collect(); + assertNoWriteErrors(statuses); + Map allWriteStatusMergedMetadataMap = MetadataMergeWriteStatus + .mergeMetadataForWriteStatuses(statuses); + assertTrue(allWriteStatusMergedMetadataMap.containsKey("InputRecordCount_1506582000")); + // For metadata key InputRecordCount_1506582000, value is 2 for each record. 
So sum of this should be 2 * records.size() + assertEquals(String.valueOf(2 * records.size()), + allWriteStatusMergedMetadataMap.get("InputRecordCount_1506582000")); + } + + @Test + public void testSimpleInsertAndDelete() throws Exception { + HoodieWriteConfig cfg = getConfig(true); + HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); + + /** + * Write 1 (only inserts, written as parquet file) + */ + String newCommitTime = "001"; + client.startCommitWithTime(newCommitTime); + + HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); + List records = dataGen.generateInserts(newCommitTime, 20); + JavaRDD writeRecords = jsc.parallelize(records, 1); + + List statuses = client.upsert(writeRecords, newCommitTime).collect(); + assertNoWriteErrors(statuses); + + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath()); + HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg); + + Optional deltaCommit = + metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant(); + assertTrue(deltaCommit.isPresent()); + assertEquals("Delta commit should be 001", "001", deltaCommit.get().getTimestamp()); + + Optional commit = + metaClient.getActiveTimeline().getCommitTimeline().firstInstant(); + assertFalse(commit.isPresent()); + + FileStatus[] allFiles = HoodieTestUtils + .listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath()); + TableFileSystemView.ReadOptimizedView roView = new HoodieTableFileSystemView(metaClient, + hoodieTable.getCompletedCompactionCommitTimeline(), allFiles); + Stream dataFilesToRead = roView.getLatestDataFiles(); + assertTrue(!dataFilesToRead.findAny().isPresent()); + + roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), + allFiles); + dataFilesToRead = roView.getLatestDataFiles(); + assertTrue("RealtimeTableView should list the parquet files we wrote in the delta commit", + dataFilesToRead.findAny().isPresent()); + + /** + * Write 2 (only inserts, 
written to .log file) + */ + newCommitTime = "002"; + client.startCommitWithTime(newCommitTime); + + records = dataGen.generateInserts(newCommitTime, 20); + writeRecords = jsc.parallelize(records, 1); + statuses = client.upsert(writeRecords, newCommitTime).collect(); + assertNoWriteErrors(statuses); + + /** + * Write 2 (only deletes, written to .log file) + */ + newCommitTime = "004"; + client.startCommitWithTime(newCommitTime); + + List fewRecordsForDelete = dataGen.generateDeletesFromExistingRecords(records); + + statuses = client.upsert(jsc.parallelize(fewRecordsForDelete, 1), newCommitTime).collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); + + metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath()); + deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().lastInstant(); + assertTrue(deltaCommit.isPresent()); + assertEquals("Latest Delta commit should be 004", "004", deltaCommit.get().getTimestamp()); + + commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant(); + assertFalse(commit.isPresent()); + + allFiles = HoodieTestUtils.listAllDataFilesInPath(fs, cfg.getBasePath()); + roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), + allFiles); + dataFilesToRead = roView.getLatestDataFiles(); + assertTrue(dataFilesToRead.findAny().isPresent()); + + List dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath()) + .collect(Collectors.toList()); + List recordsRead = HoodieMergeOnReadTestUtils + .getRecordsUsingInputFormat(dataFiles); + //Wrote 40 records and deleted 20 records, so remaining 40-20 = 20 + assertEquals("Must contain 20 records", 20, recordsRead.size()); + } + + @Test + public void testCOWToMORConvertedDatasetRollback() throws Exception { + + //Set TableType to COW + HoodieTestUtils.initTableType(basePath, HoodieTableType.COPY_ON_WRITE); + + HoodieWriteConfig cfg = getConfig(true); + HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); + + 
/** + * Write 1 (only inserts) + */ + String newCommitTime = "001"; + client.startCommitWithTime(newCommitTime); + + HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); + List records = dataGen.generateInserts(newCommitTime, 200); + JavaRDD writeRecords = jsc.parallelize(records, 1); + + List statuses = client.upsert(writeRecords, newCommitTime).collect(); + //verify there are no errors + assertNoWriteErrors(statuses); + + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath()); + HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg); + + Optional commit = + metaClient.getActiveTimeline().getCommitTimeline().firstInstant(); + assertTrue(commit.isPresent()); + assertEquals("commit should be 001", "001", commit.get().getTimestamp()); + + /** + * Write 2 (updates) + */ + newCommitTime = "002"; + client.startCommitWithTime(newCommitTime); + + records = dataGen.generateUpdates(newCommitTime, records); + + statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); + + //Set TableType to MOR + HoodieTestUtils.initTableType(basePath, HoodieTableType.MERGE_ON_READ); + + //rollback a COW commit when TableType is MOR + client.rollback(newCommitTime); + + metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath()); + hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg); + FileStatus[] allFiles = HoodieTestUtils + .listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath()); + HoodieTableFileSystemView roView = new HoodieTableFileSystemView(metaClient, + hoodieTable.getCompletedCommitTimeline(), allFiles); + + final String absentCommit = newCommitTime; + assertFalse(roView.getLatestDataFiles().filter(file -> { + if (absentCommit.equals(file.getCommitTime())) { + return true; + } else { + return false; + } + }).findAny().isPresent()); + } + + @Test + public void testRollbackWithDeltaAndCompactionCommit() throws Exception { + + 
HoodieWriteConfig cfg = getConfig(true); + HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); + + // Test delta commit rollback (with all log files) + /** + * Write 1 (only inserts) + */ + String newCommitTime = "001"; + client.startCommitWithTime(newCommitTime); + + HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); + List records = dataGen.generateInserts(newCommitTime, 200); + JavaRDD writeRecords = jsc.parallelize(records, 1); + + List statuses = client.upsert(writeRecords, newCommitTime).collect(); + assertNoWriteErrors(statuses); + + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath()); + HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg); + + Optional deltaCommit = + metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant(); + assertTrue(deltaCommit.isPresent()); + assertEquals("Delta commit should be 001", "001", deltaCommit.get().getTimestamp()); + + Optional commit = + metaClient.getActiveTimeline().getCommitTimeline().firstInstant(); + assertFalse(commit.isPresent()); + + FileStatus[] allFiles = HoodieTestUtils + .listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath()); + TableFileSystemView.ReadOptimizedView roView = new HoodieTableFileSystemView(metaClient, + hoodieTable.getCompletedCompactionCommitTimeline(), allFiles); + Stream dataFilesToRead = roView.getLatestDataFiles(); + assertTrue(!dataFilesToRead.findAny().isPresent()); + + roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), + allFiles); + dataFilesToRead = roView.getLatestDataFiles(); + assertTrue("RealtimeTableView should list the parquet files we wrote in the delta commit", + dataFilesToRead.findAny().isPresent()); + + /** + * Write 2 (updates) + */ + newCommitTime = "002"; + client.startCommitWithTime(newCommitTime); + + records = dataGen.generateUpdates(newCommitTime, 200); + + statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); + 
// Verify there are no errors + assertNoWriteErrors(statuses); + metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath()); + deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().lastInstant(); + assertTrue(deltaCommit.isPresent()); + assertEquals("Latest Delta commit should be 002", "002", deltaCommit.get().getTimestamp()); + + commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant(); + assertFalse(commit.isPresent()); + + List dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath()) + .collect(Collectors.toList()); + List recordsRead = HoodieMergeOnReadTestUtils + .getRecordsUsingInputFormat(dataFiles); + + assertEquals(recordsRead.size(), 200); + + // Test delta commit rollback + client.rollback(newCommitTime); + + metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath()); + hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg); + roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), + allFiles); + dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath()).collect(Collectors.toList()); + recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(dataFiles); + + assertEquals(recordsRead.size(), 200); + + //Test compaction commit rollback + /** + * Write 2 (updates) + */ + newCommitTime = "003"; + client.startCommitWithTime(newCommitTime); + + records = dataGen.generateUpdates(newCommitTime, 400); + + statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); + assertNoWriteErrors(statuses); + + HoodieCompactor compactor = new HoodieRealtimeTableCompactor(); + metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath()); + HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig(true)); + + compactor.compact(jsc, getConfig(true), table); + + allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath()); + metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath()); + hoodieTable = 
HoodieTable.getHoodieTable(metaClient, cfg); + roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompactionCommitTimeline(), + allFiles); + + final String compactedCommitTime = metaClient.getActiveTimeline().reload() + .getCommitsAndCompactionsTimeline().lastInstant().get().getTimestamp(); + + assertTrue(roView.getLatestDataFiles().filter(file -> { + if (compactedCommitTime.equals(file.getCommitTime())) { + return true; + } else { + return false; + } + }).findAny().isPresent()); + + client.rollback(compactedCommitTime); + + allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath()); + metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath()); + hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg); + roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompactionCommitTimeline(), + allFiles); + + assertFalse(roView.getLatestDataFiles().filter(file -> { + if (compactedCommitTime.equals(file.getCommitTime())) { + return true; + } else { + return false; + } + }).findAny().isPresent()); + } + + private HoodieWriteConfig getConfig(Boolean autoCommit) { + return getConfigBuilder(autoCommit).build(); + } + + private HoodieWriteConfig.Builder getConfigBuilder(Boolean autoCommit) { + return HoodieWriteConfig.newBuilder().withPath(basePath) + .withSchema(TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) + .withAutoCommit(autoCommit) + .withCompactionConfig( + HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024) + .withInlineCompaction(false).build()) + .withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024).build()) + .forTable("test-trip-table").withIndexConfig( + HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()); + } + + private void assertNoWriteErrors(List statuses) { + // Verify there are no errors + for (WriteStatus status : statuses) { + assertFalse("Errors found in write of " + status.getFileId(), status.hasErrors()); } - - @Before - 
public void init() throws IOException { - this.fs = FSUtils.getFs(); - - // Initialize a local spark env - jsc = new JavaSparkContext(HoodieClientTestUtils.getSparkConfForTest("TestHoodieMergeOnReadTable")); - jsc.hadoopConfiguration().addResource(FSUtils.getFs().getConf()); - - // Create a temp folder as the base path - TemporaryFolder folder = new TemporaryFolder(); - folder.create(); - basePath = folder.getRoot().getAbsolutePath(); - dfs.mkdirs(new Path(basePath)); - FSUtils.setFs(dfs); - HoodieTestUtils.initTableType(basePath, HoodieTableType.MERGE_ON_READ); - - compactor = new HoodieRealtimeTableCompactor(); - - //SQLContext stuff - sqlContext = new SQLContext(jsc); - } - - @After - public void clean() { - if (basePath != null) { - new File(basePath).delete(); - } - if (jsc != null) { - jsc.stop(); - } - } - - @Test - public void testSimpleInsertAndUpdate() throws Exception { - HoodieWriteConfig cfg = getConfig(true); - HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); - - /** - * Write 1 (only inserts) - */ - String newCommitTime = "001"; - client.startCommitWithTime(newCommitTime); - - HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); - List records = dataGen.generateInserts(newCommitTime, 200); - JavaRDD writeRecords = jsc.parallelize(records, 1); - - List statuses = client.upsert(writeRecords, newCommitTime).collect(); - assertNoWriteErrors(statuses); - - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath()); - HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg); - - Optional deltaCommit = - metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant(); - assertTrue(deltaCommit.isPresent()); - assertEquals("Delta commit should be 001", "001", deltaCommit.get().getTimestamp()); - - Optional commit = - metaClient.getActiveTimeline().getCommitTimeline().firstInstant(); - assertFalse(commit.isPresent()); - - FileStatus[] allFiles = 
HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath()); - TableFileSystemView.ReadOptimizedView roView = new HoodieTableFileSystemView(metaClient, - hoodieTable.getCompletedCompactionCommitTimeline(), allFiles); - Stream dataFilesToRead = roView.getLatestDataFiles(); - assertTrue(!dataFilesToRead.findAny().isPresent()); - - roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), allFiles); - dataFilesToRead = roView.getLatestDataFiles(); - assertTrue("RealtimeTableView should list the parquet files we wrote in the delta commit", - dataFilesToRead.findAny().isPresent()); - - /** - * Write 2 (updates) - */ - newCommitTime = "004"; - client.startCommitWithTime(newCommitTime); - - records = dataGen.generateUpdates(newCommitTime, 100); - Map recordsMap = new HashMap<>(); - for (HoodieRecord rec : records) { - if (!recordsMap.containsKey(rec.getKey())) { - recordsMap.put(rec.getKey(), rec); - } - } - - - statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); - // Verify there are no errors - assertNoWriteErrors(statuses); - metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath()); - deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().lastInstant(); - assertTrue(deltaCommit.isPresent()); - assertEquals("Latest Delta commit should be 004", "004", deltaCommit.get().getTimestamp()); - - commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant(); - assertFalse(commit.isPresent()); - - - HoodieCompactor compactor = new HoodieRealtimeTableCompactor(); - HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig(true)); - - compactor.compact(jsc, getConfig(true), table); - - allFiles = HoodieTestUtils.listAllDataFilesInPath(fs, cfg.getBasePath()); - roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), allFiles); - dataFilesToRead = roView.getLatestDataFiles(); - 
assertTrue(dataFilesToRead.findAny().isPresent()); - - // verify that there is a commit - table = HoodieTable.getHoodieTable(new HoodieTableMetaClient(fs, cfg.getBasePath(), true), getConfig(false)); - HoodieTimeline timeline = table.getCompletedCompactionCommitTimeline(); - assertEquals("Expecting a single commit.", 1, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants()); - String latestCompactionCommitTime = timeline.lastInstant().get().getTimestamp(); - assertTrue(HoodieTimeline - .compareTimestamps("000", latestCompactionCommitTime, HoodieTimeline.LESSER)); - - assertEquals("Must contain 200 records", 200, - HoodieClientTestUtils.readSince(basePath, sqlContext, timeline, "000").count()); - } - - // Check if record level metadata is aggregated properly at the end of write. - @Test - public void testMetadataAggregateFromWriteStatus() throws Exception { - HoodieWriteConfig cfg = getConfigBuilder(false).withWriteStatusClass(MetadataMergeWriteStatus.class).build(); - HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); - - String newCommitTime = "001"; - HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); - List records = dataGen.generateInserts(newCommitTime, 200); - JavaRDD writeRecords = jsc.parallelize(records, 1); - - client.startCommit(); - - List statuses = client.upsert(writeRecords, newCommitTime).collect(); - assertNoWriteErrors(statuses); - Map allWriteStatusMergedMetadataMap = MetadataMergeWriteStatus .mergeMetadataForWriteStatuses(statuses); - assertTrue(allWriteStatusMergedMetadataMap.containsKey("InputRecordCount_1506582000")); - // For metadata key InputRecordCount_1506582000, value is 2 for each record. 
So sum of this should be 2 * records.size() - assertEquals(String.valueOf(2 * records.size()), allWriteStatusMergedMetadataMap.get("InputRecordCount_1506582000")); - } - - @Test - public void testSimpleInsertAndDelete() throws Exception { - HoodieWriteConfig cfg = getConfig(true); - HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); - - /** - * Write 1 (only inserts, written as parquet file) - */ - String newCommitTime = "001"; - client.startCommitWithTime(newCommitTime); - - HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); - List records = dataGen.generateInserts(newCommitTime, 20); - JavaRDD writeRecords = jsc.parallelize(records, 1); - - List statuses = client.upsert(writeRecords, newCommitTime).collect(); - assertNoWriteErrors(statuses); - - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath()); - HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg); - - Optional deltaCommit = - metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant(); - assertTrue(deltaCommit.isPresent()); - assertEquals("Delta commit should be 001", "001", deltaCommit.get().getTimestamp()); - - Optional commit = - metaClient.getActiveTimeline().getCommitTimeline().firstInstant(); - assertFalse(commit.isPresent()); - - FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath()); - TableFileSystemView.ReadOptimizedView roView = new HoodieTableFileSystemView(metaClient, - hoodieTable.getCompletedCompactionCommitTimeline(), allFiles); - Stream dataFilesToRead = roView.getLatestDataFiles(); - assertTrue(!dataFilesToRead.findAny().isPresent()); - - roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), allFiles); - dataFilesToRead = roView.getLatestDataFiles(); - assertTrue("RealtimeTableView should list the parquet files we wrote in the delta commit", - dataFilesToRead.findAny().isPresent()); - - /** - * Write 2 (only inserts, written 
to .log file) - */ - newCommitTime = "002"; - client.startCommitWithTime(newCommitTime); - - records = dataGen.generateInserts(newCommitTime, 20); - writeRecords = jsc.parallelize(records, 1); - statuses = client.upsert(writeRecords, newCommitTime).collect(); - assertNoWriteErrors(statuses); - - /** - * Write 2 (only deletes, written to .log file) - */ - newCommitTime = "004"; - client.startCommitWithTime(newCommitTime); - - List fewRecordsForDelete = dataGen.generateDeletesFromExistingRecords(records); - - statuses = client.upsert(jsc.parallelize(fewRecordsForDelete, 1), newCommitTime).collect(); - // Verify there are no errors - assertNoWriteErrors(statuses); - - metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath()); - deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().lastInstant(); - assertTrue(deltaCommit.isPresent()); - assertEquals("Latest Delta commit should be 004", "004", deltaCommit.get().getTimestamp()); - - commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant(); - assertFalse(commit.isPresent()); - - allFiles = HoodieTestUtils.listAllDataFilesInPath(fs, cfg.getBasePath()); - roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), allFiles); - dataFilesToRead = roView.getLatestDataFiles(); - assertTrue(dataFilesToRead.findAny().isPresent()); - - List dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath()).collect(Collectors.toList()); - List recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(dataFiles); - //Wrote 40 records and deleted 20 records, so remaining 40-20 = 20 - assertEquals("Must contain 20 records", 20, recordsRead.size()); - } - - @Test - public void testCOWToMORConvertedDatasetRollback() throws Exception { - - //Set TableType to COW - HoodieTestUtils.initTableType(basePath, HoodieTableType.COPY_ON_WRITE); - - HoodieWriteConfig cfg = getConfig(true); - HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); - - /** - * Write 1 
(only inserts) - */ - String newCommitTime = "001"; - client.startCommitWithTime(newCommitTime); - - HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); - List records = dataGen.generateInserts(newCommitTime, 200); - JavaRDD writeRecords = jsc.parallelize(records, 1); - - List statuses = client.upsert(writeRecords, newCommitTime).collect(); - //verify there are no errors - assertNoWriteErrors(statuses); - - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath()); - HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg); - - Optional commit = - metaClient.getActiveTimeline().getCommitTimeline().firstInstant(); - assertTrue(commit.isPresent()); - assertEquals("commit should be 001", "001", commit.get().getTimestamp()); - - /** - * Write 2 (updates) - */ - newCommitTime = "002"; - client.startCommitWithTime(newCommitTime); - - records = dataGen.generateUpdates(newCommitTime, records); - - statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); - // Verify there are no errors - assertNoWriteErrors(statuses); - - //Set TableType to MOR - HoodieTestUtils.initTableType(basePath, HoodieTableType.MERGE_ON_READ); - - //rollback a COW commit when TableType is MOR - client.rollback(newCommitTime); - - metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath()); - hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg); - FileStatus [] allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath()); - HoodieTableFileSystemView roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), allFiles); - - final String absentCommit = newCommitTime; - assertFalse(roView.getLatestDataFiles().filter(file -> { - if(absentCommit.equals(file.getCommitTime())) - return true; - else - return false; - }).findAny().isPresent()); - } - - @Test - public void testRollbackWithDeltaAndCompactionCommit() throws Exception { - - HoodieWriteConfig cfg = 
getConfig(true); - HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); - - // Test delta commit rollback (with all log files) - /** - * Write 1 (only inserts) - */ - String newCommitTime = "001"; - client.startCommitWithTime(newCommitTime); - - HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); - List records = dataGen.generateInserts(newCommitTime, 200); - JavaRDD writeRecords = jsc.parallelize(records, 1); - - List statuses = client.upsert(writeRecords, newCommitTime).collect(); - assertNoWriteErrors(statuses); - - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath()); - HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg); - - Optional deltaCommit = - metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant(); - assertTrue(deltaCommit.isPresent()); - assertEquals("Delta commit should be 001", "001", deltaCommit.get().getTimestamp()); - - Optional commit = - metaClient.getActiveTimeline().getCommitTimeline().firstInstant(); - assertFalse(commit.isPresent()); - - FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath()); - TableFileSystemView.ReadOptimizedView roView = new HoodieTableFileSystemView(metaClient, - hoodieTable.getCompletedCompactionCommitTimeline(), allFiles); - Stream dataFilesToRead = roView.getLatestDataFiles(); - assertTrue(!dataFilesToRead.findAny().isPresent()); - - roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), allFiles); - dataFilesToRead = roView.getLatestDataFiles(); - assertTrue("RealtimeTableView should list the parquet files we wrote in the delta commit", - dataFilesToRead.findAny().isPresent()); - - /** - * Write 2 (updates) - */ - newCommitTime = "002"; - client.startCommitWithTime(newCommitTime); - - records = dataGen.generateUpdates(newCommitTime, 200); - - statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); - // Verify there are no 
errors - assertNoWriteErrors(statuses); - metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath()); - deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().lastInstant(); - assertTrue(deltaCommit.isPresent()); - assertEquals("Latest Delta commit should be 002", "002", deltaCommit.get().getTimestamp()); - - commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant(); - assertFalse(commit.isPresent()); - - List dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath()).collect(Collectors.toList()); - List recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(dataFiles); - - assertEquals(recordsRead.size(), 200); - - // Test delta commit rollback - client.rollback(newCommitTime); - - metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath()); - hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg); - roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), allFiles); - dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath()).collect(Collectors.toList()); - recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(dataFiles); - - assertEquals(recordsRead.size(), 200); - - - //Test compaction commit rollback - /** - * Write 2 (updates) - */ - newCommitTime = "003"; - client.startCommitWithTime(newCommitTime); - - records = dataGen.generateUpdates(newCommitTime, 400); - - statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); - assertNoWriteErrors(statuses); - - HoodieCompactor compactor = new HoodieRealtimeTableCompactor(); - metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath()); - HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig(true)); - - compactor.compact(jsc, getConfig(true), table); - - allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath()); - metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath()); - hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg); 
- roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompactionCommitTimeline(), allFiles); - - final String compactedCommitTime = metaClient.getActiveTimeline().reload().getCommitsAndCompactionsTimeline().lastInstant().get().getTimestamp(); - - assertTrue(roView.getLatestDataFiles().filter(file -> { - if(compactedCommitTime.equals(file.getCommitTime())) - return true; - else - return false; - }).findAny().isPresent()); - - client.rollback(compactedCommitTime); - - allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath()); - metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath()); - hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg); - roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompactionCommitTimeline(), allFiles); - - assertFalse(roView.getLatestDataFiles().filter(file -> { - if(compactedCommitTime.equals(file.getCommitTime())) - return true; - else - return false; - }).findAny().isPresent()); - } - - private HoodieWriteConfig getConfig(Boolean autoCommit) { - return getConfigBuilder(autoCommit).build(); - } - - private HoodieWriteConfig.Builder getConfigBuilder(Boolean autoCommit) { - return HoodieWriteConfig.newBuilder().withPath(basePath) - .withSchema(TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) - .withAutoCommit(autoCommit) - .withCompactionConfig( - HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024) - .withInlineCompaction(false).build()) - .withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024).build()) - .forTable("test-trip-table").withIndexConfig( - HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()); - } - - private void assertNoWriteErrors(List statuses) { - // Verify there are no errors - for (WriteStatus status : statuses) { - assertFalse("Errors found in write of " + status.getFileId(), status.hasErrors()); - } - } -} \ No newline at end of file + } +} diff --git 
a/hoodie-client/src/test/resources/log4j-surefire.properties b/hoodie-client/src/test/resources/log4j-surefire.properties index 490c6411d..daf8d28c1 100644 --- a/hoodie-client/src/test/resources/log4j-surefire.properties +++ b/hoodie-client/src/test/resources/log4j-surefire.properties @@ -20,7 +20,6 @@ log4j.category.com.uber.hoodie.io=WARN log4j.category.com.uber.hoodie.common=WARN log4j.category.com.uber.hoodie.table.log=WARN log4j.category.org.apache.parquet.hadoop=WARN - # A1 is set to be a ConsoleAppender. log4j.appender.A1=org.apache.log4j.ConsoleAppender # A1 uses PatternLayout. diff --git a/hoodie-common/pom.xml b/hoodie-common/pom.xml index b010ac3ec..db973dfa7 100644 --- a/hoodie-common/pom.xml +++ b/hoodie-common/pom.xml @@ -15,128 +15,130 @@ ~ limitations under the License. --> - - - hoodie - com.uber.hoodie - 0.4.1-SNAPSHOT - - 4.0.0 + + + hoodie + com.uber.hoodie + 0.4.1-SNAPSHOT + + 4.0.0 - hoodie-common + hoodie-common - - - - org.jacoco - jacoco-maven-plugin - - - org.apache.maven.plugins - maven-jar-plugin - 2.5 - - - - test-jar - - - - - - org.apache.rat - apache-rat-plugin - - - org.apache.avro - avro-maven-plugin - - - - ${basedir}/src/main/avro/HoodieCommitMetadata.avsc - ${basedir}/src/main/avro/HoodieSavePointMetadata.avsc - ${basedir}/src/main/avro/HoodieCompactionMetadata.avsc - ${basedir}/src/main/avro/HoodieCleanMetadata.avsc - ${basedir}/src/main/avro/HoodieRollbackMetadata.avsc - - - - - + + + + org.jacoco + jacoco-maven-plugin + + + org.apache.maven.plugins + maven-jar-plugin + 2.5 + + + + test-jar + + + + + + org.apache.rat + apache-rat-plugin + + + org.apache.avro + avro-maven-plugin + + + + ${basedir}/src/main/avro/HoodieCommitMetadata.avsc + ${basedir}/src/main/avro/HoodieSavePointMetadata.avsc + ${basedir}/src/main/avro/HoodieCompactionMetadata.avsc + ${basedir}/src/main/avro/HoodieCleanMetadata.avsc + ${basedir}/src/main/avro/HoodieRollbackMetadata.avsc + + + + + - - - org.apache.avro - avro - - - org.apache.hadoop - 
hadoop-client - - - javax.servlet - * - - - - - junit - junit - ${junit.version} - test - - - com.fasterxml.jackson.core - jackson-annotations - - - org.codehaus.jackson - jackson-mapper-asl - - - org.apache.parquet - parquet-avro - ${parquet.version} - - - org.mockito - mockito-all - 1.10.19 - test - - - org.apache.hadoop - hadoop-hdfs - tests - - - org.apache.hadoop - hadoop-common - tests - - - org.codehaus.jackson - jackson-core-asl - 1.9.13 - - - org.apache.commons - commons-lang3 - - - com.esotericsoftware - kryo - test - - - org.apache.avro - avro-mapred - - - org.mortbay.jetty - * - - - - + + + org.apache.avro + avro + + + org.apache.hadoop + hadoop-client + + + javax.servlet + * + + + + + junit + junit + ${junit.version} + test + + + com.fasterxml.jackson.core + jackson-annotations + + + org.codehaus.jackson + jackson-mapper-asl + + + org.apache.parquet + parquet-avro + ${parquet.version} + + + org.mockito + mockito-all + 1.10.19 + test + + + org.apache.hadoop + hadoop-hdfs + tests + + + org.apache.hadoop + hadoop-common + tests + + + org.codehaus.jackson + jackson-core-asl + 1.9.13 + + + org.apache.commons + commons-lang3 + + + com.esotericsoftware + kryo + test + + + org.apache.avro + avro-mapred + + + org.mortbay.jetty + * + + + + diff --git a/hoodie-common/src/main/java/com/uber/hoodie/avro/HoodieAvroWriteSupport.java b/hoodie-common/src/main/java/com/uber/hoodie/avro/HoodieAvroWriteSupport.java index dd3318228..f793c1539 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/avro/HoodieAvroWriteSupport.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/avro/HoodieAvroWriteSupport.java @@ -17,60 +17,59 @@ package com.uber.hoodie.avro; import com.uber.hoodie.common.BloomFilter; - +import java.util.HashMap; import org.apache.avro.Schema; import org.apache.parquet.avro.AvroWriteSupport; import org.apache.parquet.hadoop.api.WriteSupport; import org.apache.parquet.schema.MessageType; -import java.util.HashMap; - /** * Wrap AvroWriterSupport for 
plugging in the bloom filter. */ public class HoodieAvroWriteSupport extends AvroWriteSupport { - private BloomFilter bloomFilter; - private String minRecordKey; - private String maxRecordKey; + + private BloomFilter bloomFilter; + private String minRecordKey; + private String maxRecordKey; - public final static String HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY = - "com.uber.hoodie.bloomfilter"; - public final static String HOODIE_MIN_RECORD_KEY_FOOTER = "hoodie_min_record_key"; - public final static String HOODIE_MAX_RECORD_KEY_FOOTER = "hoodie_max_record_key"; + public final static String HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY = + "com.uber.hoodie.bloomfilter"; + public final static String HOODIE_MIN_RECORD_KEY_FOOTER = "hoodie_min_record_key"; + public final static String HOODIE_MAX_RECORD_KEY_FOOTER = "hoodie_max_record_key"; - public HoodieAvroWriteSupport(MessageType schema, Schema avroSchema, BloomFilter bloomFilter) { - super(schema, avroSchema); - this.bloomFilter = bloomFilter; + public HoodieAvroWriteSupport(MessageType schema, Schema avroSchema, BloomFilter bloomFilter) { + super(schema, avroSchema); + this.bloomFilter = bloomFilter; + } + + @Override + public WriteSupport.FinalizedWriteContext finalizeWrite() { + HashMap extraMetaData = new HashMap<>(); + if (bloomFilter != null) { + extraMetaData + .put(HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY, bloomFilter.serializeToString()); + if (minRecordKey != null && maxRecordKey != null) { + extraMetaData.put(HOODIE_MIN_RECORD_KEY_FOOTER, minRecordKey); + extraMetaData.put(HOODIE_MAX_RECORD_KEY_FOOTER, maxRecordKey); + } + } + return new WriteSupport.FinalizedWriteContext(extraMetaData); + } + + public void add(String recordKey) { + this.bloomFilter.add(recordKey); + if (minRecordKey != null) { + minRecordKey = minRecordKey.compareTo(recordKey) <= 0 ? 
minRecordKey : recordKey; + } else { + minRecordKey = recordKey; } - @Override - public WriteSupport.FinalizedWriteContext finalizeWrite() { - HashMap extraMetaData = new HashMap<>(); - if (bloomFilter != null) { - extraMetaData - .put(HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY, bloomFilter.serializeToString()); - if (minRecordKey != null && maxRecordKey != null) { - extraMetaData.put(HOODIE_MIN_RECORD_KEY_FOOTER, minRecordKey); - extraMetaData.put(HOODIE_MAX_RECORD_KEY_FOOTER, maxRecordKey); - } - } - return new WriteSupport.FinalizedWriteContext(extraMetaData); - } - - public void add(String recordKey) { - this.bloomFilter.add(recordKey); - if (minRecordKey != null) { - minRecordKey = minRecordKey.compareTo(recordKey) <= 0 ? minRecordKey : recordKey; - } else { - minRecordKey = recordKey; - } - - if (maxRecordKey != null) { - maxRecordKey = maxRecordKey.compareTo(recordKey) >= 0 ? maxRecordKey : recordKey; - } else { - maxRecordKey = recordKey; - } + if (maxRecordKey != null) { + maxRecordKey = maxRecordKey.compareTo(recordKey) >= 0 ? 
maxRecordKey : recordKey; + } else { + maxRecordKey = recordKey; } + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/avro/MercifulJsonConverter.java b/hoodie-common/src/main/java/com/uber/hoodie/avro/MercifulJsonConverter.java index 9efe8408d..b775d9068 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/avro/MercifulJsonConverter.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/avro/MercifulJsonConverter.java @@ -17,151 +17,148 @@ package com.uber.hoodie.avro; import java.io.IOException; -import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; - import org.apache.avro.Schema; import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericRecord; -import org.apache.avro.generic.GenericRecordBuilder; import org.codehaus.jackson.map.ObjectMapper; /** - * Marjority of this is copied from - * https://github.com/jwills/avro-json/blob/master/src/main/java/com/cloudera/science/avro/common/JsonConverter.java + * Marjority of this is copied from https://github.com/jwills/avro-json/blob/master/src/main/java/com/cloudera/science/avro/common/JsonConverter.java * Adjusted for expected behavior of our use cases */ public class MercifulJsonConverter { - private final ObjectMapper mapper = new ObjectMapper(); - private final Schema baseSchema; - public MercifulJsonConverter(Schema schema) { - this.baseSchema = schema; + private final ObjectMapper mapper = new ObjectMapper(); + private final Schema baseSchema; + + public MercifulJsonConverter(Schema schema) { + this.baseSchema = schema; + } + + + public GenericRecord convert(String json) throws IOException { + try { + return convert(mapper.readValue(json, Map.class), baseSchema); + } catch (IOException e) { + throw new IOException("Failed to parse as Json: " + json + "\n\n" + e.getMessage()); + } + } + + private GenericRecord convert(Map raw, Schema schema) + throws IOException { + GenericRecord result = new 
GenericData.Record(schema); + for (Schema.Field f : schema.getFields()) { + String name = f.name(); + Object rawValue = raw.get(name); + if (rawValue != null) { + result.put(f.pos(), typeConvert(rawValue, name, f.schema())); + } } + return result; + } - public GenericRecord convert(String json) throws IOException { - try { - return convert(mapper.readValue(json, Map.class), baseSchema); - } catch (IOException e) { - throw new IOException("Failed to parse as Json: " + json + "\n\n" + e.getMessage()); + private Object typeConvert(Object value, String name, Schema schema) throws IOException { + if (isOptional(schema)) { + if (value == null) { + return null; + } else { + schema = getNonNull(schema); + } + } else if (value == null) { + // Always fail on null for non-nullable schemas + throw new JsonConversionException(null, name, schema); + } + + switch (schema.getType()) { + case BOOLEAN: + if (value instanceof Boolean) { + return (Boolean) value; } - } - - private GenericRecord convert(Map raw, Schema schema) - throws IOException { - GenericRecord result = new GenericData.Record(schema); - for (Schema.Field f : schema.getFields()) { - String name = f.name(); - Object rawValue = raw.get(name); - if (rawValue != null) { - result.put(f.pos(), typeConvert(rawValue, name, f.schema())); - } + break; + case DOUBLE: + if (value instanceof Number) { + return ((Number) value).doubleValue(); } - - return result; - } - - private Object typeConvert(Object value, String name, Schema schema) throws IOException { - if (isOptional(schema)) { - if (value == null) { - return null; - } else { - schema = getNonNull(schema); - } - } else if (value == null) { - // Always fail on null for non-nullable schemas - throw new JsonConversionException(null, name, schema); + break; + case FLOAT: + if (value instanceof Number) { + return ((Number) value).floatValue(); } - - switch (schema.getType()) { - case BOOLEAN: - if (value instanceof Boolean) { - return (Boolean) value; - } - break; - case 
DOUBLE: - if (value instanceof Number) { - return ((Number) value).doubleValue(); - } - break; - case FLOAT: - if (value instanceof Number) { - return ((Number) value).floatValue(); - } - break; - case INT: - if (value instanceof Number) { - return ((Number) value).intValue(); - } - break; - case LONG: - if (value instanceof Number) { - return ((Number) value).longValue(); - } - break; - case STRING: - return value.toString(); - case ENUM: - if (schema.getEnumSymbols().contains(value.toString())) { - return new GenericData.EnumSymbol(schema, value.toString()); - } - throw new JsonConversionException(String.format("Symbol %s not in enum", value.toString()), - schema.getFullName(), schema); - case RECORD: - return convert((Map) value, schema); - case ARRAY: - Schema elementSchema = schema.getElementType(); - List listRes = new ArrayList(); - for (Object v : (List) value) { - listRes.add(typeConvert(v, name, elementSchema)); - } - return listRes; - case MAP: - Schema valueSchema = schema.getValueType(); - Map mapRes = new HashMap(); - for (Map.Entry v : ((Map) value).entrySet()) { - mapRes.put(v.getKey(), typeConvert(v.getValue(), name, valueSchema)); - } - return mapRes; - default: - throw new IllegalArgumentException( - "JsonConverter cannot handle type: " + schema.getType()); + break; + case INT: + if (value instanceof Number) { + return ((Number) value).intValue(); } - throw new JsonConversionException(value, name, schema); - } - - private boolean isOptional(Schema schema) { - return schema.getType().equals(Schema.Type.UNION) && - schema.getTypes().size() == 2 && - (schema.getTypes().get(0).getType().equals(Schema.Type.NULL) || - schema.getTypes().get(1).getType().equals(Schema.Type.NULL)); - } - - private Schema getNonNull(Schema schema) { - List types = schema.getTypes(); - return types.get(0).getType().equals(Schema.Type.NULL) ? 
types.get(1) : types.get(0); - } - - public static class JsonConversionException extends RuntimeException { - - private Object value; - private String fieldName; - private Schema schema; - - public JsonConversionException(Object value, String fieldName, Schema schema) { - this.value = value; - this.fieldName = fieldName; - this.schema = schema; + break; + case LONG: + if (value instanceof Number) { + return ((Number) value).longValue(); } - - @Override - public String toString() { - return String.format("Type conversion error for field %s, %s for %s", - fieldName, value, schema); + break; + case STRING: + return value.toString(); + case ENUM: + if (schema.getEnumSymbols().contains(value.toString())) { + return new GenericData.EnumSymbol(schema, value.toString()); } + throw new JsonConversionException(String.format("Symbol %s not in enum", value.toString()), + schema.getFullName(), schema); + case RECORD: + return convert((Map) value, schema); + case ARRAY: + Schema elementSchema = schema.getElementType(); + List listRes = new ArrayList(); + for (Object v : (List) value) { + listRes.add(typeConvert(v, name, elementSchema)); + } + return listRes; + case MAP: + Schema valueSchema = schema.getValueType(); + Map mapRes = new HashMap(); + for (Map.Entry v : ((Map) value).entrySet()) { + mapRes.put(v.getKey(), typeConvert(v.getValue(), name, valueSchema)); + } + return mapRes; + default: + throw new IllegalArgumentException( + "JsonConverter cannot handle type: " + schema.getType()); } + throw new JsonConversionException(value, name, schema); + } + + private boolean isOptional(Schema schema) { + return schema.getType().equals(Schema.Type.UNION) && + schema.getTypes().size() == 2 && + (schema.getTypes().get(0).getType().equals(Schema.Type.NULL) || + schema.getTypes().get(1).getType().equals(Schema.Type.NULL)); + } + + private Schema getNonNull(Schema schema) { + List types = schema.getTypes(); + return types.get(0).getType().equals(Schema.Type.NULL) ? 
types.get(1) : types.get(0); + } + + public static class JsonConversionException extends RuntimeException { + + private Object value; + private String fieldName; + private Schema schema; + + public JsonConversionException(Object value, String fieldName, Schema schema) { + this.value = value; + this.fieldName = fieldName; + this.schema = schema; + } + + @Override + public String toString() { + return String.format("Type conversion error for field %s, %s for %s", + fieldName, value, schema); + } + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/BloomFilter.java b/hoodie-common/src/main/java/com/uber/hoodie/common/BloomFilter.java index d81e31df3..ce2249179 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/BloomFilter.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/BloomFilter.java @@ -17,84 +17,86 @@ package com.uber.hoodie.common; import com.uber.hoodie.exception.HoodieIndexException; +import java.io.ByteArrayInputStream; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import javax.xml.bind.DatatypeConverter; import org.apache.commons.io.output.ByteArrayOutputStream; import org.apache.hadoop.util.bloom.Key; import org.apache.hadoop.util.hash.Hash; -import javax.xml.bind.DatatypeConverter; - -import java.io.*; -import java.nio.charset.StandardCharsets; - /** * A Bloom filter implementation built on top of {@link org.apache.hadoop.util.bloom.BloomFilter}. */ public class BloomFilter { - /** - * Used in computing the optimal Bloom filter size. This approximately equals 0.480453. - */ - public static final double LOG2_SQUARED = Math.log(2) * Math.log(2); - private org.apache.hadoop.util.bloom.BloomFilter filter = null; + /** + * Used in computing the optimal Bloom filter size. This approximately equals 0.480453. 
+ */ + public static final double LOG2_SQUARED = Math.log(2) * Math.log(2); - public BloomFilter(int numEntries, double errorRate) { - this(numEntries, errorRate, Hash.MURMUR_HASH); + private org.apache.hadoop.util.bloom.BloomFilter filter = null; + + public BloomFilter(int numEntries, double errorRate) { + this(numEntries, errorRate, Hash.MURMUR_HASH); + } + + /** + * Create a new Bloom filter with the given configurations. + */ + public BloomFilter(int numEntries, double errorRate, int hashType) { + // Bit size + int bitSize = (int) Math.ceil(numEntries * (-Math.log(errorRate) / LOG2_SQUARED)); + // Number of the hash functions + int numHashs = (int) Math.ceil(Math.log(2) * bitSize / numEntries); + // The filter + this.filter = new org.apache.hadoop.util.bloom.BloomFilter(bitSize, numHashs, hashType); + } + + /** + * Create the bloom filter from serialized string. + */ + public BloomFilter(String filterStr) { + this.filter = new org.apache.hadoop.util.bloom.BloomFilter(); + byte[] bytes = DatatypeConverter.parseBase64Binary(filterStr); + DataInputStream dis = new DataInputStream(new ByteArrayInputStream(bytes)); + try { + this.filter.readFields(dis); + dis.close(); + } catch (IOException e) { + throw new HoodieIndexException("Could not deserialize BloomFilter instance", e); } + } - /** - * Create a new Bloom filter with the given configurations. - */ - public BloomFilter(int numEntries, double errorRate, int hashType) { - // Bit size - int bitSize = (int) Math.ceil(numEntries * (-Math.log(errorRate) / LOG2_SQUARED)); - // Number of the hash functions - int numHashs = (int) Math.ceil(Math.log(2) * bitSize / numEntries); - // The filter - this.filter = new org.apache.hadoop.util.bloom.BloomFilter(bitSize, numHashs, hashType); + public void add(String key) { + if (key == null) { + throw new NullPointerException("Key cannot by null"); } + filter.add(new Key(key.getBytes(StandardCharsets.UTF_8))); + } - /** - * Create the bloom filter from serialized string. 
- */ - public BloomFilter(String filterStr) { - this.filter = new org.apache.hadoop.util.bloom.BloomFilter(); - byte[] bytes = DatatypeConverter.parseBase64Binary(filterStr); - DataInputStream dis = new DataInputStream(new ByteArrayInputStream(bytes)); - try { - this.filter.readFields(dis); - dis.close(); - } catch (IOException e) { - throw new HoodieIndexException("Could not deserialize BloomFilter instance", e); - } + public boolean mightContain(String key) { + if (key == null) { + throw new NullPointerException("Key cannot by null"); } + return filter.membershipTest(new Key(key.getBytes(StandardCharsets.UTF_8))); + } - public void add(String key) { - if (key == null) { - throw new NullPointerException("Key cannot by null"); - } - filter.add(new Key(key.getBytes(StandardCharsets.UTF_8))); - } - - public boolean mightContain(String key) { - if (key == null) { - throw new NullPointerException("Key cannot by null"); - } - return filter.membershipTest(new Key(key.getBytes(StandardCharsets.UTF_8))); - } - - /** - * Serialize the bloom filter as a string. - */ - public String serializeToString() { - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - DataOutputStream dos = new DataOutputStream(baos); - try { - filter.write(dos); - byte[] bytes = baos.toByteArray(); - dos.close(); - return DatatypeConverter.printBase64Binary(bytes); - } catch (IOException e) { - throw new HoodieIndexException("Could not serialize BloomFilter instance", e); - } + /** + * Serialize the bloom filter as a string. 
+ */ + public String serializeToString() { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + DataOutputStream dos = new DataOutputStream(baos); + try { + filter.write(dos); + byte[] bytes = baos.toByteArray(); + dos.close(); + return DatatypeConverter.printBase64Binary(bytes); + } catch (IOException e) { + throw new HoodieIndexException("Could not serialize BloomFilter instance", e); } + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/HoodieCleanStat.java b/hoodie-common/src/main/java/com/uber/hoodie/common/HoodieCleanStat.java index ee3177f1d..d2de837ef 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/HoodieCleanStat.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/HoodieCleanStat.java @@ -18,7 +18,6 @@ package com.uber.hoodie.common; import com.uber.hoodie.common.model.HoodieCleaningPolicy; import com.uber.hoodie.common.table.timeline.HoodieInstant; - import java.io.Serializable; import java.util.List; import java.util.Optional; @@ -27,100 +26,102 @@ import java.util.Optional; * Collects stats about a single partition clean operation */ public class HoodieCleanStat implements Serializable { - // Policy used - private final HoodieCleaningPolicy policy; - // Partition path cleaned - private final String partitionPath; - // The patterns that were generated for the delete operation - private final List deletePathPatterns; - private final List successDeleteFiles; - // Files that could not be deleted - private final List failedDeleteFiles; - // Earliest commit that was retained in this clean - private final String earliestCommitToRetain; - public HoodieCleanStat(HoodieCleaningPolicy policy, String partitionPath, - List deletePathPatterns, List successDeleteFiles, - List failedDeleteFiles, String earliestCommitToRetain) { - this.policy = policy; - this.partitionPath = partitionPath; - this.deletePathPatterns = deletePathPatterns; - this.successDeleteFiles = successDeleteFiles; - this.failedDeleteFiles = 
failedDeleteFiles; - this.earliestCommitToRetain = earliestCommitToRetain; + // Policy used + private final HoodieCleaningPolicy policy; + // Partition path cleaned + private final String partitionPath; + // The patterns that were generated for the delete operation + private final List deletePathPatterns; + private final List successDeleteFiles; + // Files that could not be deleted + private final List failedDeleteFiles; + // Earliest commit that was retained in this clean + private final String earliestCommitToRetain; + + public HoodieCleanStat(HoodieCleaningPolicy policy, String partitionPath, + List deletePathPatterns, List successDeleteFiles, + List failedDeleteFiles, String earliestCommitToRetain) { + this.policy = policy; + this.partitionPath = partitionPath; + this.deletePathPatterns = deletePathPatterns; + this.successDeleteFiles = successDeleteFiles; + this.failedDeleteFiles = failedDeleteFiles; + this.earliestCommitToRetain = earliestCommitToRetain; + } + + public HoodieCleaningPolicy getPolicy() { + return policy; + } + + public String getPartitionPath() { + return partitionPath; + } + + public List getDeletePathPatterns() { + return deletePathPatterns; + } + + public List getSuccessDeleteFiles() { + return successDeleteFiles; + } + + public List getFailedDeleteFiles() { + return failedDeleteFiles; + } + + public String getEarliestCommitToRetain() { + return earliestCommitToRetain; + } + + public static HoodieCleanStat.Builder newBuilder() { + return new Builder(); + } + + public static class Builder { + + private HoodieCleaningPolicy policy; + private List deletePathPatterns; + private List successDeleteFiles; + private List failedDeleteFiles; + private String partitionPath; + private String earliestCommitToRetain; + + public Builder withPolicy(HoodieCleaningPolicy policy) { + this.policy = policy; + return this; } - public HoodieCleaningPolicy getPolicy() { - return policy; + public Builder withDeletePathPattern(List deletePathPatterns) { + 
this.deletePathPatterns = deletePathPatterns; + return this; } - public String getPartitionPath() { - return partitionPath; + public Builder withSuccessfulDeletes(List successDeleteFiles) { + this.successDeleteFiles = successDeleteFiles; + return this; } - public List getDeletePathPatterns() { - return deletePathPatterns; + public Builder withFailedDeletes(List failedDeleteFiles) { + this.failedDeleteFiles = failedDeleteFiles; + return this; } - public List getSuccessDeleteFiles() { - return successDeleteFiles; + public Builder withPartitionPath(String partitionPath) { + this.partitionPath = partitionPath; + return this; } - public List getFailedDeleteFiles() { - return failedDeleteFiles; + public Builder withEarliestCommitRetained(Optional earliestCommitToRetain) { + this.earliestCommitToRetain = (earliestCommitToRetain.isPresent()) ? + earliestCommitToRetain.get().getTimestamp() : + "-1"; + return this; } - public String getEarliestCommitToRetain() { - return earliestCommitToRetain; - } - - public static HoodieCleanStat.Builder newBuilder() { - return new Builder(); - } - - public static class Builder { - private HoodieCleaningPolicy policy; - private List deletePathPatterns; - private List successDeleteFiles; - private List failedDeleteFiles; - private String partitionPath; - private String earliestCommitToRetain; - - public Builder withPolicy(HoodieCleaningPolicy policy) { - this.policy = policy; - return this; - } - - public Builder withDeletePathPattern(List deletePathPatterns) { - this.deletePathPatterns = deletePathPatterns; - return this; - } - - public Builder withSuccessfulDeletes(List successDeleteFiles) { - this.successDeleteFiles = successDeleteFiles; - return this; - } - - public Builder withFailedDeletes(List failedDeleteFiles) { - this.failedDeleteFiles= failedDeleteFiles; - return this; - } - - public Builder withPartitionPath(String partitionPath) { - this.partitionPath = partitionPath; - return this; - } - - public Builder 
withEarliestCommitRetained(Optional earliestCommitToRetain) { - this.earliestCommitToRetain = (earliestCommitToRetain.isPresent()) ? - earliestCommitToRetain.get().getTimestamp() : - "-1"; - return this; - } - - public HoodieCleanStat build() { - return new HoodieCleanStat(policy, partitionPath, deletePathPatterns, - successDeleteFiles, failedDeleteFiles, earliestCommitToRetain); - } + public HoodieCleanStat build() { + return new HoodieCleanStat(policy, partitionPath, deletePathPatterns, + successDeleteFiles, failedDeleteFiles, earliestCommitToRetain); } + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/HoodieJsonPayload.java b/hoodie-common/src/main/java/com/uber/hoodie/common/HoodieJsonPayload.java index 85ccfc744..c8d89d96b 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/HoodieJsonPayload.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/HoodieJsonPayload.java @@ -19,13 +19,6 @@ package com.uber.hoodie.common; import com.uber.hoodie.avro.MercifulJsonConverter; import com.uber.hoodie.common.model.HoodieRecordPayload; import com.uber.hoodie.exception.HoodieException; - -import org.apache.avro.Schema; -import org.apache.avro.generic.IndexedRecord; -import org.apache.commons.io.IOUtils; -import org.codehaus.jackson.JsonNode; -import org.codehaus.jackson.map.ObjectMapper; - import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; @@ -34,75 +27,85 @@ import java.util.Optional; import java.util.zip.Deflater; import java.util.zip.DeflaterOutputStream; import java.util.zip.InflaterInputStream; +import org.apache.avro.Schema; +import org.apache.avro.generic.IndexedRecord; +import org.apache.commons.io.IOUtils; +import org.codehaus.jackson.JsonNode; +import org.codehaus.jackson.map.ObjectMapper; public class HoodieJsonPayload implements HoodieRecordPayload { - private byte[] jsonDataCompressed; - private int dataSize; - public HoodieJsonPayload(String json) throws IOException { - 
this.jsonDataCompressed = compressData(json); - this.dataSize = json.length(); + private byte[] jsonDataCompressed; + private int dataSize; + + public HoodieJsonPayload(String json) throws IOException { + this.jsonDataCompressed = compressData(json); + this.dataSize = json.length(); + } + + @Override + public HoodieJsonPayload preCombine(HoodieJsonPayload another) { + return this; + } + + @Override + public Optional combineAndGetUpdateValue(IndexedRecord oldRec, Schema schema) + throws IOException { + return getInsertValue(schema); + } + + @Override + public Optional getInsertValue(Schema schema) throws IOException { + MercifulJsonConverter jsonConverter = new MercifulJsonConverter(schema); + return Optional.of(jsonConverter.convert(getJsonData())); + } + + private String getJsonData() throws IOException { + return unCompressData(jsonDataCompressed); + } + + private byte[] compressData(String jsonData) throws IOException { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + Deflater deflater = new Deflater(Deflater.BEST_COMPRESSION); + DeflaterOutputStream dos = + new DeflaterOutputStream(baos, deflater, true); + try { + dos.write(jsonData.getBytes()); + } finally { + dos.flush(); + dos.close(); + // Its important to call this. 
+ // Deflater takes off-heap native memory and does not release until GC kicks in + deflater.end(); } + return baos.toByteArray(); + } - @Override public HoodieJsonPayload preCombine(HoodieJsonPayload another) { - return this; + + private String unCompressData(byte[] data) throws IOException { + InflaterInputStream iis = new InflaterInputStream(new ByteArrayInputStream(data)); + try { + StringWriter sw = new StringWriter(dataSize); + IOUtils.copy(iis, sw); + return sw.toString(); + } finally { + iis.close(); } + } - @Override public Optional combineAndGetUpdateValue(IndexedRecord oldRec, Schema schema) throws IOException { - return getInsertValue(schema); + private String getFieldFromJsonOrFail(String field) throws IOException { + JsonNode node = new ObjectMapper().readTree(getJsonData()); + if (!node.has(field)) { + throw new HoodieException("Field :" + field + " not found in payload => " + node.toString()); } + return node.get(field).getTextValue(); + } - @Override public Optional getInsertValue(Schema schema) throws IOException { - MercifulJsonConverter jsonConverter = new MercifulJsonConverter(schema); - return Optional.of(jsonConverter.convert(getJsonData())); - } + public String getRowKey(String keyColumnField) throws IOException { + return getFieldFromJsonOrFail(keyColumnField); + } - private String getJsonData() throws IOException { - return unCompressData(jsonDataCompressed); - } - - private byte[] compressData(String jsonData) throws IOException { - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - Deflater deflater = new Deflater(Deflater.BEST_COMPRESSION); - DeflaterOutputStream dos = - new DeflaterOutputStream(baos, deflater, true); - try { - dos.write(jsonData.getBytes()); - } finally { - dos.flush(); - dos.close(); - // Its important to call this. 
- // Deflater takes off-heap native memory and does not release until GC kicks in - deflater.end(); - } - return baos.toByteArray(); - } - - - private String unCompressData(byte[] data) throws IOException { - InflaterInputStream iis = new InflaterInputStream(new ByteArrayInputStream(data)); - try { - StringWriter sw = new StringWriter(dataSize); - IOUtils.copy(iis, sw); - return sw.toString(); - } finally { - iis.close(); - } - } - - private String getFieldFromJsonOrFail(String field) throws IOException { - JsonNode node = new ObjectMapper().readTree(getJsonData()); - if(!node.has(field)) { - throw new HoodieException("Field :" + field + " not found in payload => " + node.toString()); - } - return node.get(field).getTextValue(); - } - - public String getRowKey(String keyColumnField) throws IOException { - return getFieldFromJsonOrFail(keyColumnField); - } - - public String getPartitionPath(String partitionPathField) throws IOException { - return getFieldFromJsonOrFail(partitionPathField); - } + public String getPartitionPath(String partitionPathField) throws IOException { + return getFieldFromJsonOrFail(partitionPathField); + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/HoodieRollbackStat.java b/hoodie-common/src/main/java/com/uber/hoodie/common/HoodieRollbackStat.java index e9d271dbc..72afe1c73 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/HoodieRollbackStat.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/HoodieRollbackStat.java @@ -16,81 +16,82 @@ package com.uber.hoodie.common; -import org.apache.hadoop.fs.FileStatus; - -import java.io.File; import java.io.Serializable; import java.util.List; import java.util.Map; import java.util.stream.Collectors; +import org.apache.hadoop.fs.FileStatus; /** * Collects stats about a single partition clean operation */ public class HoodieRollbackStat implements Serializable { - // Partition path - private final String partitionPath; - private final List successDeleteFiles; 
- // Files that could not be deleted - private final List failedDeleteFiles; - // Count of HoodieLogFile to commandBlocks written for a particular rollback - private final Map commandBlocksCount; - public HoodieRollbackStat(String partitionPath, List successDeleteFiles, - List failedDeleteFiles, Map commandBlocksCount) { - this.partitionPath = partitionPath; - this.successDeleteFiles = successDeleteFiles; - this.failedDeleteFiles = failedDeleteFiles; - this.commandBlocksCount = commandBlocksCount; + // Partition path + private final String partitionPath; + private final List successDeleteFiles; + // Files that could not be deleted + private final List failedDeleteFiles; + // Count of HoodieLogFile to commandBlocks written for a particular rollback + private final Map commandBlocksCount; + + public HoodieRollbackStat(String partitionPath, List successDeleteFiles, + List failedDeleteFiles, Map commandBlocksCount) { + this.partitionPath = partitionPath; + this.successDeleteFiles = successDeleteFiles; + this.failedDeleteFiles = failedDeleteFiles; + this.commandBlocksCount = commandBlocksCount; + } + + public Map getCommandBlocksCount() { + return commandBlocksCount; + } + + public String getPartitionPath() { + return partitionPath; + } + + public List getSuccessDeleteFiles() { + return successDeleteFiles; + } + + public List getFailedDeleteFiles() { + return failedDeleteFiles; + } + + public static HoodieRollbackStat.Builder newBuilder() { + return new Builder(); + } + + public static class Builder { + + private List successDeleteFiles; + private List failedDeleteFiles; + private Map commandBlocksCount; + private String partitionPath; + + public Builder withDeletedFileResults(Map deletedFiles) { + //noinspection Convert2MethodRef + successDeleteFiles = deletedFiles.entrySet().stream().filter(s -> s.getValue()) + .map(s -> s.getKey().getPath().toString()).collect(Collectors.toList()); + failedDeleteFiles = deletedFiles.entrySet().stream().filter(s -> !s.getValue()) + 
.map(s -> s.getKey().getPath().toString()).collect(Collectors.toList()); + return this; } - public Map getCommandBlocksCount() { - return commandBlocksCount; + public Builder withRollbackBlockAppendResults(Map commandBlocksCount) { + this.commandBlocksCount = commandBlocksCount; + return this; } - public String getPartitionPath() { - return partitionPath; + public Builder withPartitionPath(String partitionPath) { + this.partitionPath = partitionPath; + return this; } - public List getSuccessDeleteFiles() { - return successDeleteFiles; - } - - public List getFailedDeleteFiles() { - return failedDeleteFiles; - } - - public static HoodieRollbackStat.Builder newBuilder() { - return new Builder(); - } - - public static class Builder { - private List successDeleteFiles; - private List failedDeleteFiles; - private Map commandBlocksCount; - private String partitionPath; - - public Builder withDeletedFileResults(Map deletedFiles) { - //noinspection Convert2MethodRef - successDeleteFiles = deletedFiles.entrySet().stream().filter(s -> s.getValue()) - .map(s -> s.getKey().getPath().toString()).collect(Collectors.toList()); - failedDeleteFiles = deletedFiles.entrySet().stream().filter(s -> !s.getValue()) - .map(s -> s.getKey().getPath().toString()).collect(Collectors.toList()); - return this; - } - - public Builder withRollbackBlockAppendResults(Map commandBlocksCount) { - this.commandBlocksCount = commandBlocksCount; - return this; - } - - public Builder withPartitionPath(String partitionPath) { - this.partitionPath = partitionPath; - return this; - } - - public HoodieRollbackStat build() { - return new HoodieRollbackStat(partitionPath, successDeleteFiles, failedDeleteFiles, commandBlocksCount); - } + public HoodieRollbackStat build() { + return new HoodieRollbackStat(partitionPath, successDeleteFiles, failedDeleteFiles, + commandBlocksCount); } + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/model/ActionType.java 
b/hoodie-common/src/main/java/com/uber/hoodie/common/model/ActionType.java index 5e8a8c2a4..4a4427696 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/model/ActionType.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/model/ActionType.java @@ -17,5 +17,5 @@ package com.uber.hoodie.common.model; public enum ActionType { - commit, savepoint, compaction, clean, rollback; + commit, savepoint, compaction, clean, rollback; } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/model/CompactionWriteStat.java b/hoodie-common/src/main/java/com/uber/hoodie/common/model/CompactionWriteStat.java index 1ff704bbb..40f7fc363 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/model/CompactionWriteStat.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/model/CompactionWriteStat.java @@ -17,13 +17,7 @@ package com.uber.hoodie.common.model; import com.fasterxml.jackson.annotation.JsonIgnoreProperties; -import com.uber.hoodie.common.util.FSUtils; import java.io.Serializable; -import java.util.List; -import java.util.Optional; -import java.util.concurrent.atomic.AtomicLong; -import java.util.stream.Collectors; -import org.apache.hadoop.fs.Path; @JsonIgnoreProperties(ignoreUnknown = true) public class CompactionWriteStat implements Serializable { @@ -34,7 +28,8 @@ public class CompactionWriteStat implements Serializable { private long totalLogFiles; private long totalRecordsToBeUpdate; - public CompactionWriteStat(HoodieWriteStat writeStat, String partitionPath, long totalLogFiles, long totalLogRecords, + public CompactionWriteStat(HoodieWriteStat writeStat, String partitionPath, long totalLogFiles, + long totalLogRecords, long totalRecordsToUpdate) { this.writeStat = writeStat; this.partitionPath = partitionPath; @@ -58,6 +53,7 @@ public class CompactionWriteStat implements Serializable { public long getTotalRecordsToBeUpdate() { return totalRecordsToBeUpdate; } + public HoodieWriteStat getHoodieWriteStat() { return 
writeStat; } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/model/FileSlice.java b/hoodie-common/src/main/java/com/uber/hoodie/common/model/FileSlice.java index be8b6c1c4..b0f4c4182 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/model/FileSlice.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/model/FileSlice.java @@ -19,79 +19,75 @@ package com.uber.hoodie.common.model; import java.io.Serializable; -import java.util.List; import java.util.Optional; import java.util.TreeSet; -import java.util.stream.Collectors; import java.util.stream.Stream; /** - * Within a file group, a slice is a combination of data file written at a commit time - * and list of log files, containing changes to the data file from that commit time + * Within a file group, a slice is a combination of data file written at a commit time and list of + * log files, containing changes to the data file from that commit time */ public class FileSlice implements Serializable { - /** - * id of the slice - */ - private String fileId; + /** + * id of the slice + */ + private String fileId; - /** - * Point in the timeline, at which the slice was created - */ - private String baseCommitTime; + /** + * Point in the timeline, at which the slice was created + */ + private String baseCommitTime; - /** - * data file, with the compacted data, for this slice - * - */ - private HoodieDataFile dataFile; + /** + * data file, with the compacted data, for this slice + */ + private HoodieDataFile dataFile; - /** - * List of appendable log files with real time data - * - Sorted with greater log version first - * - Always empty for copy_on_write storage. - */ - private final TreeSet logFiles; + /** + * List of appendable log files with real time data - Sorted with greater log version first - + * Always empty for copy_on_write storage. 
+ */ + private final TreeSet logFiles; - public FileSlice(String baseCommitTime, String fileId) { - this.fileId = fileId; - this.baseCommitTime = baseCommitTime; - this.dataFile = null; - this.logFiles = new TreeSet<>(HoodieLogFile.getLogVersionComparator()); - } + public FileSlice(String baseCommitTime, String fileId) { + this.fileId = fileId; + this.baseCommitTime = baseCommitTime; + this.dataFile = null; + this.logFiles = new TreeSet<>(HoodieLogFile.getLogVersionComparator()); + } - public void setDataFile(HoodieDataFile dataFile) { - this.dataFile = dataFile; - } + public void setDataFile(HoodieDataFile dataFile) { + this.dataFile = dataFile; + } - public void addLogFile(HoodieLogFile logFile) { - this.logFiles.add(logFile); - } + public void addLogFile(HoodieLogFile logFile) { + this.logFiles.add(logFile); + } - public Stream getLogFiles() { - return logFiles.stream(); - } + public Stream getLogFiles() { + return logFiles.stream(); + } - public String getBaseCommitTime() { - return baseCommitTime; - } + public String getBaseCommitTime() { + return baseCommitTime; + } - public String getFileId() { - return fileId; - } + public String getFileId() { + return fileId; + } - public Optional getDataFile() { - return Optional.ofNullable(dataFile); - } + public Optional getDataFile() { + return Optional.ofNullable(dataFile); + } - @Override - public String toString() { - final StringBuilder sb = new StringBuilder("FileSlice {"); - sb.append("baseCommitTime=").append(baseCommitTime); - sb.append(", dataFile='").append(dataFile).append('\''); - sb.append(", logFiles='").append(logFiles).append('\''); - sb.append('}'); - return sb.toString(); - } + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("FileSlice {"); + sb.append("baseCommitTime=").append(baseCommitTime); + sb.append(", dataFile='").append(dataFile).append('\''); + sb.append(", logFiles='").append(logFiles).append('\''); + sb.append('}'); + return sb.toString(); + } } diff 
--git a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieArchivedLogFile.java b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieArchivedLogFile.java index 100cd8382..fb2038f38 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieArchivedLogFile.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieArchivedLogFile.java @@ -23,18 +23,18 @@ import org.apache.hadoop.fs.Path; public class HoodieArchivedLogFile extends HoodieLogFile { - public static final String ARCHIVE_EXTENSION = ".archive"; + public static final String ARCHIVE_EXTENSION = ".archive"; - public HoodieArchivedLogFile(FileStatus fileStatus) { - super(fileStatus); - } + public HoodieArchivedLogFile(FileStatus fileStatus) { + super(fileStatus); + } - public HoodieArchivedLogFile(Path logPath) { - super(logPath); - } + public HoodieArchivedLogFile(Path logPath) { + super(logPath); + } - @Override - public String toString() { - return "HoodieArchivedLogFile {" + super.getPath() + '}'; - } + @Override + public String toString() { + return "HoodieArchivedLogFile {" + super.getPath() + '}'; + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieAvroPayload.java b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieAvroPayload.java index 9fc0b3570..a6c45c737 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieAvroPayload.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieAvroPayload.java @@ -17,40 +17,37 @@ package com.uber.hoodie.common.model; import com.uber.hoodie.common.util.HoodieAvroUtils; - +import java.io.IOException; import java.util.Optional; - import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; -import java.io.IOException; - /** - * This is a payload to wrap a existing Hoodie Avro Record. 
- * Useful to create a HoodieRecord over existing GenericRecords in a hoodie datasets (useful in compactions) - * + * This is a payload to wrap an existing Hoodie Avro Record. Useful to create a HoodieRecord over + * existing GenericRecords in a hoodie dataset (useful in compactions) */ public class HoodieAvroPayload implements HoodieRecordPayload { - private final Optional record; - public HoodieAvroPayload(Optional record) { - this.record = record; - } + private final Optional record; - @Override - public HoodieAvroPayload preCombine(HoodieAvroPayload another) { - return this; - } + public HoodieAvroPayload(Optional record) { + this.record = record; + } - @Override - public Optional combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) - throws IOException { - return getInsertValue(schema); - } + @Override + public HoodieAvroPayload preCombine(HoodieAvroPayload another) { + return this; + } - @Override - public Optional getInsertValue(Schema schema) throws IOException { - return record.map(r -> HoodieAvroUtils.rewriteRecord(r, schema)); - } + @Override + public Optional combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) + throws IOException { + return getInsertValue(schema); + } + + @Override + public Optional getInsertValue(Schema schema) throws IOException { + return record.map(r -> HoodieAvroUtils.rewriteRecord(r, schema)); + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieCleaningPolicy.java b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieCleaningPolicy.java index c351ef1b9..4b12b19e6 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieCleaningPolicy.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieCleaningPolicy.java @@ -17,6 +17,6 @@ package com.uber.hoodie.common.model; public enum HoodieCleaningPolicy { - KEEP_LATEST_FILE_VERSIONS, - KEEP_LATEST_COMMITS + KEEP_LATEST_FILE_VERSIONS, + KEEP_LATEST_COMMITS } diff --git 
a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieCommitMetadata.java b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieCommitMetadata.java index 19787ebed..47253637b 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieCommitMetadata.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieCommitMetadata.java @@ -17,8 +17,13 @@ package com.uber.hoodie.common.model; import com.fasterxml.jackson.annotation.JsonIgnoreProperties; - -import com.fasterxml.jackson.databind.DeserializationFeature; +import java.io.IOException; +import java.io.Serializable; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -27,196 +32,195 @@ import org.codehaus.jackson.annotate.JsonMethod; import org.codehaus.jackson.map.DeserializationConfig.Feature; import org.codehaus.jackson.map.ObjectMapper; -import java.io.IOException; -import java.io.Serializable; -import java.nio.charset.Charset; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - /** * All the metadata that gets stored along with a commit. 
*/ @JsonIgnoreProperties(ignoreUnknown = true) public class HoodieCommitMetadata implements Serializable { - private static volatile Logger log = LogManager.getLogger(HoodieCommitMetadata.class); - protected Map> partitionToWriteStats; - private Map extraMetadataMap; + private static volatile Logger log = LogManager.getLogger(HoodieCommitMetadata.class); + protected Map> partitionToWriteStats; - public HoodieCommitMetadata() { - extraMetadataMap = new HashMap<>(); - partitionToWriteStats = new HashMap<>(); + private Map extraMetadataMap; + + public HoodieCommitMetadata() { + extraMetadataMap = new HashMap<>(); + partitionToWriteStats = new HashMap<>(); + } + + public void addWriteStat(String partitionPath, HoodieWriteStat stat) { + if (!partitionToWriteStats.containsKey(partitionPath)) { + partitionToWriteStats.put(partitionPath, new ArrayList<>()); } + partitionToWriteStats.get(partitionPath).add(stat); + } - public void addWriteStat(String partitionPath, HoodieWriteStat stat) { - if (!partitionToWriteStats.containsKey(partitionPath)) { - partitionToWriteStats.put(partitionPath, new ArrayList<>()); + public void addMetadata(String metaKey, String value) { + extraMetadataMap.put(metaKey, value); + } + + public List getWriteStats(String partitionPath) { + return partitionToWriteStats.get(partitionPath); + } + + public Map getExtraMetadata() { + return extraMetadataMap; + } + + public Map> getPartitionToWriteStats() { + return partitionToWriteStats; + } + + public String getMetadata(String metaKey) { + return extraMetadataMap.get(metaKey); + } + + public HashMap getFileIdAndRelativePaths() { + HashMap filePaths = new HashMap<>(); + // list all partitions paths + for (Map.Entry> entry : getPartitionToWriteStats().entrySet()) { + for (HoodieWriteStat stat : entry.getValue()) { + filePaths.put(stat.getFileId(), stat.getPath()); + } + } + return filePaths; + } + + public HashMap getFileIdAndFullPaths(String basePath) { + HashMap fullPaths = new HashMap<>(); + for 
(Map.Entry entry : getFileIdAndRelativePaths().entrySet()) { + String fullPath = + (entry.getValue() != null) ? (new Path(basePath, entry.getValue())).toString() : null; + fullPaths.put(entry.getKey(), fullPath); + } + return fullPaths; + } + + public String toJsonString() throws IOException { + if (partitionToWriteStats.containsKey(null)) { + log.info("partition path is null for " + partitionToWriteStats.get(null)); + partitionToWriteStats.remove(null); + } + ObjectMapper mapper = new ObjectMapper(); + mapper.setVisibility(JsonMethod.FIELD, JsonAutoDetect.Visibility.ANY); + return mapper.defaultPrettyPrintingWriter().writeValueAsString(this); + } + + public static HoodieCommitMetadata fromJsonString(String jsonStr) throws IOException { + if (jsonStr == null || jsonStr.isEmpty()) { + // For empty commit file (no data or something bad happened). + return new HoodieCommitMetadata(); + } + ObjectMapper mapper = new ObjectMapper(); + mapper.configure(Feature.FAIL_ON_UNKNOWN_PROPERTIES, false); + mapper.setVisibility(JsonMethod.FIELD, JsonAutoDetect.Visibility.ANY); + return mapper.readValue(jsonStr, HoodieCommitMetadata.class); + } + + // Here the functions are named "fetch" instead of "get", to avoid the json conversion. 
+ public long fetchTotalPartitionsWritten() { + return partitionToWriteStats.size(); + } + + public long fetchTotalFilesInsert() { + long totalFilesInsert = 0; + for (List stats : partitionToWriteStats.values()) { + for (HoodieWriteStat stat : stats) { + if (stat.getPrevCommit() != null && stat.getPrevCommit().equals("null")) { + totalFilesInsert++; } - partitionToWriteStats.get(partitionPath).add(stat); + } } + return totalFilesInsert; + } - public void addMetadata(String metaKey, String value) { - extraMetadataMap.put(metaKey, value); - } - - public List getWriteStats(String partitionPath) { - return partitionToWriteStats.get(partitionPath); - } - - public Map getExtraMetadata() { return extraMetadataMap; } - - public Map> getPartitionToWriteStats() { - return partitionToWriteStats; - } - - public String getMetadata(String metaKey) { - return extraMetadataMap.get(metaKey); - } - - public HashMap getFileIdAndRelativePaths() { - HashMap filePaths = new HashMap<>(); - // list all partitions paths - for (Map.Entry> entry: getPartitionToWriteStats().entrySet()) { - for (HoodieWriteStat stat: entry.getValue()) { - filePaths.put(stat.getFileId(), stat.getPath()); - } + public long fetchTotalFilesUpdated() { + long totalFilesUpdated = 0; + for (List stats : partitionToWriteStats.values()) { + for (HoodieWriteStat stat : stats) { + if (stat.getPrevCommit() != null && !stat.getPrevCommit().equals("null")) { + totalFilesUpdated++; } - return filePaths; + } } + return totalFilesUpdated; + } - public HashMap getFileIdAndFullPaths(String basePath) { - HashMap fullPaths = new HashMap<>(); - for (Map.Entry entry: getFileIdAndRelativePaths().entrySet()) { - String fullPath = (entry.getValue() != null) ? 
(new Path(basePath, entry.getValue())).toString() : null; - fullPaths.put(entry.getKey(), fullPath); - } return fullPaths; + public long fetchTotalUpdateRecordsWritten() { + long totalUpdateRecordsWritten = 0; + for (List stats : partitionToWriteStats.values()) { + for (HoodieWriteStat stat : stats) { + totalUpdateRecordsWritten += stat.getNumUpdateWrites(); + } } + return totalUpdateRecordsWritten; + } - public String toJsonString() throws IOException { - if(partitionToWriteStats.containsKey(null)) { - log.info("partition path is null for " + partitionToWriteStats.get(null)); - partitionToWriteStats.remove(null); + public long fetchTotalInsertRecordsWritten() { + long totalInsertRecordsWritten = 0; + for (List stats : partitionToWriteStats.values()) { + for (HoodieWriteStat stat : stats) { + if (stat.getPrevCommit() != null && stat.getPrevCommit().equals("null")) { + totalInsertRecordsWritten += stat.getNumWrites(); } - ObjectMapper mapper = new ObjectMapper(); - mapper.setVisibility(JsonMethod.FIELD, JsonAutoDetect.Visibility.ANY); - return mapper.defaultPrettyPrintingWriter().writeValueAsString(this); + } + } + return totalInsertRecordsWritten; + } + + public long fetchTotalRecordsWritten() { + long totalRecordsWritten = 0; + for (List stats : partitionToWriteStats.values()) { + for (HoodieWriteStat stat : stats) { + totalRecordsWritten += stat.getNumWrites(); + } + } + return totalRecordsWritten; + } + + public long fetchTotalBytesWritten() { + long totalBytesWritten = 0; + for (List stats : partitionToWriteStats.values()) { + for (HoodieWriteStat stat : stats) { + totalBytesWritten += stat.getTotalWriteBytes(); + } + } + return totalBytesWritten; + } + + public long fetchTotalWriteErrors() { + long totalWriteErrors = 0; + for (List stats : partitionToWriteStats.values()) { + for (HoodieWriteStat stat : stats) { + totalWriteErrors += stat.getTotalWriteErrors(); + } + } + return totalWriteErrors; + } + + @Override + public boolean equals(Object o) { + if (this 
== o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; } - public static HoodieCommitMetadata fromJsonString(String jsonStr) throws IOException { - if (jsonStr == null || jsonStr.isEmpty()) { - // For empty commit file (no data or somethings bad happen). - return new HoodieCommitMetadata(); - } - ObjectMapper mapper = new ObjectMapper(); - mapper.configure(Feature.FAIL_ON_UNKNOWN_PROPERTIES, false); - mapper.setVisibility(JsonMethod.FIELD, JsonAutoDetect.Visibility.ANY); - return mapper.readValue(jsonStr, HoodieCommitMetadata.class); - } + HoodieCommitMetadata that = (HoodieCommitMetadata) o; - // Here the functions are named "fetch" instead of "get", to get avoid of the json conversion. - public long fetchTotalPartitionsWritten() { - return partitionToWriteStats.size(); - } + return partitionToWriteStats != null ? + partitionToWriteStats.equals(that.partitionToWriteStats) : + that.partitionToWriteStats == null; - public long fetchTotalFilesInsert() { - long totalFilesInsert = 0; - for (List stats : partitionToWriteStats.values()) { - for (HoodieWriteStat stat : stats) { - if (stat.getPrevCommit() != null && stat.getPrevCommit().equals("null")) { - totalFilesInsert ++; - } - } - } - return totalFilesInsert; - } + } - public long fetchTotalFilesUpdated() { - long totalFilesUpdated = 0; - for (List stats : partitionToWriteStats.values()) { - for (HoodieWriteStat stat : stats) { - if (stat.getPrevCommit() != null && !stat.getPrevCommit().equals("null")) { - totalFilesUpdated ++; - } - } - } - return totalFilesUpdated; - } + @Override + public int hashCode() { + return partitionToWriteStats != null ? 
partitionToWriteStats.hashCode() : 0; + } - public long fetchTotalUpdateRecordsWritten() { - long totalUpdateRecordsWritten = 0; - for (List stats : partitionToWriteStats.values()) { - for (HoodieWriteStat stat : stats) { - totalUpdateRecordsWritten += stat.getNumUpdateWrites(); - } - } - return totalUpdateRecordsWritten; - } - - public long fetchTotalInsertRecordsWritten() { - long totalInsertRecordsWritten = 0; - for (List stats : partitionToWriteStats.values()) { - for (HoodieWriteStat stat : stats) { - if (stat.getPrevCommit() != null && stat.getPrevCommit().equals("null")) { - totalInsertRecordsWritten += stat.getNumWrites(); - } - } - } - return totalInsertRecordsWritten; - } - - public long fetchTotalRecordsWritten() { - long totalRecordsWritten = 0; - for (List stats : partitionToWriteStats.values()) { - for (HoodieWriteStat stat : stats) { - totalRecordsWritten += stat.getNumWrites(); - } - } - return totalRecordsWritten; - } - - public long fetchTotalBytesWritten() { - long totalBytesWritten = 0; - for (List stats : partitionToWriteStats.values()) { - for (HoodieWriteStat stat : stats) { - totalBytesWritten += stat.getTotalWriteBytes(); - } - } - return totalBytesWritten; - } - - public long fetchTotalWriteErrors() { - long totalWriteErrors = 0; - for (List stats : partitionToWriteStats.values()) { - for (HoodieWriteStat stat : stats) { - totalWriteErrors += stat.getTotalWriteErrors(); - } - } - return totalWriteErrors; - } - - @Override - public boolean equals(Object o) { - if (this == o) - return true; - if (o == null || getClass() != o.getClass()) - return false; - - HoodieCommitMetadata that = (HoodieCommitMetadata) o; - - return partitionToWriteStats != null ? - partitionToWriteStats.equals(that.partitionToWriteStats) : - that.partitionToWriteStats == null; - - } - - @Override - public int hashCode() { - return partitionToWriteStats != null ? 
partitionToWriteStats.hashCode() : 0; - } - - public static HoodieCommitMetadata fromBytes(byte[] bytes) throws IOException { - return fromJsonString(new String(bytes, Charset.forName("utf-8"))); - } + public static HoodieCommitMetadata fromBytes(byte[] bytes) throws IOException { + return fromJsonString(new String(bytes, Charset.forName("utf-8"))); + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieCompactionMetadata.java b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieCompactionMetadata.java index dc37649a0..043098f36 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieCompactionMetadata.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieCompactionMetadata.java @@ -16,15 +16,12 @@ package com.uber.hoodie.common.model; -import com.google.common.collect.Maps; import java.io.IOException; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.function.BinaryOperator; -import java.util.function.Supplier; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.codehaus.jackson.annotate.JsonAutoDetect; @@ -33,9 +30,11 @@ import org.codehaus.jackson.map.DeserializationConfig.Feature; import org.codehaus.jackson.map.ObjectMapper; /** - * Place holder for the compaction specific meta-data, uses all the details used in a normal HoodieCommitMetadata + * Place holder for the compaction specific meta-data, uses all the details used in a normal + * HoodieCommitMetadata */ public class HoodieCompactionMetadata extends HoodieCommitMetadata { + private static volatile Logger log = LogManager.getLogger(HoodieCompactionMetadata.class); protected HashMap> partitionToCompactionWriteStats; @@ -60,7 +59,7 @@ public class HoodieCompactionMetadata extends HoodieCommitMetadata { } public String toJsonString() throws IOException { - 
if(partitionToCompactionWriteStats.containsKey(null)) { + if (partitionToCompactionWriteStats.containsKey(null)) { log.info("partition path is null for " + partitionToCompactionWriteStats.get(null)); partitionToCompactionWriteStats.remove(null); } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieDataFile.java b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieDataFile.java index adf2f09d0..baa3c755c 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieDataFile.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieDataFile.java @@ -17,56 +17,54 @@ package com.uber.hoodie.common.model; import com.uber.hoodie.common.util.FSUtils; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.PathFilter; - import java.io.Serializable; import java.util.Comparator; +import org.apache.hadoop.fs.FileStatus; public class HoodieDataFile implements Serializable { - private FileStatus fileStatus; - public HoodieDataFile(FileStatus fileStatus) { - this.fileStatus = fileStatus; - } + private FileStatus fileStatus; - public String getFileId() { - return FSUtils.getFileId(fileStatus.getPath().getName()); - } + public HoodieDataFile(FileStatus fileStatus) { + this.fileStatus = fileStatus; + } - public String getCommitTime() { - return FSUtils.getCommitTime(fileStatus.getPath().getName()); - } + public String getFileId() { + return FSUtils.getFileId(fileStatus.getPath().getName()); + } - public String getPath() { - return fileStatus.getPath().toString(); - } + public String getCommitTime() { + return FSUtils.getCommitTime(fileStatus.getPath().getName()); + } - public String getFileName() { - return fileStatus.getPath().getName(); - } + public String getPath() { + return fileStatus.getPath().toString(); + } - public FileStatus getFileStatus() { - return fileStatus; - } + public String getFileName() { + return fileStatus.getPath().getName(); + } - public static 
Comparator getCommitTimeComparator() { - return (o1, o2) -> { - // reverse the order - return o2.getCommitTime().compareTo(o1.getCommitTime()); - }; - } + public FileStatus getFileStatus() { + return fileStatus; + } - public long getFileSize() { - return fileStatus.getLen(); - } + public static Comparator getCommitTimeComparator() { + return (o1, o2) -> { + // reverse the order + return o2.getCommitTime().compareTo(o1.getCommitTime()); + }; + } - @Override - public String toString() { - final StringBuilder sb = new StringBuilder("HoodieDataFile {"); - sb.append("fileStatus=").append(fileStatus); - sb.append('}'); - return sb.toString(); - } + public long getFileSize() { + return fileStatus.getLen(); + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("HoodieDataFile {"); + sb.append("fileStatus=").append(fileStatus); + sb.append('}'); + return sb.toString(); + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieDeltaWriteStat.java b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieDeltaWriteStat.java index 2f3ee88bd..a801338d5 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieDeltaWriteStat.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieDeltaWriteStat.java @@ -24,22 +24,22 @@ import com.fasterxml.jackson.annotation.JsonIgnoreProperties; @JsonIgnoreProperties(ignoreUnknown = true) public class HoodieDeltaWriteStat extends HoodieWriteStat { - private int logVersion; - private long logOffset; + private int logVersion; + private long logOffset; - public void setLogVersion(int logVersion) { - this.logVersion = logVersion; - } + public void setLogVersion(int logVersion) { + this.logVersion = logVersion; + } - public int getLogVersion() { - return logVersion; - } + public int getLogVersion() { + return logVersion; + } - public void setLogOffset(long logOffset) { - this.logOffset = logOffset; - } + public void setLogOffset(long logOffset) { 
+ this.logOffset = logOffset; + } - public long getLogOffset() { - return logOffset; - } + public long getLogOffset() { + return logOffset; + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieFileFormat.java b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieFileFormat.java index 8ef06ba53..497a9a3c8 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieFileFormat.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieFileFormat.java @@ -17,15 +17,15 @@ package com.uber.hoodie.common.model; public enum HoodieFileFormat { - PARQUET(".parquet"), HOODIE_LOG(".log"); + PARQUET(".parquet"), HOODIE_LOG(".log"); - private final String extension; + private final String extension; - HoodieFileFormat(String extension) { - this.extension = extension; - } + HoodieFileFormat(String extension) { + this.extension = extension; + } - public String getFileExtension() { - return extension; - } + public String getFileExtension() { + return extension; + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieFileGroup.java b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieFileGroup.java index 97781850a..d5884fb99 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieFileGroup.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieFileGroup.java @@ -20,9 +20,6 @@ package com.uber.hoodie.common.model; import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.table.timeline.HoodieInstant; - -import org.apache.commons.lang3.tuple.Pair; - import java.io.Serializable; import java.util.Comparator; import java.util.List; @@ -35,212 +32,184 @@ import java.util.stream.Stream; */ public class HoodieFileGroup implements Serializable { - public static Comparator getReverseCommitTimeComparator() { - return (o1, o2) -> { - // reverse the order - return o2.compareTo(o1); - }; + public static Comparator 
getReverseCommitTimeComparator() { + return (o1, o2) -> { + // reverse the order + return o2.compareTo(o1); + }; + } + + + /** + * Partition containing the file group. + */ + private final String partitionPath; + + /** + * uniquely identifies the file group + */ + private final String id; + + /** + * Slices of files in this group, sorted with greater commit first. + */ + private final TreeMap fileSlices; + + /** + * Timeline, based on which all getter work + */ + private final HoodieTimeline timeline; + + /** + * The last completed instant, that acts as a high watermark for all getters + */ + private final Optional lastInstant; + + public HoodieFileGroup(String partitionPath, String id, HoodieTimeline timeline) { + this.partitionPath = partitionPath; + this.id = id; + this.fileSlices = new TreeMap<>(HoodieFileGroup.getReverseCommitTimeComparator()); + this.timeline = timeline; + this.lastInstant = timeline.lastInstant(); + } + + /** + * Add a new datafile into the file group + */ + public void addDataFile(HoodieDataFile dataFile) { + if (!fileSlices.containsKey(dataFile.getCommitTime())) { + fileSlices.put(dataFile.getCommitTime(), new FileSlice(dataFile.getCommitTime(), id)); } + fileSlices.get(dataFile.getCommitTime()).setDataFile(dataFile); + } - - /** - * Partition containing the file group. - */ - private final String partitionPath; - - /** - * uniquely identifies the file group - */ - private final String id; - - /** - * Slices of files in this group, sorted with greater commit first. 
- */ - private final TreeMap fileSlices; - - /** - * Timeline, based on which all getter work - */ - private final HoodieTimeline timeline; - - /** - * The last completed instant, that acts as a high watermark for all - * getters - */ - private final Optional lastInstant; - - public HoodieFileGroup(String partitionPath, String id, HoodieTimeline timeline) { - this.partitionPath = partitionPath; - this.id = id; - this.fileSlices = new TreeMap<>(HoodieFileGroup.getReverseCommitTimeComparator()); - this.timeline = timeline; - this.lastInstant = timeline.lastInstant(); + /** + * Add a new log file into the group + */ + public void addLogFile(HoodieLogFile logFile) { + if (!fileSlices.containsKey(logFile.getBaseCommitTime())) { + fileSlices.put(logFile.getBaseCommitTime(), new FileSlice(logFile.getBaseCommitTime(), id)); } + fileSlices.get(logFile.getBaseCommitTime()).addLogFile(logFile); + } - /** - * Add a new datafile into the file group - * - * @param dataFile - */ - public void addDataFile(HoodieDataFile dataFile) { - if (!fileSlices.containsKey(dataFile.getCommitTime())) { - fileSlices.put(dataFile.getCommitTime(), new FileSlice(dataFile.getCommitTime(), id)); - } - fileSlices.get(dataFile.getCommitTime()).setDataFile(dataFile); + public String getId() { + return id; + } + + public String getPartitionPath() { + return partitionPath; + } + + /** + * A FileSlice is considered committed, if one of the following is true - There is a committed + * data file - There are some log files, that are based off a commit or delta commit + */ + private boolean isFileSliceCommitted(FileSlice slice) { + String maxCommitTime = lastInstant.get().getTimestamp(); + return timeline.containsOrBeforeTimelineStarts(slice.getBaseCommitTime()) && + HoodieTimeline.compareTimestamps(slice.getBaseCommitTime(), + maxCommitTime, + HoodieTimeline.LESSER_OR_EQUAL); + + } + + /** + * Provides a stream of committed file slices, sorted reverse base commit time. 
+ */ + public Stream getAllFileSlices() { + if (!timeline.empty()) { + return fileSlices.entrySet().stream() + .map(sliceEntry -> sliceEntry.getValue()) + .filter(slice -> isFileSliceCommitted(slice)); } + return Stream.empty(); + } - /** - * Add a new log file into the group - * - * @param logFile - */ - public void addLogFile(HoodieLogFile logFile) { - if (!fileSlices.containsKey(logFile.getBaseCommitTime())) { - fileSlices.put(logFile.getBaseCommitTime(), new FileSlice(logFile.getBaseCommitTime(), id)); - } - fileSlices.get(logFile.getBaseCommitTime()).addLogFile(logFile); + /** + * Gets the latest slice - this can contain either + * + * - just the log files without data file - (or) data file with 0 or more log files + */ + public Optional getLatestFileSlice() { + // there should always be one + return getAllFileSlices().findFirst(); + } + + /** + * Obtain the latest file slice, upto a commitTime i.e <= maxCommitTime + */ + public Optional getLatestFileSliceBeforeOrOn(String maxCommitTime) { + return getAllFileSlices() + .filter(slice -> + HoodieTimeline.compareTimestamps(slice.getBaseCommitTime(), + maxCommitTime, + HoodieTimeline.LESSER_OR_EQUAL)) + .findFirst(); + } + + public Optional getLatestFileSliceInRange(List commitRange) { + return getAllFileSlices() + .filter(slice -> commitRange.contains(slice.getBaseCommitTime())) + .findFirst(); + } + + /** + * Stream of committed data files, sorted reverse commit time + */ + public Stream getAllDataFiles() { + return getAllFileSlices() + .filter(slice -> slice.getDataFile().isPresent()) + .map(slice -> slice.getDataFile().get()); + } + + /** + * Get the latest committed data file + */ + public Optional getLatestDataFile() { + return getAllDataFiles().findFirst(); + } + + /** + * Get the latest data file, that is <= max commit time + */ + public Optional getLatestDataFileBeforeOrOn(String maxCommitTime) { + return getAllDataFiles() + .filter(dataFile -> + HoodieTimeline.compareTimestamps(dataFile.getCommitTime(), 
+ maxCommitTime, + HoodieTimeline.LESSER_OR_EQUAL)) + .findFirst(); + } + + /** + * Get the latest data file, that is contained within the provided commit range. + */ + public Optional getLatestDataFileInRange(List commitRange) { + return getAllDataFiles() + .filter(dataFile -> commitRange.contains(dataFile.getCommitTime())) + .findFirst(); + } + + /** + * Obtain the latest log file (based on latest committed data file), currently being appended to + * + * @return logfile if present, empty if no log file has been opened already. + */ + public Optional getLatestLogFile() { + Optional latestSlice = getLatestFileSlice(); + if (latestSlice.isPresent() && latestSlice.get().getLogFiles().count() > 0) { + return latestSlice.get().getLogFiles().findFirst(); } + return Optional.empty(); + } - public String getId() { - return id; - } - - public String getPartitionPath() { - return partitionPath; - } - - /** - * A FileSlice is considered committed, if one of the following is true - * - There is a committed data file - * - There are some log files, that are based off a commit or delta commit - * - * @param slice - * @return - */ - private boolean isFileSliceCommitted(FileSlice slice) { - String maxCommitTime = lastInstant.get().getTimestamp(); - return timeline.containsOrBeforeTimelineStarts(slice.getBaseCommitTime()) && - HoodieTimeline.compareTimestamps(slice.getBaseCommitTime(), - maxCommitTime, - HoodieTimeline.LESSER_OR_EQUAL); - - } - - /** - * Provides a stream of committed file slices, sorted reverse base commit time. 
- * - * @return - */ - public Stream getAllFileSlices() { - if (!timeline.empty()) { - return fileSlices.entrySet().stream() - .map(sliceEntry -> sliceEntry.getValue()) - .filter(slice -> isFileSliceCommitted(slice)); - } - return Stream.empty(); - } - - /** - * Gets the latest slice - this can contain either - * - * - just the log files without data file - * - (or) data file with 0 or more log files - * - * @return - */ - public Optional getLatestFileSlice() { - // there should always be one - return getAllFileSlices().findFirst(); - } - - /** - * Obtain the latest file slice, upto a commitTime i.e <= maxCommitTime - * - * @param maxCommitTime - * @return - */ - public Optional getLatestFileSliceBeforeOrOn(String maxCommitTime) { - return getAllFileSlices() - .filter(slice -> - HoodieTimeline.compareTimestamps(slice.getBaseCommitTime(), - maxCommitTime, - HoodieTimeline.LESSER_OR_EQUAL)) - .findFirst(); - } - - public Optional getLatestFileSliceInRange(List commitRange) { - return getAllFileSlices() - .filter(slice -> commitRange.contains(slice.getBaseCommitTime())) - .findFirst(); - } - - /** - * Stream of committed data files, sorted reverse commit time - * - * @return - */ - public Stream getAllDataFiles() { - return getAllFileSlices() - .filter(slice -> slice.getDataFile().isPresent()) - .map(slice -> slice.getDataFile().get()); - } - - /** - * Get the latest committed data file - * - * @return - */ - public Optional getLatestDataFile() { - return getAllDataFiles().findFirst(); - } - - /** - * Get the latest data file, that is <= max commit time - * - * @param maxCommitTime - * @return - */ - public Optional getLatestDataFileBeforeOrOn(String maxCommitTime) { - return getAllDataFiles() - .filter(dataFile -> - HoodieTimeline.compareTimestamps(dataFile.getCommitTime(), - maxCommitTime, - HoodieTimeline.LESSER_OR_EQUAL)) - .findFirst(); - } - - /** - * Get the latest data file, that is contained within the provided commit range. 
- * - * @param commitRange - * @return - */ - public Optional getLatestDataFileInRange(List commitRange) { - return getAllDataFiles() - .filter(dataFile -> commitRange.contains(dataFile.getCommitTime())) - .findFirst(); - } - - /** - * Obtain the latest log file (based on latest committed data file), - * currently being appended to - * - * @return logfile if present, empty if no log file has been opened already. - */ - public Optional getLatestLogFile() { - Optional latestSlice = getLatestFileSlice(); - if (latestSlice.isPresent() && latestSlice.get().getLogFiles().count() > 0) { - return latestSlice.get().getLogFiles().findFirst(); - } - return Optional.empty(); - } - - @Override - public String toString() { - final StringBuilder sb = new StringBuilder("HoodieFileGroup {"); - sb.append("id=").append(id); - sb.append(", fileSlices='").append(fileSlices).append('\''); - sb.append('}'); - return sb.toString(); - } + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("HoodieFileGroup {"); + sb.append("id=").append(id); + sb.append(", fileSlices='").append(fileSlices).append('\''); + sb.append('}'); + return sb.toString(); + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieKey.java b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieKey.java index d36400434..f4545809c 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieKey.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieKey.java @@ -17,57 +17,58 @@ package com.uber.hoodie.common.model; import com.google.common.base.Objects; - import java.io.Serializable; /** * HoodieKey consists of * - * - recordKey : a recordKey that acts as primary key for a record - * - partitionPath : path to the partition that contains the record + * - recordKey : a recordKey that acts as primary key for a record - partitionPath : path to the + * partition that contains the record */ public class HoodieKey implements 
Serializable { - private final String recordKey; + private final String recordKey; - private final String partitionPath; + private final String partitionPath; - public HoodieKey(String recordKey, String partitionPath) { - this.recordKey = recordKey; - this.partitionPath = partitionPath; + public HoodieKey(String recordKey, String partitionPath) { + this.recordKey = recordKey; + this.partitionPath = partitionPath; + } + + public String getRecordKey() { + return recordKey; + } + + public String getPartitionPath() { + return partitionPath; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; } - - public String getRecordKey() { - return recordKey; + if (o == null || getClass() != o.getClass()) { + return false; } + HoodieKey otherKey = (HoodieKey) o; + return Objects.equal(recordKey, otherKey.recordKey) && + Objects.equal(partitionPath, otherKey.partitionPath); + } - public String getPartitionPath() { - return partitionPath; - } + @Override + public int hashCode() { + return Objects.hashCode(recordKey, partitionPath); + } - @Override - public boolean equals(Object o) { - if (this == o) - return true; - if (o == null || getClass() != o.getClass()) - return false; - HoodieKey otherKey = (HoodieKey) o; - return Objects.equal(recordKey, otherKey.recordKey) && - Objects.equal(partitionPath, otherKey.partitionPath); - } - - @Override - public int hashCode() { - return Objects.hashCode(recordKey, partitionPath); - } - - @Override - public String toString() { - final StringBuilder sb = new StringBuilder("HoodieKey {"); - sb.append(" recordKey=").append(recordKey); - sb.append(" partitionPath=").append(partitionPath); - sb.append('}'); - return sb.toString(); - } + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("HoodieKey {"); + sb.append(" recordKey=").append(recordKey); + sb.append(" partitionPath=").append(partitionPath); + sb.append('}'); + return sb.toString(); + } } diff --git 
a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieLogFile.java b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieLogFile.java index d1cb636d0..4e09f5f33 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieLogFile.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieLogFile.java @@ -19,13 +19,13 @@ package com.uber.hoodie.common.model; import com.uber.hoodie.common.util.FSUtils; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; import java.io.IOException; import java.io.Serializable; import java.util.Comparator; import java.util.Optional; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; /** * Abstracts a single log file. Contains methods to extract metadata like the fileId, version and @@ -34,73 +34,74 @@ import java.util.Optional; * Also contains logic to roll-over the log file */ public class HoodieLogFile implements Serializable { - public static final String DELTA_EXTENSION = ".log"; - private final Path path; - private Optional fileStatus; + public static final String DELTA_EXTENSION = ".log"; - public HoodieLogFile(FileStatus fileStatus) { - this(fileStatus.getPath()); - this.fileStatus = Optional.of(fileStatus); - } + private final Path path; + private Optional fileStatus; - public HoodieLogFile(Path logPath) { - this.path = logPath; - this.fileStatus = Optional.empty(); - } + public HoodieLogFile(FileStatus fileStatus) { + this(fileStatus.getPath()); + this.fileStatus = Optional.of(fileStatus); + } - public String getFileId() { - return FSUtils.getFileIdFromLogPath(path); - } + public HoodieLogFile(Path logPath) { + this.path = logPath; + this.fileStatus = Optional.empty(); + } - public String getBaseCommitTime() { - return FSUtils.getBaseCommitTimeFromLogPath(path); - } + public String getFileId() { + return FSUtils.getFileIdFromLogPath(path); + } - 
public int getLogVersion() { - return FSUtils.getFileVersionFromLog(path); - } + public String getBaseCommitTime() { + return FSUtils.getBaseCommitTimeFromLogPath(path); + } - public String getFileExtension() { - return FSUtils.getFileExtensionFromLog(path); - } + public int getLogVersion() { + return FSUtils.getFileVersionFromLog(path); + } - public Path getPath() { - return path; - } + public String getFileExtension() { + return FSUtils.getFileExtensionFromLog(path); + } - public String getFileName() { - return path.getName(); - } + public Path getPath() { + return path; + } - public Optional getFileStatus() { - return fileStatus; - } + public String getFileName() { + return path.getName(); + } - public Optional getFileSize() { - return fileStatus.map(FileStatus::getLen); - } + public Optional getFileStatus() { + return fileStatus; + } - public HoodieLogFile rollOver(FileSystem fs) throws IOException { - String fileId = getFileId(); - String baseCommitTime = getBaseCommitTime(); - String extension = "." + FSUtils.getFileExtensionFromLog(path); - int newVersion = FSUtils - .computeNextLogVersion(fs, path.getParent(), fileId, - extension, baseCommitTime); - return new HoodieLogFile(new Path(path.getParent(), - FSUtils.makeLogFileName(fileId, extension, baseCommitTime, newVersion))); - } + public Optional getFileSize() { + return fileStatus.map(FileStatus::getLen); + } - public static Comparator getLogVersionComparator() { - return (o1, o2) -> { - // reverse the order - return new Integer(o2.getLogVersion()).compareTo(o1.getLogVersion()); - }; - } + public HoodieLogFile rollOver(FileSystem fs) throws IOException { + String fileId = getFileId(); + String baseCommitTime = getBaseCommitTime(); + String extension = "." 
+ FSUtils.getFileExtensionFromLog(path); + int newVersion = FSUtils + .computeNextLogVersion(fs, path.getParent(), fileId, + extension, baseCommitTime); + return new HoodieLogFile(new Path(path.getParent(), + FSUtils.makeLogFileName(fileId, extension, baseCommitTime, newVersion))); + } - @Override - public String toString() { - return "HoodieLogFile {" + path + '}'; - } + public static Comparator getLogVersionComparator() { + return (o1, o2) -> { + // reverse the order + return new Integer(o2.getLogVersion()).compareTo(o1.getLogVersion()); + }; + } + + @Override + public String toString() { + return "HoodieLogFile {" + path + '}'; + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodiePartitionMetadata.java b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodiePartitionMetadata.java index d0ddf16ca..ebe801811 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodiePartitionMetadata.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodiePartitionMetadata.java @@ -17,7 +17,8 @@ package com.uber.hoodie.common.model; import com.uber.hoodie.exception.HoodieException; - +import java.io.IOException; +import java.util.Properties; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; @@ -25,117 +26,119 @@ import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; -import java.io.IOException; -import java.util.Properties; - /** * The metadata that goes into the meta file in each partition */ public class HoodiePartitionMetadata { - public static final String HOODIE_PARTITION_METAFILE = ".hoodie_partition_metadata"; - public static final String PARTITION_DEPTH_KEY = "partitionDepth"; - public static final String COMMIT_TIME_KEY = "commitTime"; + public static final String HOODIE_PARTITION_METAFILE = ".hoodie_partition_metadata"; + public static final String PARTITION_DEPTH_KEY = 
"partitionDepth"; + public static final String COMMIT_TIME_KEY = "commitTime"; - /** - * Contents of the metadata - */ - private final Properties props; + /** + * Contents of the metadata + */ + private final Properties props; - /** - * Path to the partition, about which we have the metadata - */ - private final Path partitionPath; + /** + * Path to the partition, about which we have the metadata + */ + private final Path partitionPath; - private final FileSystem fs; + private final FileSystem fs; - private static Logger log = LogManager.getLogger(HoodiePartitionMetadata.class); + private static Logger log = LogManager.getLogger(HoodiePartitionMetadata.class); - /** - * Construct metadata from existing partition - */ - public HoodiePartitionMetadata(FileSystem fs, Path partitionPath) { - this.fs = fs; - this.props = new Properties(); - this.partitionPath = partitionPath; + /** + * Construct metadata from existing partition + */ + public HoodiePartitionMetadata(FileSystem fs, Path partitionPath) { + this.fs = fs; + this.props = new Properties(); + this.partitionPath = partitionPath; + } + + /** + * Construct metadata object to be written out. + */ + public HoodiePartitionMetadata(FileSystem fs, String commitTime, Path basePath, + Path partitionPath) { + this(fs, partitionPath); + props.setProperty(COMMIT_TIME_KEY, commitTime); + props + .setProperty(PARTITION_DEPTH_KEY, String.valueOf(partitionPath.depth() - basePath.depth())); + } + + public int getPartitionDepth() { + if (!props.containsKey(PARTITION_DEPTH_KEY)) { + throw new HoodieException("Could not find partitionDepth in partition metafile"); } + return Integer.parseInt(props.getProperty(PARTITION_DEPTH_KEY)); + } - /** - * Construct metadata object to be written out. 
- */ - public HoodiePartitionMetadata(FileSystem fs, String commitTime, Path basePath, Path partitionPath) { - this(fs, partitionPath); - props.setProperty(COMMIT_TIME_KEY, commitTime); - props.setProperty(PARTITION_DEPTH_KEY, String.valueOf(partitionPath.depth() - basePath.depth())); - } + /** + * Write the metadata safely into partition atomically. + */ + public void trySave(int taskPartitionId) { + Path tmpMetaPath = new Path(partitionPath, + HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE + "_" + taskPartitionId); + Path metaPath = new Path(partitionPath, HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE); + boolean metafileExists = false; - public int getPartitionDepth() { - if (!props.containsKey(PARTITION_DEPTH_KEY)) { - throw new HoodieException("Could not find partitionDepth in partition metafile"); - } - return Integer.parseInt(props.getProperty(PARTITION_DEPTH_KEY)); - } - - /** - * Write the metadata safely into partition atomically. - * - * @param taskPartitionId - */ - public void trySave(int taskPartitionId) { - Path tmpMetaPath = new Path(partitionPath, HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE + "_" + taskPartitionId); - Path metaPath = new Path(partitionPath, HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE); - boolean metafileExists = false; + try { + metafileExists = fs.exists(metaPath); + if (!metafileExists) { + // write to temporary file + FSDataOutputStream os = fs.create(tmpMetaPath, true); + props.store(os, "partition metadata"); + os.hsync(); + os.hflush(); + os.close(); + // move to actual path + fs.rename(tmpMetaPath, metaPath); + } + } catch (IOException ioe) { + log.warn( + "Error trying to save partition metadata (this is okay, as long as atleast 1 of these succced), " + + + partitionPath, ioe); + } finally { + if (!metafileExists) { try { - metafileExists = fs.exists(metaPath); - if (!metafileExists) { - // write to temporary file - FSDataOutputStream os = fs.create(tmpMetaPath, true); - props.store(os, "partition 
metadata"); - os.hsync(); - os.hflush(); - os.close(); - - // move to actual path - fs.rename(tmpMetaPath, metaPath); - } + // clean up tmp file, if still lying around + if (fs.exists(tmpMetaPath)) { + fs.delete(tmpMetaPath, false); + } } catch (IOException ioe) { - log.warn("Error trying to save partition metadata (this is okay, as long as atleast 1 of these succced), " + - partitionPath, ioe); - } finally { - if (!metafileExists) { - try { - // clean up tmp file, if still lying around - if (fs.exists(tmpMetaPath)) { - fs.delete(tmpMetaPath, false); - } - } catch (IOException ioe) { - log.warn("Error trying to clean up temporary files for " + partitionPath, ioe); - } - } + log.warn("Error trying to clean up temporary files for " + partitionPath, ioe); } + } } + } - /** - * Read out the metadata for this partition - */ - public void readFromFS() { - try { - Path metaFile = new Path(partitionPath, HOODIE_PARTITION_METAFILE); - FSDataInputStream is = fs.open(metaFile); - props.load(is); - } catch (IOException ioe) { - throw new HoodieException("Error reading Hoodie partition metadata for " + partitionPath, ioe); - } + /** + * Read out the metadata for this partition + */ + public void readFromFS() { + try { + Path metaFile = new Path(partitionPath, HOODIE_PARTITION_METAFILE); + FSDataInputStream is = fs.open(metaFile); + props.load(is); + } catch (IOException ioe) { + throw new HoodieException("Error reading Hoodie partition metadata for " + partitionPath, + ioe); } + } - // methods related to partition meta data - public static boolean hasPartitionMetadata(FileSystem fs, Path partitionPath) { - try { - return fs.exists(new Path(partitionPath, HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE)); - } catch (IOException ioe) { - throw new HoodieException("Error checking Hoodie partition metadata for " + partitionPath, ioe); - } + // methods related to partition meta data + public static boolean hasPartitionMetadata(FileSystem fs, Path partitionPath) { + try { + return 
fs.exists(new Path(partitionPath, HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE)); + } catch (IOException ioe) { + throw new HoodieException("Error checking Hoodie partition metadata for " + partitionPath, + ioe); } + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieRecord.java b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieRecord.java index 4b05a2e3c..43b023020 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieRecord.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieRecord.java @@ -17,7 +17,6 @@ package com.uber.hoodie.common.model; import com.google.common.base.Objects; - import java.io.Serializable; import java.util.Optional; @@ -26,129 +25,131 @@ import java.util.Optional; */ public class HoodieRecord implements Serializable { - public static String COMMIT_TIME_METADATA_FIELD = "_hoodie_commit_time"; - public static String COMMIT_SEQNO_METADATA_FIELD = "_hoodie_commit_seqno"; - public static String RECORD_KEY_METADATA_FIELD = "_hoodie_record_key"; - public static String PARTITION_PATH_METADATA_FIELD = "_hoodie_partition_path"; - public static String FILENAME_METADATA_FIELD = "_hoodie_file_name"; + public static String COMMIT_TIME_METADATA_FIELD = "_hoodie_commit_time"; + public static String COMMIT_SEQNO_METADATA_FIELD = "_hoodie_commit_seqno"; + public static String RECORD_KEY_METADATA_FIELD = "_hoodie_record_key"; + public static String PARTITION_PATH_METADATA_FIELD = "_hoodie_partition_path"; + public static String FILENAME_METADATA_FIELD = "_hoodie_file_name"; - /** - * Identifies the record across the table - */ - private HoodieKey key; + /** + * Identifies the record across the table + */ + private HoodieKey key; - /** - * Actual payload of the record - */ - private T data; + /** + * Actual payload of the record + */ + private T data; - /** - * Current location of record on storage. 
Filled in by looking up index - */ - private HoodieRecordLocation currentLocation; + /** + * Current location of record on storage. Filled in by looking up index + */ + private HoodieRecordLocation currentLocation; - /** - * New location of record on storage, after written - */ - private HoodieRecordLocation newLocation; + /** + * New location of record on storage, after written + */ + private HoodieRecordLocation newLocation; - public HoodieRecord(HoodieKey key, T data) { - this.key = key; - this.data = data; - this.currentLocation = null; - this.newLocation = null; + public HoodieRecord(HoodieKey key, T data) { + this.key = key; + this.data = data; + this.currentLocation = null; + this.newLocation = null; + } + + public HoodieKey getKey() { + return key; + } + + public T getData() { + if (data == null) { + throw new IllegalStateException("Payload already deflated for record."); } + return data; + } - public HoodieKey getKey() { - return key; + /** + * Release the actual payload, to ease memory pressure. To be called after the record has been + * written to storage. Once deflated, cannot be inflated. + */ + public void deflate() { + this.data = null; + } + + + /** + * Sets the current currentLocation of the record. This should happen exactly-once + */ + public HoodieRecord setCurrentLocation(HoodieRecordLocation location) { + assert currentLocation == null; + this.currentLocation = location; + return this; + } + + public HoodieRecordLocation getCurrentLocation() { + return currentLocation; + } + + /** + * Sets the new currentLocation of the record, after being written. This again should happen + * exactly-once. 
+ */ + public HoodieRecord setNewLocation(HoodieRecordLocation location) { + assert newLocation == null; + this.newLocation = location; + return this; + } + + public Optional getNewLocation() { + return Optional.of(this.newLocation); + } + + public boolean isCurrentLocationKnown() { + return this.currentLocation != null; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; } - - public T getData() { - if (data == null) { - throw new IllegalStateException("Payload already deflated for record."); - } - return data; + if (o == null || getClass() != o.getClass()) { + return false; } + HoodieRecord that = (HoodieRecord) o; + return Objects.equal(key, that.key) && + Objects.equal(data, that.data) && + Objects.equal(currentLocation, that.currentLocation) && + Objects.equal(newLocation, that.newLocation); + } - /** - * Release the actual payload, to ease memory pressure. To be called after the record - * has been written to storage. Once deflated, cannot be inflated. - */ - public void deflate() { - this.data = null; - } + @Override + public int hashCode() { + return Objects.hashCode(key, data, currentLocation, newLocation); + } + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("HoodieRecord{"); + sb.append("key=").append(key); + sb.append(", currentLocation='").append(currentLocation).append('\''); + sb.append(", newLocation='").append(newLocation).append('\''); + sb.append('}'); + return sb.toString(); + } - /** - * Sets the current currentLocation of the record. 
This should happen exactly-once - */ - public HoodieRecord setCurrentLocation(HoodieRecordLocation location) { - assert currentLocation == null; - this.currentLocation = location; - return this; - } + public static String generateSequenceId(String commitTime, int partitionId, long recordIndex) { + return commitTime + "_" + partitionId + "_" + recordIndex; + } - public HoodieRecordLocation getCurrentLocation() { - return currentLocation; - } + public String getPartitionPath() { + assert key != null; + return key.getPartitionPath(); + } - /** - * Sets the new currentLocation of the record, after being written. This again should happen - * exactly-once. - */ - public HoodieRecord setNewLocation(HoodieRecordLocation location) { - assert newLocation == null; - this.newLocation = location; - return this; - } - - public Optional getNewLocation() { - return Optional.of(this.newLocation); - } - - public boolean isCurrentLocationKnown() { - return this.currentLocation != null; - } - - @Override - public boolean equals(Object o) { - if (this == o) - return true; - if (o == null || getClass() != o.getClass()) - return false; - HoodieRecord that = (HoodieRecord) o; - return Objects.equal(key, that.key) && - Objects.equal(data, that.data) && - Objects.equal(currentLocation, that.currentLocation) && - Objects.equal(newLocation, that.newLocation); - } - - @Override - public int hashCode() { - return Objects.hashCode(key, data, currentLocation, newLocation); - } - - @Override - public String toString() { - final StringBuilder sb = new StringBuilder("HoodieRecord{"); - sb.append("key=").append(key); - sb.append(", currentLocation='").append(currentLocation).append('\''); - sb.append(", newLocation='").append(newLocation).append('\''); - sb.append('}'); - return sb.toString(); - } - - public static String generateSequenceId(String commitTime, int partitionId, long recordIndex) { - return commitTime + "_" + partitionId + "_" + recordIndex; - } - - public String getPartitionPath() { - 
assert key != null; - return key.getPartitionPath(); - } - - public String getRecordKey() { - assert key != null; - return key.getRecordKey(); - } + public String getRecordKey() { + assert key != null; + return key.getRecordKey(); + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieRecordLocation.java b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieRecordLocation.java index c84b028ff..fa5f7a04f 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieRecordLocation.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieRecordLocation.java @@ -17,7 +17,6 @@ package com.uber.hoodie.common.model; import com.google.common.base.Objects; - import java.io.Serializable; /** @@ -26,44 +25,46 @@ import java.io.Serializable; */ public class HoodieRecordLocation implements Serializable { - private final String commitTime; - private final String fileId; + private final String commitTime; + private final String fileId; - public HoodieRecordLocation(String commitTime, String fileId) { - this.commitTime = commitTime; - this.fileId = fileId; - } + public HoodieRecordLocation(String commitTime, String fileId) { + this.commitTime = commitTime; + this.fileId = fileId; + } - @Override - public boolean equals(Object o) { - if (this == o) - return true; - if (o == null || getClass() != o.getClass()) - return false; - HoodieRecordLocation otherLoc = (HoodieRecordLocation) o; - return Objects.equal(commitTime, otherLoc.commitTime) && - Objects.equal(fileId, otherLoc.fileId); + @Override + public boolean equals(Object o) { + if (this == o) { + return true; } + if (o == null || getClass() != o.getClass()) { + return false; + } + HoodieRecordLocation otherLoc = (HoodieRecordLocation) o; + return Objects.equal(commitTime, otherLoc.commitTime) && + Objects.equal(fileId, otherLoc.fileId); + } - @Override - public int hashCode() { - return Objects.hashCode(commitTime, fileId); - } + @Override + public int 
hashCode() { + return Objects.hashCode(commitTime, fileId); + } - @Override - public String toString() { - final StringBuilder sb = new StringBuilder("HoodieRecordLocation {"); - sb.append("commitTime=").append(commitTime).append(", "); - sb.append("fileId=").append(fileId); - sb.append('}'); - return sb.toString(); - } + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("HoodieRecordLocation {"); + sb.append("commitTime=").append(commitTime).append(", "); + sb.append("fileId=").append(fileId); + sb.append('}'); + return sb.toString(); + } - public String getCommitTime() { - return commitTime; - } + public String getCommitTime() { + return commitTime; + } - public String getFileId() { - return fileId; - } + public String getFileId() { + return fileId; + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieRecordPayload.java b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieRecordPayload.java index c2ca79343..c7fe8fff8 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieRecordPayload.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieRecordPayload.java @@ -16,54 +16,55 @@ package com.uber.hoodie.common.model; +import java.io.IOException; +import java.io.Serializable; import java.util.Map; +import java.util.Optional; import org.apache.avro.Schema; import org.apache.avro.generic.IndexedRecord; -import java.io.IOException; -import java.io.Serializable; -import java.util.Optional; - /** - * Every Hoodie dataset has an implementation of the HoodieRecordPayload - * This abstracts out callbacks which depend on record specific logic + * Every Hoodie dataset has an implementation of the HoodieRecordPayload This abstracts + * out callbacks which depend on record specific logic */ public interface HoodieRecordPayload extends Serializable { - /** - * When more than one HoodieRecord have the same HoodieKey, this function combines them - * before attempting 
to insert/upsert (if combining turned on in HoodieClientConfig) - */ - T preCombine(T another); - /** - * - * This methods lets you write custom merging/combining logic to produce new values - * as a function of current value on storage and whats contained in this object. - * - * eg: - * 1) You are updating counters, you may want to add counts to currentValue and write back updated counts - * 2) You may be reading DB redo logs, and merge them with current image for a database row on storage - * - * @param currentValue Current value in storage, to merge/combine this payload with - * @param schema Schema used for record - * @return new combined/merged value to be written back to storage. EMPTY to skip writing this record. - */ - Optional combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) throws IOException; + /** + * When more than one HoodieRecord have the same HoodieKey, this function combines them before + * attempting to insert/upsert (if combining turned on in HoodieClientConfig) + */ + T preCombine(T another); - /** - * Generates an avro record out of the given HoodieRecordPayload, to be written out to storage. - * Called when writing a new value for the given HoodieKey, wherein there is no existing record in - * storage to be combined against. (i.e insert) - * Return EMPTY to skip writing this record. - */ - Optional getInsertValue(Schema schema) throws IOException; + /** + * This methods lets you write custom merging/combining logic to produce new values as a function + * of current value on storage and whats contained in this object. 
+ * + * eg: 1) You are updating counters, you may want to add counts to currentValue and write back + * updated counts 2) You may be reading DB redo logs, and merge them with current image for a + * database row on storage + * + * @param currentValue Current value in storage, to merge/combine this payload with + * @param schema Schema used for record + * @return new combined/merged value to be written back to storage. EMPTY to skip writing this + * record. + */ + Optional combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) + throws IOException; - /** - * This method can be used to extract some metadata from HoodieRecordPayload. The metadata is passed - * to {@code WriteStatus.markSuccess()} and {@code WriteStatus.markFailure()} in order to compute - * some aggregate metrics using the metadata in the context of a write success or failure. - */ - default Optional> getMetadata() { - return Optional.empty(); - } + /** + * Generates an avro record out of the given HoodieRecordPayload, to be written out to storage. + * Called when writing a new value for the given HoodieKey, wherein there is no existing record in + * storage to be combined against. (i.e insert) Return EMPTY to skip writing this record. + */ + Optional getInsertValue(Schema schema) throws IOException; + + /** + * This method can be used to extract some metadata from HoodieRecordPayload. The metadata is + * passed to {@code WriteStatus.markSuccess()} and {@code WriteStatus.markFailure()} in order to + * compute some aggregate metrics using the metadata in the context of a write success or + * failure. 
+ */ + default Optional> getMetadata() { + return Optional.empty(); + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieTableType.java b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieTableType.java index fb91bc1ca..00564d627 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieTableType.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieTableType.java @@ -31,5 +31,5 @@ package com.uber.hoodie.common.model; * SIMPLE_LSM - A simple 2 level LSM tree. */ public enum HoodieTableType { - COPY_ON_WRITE, MERGE_ON_READ + COPY_ON_WRITE, MERGE_ON_READ } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieWriteStat.java b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieWriteStat.java index a56338cc4..b69aed36c 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieWriteStat.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieWriteStat.java @@ -17,8 +17,6 @@ package com.uber.hoodie.common.model; import com.fasterxml.jackson.annotation.JsonIgnoreProperties; -import com.uber.hoodie.common.util.FSUtils; - import java.io.Serializable; /** @@ -27,143 +25,153 @@ import java.io.Serializable; @JsonIgnoreProperties(ignoreUnknown = true) public class HoodieWriteStat implements Serializable { - public static final String NULL_COMMIT = "null"; + public static final String NULL_COMMIT = "null"; - /** - * Id of the file being written - */ - private String fileId; + /** + * Id of the file being written + */ + private String fileId; - /** - * Relative path to the file from the base path - */ - private String path; + /** + * Relative path to the file from the base path + */ + private String path; - /** - * The previous version of the file. (null if this is the first version. i.e insert) - */ - private String prevCommit; + /** + * The previous version of the file. (null if this is the first version. 
i.e insert) + */ + private String prevCommit; - /** - * Total number of records written for this file. - * - for updates, its the entire number of records in the file - * - for inserts, its the actual number of records inserted. - */ - private long numWrites; + /** + * Total number of records written for this file. - for updates, its the entire number of records + * in the file - for inserts, its the actual number of records inserted. + */ + private long numWrites; - /** - * Total number of records deleted. - */ - private long numDeletes; + /** + * Total number of records deleted. + */ + private long numDeletes; - /** - * Total number of records actually changed. (0 for inserts) - */ - private long numUpdateWrites; + /** + * Total number of records actually changed. (0 for inserts) + */ + private long numUpdateWrites; - /** - * Total size of file written - */ - private long totalWriteBytes; + /** + * Total size of file written + */ + private long totalWriteBytes; - /** - * Total number of records, that were n't able to be written due to errors. - */ - private long totalWriteErrors; + /** + * Total number of records, that were n't able to be written due to errors. 
+ */ + private long totalWriteErrors; - public HoodieWriteStat() { - // called by jackson json lib + public HoodieWriteStat() { + // called by jackson json lib + } + + public void setFileId(String fileId) { + this.fileId = fileId; + } + + public void setPath(String path) { + this.path = path; + } + + public void setPrevCommit(String prevCommit) { + this.prevCommit = prevCommit; + } + + public void setNumWrites(long numWrites) { + this.numWrites = numWrites; + } + + public void setNumDeletes(long numDeletes) { + this.numDeletes = numDeletes; + } + + public void setNumUpdateWrites(long numUpdateWrites) { + this.numUpdateWrites = numUpdateWrites; + } + + public long getTotalWriteBytes() { + return totalWriteBytes; + } + + public void setTotalWriteBytes(long totalWriteBytes) { + this.totalWriteBytes = totalWriteBytes; + } + + public long getTotalWriteErrors() { + return totalWriteErrors; + } + + public void setTotalWriteErrors(long totalWriteErrors) { + this.totalWriteErrors = totalWriteErrors; + } + + public String getPrevCommit() { + return prevCommit; + } + + public long getNumWrites() { + return numWrites; + } + + public long getNumDeletes() { + return numDeletes; + } + + public long getNumUpdateWrites() { + return numUpdateWrites; + } + + public String getFileId() { + return fileId; + } + + public String getPath() { + return path; + } + + + @Override + public String toString() { + return new StringBuilder() + .append("HoodieWriteStat {") + .append("path=" + path) + .append(", prevCommit='" + prevCommit + '\'') + .append(", numWrites=" + numWrites) + .append(", numDeletes=" + numDeletes) + .append(", numUpdateWrites=" + numUpdateWrites) + .append(", numWriteBytes=" + totalWriteBytes) + .append('}') + .toString(); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; } - public void setFileId(String fileId) { - this.fileId = fileId; + HoodieWriteStat that = 
(HoodieWriteStat) o; + if (!path.equals(that.path)) { + return false; } + return prevCommit.equals(that.prevCommit); - public void setPath(String path) { this.path = path; } + } - public void setPrevCommit(String prevCommit) { - this.prevCommit = prevCommit; - } - - public void setNumWrites(long numWrites) { - this.numWrites = numWrites; - } - - public void setNumDeletes(long numDeletes) { - this.numDeletes = numDeletes; - } - - public void setNumUpdateWrites(long numUpdateWrites) { - this.numUpdateWrites = numUpdateWrites; - } - - public long getTotalWriteBytes() { - return totalWriteBytes; - } - - public void setTotalWriteBytes(long totalWriteBytes) { - this.totalWriteBytes = totalWriteBytes; - } - - public long getTotalWriteErrors() { return totalWriteErrors; } - - public void setTotalWriteErrors(long totalWriteErrors) { this.totalWriteErrors = totalWriteErrors; } - - public String getPrevCommit() { - return prevCommit; - } - - public long getNumWrites() { - return numWrites; - } - - public long getNumDeletes() { - return numDeletes; - } - - public long getNumUpdateWrites() { - return numUpdateWrites; - } - - public String getFileId() { - return fileId; - } - - public String getPath() { return path; } - - - @Override - public String toString() { - return new StringBuilder() - .append("HoodieWriteStat {") - .append("path=" + path) - .append(", prevCommit='" + prevCommit + '\'') - .append(", numWrites=" + numWrites) - .append(", numDeletes=" + numDeletes) - .append(", numUpdateWrites=" + numUpdateWrites) - .append(", numWriteBytes=" + totalWriteBytes) - .append('}') - .toString(); - } - - @Override - public boolean equals(Object o) { - if (this == o) - return true; - if (o == null || getClass() != o.getClass()) - return false; - - HoodieWriteStat that = (HoodieWriteStat) o; - if (!path.equals(that.path)) - return false; - return prevCommit.equals(that.prevCommit); - - } - - @Override - public int hashCode() { - int result = path.hashCode(); - result = 31 * result 
+ prevCommit.hashCode(); - return result; - } + @Override + public int hashCode() { + int result = path.hashCode(); + result = 31 * result + prevCommit.hashCode(); + return result; + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/HoodieTableConfig.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/HoodieTableConfig.java index 48cb75ca3..8cc6c18c6 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/table/HoodieTableConfig.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/HoodieTableConfig.java @@ -19,8 +19,11 @@ package com.uber.hoodie.common.table; import com.uber.hoodie.common.model.HoodieAvroPayload; import com.uber.hoodie.common.model.HoodieFileFormat; import com.uber.hoodie.common.model.HoodieTableType; -import com.uber.hoodie.exception.HoodieException; import com.uber.hoodie.exception.HoodieIOException; +import java.io.IOException; +import java.io.Serializable; +import java.util.Date; +import java.util.Properties; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; @@ -28,144 +31,134 @@ import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; -import java.io.IOException; -import java.io.Serializable; -import java.util.Date; -import java.util.Properties; - /** * Configurations on the Hoodie Table like type of ingestion, storage formats, hive table name etc - * Configurations are loaded from hoodie.properties, these properties are usually set during initializing a path as hoodie base path - * and never changes during the lifetime of a hoodie dataset. + * Configurations are loaded from hoodie.properties, these properties are usually set during + * initializing a path as hoodie base path and never changes during the lifetime of a hoodie + * dataset. 
* * @see HoodieTableMetaClient * @since 0.3.0 */ public class HoodieTableConfig implements Serializable { - private final transient static Logger log = LogManager.getLogger(HoodieTableConfig.class); - public static final String HOODIE_PROPERTIES_FILE = "hoodie.properties"; - public static final String HOODIE_TABLE_NAME_PROP_NAME = "hoodie.table.name"; - public static final String HOODIE_TABLE_TYPE_PROP_NAME = "hoodie.table.type"; - public static final String HOODIE_RO_FILE_FORMAT_PROP_NAME = - "hoodie.table.ro.file.format"; - public static final String HOODIE_RT_FILE_FORMAT_PROP_NAME = - "hoodie.table.rt.file.format"; - public static final String HOODIE_PAYLOAD_CLASS_PROP_NAME = "hoodie.compaction.payload.class"; + private final transient static Logger log = LogManager.getLogger(HoodieTableConfig.class); - public static final HoodieTableType DEFAULT_TABLE_TYPE = HoodieTableType.COPY_ON_WRITE; - public static final HoodieFileFormat DEFAULT_RO_FILE_FORMAT = HoodieFileFormat.PARQUET; - public static final HoodieFileFormat DEFAULT_RT_FILE_FORMAT = HoodieFileFormat.HOODIE_LOG; - public static final String DEFAULT_PAYLOAD_CLASS = HoodieAvroPayload.class.getName(); - private Properties props; + public static final String HOODIE_PROPERTIES_FILE = "hoodie.properties"; + public static final String HOODIE_TABLE_NAME_PROP_NAME = "hoodie.table.name"; + public static final String HOODIE_TABLE_TYPE_PROP_NAME = "hoodie.table.type"; + public static final String HOODIE_RO_FILE_FORMAT_PROP_NAME = + "hoodie.table.ro.file.format"; + public static final String HOODIE_RT_FILE_FORMAT_PROP_NAME = + "hoodie.table.rt.file.format"; + public static final String HOODIE_PAYLOAD_CLASS_PROP_NAME = "hoodie.compaction.payload.class"; - public HoodieTableConfig(FileSystem fs, String metaPath) { - Properties props = new Properties(); - Path propertyPath = new Path(metaPath, HOODIE_PROPERTIES_FILE); - log.info("Loading dataset properties from " + propertyPath); - try { - try (FSDataInputStream 
inputStream = fs.open(propertyPath)) { - props.load(inputStream); - } - } catch (IOException e) { - throw new HoodieIOException("Could not load Hoodie properties from " + propertyPath, e); - } - this.props = props; + public static final HoodieTableType DEFAULT_TABLE_TYPE = HoodieTableType.COPY_ON_WRITE; + public static final HoodieFileFormat DEFAULT_RO_FILE_FORMAT = HoodieFileFormat.PARQUET; + public static final HoodieFileFormat DEFAULT_RT_FILE_FORMAT = HoodieFileFormat.HOODIE_LOG; + public static final String DEFAULT_PAYLOAD_CLASS = HoodieAvroPayload.class.getName(); + private Properties props; + + public HoodieTableConfig(FileSystem fs, String metaPath) { + Properties props = new Properties(); + Path propertyPath = new Path(metaPath, HOODIE_PROPERTIES_FILE); + log.info("Loading dataset properties from " + propertyPath); + try { + try (FSDataInputStream inputStream = fs.open(propertyPath)) { + props.load(inputStream); + } + } catch (IOException e) { + throw new HoodieIOException("Could not load Hoodie properties from " + propertyPath, e); } + this.props = props; + } - /** - * For serailizing and de-serializing - * @deprecated - */ - public HoodieTableConfig() { + /** + * For serailizing and de-serializing + * + * @deprecated + */ + public HoodieTableConfig() { + } + + /** + * Initialize the hoodie meta directory and any necessary files inside the meta (including the + * hoodie.properties) + */ + public static void createHoodieProperties(FileSystem fs, Path metadataFolder, + Properties properties) throws IOException { + if (!fs.exists(metadataFolder)) { + fs.mkdirs(metadataFolder); } - - /** - * Initialize the hoodie meta directory and any necessary files inside the meta (including the hoodie.properties) - * - * @param metadataFolder - * @param properties - * @throws IOException - */ - public static void createHoodieProperties(FileSystem fs, Path metadataFolder, - Properties properties) throws IOException { - if (!fs.exists(metadataFolder)) { - 
fs.mkdirs(metadataFolder); - } - Path propertyPath = new Path(metadataFolder, HOODIE_PROPERTIES_FILE); - FSDataOutputStream outputStream = fs.create(propertyPath); - try { - if (!properties.containsKey(HOODIE_TABLE_NAME_PROP_NAME)) { - throw new IllegalArgumentException( - HOODIE_TABLE_NAME_PROP_NAME + " property needs to be specified"); - } - if (!properties.containsKey(HOODIE_TABLE_TYPE_PROP_NAME)) { - properties.setProperty(HOODIE_TABLE_TYPE_PROP_NAME, DEFAULT_TABLE_TYPE.name()); - } - if (properties.getProperty(HOODIE_TABLE_TYPE_PROP_NAME) == HoodieTableType.MERGE_ON_READ.name() - && !properties.containsKey(HOODIE_PAYLOAD_CLASS_PROP_NAME)) { - properties.setProperty(HOODIE_PAYLOAD_CLASS_PROP_NAME, DEFAULT_PAYLOAD_CLASS); - } - properties - .store(outputStream, "Properties saved on " + new Date(System.currentTimeMillis())); - } finally { - outputStream.close(); - } + Path propertyPath = new Path(metadataFolder, HOODIE_PROPERTIES_FILE); + FSDataOutputStream outputStream = fs.create(propertyPath); + try { + if (!properties.containsKey(HOODIE_TABLE_NAME_PROP_NAME)) { + throw new IllegalArgumentException( + HOODIE_TABLE_NAME_PROP_NAME + " property needs to be specified"); + } + if (!properties.containsKey(HOODIE_TABLE_TYPE_PROP_NAME)) { + properties.setProperty(HOODIE_TABLE_TYPE_PROP_NAME, DEFAULT_TABLE_TYPE.name()); + } + if (properties.getProperty(HOODIE_TABLE_TYPE_PROP_NAME) == HoodieTableType.MERGE_ON_READ + .name() + && !properties.containsKey(HOODIE_PAYLOAD_CLASS_PROP_NAME)) { + properties.setProperty(HOODIE_PAYLOAD_CLASS_PROP_NAME, DEFAULT_PAYLOAD_CLASS); + } + properties + .store(outputStream, "Properties saved on " + new Date(System.currentTimeMillis())); + } finally { + outputStream.close(); } + } - /** - * Read the table type from the table properties and if not found, return the default - * - * @return - */ - public HoodieTableType getTableType() { - if (props.containsKey(HOODIE_TABLE_TYPE_PROP_NAME)) { - return 
HoodieTableType.valueOf(props.getProperty(HOODIE_TABLE_TYPE_PROP_NAME)); - } - return DEFAULT_TABLE_TYPE; + /** + * Read the table type from the table properties and if not found, return the default + */ + public HoodieTableType getTableType() { + if (props.containsKey(HOODIE_TABLE_TYPE_PROP_NAME)) { + return HoodieTableType.valueOf(props.getProperty(HOODIE_TABLE_TYPE_PROP_NAME)); } + return DEFAULT_TABLE_TYPE; + } - /** - * Read the payload class for HoodieRecords from the table properties - * - * @return - */ - public String getPayloadClass() { - return props.getProperty(HOODIE_PAYLOAD_CLASS_PROP_NAME, DEFAULT_PAYLOAD_CLASS); - } + /** + * Read the payload class for HoodieRecords from the table properties + */ + public String getPayloadClass() { + return props.getProperty(HOODIE_PAYLOAD_CLASS_PROP_NAME, DEFAULT_PAYLOAD_CLASS); + } - /** - * Read the table name - * - * @return - */ - public String getTableName() { - return props.getProperty(HOODIE_TABLE_NAME_PROP_NAME); - } + /** + * Read the table name + */ + public String getTableName() { + return props.getProperty(HOODIE_TABLE_NAME_PROP_NAME); + } - /** - * Get the Read Optimized Storage Format - * - * @return HoodieFileFormat for the Read Optimized Storage format - */ - public HoodieFileFormat getROFileFormat() { - if (props.containsKey(HOODIE_RO_FILE_FORMAT_PROP_NAME)) { - return HoodieFileFormat.valueOf(props.getProperty(HOODIE_RO_FILE_FORMAT_PROP_NAME)); - } - return DEFAULT_RO_FILE_FORMAT; + /** + * Get the Read Optimized Storage Format + * + * @return HoodieFileFormat for the Read Optimized Storage format + */ + public HoodieFileFormat getROFileFormat() { + if (props.containsKey(HOODIE_RO_FILE_FORMAT_PROP_NAME)) { + return HoodieFileFormat.valueOf(props.getProperty(HOODIE_RO_FILE_FORMAT_PROP_NAME)); } + return DEFAULT_RO_FILE_FORMAT; + } - /** - * Get the Read Optimized Storage Format - * - * @return HoodieFileFormat for the Read Optimized Storage format - */ - public HoodieFileFormat getRTFileFormat() { 
- if (props.containsKey(HOODIE_RT_FILE_FORMAT_PROP_NAME)) { - return HoodieFileFormat.valueOf(props.getProperty(HOODIE_RT_FILE_FORMAT_PROP_NAME)); - } - return DEFAULT_RT_FILE_FORMAT; + /** + * Get the Read Optimized Storage Format + * + * @return HoodieFileFormat for the Read Optimized Storage format + */ + public HoodieFileFormat getRTFileFormat() { + if (props.containsKey(HOODIE_RT_FILE_FORMAT_PROP_NAME)) { + return HoodieFileFormat.valueOf(props.getProperty(HOODIE_RT_FILE_FORMAT_PROP_NAME)); } + return DEFAULT_RT_FILE_FORMAT; + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/HoodieTableMetaClient.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/HoodieTableMetaClient.java index b1dd0296c..d012d9799 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/table/HoodieTableMetaClient.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/HoodieTableMetaClient.java @@ -21,6 +21,11 @@ import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline; import com.uber.hoodie.common.table.timeline.HoodieArchivedTimeline; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.exception.DatasetNotFoundException; +import java.io.File; +import java.io.IOException; +import java.io.Serializable; +import java.util.Objects; +import java.util.Properties; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -28,225 +33,205 @@ import org.apache.hadoop.fs.PathFilter; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; -import java.io.Externalizable; -import java.io.File; -import java.io.IOException; -import java.io.ObjectInput; -import java.io.ObjectInputStream; -import java.io.ObjectOutput; -import java.io.ObjectOutputStream; -import java.io.Serializable; -import java.util.Objects; -import java.util.Properties; - /** - * HoodieTableMetaClient allows to access meta-data about a hoodie table - * It returns meta-data about commits, 
savepoints, compactions, cleanups as a HoodieTimeline - * Create an instance of the HoodieTableMetaClient with FileSystem and basePath to start getting the meta-data. - *

- * All the timelines are computed lazily, once computed the timeline is cached and never refreshed. - * Use the HoodieTimeline.reload() to refresh timelines. + * HoodieTableMetaClient allows to access meta-data about a hoodie table It returns + * meta-data about commits, savepoints, compactions, cleanups as a HoodieTimeline + * Create an instance of the HoodieTableMetaClient with FileSystem and basePath to + * start getting the meta-data.

All the timelines are computed lazily, once computed the + * timeline is cached and never refreshed. Use the HoodieTimeline.reload() to refresh + * timelines. * * @see HoodieTimeline * @since 0.3.0 */ public class HoodieTableMetaClient implements Serializable { - private final transient static Logger log = LogManager.getLogger(HoodieTableMetaClient.class); - public static String METAFOLDER_NAME = ".hoodie"; - private String basePath; - private transient FileSystem fs; - private String metaPath; - private HoodieTableType tableType; - private HoodieTableConfig tableConfig; - private HoodieActiveTimeline activeTimeline; - private HoodieArchivedTimeline archivedTimeline; + private final transient static Logger log = LogManager.getLogger(HoodieTableMetaClient.class); + public static String METAFOLDER_NAME = ".hoodie"; - public HoodieTableMetaClient(FileSystem fs, String basePath) throws DatasetNotFoundException { - // Do not load any timeline by default - this(fs, basePath, false); + private String basePath; + private transient FileSystem fs; + private String metaPath; + private HoodieTableType tableType; + private HoodieTableConfig tableConfig; + private HoodieActiveTimeline activeTimeline; + private HoodieArchivedTimeline archivedTimeline; + + public HoodieTableMetaClient(FileSystem fs, String basePath) throws DatasetNotFoundException { + // Do not load any timeline by default + this(fs, basePath, false); + } + + public HoodieTableMetaClient(FileSystem fs, String basePath, boolean loadActiveTimelineOnLoad) + throws DatasetNotFoundException { + log.info("Loading HoodieTableMetaClient from " + basePath); + this.basePath = basePath; + this.fs = fs; + Path basePathDir = new Path(this.basePath); + this.metaPath = basePath + File.separator + METAFOLDER_NAME; + Path metaPathDir = new Path(this.metaPath); + DatasetNotFoundException.checkValidDataset(fs, basePathDir, metaPathDir); + this.tableConfig = new HoodieTableConfig(fs, metaPath); + this.tableType = 
tableConfig.getTableType(); + log.info("Finished Loading Table of type " + tableType + " from " + basePath); + if (loadActiveTimelineOnLoad) { + log.info("Loading Active commit timeline for " + basePath); + getActiveTimeline(); } + } - public HoodieTableMetaClient(FileSystem fs, String basePath, boolean loadActiveTimelineOnLoad) - throws DatasetNotFoundException { - log.info("Loading HoodieTableMetaClient from " + basePath); - this.basePath = basePath; - this.fs = fs; - Path basePathDir = new Path(this.basePath); - this.metaPath = basePath + File.separator + METAFOLDER_NAME; - Path metaPathDir = new Path(this.metaPath); - DatasetNotFoundException.checkValidDataset(fs, basePathDir, metaPathDir); - this.tableConfig = new HoodieTableConfig(fs, metaPath); - this.tableType = tableConfig.getTableType(); - log.info("Finished Loading Table of type " + tableType + " from " + basePath); - if (loadActiveTimelineOnLoad) { - log.info("Loading Active commit timeline for " + basePath); - getActiveTimeline(); - } - } + /** + * For serailizing and de-serializing + * + * @deprecated + */ + public HoodieTableMetaClient() { + } - /** - * For serailizing and de-serializing - * - * @deprecated - */ - public HoodieTableMetaClient() { - } + /** + * This method is only used when this object is deserialized in a spark executor. + * + * @deprecated + */ + private void readObject(java.io.ObjectInputStream in) + throws IOException, ClassNotFoundException { + in.defaultReadObject(); + this.fs = FSUtils.getFs(); + } - /** - * This method is only used when this object is deserialized in a spark executor. 
- * - * @deprecated - */ - private void readObject(java.io.ObjectInputStream in) - throws IOException, ClassNotFoundException { - in.defaultReadObject(); - this.fs = FSUtils.getFs(); - } + private void writeObject(java.io.ObjectOutputStream out) + throws IOException { + out.defaultWriteObject(); + } - private void writeObject(java.io.ObjectOutputStream out) - throws IOException { - out.defaultWriteObject(); - } + /** + * @return Base path + */ + public String getBasePath() { + return basePath; + } - /** - * @return Base path - */ - public String getBasePath() { - return basePath; - } + /** + * @return Hoodie Table Type + */ + public HoodieTableType getTableType() { + return tableType; + } - /** - * @return Hoodie Table Type - */ - public HoodieTableType getTableType() { - return tableType; - } + /** + * @return Meta path + */ + public String getMetaPath() { + return metaPath; + } - /** - * @return Meta path - */ - public String getMetaPath() { - return metaPath; - } + /** + * @return Table Config + */ + public HoodieTableConfig getTableConfig() { + return tableConfig; + } - /** - * @return Table Config - */ - public HoodieTableConfig getTableConfig() { - return tableConfig; - } + /** + * Get the FS implementation for this table + */ + public FileSystem getFs() { + return fs; + } - /** - * Get the FS implementation for this table - * @return - */ - public FileSystem getFs() { - return fs; + /** + * Get the active instants as a timeline + * + * @return Active instants timeline + */ + public synchronized HoodieActiveTimeline getActiveTimeline() { + if (activeTimeline == null) { + activeTimeline = new HoodieActiveTimeline(fs, metaPath); } + return activeTimeline; + } - /** - * Get the active instants as a timeline - * - * @return Active instants timeline - * @throws IOException - */ - public synchronized HoodieActiveTimeline getActiveTimeline() { - if (activeTimeline == null) { - activeTimeline = new HoodieActiveTimeline(fs, metaPath); - } - return activeTimeline; + 
/** + * Get the archived commits as a timeline. This is costly operation, as all data from the archived + * files are read. This should not be used, unless for historical debugging purposes + * + * @return Active commit timeline + */ + public synchronized HoodieArchivedTimeline getArchivedTimeline() { + if (archivedTimeline == null) { + archivedTimeline = new HoodieArchivedTimeline(fs, metaPath); } + return archivedTimeline; + } - /** - * Get the archived commits as a timeline. This is costly operation, as all data from the - * archived files are read. This should not be used, unless for historical debugging purposes - * - * @return Active commit timeline - * @throws IOException - */ - public synchronized HoodieArchivedTimeline getArchivedTimeline() { - if (archivedTimeline == null) { - archivedTimeline = new HoodieArchivedTimeline(fs, metaPath); - } - return archivedTimeline; + /** + * Helper method to initialize a given path, as a given storage type and table name + */ + public static HoodieTableMetaClient initTableType(FileSystem fs, String basePath, + HoodieTableType tableType, String tableName, String payloadClassName) throws IOException { + Properties properties = new Properties(); + properties.setProperty(HoodieTableConfig.HOODIE_TABLE_NAME_PROP_NAME, tableName); + properties.setProperty(HoodieTableConfig.HOODIE_TABLE_TYPE_PROP_NAME, tableType.name()); + if (tableType == HoodieTableType.MERGE_ON_READ) { + properties.setProperty(HoodieTableConfig.HOODIE_PAYLOAD_CLASS_PROP_NAME, payloadClassName); } + return HoodieTableMetaClient.initializePathAsHoodieDataset(fs, basePath, properties); + } - /** - * Helper method to initialize a given path, as a given storage type and table name - * - * @param fs - * @param basePath - * @param tableType - * @param tableName - * @return - * @throws IOException - */ - public static HoodieTableMetaClient initTableType(FileSystem fs, String basePath, HoodieTableType tableType, String tableName, String payloadClassName) throws 
IOException { - Properties properties = new Properties(); - properties.setProperty(HoodieTableConfig.HOODIE_TABLE_NAME_PROP_NAME, tableName); - properties.setProperty(HoodieTableConfig.HOODIE_TABLE_TYPE_PROP_NAME, tableType.name()); - if(tableType == HoodieTableType.MERGE_ON_READ) { - properties.setProperty(HoodieTableConfig.HOODIE_PAYLOAD_CLASS_PROP_NAME, payloadClassName); - } - return HoodieTableMetaClient.initializePathAsHoodieDataset(fs, basePath, properties); + /** + * Helper method to initialize a given path as a hoodie dataset with configs passed in as as + * Properties + * + * @return Instance of HoodieTableMetaClient + */ + public static HoodieTableMetaClient initializePathAsHoodieDataset(FileSystem fs, + String basePath, Properties props) throws IOException { + log.info("Initializing " + basePath + " as hoodie dataset " + basePath); + Path basePathDir = new Path(basePath); + if (!fs.exists(basePathDir)) { + fs.mkdirs(basePathDir); } + Path metaPathDir = new Path(basePath, METAFOLDER_NAME); + if (!fs.exists(metaPathDir)) { + fs.mkdirs(metaPathDir); + } + HoodieTableConfig.createHoodieProperties(fs, metaPathDir, props); + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath); + log.info("Finished initializing Table of type " + metaClient.getTableConfig().getTableType() + + " from " + basePath); + return metaClient; + } - /** - * Helper method to initialize a given path as a hoodie dataset with configs passed in as as Properties - * - * @param fs - * @param basePath - * @param props - * @return Instance of HoodieTableMetaClient - * @throws IOException - */ - public static HoodieTableMetaClient initializePathAsHoodieDataset(FileSystem fs, - String basePath, Properties props) throws IOException { - log.info("Initializing " + basePath + " as hoodie dataset " + basePath); - Path basePathDir = new Path(basePath); - if (!fs.exists(basePathDir)) { - fs.mkdirs(basePathDir); - } - Path metaPathDir = new Path(basePath, METAFOLDER_NAME); - if 
(!fs.exists(metaPathDir)) { - fs.mkdirs(metaPathDir); - } - HoodieTableConfig.createHoodieProperties(fs, metaPathDir, props); - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath); - log.info("Finished initializing Table of type " + metaClient.getTableConfig().getTableType() - + " from " + basePath); - return metaClient; - } + // HELPER METHODS TO CREATE META FILE NAMES + public static FileStatus[] scanFiles(FileSystem fs, Path metaPath, PathFilter nameFilter) + throws IOException { + return fs.listStatus(metaPath, nameFilter); + } - // HELPER METHODS TO CREATE META FILE NAMES - public static FileStatus[] scanFiles(FileSystem fs, Path metaPath, PathFilter nameFilter) - throws IOException { - return fs.listStatus(metaPath, nameFilter); + @Override + public boolean equals(Object o) { + if (this == o) { + return true; } + if (o == null || getClass() != o.getClass()) { + return false; + } + HoodieTableMetaClient that = (HoodieTableMetaClient) o; + return Objects.equals(basePath, that.basePath) && tableType == that.tableType; + } - @Override - public boolean equals(Object o) { - if (this == o) - return true; - if (o == null || getClass() != o.getClass()) - return false; - HoodieTableMetaClient that = (HoodieTableMetaClient) o; - return Objects.equals(basePath, that.basePath) && tableType == that.tableType; - } + @Override + public int hashCode() { + return Objects.hash(basePath, tableType); + } - @Override - public int hashCode() { - return Objects.hash(basePath, tableType); - } - - @Override - public String toString() { - final StringBuilder sb = new StringBuilder("HoodieTableMetaClient{"); - sb.append("basePath='").append(basePath).append('\''); - sb.append(", metaPath='").append(metaPath).append('\''); - sb.append(", tableType=").append(tableType); - sb.append('}'); - return sb.toString(); - } + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("HoodieTableMetaClient{"); + 
sb.append("basePath='").append(basePath).append('\''); + sb.append(", metaPath='").append(metaPath).append('\''); + sb.append(", tableType=").append(tableType); + sb.append('}'); + return sb.toString(); + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/HoodieTimeline.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/HoodieTimeline.java index 93377c3eb..e2001a2c7 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/table/HoodieTimeline.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/HoodieTimeline.java @@ -18,19 +18,16 @@ package com.uber.hoodie.common.table; import com.uber.hoodie.common.table.timeline.HoodieDefaultTimeline; import com.uber.hoodie.common.table.timeline.HoodieInstant; -import com.uber.hoodie.common.util.FSUtils; - import java.io.Serializable; import java.util.Optional; import java.util.function.BiPredicate; import java.util.stream.Stream; /** - * HoodieTimeline is a view of meta-data instants in the hoodie dataset. - * Instants are specific points in time represented as HoodieInstant. - *

- * Timelines are immutable once created and operations create new instance of - * timelines which filter on the instants and this can be chained. + * HoodieTimeline is a view of meta-data instants in the hoodie dataset. Instants are specific + * points in time represented as HoodieInstant.

Timelines are immutable once created and + * operations create new instance of timelines which filter on the instants and this can be + * chained. * * @see com.uber.hoodie.common.table.HoodieTableMetaClient * @see HoodieDefaultTimeline @@ -38,205 +35,195 @@ import java.util.stream.Stream; * @since 0.3.0 */ public interface HoodieTimeline extends Serializable { - String COMMIT_ACTION = "commit"; - String DELTA_COMMIT_ACTION = "deltacommit"; - String CLEAN_ACTION = "clean"; - String ROLLBACK_ACTION = "rollback"; - String SAVEPOINT_ACTION = "savepoint"; - String COMPACTION_ACTION = "compaction"; - String INFLIGHT_EXTENSION = ".inflight"; - String COMMIT_EXTENSION = "." + COMMIT_ACTION; - String DELTA_COMMIT_EXTENSION = "." + DELTA_COMMIT_ACTION; - String CLEAN_EXTENSION = "." + CLEAN_ACTION; - String ROLLBACK_EXTENSION = "." + ROLLBACK_ACTION; - String SAVEPOINT_EXTENSION = "." + SAVEPOINT_ACTION; - String COMPACTION_EXTENSION = "." + COMPACTION_ACTION; - //this is to preserve backwards compatibility on commit in-flight filenames - String INFLIGHT_COMMIT_EXTENSION = INFLIGHT_EXTENSION; - String INFLIGHT_DELTA_COMMIT_EXTENSION = "." + DELTA_COMMIT_ACTION + INFLIGHT_EXTENSION; - String INFLIGHT_CLEAN_EXTENSION = "." + CLEAN_ACTION + INFLIGHT_EXTENSION; - String INFLIGHT_ROLLBACK_EXTENSION = "." + ROLLBACK_ACTION + INFLIGHT_EXTENSION; - String INFLIGHT_SAVEPOINT_EXTENSION = "." + SAVEPOINT_ACTION + INFLIGHT_EXTENSION; - String INFLIGHT_COMPACTION_EXTENSION = "." 
+ COMPACTION_ACTION + INFLIGHT_EXTENSION; + String COMMIT_ACTION = "commit"; + String DELTA_COMMIT_ACTION = "deltacommit"; + String CLEAN_ACTION = "clean"; + String ROLLBACK_ACTION = "rollback"; + String SAVEPOINT_ACTION = "savepoint"; + String COMPACTION_ACTION = "compaction"; + String INFLIGHT_EXTENSION = ".inflight"; - /** - * Filter this timeline to just include the in-flights - * - * @return New instance of HoodieTimeline with just in-flights - */ - HoodieTimeline filterInflights(); + String COMMIT_EXTENSION = "." + COMMIT_ACTION; + String DELTA_COMMIT_EXTENSION = "." + DELTA_COMMIT_ACTION; + String CLEAN_EXTENSION = "." + CLEAN_ACTION; + String ROLLBACK_EXTENSION = "." + ROLLBACK_ACTION; + String SAVEPOINT_EXTENSION = "." + SAVEPOINT_ACTION; + String COMPACTION_EXTENSION = "." + COMPACTION_ACTION; + //this is to preserve backwards compatibility on commit in-flight filenames + String INFLIGHT_COMMIT_EXTENSION = INFLIGHT_EXTENSION; + String INFLIGHT_DELTA_COMMIT_EXTENSION = "." + DELTA_COMMIT_ACTION + INFLIGHT_EXTENSION; + String INFLIGHT_CLEAN_EXTENSION = "." + CLEAN_ACTION + INFLIGHT_EXTENSION; + String INFLIGHT_ROLLBACK_EXTENSION = "." + ROLLBACK_ACTION + INFLIGHT_EXTENSION; + String INFLIGHT_SAVEPOINT_EXTENSION = "." + SAVEPOINT_ACTION + INFLIGHT_EXTENSION; + String INFLIGHT_COMPACTION_EXTENSION = "." 
+ COMPACTION_ACTION + INFLIGHT_EXTENSION; - /** - * Filter this timeline to just include the completed instants - * - * @return New instance of HoodieTimeline with just completed instants - */ - HoodieTimeline filterCompletedInstants(); + /** + * Filter this timeline to just include the in-flights + * + * @return New instance of HoodieTimeline with just in-flights + */ + HoodieTimeline filterInflights(); + + /** + * Filter this timeline to just include the completed instants + * + * @return New instance of HoodieTimeline with just completed instants + */ + HoodieTimeline filterCompletedInstants(); - /** - * Create a new Timeline with instants after startTs and before or on endTs - * - * @param startTs - * @param endTs - */ - HoodieTimeline findInstantsInRange(String startTs, String endTs); + /** + * Create a new Timeline with instants after startTs and before or on endTs + */ + HoodieTimeline findInstantsInRange(String startTs, String endTs); - /** - * Create a new Timeline with all the instants after startTs - * - * @param commitTime - * @param numCommits - */ - HoodieTimeline findInstantsAfter(String commitTime, int numCommits); + /** + * Create a new Timeline with all the instants after startTs + */ + HoodieTimeline findInstantsAfter(String commitTime, int numCommits); - /** - * If the timeline has any instants - * - * @return true if timeline is empty - */ - boolean empty(); + /** + * If the timeline has any instants + * + * @return true if timeline is empty + */ + boolean empty(); - /** - * @return total number of completed instants - */ - int countInstants(); + /** + * @return total number of completed instants + */ + int countInstants(); - /** - * @return first completed instant if available - */ - Optional firstInstant(); + /** + * @return first completed instant if available + */ + Optional firstInstant(); - /** - * @param n - * @return nth completed instant from the first completed instant - */ - Optional nthInstant(int n); + /** + * @return nth completed 
instant from the first completed instant + */ + Optional nthInstant(int n); - /** - * @return last completed instant if available - */ - Optional lastInstant(); + /** + * @return last completed instant if available + */ + Optional lastInstant(); - /** - * @param n - * @return nth completed instant going back from the last completed instant - */ - Optional nthFromLastInstant(int n); + /** + * @return nth completed instant going back from the last completed instant + */ + Optional nthFromLastInstant(int n); - /** - * @return true if the passed instant is present as a completed instant on the timeline - */ - boolean containsInstant(HoodieInstant instant); + /** + * @return true if the passed instant is present as a completed instant on the timeline + */ + boolean containsInstant(HoodieInstant instant); - /** - * @return true if the passed instant is present as a completed instant on the timeline or - * if the instant is before the first completed instant in the timeline - */ - boolean containsOrBeforeTimelineStarts(String ts); + /** + * @return true if the passed instant is present as a completed instant on the timeline or if the + * instant is before the first completed instant in the timeline + */ + boolean containsOrBeforeTimelineStarts(String ts); - /** - * @return Get the stream of completed instants - */ - Stream getInstants(); + /** + * @return Get the stream of completed instants + */ + Stream getInstants(); - /** - * @return true if the passed in instant is before the first completed instant in the timeline - */ - boolean isBeforeTimelineStarts(String ts); + /** + * @return true if the passed in instant is before the first completed instant in the timeline + */ + boolean isBeforeTimelineStarts(String ts); - /** - * Read the completed instant details - * - * @param instant - * @return - */ - Optional getInstantDetails(HoodieInstant instant); + /** + * Read the completed instant details + */ + Optional getInstantDetails(HoodieInstant instant); - /** - * Helper 
methods to compare instants - **/ - BiPredicate GREATER_OR_EQUAL = - (commit1, commit2) -> commit1.compareTo(commit2) >= 0; - BiPredicate GREATER = (commit1, commit2) -> commit1.compareTo(commit2) > 0; - BiPredicate LESSER_OR_EQUAL = - (commit1, commit2) -> commit1.compareTo(commit2) <= 0; - BiPredicate LESSER = (commit1, commit2) -> commit1.compareTo(commit2) < 0; + /** + * Helper methods to compare instants + **/ + BiPredicate GREATER_OR_EQUAL = + (commit1, commit2) -> commit1.compareTo(commit2) >= 0; + BiPredicate GREATER = (commit1, commit2) -> commit1.compareTo(commit2) > 0; + BiPredicate LESSER_OR_EQUAL = + (commit1, commit2) -> commit1.compareTo(commit2) <= 0; + BiPredicate LESSER = (commit1, commit2) -> commit1.compareTo(commit2) < 0; - static boolean compareTimestamps(String commit1, String commit2, - BiPredicate predicateToApply) { - return predicateToApply.test(commit1, commit2); - } + static boolean compareTimestamps(String commit1, String commit2, + BiPredicate predicateToApply) { + return predicateToApply.test(commit1, commit2); + } - static HoodieInstant getCompletedInstant(final HoodieInstant instant) { - return new HoodieInstant(false, instant.getAction(), instant.getTimestamp()); - } + static HoodieInstant getCompletedInstant(final HoodieInstant instant) { + return new HoodieInstant(false, instant.getAction(), instant.getTimestamp()); + } - static HoodieInstant getInflightInstant(final HoodieInstant instant) { - return new HoodieInstant(true, instant.getAction(), instant.getTimestamp()); - } + static HoodieInstant getInflightInstant(final HoodieInstant instant) { + return new HoodieInstant(true, instant.getAction(), instant.getTimestamp()); + } - static String makeCommitFileName(String commitTime) { - return commitTime + HoodieTimeline.COMMIT_EXTENSION; - } + static String makeCommitFileName(String commitTime) { + return commitTime + HoodieTimeline.COMMIT_EXTENSION; + } - static String makeInflightCommitFileName(String commitTime) { - return 
commitTime + HoodieTimeline.INFLIGHT_COMMIT_EXTENSION; - } + static String makeInflightCommitFileName(String commitTime) { + return commitTime + HoodieTimeline.INFLIGHT_COMMIT_EXTENSION; + } - static String makeCleanerFileName(String instant) { - return instant + HoodieTimeline.CLEAN_EXTENSION; - } + static String makeCleanerFileName(String instant) { + return instant + HoodieTimeline.CLEAN_EXTENSION; + } - static String makeInflightCleanerFileName(String instant) { - return instant + HoodieTimeline.INFLIGHT_CLEAN_EXTENSION; - } + static String makeInflightCleanerFileName(String instant) { + return instant + HoodieTimeline.INFLIGHT_CLEAN_EXTENSION; + } - static String makeRollbackFileName(String instant) { - return instant + HoodieTimeline.ROLLBACK_EXTENSION; - } + static String makeRollbackFileName(String instant) { + return instant + HoodieTimeline.ROLLBACK_EXTENSION; + } - static String makeInflightRollbackFileName(String instant) { - return instant + HoodieTimeline.INFLIGHT_ROLLBACK_EXTENSION; - } + static String makeInflightRollbackFileName(String instant) { + return instant + HoodieTimeline.INFLIGHT_ROLLBACK_EXTENSION; + } - static String makeInflightSavePointFileName(String commitTime) { - return commitTime + HoodieTimeline.INFLIGHT_SAVEPOINT_EXTENSION; - } + static String makeInflightSavePointFileName(String commitTime) { + return commitTime + HoodieTimeline.INFLIGHT_SAVEPOINT_EXTENSION; + } - static String makeSavePointFileName(String commitTime) { - return commitTime + HoodieTimeline.SAVEPOINT_EXTENSION; - } + static String makeSavePointFileName(String commitTime) { + return commitTime + HoodieTimeline.SAVEPOINT_EXTENSION; + } - static String makeInflightCompactionFileName(String commitTime) { - return commitTime + HoodieTimeline.INFLIGHT_COMPACTION_EXTENSION; - } + static String makeInflightCompactionFileName(String commitTime) { + return commitTime + HoodieTimeline.INFLIGHT_COMPACTION_EXTENSION; + } - static String makeCompactionFileName(String 
commitTime) { - return commitTime + HoodieTimeline.COMPACTION_EXTENSION; - } + static String makeCompactionFileName(String commitTime) { + return commitTime + HoodieTimeline.COMPACTION_EXTENSION; + } - static String makeInflightDeltaFileName(String commitTime) { - return commitTime + HoodieTimeline.INFLIGHT_DELTA_COMMIT_EXTENSION; - } + static String makeInflightDeltaFileName(String commitTime) { + return commitTime + HoodieTimeline.INFLIGHT_DELTA_COMMIT_EXTENSION; + } - static String makeDeltaFileName(String commitTime) { - return commitTime + HoodieTimeline.DELTA_COMMIT_EXTENSION; - } + static String makeDeltaFileName(String commitTime) { + return commitTime + HoodieTimeline.DELTA_COMMIT_EXTENSION; + } - static String getCommitFromCommitFile(String commitFileName) { - return commitFileName.split("\\.")[0]; - } + static String getCommitFromCommitFile(String commitFileName) { + return commitFileName.split("\\.")[0]; + } - static String makeFileNameAsComplete(String fileName) { - return fileName.replace(HoodieTimeline.INFLIGHT_EXTENSION, ""); - } + static String makeFileNameAsComplete(String fileName) { + return fileName.replace(HoodieTimeline.INFLIGHT_EXTENSION, ""); + } - static String makeFileNameAsInflight(String fileName) { - return fileName + HoodieTimeline.INFLIGHT_EXTENSION; - } + static String makeFileNameAsInflight(String fileName) { + return fileName + HoodieTimeline.INFLIGHT_EXTENSION; + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/TableFileSystemView.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/TableFileSystemView.java index ee4e373b5..d9ffae790 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/table/TableFileSystemView.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/TableFileSystemView.java @@ -19,13 +19,7 @@ package com.uber.hoodie.common.table; import com.uber.hoodie.common.model.FileSlice; import com.uber.hoodie.common.model.HoodieDataFile; import 
com.uber.hoodie.common.model.HoodieFileGroup; -import com.uber.hoodie.common.model.HoodieLogFile; -import org.apache.hadoop.fs.FileStatus; - -import java.io.IOException; import java.util.List; -import java.util.Map; -import java.util.function.Predicate; import java.util.stream.Stream; /** @@ -35,70 +29,70 @@ import java.util.stream.Stream; */ public interface TableFileSystemView { - /** - * ReadOptimizedView - methods to provide a view of columnar data files only. - */ - interface ReadOptimizedView { - /** - * Stream all the latest data files in the given partition - */ - Stream getLatestDataFiles(String partitionPath); - - /** - * Stream all the latest data files, in the file system view - */ - Stream getLatestDataFiles(); - - /** - * Stream all the latest version data files in the given partition with precondition that - * commitTime(file) before maxCommitTime - */ - Stream getLatestDataFilesBeforeOrOn(String partitionPath, - String maxCommitTime); - - /** - * Stream all the latest data files pass - */ - Stream getLatestDataFilesInRange(List commitsToReturn); - - /** - * Stream all the data file versions grouped by FileId for a given partition - */ - Stream getAllDataFiles(String partitionPath); - } + /** + * ReadOptimizedView - methods to provide a view of columnar data files only. + */ + interface ReadOptimizedView { /** - * RealtimeView - methods to access a combination of columnar data files + log files with real time data. 
+ * Stream all the latest data files in the given partition */ - interface RealtimeView { - /** - * Stream all the latest file slices in the given partition - */ - Stream getLatestFileSlices(String partitionPath); - - /** - * Stream all the latest file slices in the given partition with precondition that - * commitTime(file) before maxCommitTime - */ - Stream getLatestFileSlicesBeforeOrOn(String partitionPath, - String maxCommitTime); - - /** - * Stream all the latest file slices, in the given range - */ - Stream getLatestFileSliceInRange(List commitsToReturn); - - /** - * Stream all the file slices for a given partition, latest or not. - */ - Stream getAllFileSlices(String partitionPath); - } + Stream getLatestDataFiles(String partitionPath); /** - * Stream all the file groups for a given partition - * - * @param partitionPath - * @return + * Stream all the latest data files, in the file system view */ - Stream getAllFileGroups(String partitionPath); + Stream getLatestDataFiles(); + + /** + * Stream all the latest version data files in the given partition with precondition that + * commitTime(file) before maxCommitTime + */ + Stream getLatestDataFilesBeforeOrOn(String partitionPath, + String maxCommitTime); + + /** + * Stream all the latest data files pass + */ + Stream getLatestDataFilesInRange(List commitsToReturn); + + /** + * Stream all the data file versions grouped by FileId for a given partition + */ + Stream getAllDataFiles(String partitionPath); + } + + /** + * RealtimeView - methods to access a combination of columnar data files + log files with real + * time data. 
+ */ + interface RealtimeView { + + /** + * Stream all the latest file slices in the given partition + */ + Stream getLatestFileSlices(String partitionPath); + + /** + * Stream all the latest file slices in the given partition with precondition that + * commitTime(file) before maxCommitTime + */ + Stream getLatestFileSlicesBeforeOrOn(String partitionPath, + String maxCommitTime); + + /** + * Stream all the latest file slices, in the given range + */ + Stream getLatestFileSliceInRange(List commitsToReturn); + + /** + * Stream all the file slices for a given partition, latest or not. + */ + Stream getAllFileSlices(String partitionPath); + } + + /** + * Stream all the file groups for a given partition + */ + Stream getAllFileGroups(String partitionPath); } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/HoodieCompactedLogRecordScanner.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/HoodieCompactedLogRecordScanner.java index 2060db1a1..667199233 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/HoodieCompactedLogRecordScanner.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/HoodieCompactedLogRecordScanner.java @@ -16,6 +16,9 @@ package com.uber.hoodie.common.table.log; +import static com.uber.hoodie.common.table.log.block.HoodieLogBlock.HoodieLogBlockType.CORRUPT_BLOCK; +import static com.uber.hoodie.common.table.log.block.HoodieLogBlock.LogMetadataType.INSTANT_TIME; + import com.google.common.collect.Maps; import com.uber.hoodie.common.model.HoodieKey; import com.uber.hoodie.common.model.HoodieLogFile; @@ -29,14 +32,6 @@ import com.uber.hoodie.common.table.log.block.HoodieDeleteBlock; import com.uber.hoodie.common.table.log.block.HoodieLogBlock; import com.uber.hoodie.common.util.ReflectionUtils; import com.uber.hoodie.exception.HoodieIOException; -import org.apache.avro.Schema; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.generic.IndexedRecord; 
-import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - import java.io.IOException; import java.util.ArrayDeque; import java.util.Arrays; @@ -48,17 +43,22 @@ import java.util.List; import java.util.Map; import java.util.Optional; import java.util.concurrent.atomic.AtomicLong; - -import static com.uber.hoodie.common.table.log.block.HoodieLogBlock.HoodieLogBlockType.CORRUPT_BLOCK; -import static com.uber.hoodie.common.table.log.block.HoodieLogBlock.LogMetadataType.INSTANT_TIME; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; /** - * Scans through all the blocks in a list of HoodieLogFile and builds up a compacted/merged - * list of records which will be used as a lookup table when merging the base columnar file - * with the redo log file. - * + * Scans through all the blocks in a list of HoodieLogFile and builds up a compacted/merged list of + * records which will be used as a lookup table when merging the base columnar file with the redo + * log file. 
*/ -public class HoodieCompactedLogRecordScanner implements Iterable> { +public class HoodieCompactedLogRecordScanner implements + Iterable> { + private final static Logger log = LogManager.getLogger(HoodieCompactedLogRecordScanner.class); // Final list of compacted/merged records to iterate @@ -80,10 +80,10 @@ public class HoodieCompactedLogRecordScanner implements Iterable lastBlocks = new ArrayDeque<>(); public HoodieCompactedLogRecordScanner(FileSystem fs, String basePath, List logFilePaths, - Schema readerSchema, String latestInstantTime) { + Schema readerSchema, String latestInstantTime) { this.readerSchema = readerSchema; this.latestInstantTime = latestInstantTime; - this.hoodieTableMetaClient = new HoodieTableMetaClient(fs, basePath); + this.hoodieTableMetaClient = new HoodieTableMetaClient(fs, basePath); // load class from the payload fully qualified class name this.payloadClassFQN = this.hoodieTableMetaClient.getTableConfig().getPayloadClass(); @@ -91,18 +91,18 @@ public class HoodieCompactedLogRecordScanner implements Iterable> records = Maps.newHashMap(); // iterate over the paths Iterator logFilePathsItr = logFilePaths.iterator(); - while(logFilePathsItr.hasNext()) { + while (logFilePathsItr.hasNext()) { HoodieLogFile logFile = new HoodieLogFile(new Path(logFilePathsItr.next())); log.info("Scanning log file " + logFile.getPath()); totalLogFiles.incrementAndGet(); try { // Use the HoodieLogFormatReader to iterate through the blocks in the log file HoodieLogFormatReader reader = new HoodieLogFormatReader(fs, logFile, readerSchema, true); - while(reader.hasNext()) { + while (reader.hasNext()) { HoodieLogBlock r = reader.next(); String blockInstantTime = r.getLogMetadata().get(INSTANT_TIME); - if(!HoodieTimeline.compareTimestamps(blockInstantTime, this.latestInstantTime, - HoodieTimeline.LESSER_OR_EQUAL)) { + if (!HoodieTimeline.compareTimestamps(blockInstantTime, this.latestInstantTime, + HoodieTimeline.LESSER_OR_EQUAL)) { //hit a block with instant time 
greater than should be processed, stop processing further break; } @@ -117,7 +117,7 @@ public class HoodieCompactedLogRecordScanner implements Iterable> loadRecordsFromBlock(HoodieAvroDataBlock dataBlock) { - Map> recordsFromLastBlock = Maps.newHashMap(); + private Map> loadRecordsFromBlock( + HoodieAvroDataBlock dataBlock) { + Map> recordsFromLastBlock = Maps + .newHashMap(); List recs = dataBlock.getRecords(); totalLogRecords.addAndGet(recs.size()); recs.forEach(rec -> { String key = ((GenericRecord) rec).get(HoodieRecord.RECORD_KEY_METADATA_FIELD) - .toString(); + .toString(); String partitionPath = - ((GenericRecord) rec).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD) - .toString(); + ((GenericRecord) rec).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD) + .toString(); HoodieRecord hoodieRecord = new HoodieRecord<>( - new HoodieKey(key, partitionPath), - ReflectionUtils.loadPayload(this.payloadClassFQN, new Object[]{Optional.of(rec)}, Optional.class)); + new HoodieKey(key, partitionPath), + ReflectionUtils + .loadPayload(this.payloadClassFQN, new Object[]{Optional.of(rec)}, Optional.class)); if (recordsFromLastBlock.containsKey(key)) { // Merge and store the merged record HoodieRecordPayload combinedValue = recordsFromLastBlock.get(key).getData() - .preCombine(hoodieRecord.getData()); + .preCombine(hoodieRecord.getData()); recordsFromLastBlock - .put(key, new HoodieRecord<>(new HoodieKey(key, hoodieRecord.getPartitionPath()), - combinedValue)); + .put(key, new HoodieRecord<>(new HoodieKey(key, hoodieRecord.getPartitionPath()), + combinedValue)); } else { // Put the record as is recordsFromLastBlock.put(key, hoodieRecord); @@ -211,12 +214,9 @@ public class HoodieCompactedLogRecordScanner implements Iterable> records, - Deque lastBlocks) { + Deque lastBlocks) { while (!lastBlocks.isEmpty()) { HoodieLogBlock lastBlock = lastBlocks.pop(); switch (lastBlock.getBlockType()) { @@ -234,19 +234,16 @@ public class HoodieCompactedLogRecordScanner implements Iterable> 
records, - Map> recordsFromLastBlock) { + Map> recordsFromLastBlock) { recordsFromLastBlock.forEach((key, hoodieRecord) -> { if (records.containsKey(key)) { // Merge and store the merged record HoodieRecordPayload combinedValue = records.get(key).getData() - .preCombine(hoodieRecord.getData()); + .preCombine(hoodieRecord.getData()); records.put(key, new HoodieRecord<>(new HoodieKey(key, hoodieRecord.getPartitionPath()), - combinedValue)); + combinedValue)); } else { // Put the record as is records.put(key, hoodieRecord); diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/HoodieLogFormat.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/HoodieLogFormat.java index bae8aa126..2979bb619 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/HoodieLogFormat.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/HoodieLogFormat.java @@ -29,31 +29,36 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; /** - * File Format for Hoodie Log Files. - * The File Format consists of blocks each seperated with a MAGIC sync marker. - * A Block can either be a Data block, Command block or Delete Block. - * Data Block - Contains log records serialized as Avro Binary Format - * Command Block - Specific commands like RoLLBACK_PREVIOUS-BLOCK - Tombstone for the previously written block - * Delete Block - List of keys to delete - tombstone for keys + * File Format for Hoodie Log Files. The File Format consists of blocks each seperated with a MAGIC + * sync marker. A Block can either be a Data block, Command block or Delete Block. Data Block - + * Contains log records serialized as Avro Binary Format Command Block - Specific commands like + * RoLLBACK_PREVIOUS-BLOCK - Tombstone for the previously written block Delete Block - List of keys + * to delete - tombstone for keys */ public interface HoodieLogFormat { + /** - * Magic 4 bytes we put at the start of every block in the log file. 
Sync marker. - * We could make this file specific (generate a random 4 byte magic and stick it in the file header), but this I think is suffice for now - PR + * Magic 4 bytes we put at the start of every block in the log file. Sync marker. We could make + * this file specific (generate a random 4 byte magic and stick it in the file header), but this I + * think is suffice for now - PR */ - byte [] MAGIC = new byte [] {'H', 'U', 'D', 'I'}; + byte[] MAGIC = new byte[]{'H', 'U', 'D', 'I'}; /** * Writer interface to allow appending block to this file format */ interface Writer extends Closeable { - /** @return the path to this {@link HoodieLogFormat} */ + + /** + * @return the path to this {@link HoodieLogFormat} + */ HoodieLogFile getLogFile(); /** * Append Block returns a new Writer if the log is rolled */ Writer appendBlock(HoodieLogBlock block) throws IOException, InterruptedException; + long getCurrentSize() throws IOException; } @@ -61,7 +66,10 @@ public interface HoodieLogFormat { * Reader interface which is an Iterator of HoodieLogBlock */ interface Reader extends Closeable, Iterator { - /** @return the path to this {@link HoodieLogFormat} */ + + /** + * @return the path to this {@link HoodieLogFormat} + */ HoodieLogFile getLogFile(); } @@ -70,6 +78,7 @@ public interface HoodieLogFormat { * Builder class to construct the default log format writer */ class WriterBuilder { + private final static Logger log = LogManager.getLogger(WriterBuilder.class); // Default max log file size 512 MB public static final long DEFAULT_SIZE_THRESHOLD = 512 * 1024 * 1024L; @@ -187,7 +196,8 @@ public interface HoodieLogFormat { return new WriterBuilder(); } - static HoodieLogFormat.Reader newReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema, boolean readMetadata) + static HoodieLogFormat.Reader newReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema, + boolean readMetadata) throws IOException { return new HoodieLogFormatReader(fs, logFile, readerSchema, 
readMetadata); } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/HoodieLogFormatReader.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/HoodieLogFormatReader.java index d6a513a66..4168e27cf 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/HoodieLogFormatReader.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/HoodieLogFormatReader.java @@ -17,7 +17,6 @@ package com.uber.hoodie.common.table.log; import com.google.common.base.Preconditions; - import com.uber.hoodie.common.model.HoodieLogFile; import com.uber.hoodie.common.table.log.block.HoodieAvroDataBlock; import com.uber.hoodie.common.table.log.block.HoodieCommandBlock; @@ -38,11 +37,12 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; /** - * Scans a log file and provides block level iterator on the log file - * Loads the entire block contents in memory - * Can emit either a DataBlock, CommandBlock, DeleteBlock or CorruptBlock (if one is found) + * Scans a log file and provides block level iterator on the log file Loads the entire block + * contents in memory Can emit either a DataBlock, CommandBlock, DeleteBlock or CorruptBlock (if one + * is found) */ public class HoodieLogFormatReader implements HoodieLogFormat.Reader { + private static final int DEFAULT_BUFFER_SIZE = 4096; private final static Logger log = LogManager.getLogger(HoodieLogFormatReader.class); @@ -53,14 +53,16 @@ public class HoodieLogFormatReader implements HoodieLogFormat.Reader { private HoodieLogBlock nextBlock = null; private boolean readMetadata = true; - HoodieLogFormatReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema, int bufferSize, boolean readMetadata) throws IOException { + HoodieLogFormatReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema, int bufferSize, + boolean readMetadata) throws IOException { this.inputStream = fs.open(logFile.getPath(), bufferSize); this.logFile = logFile; 
this.readerSchema = readerSchema; this.readMetadata = readMetadata; } - HoodieLogFormatReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema, boolean readMetadata) throws IOException { + HoodieLogFormatReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema, + boolean readMetadata) throws IOException { this(fs, logFile, readerSchema, DEFAULT_BUFFER_SIZE, readMetadata); } @@ -83,7 +85,7 @@ public class HoodieLogFormatReader implements HoodieLogFormat.Reader { // Skip blocksize in the stream and we should either find a sync marker (start of the next block) or EOF // If we did not find either of it, then this block is a corrupted block. boolean isCorrupted = isBlockCorrupt(blocksize); - if(isCorrupted) { + if (isCorrupted) { return createCorruptBlock(); } @@ -140,7 +142,7 @@ public class HoodieLogFormatReader implements HoodieLogFormat.Reader { } private long scanForNextAvailableBlockOffset() throws IOException { - while(true) { + while (true) { long currentPos = inputStream.getPos(); try { boolean isEOF = readMagic(); @@ -191,7 +193,7 @@ public class HoodieLogFormatReader implements HoodieLogFormat.Reader { @Override public HoodieLogBlock next() { - if(nextBlock == null) { + if (nextBlock == null) { // may be hasNext is not called hasNext(); } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/HoodieLogFormatWriter.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/HoodieLogFormatWriter.java index 277829e3e..26a0845e2 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/HoodieLogFormatWriter.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/HoodieLogFormatWriter.java @@ -16,8 +16,6 @@ package com.uber.hoodie.common.table.log; -import com.google.common.base.Preconditions; - import com.uber.hoodie.common.model.HoodieLogFile; import com.uber.hoodie.common.table.log.HoodieLogFormat.Writer; import com.uber.hoodie.common.table.log.HoodieLogFormat.WriterBuilder; @@ -35,8 
+33,8 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; /** - * HoodieLogFormatWriter can be used to append blocks to a log file - * Use HoodieLogFormat.WriterBuilder to construct + * HoodieLogFormatWriter can be used to append blocks to a log file Use + * HoodieLogFormat.WriterBuilder to construct */ public class HoodieLogFormatWriter implements HoodieLogFormat.Writer { @@ -58,7 +56,7 @@ public class HoodieLogFormatWriter implements HoodieLogFormat.Writer { * @param sizeThreshold */ HoodieLogFormatWriter(FileSystem fs, HoodieLogFile logFile, Integer bufferSize, - Short replication, Long sizeThreshold) + Short replication, Long sizeThreshold) throws IOException, InterruptedException { this.fs = fs; this.logFile = logFile; @@ -157,8 +155,9 @@ public class HoodieLogFormatWriter implements HoodieLogFormat.Writer { } public long getCurrentSize() throws IOException { - if(output == null) { - throw new IllegalStateException("Cannot get current size as the underlying stream has been closed already"); + if (output == null) { + throw new IllegalStateException( + "Cannot get current size as the underlying stream has been closed already"); } return output.getPos(); } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/block/HoodieAvroDataBlock.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/block/HoodieAvroDataBlock.java index 9019689b7..d2f73ef1b 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/block/HoodieAvroDataBlock.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/block/HoodieAvroDataBlock.java @@ -18,6 +18,14 @@ package com.uber.hoodie.common.table.log.block; import com.uber.hoodie.common.util.HoodieAvroUtils; import com.uber.hoodie.exception.HoodieIOException; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.util.ArrayList; 
+import java.util.List; +import java.util.Map; import org.apache.avro.Schema; import org.apache.avro.generic.GenericDatumReader; import org.apache.avro.generic.GenericDatumWriter; @@ -27,30 +35,18 @@ import org.apache.avro.io.DecoderFactory; import org.apache.avro.io.Encoder; import org.apache.avro.io.EncoderFactory; -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.DataInputStream; -import java.io.DataOutputStream; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; - /** - * DataBlock contains a list of records serialized using Avro. - * The Datablock contains - * 1. Compressed Writer Schema length - * 2. Compressed Writer Schema content - * 3. Total number of records in the block - * 4. Size of a record - * 5. Actual avro serialized content of the record + * DataBlock contains a list of records serialized using Avro. The Datablock contains 1. Compressed + * Writer Schema length 2. Compressed Writer Schema content 3. Total number of records in the block + * 4. Size of a record 5. Actual avro serialized content of the record */ public class HoodieAvroDataBlock extends HoodieLogBlock { private List records; private Schema schema; - public HoodieAvroDataBlock(List records, Schema schema, Map metadata) { + public HoodieAvroDataBlock(List records, Schema schema, + Map metadata) { super(metadata); this.records = records; this.schema = schema; @@ -76,7 +72,7 @@ public class HoodieAvroDataBlock extends HoodieLogBlock { DataOutputStream output = new DataOutputStream(baos); // 1. 
Write out metadata - if(super.getLogMetadata() != null) { + if (super.getLogMetadata() != null) { output.write(HoodieLogBlock.getLogMetadataBytes(super.getLogMetadata())); } @@ -117,12 +113,13 @@ public class HoodieAvroDataBlock extends HoodieLogBlock { return HoodieLogBlockType.AVRO_DATA_BLOCK; } - public static HoodieLogBlock fromBytes(byte[] content, Schema readerSchema, boolean readMetadata) throws IOException { + public static HoodieLogBlock fromBytes(byte[] content, Schema readerSchema, boolean readMetadata) + throws IOException { DataInputStream dis = new DataInputStream(new ByteArrayInputStream(content)); Map metadata = null; // 1. Read the metadata written out, if applicable - if(readMetadata) { + if (readMetadata) { metadata = HoodieLogBlock.getLogMetadata(dis); } // 1. Read the schema written out @@ -131,7 +128,7 @@ public class HoodieAvroDataBlock extends HoodieLogBlock { dis.readFully(compressedSchema, 0, schemaLength); Schema writerSchema = new Schema.Parser().parse(HoodieAvroUtils.decompress(compressedSchema)); - if(readerSchema == null) { + if (readerSchema == null) { readerSchema = writerSchema; } @@ -141,7 +138,7 @@ public class HoodieAvroDataBlock extends HoodieLogBlock { List records = new ArrayList<>(totalRecords); // 3. 
Read the content - for(int i=0;i metadata) { + public HoodieCommandBlock(HoodieCommandBlockTypeEnum type, + Map metadata) { super(metadata); this.type = type; } @@ -46,7 +47,7 @@ public class HoodieCommandBlock extends HoodieLogBlock { ByteArrayOutputStream baos = new ByteArrayOutputStream(); DataOutputStream output = new DataOutputStream(baos); - if(super.getLogMetadata() != null) { + if (super.getLogMetadata() != null) { output.write(HoodieLogBlock.getLogMetadataBytes(super.getLogMetadata())); } output.writeInt(type.ordinal()); @@ -66,7 +67,7 @@ public class HoodieCommandBlock extends HoodieLogBlock { public static HoodieLogBlock fromBytes(byte[] content, boolean readMetadata) throws IOException { DataInputStream dis = new DataInputStream(new ByteArrayInputStream(content)); Map metadata = null; - if(readMetadata) { + if (readMetadata) { metadata = HoodieLogBlock.getLogMetadata(dis); } int ordinal = dis.readInt(); diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/block/HoodieCorruptBlock.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/block/HoodieCorruptBlock.java index 1e79a6241..3858ae54e 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/block/HoodieCorruptBlock.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/block/HoodieCorruptBlock.java @@ -44,7 +44,7 @@ public class HoodieCorruptBlock extends HoodieLogBlock { public byte[] getBytes() throws IOException { ByteArrayOutputStream baos = new ByteArrayOutputStream(); DataOutputStream output = new DataOutputStream(baos); - if(super.getLogMetadata() != null) { + if (super.getLogMetadata() != null) { output.write(HoodieLogBlock.getLogMetadataBytes(super.getLogMetadata())); } output.write(corruptedBytes); @@ -60,20 +60,21 @@ public class HoodieCorruptBlock extends HoodieLogBlock { return corruptedBytes; } - public static HoodieLogBlock fromBytes(byte[] content, int blockSize, boolean readMetadata) throws IOException { + public 
static HoodieLogBlock fromBytes(byte[] content, int blockSize, boolean readMetadata) + throws IOException { DataInputStream dis = new DataInputStream(new ByteArrayInputStream(content)); Map metadata = null; int bytesRemaining = blockSize; - if(readMetadata) { + if (readMetadata) { try { //attempt to read metadata metadata = HoodieLogBlock.getLogMetadata(dis); bytesRemaining = blockSize - HoodieLogBlock.getLogMetadataBytes(metadata).length; - } catch(IOException e) { + } catch (IOException e) { // unable to read metadata, possibly corrupted metadata = null; } } - byte [] corruptedBytes = new byte[bytesRemaining]; + byte[] corruptedBytes = new byte[bytesRemaining]; dis.readFully(corruptedBytes); return new HoodieCorruptBlock(corruptedBytes, metadata); } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/block/HoodieDeleteBlock.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/block/HoodieDeleteBlock.java index c1256a4f6..485bfdcc0 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/block/HoodieDeleteBlock.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/block/HoodieDeleteBlock.java @@ -16,8 +16,6 @@ package com.uber.hoodie.common.table.log.block; -import org.apache.commons.lang3.StringUtils; - import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.DataInputStream; @@ -25,6 +23,7 @@ import java.io.DataOutputStream; import java.io.IOException; import java.nio.charset.Charset; import java.util.Map; +import org.apache.commons.lang3.StringUtils; /** * Delete block contains a list of keys to be deleted from scanning the blocks so far @@ -46,10 +45,10 @@ public class HoodieDeleteBlock extends HoodieLogBlock { public byte[] getBytes() throws IOException { ByteArrayOutputStream baos = new ByteArrayOutputStream(); DataOutputStream output = new DataOutputStream(baos); - if(super.getLogMetadata() != null) { + if (super.getLogMetadata() != null) { 
output.write(HoodieLogBlock.getLogMetadataBytes(super.getLogMetadata())); } - byte [] bytesToWrite = StringUtils.join(keysToDelete, ',').getBytes(Charset.forName("utf-8")); + byte[] bytesToWrite = StringUtils.join(keysToDelete, ',').getBytes(Charset.forName("utf-8")); output.writeInt(bytesToWrite.length); output.write(bytesToWrite); return baos.toByteArray(); @@ -67,11 +66,11 @@ public class HoodieDeleteBlock extends HoodieLogBlock { public static HoodieLogBlock fromBytes(byte[] content, boolean readMetadata) throws IOException { DataInputStream dis = new DataInputStream(new ByteArrayInputStream(content)); Map metadata = null; - if(readMetadata) { + if (readMetadata) { metadata = HoodieLogBlock.getLogMetadata(dis); } int dataLength = dis.readInt(); - byte [] data = new byte[dataLength]; + byte[] data = new byte[dataLength]; dis.readFully(data); return new HoodieDeleteBlock(new String(data).split(","), metadata); } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/block/HoodieLogBlock.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/block/HoodieLogBlock.java index cf4f90859..817016100 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/block/HoodieLogBlock.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/block/HoodieLogBlock.java @@ -18,7 +18,6 @@ package com.uber.hoodie.common.table.log.block; import com.google.common.collect.Maps; import com.uber.hoodie.exception.HoodieException; - import java.io.ByteArrayOutputStream; import java.io.DataInputStream; import java.io.DataOutputStream; @@ -34,6 +33,7 @@ public abstract class HoodieLogBlock { public byte[] getBytes() throws IOException { throw new HoodieException("No implementation was provided"); } + public HoodieLogBlockType getBlockType() { throw new HoodieException("No implementation was provided"); } @@ -42,8 +42,8 @@ public abstract class HoodieLogBlock { private Map logMetadata; /** - * Type of the log block - * WARNING: 
This enum is serialized as the ordinal. Only add new enums at the end. + * Type of the log block WARNING: This enum is serialized as the ordinal. Only add new enums at + * the end. */ public enum HoodieLogBlockType { COMMAND_BLOCK, @@ -53,8 +53,8 @@ public abstract class HoodieLogBlock { } /** - * Metadata abstraction for a HoodieLogBlock - * WARNING : This enum is serialized as the ordinal. Only add new enums at the end. + * Metadata abstraction for a HoodieLogBlock WARNING : This enum is serialized as the ordinal. + * Only add new enums at the end. */ public enum LogMetadataType { INSTANT_TIME, @@ -70,21 +70,17 @@ public abstract class HoodieLogBlock { } /** - * Convert log metadata to bytes - * 1. Write size of metadata - * 2. Write enum ordinal - * 3. Write actual bytes - * @param metadata - * @return - * @throws IOException + * Convert log metadata to bytes 1. Write size of metadata 2. Write enum ordinal 3. Write actual + * bytes */ - public static byte [] getLogMetadataBytes(Map metadata) throws IOException { + public static byte[] getLogMetadataBytes(Map metadata) + throws IOException { ByteArrayOutputStream baos = new ByteArrayOutputStream(); DataOutputStream output = new DataOutputStream(baos); output.writeInt(metadata.size()); - for(Map.Entry entry : metadata.entrySet()) { + for (Map.Entry entry : metadata.entrySet()) { output.writeInt(entry.getKey().ordinal()); - byte [] bytes = entry.getValue().getBytes(); + byte[] bytes = entry.getValue().getBytes(); output.writeInt(bytes.length); output.write(bytes); } @@ -92,13 +88,10 @@ public abstract class HoodieLogBlock { } /** - * Convert bytes to LogMetadata, follow the same order as - * {@link HoodieLogBlock#getLogMetadataBytes} - * @param dis - * @return - * @throws IOException + * Convert bytes to LogMetadata, follow the same order as {@link HoodieLogBlock#getLogMetadataBytes} */ - public static Map getLogMetadata(DataInputStream dis) throws IOException { + public static Map getLogMetadata(DataInputStream 
dis) + throws IOException { Map metadata = Maps.newHashMap(); // 1. Read the metadata written out @@ -113,7 +106,7 @@ public abstract class HoodieLogBlock { metadataCount--; } return metadata; - } catch(EOFException eof) { + } catch (EOFException eof) { throw new IOException("Could not read metadata fields ", eof); } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieActiveTimeline.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieActiveTimeline.java index 3d739f156..6848d4a21 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieActiveTimeline.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieActiveTimeline.java @@ -22,7 +22,18 @@ import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.exception.HoodieIOException; +import java.io.IOException; +import java.io.Serializable; +import java.text.SimpleDateFormat; +import java.util.Arrays; +import java.util.Comparator; import java.util.Date; +import java.util.Optional; +import java.util.Set; +import java.util.function.Function; +import java.util.function.Predicate; +import java.util.stream.Collectors; +import java.util.stream.Stream; import org.apache.commons.io.IOUtils; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; @@ -31,297 +42,273 @@ import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; -import java.io.IOException; -import java.io.Serializable; -import java.text.SimpleDateFormat; -import java.util.Arrays; -import java.util.Comparator; -import java.util.Optional; -import java.util.Set; -import java.util.function.Function; -import java.util.function.Predicate; -import java.util.stream.Collectors; -import java.util.stream.Stream; - /** - * Represents the Active Timeline for the 
HoodieDataset. Instants for the last 12 hours (configurable) - * is in the ActiveTimeline and the rest are Archived. ActiveTimeline is a special timeline - * that allows for creation of instants on the timeline. - *

- * The timeline is not automatically reloaded on any mutation operation, clients have to manually call reload() - * so that they can chain multiple mutations to the timeline and then call reload() once. - *

- * This class can be serialized and de-serialized and on de-serialization the FileSystem is re-initialized. + * Represents the Active Timeline for the HoodieDataset. Instants for the last 12 hours + * (configurable) is in the ActiveTimeline and the rest are Archived. ActiveTimeline is a special + * timeline that allows for creation of instants on the timeline.

The timeline is not + * automatically reloaded on any mutation operation, clients have to manually call reload() so that + * they can chain multiple mutations to the timeline and then call reload() once.

This class + * can be serialized and de-serialized and on de-serialization the FileSystem is re-initialized. */ public class HoodieActiveTimeline extends HoodieDefaultTimeline { - public static final SimpleDateFormat COMMIT_FORMATTER = new SimpleDateFormat("yyyyMMddHHmmss"); + + public static final SimpleDateFormat COMMIT_FORMATTER = new SimpleDateFormat("yyyyMMddHHmmss"); - private final transient static Logger log = LogManager.getLogger(HoodieActiveTimeline.class); - private String metaPath; - private transient FileSystem fs; + private final transient static Logger log = LogManager.getLogger(HoodieActiveTimeline.class); + private String metaPath; + private transient FileSystem fs; - /** - * Returns next commit time in the {@link #COMMIT_FORMATTER} format. - * @return - */ - public static String createNewCommitTime() { - return HoodieActiveTimeline.COMMIT_FORMATTER.format(new Date()); + /** + * Returns next commit time in the {@link #COMMIT_FORMATTER} format. + */ + public static String createNewCommitTime() { + return HoodieActiveTimeline.COMMIT_FORMATTER.format(new Date()); + } + + protected HoodieActiveTimeline(FileSystem fs, String metaPath, String[] includedExtensions) { + // Filter all the filter in the metapath and include only the extensions passed and + // convert them into HoodieInstant + try { + this.instants = + Arrays.stream(HoodieTableMetaClient.scanFiles(fs, new Path(metaPath), path -> { + // Include only the meta files with extensions that needs to be included + String extension = FSUtils.getFileExtension(path.getName()); + return Arrays.stream(includedExtensions).anyMatch(Predicate.isEqual(extension)); + })).sorted(Comparator.comparing( + // Sort the meta-data by the instant time (first part of the file name) + fileStatus -> FSUtils.getInstantTime(fileStatus.getPath().getName()))) + // create HoodieInstantMarkers from FileStatus, which extracts properties + .map(HoodieInstant::new).collect(Collectors.toList()); + log.info("Loaded instants " + 
instants); + } catch (IOException e) { + throw new HoodieIOException("Failed to scan metadata", e); } + this.fs = fs; + this.metaPath = metaPath; + // multiple casts will make this lambda serializable - http://docs.oracle.com/javase/specs/jls/se8/html/jls-15.html#jls-15.16 + this.details = (Function> & Serializable) this::getInstantDetails; + } - protected HoodieActiveTimeline(FileSystem fs, String metaPath, String[] includedExtensions) { - // Filter all the filter in the metapath and include only the extensions passed and - // convert them into HoodieInstant - try { - this.instants = - Arrays.stream(HoodieTableMetaClient.scanFiles(fs, new Path(metaPath), path -> { - // Include only the meta files with extensions that needs to be included - String extension = FSUtils.getFileExtension(path.getName()); - return Arrays.stream(includedExtensions).anyMatch(Predicate.isEqual(extension)); - })).sorted(Comparator.comparing( - // Sort the meta-data by the instant time (first part of the file name) - fileStatus -> FSUtils.getInstantTime(fileStatus.getPath().getName()))) - // create HoodieInstantMarkers from FileStatus, which extracts properties - .map(HoodieInstant::new).collect(Collectors.toList()); - log.info("Loaded instants " + instants); - } catch (IOException e) { - throw new HoodieIOException("Failed to scan metadata", e); - } - this.fs = fs; - this.metaPath = metaPath; - // multiple casts will make this lambda serializable - http://docs.oracle.com/javase/specs/jls/se8/html/jls-15.html#jls-15.16 - this.details = (Function> & Serializable) this::getInstantDetails; + public HoodieActiveTimeline(FileSystem fs, String metaPath) { + this(fs, metaPath, + new String[]{COMMIT_EXTENSION, INFLIGHT_COMMIT_EXTENSION, DELTA_COMMIT_EXTENSION, + INFLIGHT_DELTA_COMMIT_EXTENSION, COMPACTION_EXTENSION, + INFLIGHT_COMPACTION_EXTENSION, SAVEPOINT_EXTENSION, INFLIGHT_SAVEPOINT_EXTENSION, + CLEAN_EXTENSION, INFLIGHT_CLEAN_EXTENSION}); + } + + /** + * For serialization and de-serialization 
only. + * + * @deprecated + */ + public HoodieActiveTimeline() { + } + + /** + * This method is only used when this object is deserialized in a spark executor. + * + * @deprecated + */ + private void readObject(java.io.ObjectInputStream in) + throws IOException, ClassNotFoundException { + in.defaultReadObject(); + this.fs = FSUtils.getFs(); + } + + /** + * Get all instants (commits, delta commits, compactions) that produce new data, in the active + * timeline * + */ + public HoodieTimeline getCommitsAndCompactionsTimeline() { + return getTimelineOfActions( + Sets.newHashSet(COMMIT_ACTION, COMPACTION_ACTION, DELTA_COMMIT_ACTION)); + } + + /** + * Get all instants (commits, delta commits, compactions, clean, savepoint, rollback) that result + * in actions, in the active timeline * + */ + public HoodieTimeline getAllCommitsTimeline() { + return getTimelineOfActions( + Sets.newHashSet(COMMIT_ACTION, COMPACTION_ACTION, DELTA_COMMIT_ACTION, CLEAN_ACTION, + SAVEPOINT_ACTION, ROLLBACK_ACTION)); + } + + /** + * Get only pure commits (inflight and completed) in the active timeline + */ + public HoodieTimeline getCommitTimeline() { + return getTimelineOfActions(Sets.newHashSet(COMMIT_ACTION)); + } + + /** + * Get only the delta commits (inflight and completed) in the active timeline + */ + public HoodieTimeline getDeltaCommitTimeline() { + return new HoodieDefaultTimeline(filterInstantsByAction(DELTA_COMMIT_ACTION), + (Function> & Serializable) this::getInstantDetails); + } + + /** + * Get only the commits (inflight and completed) in the compaction timeline + */ + public HoodieTimeline getCompactionTimeline() { + return new HoodieDefaultTimeline(filterInstantsByAction(COMPACTION_ACTION), + (Function> & Serializable) this::getInstantDetails); + } + + /** + * Get a timeline of a specific set of actions. 
useful to create a merged timeline of multiple + * actions + * + * @param actions actions allowed in the timeline + */ + public HoodieTimeline getTimelineOfActions(Set actions) { + return new HoodieDefaultTimeline(instants.stream().filter(s -> actions.contains(s.getAction())), + (Function> & Serializable) this::getInstantDetails); + } + + + /** + * Get only the cleaner action (inflight and completed) in the active timeline + */ + public HoodieTimeline getCleanerTimeline() { + return new HoodieDefaultTimeline(filterInstantsByAction(CLEAN_ACTION), + (Function> & Serializable) this::getInstantDetails); + } + + /** + * Get only the rollback action (inflight and completed) in the active timeline + */ + public HoodieTimeline getRollbackTimeline() { + return new HoodieDefaultTimeline(filterInstantsByAction(ROLLBACK_ACTION), + (Function> & Serializable) this::getInstantDetails); + } + + /** + * Get only the save point action (inflight and completed) in the active timeline + */ + public HoodieTimeline getSavePointTimeline() { + return new HoodieDefaultTimeline(filterInstantsByAction(SAVEPOINT_ACTION), + (Function> & Serializable) this::getInstantDetails); + } + + + protected Stream filterInstantsByAction(String action) { + return instants.stream().filter(s -> s.getAction().equals(action)); + } + + public void createInflight(HoodieInstant instant) { + log.info("Creating a new in-flight instant " + instant); + // Create the in-flight file + createFileInMetaPath(instant.getFileName(), Optional.empty()); + } + + public void saveAsComplete(HoodieInstant instant, Optional data) { + log.info("Marking instant complete " + instant); + Preconditions.checkArgument(instant.isInflight(), + "Could not mark an already completed instant as complete again " + instant); + moveInflightToComplete(instant, HoodieTimeline.getCompletedInstant(instant), data); + log.info("Completed " + instant); + } + + public void revertToInflight(HoodieInstant instant) { + log.info("Reverting instant to inflight 
" + instant); + moveCompleteToInflight(instant, HoodieTimeline.getInflightInstant(instant)); + log.info("Reverted " + instant + " to inflight"); + } + + public void deleteInflight(HoodieInstant instant) { + log.info("Deleting in-flight " + instant); + Path inFlightCommitFilePath = new Path(metaPath, instant.getFileName()); + try { + boolean result = fs.delete(inFlightCommitFilePath, false); + if (result) { + log.info("Removed in-flight " + instant); + } else { + throw new HoodieIOException("Could not delete in-flight instant " + instant); + } + } catch (IOException e) { + throw new HoodieIOException( + "Could not remove inflight commit " + inFlightCommitFilePath, e); } + } - public HoodieActiveTimeline(FileSystem fs, String metaPath) { - this(fs, metaPath, - new String[] {COMMIT_EXTENSION, INFLIGHT_COMMIT_EXTENSION, DELTA_COMMIT_EXTENSION, - INFLIGHT_DELTA_COMMIT_EXTENSION, COMPACTION_EXTENSION, - INFLIGHT_COMPACTION_EXTENSION, SAVEPOINT_EXTENSION, INFLIGHT_SAVEPOINT_EXTENSION, - CLEAN_EXTENSION, INFLIGHT_CLEAN_EXTENSION}); + @Override + public Optional getInstantDetails(HoodieInstant instant) { + Path detailPath = new Path(metaPath, instant.getFileName()); + return readDataFromPath(detailPath); + } + + protected void moveInflightToComplete(HoodieInstant inflight, HoodieInstant completed, + Optional data) { + Path commitFilePath = new Path(metaPath, completed.getFileName()); + try { + // open a new file and write the commit metadata in + Path inflightCommitFile = new Path(metaPath, inflight.getFileName()); + createFileInMetaPath(inflight.getFileName(), data); + boolean success = fs.rename(inflightCommitFile, commitFilePath); + if (!success) { + throw new HoodieIOException( + "Could not rename " + inflightCommitFile + " to " + commitFilePath); + } + } catch (IOException e) { + throw new HoodieIOException("Could not complete " + inflight, e); } + } - /** - * For serialization and de-serialization only. 
- * @deprecated - */ - public HoodieActiveTimeline() { - } - - /** - * This method is only used when this object is deserialized in a spark executor. - * - * @deprecated - */ - private void readObject(java.io.ObjectInputStream in) - throws IOException, ClassNotFoundException { - in.defaultReadObject(); - this.fs = FSUtils.getFs(); - } - - /** - * Get all instants (commits, delta commits, compactions) that produce new data, in the active timeline - ** - * @return - */ - public HoodieTimeline getCommitsAndCompactionsTimeline() { - return getTimelineOfActions(Sets.newHashSet(COMMIT_ACTION, COMPACTION_ACTION, DELTA_COMMIT_ACTION)); - } - - /** - * Get all instants (commits, delta commits, compactions, clean, savepoint, rollback) that result in actions, in the active timeline - ** - * @return - */ - public HoodieTimeline getAllCommitsTimeline() { - return getTimelineOfActions(Sets.newHashSet(COMMIT_ACTION, COMPACTION_ACTION, DELTA_COMMIT_ACTION, CLEAN_ACTION, SAVEPOINT_ACTION, ROLLBACK_ACTION)); - } - - /** - * Get only pure commits (inflight and completed) in the active timeline - * - * @return - */ - public HoodieTimeline getCommitTimeline() { - return getTimelineOfActions(Sets.newHashSet(COMMIT_ACTION)); - } - - /** - * Get only the delta commits (inflight and completed) in the active timeline - * - * @return - */ - public HoodieTimeline getDeltaCommitTimeline() { - return new HoodieDefaultTimeline(filterInstantsByAction(DELTA_COMMIT_ACTION), - (Function> & Serializable) this::getInstantDetails); - } - - /** - * Get only the commits (inflight and completed) in the compaction timeline - * - * @return - */ - public HoodieTimeline getCompactionTimeline() { - return new HoodieDefaultTimeline(filterInstantsByAction(COMPACTION_ACTION), - (Function> & Serializable) this::getInstantDetails); - } - - /** - * Get a timeline of a specific set of actions. 
useful to create a merged timeline of multiple actions - * - * @param actions actions allowed in the timeline - * @return - */ - public HoodieTimeline getTimelineOfActions(Set actions) { - return new HoodieDefaultTimeline(instants.stream().filter(s -> actions.contains(s.getAction())), - (Function> & Serializable) this::getInstantDetails); - } - - - /** - * Get only the cleaner action (inflight and completed) in the active timeline - * - * @return - */ - public HoodieTimeline getCleanerTimeline() { - return new HoodieDefaultTimeline(filterInstantsByAction(CLEAN_ACTION), - (Function> & Serializable) this::getInstantDetails); - } - - /** - * Get only the rollback action (inflight and completed) in the active timeline - * - * @return - */ - public HoodieTimeline getRollbackTimeline() { - return new HoodieDefaultTimeline(filterInstantsByAction(ROLLBACK_ACTION), - (Function> & Serializable) this::getInstantDetails); - } - - /** - * Get only the save point action (inflight and completed) in the active timeline - * - * @return - */ - public HoodieTimeline getSavePointTimeline() { - return new HoodieDefaultTimeline(filterInstantsByAction(SAVEPOINT_ACTION), - (Function> & Serializable) this::getInstantDetails); - } - - - protected Stream filterInstantsByAction(String action) { - return instants.stream().filter(s -> s.getAction().equals(action)); - } - - public void createInflight(HoodieInstant instant) { - log.info("Creating a new in-flight instant " + instant); - // Create the in-flight file - createFileInMetaPath(instant.getFileName(), Optional.empty()); - } - - public void saveAsComplete(HoodieInstant instant, Optional data) { - log.info("Marking instant complete " + instant); - Preconditions.checkArgument(instant.isInflight(), - "Could not mark an already completed instant as complete again " + instant); - moveInflightToComplete(instant, HoodieTimeline.getCompletedInstant(instant), data); - log.info("Completed " + instant); - } - - public void 
revertToInflight(HoodieInstant instant) { - log.info("Reverting instant to inflight " + instant); - moveCompleteToInflight(instant, HoodieTimeline.getInflightInstant(instant)); - log.info("Reverted " + instant + " to inflight"); - } - - public void deleteInflight(HoodieInstant instant) { - log.info("Deleting in-flight " + instant); - Path inFlightCommitFilePath = new Path(metaPath, instant.getFileName()); - try { - boolean result = fs.delete(inFlightCommitFilePath, false); - if (result) { - log.info("Removed in-flight " + instant); - } else { - throw new HoodieIOException("Could not delete in-flight instant " + instant); - } - } catch (IOException e) { - throw new HoodieIOException( - "Could not remove inflight commit " + inFlightCommitFilePath, e); - } - } - - @Override - public Optional getInstantDetails(HoodieInstant instant) { - Path detailPath = new Path(metaPath, instant.getFileName()); - return readDataFromPath(detailPath); - } - - protected void moveInflightToComplete(HoodieInstant inflight, HoodieInstant completed, - Optional data) { + protected void moveCompleteToInflight(HoodieInstant completed, HoodieInstant inflight) { + Path inFlightCommitFilePath = new Path(metaPath, inflight.getFileName()); + try { + if (!fs.exists(inFlightCommitFilePath)) { Path commitFilePath = new Path(metaPath, completed.getFileName()); - try { - // open a new file and write the commit metadata in - Path inflightCommitFile = new Path(metaPath, inflight.getFileName()); - createFileInMetaPath(inflight.getFileName(), data); - boolean success = fs.rename(inflightCommitFile, commitFilePath); - if (!success) { - throw new HoodieIOException( - "Could not rename " + inflightCommitFile + " to " + commitFilePath); - } - } catch (IOException e) { - throw new HoodieIOException("Could not complete " + inflight, e); + boolean success = fs.rename(commitFilePath, inFlightCommitFilePath); + if (!success) { + throw new HoodieIOException( + "Could not rename " + commitFilePath + " to " + 
inFlightCommitFilePath); } + } + } catch (IOException e) { + throw new HoodieIOException("Could not complete revert " + completed, e); } + } - protected void moveCompleteToInflight(HoodieInstant completed, HoodieInstant inflight) { - Path inFlightCommitFilePath = new Path(metaPath, inflight.getFileName()); - try { - if (!fs.exists(inFlightCommitFilePath)) { - Path commitFilePath = new Path(metaPath, completed.getFileName()); - boolean success = fs.rename(commitFilePath, inFlightCommitFilePath); - if (!success) { - throw new HoodieIOException( - "Could not rename " + commitFilePath + " to " + inFlightCommitFilePath); - } - } - } catch (IOException e) { - throw new HoodieIOException("Could not complete revert " + completed, e); + public void saveToInflight(HoodieInstant instant, Optional content) { + createFileInMetaPath(instant.getFileName(), content); + } + + protected void createFileInMetaPath(String filename, Optional content) { + Path fullPath = new Path(metaPath, filename); + try { + if (!content.isPresent()) { + if (fs.createNewFile(fullPath)) { + log.info("Created a new file in meta path: " + fullPath); + return; } + } else { + FSDataOutputStream fsout = fs.create(fullPath, true); + fsout.write(content.get()); + fsout.close(); + return; + } + throw new HoodieIOException("Failed to create file " + fullPath); + } catch (IOException e) { + throw new HoodieIOException("Failed to create file " + fullPath, e); } + } - public void saveToInflight(HoodieInstant instant, Optional content) { - createFileInMetaPath(instant.getFileName(), content); + protected Optional readDataFromPath(Path detailPath) { + try (FSDataInputStream is = fs.open(detailPath)) { + return Optional.of(IOUtils.toByteArray(is)); + } catch (IOException e) { + throw new HoodieIOException("Could not read commit details from " + detailPath, e); } + } - protected void createFileInMetaPath(String filename, Optional content) { - Path fullPath = new Path(metaPath, filename); - try { - if 
(!content.isPresent()) { - if (fs.createNewFile(fullPath)) { - log.info("Created a new file in meta path: " + fullPath); - return; - } - } else { - FSDataOutputStream fsout = fs.create(fullPath, true); - fsout.write(content.get()); - fsout.close(); - return; - } - throw new HoodieIOException("Failed to create file " + fullPath); - } catch (IOException e) { - throw new HoodieIOException("Failed to create file " + fullPath, e); - } - } - - protected Optional readDataFromPath(Path detailPath) { - try (FSDataInputStream is = fs.open(detailPath)) { - return Optional.of(IOUtils.toByteArray(is)); - } catch (IOException e) { - throw new HoodieIOException("Could not read commit details from " + detailPath, e); - } - } - - public HoodieActiveTimeline reload() { - return new HoodieActiveTimeline(fs, metaPath); - } + public HoodieActiveTimeline reload() { + return new HoodieActiveTimeline(fs, metaPath); + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieArchivedTimeline.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieArchivedTimeline.java index 458cf6eb8..37e5e9414 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieArchivedTimeline.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieArchivedTimeline.java @@ -19,13 +19,6 @@ package com.uber.hoodie.common.table.timeline; import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.exception.HoodieIOException; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.SequenceFile; -import org.apache.hadoop.io.Text; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - import java.io.IOException; import java.io.Serializable; import java.util.Arrays; @@ -34,79 +27,85 @@ import java.util.Map; import java.util.Optional; import java.util.function.Function; import 
java.util.stream.Collectors; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; /** - * Represents the Archived Timeline for the HoodieDataset. Instants for the last 12 hours (configurable) - * is in the ActiveTimeline and the rest are in ArchivedTimeline. - *

- * Instants are read from the archive file during initialization and never refreshed. To refresh, clients - * need to call reload() - *

- * This class can be serialized and de-serialized and on de-serialization the FileSystem is re-initialized. + * Represents the Archived Timeline for the HoodieDataset. Instants for the last 12 hours + * (configurable) is in the ActiveTimeline and the rest are in ArchivedTimeline.

Instants + * are read from the archive file during initialization and never refreshed. To refresh, clients + * need to call reload()

This class can be serialized and de-serialized and on + * de-serialization the FileSystem is re-initialized. */ public class HoodieArchivedTimeline extends HoodieDefaultTimeline { - private static final String HOODIE_COMMIT_ARCHIVE_LOG_FILE = "commits"; - private transient FileSystem fs; - private String metaPath; - private Map readCommits = new HashMap<>(); - private final transient static Logger log = LogManager.getLogger(HoodieArchivedTimeline.class); + private static final String HOODIE_COMMIT_ARCHIVE_LOG_FILE = "commits"; + private transient FileSystem fs; + private String metaPath; + private Map readCommits = new HashMap<>(); - public HoodieArchivedTimeline(FileSystem fs, String metaPath) { - // Read back the commits to make sure - Path archiveLogPath = getArchiveLogPath(metaPath); - try (SequenceFile.Reader reader = - new SequenceFile.Reader(fs.getConf(), SequenceFile.Reader.file(archiveLogPath))) { - Text key = new Text(); - Text val = new Text(); - while (reader.next(key, val)) { - // TODO - limit the number of commits loaded in memory. this could get very large. 
- // This is okay because only tooling will load the archived commit timeline today - readCommits.put(key.toString(), Arrays.copyOf(val.getBytes(), val.getLength())); - } - this.instants = readCommits.keySet().stream().map( - s -> new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, s)).collect( - Collectors.toList()); - } catch (IOException e) { - throw new HoodieIOException( - "Could not load archived commit timeline from path " + archiveLogPath, e); - } - // multiple casts will make this lambda serializable - http://docs.oracle.com/javase/specs/jls/se8/html/jls-15.html#jls-15.16 - this.details = (Function> & Serializable) this::getInstantDetails; - this.fs = fs; - this.metaPath = metaPath; + private final transient static Logger log = LogManager.getLogger(HoodieArchivedTimeline.class); + + public HoodieArchivedTimeline(FileSystem fs, String metaPath) { + // Read back the commits to make sure + Path archiveLogPath = getArchiveLogPath(metaPath); + try (SequenceFile.Reader reader = + new SequenceFile.Reader(fs.getConf(), SequenceFile.Reader.file(archiveLogPath))) { + Text key = new Text(); + Text val = new Text(); + while (reader.next(key, val)) { + // TODO - limit the number of commits loaded in memory. this could get very large. 
+ // This is okay because only tooling will load the archived commit timeline today + readCommits.put(key.toString(), Arrays.copyOf(val.getBytes(), val.getLength())); + } + this.instants = readCommits.keySet().stream().map( + s -> new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, s)).collect( + Collectors.toList()); + } catch (IOException e) { + throw new HoodieIOException( + "Could not load archived commit timeline from path " + archiveLogPath, e); } + // multiple casts will make this lambda serializable - http://docs.oracle.com/javase/specs/jls/se8/html/jls-15.html#jls-15.16 + this.details = (Function> & Serializable) this::getInstantDetails; + this.fs = fs; + this.metaPath = metaPath; + } - /** - * For serialization and de-serialization only. - * @deprecated - */ - public HoodieArchivedTimeline() { - } + /** + * For serialization and de-serialization only. + * + * @deprecated + */ + public HoodieArchivedTimeline() { + } - /** - * This method is only used when this object is deserialized in a spark executor. - * - * @deprecated - */ - private void readObject(java.io.ObjectInputStream in) - throws IOException, ClassNotFoundException { - in.defaultReadObject(); - this.fs = FSUtils.getFs(); - } + /** + * This method is only used when this object is deserialized in a spark executor. 
+ * + * @deprecated + */ + private void readObject(java.io.ObjectInputStream in) + throws IOException, ClassNotFoundException { + in.defaultReadObject(); + this.fs = FSUtils.getFs(); + } - public static Path getArchiveLogPath(String metaPath) { - return new Path(metaPath, HOODIE_COMMIT_ARCHIVE_LOG_FILE); - } + public static Path getArchiveLogPath(String metaPath) { + return new Path(metaPath, HOODIE_COMMIT_ARCHIVE_LOG_FILE); + } - @Override - public Optional getInstantDetails(HoodieInstant instant) { - return Optional.ofNullable(readCommits.get(instant.getTimestamp())); - } + @Override + public Optional getInstantDetails(HoodieInstant instant) { + return Optional.ofNullable(readCommits.get(instant.getTimestamp())); + } - public HoodieArchivedTimeline reload() { - return new HoodieArchivedTimeline(fs, metaPath); - } + public HoodieArchivedTimeline reload() { + return new HoodieArchivedTimeline(fs, metaPath); + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieDefaultTimeline.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieDefaultTimeline.java index e250640c6..3a0240239 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieDefaultTimeline.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieDefaultTimeline.java @@ -17,135 +17,136 @@ package com.uber.hoodie.common.table.timeline; import com.uber.hoodie.common.table.HoodieTimeline; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - import java.util.List; import java.util.Optional; import java.util.function.Function; import java.util.stream.Collectors; import java.util.stream.Stream; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; /** - * HoodieDefaultTimeline is a default implementation of the HoodieTimeline. - * It provides methods to inspect a List[HoodieInstant]. Function to get the details of the instant - * is passed in as a lamdba. 
+ * HoodieDefaultTimeline is a default implementation of the HoodieTimeline. It provides methods to + * inspect a List[HoodieInstant]. Function to get the details of the instant is passed in as a + * lamdba. * * @see HoodieTimeline */ public class HoodieDefaultTimeline implements HoodieTimeline { - private final transient static Logger log = LogManager.getLogger(HoodieDefaultTimeline.class); - protected Function> details; - protected List instants; + private final transient static Logger log = LogManager.getLogger(HoodieDefaultTimeline.class); - public HoodieDefaultTimeline(Stream instants, - Function> details) { - this.instants = instants.collect(Collectors.toList()); - this.details = details; - } + protected Function> details; + protected List instants; - /** - * For serailizing and de-serializing - * - * @deprecated - */ - public HoodieDefaultTimeline() { - } + public HoodieDefaultTimeline(Stream instants, + Function> details) { + this.instants = instants.collect(Collectors.toList()); + this.details = details; + } - public HoodieTimeline filterInflights() { - return new HoodieDefaultTimeline(instants.stream().filter(HoodieInstant::isInflight), - details); - } + /** + * For serailizing and de-serializing + * + * @deprecated + */ + public HoodieDefaultTimeline() { + } - public HoodieTimeline filterCompletedInstants() { - return new HoodieDefaultTimeline(instants.stream().filter(s -> !s.isInflight()), details); - } + public HoodieTimeline filterInflights() { + return new HoodieDefaultTimeline(instants.stream().filter(HoodieInstant::isInflight), + details); + } - @Override - public HoodieDefaultTimeline findInstantsInRange(String startTs, String endTs) { - return new HoodieDefaultTimeline(instants.stream().filter( - s -> HoodieTimeline.compareTimestamps(s.getTimestamp(), startTs, GREATER) && - HoodieTimeline.compareTimestamps( + public HoodieTimeline filterCompletedInstants() { + return new HoodieDefaultTimeline(instants.stream().filter(s -> !s.isInflight()), 
details); + } + + @Override + public HoodieDefaultTimeline findInstantsInRange(String startTs, String endTs) { + return new HoodieDefaultTimeline(instants.stream().filter( + s -> HoodieTimeline.compareTimestamps(s.getTimestamp(), startTs, GREATER) && + HoodieTimeline.compareTimestamps( s.getTimestamp(), endTs, LESSER_OR_EQUAL)), details); - } + } - @Override - public HoodieDefaultTimeline findInstantsAfter(String commitTime, int numCommits) { - return new HoodieDefaultTimeline( - instants.stream().filter(s -> HoodieTimeline.compareTimestamps(s.getTimestamp(), commitTime, GREATER)) - .limit(numCommits), details); - } + @Override + public HoodieDefaultTimeline findInstantsAfter(String commitTime, int numCommits) { + return new HoodieDefaultTimeline( + instants.stream() + .filter(s -> HoodieTimeline.compareTimestamps(s.getTimestamp(), commitTime, GREATER)) + .limit(numCommits), details); + } - @Override - public boolean empty() { - return !instants.stream().findFirst().isPresent(); - } + @Override + public boolean empty() { + return !instants.stream().findFirst().isPresent(); + } - @Override - public int countInstants() { - return new Long(instants.stream().count()).intValue(); - } + @Override + public int countInstants() { + return new Long(instants.stream().count()).intValue(); + } - @Override - public Optional firstInstant() { - return instants.stream().findFirst(); - } + @Override + public Optional firstInstant() { + return instants.stream().findFirst(); + } - @Override - public Optional nthInstant(int n) { - if (empty() || n >= countInstants()) { - return Optional.empty(); - } - return Optional.of(instants.get(n)); + @Override + public Optional nthInstant(int n) { + if (empty() || n >= countInstants()) { + return Optional.empty(); } + return Optional.of(instants.get(n)); + } - @Override - public Optional lastInstant() { - return empty() ? Optional.empty() : nthInstant(countInstants() - 1); - } + @Override + public Optional lastInstant() { + return empty() ? 
Optional.empty() : nthInstant(countInstants() - 1); + } - @Override - public Optional nthFromLastInstant(int n) { - if (countInstants() < n + 1) { - return Optional.empty(); - } - return nthInstant(countInstants() - 1 - n); + @Override + public Optional nthFromLastInstant(int n) { + if (countInstants() < n + 1) { + return Optional.empty(); } + return nthInstant(countInstants() - 1 - n); + } - @Override - public boolean containsInstant(HoodieInstant instant) { - return instants.stream().anyMatch(s -> s.equals(instant)); - } + @Override + public boolean containsInstant(HoodieInstant instant) { + return instants.stream().anyMatch(s -> s.equals(instant)); + } - @Override - public boolean containsOrBeforeTimelineStarts(String instant) { - return instants.stream().anyMatch(s -> s.getTimestamp().equals(instant)) - || isBeforeTimelineStarts(instant); - } + @Override + public boolean containsOrBeforeTimelineStarts(String instant) { + return instants.stream().anyMatch(s -> s.getTimestamp().equals(instant)) + || isBeforeTimelineStarts(instant); + } - @Override - public Stream getInstants() { - return instants.stream(); - } + @Override + public Stream getInstants() { + return instants.stream(); + } - @Override - public boolean isBeforeTimelineStarts(String instant) { - Optional firstCommit = firstInstant(); - return firstCommit.isPresent() && - HoodieTimeline.compareTimestamps(instant, firstCommit.get().getTimestamp(), LESSER); - } + @Override + public boolean isBeforeTimelineStarts(String instant) { + Optional firstCommit = firstInstant(); + return firstCommit.isPresent() && + HoodieTimeline.compareTimestamps(instant, firstCommit.get().getTimestamp(), LESSER); + } - @Override - public Optional getInstantDetails(HoodieInstant instant) { - return details.apply(instant); - } + @Override + public Optional getInstantDetails(HoodieInstant instant) { + return details.apply(instant); + } - @Override - public String toString() { - return this.getClass().getName() + ": " + 
instants.stream().map(Object::toString) - .collect(Collectors.joining(",")); - } + @Override + public String toString() { + return this.getClass().getName() + ": " + instants.stream().map(Object::toString) + .collect(Collectors.joining(",")); + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieInstant.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieInstant.java index 584105dee..bf27b7db2 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieInstant.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieInstant.java @@ -16,118 +16,117 @@ package com.uber.hoodie.common.table.timeline; -import com.google.common.io.Files; import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.util.FSUtils; -import org.apache.hadoop.fs.FileStatus; - import java.io.Serializable; import java.util.Objects; +import org.apache.hadoop.fs.FileStatus; /** - * A Hoodie Instant represents a action done on a hoodie dataset. - * All actions start with a inflight instant and then create a completed instant after done. + * A Hoodie Instant represents a action done on a hoodie dataset. All actions start with a inflight + * instant and then create a completed instant after done. * * @see HoodieTimeline */ public class HoodieInstant implements Serializable { - private boolean isInflight = false; - private String action; - private String timestamp; - /** - * Load the instant from the meta FileStatus - * @param fileStatus - */ - public HoodieInstant(FileStatus fileStatus) { - // First read the instant timestamp. 
[==>20170101193025<==].commit - String fileName = fileStatus.getPath().getName(); - String fileExtension = FSUtils.getFileExtension(fileName); - timestamp = fileName.replace(fileExtension, ""); + private boolean isInflight = false; + private String action; + private String timestamp; - // Next read the action for this marker - action = fileExtension.replaceFirst(".", ""); - if(action.equals("inflight")) { - // This is to support backwards compatibility on how in-flight commit files were written - // General rule is inflight extension is ..inflight, but for commit it is .inflight - action = "commit"; - isInflight = true; - } else if (action.contains(HoodieTimeline.INFLIGHT_EXTENSION)) { - isInflight = true; - action = action.replace(HoodieTimeline.INFLIGHT_EXTENSION, ""); - } + /** + * Load the instant from the meta FileStatus + */ + public HoodieInstant(FileStatus fileStatus) { + // First read the instant timestamp. [==>20170101193025<==].commit + String fileName = fileStatus.getPath().getName(); + String fileExtension = FSUtils.getFileExtension(fileName); + timestamp = fileName.replace(fileExtension, ""); + + // Next read the action for this marker + action = fileExtension.replaceFirst(".", ""); + if (action.equals("inflight")) { + // This is to support backwards compatibility on how in-flight commit files were written + // General rule is inflight extension is ..inflight, but for commit it is .inflight + action = "commit"; + isInflight = true; + } else if (action.contains(HoodieTimeline.INFLIGHT_EXTENSION)) { + isInflight = true; + action = action.replace(HoodieTimeline.INFLIGHT_EXTENSION, ""); } + } - public HoodieInstant(boolean isInflight, String action, String timestamp) { - this.isInflight = isInflight; - this.action = action; - this.timestamp = timestamp; - } + public HoodieInstant(boolean isInflight, String action, String timestamp) { + this.isInflight = isInflight; + this.action = action; + this.timestamp = timestamp; + } - public boolean isInflight() { - 
return isInflight; - } + public boolean isInflight() { + return isInflight; + } - public String getAction() { - return action; - } + public String getAction() { + return action; + } - public String getTimestamp() { - return timestamp; - } + public String getTimestamp() { + return timestamp; + } - /** - * Get the filename for this instant - * @return - */ - public String getFileName() { - if (HoodieTimeline.COMMIT_ACTION.equals(action)) { - return isInflight ? - HoodieTimeline.makeInflightCommitFileName(timestamp) : - HoodieTimeline.makeCommitFileName(timestamp); - } else if (HoodieTimeline.CLEAN_ACTION.equals(action)) { - return isInflight ? - HoodieTimeline.makeInflightCleanerFileName(timestamp) : - HoodieTimeline.makeCleanerFileName(timestamp); - } else if (HoodieTimeline.ROLLBACK_ACTION.equals(action)) { - return isInflight ? - HoodieTimeline.makeInflightRollbackFileName(timestamp) : - HoodieTimeline.makeRollbackFileName(timestamp); - } else if (HoodieTimeline.SAVEPOINT_ACTION.equals(action)) { - return isInflight ? - HoodieTimeline.makeInflightSavePointFileName(timestamp) : - HoodieTimeline.makeSavePointFileName(timestamp); - } else if (HoodieTimeline.COMPACTION_ACTION.equals(action)) { - return isInflight ? - HoodieTimeline.makeInflightCompactionFileName(timestamp) : - HoodieTimeline.makeCompactionFileName(timestamp); - } else if (HoodieTimeline.DELTA_COMMIT_ACTION.equals(action)) { - return isInflight ? - HoodieTimeline.makeInflightDeltaFileName(timestamp) : - HoodieTimeline.makeDeltaFileName(timestamp); - } - throw new IllegalArgumentException("Cannot get file name for unknown action " + action); + /** + * Get the filename for this instant + */ + public String getFileName() { + if (HoodieTimeline.COMMIT_ACTION.equals(action)) { + return isInflight ? + HoodieTimeline.makeInflightCommitFileName(timestamp) : + HoodieTimeline.makeCommitFileName(timestamp); + } else if (HoodieTimeline.CLEAN_ACTION.equals(action)) { + return isInflight ? 
+ HoodieTimeline.makeInflightCleanerFileName(timestamp) : + HoodieTimeline.makeCleanerFileName(timestamp); + } else if (HoodieTimeline.ROLLBACK_ACTION.equals(action)) { + return isInflight ? + HoodieTimeline.makeInflightRollbackFileName(timestamp) : + HoodieTimeline.makeRollbackFileName(timestamp); + } else if (HoodieTimeline.SAVEPOINT_ACTION.equals(action)) { + return isInflight ? + HoodieTimeline.makeInflightSavePointFileName(timestamp) : + HoodieTimeline.makeSavePointFileName(timestamp); + } else if (HoodieTimeline.COMPACTION_ACTION.equals(action)) { + return isInflight ? + HoodieTimeline.makeInflightCompactionFileName(timestamp) : + HoodieTimeline.makeCompactionFileName(timestamp); + } else if (HoodieTimeline.DELTA_COMMIT_ACTION.equals(action)) { + return isInflight ? + HoodieTimeline.makeInflightDeltaFileName(timestamp) : + HoodieTimeline.makeDeltaFileName(timestamp); } + throw new IllegalArgumentException("Cannot get file name for unknown action " + action); + } - @Override - public boolean equals(Object o) { - if (this == o) - return true; - if (o == null || getClass() != o.getClass()) - return false; - HoodieInstant that = (HoodieInstant) o; - return isInflight == that.isInflight && - Objects.equals(action, that.action) && - Objects.equals(timestamp, that.timestamp); + @Override + public boolean equals(Object o) { + if (this == o) { + return true; } + if (o == null || getClass() != o.getClass()) { + return false; + } + HoodieInstant that = (HoodieInstant) o; + return isInflight == that.isInflight && + Objects.equals(action, that.action) && + Objects.equals(timestamp, that.timestamp); + } - @Override - public int hashCode() { - return Objects.hash(isInflight, action, timestamp); - } + @Override + public int hashCode() { + return Objects.hash(isInflight, action, timestamp); + } - @Override - public String toString() { - return "[" + ((isInflight) ? 
"==>" : "") + timestamp + "__" + action + "]"; - } + @Override + public String toString() { + return "[" + ((isInflight) ? "==>" : "") + timestamp + "__" + action + "]"; + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/view/HoodieTableFileSystemView.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/view/HoodieTableFileSystemView.java index 6f1d63c44..afd2c89dc 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/table/view/HoodieTableFileSystemView.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/view/HoodieTableFileSystemView.java @@ -19,18 +19,12 @@ package com.uber.hoodie.common.table.view; import com.uber.hoodie.common.model.FileSlice; import com.uber.hoodie.common.model.HoodieDataFile; import com.uber.hoodie.common.model.HoodieFileGroup; -import com.uber.hoodie.common.table.HoodieTableMetaClient; -import com.uber.hoodie.common.table.TableFileSystemView; -import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.model.HoodieLogFile; +import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.table.HoodieTimeline; +import com.uber.hoodie.common.table.TableFileSystemView; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.exception.HoodieIOException; - -import org.apache.commons.lang3.tuple.Pair; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; - import java.io.IOException; import java.io.Serializable; import java.util.ArrayList; @@ -44,6 +38,10 @@ import java.util.Set; import java.util.function.Predicate; import java.util.stream.Collectors; import java.util.stream.Stream; +import org.apache.commons.lang3.tuple.Pair; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; /** * Common abstract implementation for multiple TableFileSystemView Implementations. 
2 possible @@ -54,8 +52,9 @@ import java.util.stream.Stream; * @see TableFileSystemView * @since 0.3.0 */ -public class HoodieTableFileSystemView implements TableFileSystemView, TableFileSystemView.ReadOptimizedView, - TableFileSystemView.RealtimeView, Serializable { +public class HoodieTableFileSystemView implements TableFileSystemView, + TableFileSystemView.ReadOptimizedView, + TableFileSystemView.RealtimeView, Serializable { protected HoodieTableMetaClient metaClient; protected transient FileSystem fs; @@ -69,12 +68,9 @@ public class HoodieTableFileSystemView implements TableFileSystemView, TableFile /** * Create a file system view, as of the given timeline - * - * @param metaClient - * @param visibleActiveTimeline */ public HoodieTableFileSystemView(HoodieTableMetaClient metaClient, - HoodieTimeline visibleActiveTimeline) { + HoodieTimeline visibleActiveTimeline) { this.metaClient = metaClient; this.fs = metaClient.getFs(); this.visibleActiveTimeline = visibleActiveTimeline; @@ -85,14 +81,10 @@ public class HoodieTableFileSystemView implements TableFileSystemView, TableFile /** * Create a file system view, as of the given timeline, with the provided file statuses. 
- * - * @param metaClient - * @param visibleActiveTimeline - * @param fileStatuses */ public HoodieTableFileSystemView(HoodieTableMetaClient metaClient, - HoodieTimeline visibleActiveTimeline, - FileStatus[] fileStatuses) { + HoodieTimeline visibleActiveTimeline, + FileStatus[] fileStatuses) { this(metaClient, visibleActiveTimeline); addFilesToView(fileStatuses); } @@ -104,44 +96,44 @@ public class HoodieTableFileSystemView implements TableFileSystemView, TableFile * @deprecated */ private void readObject(java.io.ObjectInputStream in) - throws IOException, ClassNotFoundException { + throws IOException, ClassNotFoundException { in.defaultReadObject(); this.fs = FSUtils.getFs(); } private void writeObject(java.io.ObjectOutputStream out) - throws IOException { + throws IOException { out.defaultWriteObject(); } /** * Adds the provided statuses into the file system view, and also caches it inside this object. - * - * @param statuses - * @return */ private List addFilesToView(FileStatus[] statuses) { - Map, List> dataFiles = convertFileStatusesToDataFiles(statuses) - .collect(Collectors.groupingBy((dataFile) -> { - String partitionPathStr = FSUtils.getRelativePartitionPath( - new Path(metaClient.getBasePath()), - dataFile.getFileStatus().getPath().getParent()); - return Pair.of(partitionPathStr , dataFile.getFileId()); - })); - Map, List> logFiles = convertFileStatusesToLogFiles(statuses) - .collect(Collectors.groupingBy((logFile) -> { - String partitionPathStr = FSUtils.getRelativePartitionPath( - new Path(metaClient.getBasePath()), - logFile.getPath().getParent()); - return Pair.of(partitionPathStr , logFile.getFileId()); - })); + Map, List> dataFiles = convertFileStatusesToDataFiles( + statuses) + .collect(Collectors.groupingBy((dataFile) -> { + String partitionPathStr = FSUtils.getRelativePartitionPath( + new Path(metaClient.getBasePath()), + dataFile.getFileStatus().getPath().getParent()); + return Pair.of(partitionPathStr, dataFile.getFileId()); + })); + Map, List> 
logFiles = convertFileStatusesToLogFiles( + statuses) + .collect(Collectors.groupingBy((logFile) -> { + String partitionPathStr = FSUtils.getRelativePartitionPath( + new Path(metaClient.getBasePath()), + logFile.getPath().getParent()); + return Pair.of(partitionPathStr, logFile.getFileId()); + })); Set> fileIdSet = new HashSet<>(dataFiles.keySet()); fileIdSet.addAll(logFiles.keySet()); List fileGroups = new ArrayList<>(); fileIdSet.forEach(pair -> { - HoodieFileGroup group = new HoodieFileGroup(pair.getKey(), pair.getValue(), visibleActiveTimeline); + HoodieFileGroup group = new HoodieFileGroup(pair.getKey(), pair.getValue(), + visibleActiveTimeline); if (dataFiles.containsKey(pair)) { dataFiles.get(pair).forEach(dataFile -> group.addDataFile(dataFile)); } @@ -165,90 +157,93 @@ public class HoodieTableFileSystemView implements TableFileSystemView, TableFile private Stream convertFileStatusesToDataFiles(FileStatus[] statuses) { Predicate roFilePredicate = fileStatus -> - fileStatus.getPath().getName().contains(metaClient.getTableConfig().getROFileFormat().getFileExtension()); + fileStatus.getPath().getName() + .contains(metaClient.getTableConfig().getROFileFormat().getFileExtension()); return Arrays.stream(statuses).filter(roFilePredicate).map(HoodieDataFile::new); } private Stream convertFileStatusesToLogFiles(FileStatus[] statuses) { Predicate rtFilePredicate = fileStatus -> - fileStatus.getPath().getName().contains(metaClient.getTableConfig().getRTFileFormat().getFileExtension()); + fileStatus.getPath().getName() + .contains(metaClient.getTableConfig().getRTFileFormat().getFileExtension()); return Arrays.stream(statuses).filter(rtFilePredicate).map(HoodieLogFile::new); } @Override public Stream getLatestDataFiles(final String partitionPath) { return getAllFileGroups(partitionPath) - .map(fileGroup -> fileGroup.getLatestDataFile()) - .filter(dataFileOpt -> dataFileOpt.isPresent()) - .map(Optional::get); + .map(fileGroup -> fileGroup.getLatestDataFile()) + 
.filter(dataFileOpt -> dataFileOpt.isPresent()) + .map(Optional::get); } @Override public Stream getLatestDataFiles() { return fileGroupMap.values().stream() - .map(fileGroup -> fileGroup.getLatestDataFile()) - .filter(dataFileOpt -> dataFileOpt.isPresent()) - .map(Optional::get); + .map(fileGroup -> fileGroup.getLatestDataFile()) + .filter(dataFileOpt -> dataFileOpt.isPresent()) + .map(Optional::get); } @Override public Stream getLatestDataFilesBeforeOrOn(String partitionPath, - String maxCommitTime) { + String maxCommitTime) { return getAllFileGroups(partitionPath) - .map(fileGroup -> fileGroup.getLatestDataFileBeforeOrOn(maxCommitTime)) - .filter(dataFileOpt -> dataFileOpt.isPresent()) - .map(Optional::get); + .map(fileGroup -> fileGroup.getLatestDataFileBeforeOrOn(maxCommitTime)) + .filter(dataFileOpt -> dataFileOpt.isPresent()) + .map(Optional::get); } @Override public Stream getLatestDataFilesInRange(List commitsToReturn) { - return fileGroupMap.values().stream() - .map(fileGroup -> fileGroup.getLatestDataFileInRange(commitsToReturn)) - .filter(dataFileOpt -> dataFileOpt.isPresent()) - .map(Optional::get); + return fileGroupMap.values().stream() + .map(fileGroup -> fileGroup.getLatestDataFileInRange(commitsToReturn)) + .filter(dataFileOpt -> dataFileOpt.isPresent()) + .map(Optional::get); } @Override public Stream getAllDataFiles(String partitionPath) { return getAllFileGroups(partitionPath) - .map(fileGroup -> fileGroup.getAllDataFiles()) - .flatMap(dataFileList -> dataFileList); + .map(fileGroup -> fileGroup.getAllDataFiles()) + .flatMap(dataFileList -> dataFileList); } @Override public Stream getLatestFileSlices(String partitionPath) { return getAllFileGroups(partitionPath) - .map(fileGroup -> fileGroup.getLatestFileSlice()) - .filter(dataFileOpt -> dataFileOpt.isPresent()) - .map(Optional::get); + .map(fileGroup -> fileGroup.getLatestFileSlice()) + .filter(dataFileOpt -> dataFileOpt.isPresent()) + .map(Optional::get); } @Override - public Stream 
getLatestFileSlicesBeforeOrOn(String partitionPath, String maxCommitTime) { + public Stream getLatestFileSlicesBeforeOrOn(String partitionPath, + String maxCommitTime) { return getAllFileGroups(partitionPath) - .map(fileGroup -> fileGroup.getLatestFileSliceBeforeOrOn(maxCommitTime)) - .filter(dataFileOpt -> dataFileOpt.isPresent()) - .map(Optional::get); + .map(fileGroup -> fileGroup.getLatestFileSliceBeforeOrOn(maxCommitTime)) + .filter(dataFileOpt -> dataFileOpt.isPresent()) + .map(Optional::get); } @Override public Stream getLatestFileSliceInRange(List commitsToReturn) { return fileGroupMap.values().stream() - .map(fileGroup -> fileGroup.getLatestFileSliceInRange(commitsToReturn)) - .filter(dataFileOpt -> dataFileOpt.isPresent()) - .map(Optional::get); + .map(fileGroup -> fileGroup.getLatestFileSliceInRange(commitsToReturn)) + .filter(dataFileOpt -> dataFileOpt.isPresent()) + .map(Optional::get); } @Override public Stream getAllFileSlices(String partitionPath) { return getAllFileGroups(partitionPath) - .map(group -> group.getAllFileSlices()) - .flatMap(sliceList -> sliceList); + .map(group -> group.getAllFileSlices()) + .flatMap(sliceList -> sliceList); } /** - * Given a partition path, obtain all filegroups within that. All methods, that work at the partition level - * go through this. + * Given a partition path, obtain all filegroups within that. All methods, that work at the + * partition level go through this. 
*/ @Override public Stream getAllFileGroups(String partitionPathStr) { @@ -266,7 +261,7 @@ public class HoodieTableFileSystemView implements TableFileSystemView, TableFile return fileGroups.stream(); } catch (IOException e) { throw new HoodieIOException( - "Failed to list data files in partition " + partitionPathStr, e); + "Failed to list data files in partition " + partitionPathStr, e); } } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/util/AvroUtils.java b/hoodie-common/src/main/java/com/uber/hoodie/common/util/AvroUtils.java index 3d1fad843..ae0dbd3f0 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/util/AvroUtils.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/util/AvroUtils.java @@ -19,7 +19,6 @@ package com.uber.hoodie.common.util; import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Lists; - import com.uber.hoodie.avro.model.HoodieCleanMetadata; import com.uber.hoodie.avro.model.HoodieCleanPartitionMetadata; import com.uber.hoodie.avro.model.HoodieRollbackMetadata; @@ -32,7 +31,11 @@ import com.uber.hoodie.common.model.HoodieAvroPayload; import com.uber.hoodie.common.model.HoodieKey; import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.exception.HoodieIOException; - +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.Optional; import org.apache.avro.Schema; import org.apache.avro.file.DataFileReader; import org.apache.avro.file.DataFileWriter; @@ -50,146 +53,140 @@ import org.apache.avro.specific.SpecificRecordBase; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.util.List; -import java.util.Map; -import java.util.Optional; - public class AvroUtils { - public static List> loadFromFiles(FileSystem fs, - List deltaFilePaths, Schema 
expectedSchema) { - List> loadedRecords = Lists.newArrayList(); - deltaFilePaths.forEach(s -> { - List> records = loadFromFile(fs, s, expectedSchema); - loadedRecords.addAll(records); - }); - return loadedRecords; + public static List> loadFromFiles(FileSystem fs, + List deltaFilePaths, Schema expectedSchema) { + List> loadedRecords = Lists.newArrayList(); + deltaFilePaths.forEach(s -> { + List> records = loadFromFile(fs, s, expectedSchema); + loadedRecords.addAll(records); + }); + return loadedRecords; + } + + public static List> loadFromFile(FileSystem fs, + String deltaFilePath, Schema expectedSchema) { + List> loadedRecords = Lists.newArrayList(); + Path path = new Path(deltaFilePath); + try { + SeekableInput input = new FsInput(path, fs.getConf()); + GenericDatumReader reader = new GenericDatumReader<>(); + // Set the expected schema to be the current schema to account for schema evolution + reader.setExpected(expectedSchema); + + FileReader fileReader = DataFileReader.openReader(input, reader); + for (GenericRecord deltaRecord : fileReader) { + String key = deltaRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); + String partitionPath = + deltaRecord.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString(); + loadedRecords.add(new HoodieRecord<>(new HoodieKey(key, partitionPath), + new HoodieAvroPayload(Optional.of(deltaRecord)))); + } + fileReader.close(); // also closes underlying FsInput + } catch (IOException e) { + throw new HoodieIOException("Could not read avro records from path " + deltaFilePath, + e); } + return loadedRecords; + } - public static List> loadFromFile(FileSystem fs, - String deltaFilePath, Schema expectedSchema) { - List> loadedRecords = Lists.newArrayList(); - Path path = new Path(deltaFilePath); - try { - SeekableInput input = new FsInput(path, fs.getConf()); - GenericDatumReader reader = new GenericDatumReader<>(); - // Set the expected schema to be the current schema to account for schema evolution - 
reader.setExpected(expectedSchema); - - FileReader fileReader = DataFileReader.openReader(input, reader); - for (GenericRecord deltaRecord : fileReader) { - String key = deltaRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); - String partitionPath = - deltaRecord.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString(); - loadedRecords.add(new HoodieRecord<>(new HoodieKey(key, partitionPath), - new HoodieAvroPayload(Optional.of(deltaRecord)))); - } - fileReader.close(); // also closes underlying FsInput - } catch (IOException e) { - throw new HoodieIOException("Could not read avro records from path " + deltaFilePath, - e); - } - return loadedRecords; + public static HoodieCleanMetadata convertCleanMetadata(String startCleanTime, + Optional durationInMs, List cleanStats) { + ImmutableMap.Builder partitionMetadataBuilder = + ImmutableMap.builder(); + int totalDeleted = 0; + String earliestCommitToRetain = null; + for (HoodieCleanStat stat : cleanStats) { + HoodieCleanPartitionMetadata metadata = + new HoodieCleanPartitionMetadata(stat.getPartitionPath(), stat.getPolicy().name(), + stat.getDeletePathPatterns(), stat.getSuccessDeleteFiles(), + stat.getDeletePathPatterns()); + partitionMetadataBuilder.put(stat.getPartitionPath(), metadata); + totalDeleted += stat.getSuccessDeleteFiles().size(); + if (earliestCommitToRetain == null) { + // This will be the same for all partitions + earliestCommitToRetain = stat.getEarliestCommitToRetain(); + } } + return new HoodieCleanMetadata(startCleanTime, durationInMs.orElseGet(() -> -1L), + totalDeleted, earliestCommitToRetain, partitionMetadataBuilder.build()); + } - public static HoodieCleanMetadata convertCleanMetadata(String startCleanTime, - Optional durationInMs, List cleanStats) { - ImmutableMap.Builder partitionMetadataBuilder = - ImmutableMap.builder(); - int totalDeleted = 0; - String earliestCommitToRetain = null; - for (HoodieCleanStat stat : cleanStats) { - HoodieCleanPartitionMetadata metadata = - new 
HoodieCleanPartitionMetadata(stat.getPartitionPath(), stat.getPolicy().name(), - stat.getDeletePathPatterns(), stat.getSuccessDeleteFiles(), - stat.getDeletePathPatterns()); - partitionMetadataBuilder.put(stat.getPartitionPath(), metadata); - totalDeleted += stat.getSuccessDeleteFiles().size(); - if (earliestCommitToRetain == null) { - // This will be the same for all partitions - earliestCommitToRetain = stat.getEarliestCommitToRetain(); - } - } - return new HoodieCleanMetadata(startCleanTime, durationInMs.orElseGet(() -> -1L), - totalDeleted, earliestCommitToRetain, partitionMetadataBuilder.build()); + public static HoodieRollbackMetadata convertRollbackMetadata(String startRollbackTime, + Optional durationInMs, List commits, List stats) { + ImmutableMap.Builder partitionMetadataBuilder = + ImmutableMap.builder(); + int totalDeleted = 0; + for (HoodieRollbackStat stat : stats) { + HoodieRollbackPartitionMetadata metadata = + new HoodieRollbackPartitionMetadata(stat.getPartitionPath(), + stat.getSuccessDeleteFiles(), stat.getFailedDeleteFiles()); + partitionMetadataBuilder.put(stat.getPartitionPath(), metadata); + totalDeleted += stat.getSuccessDeleteFiles().size(); } + return new HoodieRollbackMetadata(startRollbackTime, durationInMs.orElseGet(() -> -1L), + totalDeleted, commits, partitionMetadataBuilder.build()); + } - public static HoodieRollbackMetadata convertRollbackMetadata(String startRollbackTime, - Optional durationInMs, List commits, List stats) { - ImmutableMap.Builder partitionMetadataBuilder = - ImmutableMap.builder(); - int totalDeleted = 0; - for (HoodieRollbackStat stat : stats) { - HoodieRollbackPartitionMetadata metadata = - new HoodieRollbackPartitionMetadata(stat.getPartitionPath(), - stat.getSuccessDeleteFiles(), stat.getFailedDeleteFiles()); - partitionMetadataBuilder.put(stat.getPartitionPath(), metadata); - totalDeleted += stat.getSuccessDeleteFiles().size(); - } - return new HoodieRollbackMetadata(startRollbackTime, 
durationInMs.orElseGet(() -> -1L), - totalDeleted, commits, partitionMetadataBuilder.build()); - } - - public static HoodieSavepointMetadata convertSavepointMetadata(String user, String comment, - Map> latestFiles) { - ImmutableMap.Builder partitionMetadataBuilder = - ImmutableMap.builder(); - for (Map.Entry> stat : latestFiles.entrySet()) { - HoodieSavepointPartitionMetadata metadata = - new HoodieSavepointPartitionMetadata(stat.getKey(), stat.getValue()); - partitionMetadataBuilder.put(stat.getKey(), metadata); - } - return new HoodieSavepointMetadata(user, System.currentTimeMillis(), comment, - partitionMetadataBuilder.build()); + public static HoodieSavepointMetadata convertSavepointMetadata(String user, String comment, + Map> latestFiles) { + ImmutableMap.Builder partitionMetadataBuilder = + ImmutableMap.builder(); + for (Map.Entry> stat : latestFiles.entrySet()) { + HoodieSavepointPartitionMetadata metadata = + new HoodieSavepointPartitionMetadata(stat.getKey(), stat.getValue()); + partitionMetadataBuilder.put(stat.getKey(), metadata); } + return new HoodieSavepointMetadata(user, System.currentTimeMillis(), comment, + partitionMetadataBuilder.build()); + } - public static Optional serializeCleanMetadata(HoodieCleanMetadata metadata) - throws IOException { - return serializeAvroMetadata(metadata, HoodieCleanMetadata.class); - } + public static Optional serializeCleanMetadata(HoodieCleanMetadata metadata) + throws IOException { + return serializeAvroMetadata(metadata, HoodieCleanMetadata.class); + } - public static Optional serializeSavepointMetadata(HoodieSavepointMetadata metadata) - throws IOException { - return serializeAvroMetadata(metadata, HoodieSavepointMetadata.class); - } + public static Optional serializeSavepointMetadata(HoodieSavepointMetadata metadata) + throws IOException { + return serializeAvroMetadata(metadata, HoodieSavepointMetadata.class); + } - public static Optional serializeRollbackMetadata( - HoodieRollbackMetadata rollbackMetadata) 
throws IOException { - return serializeAvroMetadata(rollbackMetadata, HoodieRollbackMetadata.class); - } + public static Optional serializeRollbackMetadata( + HoodieRollbackMetadata rollbackMetadata) throws IOException { + return serializeAvroMetadata(rollbackMetadata, HoodieRollbackMetadata.class); + } - public static Optional serializeAvroMetadata(T metadata, - Class clazz) throws IOException { - DatumWriter datumWriter = new SpecificDatumWriter<>(clazz); - DataFileWriter fileWriter = new DataFileWriter<>(datumWriter); - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - fileWriter.create(metadata.getSchema(), baos); - fileWriter.append(metadata); - fileWriter.flush(); - return Optional.of(baos.toByteArray()); - } + public static Optional serializeAvroMetadata(T metadata, + Class clazz) throws IOException { + DatumWriter datumWriter = new SpecificDatumWriter<>(clazz); + DataFileWriter fileWriter = new DataFileWriter<>(datumWriter); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + fileWriter.create(metadata.getSchema(), baos); + fileWriter.append(metadata); + fileWriter.flush(); + return Optional.of(baos.toByteArray()); + } - public static HoodieCleanMetadata deserializeHoodieCleanMetadata(byte[] bytes) - throws IOException { - return deserializeAvroMetadata(bytes, HoodieCleanMetadata.class); - } + public static HoodieCleanMetadata deserializeHoodieCleanMetadata(byte[] bytes) + throws IOException { + return deserializeAvroMetadata(bytes, HoodieCleanMetadata.class); + } - public static HoodieSavepointMetadata deserializeHoodieSavepointMetadata(byte[] bytes) - throws IOException { - return deserializeAvroMetadata(bytes, HoodieSavepointMetadata.class); - } + public static HoodieSavepointMetadata deserializeHoodieSavepointMetadata(byte[] bytes) + throws IOException { + return deserializeAvroMetadata(bytes, HoodieSavepointMetadata.class); + } - public static T deserializeAvroMetadata(byte[] bytes, - Class clazz) throws IOException { - DatumReader 
reader = new SpecificDatumReader<>(clazz); - FileReader fileReader = - DataFileReader.openReader(new SeekableByteArrayInput(bytes), reader); - Preconditions - .checkArgument(fileReader.hasNext(), "Could not deserialize metadata of type " + clazz); - return fileReader.next(); - } + public static T deserializeAvroMetadata(byte[] bytes, + Class clazz) throws IOException { + DatumReader reader = new SpecificDatumReader<>(clazz); + FileReader fileReader = + DataFileReader.openReader(new SeekableByteArrayInput(bytes), reader); + Preconditions + .checkArgument(fileReader.hasNext(), "Could not deserialize metadata of type " + clazz); + return fileReader.next(); + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/util/FSUtils.java b/hoodie-common/src/main/java/com/uber/hoodie/common/util/FSUtils.java index e4c97f75b..daecf6237 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/util/FSUtils.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/util/FSUtils.java @@ -23,16 +23,6 @@ import com.uber.hoodie.common.model.HoodiePartitionMetadata; import com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.exception.HoodieIOException; import com.uber.hoodie.exception.InvalidHoodiePathException; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.LocatedFileStatus; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.RemoteIterator; -import org.apache.hadoop.hdfs.DistributedFileSystem; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - import java.io.File; import java.io.IOException; import java.util.ArrayList; @@ -43,319 +33,339 @@ import java.util.Optional; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Stream; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import 
org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; +import org.apache.hadoop.hdfs.DistributedFileSystem; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; /** * Utility functions related to accessing the file storage */ public class FSUtils { - private static final Logger LOG = LogManager.getLogger(FSUtils.class); - // Log files are of this pattern - .b5068208-e1a4-11e6-bf01-fe55135034f3_20170101134598.log.1 - private static final Pattern LOG_FILE_PATTERN = Pattern.compile("\\.(.*)_(.*)\\.(.*)\\.([0-9]*)"); - private static final String LOG_FILE_PREFIX = "."; - private static final int MAX_ATTEMPTS_RECOVER_LEASE = 10; - private static final long MIN_CLEAN_TO_KEEP = 10; - private static final long MIN_ROLLBACK_TO_KEEP = 10; - private static FileSystem fs; + private static final Logger LOG = LogManager.getLogger(FSUtils.class); + // Log files are of this pattern - .b5068208-e1a4-11e6-bf01-fe55135034f3_20170101134598.log.1 + private static final Pattern LOG_FILE_PATTERN = Pattern.compile("\\.(.*)_(.*)\\.(.*)\\.([0-9]*)"); + private static final String LOG_FILE_PREFIX = "."; + private static final int MAX_ATTEMPTS_RECOVER_LEASE = 10; + private static final long MIN_CLEAN_TO_KEEP = 10; + private static final long MIN_ROLLBACK_TO_KEEP = 10; + private static FileSystem fs; - /** - * Only to be used for testing. - */ - @VisibleForTesting - public static void setFs(FileSystem fs) { - FSUtils.fs = fs; + /** + * Only to be used for testing. 
+ */ + @VisibleForTesting + public static void setFs(FileSystem fs) { + FSUtils.fs = fs; + } + + + public static FileSystem getFs() { + if (fs != null) { + return fs; } - - - public static FileSystem getFs() { - if (fs != null) { - return fs; - } - Configuration conf = new Configuration(); - conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); - conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); - FileSystem fs; - try { - fs = FileSystem.get(conf); - } catch (IOException e) { - throw new HoodieIOException("Failed to get instance of " + FileSystem.class.getName(), - e); - } - LOG.info(String.format("Hadoop Configuration: fs.defaultFS: [%s], Config:[%s], FileSystem: [%s]", - conf.getRaw("fs.defaultFS"), conf.toString(), fs.toString())); - - return fs; + Configuration conf = new Configuration(); + conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); + conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); + FileSystem fs; + try { + fs = FileSystem.get(conf); + } catch (IOException e) { + throw new HoodieIOException("Failed to get instance of " + FileSystem.class.getName(), + e); } + LOG.info( + String.format("Hadoop Configuration: fs.defaultFS: [%s], Config:[%s], FileSystem: [%s]", + conf.getRaw("fs.defaultFS"), conf.toString(), fs.toString())); - public static String makeDataFileName(String commitTime, int taskPartitionId, String fileId) { - return String.format("%s_%d_%s.parquet", fileId, taskPartitionId, commitTime); + return fs; + } + + public static String makeDataFileName(String commitTime, int taskPartitionId, String fileId) { + return String.format("%s_%d_%s.parquet", fileId, taskPartitionId, commitTime); + } + + public static String maskWithoutFileId(String commitTime, int taskPartitionId) { + return String.format("*_%s_%s.parquet", taskPartitionId, commitTime); + } + + public static String maskWithoutTaskPartitionId(String commitTime, 
String fileId) { + return String.format("%s_*_%s.parquet", fileId, commitTime); + } + + public static String maskWithOnlyCommitTime(String commitTime) { + return String.format("*_*_%s.parquet", commitTime); + } + + public static String getCommitFromCommitFile(String commitFileName) { + return commitFileName.split("\\.")[0]; + } + + public static String getCommitTime(String fullFileName) { + return fullFileName.split("_")[2].split("\\.")[0]; + } + + public static long getFileSize(FileSystem fs, Path path) throws IOException { + return fs.getFileStatus(path).getLen(); + } + + public static String getFileId(String fullFileName) { + return fullFileName.split("_")[0]; + } + + + /** + * Gets all partition paths assuming date partitioning (year, month, day) three levels down. + */ + public static List getAllFoldersThreeLevelsDown(FileSystem fs, String basePath) + throws IOException { + List datePartitions = new ArrayList<>(); + FileStatus[] folders = fs.globStatus(new Path(basePath + "/*/*/*")); + for (FileStatus status : folders) { + Path path = status.getPath(); + datePartitions.add(String.format("%s/%s/%s", path.getParent().getParent().getName(), + path.getParent().getName(), path.getName())); } + return datePartitions; + } - public static String maskWithoutFileId(String commitTime, int taskPartitionId) { - return String.format("*_%s_%s.parquet", taskPartitionId, commitTime); + public static String getRelativePartitionPath(Path basePath, Path partitionPath) { + String partitionFullPath = partitionPath.toString(); + int partitionStartIndex = partitionFullPath.lastIndexOf(basePath.getName()); + return partitionFullPath.substring(partitionStartIndex + basePath.getName().length() + 1); + } + + /** + * Obtain all the partition paths, that are present in this table, denoted by presence of {@link + * com.uber.hoodie.common.model.HoodiePartitionMetadata#HOODIE_PARTITION_METAFILE} + */ + public static List getAllFoldersWithPartitionMetaFile(FileSystem fs, String basePathStr) + 
throws IOException { + List partitions = new ArrayList<>(); + Path basePath = new Path(basePathStr); + RemoteIterator allFiles = fs.listFiles(new Path(basePathStr), true); + while (allFiles.hasNext()) { + Path filePath = allFiles.next().getPath(); + if (filePath.getName().equals(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE)) { + partitions.add(getRelativePartitionPath(basePath, filePath.getParent())); + } } + return partitions; + } - public static String maskWithoutTaskPartitionId(String commitTime, String fileId) { - return String.format("%s_*_%s.parquet", fileId, commitTime); + public static List getAllPartitionPaths(FileSystem fs, String basePathStr, + boolean assumeDatePartitioning) + throws IOException { + if (assumeDatePartitioning) { + return getAllFoldersThreeLevelsDown(fs, basePathStr); + } else { + return getAllFoldersWithPartitionMetaFile(fs, basePathStr); } + } - public static String maskWithOnlyCommitTime(String commitTime) { - return String.format("*_*_%s.parquet", commitTime); + public static String getFileExtension(String fullName) { + Preconditions.checkNotNull(fullName); + String fileName = (new File(fullName)).getName(); + int dotIndex = fileName.indexOf('.'); + return dotIndex == -1 ? "" : fileName.substring(dotIndex); + } + + public static String getInstantTime(String name) { + return name.replace(getFileExtension(name), ""); + } + + + /** + * Get the file extension from the log file + */ + public static String getFileExtensionFromLog(Path logPath) { + Matcher matcher = LOG_FILE_PATTERN.matcher(logPath.getName()); + if (!matcher.find()) { + throw new InvalidHoodiePathException(logPath, "LogFile"); } + return matcher.group(3); + } - public static String getCommitFromCommitFile(String commitFileName) { - return commitFileName.split("\\.")[0]; + /** + * Get the first part of the file name in the log file. That will be the fileId. Log file do not + * have commitTime in the file name. 
+ */ + public static String getFileIdFromLogPath(Path path) { + Matcher matcher = LOG_FILE_PATTERN.matcher(path.getName()); + if (!matcher.find()) { + throw new InvalidHoodiePathException(path, "LogFile"); } + return matcher.group(1); + } - public static String getCommitTime(String fullFileName) { - return fullFileName.split("_")[2].split("\\.")[0]; + /** + * Get the first part of the file name in the log file. That will be the fileId. Log file do not + * have commitTime in the file name. + */ + public static String getBaseCommitTimeFromLogPath(Path path) { + Matcher matcher = LOG_FILE_PATTERN.matcher(path.getName()); + if (!matcher.find()) { + throw new InvalidHoodiePathException(path, "LogFile"); } + return matcher.group(2); + } - public static long getFileSize(FileSystem fs, Path path) throws IOException { - return fs.getFileStatus(path).getLen(); + /** + * Get the last part of the file name in the log file and convert to int. + */ + public static int getFileVersionFromLog(Path logPath) { + Matcher matcher = LOG_FILE_PATTERN.matcher(logPath.getName()); + if (!matcher.find()) { + throw new InvalidHoodiePathException(logPath, "LogFile"); } + return Integer.parseInt(matcher.group(4)); + } - public static String getFileId(String fullFileName) { - return fullFileName.split("_")[0]; + public static String makeLogFileName(String fileId, String logFileExtension, + String baseCommitTime, int version) { + return LOG_FILE_PREFIX + String + .format("%s_%s%s.%d", fileId, baseCommitTime, logFileExtension, version); + } + + public static String maskWithoutLogVersion(String commitTime, String fileId, + String logFileExtension) { + return LOG_FILE_PREFIX + String.format("%s_%s%s*", fileId, commitTime, logFileExtension); + } + + + /** + * Get the latest log file written from the list of log files passed in + */ + public static Optional getLatestLogFile(Stream logFiles) { + return logFiles.sorted(Comparator + .comparing(s -> s.getLogVersion(), + 
Comparator.reverseOrder())).findFirst(); + } + + /** + * Get all the log files for the passed in FileId in the partition path + */ + public static Stream getAllLogFiles(FileSystem fs, Path partitionPath, + final String fileId, final String logFileExtension, final String baseCommitTime) + throws IOException { + return Arrays.stream(fs.listStatus(partitionPath, + path -> path.getName().startsWith("." + fileId) && path.getName() + .contains(logFileExtension))) + .map(HoodieLogFile::new).filter(s -> s.getBaseCommitTime().equals(baseCommitTime)); + } + + /** + * Get the latest log version for the fileId in the partition path + */ + public static Optional getLatestLogVersion(FileSystem fs, Path partitionPath, + final String fileId, final String logFileExtension, final String baseCommitTime) + throws IOException { + Optional latestLogFile = + getLatestLogFile( + getAllLogFiles(fs, partitionPath, fileId, logFileExtension, baseCommitTime)); + if (latestLogFile.isPresent()) { + return Optional.of(latestLogFile.get().getLogVersion()); } + return Optional.empty(); + } + public static int getCurrentLogVersion(FileSystem fs, Path partitionPath, + final String fileId, final String logFileExtension, final String baseCommitTime) + throws IOException { + Optional currentVersion = + getLatestLogVersion(fs, partitionPath, fileId, logFileExtension, baseCommitTime); + // handle potential overflow + return (currentVersion.isPresent()) ? currentVersion.get() : 1; + } - /** - * Gets all partition paths assuming date partitioning (year, month, day) three levels down. 
- */ - public static List getAllFoldersThreeLevelsDown(FileSystem fs, String basePath) throws IOException { - List datePartitions = new ArrayList<>(); - FileStatus[] folders = fs.globStatus(new Path(basePath + "/*/*/*")); - for (FileStatus status : folders) { - Path path = status.getPath(); - datePartitions.add(String.format("%s/%s/%s", path.getParent().getParent().getName(), - path.getParent().getName(), path.getName())); - } - return datePartitions; + /** + * computes the next log version for the specified fileId in the partition path + */ + public static int computeNextLogVersion(FileSystem fs, Path partitionPath, final String fileId, + final String logFileExtension, final String baseCommitTime) throws IOException { + Optional currentVersion = + getLatestLogVersion(fs, partitionPath, fileId, logFileExtension, baseCommitTime); + // handle potential overflow + return (currentVersion.isPresent()) ? currentVersion.get() + 1 : 1; + } + + public static int getDefaultBufferSize(final FileSystem fs) { + return fs.getConf().getInt("io.file.buffer.size", 4096); + } + + public static Short getDefaultReplication(FileSystem fs, Path path) { + return fs.getDefaultReplication(path); + } + + public static Long getDefaultBlockSize(FileSystem fs, Path path) { + return fs.getDefaultBlockSize(path); + } + + /** + * When a file was opened and the task died without closing the stream, another task executor + * cannot open because the existing lease will be active. We will try to recover the lease, from + * HDFS. If a data node went down, it takes about 10 minutes for the lease to be rocovered. But if + * the client dies, this should be instant. 
+ */ + public static boolean recoverDFSFileLease(final DistributedFileSystem dfs, final Path p) + throws IOException, InterruptedException { + LOG.info("Recover lease on dfs file " + p); + // initiate the recovery + boolean recovered = false; + for (int nbAttempt = 0; nbAttempt < MAX_ATTEMPTS_RECOVER_LEASE; nbAttempt++) { + LOG.info("Attempt " + nbAttempt + " to recover lease on dfs file " + p); + recovered = dfs.recoverLease(p); + if (recovered) { + break; + } + // Sleep for 1 second before trying again. Typically it takes about 2-3 seconds to recover under default settings + Thread.sleep(1000); } + return recovered; - public static String getRelativePartitionPath(Path basePath, Path partitionPath) { - String partitionFullPath = partitionPath.toString(); - int partitionStartIndex = partitionFullPath.lastIndexOf(basePath.getName()); - return partitionFullPath.substring(partitionStartIndex + basePath.getName().length() + 1); + } + + public static void deleteOlderCleanMetaFiles(FileSystem fs, String metaPath, + Stream instants) { + //TODO - this should be archived when archival is made general for all meta-data + // skip MIN_CLEAN_TO_KEEP and delete rest + instants.skip(MIN_CLEAN_TO_KEEP).map(s -> { + try { + return fs.delete(new Path(metaPath, s.getFileName()), false); + } catch (IOException e) { + throw new HoodieIOException("Could not delete clean meta files" + s.getFileName(), + e); + } + }); + } + + public static void deleteOlderRollbackMetaFiles(FileSystem fs, String metaPath, + Stream instants) { + //TODO - this should be archived when archival is made general for all meta-data + // skip MIN_ROLLBACK_TO_KEEP and delete rest + instants.skip(MIN_ROLLBACK_TO_KEEP).map(s -> { + try { + return fs.delete(new Path(metaPath, s.getFileName()), false); + } catch (IOException e) { + throw new HoodieIOException( + "Could not delete rollback meta files " + s.getFileName(), e); + } + }); + } + + public static void createPathIfNotExists(FileSystem fs, Path partitionPath) 
throws IOException { + if (!fs.exists(partitionPath)) { + fs.mkdirs(partitionPath); } + } - /** - * Obtain all the partition paths, that are present in this table, denoted by presence of {@link - * com.uber.hoodie.common.model.HoodiePartitionMetadata#HOODIE_PARTITION_METAFILE} - */ - public static List getAllFoldersWithPartitionMetaFile(FileSystem fs, String basePathStr) - throws IOException { - List partitions = new ArrayList<>(); - Path basePath = new Path(basePathStr); - RemoteIterator allFiles = fs.listFiles(new Path(basePathStr), true); - while (allFiles.hasNext()) { - Path filePath = allFiles.next().getPath(); - if (filePath.getName().equals(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE)) { - partitions.add(getRelativePartitionPath(basePath, filePath.getParent())); - } - } - return partitions; - } - - public static List getAllPartitionPaths(FileSystem fs, String basePathStr, boolean assumeDatePartitioning) - throws IOException { - if (assumeDatePartitioning) { - return getAllFoldersThreeLevelsDown(fs, basePathStr); - } else { - return getAllFoldersWithPartitionMetaFile(fs, basePathStr); - } - } - - public static String getFileExtension(String fullName) { - Preconditions.checkNotNull(fullName); - String fileName = (new File(fullName)).getName(); - int dotIndex = fileName.indexOf('.'); - return dotIndex == -1 ? "" : fileName.substring(dotIndex); - } - - public static String getInstantTime(String name) { - return name.replace(getFileExtension(name), ""); - } - - - /** - * Get the file extension from the log file - */ - public static String getFileExtensionFromLog(Path logPath) { - Matcher matcher = LOG_FILE_PATTERN.matcher(logPath.getName()); - if (!matcher.find()) { - throw new InvalidHoodiePathException(logPath, "LogFile"); - } - return matcher.group(3); - } - - /** - * Get the first part of the file name in the log file. That will be the fileId. Log file do not - * have commitTime in the file name. 
- */ - public static String getFileIdFromLogPath(Path path) { - Matcher matcher = LOG_FILE_PATTERN.matcher(path.getName()); - if (!matcher.find()) { - throw new InvalidHoodiePathException(path, "LogFile"); - } - return matcher.group(1); - } - - /** - * Get the first part of the file name in the log file. That will be the fileId. Log file do not - * have commitTime in the file name. - */ - public static String getBaseCommitTimeFromLogPath(Path path) { - Matcher matcher = LOG_FILE_PATTERN.matcher(path.getName()); - if (!matcher.find()) { - throw new InvalidHoodiePathException(path, "LogFile"); - } - return matcher.group(2); - } - - /** - * Get the last part of the file name in the log file and convert to int. - */ - public static int getFileVersionFromLog(Path logPath) { - Matcher matcher = LOG_FILE_PATTERN.matcher(logPath.getName()); - if (!matcher.find()) { - throw new InvalidHoodiePathException(logPath, "LogFile"); - } - return Integer.parseInt(matcher.group(4)); - } - - public static String makeLogFileName(String fileId, String logFileExtension, - String baseCommitTime, int version) { - return LOG_FILE_PREFIX + String.format("%s_%s%s.%d", fileId, baseCommitTime, logFileExtension, version); - } - - public static String maskWithoutLogVersion(String commitTime, String fileId, String logFileExtension) { - return LOG_FILE_PREFIX + String.format("%s_%s%s*", fileId, commitTime, logFileExtension); - } - - - /** - * Get the latest log file written from the list of log files passed in - */ - public static Optional getLatestLogFile(Stream logFiles) { - return logFiles.sorted(Comparator - .comparing(s -> s.getLogVersion(), - Comparator.reverseOrder())).findFirst(); - } - - /** - * Get all the log files for the passed in FileId in the partition path - */ - public static Stream getAllLogFiles(FileSystem fs, Path partitionPath, - final String fileId, final String logFileExtension, final String baseCommitTime) throws IOException { - return 
Arrays.stream(fs.listStatus(partitionPath, - path -> path.getName().startsWith("." + fileId) && path.getName().contains(logFileExtension))) - .map(HoodieLogFile::new).filter(s -> s.getBaseCommitTime().equals(baseCommitTime)); - } - - /** - * Get the latest log version for the fileId in the partition path - */ - public static Optional getLatestLogVersion(FileSystem fs, Path partitionPath, - final String fileId, final String logFileExtension, final String baseCommitTime) throws IOException { - Optional latestLogFile = - getLatestLogFile(getAllLogFiles(fs, partitionPath, fileId, logFileExtension, baseCommitTime)); - if (latestLogFile.isPresent()) { - return Optional.of(latestLogFile.get().getLogVersion()); - } - return Optional.empty(); - } - - public static int getCurrentLogVersion(FileSystem fs, Path partitionPath, - final String fileId, final String logFileExtension, final String baseCommitTime) throws IOException { - Optional currentVersion = - getLatestLogVersion(fs, partitionPath, fileId, logFileExtension, baseCommitTime); - // handle potential overflow - return (currentVersion.isPresent()) ? currentVersion.get() : 1; - } - - /** - * computes the next log version for the specified fileId in the partition path - */ - public static int computeNextLogVersion(FileSystem fs, Path partitionPath, final String fileId, - final String logFileExtension, final String baseCommitTime) throws IOException { - Optional currentVersion = - getLatestLogVersion(fs, partitionPath, fileId, logFileExtension, baseCommitTime); - // handle potential overflow - return (currentVersion.isPresent()) ? 
currentVersion.get() + 1 : 1; - } - - public static int getDefaultBufferSize(final FileSystem fs) { - return fs.getConf().getInt("io.file.buffer.size", 4096); - } - - public static Short getDefaultReplication(FileSystem fs, Path path) { - return fs.getDefaultReplication(path); - } - - public static Long getDefaultBlockSize(FileSystem fs, Path path) { - return fs.getDefaultBlockSize(path); - } - - /** - * When a file was opened and the task died without closing the stream, another task executor - * cannot open because the existing lease will be active. We will try to recover the lease, from - * HDFS. If a data node went down, it takes about 10 minutes for the lease to be rocovered. But - * if the client dies, this should be instant. - */ - public static boolean recoverDFSFileLease(final DistributedFileSystem dfs, final Path p) - throws IOException, InterruptedException { - LOG.info("Recover lease on dfs file " + p); - // initiate the recovery - boolean recovered = false; - for (int nbAttempt = 0; nbAttempt < MAX_ATTEMPTS_RECOVER_LEASE; nbAttempt++) { - LOG.info("Attempt " + nbAttempt + " to recover lease on dfs file " + p); - recovered = dfs.recoverLease(p); - if (recovered) - break; - // Sleep for 1 second before trying again. 
Typically it takes about 2-3 seconds to recover under default settings - Thread.sleep(1000); - } - return recovered; - - } - - public static void deleteOlderCleanMetaFiles(FileSystem fs, String metaPath, - Stream instants) { - //TODO - this should be archived when archival is made general for all meta-data - // skip MIN_CLEAN_TO_KEEP and delete rest - instants.skip(MIN_CLEAN_TO_KEEP).map(s -> { - try { - return fs.delete(new Path(metaPath, s.getFileName()), false); - } catch (IOException e) { - throw new HoodieIOException("Could not delete clean meta files" + s.getFileName(), - e); - } - }); - } - - public static void deleteOlderRollbackMetaFiles(FileSystem fs, String metaPath, - Stream instants) { - //TODO - this should be archived when archival is made general for all meta-data - // skip MIN_ROLLBACK_TO_KEEP and delete rest - instants.skip(MIN_ROLLBACK_TO_KEEP).map(s -> { - try { - return fs.delete(new Path(metaPath, s.getFileName()), false); - } catch (IOException e) { - throw new HoodieIOException( - "Could not delete rollback meta files " + s.getFileName(), e); - } - }); - } - - public static void createPathIfNotExists(FileSystem fs, Path partitionPath) throws IOException { - if(!fs.exists(partitionPath)) { - fs.mkdirs(partitionPath); - } - } - - public static Long getSizeInMB(long sizeInBytes) { - return sizeInBytes / (1024 * 1024); - } + public static Long getSizeInMB(long sizeInBytes) { + return sizeInBytes / (1024 * 1024); + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/util/HoodieAvroUtils.java b/hoodie-common/src/main/java/com/uber/hoodie/common/util/HoodieAvroUtils.java index 8323bc5ca..519ce7b60 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/util/HoodieAvroUtils.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/util/HoodieAvroUtils.java @@ -17,156 +17,167 @@ package com.uber.hoodie.common.util; import com.uber.hoodie.common.model.HoodieRecord; - import com.uber.hoodie.exception.HoodieIOException; 
import com.uber.hoodie.exception.SchemaCompatabilityException; import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; import java.util.zip.DeflaterOutputStream; import java.util.zip.InflaterInputStream; import org.apache.avro.Schema; -import org.apache.avro.generic.*; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericDatumReader; +import org.apache.avro.generic.GenericDatumWriter; +import org.apache.avro.generic.GenericRecord; import org.apache.avro.io.BinaryEncoder; import org.apache.avro.io.Decoder; import org.apache.avro.io.DecoderFactory; import org.apache.avro.io.EncoderFactory; -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - /** * Helper class to do common stuff across Avro. */ public class HoodieAvroUtils { - // All metadata fields are optional strings. - private final static Schema METADATA_FIELD_SCHEMA = Schema.createUnion(Arrays.asList( - Schema.create(Schema.Type.NULL), - Schema.create(Schema.Type.STRING))); + // All metadata fields are optional strings. 
+ private final static Schema METADATA_FIELD_SCHEMA = Schema.createUnion(Arrays.asList( + Schema.create(Schema.Type.NULL), + Schema.create(Schema.Type.STRING))); - private final static Schema RECORD_KEY_SCHEMA = initRecordKeySchema(); + private final static Schema RECORD_KEY_SCHEMA = initRecordKeySchema(); - /** - * Convert a given avro record to bytes - */ - public static byte[] avroToBytes(GenericRecord record) throws IOException { - GenericDatumWriter writer = - new GenericDatumWriter<>(record.getSchema()); - ByteArrayOutputStream out = new ByteArrayOutputStream(); - BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(out, null); - writer.write(record, encoder); - encoder.flush(); - out.close(); - return out.toByteArray(); + /** + * Convert a given avro record to bytes + */ + public static byte[] avroToBytes(GenericRecord record) throws IOException { + GenericDatumWriter writer = + new GenericDatumWriter<>(record.getSchema()); + ByteArrayOutputStream out = new ByteArrayOutputStream(); + BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(out, null); + writer.write(record, encoder); + encoder.flush(); + out.close(); + return out.toByteArray(); + } + + /** + * Convert serialized bytes back into avro record + */ + public static GenericRecord bytesToAvro(byte[] bytes, Schema schema) throws IOException { + Decoder decoder = DecoderFactory.get().binaryDecoder(bytes, null); + GenericDatumReader reader = new GenericDatumReader(schema); + return reader.read(null, decoder); + } + + + /** + * Adds the Hoodie metadata fields to the given schema + */ + public static Schema addMetadataFields(Schema schema) { + List parentFields = new ArrayList<>(); + + Schema.Field commitTimeField = new Schema.Field(HoodieRecord.COMMIT_TIME_METADATA_FIELD, + METADATA_FIELD_SCHEMA, "", null); + Schema.Field commitSeqnoField = new Schema.Field(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD, + METADATA_FIELD_SCHEMA, "", null); + Schema.Field recordKeyField = new 
Schema.Field(HoodieRecord.RECORD_KEY_METADATA_FIELD, + METADATA_FIELD_SCHEMA, "", null); + Schema.Field partitionPathField = new Schema.Field(HoodieRecord.PARTITION_PATH_METADATA_FIELD, + METADATA_FIELD_SCHEMA, "", null); + Schema.Field fileNameField = new Schema.Field(HoodieRecord.FILENAME_METADATA_FIELD, + METADATA_FIELD_SCHEMA, "", null); + + parentFields.add(commitTimeField); + parentFields.add(commitSeqnoField); + parentFields.add(recordKeyField); + parentFields.add(partitionPathField); + parentFields.add(fileNameField); + for (Schema.Field field : schema.getFields()) { + parentFields.add(new Schema.Field(field.name(), field.schema(), field.doc(), null)); } - /** - * Convert serialized bytes back into avro record - */ - public static GenericRecord bytesToAvro(byte[] bytes, Schema schema) throws IOException { - Decoder decoder = DecoderFactory.get().binaryDecoder(bytes, null); - GenericDatumReader reader = new GenericDatumReader(schema); - return reader.read(null, decoder); + Schema mergedSchema = Schema + .createRecord(schema.getName(), schema.getDoc(), schema.getNamespace(), false); + mergedSchema.setFields(parentFields); + return mergedSchema; + } + + private static Schema initRecordKeySchema() { + Schema.Field recordKeyField = new Schema.Field(HoodieRecord.RECORD_KEY_METADATA_FIELD, + METADATA_FIELD_SCHEMA, "", null); + Schema recordKeySchema = Schema.createRecord("HoodieRecordKey", "", "", false); + recordKeySchema.setFields(Arrays.asList(recordKeyField)); + return recordKeySchema; + } + + public static Schema getRecordKeySchema() { + return RECORD_KEY_SCHEMA; + } + + public static GenericRecord addHoodieKeyToRecord(GenericRecord record, String recordKey, + String partitionPath, String fileName) { + record.put(HoodieRecord.FILENAME_METADATA_FIELD, fileName); + record.put(HoodieRecord.PARTITION_PATH_METADATA_FIELD, partitionPath); + record.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, recordKey); + return record; + } + + /** + * Adds the Hoodie commit 
metadata into the provided Generic Record. + */ + public static GenericRecord addCommitMetadataToRecord(GenericRecord record, String commitTime, + String commitSeqno) { + record.put(HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitTime); + record.put(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD, commitSeqno); + return record; + } + + + /** + * Given a avro record with a given schema, rewrites it into the new schema + */ + public static GenericRecord rewriteRecord(GenericRecord record, Schema newSchema) { + GenericRecord newRecord = new GenericData.Record(newSchema); + for (Schema.Field f : record.getSchema().getFields()) { + newRecord.put(f.name(), record.get(f.name())); } - - - /** - * Adds the Hoodie metadata fields to the given schema - */ - public static Schema addMetadataFields(Schema schema) { - List parentFields = new ArrayList<>(); - - Schema.Field commitTimeField = new Schema.Field(HoodieRecord.COMMIT_TIME_METADATA_FIELD, METADATA_FIELD_SCHEMA, "", null); - Schema.Field commitSeqnoField = new Schema.Field(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD, METADATA_FIELD_SCHEMA, "", null); - Schema.Field recordKeyField = new Schema.Field(HoodieRecord.RECORD_KEY_METADATA_FIELD, METADATA_FIELD_SCHEMA, "", null); - Schema.Field partitionPathField = new Schema.Field(HoodieRecord.PARTITION_PATH_METADATA_FIELD, METADATA_FIELD_SCHEMA, "", null); - Schema.Field fileNameField = new Schema.Field(HoodieRecord.FILENAME_METADATA_FIELD, METADATA_FIELD_SCHEMA, "", null); - - parentFields.add(commitTimeField); - parentFields.add(commitSeqnoField); - parentFields.add(recordKeyField); - parentFields.add(partitionPathField); - parentFields.add(fileNameField); - for (Schema.Field field : schema.getFields()) { - parentFields.add(new Schema.Field(field.name(), field.schema(), field.doc(), null)); - } - - Schema mergedSchema = Schema.createRecord(schema.getName(), schema.getDoc(), schema.getNamespace(), false); - mergedSchema.setFields(parentFields); - return mergedSchema; + if (!new 
GenericData().validate(newSchema, newRecord)) { + throw new SchemaCompatabilityException( + "Unable to validate the rewritten record " + record + " against schema " + + newSchema); } + return newRecord; + } - private static Schema initRecordKeySchema() { - Schema.Field recordKeyField = new Schema.Field(HoodieRecord.RECORD_KEY_METADATA_FIELD, METADATA_FIELD_SCHEMA, "", null); - Schema recordKeySchema = Schema.createRecord("HoodieRecordKey", "", "", false); - recordKeySchema.setFields(Arrays.asList(recordKeyField)); - return recordKeySchema; + public static byte[] compress(String text) { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + try { + OutputStream out = new DeflaterOutputStream(baos); + out.write(text.getBytes("UTF-8")); + out.close(); + } catch (IOException e) { + throw new HoodieIOException("IOException while compressing text " + text, e); } + return baos.toByteArray(); + } - public static Schema getRecordKeySchema() { - return RECORD_KEY_SCHEMA; - } - - public static GenericRecord addHoodieKeyToRecord(GenericRecord record, String recordKey, String partitionPath, String fileName) { - record.put(HoodieRecord.FILENAME_METADATA_FIELD, fileName); - record.put(HoodieRecord.PARTITION_PATH_METADATA_FIELD, partitionPath); - record.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, recordKey); - return record; - } - - /** - * Adds the Hoodie commit metadata into the provided Generic Record. 
- */ - public static GenericRecord addCommitMetadataToRecord(GenericRecord record, String commitTime, String commitSeqno) { - record.put(HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitTime); - record.put(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD, commitSeqno); - return record; - } - - - /** - * Given a avro record with a given schema, rewrites it into the new schema - */ - public static GenericRecord rewriteRecord(GenericRecord record, Schema newSchema) { - GenericRecord newRecord = new GenericData.Record(newSchema); - for (Schema.Field f : record.getSchema().getFields()) { - newRecord.put(f.name(), record.get(f.name())); - } - if (!new GenericData().validate(newSchema, newRecord)) { - throw new SchemaCompatabilityException( - "Unable to validate the rewritten record " + record + " against schema " - + newSchema); - } - return newRecord; - } - - public static byte[] compress(String text) { - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - try { - OutputStream out = new DeflaterOutputStream(baos); - out.write(text.getBytes("UTF-8")); - out.close(); - } catch (IOException e) { - throw new HoodieIOException("IOException while compressing text " + text, e); - } - return baos.toByteArray(); - } - - public static String decompress(byte[] bytes) { - InputStream in = new InflaterInputStream(new ByteArrayInputStream(bytes)); - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - try { - byte[] buffer = new byte[8192]; - int len; - while((len = in.read(buffer))>0) - baos.write(buffer, 0, len); - return new String(baos.toByteArray(), "UTF-8"); - } catch (IOException e) { - throw new HoodieIOException("IOException while decompressing text", e); - } + public static String decompress(byte[] bytes) { + InputStream in = new InflaterInputStream(new ByteArrayInputStream(bytes)); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + try { + byte[] buffer = new byte[8192]; + int len; + while ((len = in.read(buffer)) > 0) { + baos.write(buffer, 0, len); + } + 
return new String(baos.toByteArray(), "UTF-8"); + } catch (IOException e) { + throw new HoodieIOException("IOException while decompressing text", e); } + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/util/NumericUtils.java b/hoodie-common/src/main/java/com/uber/hoodie/common/util/NumericUtils.java index 7828c1e73..a15ae1ec1 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/util/NumericUtils.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/util/NumericUtils.java @@ -17,10 +17,13 @@ package com.uber.hoodie.common.util; public class NumericUtils { - public static String humanReadableByteCount(double bytes) { - if (bytes < 1024) return String.format("%.1f B", bytes); - int exp = (int) (Math.log(bytes) / Math.log(1024)); - String pre = "KMGTPE".charAt(exp-1) + ""; - return String.format("%.1f %sB", bytes / Math.pow(1024, exp), pre); + + public static String humanReadableByteCount(double bytes) { + if (bytes < 1024) { + return String.format("%.1f B", bytes); } + int exp = (int) (Math.log(bytes) / Math.log(1024)); + String pre = "KMGTPE".charAt(exp - 1) + ""; + return String.format("%.1f %sB", bytes / Math.pow(1024, exp), pre); + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/util/ParquetUtils.java b/hoodie-common/src/main/java/com/uber/hoodie/common/util/ParquetUtils.java index 017f3cfbf..a4a683350 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/util/ParquetUtils.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/util/ParquetUtils.java @@ -16,14 +16,20 @@ package com.uber.hoodie.common.util; +import static com.uber.hoodie.common.util.FSUtils.getFs; + import com.uber.hoodie.avro.HoodieAvroWriteSupport; import com.uber.hoodie.common.BloomFilter; import com.uber.hoodie.common.model.HoodieRecord; - import com.uber.hoodie.exception.HoodieException; import com.uber.hoodie.exception.HoodieIOException; import com.uber.hoodie.exception.MetadataNotFoundException; - +import 
java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.hadoop.conf.Configuration; @@ -36,163 +42,144 @@ import org.apache.parquet.hadoop.ParquetReader; import org.apache.parquet.hadoop.metadata.ParquetMetadata; import org.apache.parquet.schema.MessageType; -import java.io.*; -import java.util.ArrayList; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import static com.uber.hoodie.common.util.FSUtils.getFs; - /** * Utility functions involving with parquet. */ public class ParquetUtils { - /** - * Read the rowKey list from the given parquet file. - * - * @param filePath The parquet file path. - */ - public static Set readRowKeysFromParquet(Path filePath) { - Configuration conf = new Configuration(); - conf.addResource(getFs().getConf()); - Schema readSchema = HoodieAvroUtils.getRecordKeySchema(); - AvroReadSupport.setAvroReadSchema(conf, readSchema); - AvroReadSupport.setRequestedProjection(conf, readSchema); - ParquetReader reader = null; - Set rowKeys = new HashSet<>(); + /** + * Read the rowKey list from the given parquet file. + * + * @param filePath The parquet file path. 
+ */ + public static Set readRowKeysFromParquet(Path filePath) { + Configuration conf = new Configuration(); + conf.addResource(getFs().getConf()); + Schema readSchema = HoodieAvroUtils.getRecordKeySchema(); + AvroReadSupport.setAvroReadSchema(conf, readSchema); + AvroReadSupport.setRequestedProjection(conf, readSchema); + ParquetReader reader = null; + Set rowKeys = new HashSet<>(); + try { + reader = AvroParquetReader.builder(filePath).withConf(conf).build(); + Object obj = reader.read(); + while (obj != null) { + if (obj instanceof GenericRecord) { + rowKeys.add(((GenericRecord) obj).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString()); + } + obj = reader.read(); + } + } catch (IOException e) { + throw new HoodieIOException("Failed to read row keys from Parquet " + filePath, e); + + } finally { + if (reader != null) { try { - reader = AvroParquetReader.builder(filePath).withConf(conf).build(); - Object obj = reader.read(); - while (obj != null) { - if (obj instanceof GenericRecord) { - rowKeys.add(((GenericRecord) obj).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString()); - } - obj = reader.read(); - } + reader.close(); } catch (IOException e) { - throw new HoodieIOException("Failed to read row keys from Parquet " + filePath, e); - - } finally { - if (reader != null) { - try { - reader.close(); - } catch (IOException e) { - // ignore - } - } + // ignore } - return rowKeys; + } } + return rowKeys; + } - /** - * - * Read the metadata from a parquet file - * - * @param parquetFilePath - * @return - */ - public static ParquetMetadata readMetadata(Path parquetFilePath) { - return readMetadata(new Configuration(), parquetFilePath); + /** + * Read the metadata from a parquet file + */ + public static ParquetMetadata readMetadata(Path parquetFilePath) { + return readMetadata(new Configuration(), parquetFilePath); + } + + public static ParquetMetadata readMetadata(Configuration conf, Path parquetFilePath) { + ParquetMetadata footer; + try { + // TODO(vc): Should we 
use the parallel reading version here? + footer = ParquetFileReader.readFooter(getFs().getConf(), parquetFilePath); + } catch (IOException e) { + throw new HoodieIOException("Failed to read footer for parquet " + parquetFilePath, + e); } + return footer; + } - public static ParquetMetadata readMetadata(Configuration conf, Path parquetFilePath) { - ParquetMetadata footer; + + /** + * Get the schema of the given parquet file. + */ + public static MessageType readSchema(Path parquetFilePath) { + return readMetadata(parquetFilePath).getFileMetaData().getSchema(); + } + + + private static List readParquetFooter(Path parquetFilePath, String... footerNames) { + List footerVals = new ArrayList<>(); + ParquetMetadata footer = readMetadata(parquetFilePath); + Map metadata = footer.getFileMetaData().getKeyValueMetaData(); + for (String footerName : footerNames) { + if (metadata.containsKey(footerName)) { + footerVals.add(metadata.get(footerName)); + } else { + throw new MetadataNotFoundException("Could not find index in Parquet footer. " + + "Looked for key " + footerName + " in " + parquetFilePath); + } + } + return footerVals; + } + + public static Schema readAvroSchema(Path parquetFilePath) { + return new AvroSchemaConverter().convert(readSchema(parquetFilePath)); + } + + /** + * Read out the bloom filter from the parquet file meta data. + */ + public static BloomFilter readBloomFilterFromParquetMetadata(Path parquetFilePath) { + String footerVal = readParquetFooter(parquetFilePath, + HoodieAvroWriteSupport.HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY).get(0); + return new BloomFilter(footerVal); + } + + public static String[] readMinMaxRecordKeys(Path parquetFilePath) { + List minMaxKeys = readParquetFooter(parquetFilePath, + HoodieAvroWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER, + HoodieAvroWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER); + if (minMaxKeys.size() != 2) { + throw new HoodieException(String.format( + "Could not read min/max record key out of footer correctly from %s. 
read) : %s", + parquetFilePath, minMaxKeys)); + } + return new String[]{minMaxKeys.get(0), minMaxKeys.get(1)}; + } + + /** + * NOTE: This literally reads the entire file contents, thus should be used with caution. + */ + public static List readAvroRecords(Path filePath) { + ParquetReader reader = null; + List records = new ArrayList<>(); + try { + reader = AvroParquetReader.builder(filePath).build(); + Object obj = reader.read(); + while (obj != null) { + if (obj instanceof GenericRecord) { + records.add(((GenericRecord) obj)); + } + obj = reader.read(); + } + } catch (IOException e) { + throw new HoodieIOException("Failed to read avro records from Parquet " + filePath, e); + + } finally { + if (reader != null) { try { - // TODO(vc): Should we use the parallel reading version here? - footer = ParquetFileReader.readFooter(getFs().getConf(), parquetFilePath); + reader.close(); } catch (IOException e) { - throw new HoodieIOException("Failed to read footer for parquet " + parquetFilePath, - e); + // ignore } - return footer; - } - - - /** - * Get the schema of the given parquet file. - * - * @param parquetFilePath - * @return - */ - public static MessageType readSchema(Path parquetFilePath) { - return readMetadata(parquetFilePath).getFileMetaData().getSchema(); - } - - - private static List readParquetFooter(Path parquetFilePath, String... footerNames) { - List footerVals = new ArrayList<>(); - ParquetMetadata footer = readMetadata(parquetFilePath); - Map metadata = footer.getFileMetaData().getKeyValueMetaData(); - for (String footerName : footerNames) { - if (metadata.containsKey(footerName)) { - footerVals.add(metadata.get(footerName)); - } else { - throw new MetadataNotFoundException("Could not find index in Parquet footer. 
" + - "Looked for key " + footerName + " in " + parquetFilePath); - } - } - return footerVals; - } - - public static Schema readAvroSchema(Path parquetFilePath) { - return new AvroSchemaConverter().convert(readSchema(parquetFilePath)); - } - - /** - * Read out the bloom filter from the parquet file meta data. - */ - public static BloomFilter readBloomFilterFromParquetMetadata(Path parquetFilePath) { - String footerVal = readParquetFooter(parquetFilePath, - HoodieAvroWriteSupport.HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY).get(0); - return new BloomFilter(footerVal); - } - - public static String[] readMinMaxRecordKeys(Path parquetFilePath) { - List minMaxKeys = readParquetFooter(parquetFilePath, HoodieAvroWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER, - HoodieAvroWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER); - if (minMaxKeys.size() != 2) { - throw new HoodieException(String.format( - "Could not read min/max record key out of footer correctly from %s. read) : %s", - parquetFilePath, minMaxKeys)); - } - return new String[]{minMaxKeys.get(0), minMaxKeys.get(1)}; - } - - /** - * - * NOTE: This literally reads the entire file contents, thus should be used with caution. 
- * - * @param filePath - * @return - */ - public static List readAvroRecords(Path filePath) { - ParquetReader reader = null; - List records = new ArrayList<>(); - try { - reader = AvroParquetReader.builder(filePath).build(); - Object obj = reader.read(); - while (obj != null) { - if (obj instanceof GenericRecord) { - records.add(((GenericRecord) obj)); - } - obj = reader.read(); - } - } catch (IOException e) { - throw new HoodieIOException("Failed to read avro records from Parquet " + filePath, e); - - } finally { - if (reader != null) { - try { - reader.close(); - } catch (IOException e) { - // ignore - } - } - } - return records; + } } + return records; + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/util/ReflectionUtils.java b/hoodie-common/src/main/java/com/uber/hoodie/common/util/ReflectionUtils.java index c1a8e9062..629935ee9 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/util/ReflectionUtils.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/util/ReflectionUtils.java @@ -17,22 +17,18 @@ package com.uber.hoodie.common.util; import com.uber.hoodie.common.model.HoodieRecordPayload; - import com.uber.hoodie.exception.HoodieException; -import org.apache.avro.generic.GenericRecord; - -import java.io.IOException; import java.lang.reflect.InvocationTargetException; import java.util.HashMap; import java.util.Map; -import java.util.Optional; public class ReflectionUtils { + private static Map> clazzCache = new HashMap<>(); public static T loadClass(String fqcn) { try { - if(clazzCache.get(fqcn) == null) { + if (clazzCache.get(fqcn) == null) { Class clazz = Class.forName(fqcn); clazzCache.put(fqcn, clazz); } @@ -48,21 +44,17 @@ public class ReflectionUtils { /** * Instantiate a given class with a generic record payload - * - * @param recordPayloadClass - * @param payloadArgs - * @param - * @return */ public static T loadPayload(String recordPayloadClass, - Object [] payloadArgs, - Class ... 
constructorArgTypes) { + Object[] payloadArgs, + Class... constructorArgTypes) { try { - if(clazzCache.get(recordPayloadClass) == null) { + if (clazzCache.get(recordPayloadClass) == null) { Class clazz = Class.forName(recordPayloadClass); clazzCache.put(recordPayloadClass, clazz); } - return (T) clazzCache.get(recordPayloadClass).getConstructor(constructorArgTypes).newInstance(payloadArgs); + return (T) clazzCache.get(recordPayloadClass).getConstructor(constructorArgTypes) + .newInstance(payloadArgs); } catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException e) { throw new HoodieException("Unable to instantiate payload class ", e); } catch (ClassNotFoundException e) { diff --git a/hoodie-common/src/main/java/com/uber/hoodie/exception/DatasetNotFoundException.java b/hoodie-common/src/main/java/com/uber/hoodie/exception/DatasetNotFoundException.java index 476ebddde..5510dbe00 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/exception/DatasetNotFoundException.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/exception/DatasetNotFoundException.java @@ -16,43 +16,40 @@ package com.uber.hoodie.exception; +import java.io.IOException; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import java.io.IOException; - /** - *

- * Exception thrown to indicate that a hoodie dataset was not found on the path provided - *

+ *

Exception thrown to indicate that a hoodie dataset was not found on the path provided

*/ public class DatasetNotFoundException extends HoodieException { - public DatasetNotFoundException(String basePath) { - super(getErrorMessage(basePath)); - } - private static String getErrorMessage(String basePath) { - return "Hoodie dataset not found in path " + basePath; - } + public DatasetNotFoundException(String basePath) { + super(getErrorMessage(basePath)); + } - public static void checkValidDataset(FileSystem fs, Path basePathDir, Path metaPathDir) - throws DatasetNotFoundException { - // Check if the base path is found - try { - if (!fs.exists(basePathDir) || !fs.isDirectory(basePathDir)) { - throw new DatasetNotFoundException(basePathDir.toString()); - } - // Check if the meta path is found - if (!fs.exists(metaPathDir) || !fs.isDirectory(metaPathDir)) { - throw new DatasetNotFoundException(metaPathDir.toString()); - } - } catch (IllegalArgumentException e) { - // if the base path is file:///, then we have a IllegalArgumentException - throw new DatasetNotFoundException(metaPathDir.toString()); - } - catch (IOException e) { - throw new HoodieIOException( - "Could not check if dataset " + basePathDir + " is valid dataset", e); - } + private static String getErrorMessage(String basePath) { + return "Hoodie dataset not found in path " + basePath; + } + + public static void checkValidDataset(FileSystem fs, Path basePathDir, Path metaPathDir) + throws DatasetNotFoundException { + // Check if the base path is found + try { + if (!fs.exists(basePathDir) || !fs.isDirectory(basePathDir)) { + throw new DatasetNotFoundException(basePathDir.toString()); + } + // Check if the meta path is found + if (!fs.exists(metaPathDir) || !fs.isDirectory(metaPathDir)) { + throw new DatasetNotFoundException(metaPathDir.toString()); + } + } catch (IllegalArgumentException e) { + // if the base path is file:///, then we have a IllegalArgumentException + throw new DatasetNotFoundException(metaPathDir.toString()); + } catch (IOException e) { + throw new HoodieIOException( + "Could 
not check if dataset " + basePathDir + " is valid dataset", e); } + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/exception/HoodieException.java b/hoodie-common/src/main/java/com/uber/hoodie/exception/HoodieException.java index 4c933826e..b216b4710 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/exception/HoodieException.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/exception/HoodieException.java @@ -19,39 +19,34 @@ package com.uber.hoodie.exception; import java.io.Serializable; /** - *

- * Exception thrown for Hoodie failures. The root of - * the exception hierarchy. - *

- *

- * Hoodie Write/Read clients will throw this exception if - * any of its operations fail. This is a runtime (unchecked) exception. - *

- * + *

Exception thrown for Hoodie failures. The root of the exception hierarchy.

Hoodie + * Write/Read clients will throw this exception if any of its operations fail. This is a runtime + * (unchecked) exception.

*/ public class HoodieException extends RuntimeException implements Serializable { - public HoodieException() { - super(); - } - public HoodieException(String message) { - super(message); - } + public HoodieException() { + super(); + } - public HoodieException(String message, Throwable t) { - super(message, t); - } + public HoodieException(String message) { + super(message); + } - public HoodieException(Throwable t) { - super(t); - } + public HoodieException(String message, Throwable t) { + super(message, t); + } - protected static String format(String message, Object... args) { - String[] argStrings = new String[args.length]; - for (int i = 0; i < args.length; i += 1) { - argStrings[i] = String.valueOf(args[i]); - } - return String.format(String.valueOf(message), (Object[]) argStrings); + public HoodieException(Throwable t) { + super(t); + } + + protected static String format(String message, Object... args) { + String[] argStrings = new String[args.length]; + for (int i = 0; i < args.length; i += 1) { + argStrings[i] = String.valueOf(args[i]); } + return String.format(String.valueOf(message), (Object[]) argStrings); + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/exception/HoodieIOException.java b/hoodie-common/src/main/java/com/uber/hoodie/exception/HoodieIOException.java index 74f7ed164..7353de940 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/exception/HoodieIOException.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/exception/HoodieIOException.java @@ -19,23 +19,22 @@ package com.uber.hoodie.exception; import java.io.IOException; /** - *

- * Exception thrown for dataset IO-related failures. - *

+ *

Exception thrown for dataset IO-related failures.

*/ public class HoodieIOException extends HoodieException { - private IOException ioException; - public HoodieIOException(String msg, IOException t) { - super(msg, t); - this.ioException = t; - } + private IOException ioException; - public HoodieIOException(String msg) { - super(msg); - } + public HoodieIOException(String msg, IOException t) { + super(msg, t); + this.ioException = t; + } - public IOException getIOException() { - return ioException; - } + public HoodieIOException(String msg) { + super(msg); + } + + public IOException getIOException() { + return ioException; + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/exception/HoodieIndexException.java b/hoodie-common/src/main/java/com/uber/hoodie/exception/HoodieIndexException.java index 93da5b9d8..ae46893a7 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/exception/HoodieIndexException.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/exception/HoodieIndexException.java @@ -17,16 +17,15 @@ package com.uber.hoodie.exception; /** - *

- * Exception thrown for HoodieIndex related errors. - *

+ *

Exception thrown for HoodieIndex related errors.

*/ public class HoodieIndexException extends HoodieException { - public HoodieIndexException(String msg) { - super(msg); - } - public HoodieIndexException(String msg, Throwable e) { - super(msg, e); - } + public HoodieIndexException(String msg) { + super(msg); + } + + public HoodieIndexException(String msg, Throwable e) { + super(msg, e); + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/exception/HoodieNotSupportedException.java b/hoodie-common/src/main/java/com/uber/hoodie/exception/HoodieNotSupportedException.java index 2305df3ab..65cc63093 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/exception/HoodieNotSupportedException.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/exception/HoodieNotSupportedException.java @@ -17,7 +17,8 @@ package com.uber.hoodie.exception; public class HoodieNotSupportedException extends HoodieException { - public HoodieNotSupportedException(String errorMsg) { - super(errorMsg); - } + + public HoodieNotSupportedException(String errorMsg) { + super(errorMsg); + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/exception/HoodieRecordMissingException.java b/hoodie-common/src/main/java/com/uber/hoodie/exception/HoodieRecordMissingException.java index 72b1d29a2..0316e4d29 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/exception/HoodieRecordMissingException.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/exception/HoodieRecordMissingException.java @@ -19,17 +19,15 @@ package com.uber.hoodie.exception; import com.uber.hoodie.common.model.HoodieRecord; /** - *

- * Exception throws when indexing fails to locate the hoodie record. - * HoodieRecord current location and partition path does not match. - * This is an unrecoverable error - *

+ *

Exception throws when indexing fails to locate the hoodie record. HoodieRecord current + * location and partition path does not match. This is an unrecoverable error

*/ public class HoodieRecordMissingException extends HoodieException { - public HoodieRecordMissingException(HoodieRecord record) { - super( - "Record " + record.getRecordKey() + " with partition path " + record.getPartitionPath() - + " in current location " + record.getCurrentLocation() - + " is not found in the partition"); - } + + public HoodieRecordMissingException(HoodieRecord record) { + super( + "Record " + record.getRecordKey() + " with partition path " + record.getPartitionPath() + + " in current location " + record.getCurrentLocation() + + " is not found in the partition"); + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/exception/InvalidDatasetException.java b/hoodie-common/src/main/java/com/uber/hoodie/exception/InvalidDatasetException.java index e80ae306a..60fcbdf0a 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/exception/InvalidDatasetException.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/exception/InvalidDatasetException.java @@ -17,16 +17,15 @@ package com.uber.hoodie.exception; /** - *

- * Exception thrown to indicate that a hoodie dataset is invalid - *

+ *

Exception thrown to indicate that a hoodie dataset is invalid

*/ public class InvalidDatasetException extends HoodieException { - public InvalidDatasetException(String basePath) { - super(getErrorMessage(basePath)); - } - private static String getErrorMessage(String basePath) { - return "Invalid Hoodie Dataset. " + basePath; - } + public InvalidDatasetException(String basePath) { + super(getErrorMessage(basePath)); + } + + private static String getErrorMessage(String basePath) { + return "Invalid Hoodie Dataset. " + basePath; + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/exception/InvalidHoodiePathException.java b/hoodie-common/src/main/java/com/uber/hoodie/exception/InvalidHoodiePathException.java index 2d3f342b5..3ab4e5fdf 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/exception/InvalidHoodiePathException.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/exception/InvalidHoodiePathException.java @@ -19,7 +19,8 @@ package com.uber.hoodie.exception; import org.apache.hadoop.fs.Path; public class InvalidHoodiePathException extends HoodieException { - public InvalidHoodiePathException(Path path, String type) { - super("Invalid path " + path + " of type " + type); - } + + public InvalidHoodiePathException(Path path, String type) { + super("Invalid path " + path + " of type " + type); + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/exception/MetadataNotFoundException.java b/hoodie-common/src/main/java/com/uber/hoodie/exception/MetadataNotFoundException.java index 8be9ff401..19a461107 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/exception/MetadataNotFoundException.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/exception/MetadataNotFoundException.java @@ -22,11 +22,12 @@ package com.uber.hoodie.exception; * Thrown when expected metadata is not found */ public class MetadataNotFoundException extends HoodieException { - public MetadataNotFoundException(String msg) { - super(msg); - } - public MetadataNotFoundException(String msg, Throwable e) { - super(msg, e); - } + 
public MetadataNotFoundException(String msg) { + super(msg); + } + + public MetadataNotFoundException(String msg, Throwable e) { + super(msg, e); + } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/exception/SchemaCompatabilityException.java b/hoodie-common/src/main/java/com/uber/hoodie/exception/SchemaCompatabilityException.java index 773f4e53d..420d8c6d4 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/exception/SchemaCompatabilityException.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/exception/SchemaCompatabilityException.java @@ -17,15 +17,16 @@ package com.uber.hoodie.exception; public class SchemaCompatabilityException extends HoodieException { - public SchemaCompatabilityException(String message) { - super(message); - } - public SchemaCompatabilityException(String message, Throwable t) { - super(message, t); - } + public SchemaCompatabilityException(String message) { + super(message); + } - public SchemaCompatabilityException(Throwable t) { - super(t); - } + public SchemaCompatabilityException(String message, Throwable t) { + super(message, t); + } + + public SchemaCompatabilityException(Throwable t) { + super(t); + } } diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/TestBloomFilter.java b/hoodie-common/src/test/java/com/uber/hoodie/common/TestBloomFilter.java index 54f93c49d..6c024a897 100644 --- a/hoodie-common/src/test/java/com/uber/hoodie/common/TestBloomFilter.java +++ b/hoodie-common/src/test/java/com/uber/hoodie/common/TestBloomFilter.java @@ -16,30 +16,30 @@ package com.uber.hoodie.common; +import java.io.IOException; import org.junit.Test; -import java.io.*; - public class TestBloomFilter { - @Test - public void testAddKey() { - BloomFilter filter = new BloomFilter(100, 0.0000001); - filter.add("key1"); - assert (filter.mightContain("key1")); - } - @Test - public void testSerialize() throws IOException, ClassNotFoundException { - BloomFilter filter = new BloomFilter(1000, 0.0000001); - filter.add("key1"); 
- filter.add("key2"); - String filterStr = filter.serializeToString(); + @Test + public void testAddKey() { + BloomFilter filter = new BloomFilter(100, 0.0000001); + filter.add("key1"); + assert (filter.mightContain("key1")); + } - // Rebuild - BloomFilter newFilter = new BloomFilter(filterStr); - assert (newFilter.mightContain("key1")); - assert (newFilter.mightContain("key2")); - } + @Test + public void testSerialize() throws IOException, ClassNotFoundException { + BloomFilter filter = new BloomFilter(1000, 0.0000001); + filter.add("key1"); + filter.add("key2"); + String filterStr = filter.serializeToString(); + + // Rebuild + BloomFilter newFilter = new BloomFilter(filterStr); + assert (newFilter.mightContain("key1")); + assert (newFilter.mightContain("key2")); + } } diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/minicluster/HdfsTestService.java b/hoodie-common/src/test/java/com/uber/hoodie/common/minicluster/HdfsTestService.java index 8dd9a799e..0b3a6bdeb 100644 --- a/hoodie-common/src/test/java/com/uber/hoodie/common/minicluster/HdfsTestService.java +++ b/hoodie-common/src/test/java/com/uber/hoodie/common/minicluster/HdfsTestService.java @@ -19,6 +19,8 @@ package com.uber.hoodie.common.minicluster; import com.google.common.base.Preconditions; import com.google.common.io.Files; +import java.io.File; +import java.io.IOException; import org.apache.commons.io.FileUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; @@ -27,140 +29,129 @@ import org.apache.hadoop.hdfs.MiniDFSCluster; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.File; -import java.io.IOException; - /** * An HDFS minicluster service implementation. 
*/ public class HdfsTestService { - private static final Logger logger = LoggerFactory.getLogger(HdfsTestService.class); + private static final Logger logger = LoggerFactory.getLogger(HdfsTestService.class); - /** - * Configuration settings - */ - private Configuration hadoopConf; - private String workDir; - private String bindIP = "127.0.0.1"; - private int namenodeRpcPort = 8020; - private int namenodeHttpPort = 50070; - private int datanodePort = 50010; - private int datanodeIpcPort = 50020; - private int datanodeHttpPort = 50075; + /** + * Configuration settings + */ + private Configuration hadoopConf; + private String workDir; + private String bindIP = "127.0.0.1"; + private int namenodeRpcPort = 8020; + private int namenodeHttpPort = 50070; + private int datanodePort = 50010; + private int datanodeIpcPort = 50020; + private int datanodeHttpPort = 50075; - /** - * Embedded HDFS cluster - */ - private MiniDFSCluster miniDfsCluster; + /** + * Embedded HDFS cluster + */ + private MiniDFSCluster miniDfsCluster; - public HdfsTestService() { - hadoopConf = new Configuration(); - workDir = Files.createTempDir().getAbsolutePath(); + public HdfsTestService() { + hadoopConf = new Configuration(); + workDir = Files.createTempDir().getAbsolutePath(); + } + + public Configuration getHadoopConf() { + return hadoopConf; + } + + public MiniDFSCluster start(boolean format) throws IOException { + Preconditions + .checkState(workDir != null, "The work dir must be set before starting cluster."); + + if (hadoopConf == null) { + hadoopConf = new Configuration(); } - public Configuration getHadoopConf() { - return hadoopConf; + // If clean, then remove the work dir so we can start fresh. 
+ String localDFSLocation = getDFSLocation(workDir); + if (format) { + logger.info( + "Cleaning HDFS cluster data at: " + localDFSLocation + " and starting fresh."); + File file = new File(localDFSLocation); + FileUtils.deleteDirectory(file); } - public MiniDFSCluster start(boolean format) throws IOException { - Preconditions - .checkState(workDir != null, "The work dir must be set before starting cluster."); + // Configure and start the HDFS cluster + // boolean format = shouldFormatDFSCluster(localDFSLocation, clean); + hadoopConf = configureDFSCluster(hadoopConf, localDFSLocation, bindIP, namenodeRpcPort, + namenodeHttpPort, datanodePort, datanodeIpcPort, datanodeHttpPort); + miniDfsCluster = new MiniDFSCluster.Builder(hadoopConf).numDataNodes(1).format(format) + .checkDataNodeAddrConfig(true).checkDataNodeHostConfig(true).build(); + logger.info("HDFS Minicluster service started."); + return miniDfsCluster; + } - if (hadoopConf == null) { - hadoopConf = new Configuration(); - } + public void stop() throws IOException { + miniDfsCluster.shutdown(); + logger.info("HDFS Minicluster service shut down."); + miniDfsCluster = null; + hadoopConf = null; + } - // If clean, then remove the work dir so we can start fresh. - String localDFSLocation = getDFSLocation(workDir); - if (format) { - logger.info( - "Cleaning HDFS cluster data at: " + localDFSLocation + " and starting fresh."); - File file = new File(localDFSLocation); - FileUtils.deleteDirectory(file); - } + /** + * Get the location on the local FS where we store the HDFS data. + * + * @param baseFsLocation The base location on the local filesystem we have write access to create + * dirs. + * @return The location for HDFS data. 
+ */ + private static String getDFSLocation(String baseFsLocation) { + return baseFsLocation + Path.SEPARATOR + "dfs"; + } - // Configure and start the HDFS cluster - // boolean format = shouldFormatDFSCluster(localDFSLocation, clean); - hadoopConf = configureDFSCluster(hadoopConf, localDFSLocation, bindIP, namenodeRpcPort, - namenodeHttpPort, datanodePort, datanodeIpcPort, datanodeHttpPort); - miniDfsCluster = new MiniDFSCluster.Builder(hadoopConf).numDataNodes(1).format(format) - .checkDataNodeAddrConfig(true).checkDataNodeHostConfig(true).build(); - logger.info("HDFS Minicluster service started."); - return miniDfsCluster; + /** + * Returns true if we should format the DFS Cluster. We'll format if clean is true, or if the + * dfsFsLocation does not exist. + * + * @param localDFSLocation The location on the local FS to hold the HDFS metadata and block data + * @param clean Specifies if we want to start a clean cluster + * @return Returns true if we should format a DFSCluster, otherwise false + */ + private static boolean shouldFormatDFSCluster(String localDFSLocation, boolean clean) { + boolean format = true; + File f = new File(localDFSLocation); + if (f.exists() && f.isDirectory() && !clean) { + format = false; } + return format; + } - public void stop() throws IOException { - miniDfsCluster.shutdown(); - logger.info("HDFS Minicluster service shut down."); - miniDfsCluster = null; - hadoopConf = null; - } + /** + * Configure the DFS Cluster before launching it. + * + * @param config The already created Hadoop configuration we'll further configure for HDFS + * @param localDFSLocation The location on the local filesystem where cluster data is stored + * @param bindIP An IP address we want to force the datanode and namenode to bind to. + * @return The updated Configuration object. 
+ */ + private static Configuration configureDFSCluster(Configuration config, String localDFSLocation, + String bindIP, int namenodeRpcPort, int namenodeHttpPort, int datanodePort, + int datanodeIpcPort, int datanodeHttpPort) { - /** - * Get the location on the local FS where we store the HDFS data. - * - * @param baseFsLocation The base location on the local filesystem we have write access to - * create dirs. - * @return The location for HDFS data. - */ - private static String getDFSLocation(String baseFsLocation) { - return baseFsLocation + Path.SEPARATOR + "dfs"; - } - - /** - * Returns true if we should format the DFS Cluster. We'll format if clean is - * true, or if the dfsFsLocation does not exist. - * - * @param localDFSLocation The location on the local FS to hold the HDFS metadata and block - * data - * @param clean Specifies if we want to start a clean cluster - * @return Returns true if we should format a DFSCluster, otherwise false - */ - private static boolean shouldFormatDFSCluster(String localDFSLocation, boolean clean) { - boolean format = true; - File f = new File(localDFSLocation); - if (f.exists() && f.isDirectory() && !clean) { - format = false; - } - return format; - } - - /** - * Configure the DFS Cluster before launching it. - * - * @param config The already created Hadoop configuration we'll further configure - * for HDFS - * @param localDFSLocation The location on the local filesystem where cluster data is stored - * @param bindIP An IP address we want to force the datanode and namenode to bind - * to. - * @param namenodeRpcPort - * @param namenodeHttpPort - * @param datanodePort - * @param datanodeIpcPort - * @param datanodeHttpPort - * @return The updated Configuration object. 
- */ - private static Configuration configureDFSCluster(Configuration config, String localDFSLocation, - String bindIP, int namenodeRpcPort, int namenodeHttpPort, int datanodePort, - int datanodeIpcPort, int datanodeHttpPort) { - - logger.info("HDFS force binding to ip: " + bindIP); - config.set(DFSConfigKeys.FS_DEFAULT_NAME_KEY, "hdfs://" + bindIP + ":" + namenodeRpcPort); - config.set(DFSConfigKeys.DFS_DATANODE_ADDRESS_KEY, bindIP + ":" + datanodePort); - config.set(DFSConfigKeys.DFS_DATANODE_IPC_ADDRESS_KEY, bindIP + ":" + datanodeIpcPort); - config.set(DFSConfigKeys.DFS_DATANODE_HTTP_ADDRESS_KEY, bindIP + ":" + datanodeHttpPort); - // When a datanode registers with the namenode, the Namenode do a hostname - // check of the datanode which will fail on OpenShift due to reverse DNS - // issues with the internal IP addresses. This config disables that check, - // and will allow a datanode to connect regardless. - config.setBoolean("dfs.namenode.datanode.registration.ip-hostname-check", false); - config.set("hdfs.minidfs.basedir", localDFSLocation); - // allow current user to impersonate others - String user = System.getProperty("user.name"); - config.set("hadoop.proxyuser." + user + ".groups", "*"); - config.set("hadoop.proxyuser." + user + ".hosts", "*"); - return config; - } + logger.info("HDFS force binding to ip: " + bindIP); + config.set(DFSConfigKeys.FS_DEFAULT_NAME_KEY, "hdfs://" + bindIP + ":" + namenodeRpcPort); + config.set(DFSConfigKeys.DFS_DATANODE_ADDRESS_KEY, bindIP + ":" + datanodePort); + config.set(DFSConfigKeys.DFS_DATANODE_IPC_ADDRESS_KEY, bindIP + ":" + datanodeIpcPort); + config.set(DFSConfigKeys.DFS_DATANODE_HTTP_ADDRESS_KEY, bindIP + ":" + datanodeHttpPort); + // When a datanode registers with the namenode, the Namenode do a hostname + // check of the datanode which will fail on OpenShift due to reverse DNS + // issues with the internal IP addresses. This config disables that check, + // and will allow a datanode to connect regardless. 
+ config.setBoolean("dfs.namenode.datanode.registration.ip-hostname-check", false); + config.set("hdfs.minidfs.basedir", localDFSLocation); + // allow current user to impersonate others + String user = System.getProperty("user.name"); + config.set("hadoop.proxyuser." + user + ".groups", "*"); + config.set("hadoop.proxyuser." + user + ".hosts", "*"); + return config; + } } diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/minicluster/MiniClusterUtil.java b/hoodie-common/src/test/java/com/uber/hoodie/common/minicluster/MiniClusterUtil.java index 11f9a45dd..1633e3d48 100644 --- a/hoodie-common/src/test/java/com/uber/hoodie/common/minicluster/MiniClusterUtil.java +++ b/hoodie-common/src/test/java/com/uber/hoodie/common/minicluster/MiniClusterUtil.java @@ -16,38 +16,38 @@ package com.uber.hoodie.common.minicluster; +import java.io.IOException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.hdfs.MiniDFSCluster; import org.apache.zookeeper.server.ZooKeeperServer; -import java.io.IOException; - public class MiniClusterUtil { - private static MiniDFSCluster dfsCluster; - private static ZooKeeperServer zkServer; - public static Configuration configuration; - public static FileSystem fileSystem; - public static void setUp() throws IOException, InterruptedException { - if (dfsCluster == null) { - HdfsTestService service = new HdfsTestService(); - dfsCluster = service.start(true); - configuration = service.getHadoopConf(); - } - if (zkServer == null) { - ZookeeperTestService zkService = new ZookeeperTestService(configuration); - zkServer = zkService.start(); - } - fileSystem = FileSystem.get(configuration); - } + private static MiniDFSCluster dfsCluster; + private static ZooKeeperServer zkServer; + public static Configuration configuration; + public static FileSystem fileSystem; - public static void shutdown() { - if (dfsCluster != null) { - dfsCluster.shutdown(); - } - if (zkServer != null) { - 
zkServer.shutdown(); - } + public static void setUp() throws IOException, InterruptedException { + if (dfsCluster == null) { + HdfsTestService service = new HdfsTestService(); + dfsCluster = service.start(true); + configuration = service.getHadoopConf(); } + if (zkServer == null) { + ZookeeperTestService zkService = new ZookeeperTestService(configuration); + zkServer = zkService.start(); + } + fileSystem = FileSystem.get(configuration); + } + + public static void shutdown() { + if (dfsCluster != null) { + dfsCluster.shutdown(); + } + if (zkServer != null) { + zkServer.shutdown(); + } + } } diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/minicluster/ZookeeperTestService.java b/hoodie-common/src/test/java/com/uber/hoodie/common/minicluster/ZookeeperTestService.java index 4ed32f114..290064e19 100644 --- a/hoodie-common/src/test/java/com/uber/hoodie/common/minicluster/ZookeeperTestService.java +++ b/hoodie-common/src/test/java/com/uber/hoodie/common/minicluster/ZookeeperTestService.java @@ -18,14 +18,6 @@ package com.uber.hoodie.common.minicluster; import com.google.common.base.Preconditions; import com.google.common.io.Files; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileUtil; -import org.apache.zookeeper.server.NIOServerCnxnFactory; -import org.apache.zookeeper.server.ZooKeeperServer; -import org.apache.zookeeper.server.persistence.FileTxnLog; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import java.io.BufferedReader; import java.io.File; import java.io.IOException; @@ -34,208 +26,212 @@ import java.io.OutputStream; import java.io.Reader; import java.net.InetSocketAddress; import java.net.Socket; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileUtil; +import org.apache.zookeeper.server.NIOServerCnxnFactory; +import org.apache.zookeeper.server.ZooKeeperServer; +import org.apache.zookeeper.server.persistence.FileTxnLog; +import org.slf4j.Logger; +import 
org.slf4j.LoggerFactory; /** * A Zookeeper minicluster service implementation. *

- * This class was ripped from MiniZooKeeperCluster from the HBase tests. Changes - * made include: + * This class was ripped from MiniZooKeeperCluster from the HBase tests. Changes made include: *

* 1. It will now only launch 1 zookeeper server. *

- * 2. It will only attempt to bind to the port specified, and will fail if it - * can't. + * 2. It will only attempt to bind to the port specified, and will fail if it can't. *

- * 3. The startup method now takes a bindAddress, which allows us to configure - * which IP the ZK server binds to. This was not configurable in the original - * class. + * 3. The startup method now takes a bindAddress, which allows us to configure which IP the ZK + * server binds to. This was not configurable in the original class. *

- * 4. The ZK cluster will re-use a data dir on the local filesystem if it - * already exists instead of blowing it away. + * 4. The ZK cluster will re-use a data dir on the local filesystem if it already exists instead of + * blowing it away. */ public class ZookeeperTestService { - private static final Logger logger = LoggerFactory.getLogger(ZookeeperTestService.class); + private static final Logger logger = LoggerFactory.getLogger(ZookeeperTestService.class); - private static final int TICK_TIME = 2000; - private static final int CONNECTION_TIMEOUT = 30000; + private static final int TICK_TIME = 2000; + private static final int CONNECTION_TIMEOUT = 30000; - /** - * Configuration settings - */ - private Configuration hadoopConf; - private String workDir; - private Integer clientPort = 2828; - private String bindIP = "127.0.0.1"; - private Boolean clean = false; - private int tickTime = 0; + /** + * Configuration settings + */ + private Configuration hadoopConf; + private String workDir; + private Integer clientPort = 2828; + private String bindIP = "127.0.0.1"; + private Boolean clean = false; + private int tickTime = 0; - /** - * Embedded ZooKeeper cluster - */ - private NIOServerCnxnFactory standaloneServerFactory; - private ZooKeeperServer zooKeeperServer; - private boolean started = false; + /** + * Embedded ZooKeeper cluster + */ + private NIOServerCnxnFactory standaloneServerFactory; + private ZooKeeperServer zooKeeperServer; + private boolean started = false; - public ZookeeperTestService(Configuration config) { - this.workDir = Files.createTempDir().getAbsolutePath(); - this.hadoopConf = config; + public ZookeeperTestService(Configuration config) { + this.workDir = Files.createTempDir().getAbsolutePath(); + this.hadoopConf = config; + } + + public Configuration getHadoopConf() { + return hadoopConf; + } + + public ZooKeeperServer start() throws IOException, InterruptedException { + Preconditions.checkState(workDir != null, + "The localBaseFsLocation must 
be set before starting cluster."); + + setupTestEnv(); + stop(); + + File dir = new File(workDir, "zookeeper").getAbsoluteFile(); + recreateDir(dir, clean); + int tickTimeToUse; + if (this.tickTime > 0) { + tickTimeToUse = this.tickTime; + } else { + tickTimeToUse = TICK_TIME; + } + this.zooKeeperServer = new ZooKeeperServer(dir, dir, tickTimeToUse); + standaloneServerFactory = new NIOServerCnxnFactory(); + + // NOTE: Changed from the original, where InetSocketAddress was + // originally created to bind to the wildcard IP, we now configure it. + logger.info("Zookeeper force binding to: " + this.bindIP); + standaloneServerFactory.configure(new InetSocketAddress(bindIP, clientPort), 1000); + + // Start up this ZK server + standaloneServerFactory.startup(zooKeeperServer); + + String serverHostname; + if (bindIP.equals("0.0.0.0")) { + serverHostname = "localhost"; + } else { + serverHostname = bindIP; + } + if (!waitForServerUp(serverHostname, clientPort, CONNECTION_TIMEOUT)) { + throw new IOException("Waiting for startup of standalone server"); } - public Configuration getHadoopConf() { - return hadoopConf; + started = true; + logger.info("Zookeeper Minicluster service started on client port: " + clientPort); + return zooKeeperServer; + } + + public void stop() throws IOException { + if (!started) { + return; } - public ZooKeeperServer start() throws IOException, InterruptedException { - Preconditions.checkState(workDir != null, - "The localBaseFsLocation must be set before starting cluster."); - - setupTestEnv(); - stop(); - - File dir = new File(workDir, "zookeeper").getAbsoluteFile(); - recreateDir(dir, clean); - int tickTimeToUse; - if (this.tickTime > 0) { - tickTimeToUse = this.tickTime; - } else { - tickTimeToUse = TICK_TIME; - } - this.zooKeeperServer = new ZooKeeperServer(dir, dir, tickTimeToUse); - standaloneServerFactory = new NIOServerCnxnFactory(); - - // NOTE: Changed from the original, where InetSocketAddress was - // originally created to bind to the 
wildcard IP, we now configure it. - logger.info("Zookeeper force binding to: " + this.bindIP); - standaloneServerFactory.configure(new InetSocketAddress(bindIP, clientPort), 1000); - - // Start up this ZK server - standaloneServerFactory.startup(zooKeeperServer); - - String serverHostname; - if (bindIP.equals("0.0.0.0")) { - serverHostname = "localhost"; - } else { - serverHostname = bindIP; - } - if (!waitForServerUp(serverHostname, clientPort, CONNECTION_TIMEOUT)) { - throw new IOException("Waiting for startup of standalone server"); - } - - started = true; - logger.info("Zookeeper Minicluster service started on client port: " + clientPort); - return zooKeeperServer; + standaloneServerFactory.shutdown(); + if (!waitForServerDown(clientPort, CONNECTION_TIMEOUT)) { + throw new IOException("Waiting for shutdown of standalone server"); } - public void stop() throws IOException { - if (!started) { - return; - } + // clear everything + started = false; + standaloneServerFactory = null; + zooKeeperServer = null; - standaloneServerFactory.shutdown(); - if (!waitForServerDown(clientPort, CONNECTION_TIMEOUT)) { - throw new IOException("Waiting for shutdown of standalone server"); - } + logger.info("Zookeeper Minicluster service shut down."); + } - // clear everything - started = false; - standaloneServerFactory = null; - zooKeeperServer = null; - - logger.info("Zookeeper Minicluster service shut down."); + private void recreateDir(File dir, boolean clean) throws IOException { + if (dir.exists() && clean) { + FileUtil.fullyDelete(dir); + } else if (dir.exists() && !clean) { + // the directory's exist, and we don't want to clean, so exit + return; } + try { + dir.mkdirs(); + } catch (SecurityException e) { + throw new IOException("creating dir: " + dir, e); + } + } - private void recreateDir(File dir, boolean clean) throws IOException { - if (dir.exists() && clean) { - FileUtil.fullyDelete(dir); - } else if (dir.exists() && !clean) { - // the directory's exist, and we don't 
want to clean, so exit - return; - } + // / XXX: From o.a.zk.t.ClientBase + private static void setupTestEnv() { + // during the tests we run with 100K prealloc in the logs. + // on windows systems prealloc of 64M was seen to take ~15seconds + // resulting in test failure (client timeout on first session). + // set env and directly in order to handle static init/gc issues + System.setProperty("zookeeper.preAllocSize", "100"); + FileTxnLog.setPreallocSize(100 * 1024); + } + + // XXX: From o.a.zk.t.ClientBase + private static boolean waitForServerDown(int port, long timeout) { + long start = System.currentTimeMillis(); + while (true) { + try { + Socket sock = new Socket("localhost", port); try { - dir.mkdirs(); - } catch (SecurityException e) { - throw new IOException("creating dir: " + dir, e); + OutputStream outstream = sock.getOutputStream(); + outstream.write("stat".getBytes()); + outstream.flush(); + } finally { + sock.close(); } + } catch (IOException e) { + return true; + } + + if (System.currentTimeMillis() > start + timeout) { + break; + } + try { + Thread.sleep(250); + } catch (InterruptedException e) { + // ignore + } } + return false; + } - // / XXX: From o.a.zk.t.ClientBase - private static void setupTestEnv() { - // during the tests we run with 100K prealloc in the logs. - // on windows systems prealloc of 64M was seen to take ~15seconds - // resulting in test failure (client timeout on first session). 
- // set env and directly in order to handle static init/gc issues - System.setProperty("zookeeper.preAllocSize", "100"); - FileTxnLog.setPreallocSize(100 * 1024); - } + // XXX: From o.a.zk.t.ClientBase + private static boolean waitForServerUp(String hostname, int port, long timeout) { + long start = System.currentTimeMillis(); + while (true) { + try { + Socket sock = new Socket(hostname, port); + BufferedReader reader = null; + try { + OutputStream outstream = sock.getOutputStream(); + outstream.write("stat".getBytes()); + outstream.flush(); - // XXX: From o.a.zk.t.ClientBase - private static boolean waitForServerDown(int port, long timeout) { - long start = System.currentTimeMillis(); - while (true) { - try { - Socket sock = new Socket("localhost", port); - try { - OutputStream outstream = sock.getOutputStream(); - outstream.write("stat".getBytes()); - outstream.flush(); - } finally { - sock.close(); - } - } catch (IOException e) { - return true; - } - - if (System.currentTimeMillis() > start + timeout) { - break; - } - try { - Thread.sleep(250); - } catch (InterruptedException e) { - // ignore - } + Reader isr = new InputStreamReader(sock.getInputStream()); + reader = new BufferedReader(isr); + String line = reader.readLine(); + if (line != null && line.startsWith("Zookeeper version:")) { + return true; + } + } finally { + sock.close(); + if (reader != null) { + reader.close(); + } } - return false; - } - - // XXX: From o.a.zk.t.ClientBase - private static boolean waitForServerUp(String hostname, int port, long timeout) { - long start = System.currentTimeMillis(); - while (true) { - try { - Socket sock = new Socket(hostname, port); - BufferedReader reader = null; - try { - OutputStream outstream = sock.getOutputStream(); - outstream.write("stat".getBytes()); - outstream.flush(); - - Reader isr = new InputStreamReader(sock.getInputStream()); - reader = new BufferedReader(isr); - String line = reader.readLine(); - if (line != null && line.startsWith("Zookeeper 
version:")) { - return true; - } - } finally { - sock.close(); - if (reader != null) { - reader.close(); - } - } - } catch (IOException e) { - // ignore as this is expected - logger.info("server " + hostname + ":" + port + " not up " + e); - } - - if (System.currentTimeMillis() > start + timeout) { - break; - } - try { - Thread.sleep(250); - } catch (InterruptedException e) { - // ignore - } - } - return false; + } catch (IOException e) { + // ignore as this is expected + logger.info("server " + hostname + ":" + port + " not up " + e); + } + + if (System.currentTimeMillis() > start + timeout) { + break; + } + try { + Thread.sleep(250); + } catch (InterruptedException e) { + // ignore + } } + return false; + } } diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/model/HoodieTestUtils.java b/hoodie-common/src/test/java/com/uber/hoodie/common/model/HoodieTestUtils.java index b316691ce..1ccca51b2 100644 --- a/hoodie-common/src/test/java/com/uber/hoodie/common/model/HoodieTestUtils.java +++ b/hoodie-common/src/test/java/com/uber/hoodie/common/model/HoodieTestUtils.java @@ -16,6 +16,9 @@ package com.uber.hoodie.common.model; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; + import com.esotericsoftware.kryo.Kryo; import com.esotericsoftware.kryo.io.Input; import com.esotericsoftware.kryo.io.Output; @@ -34,18 +37,6 @@ import com.uber.hoodie.common.table.log.block.HoodieLogBlock; import com.uber.hoodie.common.util.AvroUtils; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.common.util.HoodieAvroUtils; -import org.apache.avro.Schema; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.generic.IndexedRecord; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.LocatedFileStatus; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.RemoteIterator; -import 
org.apache.hadoop.util.StringUtils; -import org.junit.rules.TemporaryFolder; - import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.File; @@ -64,224 +55,262 @@ import java.util.Random; import java.util.UUID; import java.util.stream.Collectors; import java.util.stream.Stream; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.fail; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; +import org.apache.hadoop.util.StringUtils; +import org.junit.rules.TemporaryFolder; public class HoodieTestUtils { - public static FileSystem fs = FSUtils.getFs(); - public static final String TEST_EXTENSION = ".test"; - public static final String RAW_TRIPS_TEST_NAME = "raw_trips"; - public static final int DEFAULT_TASK_PARTITIONID = 1; - public static final String[] DEFAULT_PARTITION_PATHS = {"2016/03/15", "2015/03/16", "2015/03/17"}; - private static Random rand = new Random(46474747); - public static void resetFS() { - HoodieTestUtils.fs = FSUtils.getFs(); + public static FileSystem fs = FSUtils.getFs(); + public static final String TEST_EXTENSION = ".test"; + public static final String RAW_TRIPS_TEST_NAME = "raw_trips"; + public static final int DEFAULT_TASK_PARTITIONID = 1; + public static final String[] DEFAULT_PARTITION_PATHS = {"2016/03/15", "2015/03/16", "2015/03/17"}; + private static Random rand = new Random(46474747); + + public static void resetFS() { + HoodieTestUtils.fs = FSUtils.getFs(); + } + + public static HoodieTableMetaClient init(String basePath) throws IOException { + return initTableType(basePath, HoodieTableType.COPY_ON_WRITE); + } + + public static HoodieTableMetaClient 
initTableType(String basePath, HoodieTableType tableType) + throws IOException { + Properties properties = new Properties(); + properties.setProperty(HoodieTableConfig.HOODIE_TABLE_NAME_PROP_NAME, RAW_TRIPS_TEST_NAME); + properties.setProperty(HoodieTableConfig.HOODIE_TABLE_TYPE_PROP_NAME, tableType.name()); + properties.setProperty(HoodieTableConfig.HOODIE_PAYLOAD_CLASS_PROP_NAME, + HoodieAvroPayload.class.getName()); + return HoodieTableMetaClient.initializePathAsHoodieDataset(fs, basePath, properties); + } + + public static HoodieTableMetaClient initOnTemp() throws IOException { + // Create a temp folder as the base path + TemporaryFolder folder = new TemporaryFolder(); + folder.create(); + String basePath = folder.getRoot().getAbsolutePath(); + return HoodieTestUtils.init(basePath); + } + + public static String makeNewCommitTime() { + return new SimpleDateFormat("yyyyMMddHHmmss").format(new Date()); + } + + public static final void createCommitFiles(String basePath, String... commitTimes) + throws IOException { + for (String commitTime : commitTimes) { + new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline + .makeCommitFileName(commitTime)).createNewFile(); } + } - public static HoodieTableMetaClient init(String basePath) throws IOException { - return initTableType(basePath, HoodieTableType.COPY_ON_WRITE); + public static final void createInflightCommitFiles(String basePath, String... 
commitTimes) + throws IOException { + for (String commitTime : commitTimes) { + new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline + .makeInflightCommitFileName(commitTime)).createNewFile(); } + } - public static HoodieTableMetaClient initTableType(String basePath, HoodieTableType tableType) throws IOException { - Properties properties = new Properties(); - properties.setProperty(HoodieTableConfig.HOODIE_TABLE_NAME_PROP_NAME, RAW_TRIPS_TEST_NAME); - properties.setProperty(HoodieTableConfig.HOODIE_TABLE_TYPE_PROP_NAME, tableType.name()); - properties.setProperty(HoodieTableConfig.HOODIE_PAYLOAD_CLASS_PROP_NAME, HoodieAvroPayload.class.getName()); - return HoodieTableMetaClient.initializePathAsHoodieDataset(fs, basePath, properties); + public static final String createNewDataFile(String basePath, String partitionPath, + String commitTime) throws IOException { + String fileID = UUID.randomUUID().toString(); + return createDataFile(basePath, partitionPath, commitTime, fileID); + } + + public static final String createDataFile(String basePath, String partitionPath, + String commitTime, String fileID) throws IOException { + String folderPath = basePath + "/" + partitionPath + "/"; + new File(folderPath).mkdirs(); + new File(folderPath + FSUtils.makeDataFileName(commitTime, DEFAULT_TASK_PARTITIONID, fileID)) + .createNewFile(); + return fileID; + } + + public static final String createNewLogFile(String basePath, String partitionPath, + String commitTime, String fileID, Optional version) throws IOException { + String folderPath = basePath + "/" + partitionPath + "/"; + boolean makeDir = fs.mkdirs(new Path(folderPath)); + if (!makeDir) { + throw new IOException("cannot create directory for path " + folderPath); } - - public static HoodieTableMetaClient initOnTemp() throws IOException { - // Create a temp folder as the base path - TemporaryFolder folder = new TemporaryFolder(); - folder.create(); - String basePath = 
folder.getRoot().getAbsolutePath(); - return HoodieTestUtils.init(basePath); + boolean createFile = fs.createNewFile(new Path(folderPath + FSUtils + .makeLogFileName(fileID, ".log", commitTime, version.orElse(DEFAULT_TASK_PARTITIONID)))); + if (!createFile) { + throw new IOException(StringUtils + .format("cannot create data file for commit %s and fileId %s", commitTime, fileID)); } + return fileID; + } - public static String makeNewCommitTime() { - return new SimpleDateFormat("yyyyMMddHHmmss").format(new Date()); + public static final void createCompactionCommitFiles(String basePath, String... commitTimes) + throws IOException { + for (String commitTime : commitTimes) { + boolean createFile = fs.createNewFile(new Path( + basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline + .makeCompactionFileName(commitTime))); + if (!createFile) { + throw new IOException("cannot create commit file for commit " + commitTime); + } } + } - public static final void createCommitFiles(String basePath, String... 
commitTimes) throws IOException { - for (String commitTime: commitTimes) { - new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME+ "/" + HoodieTimeline.makeCommitFileName(commitTime)).createNewFile(); - } + public static final String getDataFilePath(String basePath, String partitionPath, + String commitTime, String fileID) throws IOException { + return basePath + "/" + partitionPath + "/" + FSUtils + .makeDataFileName(commitTime, DEFAULT_TASK_PARTITIONID, fileID); + } + + public static final String getLogFilePath(String basePath, String partitionPath, + String commitTime, String fileID, Optional version) throws IOException { + return basePath + "/" + partitionPath + "/" + FSUtils + .makeLogFileName(fileID, ".log", commitTime, version.orElse(DEFAULT_TASK_PARTITIONID)); + } + + public static final String getCommitFilePath(String basePath, String commitTime) + throws IOException { + return basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitTime + + HoodieTimeline.COMMIT_EXTENSION; + } + + public static final boolean doesDataFileExist(String basePath, String partitionPath, + String commitTime, String fileID) throws IOException { + return new File(getDataFilePath(basePath, partitionPath, commitTime, fileID)).exists(); + } + + public static final boolean doesLogFileExist(String basePath, String partitionPath, + String commitTime, String fileID, Optional version) throws IOException { + return new File(getLogFilePath(basePath, partitionPath, commitTime, fileID, version)).exists(); + } + + public static final boolean doesCommitExist(String basePath, String commitTime) { + return new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitTime + + HoodieTimeline.COMMIT_EXTENSION).exists(); + } + + public static final boolean doesInflightExist(String basePath, String commitTime) { + return new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitTime + + HoodieTimeline.INFLIGHT_EXTENSION).exists(); + } + + 
public static String makeInflightTestFileName(String instant) { + return instant + TEST_EXTENSION + HoodieTimeline.INFLIGHT_EXTENSION; + } + + public static void createCleanFiles(String basePath, String commitTime) throws IOException { + Path commitFile = + new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline + .makeCleanerFileName(commitTime)); + FileSystem fs = FSUtils.getFs(); + FSDataOutputStream os = fs.create(commitFile, true); + try { + HoodieCleanStat cleanStats = new HoodieCleanStat( + HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS, + DEFAULT_PARTITION_PATHS[rand.nextInt(DEFAULT_PARTITION_PATHS.length)], + new ArrayList<>(), new ArrayList<>(), + new ArrayList<>(), commitTime); + // Create the clean metadata + HoodieCleanMetadata cleanMetadata = + AvroUtils.convertCleanMetadata(commitTime, Optional.of(0L), Arrays.asList(cleanStats)); + // Write empty clean metadata + os.write(AvroUtils.serializeCleanMetadata(cleanMetadata).get()); + } finally { + os.close(); } + } - public static final void createInflightCommitFiles(String basePath, String... 
commitTimes) throws IOException { - for (String commitTime: commitTimes) { - new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME+ "/" + HoodieTimeline.makeInflightCommitFileName(commitTime)).createNewFile(); - } + public static String makeTestFileName(String instant) { + return instant + TEST_EXTENSION; + } + + public static String makeCommitFileName(String instant) { + return instant + ".commit"; + } + + public static void assertStreamEquals(String message, Stream expected, Stream actual) { + Iterator iter1 = expected.iterator(), iter2 = actual.iterator(); + while (iter1.hasNext() && iter2.hasNext()) { + assertEquals(message, iter1.next(), iter2.next()); } + assert !iter1.hasNext() && !iter2.hasNext(); + } - public static final String createNewDataFile(String basePath, String partitionPath, String commitTime) throws IOException { - String fileID = UUID.randomUUID().toString(); - return createDataFile(basePath, partitionPath, commitTime, fileID); - } - - public static final String createDataFile(String basePath, String partitionPath, String commitTime, String fileID) throws IOException { - String folderPath = basePath + "/" + partitionPath + "/"; - new File(folderPath).mkdirs(); - new File(folderPath + FSUtils.makeDataFileName(commitTime, DEFAULT_TASK_PARTITIONID, fileID)).createNewFile(); - return fileID; - } - - public static final String createNewLogFile(String basePath, String partitionPath, String commitTime, String fileID, Optional version) throws IOException { - String folderPath = basePath + "/" + partitionPath + "/"; - boolean makeDir = fs.mkdirs(new Path(folderPath)); - if(!makeDir) { - throw new IOException("cannot create directory for path " + folderPath); - } - boolean createFile = fs.createNewFile(new Path(folderPath + FSUtils.makeLogFileName(fileID, ".log",commitTime, version.orElse(DEFAULT_TASK_PARTITIONID)))); - if(!createFile) { - throw new IOException(StringUtils.format("cannot create data file for commit %s and fileId %s", 
commitTime, fileID)); - } - return fileID; - } - - public static final void createCompactionCommitFiles(String basePath, String... commitTimes) throws IOException { - for (String commitTime: commitTimes) { - boolean createFile = fs.createNewFile(new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME+ "/" + HoodieTimeline.makeCompactionFileName(commitTime))); - if(!createFile) { - throw new IOException("cannot create commit file for commit " + commitTime); - } - } - } - - public static final String getDataFilePath(String basePath, String partitionPath, String commitTime, String fileID) throws IOException { - return basePath + "/" + partitionPath + "/" + FSUtils.makeDataFileName(commitTime, DEFAULT_TASK_PARTITIONID, fileID); - } - - public static final String getLogFilePath(String basePath, String partitionPath, String commitTime, String fileID, Optional version) throws IOException { - return basePath + "/" + partitionPath + "/" + FSUtils.makeLogFileName(fileID, ".log", commitTime, version.orElse(DEFAULT_TASK_PARTITIONID)); - } - - public static final String getCommitFilePath(String basePath, String commitTime) throws IOException { - return basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitTime + HoodieTimeline.COMMIT_EXTENSION; - } - - public static final boolean doesDataFileExist(String basePath, String partitionPath, String commitTime, String fileID) throws IOException { - return new File(getDataFilePath(basePath, partitionPath, commitTime, fileID)).exists(); - } - - public static final boolean doesLogFileExist(String basePath, String partitionPath, String commitTime, String fileID, Optional version) throws IOException { - return new File(getLogFilePath(basePath, partitionPath, commitTime, fileID, version)).exists(); - } - - public static final boolean doesCommitExist(String basePath, String commitTime) { - return new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME+ "/" + commitTime + 
HoodieTimeline.COMMIT_EXTENSION).exists(); - } - - public static final boolean doesInflightExist(String basePath, String commitTime) { - return new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME+ "/" + commitTime + HoodieTimeline.INFLIGHT_EXTENSION).exists(); - } - - public static String makeInflightTestFileName(String instant) { - return instant + TEST_EXTENSION + HoodieTimeline.INFLIGHT_EXTENSION; - } - - public static void createCleanFiles(String basePath, String commitTime) throws IOException { - Path commitFile = - new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline.makeCleanerFileName(commitTime)); - FileSystem fs = FSUtils.getFs(); - FSDataOutputStream os = fs.create(commitFile, true); - try { - HoodieCleanStat cleanStats = new HoodieCleanStat(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS, - DEFAULT_PARTITION_PATHS[rand.nextInt(DEFAULT_PARTITION_PATHS.length)], - new ArrayList<>(), new ArrayList<>(), - new ArrayList<>(), commitTime); - // Create the clean metadata - HoodieCleanMetadata cleanMetadata = - AvroUtils.convertCleanMetadata(commitTime, Optional.of(0L), Arrays.asList(cleanStats)); - // Write empty clean metadata - os.write(AvroUtils.serializeCleanMetadata(cleanMetadata).get()); - } finally { - os.close(); - } - } - - public static String makeTestFileName(String instant) { - return instant + TEST_EXTENSION; - } - - public static String makeCommitFileName(String instant) { - return instant + ".commit"; - } - - public static void assertStreamEquals(String message, Stream expected, Stream actual) { - Iterator iter1 = expected.iterator(), iter2 = actual.iterator(); - while(iter1.hasNext() && iter2.hasNext()) - assertEquals(message, iter1.next(), iter2.next()); - assert !iter1.hasNext() && !iter2.hasNext(); - } - - public static T serializeDeserialize(T object, Class clazz) - throws IOException, ClassNotFoundException { - // Using Kyro as the default serializer in Spark Jobs - Kryo kryo = new Kryo(); - 
kryo.register(HoodieTableMetaClient.class, new JavaSerializer()); - - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - Output output = new Output(baos); - kryo.writeObject(output, object); - output.close(); - - Input input = new Input(new ByteArrayInputStream(baos.toByteArray())); - T deseralizedObject = kryo.readObject(input, clazz); - input.close(); - return deseralizedObject; - } - - public static void writeRecordsToLogFiles(String basePath, Schema schema, List updatedRecords) { - Map> groupedUpdated = updatedRecords.stream() - .collect(Collectors.groupingBy(HoodieRecord::getCurrentLocation)); - - groupedUpdated.entrySet().forEach(s -> { - HoodieRecordLocation location = s.getKey(); - String partitionPath = s.getValue().get(0).getPartitionPath(); - - Writer logWriter; - try { - logWriter = HoodieLogFormat.newWriterBuilder() - .onParentPath(new Path(basePath, partitionPath)) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION) - .withFileId(location.getFileId()) - .overBaseCommit(location.getCommitTime()) - .withFs(fs).build(); - - Map metadata = Maps.newHashMap(); - metadata.put(HoodieLogBlock.LogMetadataType.INSTANT_TIME, location.getCommitTime()); - logWriter.appendBlock(new HoodieAvroDataBlock(s.getValue().stream().map(r -> { - try { - GenericRecord val = (GenericRecord) r.getData().getInsertValue(schema).get(); - HoodieAvroUtils.addHoodieKeyToRecord(val, - r.getRecordKey(), - r.getPartitionPath(), - ""); - return (IndexedRecord) val; - } catch (IOException e) { - return null; - } - }).collect(Collectors.toList()), schema, metadata)); - logWriter.close(); - } catch (Exception e) { - fail(e.toString()); - } - }); - } - - public static FileStatus[] listAllDataFilesInPath(FileSystem fs, String basePath) - throws IOException { - RemoteIterator itr = fs.listFiles(new Path(basePath), true); - List returns = Lists.newArrayList(); - while(itr.hasNext()) { - LocatedFileStatus status = itr.next(); - if(status.getPath().getName().contains(".parquet")) { - 
returns.add(status); - } - } - return returns.toArray(new FileStatus[returns.size()]); + public static T serializeDeserialize(T object, Class clazz) + throws IOException, ClassNotFoundException { + // Using Kyro as the default serializer in Spark Jobs + Kryo kryo = new Kryo(); + kryo.register(HoodieTableMetaClient.class, new JavaSerializer()); + + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + Output output = new Output(baos); + kryo.writeObject(output, object); + output.close(); + + Input input = new Input(new ByteArrayInputStream(baos.toByteArray())); + T deseralizedObject = kryo.readObject(input, clazz); + input.close(); + return deseralizedObject; + } + + public static void writeRecordsToLogFiles(String basePath, Schema schema, + List updatedRecords) { + Map> groupedUpdated = updatedRecords.stream() + .collect(Collectors.groupingBy(HoodieRecord::getCurrentLocation)); + + groupedUpdated.entrySet().forEach(s -> { + HoodieRecordLocation location = s.getKey(); + String partitionPath = s.getValue().get(0).getPartitionPath(); + + Writer logWriter; + try { + logWriter = HoodieLogFormat.newWriterBuilder() + .onParentPath(new Path(basePath, partitionPath)) + .withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId(location.getFileId()) + .overBaseCommit(location.getCommitTime()) + .withFs(fs).build(); + + Map metadata = Maps.newHashMap(); + metadata.put(HoodieLogBlock.LogMetadataType.INSTANT_TIME, location.getCommitTime()); + logWriter.appendBlock(new HoodieAvroDataBlock(s.getValue().stream().map(r -> { + try { + GenericRecord val = (GenericRecord) r.getData().getInsertValue(schema).get(); + HoodieAvroUtils.addHoodieKeyToRecord(val, + r.getRecordKey(), + r.getPartitionPath(), + ""); + return (IndexedRecord) val; + } catch (IOException e) { + return null; + } + }).collect(Collectors.toList()), schema, metadata)); + logWriter.close(); + } catch (Exception e) { + fail(e.toString()); + } + }); + } + + public static FileStatus[] 
listAllDataFilesInPath(FileSystem fs, String basePath) + throws IOException { + RemoteIterator itr = fs.listFiles(new Path(basePath), true); + List returns = Lists.newArrayList(); + while (itr.hasNext()) { + LocatedFileStatus status = itr.next(); + if (status.getPath().getName().contains(".parquet")) { + returns.add(status); + } } + return returns.toArray(new FileStatus[returns.size()]); + } } diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/table/HoodieTableMetaClientTest.java b/hoodie-common/src/test/java/com/uber/hoodie/common/table/HoodieTableMetaClientTest.java index b70e03c48..d771b8236 100644 --- a/hoodie-common/src/test/java/com/uber/hoodie/common/table/HoodieTableMetaClientTest.java +++ b/hoodie-common/src/test/java/com/uber/hoodie/common/table/HoodieTableMetaClientTest.java @@ -16,11 +16,20 @@ package com.uber.hoodie.common.table; +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; + import com.google.common.collect.Lists; import com.uber.hoodie.common.model.HoodieTestUtils; import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline; import com.uber.hoodie.common.table.timeline.HoodieArchivedTimeline; import com.uber.hoodie.common.table.timeline.HoodieInstant; +import java.io.IOException; +import java.util.Optional; +import java.util.stream.Collectors; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.io.SequenceFile; @@ -29,116 +38,106 @@ import org.junit.Before; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import java.io.IOException; -import java.util.Optional; -import java.util.stream.Collectors; - -import static org.junit.Assert.assertArrayEquals; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertNotNull; 
-import static org.junit.Assert.assertTrue; - public class HoodieTableMetaClientTest { - private HoodieTableMetaClient metaClient; - private String basePath; - @Before - public void init() throws IOException { - TemporaryFolder folder = new TemporaryFolder(); - folder.create(); - this.basePath = folder.getRoot().getAbsolutePath(); - metaClient = HoodieTestUtils.init(basePath); - } + private HoodieTableMetaClient metaClient; + private String basePath; - @Test - public void checkMetadata() { - assertEquals("Table name should be raw_trips", HoodieTestUtils.RAW_TRIPS_TEST_NAME, - metaClient.getTableConfig().getTableName()); - assertEquals("Basepath should be the one assigned", basePath, metaClient.getBasePath()); - assertEquals("Metapath should be ${basepath}/.hoodie", basePath + "/.hoodie", - metaClient.getMetaPath()); - } + @Before + public void init() throws IOException { + TemporaryFolder folder = new TemporaryFolder(); + folder.create(); + this.basePath = folder.getRoot().getAbsolutePath(); + metaClient = HoodieTestUtils.init(basePath); + } - @Test - public void checkSerDe() throws IOException, ClassNotFoundException { - // check if this object is serialized and de-serialized, we are able to read from the file system - HoodieTableMetaClient deseralizedMetaClient = - HoodieTestUtils.serializeDeserialize(metaClient, HoodieTableMetaClient.class); - assertNotNull(deseralizedMetaClient); - HoodieActiveTimeline commitTimeline = deseralizedMetaClient.getActiveTimeline(); - HoodieInstant instant = - new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, "1"); - commitTimeline.createInflight(instant); - commitTimeline.saveAsComplete(instant, Optional.of("test-detail".getBytes())); - commitTimeline = commitTimeline.reload(); - HoodieInstant completedInstant = HoodieTimeline.getCompletedInstant(instant); - assertEquals("Commit should be 1 and completed", completedInstant, - commitTimeline.getInstants().findFirst().get()); - assertArrayEquals("Commit value should be 
\"test-detail\"", "test-detail".getBytes(), - commitTimeline.getInstantDetails(completedInstant).get()); - } + @Test + public void checkMetadata() { + assertEquals("Table name should be raw_trips", HoodieTestUtils.RAW_TRIPS_TEST_NAME, + metaClient.getTableConfig().getTableName()); + assertEquals("Basepath should be the one assigned", basePath, metaClient.getBasePath()); + assertEquals("Metapath should be ${basepath}/.hoodie", basePath + "/.hoodie", + metaClient.getMetaPath()); + } - @Test - public void checkCommitTimeline() throws IOException { - HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline(); - HoodieTimeline activeCommitTimeline = activeTimeline.getCommitTimeline(); - assertTrue("Should be empty commit timeline", activeCommitTimeline.empty()); + @Test + public void checkSerDe() throws IOException, ClassNotFoundException { + // check if this object is serialized and de-serialized, we are able to read from the file system + HoodieTableMetaClient deseralizedMetaClient = + HoodieTestUtils.serializeDeserialize(metaClient, HoodieTableMetaClient.class); + assertNotNull(deseralizedMetaClient); + HoodieActiveTimeline commitTimeline = deseralizedMetaClient.getActiveTimeline(); + HoodieInstant instant = + new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, "1"); + commitTimeline.createInflight(instant); + commitTimeline.saveAsComplete(instant, Optional.of("test-detail".getBytes())); + commitTimeline = commitTimeline.reload(); + HoodieInstant completedInstant = HoodieTimeline.getCompletedInstant(instant); + assertEquals("Commit should be 1 and completed", completedInstant, + commitTimeline.getInstants().findFirst().get()); + assertArrayEquals("Commit value should be \"test-detail\"", "test-detail".getBytes(), + commitTimeline.getInstantDetails(completedInstant).get()); + } - HoodieInstant instant = - new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, "1"); - activeTimeline.createInflight(instant); - activeTimeline.saveAsComplete(instant, 
Optional.of("test-detail".getBytes())); + @Test + public void checkCommitTimeline() throws IOException { + HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline(); + HoodieTimeline activeCommitTimeline = activeTimeline.getCommitTimeline(); + assertTrue("Should be empty commit timeline", activeCommitTimeline.empty()); - // Commit timeline should not auto-reload every time getActiveCommitTimeline(), it should be cached - activeTimeline = metaClient.getActiveTimeline(); - activeCommitTimeline = activeTimeline.getCommitTimeline(); - assertTrue("Should be empty commit timeline", activeCommitTimeline.empty()); + HoodieInstant instant = + new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, "1"); + activeTimeline.createInflight(instant); + activeTimeline.saveAsComplete(instant, Optional.of("test-detail".getBytes())); - HoodieInstant completedInstant = HoodieTimeline.getCompletedInstant(instant); - activeTimeline = activeTimeline.reload(); - activeCommitTimeline = activeTimeline.getCommitTimeline(); - assertFalse("Should be the 1 commit we made", activeCommitTimeline.empty()); - assertEquals("Commit should be 1", completedInstant, - activeCommitTimeline.getInstants().findFirst().get()); - assertArrayEquals("Commit value should be \"test-detail\"", "test-detail".getBytes(), - activeCommitTimeline.getInstantDetails(completedInstant).get()); - } + // Commit timeline should not auto-reload every time getActiveCommitTimeline(), it should be cached + activeTimeline = metaClient.getActiveTimeline(); + activeCommitTimeline = activeTimeline.getCommitTimeline(); + assertTrue("Should be empty commit timeline", activeCommitTimeline.empty()); - @Test - public void checkArchiveCommitTimeline() throws IOException { - Path archiveLogPath = HoodieArchivedTimeline.getArchiveLogPath(metaClient.getMetaPath()); - SequenceFile.Writer writer = SequenceFile - .createWriter(HoodieTestUtils.fs.getConf(), SequenceFile.Writer.file(archiveLogPath), - 
SequenceFile.Writer.keyClass(Text.class), - SequenceFile.Writer.valueClass(Text.class)); + HoodieInstant completedInstant = HoodieTimeline.getCompletedInstant(instant); + activeTimeline = activeTimeline.reload(); + activeCommitTimeline = activeTimeline.getCommitTimeline(); + assertFalse("Should be the 1 commit we made", activeCommitTimeline.empty()); + assertEquals("Commit should be 1", completedInstant, + activeCommitTimeline.getInstants().findFirst().get()); + assertArrayEquals("Commit value should be \"test-detail\"", "test-detail".getBytes(), + activeCommitTimeline.getInstantDetails(completedInstant).get()); + } - writer.append(new Text("1"), new Text("data1")); - writer.append(new Text("2"), new Text("data2")); - writer.append(new Text("3"), new Text("data3")); + @Test + public void checkArchiveCommitTimeline() throws IOException { + Path archiveLogPath = HoodieArchivedTimeline.getArchiveLogPath(metaClient.getMetaPath()); + SequenceFile.Writer writer = SequenceFile + .createWriter(HoodieTestUtils.fs.getConf(), SequenceFile.Writer.file(archiveLogPath), + SequenceFile.Writer.keyClass(Text.class), + SequenceFile.Writer.valueClass(Text.class)); - IOUtils.closeStream(writer); + writer.append(new Text("1"), new Text("data1")); + writer.append(new Text("2"), new Text("data2")); + writer.append(new Text("3"), new Text("data3")); - HoodieArchivedTimeline archivedTimeline = metaClient.getArchivedTimeline(); + IOUtils.closeStream(writer); - HoodieInstant instant1 = - new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "1"); - HoodieInstant instant2 = - new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "2"); - HoodieInstant instant3 = - new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "3"); + HoodieArchivedTimeline archivedTimeline = metaClient.getArchivedTimeline(); - assertEquals(Lists.newArrayList(instant1, instant2, instant3), - archivedTimeline.getInstants().collect(Collectors.toList())); + HoodieInstant instant1 = + new HoodieInstant(false, 
HoodieTimeline.COMMIT_ACTION, "1"); + HoodieInstant instant2 = + new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "2"); + HoodieInstant instant3 = + new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "3"); - assertArrayEquals(new Text("data1").getBytes(), - archivedTimeline.getInstantDetails(instant1).get()); - assertArrayEquals(new Text("data2").getBytes(), - archivedTimeline.getInstantDetails(instant2).get()); - assertArrayEquals(new Text("data3").getBytes(), - archivedTimeline.getInstantDetails(instant3).get()); - } + assertEquals(Lists.newArrayList(instant1, instant2, instant3), + archivedTimeline.getInstants().collect(Collectors.toList())); + assertArrayEquals(new Text("data1").getBytes(), + archivedTimeline.getInstantDetails(instant1).get()); + assertArrayEquals(new Text("data2").getBytes(), + archivedTimeline.getInstantDetails(instant2).get()); + assertArrayEquals(new Text("data3").getBytes(), + archivedTimeline.getInstantDetails(instant3).get()); + } } diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/table/log/HoodieLogFormatTest.java b/hoodie-common/src/test/java/com/uber/hoodie/common/table/log/HoodieLogFormatTest.java index 607cdaea5..8fc7fb46f 100644 --- a/hoodie-common/src/test/java/com/uber/hoodie/common/table/log/HoodieLogFormatTest.java +++ b/hoodie-common/src/test/java/com/uber/hoodie/common/table/log/HoodieLogFormatTest.java @@ -16,13 +16,18 @@ package com.uber.hoodie.common.table.log; +import static com.uber.hoodie.common.util.SchemaTestUtil.getSimpleSchema; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + import com.google.common.collect.Maps; import com.uber.hoodie.common.minicluster.MiniClusterUtil; import com.uber.hoodie.common.model.HoodieLogFile; import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieTableType; import 
com.uber.hoodie.common.model.HoodieTestUtils; -import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.table.log.HoodieLogFormat.Reader; import com.uber.hoodie.common.table.log.HoodieLogFormat.Writer; import com.uber.hoodie.common.table.log.block.HoodieAvroDataBlock; @@ -35,6 +40,15 @@ import com.uber.hoodie.common.table.log.block.HoodieLogBlock.HoodieLogBlockType; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.common.util.HoodieAvroUtils; import com.uber.hoodie.common.util.SchemaTestUtil; +import java.io.IOException; +import java.net.URISyntaxException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; @@ -48,22 +62,6 @@ import org.junit.BeforeClass; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import java.io.IOException; -import java.net.URISyntaxException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; - -import static com.uber.hoodie.common.util.SchemaTestUtil.getSimpleSchema; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; -import static org.junit.Assert.fail; - @SuppressWarnings("Duplicates") public class HoodieLogFormatTest { @@ -140,7 +138,7 @@ public class HoodieLogFormatTest { Map metadata = Maps.newHashMap(); metadata.put(HoodieLogBlock.LogMetadataType.INSTANT_TIME, "100"); HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, - getSimpleSchema(), metadata); + getSimpleSchema(), metadata); // Write out a block writer = writer.appendBlock(dataBlock); // Get the size of the block @@ 
-170,7 +168,7 @@ public class HoodieLogFormatTest { Map metadata = Maps.newHashMap(); metadata.put(HoodieLogBlock.LogMetadataType.INSTANT_TIME, "100"); HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, - getSimpleSchema(), metadata); + getSimpleSchema(), metadata); writer = writer.appendBlock(dataBlock); long size1 = writer.getCurrentSize(); writer.close(); @@ -222,7 +220,7 @@ public class HoodieLogFormatTest { Map metadata = Maps.newHashMap(); metadata.put(HoodieLogBlock.LogMetadataType.INSTANT_TIME, "100"); HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, - getSimpleSchema(), metadata); + getSimpleSchema(), metadata); writer = writer.appendBlock(dataBlock); long size1 = writer.getCurrentSize(); // do not close this writer - this simulates a data note appending to a log dying without closing the file @@ -254,11 +252,12 @@ public class HoodieLogFormatTest { Map metadata = Maps.newHashMap(); metadata.put(HoodieLogBlock.LogMetadataType.INSTANT_TIME, "100"); HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, - getSimpleSchema(), metadata); + getSimpleSchema(), metadata); writer = writer.appendBlock(dataBlock); writer.close(); - Reader reader = HoodieLogFormat.newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema(), true); + Reader reader = HoodieLogFormat + .newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema(), true); assertTrue("We wrote a block, we should be able to read it", reader.hasNext()); HoodieLogBlock nextBlock = reader.next(); assertEquals("The next block should be a data block", HoodieLogBlockType.AVRO_DATA_BLOCK, @@ -281,7 +280,7 @@ public class HoodieLogFormatTest { Map metadata = Maps.newHashMap(); metadata.put(HoodieLogBlock.LogMetadataType.INSTANT_TIME, "100"); HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, - getSimpleSchema(), metadata); + getSimpleSchema(), metadata); writer = writer.appendBlock(dataBlock); writer.close(); @@ -304,7 +303,8 @@ public class 
HoodieLogFormatTest { writer = writer.appendBlock(dataBlock); writer.close(); - Reader reader = HoodieLogFormat.newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema(), true); + Reader reader = HoodieLogFormat + .newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema(), true); assertTrue("First block should be available", reader.hasNext()); HoodieLogBlock nextBlock = reader.next(); HoodieAvroDataBlock dataBlockRead = (HoodieAvroDataBlock) nextBlock; @@ -338,7 +338,7 @@ public class HoodieLogFormatTest { Map metadata = Maps.newHashMap(); metadata.put(HoodieLogBlock.LogMetadataType.INSTANT_TIME, "100"); HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, - getSimpleSchema(), metadata); + getSimpleSchema(), metadata); writer = writer.appendBlock(dataBlock); writer.close(); @@ -358,7 +358,8 @@ public class HoodieLogFormatTest { outputStream.close(); // First round of reads - we should be able to read the first block and then EOF - Reader reader = HoodieLogFormat.newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema(), true); + Reader reader = HoodieLogFormat + .newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema(), true); assertTrue("First block should be available", reader.hasNext()); reader.next(); assertTrue("We should have corrupted block next", reader.hasNext()); @@ -393,7 +394,8 @@ public class HoodieLogFormatTest { writer.close(); // Second round of reads - we should be able to read the first and last block - reader = HoodieLogFormat.newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema(), true); + reader = HoodieLogFormat + .newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema(), true); assertTrue("First block should be available", reader.hasNext()); reader.next(); assertTrue("We should get the 1st corrupted block next", reader.hasNext()); @@ -424,7 +426,7 @@ public class HoodieLogFormatTest { Map metadata = Maps.newHashMap(); 
metadata.put(HoodieLogBlock.LogMetadataType.INSTANT_TIME, "100"); HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, - schema, metadata); + schema, metadata); writer = writer.appendBlock(dataBlock); // Write 2 @@ -438,7 +440,8 @@ public class HoodieLogFormatTest { .map(s -> s.getPath().toString()) .collect(Collectors.toList()); - HoodieCompactedLogRecordScanner scanner = new HoodieCompactedLogRecordScanner(fs, basePath, allLogFiles, + HoodieCompactedLogRecordScanner scanner = new HoodieCompactedLogRecordScanner(fs, basePath, + allLogFiles, schema, "100"); assertEquals("", 200, scanner.getTotalLogRecords()); Set readKeys = new HashSet<>(200); @@ -469,7 +472,7 @@ public class HoodieLogFormatTest { metadata.put(HoodieLogBlock.LogMetadataType.TARGET_INSTANT_TIME, "100"); HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, - schema, metadata); + schema, metadata); writer = writer.appendBlock(dataBlock); // Write 2 @@ -493,9 +496,11 @@ public class HoodieLogFormatTest { .map(s -> s.getPath().toString()) .collect(Collectors.toList()); - HoodieCompactedLogRecordScanner scanner = new HoodieCompactedLogRecordScanner(fs, basePath, allLogFiles, + HoodieCompactedLogRecordScanner scanner = new HoodieCompactedLogRecordScanner(fs, basePath, + allLogFiles, schema, "100"); - assertEquals("We only read 200 records, but only 200 of them are valid", 200, scanner.getTotalLogRecords()); + assertEquals("We only read 200 records, but only 200 of them are valid", 200, + scanner.getTotalLogRecords()); Set readKeys = new HashSet<>(200); scanner.forEach(s -> readKeys.add(s.getKey().getRecordKey())); assertEquals("Stream collect should return all 200 records", 200, readKeys.size()); @@ -523,7 +528,7 @@ public class HoodieLogFormatTest { metadata.put(HoodieLogBlock.LogMetadataType.INSTANT_TIME, "100"); metadata.put(HoodieLogBlock.LogMetadataType.TARGET_INSTANT_TIME, "100"); HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, - schema, metadata); + 
schema, metadata); writer = writer.appendBlock(dataBlock); writer.close(); @@ -561,7 +566,8 @@ public class HoodieLogFormatTest { .map(s -> s.getPath().toString()) .collect(Collectors.toList()); - HoodieCompactedLogRecordScanner scanner = new HoodieCompactedLogRecordScanner(fs, basePath, allLogFiles, + HoodieCompactedLogRecordScanner scanner = new HoodieCompactedLogRecordScanner(fs, basePath, + allLogFiles, schema, "100"); assertEquals("We would read 200 records", 200, scanner.getTotalLogRecords()); @@ -592,7 +598,7 @@ public class HoodieLogFormatTest { metadata.put(HoodieLogBlock.LogMetadataType.INSTANT_TIME, "100"); metadata.put(HoodieLogBlock.LogMetadataType.TARGET_INSTANT_TIME, "100"); HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, - schema, metadata); + schema, metadata); writer = writer.appendBlock(dataBlock); // Write 2 @@ -609,7 +615,8 @@ public class HoodieLogFormatTest { // Delete 50 keys List deletedKeys = originalKeys.subList(0, 50); - HoodieDeleteBlock deleteBlock = new HoodieDeleteBlock(deletedKeys.toArray(new String[50]), metadata); + HoodieDeleteBlock deleteBlock = new HoodieDeleteBlock(deletedKeys.toArray(new String[50]), + metadata); writer = writer.appendBlock(deleteBlock); List allLogFiles = FSUtils @@ -617,7 +624,8 @@ public class HoodieLogFormatTest { .map(s -> s.getPath().toString()) .collect(Collectors.toList()); - HoodieCompactedLogRecordScanner scanner = new HoodieCompactedLogRecordScanner(fs, basePath, allLogFiles, + HoodieCompactedLogRecordScanner scanner = new HoodieCompactedLogRecordScanner(fs, basePath, + allLogFiles, schema, "100"); assertEquals("We still would read 200 records", 200, scanner.getTotalLogRecords()); @@ -632,25 +640,26 @@ public class HoodieLogFormatTest { // Rollback the last block HoodieCommandBlock commandBlock = new HoodieCommandBlock( - HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK, metadata); + HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK, metadata); writer = 
writer.appendBlock(commandBlock); readKeys.clear(); scanner = new HoodieCompactedLogRecordScanner(fs, basePath, allLogFiles, schema, "100"); scanner.forEach(s -> readKeys.add(s.getKey().getRecordKey())); - assertEquals("Stream collect should return all 200 records after rollback of delete", 200, readKeys.size()); + assertEquals("Stream collect should return all 200 records after rollback of delete", 200, + readKeys.size()); } @Test public void testAvroLogRecordReaderWithFailedRollbacks() - throws IOException, URISyntaxException, InterruptedException { + throws IOException, URISyntaxException, InterruptedException { // Write a Data block and Delete block with same InstantTime (written in same batch) Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); // Set a small threshold so that every block is a new version Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") - .overBaseCommit("100").withFs(fs).build(); + .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") + .overBaseCommit("100").withFs(fs).build(); // Write 1 List records1 = SchemaTestUtil.generateHoodieTestRecords(0, 100); @@ -658,7 +667,7 @@ public class HoodieLogFormatTest { metadata.put(HoodieLogBlock.LogMetadataType.INSTANT_TIME, "100"); metadata.put(HoodieLogBlock.LogMetadataType.TARGET_INSTANT_TIME, "100"); HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, - schema, metadata); + schema, metadata); writer = writer.appendBlock(dataBlock); // Write 2 @@ -667,23 +676,24 @@ public class HoodieLogFormatTest { writer = writer.appendBlock(dataBlock); List originalKeys = records1.stream() - .map(s -> ((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString()) - .collect( - Collectors.toList()); + .map(s -> ((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString()) + .collect( + Collectors.toList()); // Delete 50 keys 
List deletedKeys = originalKeys.subList(0, 50); - HoodieDeleteBlock deleteBlock = new HoodieDeleteBlock(deletedKeys.toArray(new String[50]), metadata); + HoodieDeleteBlock deleteBlock = new HoodieDeleteBlock(deletedKeys.toArray(new String[50]), + metadata); writer = writer.appendBlock(deleteBlock); // Attemp 1 : Write 2 rollback blocks (1 data block + 1 delete bloc) for a failed write HoodieCommandBlock commandBlock = new HoodieCommandBlock( - HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK, metadata); + HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK, metadata); try { writer = writer.appendBlock(commandBlock); // Say job failed, retry writing 2 rollback in the next rollback(..) attempt throw new Exception("simulating failure"); - } catch(Exception e) { + } catch (Exception e) { // it's okay } // Attempt 2 : Write 2 rollback blocks (1 data block + 1 delete bloc) for a failed write @@ -691,14 +701,15 @@ public class HoodieLogFormatTest { writer = writer.appendBlock(commandBlock); List allLogFiles = FSUtils - .getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100") - .map(s -> s.getPath().toString()) - .collect(Collectors.toList()); + .getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100") + .map(s -> s.getPath().toString()) + .collect(Collectors.toList()); - HoodieCompactedLogRecordScanner scanner = new HoodieCompactedLogRecordScanner(fs, basePath, allLogFiles, - schema, "100"); + HoodieCompactedLogRecordScanner scanner = new HoodieCompactedLogRecordScanner(fs, basePath, + allLogFiles, + schema, "100"); assertEquals("We would read 100 records", 100, - scanner.getTotalLogRecords()); + scanner.getTotalLogRecords()); final List readKeys = new ArrayList<>(100); scanner.forEach(s -> readKeys.add(s.getKey().getRecordKey())); @@ -707,14 +718,14 @@ public class HoodieLogFormatTest { @Test public void testAvroLogRecordReaderWithInsertDeleteAndRollback() - throws IOException, URISyntaxException, 
InterruptedException { + throws IOException, URISyntaxException, InterruptedException { // Write a Data block and Delete block with same InstantTime (written in same batch) Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); // Set a small threshold so that every block is a new version Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") - .overBaseCommit("100").withFs(fs).build(); + .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") + .overBaseCommit("100").withFs(fs).build(); // Write 1 List records1 = SchemaTestUtil.generateHoodieTestRecords(0, 100); @@ -722,43 +733,45 @@ public class HoodieLogFormatTest { metadata.put(HoodieLogBlock.LogMetadataType.INSTANT_TIME, "100"); metadata.put(HoodieLogBlock.LogMetadataType.TARGET_INSTANT_TIME, "100"); HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, - schema, metadata); + schema, metadata); writer = writer.appendBlock(dataBlock); List originalKeys = records1.stream() - .map(s -> ((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString()) - .collect( - Collectors.toList()); + .map(s -> ((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString()) + .collect( + Collectors.toList()); // Delete 50 keys List deletedKeys = originalKeys.subList(0, 50); - HoodieDeleteBlock deleteBlock = new HoodieDeleteBlock(deletedKeys.toArray(new String[50]), metadata); + HoodieDeleteBlock deleteBlock = new HoodieDeleteBlock(deletedKeys.toArray(new String[50]), + metadata); writer = writer.appendBlock(deleteBlock); // Write 2 rollback blocks (1 data block + 1 delete bloc) for a failed write HoodieCommandBlock commandBlock = new HoodieCommandBlock( - HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK, metadata); + HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK, metadata); writer = writer.appendBlock(commandBlock); writer = 
writer.appendBlock(commandBlock); List allLogFiles = FSUtils - .getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100") - .map(s -> s.getPath().toString()) - .collect(Collectors.toList()); + .getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100") + .map(s -> s.getPath().toString()) + .collect(Collectors.toList()); HoodieCompactedLogRecordScanner scanner = new HoodieCompactedLogRecordScanner(fs, basePath, - allLogFiles, schema, "100"); + allLogFiles, schema, "100"); assertEquals("We would read 0 records", 0, - scanner.getTotalLogRecords()); + scanner.getTotalLogRecords()); } @Test - public void testAvroLogRecordReaderWithInvalidRollback() throws IOException, URISyntaxException, InterruptedException { + public void testAvroLogRecordReaderWithInvalidRollback() + throws IOException, URISyntaxException, InterruptedException { Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); // Set a small threshold so that every block is a new version Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") - .overBaseCommit("100").withFs(fs).build(); + .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") + .overBaseCommit("100").withFs(fs).build(); // Write 1 List records1 = SchemaTestUtil.generateHoodieTestRecords(0, 100); @@ -766,23 +779,23 @@ public class HoodieLogFormatTest { metadata.put(HoodieLogBlock.LogMetadataType.INSTANT_TIME, "100"); metadata.put(HoodieLogBlock.LogMetadataType.TARGET_INSTANT_TIME, "101"); HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, - schema, metadata); + schema, metadata); writer = writer.appendBlock(dataBlock); // Write invalid rollback for a failed write (possible for in-flight commits) HoodieCommandBlock commandBlock = new HoodieCommandBlock( - HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK, metadata); + 
HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK, metadata); writer = writer.appendBlock(commandBlock); List allLogFiles = FSUtils - .getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100") - .map(s -> s.getPath().toString()) - .collect(Collectors.toList()); + .getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100") + .map(s -> s.getPath().toString()) + .collect(Collectors.toList()); HoodieCompactedLogRecordScanner scanner = new HoodieCompactedLogRecordScanner(fs, basePath, - allLogFiles, schema, "100"); + allLogFiles, schema, "100"); assertEquals("We still would read 100 records", 100, - scanner.getTotalLogRecords()); + scanner.getTotalLogRecords()); final List readKeys = new ArrayList<>(100); scanner.forEach(s -> readKeys.add(s.getKey().getRecordKey())); assertEquals("Stream collect should return all 150 records", 100, readKeys.size()); diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/table/string/HoodieActiveTimelineTest.java b/hoodie-common/src/test/java/com/uber/hoodie/common/table/string/HoodieActiveTimelineTest.java index cc520acf2..22285a6c5 100644 --- a/hoodie-common/src/test/java/com/uber/hoodie/common/table/string/HoodieActiveTimelineTest.java +++ b/hoodie-common/src/test/java/com/uber/hoodie/common/table/string/HoodieActiveTimelineTest.java @@ -16,11 +16,18 @@ package com.uber.hoodie.common.table.string; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + import com.uber.hoodie.common.model.HoodieTestUtils; import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline; import com.uber.hoodie.common.table.timeline.HoodieInstant; +import java.io.IOException; +import java.util.Optional; +import java.util.stream.Stream; import org.apache.hadoop.fs.Path; import org.junit.After; 
import org.junit.Before; @@ -28,109 +35,104 @@ import org.junit.Rule; import org.junit.Test; import org.junit.rules.ExpectedException; -import java.io.IOException; -import java.util.Optional; -import java.util.stream.Stream; - -import static org.junit.Assert.*; - public class HoodieActiveTimelineTest { - private HoodieActiveTimeline timeline; - private HoodieTableMetaClient metaClient; - @Rule - public final ExpectedException exception = ExpectedException.none(); - @Before - public void setUp() throws Exception { - this.metaClient = HoodieTestUtils.initOnTemp(); - } + private HoodieActiveTimeline timeline; + private HoodieTableMetaClient metaClient; + @Rule + public final ExpectedException exception = ExpectedException.none(); - @After - public void tearDown() throws Exception { - HoodieTestUtils.fs.delete(new Path(this.metaClient.getBasePath()), true); - } + @Before + public void setUp() throws Exception { + this.metaClient = HoodieTestUtils.initOnTemp(); + } - @Test - public void testLoadingInstantsFromFiles() throws IOException { - HoodieInstant instant1 = - new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, "1"); - HoodieInstant instant2 = - new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, "3"); - HoodieInstant instant3 = - new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, "5"); - HoodieInstant instant4 = - new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, "8"); - HoodieInstant instant1_complete = - new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "1"); - HoodieInstant instant2_complete = - new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "3"); - HoodieInstant instant3_complete = - new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "5"); - HoodieInstant instant4_complete = - new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "8"); + @After + public void tearDown() throws Exception { + HoodieTestUtils.fs.delete(new Path(this.metaClient.getBasePath()), true); + } - HoodieInstant instant5 = - new HoodieInstant(true, 
HoodieTimeline.COMMIT_ACTION, "9"); + @Test + public void testLoadingInstantsFromFiles() throws IOException { + HoodieInstant instant1 = + new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, "1"); + HoodieInstant instant2 = + new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, "3"); + HoodieInstant instant3 = + new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, "5"); + HoodieInstant instant4 = + new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, "8"); + HoodieInstant instant1_complete = + new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "1"); + HoodieInstant instant2_complete = + new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "3"); + HoodieInstant instant3_complete = + new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "5"); + HoodieInstant instant4_complete = + new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "8"); - timeline = new HoodieActiveTimeline(HoodieTestUtils.fs, metaClient.getMetaPath()); - timeline.saveAsComplete(instant1, Optional.empty()); - timeline.saveAsComplete(instant2, Optional.empty()); - timeline.saveAsComplete(instant3, Optional.empty()); - timeline.saveAsComplete(instant4, Optional.empty()); - timeline.createInflight(instant5); - timeline = timeline.reload(); + HoodieInstant instant5 = + new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, "9"); - assertEquals("Total instants should be 5", 5, timeline.countInstants()); - HoodieTestUtils.assertStreamEquals("Check the instants stream", Stream - .of(instant1_complete, instant2_complete, instant3_complete, instant4_complete, - instant5), timeline.getInstants()); - HoodieTestUtils.assertStreamEquals("Check the instants stream", Stream - .of(instant1_complete, instant2_complete, instant3_complete, instant4_complete, - instant5), timeline.getCommitTimeline().getInstants()); - HoodieTestUtils.assertStreamEquals("Check the instants stream", - Stream.of(instant1_complete, instant2_complete, instant3_complete, instant4_complete), - 
timeline.getCommitTimeline().filterCompletedInstants().getInstants()); - HoodieTestUtils.assertStreamEquals("Check the instants stream", Stream.of(instant5), - timeline.getCommitTimeline().filterInflights().getInstants()); - } + timeline = new HoodieActiveTimeline(HoodieTestUtils.fs, metaClient.getMetaPath()); + timeline.saveAsComplete(instant1, Optional.empty()); + timeline.saveAsComplete(instant2, Optional.empty()); + timeline.saveAsComplete(instant3, Optional.empty()); + timeline.saveAsComplete(instant4, Optional.empty()); + timeline.createInflight(instant5); + timeline = timeline.reload(); - @Test - public void testTimelineOperationsBasic() throws Exception { - timeline = new HoodieActiveTimeline(HoodieTestUtils.fs, metaClient.getMetaPath()); - assertTrue(timeline.empty()); - assertEquals("", 0, timeline.countInstants()); - assertEquals("", Optional.empty(), timeline.firstInstant()); - assertEquals("", Optional.empty(), timeline.nthInstant(5)); - assertEquals("", Optional.empty(), timeline.nthInstant(-1)); - assertEquals("", Optional.empty(), timeline.lastInstant()); - assertFalse("", timeline.containsInstant( - new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "01"))); - } + assertEquals("Total instants should be 5", 5, timeline.countInstants()); + HoodieTestUtils.assertStreamEquals("Check the instants stream", Stream + .of(instant1_complete, instant2_complete, instant3_complete, instant4_complete, + instant5), timeline.getInstants()); + HoodieTestUtils.assertStreamEquals("Check the instants stream", Stream + .of(instant1_complete, instant2_complete, instant3_complete, instant4_complete, + instant5), timeline.getCommitTimeline().getInstants()); + HoodieTestUtils.assertStreamEquals("Check the instants stream", + Stream.of(instant1_complete, instant2_complete, instant3_complete, instant4_complete), + timeline.getCommitTimeline().filterCompletedInstants().getInstants()); + HoodieTestUtils.assertStreamEquals("Check the instants stream", Stream.of(instant5), + 
timeline.getCommitTimeline().filterInflights().getInstants()); + } - @Test - public void testTimelineOperations() throws Exception { - timeline = new MockHoodieTimeline( - Stream.of("01", "03", "05", "07", "09", "11", "13", "15", "17", "19"), - Stream.of("21", "23")); - HoodieTestUtils.assertStreamEquals("", Stream.of("05", "07", "09", "11"), - timeline.getCommitTimeline().filterCompletedInstants().findInstantsInRange("04", "11") - .getInstants().map(HoodieInstant::getTimestamp)); - HoodieTestUtils.assertStreamEquals("", Stream.of("09", "11"), - timeline.getCommitTimeline().filterCompletedInstants().findInstantsAfter("07", 2) - .getInstants().map(HoodieInstant::getTimestamp)); - assertFalse(timeline.empty()); - assertFalse(timeline.getCommitTimeline().filterInflights().empty()); - assertEquals("", 12, timeline.countInstants()); - HoodieTimeline activeCommitTimeline = timeline.getCommitTimeline().filterCompletedInstants(); - assertEquals("", 10, activeCommitTimeline.countInstants()); + @Test + public void testTimelineOperationsBasic() throws Exception { + timeline = new HoodieActiveTimeline(HoodieTestUtils.fs, metaClient.getMetaPath()); + assertTrue(timeline.empty()); + assertEquals("", 0, timeline.countInstants()); + assertEquals("", Optional.empty(), timeline.firstInstant()); + assertEquals("", Optional.empty(), timeline.nthInstant(5)); + assertEquals("", Optional.empty(), timeline.nthInstant(-1)); + assertEquals("", Optional.empty(), timeline.lastInstant()); + assertFalse("", timeline.containsInstant( + new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "01"))); + } - assertEquals("", "01", activeCommitTimeline.firstInstant().get().getTimestamp()); - assertEquals("", "11", activeCommitTimeline.nthInstant(5).get().getTimestamp()); - assertEquals("", "19", activeCommitTimeline.lastInstant().get().getTimestamp()); - assertEquals("", "09", activeCommitTimeline.nthFromLastInstant(5).get().getTimestamp()); - assertTrue("", activeCommitTimeline.containsInstant( - 
new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "09"))); - assertFalse("", activeCommitTimeline.isBeforeTimelineStarts("02")); - assertTrue("", activeCommitTimeline.isBeforeTimelineStarts("00")); - } + @Test + public void testTimelineOperations() throws Exception { + timeline = new MockHoodieTimeline( + Stream.of("01", "03", "05", "07", "09", "11", "13", "15", "17", "19"), + Stream.of("21", "23")); + HoodieTestUtils.assertStreamEquals("", Stream.of("05", "07", "09", "11"), + timeline.getCommitTimeline().filterCompletedInstants().findInstantsInRange("04", "11") + .getInstants().map(HoodieInstant::getTimestamp)); + HoodieTestUtils.assertStreamEquals("", Stream.of("09", "11"), + timeline.getCommitTimeline().filterCompletedInstants().findInstantsAfter("07", 2) + .getInstants().map(HoodieInstant::getTimestamp)); + assertFalse(timeline.empty()); + assertFalse(timeline.getCommitTimeline().filterInflights().empty()); + assertEquals("", 12, timeline.countInstants()); + HoodieTimeline activeCommitTimeline = timeline.getCommitTimeline().filterCompletedInstants(); + assertEquals("", 10, activeCommitTimeline.countInstants()); + + assertEquals("", "01", activeCommitTimeline.firstInstant().get().getTimestamp()); + assertEquals("", "11", activeCommitTimeline.nthInstant(5).get().getTimestamp()); + assertEquals("", "19", activeCommitTimeline.lastInstant().get().getTimestamp()); + assertEquals("", "09", activeCommitTimeline.nthFromLastInstant(5).get().getTimestamp()); + assertTrue("", activeCommitTimeline.containsInstant( + new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "09"))); + assertFalse("", activeCommitTimeline.isBeforeTimelineStarts("02")); + assertTrue("", activeCommitTimeline.isBeforeTimelineStarts("00")); + } } diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/table/string/MockHoodieTimeline.java b/hoodie-common/src/test/java/com/uber/hoodie/common/table/string/MockHoodieTimeline.java index 050bbe145..5e3b48844 100644 --- 
a/hoodie-common/src/test/java/com/uber/hoodie/common/table/string/MockHoodieTimeline.java +++ b/hoodie-common/src/test/java/com/uber/hoodie/common/table/string/MockHoodieTimeline.java @@ -19,7 +19,6 @@ package com.uber.hoodie.common.table.string; import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline; import com.uber.hoodie.common.table.timeline.HoodieInstant; - import java.io.IOException; import java.util.Comparator; import java.util.function.Function; @@ -27,18 +26,19 @@ import java.util.stream.Collectors; import java.util.stream.Stream; public class MockHoodieTimeline extends HoodieActiveTimeline { - public MockHoodieTimeline(Stream completed, Stream inflights) - throws IOException { - super(); - this.instants = Stream.concat(completed - .map(s -> new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, s)), - inflights.map( - s -> new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, s))) - .sorted(Comparator.comparing(new Function() { - @Override - public String apply(HoodieInstant hoodieInstant) { - return hoodieInstant.getFileName(); - } - })).collect(Collectors.toList()); - } + + public MockHoodieTimeline(Stream completed, Stream inflights) + throws IOException { + super(); + this.instants = Stream.concat(completed + .map(s -> new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, s)), + inflights.map( + s -> new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, s))) + .sorted(Comparator.comparing(new Function() { + @Override + public String apply(HoodieInstant hoodieInstant) { + return hoodieInstant.getFileName(); + } + })).collect(Collectors.toList()); + } } diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/table/view/HoodieTableFileSystemViewTest.java b/hoodie-common/src/test/java/com/uber/hoodie/common/table/view/HoodieTableFileSystemViewTest.java index 1e273fa1e..992b86416 100644 --- 
a/hoodie-common/src/test/java/com/uber/hoodie/common/table/view/HoodieTableFileSystemViewTest.java +++ b/hoodie-common/src/test/java/com/uber/hoodie/common/table/view/HoodieTableFileSystemViewTest.java @@ -16,9 +16,12 @@ package com.uber.hoodie.common.table.view; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + import com.google.common.collect.Lists; import com.google.common.collect.Sets; - import com.uber.hoodie.common.model.FileSlice; import com.uber.hoodie.common.model.HoodieDataFile; import com.uber.hoodie.common.model.HoodieFileGroup; @@ -30,499 +33,501 @@ import com.uber.hoodie.common.table.TableFileSystemView; import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline; import com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.common.util.FSUtils; +import java.io.File; +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.UUID; +import java.util.stream.Collectors; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; import org.junit.Before; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import java.io.File; -import java.io.IOException; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.Set; -import java.util.UUID; -import java.util.function.Function; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -import static org.junit.Assert.*; - @SuppressWarnings("ResultOfMethodCallIgnored") public class HoodieTableFileSystemViewTest { - private HoodieTableMetaClient metaClient; - private String basePath; - private TableFileSystemView fsView; - private TableFileSystemView.ReadOptimizedView roView; - private TableFileSystemView.RealtimeView rtView; - @Before - public void init() throws IOException { - TemporaryFolder folder = new 
TemporaryFolder(); - folder.create(); - this.basePath = folder.getRoot().getAbsolutePath(); - metaClient = HoodieTestUtils.init(basePath); - fsView = new HoodieTableFileSystemView(metaClient, - metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants()); - roView = (TableFileSystemView.ReadOptimizedView) fsView; - rtView = (TableFileSystemView.RealtimeView) fsView; + private HoodieTableMetaClient metaClient; + private String basePath; + private TableFileSystemView fsView; + private TableFileSystemView.ReadOptimizedView roView; + private TableFileSystemView.RealtimeView rtView; + + @Before + public void init() throws IOException { + TemporaryFolder folder = new TemporaryFolder(); + folder.create(); + this.basePath = folder.getRoot().getAbsolutePath(); + metaClient = HoodieTestUtils.init(basePath); + fsView = new HoodieTableFileSystemView(metaClient, + metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants()); + roView = (TableFileSystemView.ReadOptimizedView) fsView; + rtView = (TableFileSystemView.RealtimeView) fsView; + } + + private void refreshFsView(FileStatus[] statuses) { + metaClient = new HoodieTableMetaClient(HoodieTestUtils.fs, basePath, true); + if (statuses != null) { + fsView = new HoodieTableFileSystemView(metaClient, + metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(), + statuses); + } else { + fsView = new HoodieTableFileSystemView(metaClient, + metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants()); } + roView = (TableFileSystemView.ReadOptimizedView) fsView; + rtView = (TableFileSystemView.RealtimeView) fsView; + } - private void refreshFsView(FileStatus[] statuses) { - metaClient = new HoodieTableMetaClient(HoodieTestUtils.fs, basePath, true); - if (statuses != null) { - fsView = new HoodieTableFileSystemView(metaClient, - metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(), - statuses); - } else { - fsView = new 
HoodieTableFileSystemView(metaClient, - metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants()); - } - roView = (TableFileSystemView.ReadOptimizedView) fsView; - rtView = (TableFileSystemView.RealtimeView) fsView; - } + @Test + public void testGetLatestDataFilesForFileId() throws IOException { + String partitionPath = "2016/05/01"; + new File(basePath + "/" + partitionPath).mkdirs(); + String fileId = UUID.randomUUID().toString(); - @Test - public void testGetLatestDataFilesForFileId() throws IOException { - String partitionPath = "2016/05/01"; - new File(basePath + "/" + partitionPath).mkdirs(); - String fileId = UUID.randomUUID().toString(); + assertFalse("No commit, should not find any data file", + roView.getLatestDataFiles(partitionPath) + .filter(dfile -> dfile.getFileId().equals(fileId)).findFirst().isPresent()); - assertFalse("No commit, should not find any data file", - roView.getLatestDataFiles(partitionPath) - .filter(dfile -> dfile.getFileId().equals(fileId)).findFirst().isPresent()); + // Only one commit, but is not safe + String commitTime1 = "1"; + String fileName1 = FSUtils.makeDataFileName(commitTime1, 1, fileId); + new File(basePath + "/" + partitionPath + "/" + fileName1).createNewFile(); + refreshFsView(null); + assertFalse("No commit, should not find any data file", + roView.getLatestDataFiles(partitionPath) + .filter(dfile -> dfile.getFileId().equals(fileId)) + .findFirst().isPresent()); - // Only one commit, but is not safe - String commitTime1 = "1"; - String fileName1 = FSUtils.makeDataFileName(commitTime1, 1, fileId); - new File(basePath + "/" + partitionPath + "/" + fileName1).createNewFile(); - refreshFsView(null); - assertFalse("No commit, should not find any data file", - roView.getLatestDataFiles(partitionPath) - .filter(dfile -> dfile.getFileId().equals(fileId)) - .findFirst().isPresent()); + // Make this commit safe + HoodieActiveTimeline commitTimeline = metaClient.getActiveTimeline(); + HoodieInstant instant1 
= + new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, commitTime1); + commitTimeline.saveAsComplete(instant1, Optional.empty()); + refreshFsView(null); + assertEquals("", fileName1, roView + .getLatestDataFiles(partitionPath) + .filter(dfile -> dfile.getFileId().equals(fileId)) + .findFirst().get() + .getFileName()); - // Make this commit safe - HoodieActiveTimeline commitTimeline = metaClient.getActiveTimeline(); - HoodieInstant instant1 = - new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, commitTime1); - commitTimeline.saveAsComplete(instant1, Optional.empty()); - refreshFsView(null); - assertEquals("", fileName1, roView - .getLatestDataFiles(partitionPath) - .filter(dfile -> dfile.getFileId().equals(fileId)) - .findFirst().get() - .getFileName()); + // Do another commit, but not safe + String commitTime2 = "2"; + String fileName2 = FSUtils.makeDataFileName(commitTime2, 1, fileId); + new File(basePath + "/" + partitionPath + "/" + fileName2).createNewFile(); + refreshFsView(null); + assertEquals("", fileName1, roView + .getLatestDataFiles(partitionPath) + .filter(dfile -> dfile.getFileId().equals(fileId)) + .findFirst().get() + .getFileName()); - // Do another commit, but not safe - String commitTime2 = "2"; - String fileName2 = FSUtils.makeDataFileName(commitTime2, 1, fileId); - new File(basePath + "/" + partitionPath + "/" + fileName2).createNewFile(); - refreshFsView(null); - assertEquals("", fileName1, roView - .getLatestDataFiles(partitionPath) - .filter(dfile -> dfile.getFileId().equals(fileId)) - .findFirst().get() - .getFileName()); + // Make it safe + HoodieInstant instant2 = + new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, commitTime2); + commitTimeline.saveAsComplete(instant2, Optional.empty()); + refreshFsView(null); + assertEquals("", fileName2, roView + .getLatestDataFiles(partitionPath) + .filter(dfile -> dfile.getFileId().equals(fileId)) + .findFirst().get() + .getFileName()); + } - // Make it safe - HoodieInstant instant2 = - new 
HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, commitTime2); - commitTimeline.saveAsComplete(instant2, Optional.empty()); - refreshFsView(null); - assertEquals("", fileName2, roView - .getLatestDataFiles(partitionPath) - .filter(dfile -> dfile.getFileId().equals(fileId)) - .findFirst().get() - .getFileName()); - } + @Test + public void testStreamLatestVersionInPartition() throws IOException { + // Put some files in the partition + String fullPartitionPath = basePath + "/2016/05/01/"; + new File(fullPartitionPath).mkdirs(); + String commitTime1 = "1"; + String commitTime2 = "2"; + String commitTime3 = "3"; + String commitTime4 = "4"; + String fileId1 = UUID.randomUUID().toString(); + String fileId2 = UUID.randomUUID().toString(); + String fileId3 = UUID.randomUUID().toString(); + String fileId4 = UUID.randomUUID().toString(); - @Test - public void testStreamLatestVersionInPartition() throws IOException { - // Put some files in the partition - String fullPartitionPath = basePath + "/2016/05/01/"; - new File(fullPartitionPath).mkdirs(); - String commitTime1 = "1"; - String commitTime2 = "2"; - String commitTime3 = "3"; - String commitTime4 = "4"; - String fileId1 = UUID.randomUUID().toString(); - String fileId2 = UUID.randomUUID().toString(); - String fileId3 = UUID.randomUUID().toString(); - String fileId4 = UUID.randomUUID().toString(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, 1, fileId1)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, 1, fileId1)) + .createNewFile(); + new File(fullPartitionPath + FSUtils + .makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime4, 0)) + .createNewFile(); + new File(fullPartitionPath + FSUtils + .makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime4, 1)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, 1, fileId2)) + .createNewFile(); + new File(fullPartitionPath + 
FSUtils.makeDataFileName(commitTime2, 1, fileId2)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, 1, fileId2)) + .createNewFile(); + new File(fullPartitionPath + FSUtils + .makeLogFileName(fileId2, HoodieLogFile.DELTA_EXTENSION, commitTime3, 0)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, 1, fileId3)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, 1, fileId3)) + .createNewFile(); + new File(fullPartitionPath + FSUtils + .makeLogFileName(fileId4, HoodieLogFile.DELTA_EXTENSION, commitTime4, 0)) + .createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, 1, fileId1)) - .createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, 1, fileId1)) - .createNewFile(); - new File(fullPartitionPath + FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime4, 0)) - .createNewFile(); - new File(fullPartitionPath + FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime4, 1)) - .createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, 1, fileId2)) - .createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime2, 1, fileId2)) - .createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, 1, fileId2)) - .createNewFile(); - new File(fullPartitionPath + FSUtils.makeLogFileName(fileId2, HoodieLogFile.DELTA_EXTENSION, commitTime3, 0)) - .createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, 1, fileId3)) - .createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, 1, fileId3)) - .createNewFile(); - new File(fullPartitionPath + FSUtils.makeLogFileName(fileId4, HoodieLogFile.DELTA_EXTENSION, commitTime4, 0)) - .createNewFile(); + new File(basePath + "/.hoodie/" + commitTime1 + ".commit").createNewFile(); + new File(basePath + 
"/.hoodie/" + commitTime2 + ".commit").createNewFile(); + new File(basePath + "/.hoodie/" + commitTime3 + ".commit").createNewFile(); + new File(basePath + "/.hoodie/" + commitTime4 + ".commit").createNewFile(); + // Now we list the entire partition + FileStatus[] statuses = HoodieTestUtils.fs.listStatus(new Path(fullPartitionPath)); + assertEquals(11, statuses.length); + refreshFsView(null); - new File(basePath + "/.hoodie/" + commitTime1 + ".commit").createNewFile(); - new File(basePath + "/.hoodie/" + commitTime2 + ".commit").createNewFile(); - new File(basePath + "/.hoodie/" + commitTime3 + ".commit").createNewFile(); - new File(basePath + "/.hoodie/" + commitTime4 + ".commit").createNewFile(); + // Check files as of lastest commit. + List allSlices = rtView.getAllFileSlices("2016/05/01").collect(Collectors.toList()); + assertEquals(8, allSlices.size()); + Map fileSliceMap = allSlices.stream().collect(Collectors.groupingBy( + slice -> slice.getFileId(), Collectors.counting())); + assertEquals(2, fileSliceMap.get(fileId1).longValue()); + assertEquals(3, fileSliceMap.get(fileId2).longValue()); + assertEquals(2, fileSliceMap.get(fileId3).longValue()); + assertEquals(1, fileSliceMap.get(fileId4).longValue()); - // Now we list the entire partition - FileStatus[] statuses = HoodieTestUtils.fs.listStatus(new Path(fullPartitionPath)); - assertEquals(11, statuses.length); - refreshFsView(null); - - // Check files as of lastest commit. 
- List allSlices = rtView.getAllFileSlices("2016/05/01").collect(Collectors.toList()); - assertEquals(8, allSlices.size()); - Map fileSliceMap = allSlices.stream().collect(Collectors.groupingBy( - slice -> slice.getFileId(), Collectors.counting())); - assertEquals(2, fileSliceMap.get(fileId1).longValue()); - assertEquals(3, fileSliceMap.get(fileId2).longValue()); - assertEquals(2, fileSliceMap.get(fileId3).longValue()); - assertEquals(1, fileSliceMap.get(fileId4).longValue()); - - - List dataFileList = - roView.getLatestDataFilesBeforeOrOn("2016/05/01", commitTime4) - .collect(Collectors.toList()); - assertEquals(3, dataFileList.size()); - Set filenames = Sets.newHashSet(); - for (HoodieDataFile status : dataFileList) { - filenames.add(status.getFileName()); - } - assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime4, 1, fileId1))); - assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, 1, fileId2))); - assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime4, 1, fileId3))); - - filenames = Sets.newHashSet(); - List logFilesList = - rtView.getLatestFileSlicesBeforeOrOn("2016/05/01", commitTime4) - .map(slice -> slice.getLogFiles()) - .flatMap(logFileList -> logFileList) - .collect(Collectors.toList()); - assertEquals(logFilesList.size(), 4); - for (HoodieLogFile logFile: logFilesList) { - filenames.add(logFile.getFileName()); - } - assertTrue(filenames.contains(FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime4, 0))); - assertTrue(filenames.contains(FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime4, 1))); - assertTrue(filenames.contains(FSUtils.makeLogFileName(fileId2, HoodieLogFile.DELTA_EXTENSION, commitTime3, 0))); - assertTrue(filenames.contains(FSUtils.makeLogFileName(fileId4, HoodieLogFile.DELTA_EXTENSION, commitTime4, 0))); - - // Reset the max commit time - List dataFiles = - roView.getLatestDataFilesBeforeOrOn("2016/05/01", commitTime3) - .collect(Collectors.toList()); 
- assertEquals(dataFiles.size(), 3); - filenames = Sets.newHashSet(); - for (HoodieDataFile status : dataFiles) { - filenames.add(status.getFileName()); - } - assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime1, 1, fileId1))); - assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, 1, fileId2))); - assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, 1, fileId3))); - - logFilesList = - rtView.getLatestFileSlicesBeforeOrOn("2016/05/01", commitTime3) - .map(slice -> slice.getLogFiles()) - .flatMap(logFileList -> logFileList).collect(Collectors.toList()); - assertEquals(logFilesList.size(), 1); - assertTrue(logFilesList.get(0).getFileName().equals(FSUtils.makeLogFileName(fileId2, HoodieLogFile.DELTA_EXTENSION, commitTime3, 0))); - } - - @Test - public void testStreamEveryVersionInPartition() throws IOException { - // Put some files in the partition - String fullPartitionPath = basePath + "/2016/05/01/"; - new File(fullPartitionPath).mkdirs(); - String commitTime1 = "1"; - String commitTime2 = "2"; - String commitTime3 = "3"; - String commitTime4 = "4"; - String fileId1 = UUID.randomUUID().toString(); - String fileId2 = UUID.randomUUID().toString(); - String fileId3 = UUID.randomUUID().toString(); - - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, 1, fileId1)) - .createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, 1, fileId1)) - .createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, 1, fileId2)) - .createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime2, 1, fileId2)) - .createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, 1, fileId2)) - .createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, 1, fileId3)) - .createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, 1, fileId3)) - .createNewFile(); - - new File(basePath + 
"/.hoodie/" + commitTime1 + ".commit").createNewFile(); - new File(basePath + "/.hoodie/" + commitTime2 + ".commit").createNewFile(); - new File(basePath + "/.hoodie/" + commitTime3 + ".commit").createNewFile(); - new File(basePath + "/.hoodie/" + commitTime4 + ".commit").createNewFile(); - - // Now we list the entire partition - FileStatus[] statuses = HoodieTestUtils.fs.listStatus(new Path(fullPartitionPath)); - assertEquals(7, statuses.length); - - refreshFsView(null); - List fileGroups = - fsView.getAllFileGroups("2016/05/01").collect(Collectors.toList()); - assertEquals(3, fileGroups.size()); - - for (HoodieFileGroup fileGroup : fileGroups) { - String fileId = fileGroup.getId(); - Set filenames = Sets.newHashSet(); - fileGroup.getAllDataFiles().forEach(dataFile -> { - assertEquals("All same fileId should be grouped", fileId, dataFile.getFileId()); - filenames.add(dataFile.getFileName()); - }); - if (fileId.equals(fileId1)) { - assertEquals(filenames, - Sets.newHashSet(FSUtils.makeDataFileName(commitTime1, 1, fileId1), - FSUtils.makeDataFileName(commitTime4, 1, fileId1))); - } else if (fileId.equals(fileId2)) { - assertEquals(filenames, - Sets.newHashSet(FSUtils.makeDataFileName(commitTime1, 1, fileId2), - FSUtils.makeDataFileName(commitTime2, 1, fileId2), - FSUtils.makeDataFileName(commitTime3, 1, fileId2))); - } else { - assertEquals(filenames, - Sets.newHashSet(FSUtils.makeDataFileName(commitTime3, 1, fileId3), - FSUtils.makeDataFileName(commitTime4, 1, fileId3))); - } - } - } - - @Test - public void streamLatestVersionInRange() throws IOException { - // Put some files in the partition - String fullPartitionPath = basePath + "/2016/05/01/"; - new File(fullPartitionPath).mkdirs(); - String commitTime1 = "1"; - String commitTime2 = "2"; - String commitTime3 = "3"; - String commitTime4 = "4"; - String fileId1 = UUID.randomUUID().toString(); - String fileId2 = UUID.randomUUID().toString(); - String fileId3 = UUID.randomUUID().toString(); - - new 
File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, 1, fileId1)) - .createNewFile(); - new File(fullPartitionPath + FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime1, 0)) - .createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, 1, fileId1)) - .createNewFile(); - - - - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, 1, fileId2)) - .createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime2, 1, fileId2)) - .createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, 1, fileId2)) - .createNewFile(); - new File(fullPartitionPath + FSUtils.makeLogFileName(fileId2, HoodieLogFile.DELTA_EXTENSION, commitTime4, 0)) - .createNewFile(); - - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, 1, fileId3)) - .createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, 1, fileId3)) - .createNewFile(); - - - new File(basePath + "/.hoodie/" + commitTime1 + ".commit").createNewFile(); - new File(basePath + "/.hoodie/" + commitTime2 + ".commit").createNewFile(); - new File(basePath + "/.hoodie/" + commitTime3 + ".commit").createNewFile(); - new File(basePath + "/.hoodie/" + commitTime4 + ".commit").createNewFile(); - - // Now we list the entire partition - FileStatus[] statuses = HoodieTestUtils.fs.listStatus(new Path(fullPartitionPath)); - assertEquals(9, statuses.length); - - refreshFsView(statuses); - List dataFiles = roView - .getLatestDataFilesInRange(Lists.newArrayList(commitTime2, commitTime3)) + List dataFileList = + roView.getLatestDataFilesBeforeOrOn("2016/05/01", commitTime4) .collect(Collectors.toList()); - assertEquals(3, dataFiles.size()); - Set filenames = Sets.newHashSet(); - for (HoodieDataFile status : dataFiles) { - filenames.add(status.getFileName()); - } - assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, 1, fileId2))); - 
assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, 1, fileId3))); + assertEquals(3, dataFileList.size()); + Set filenames = Sets.newHashSet(); + for (HoodieDataFile status : dataFileList) { + filenames.add(status.getFileName()); + } + assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime4, 1, fileId1))); + assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, 1, fileId2))); + assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime4, 1, fileId3))); + filenames = Sets.newHashSet(); + List logFilesList = + rtView.getLatestFileSlicesBeforeOrOn("2016/05/01", commitTime4) + .map(slice -> slice.getLogFiles()) + .flatMap(logFileList -> logFileList) + .collect(Collectors.toList()); + assertEquals(logFilesList.size(), 4); + for (HoodieLogFile logFile : logFilesList) { + filenames.add(logFile.getFileName()); + } + assertTrue(filenames + .contains(FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime4, 0))); + assertTrue(filenames + .contains(FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime4, 1))); + assertTrue(filenames + .contains(FSUtils.makeLogFileName(fileId2, HoodieLogFile.DELTA_EXTENSION, commitTime3, 0))); + assertTrue(filenames + .contains(FSUtils.makeLogFileName(fileId4, HoodieLogFile.DELTA_EXTENSION, commitTime4, 0))); - List slices = rtView - .getLatestFileSliceInRange(Lists.newArrayList(commitTime3, commitTime4)) - .collect(Collectors.toList()); + // Reset the max commit time + List dataFiles = + roView.getLatestDataFilesBeforeOrOn("2016/05/01", commitTime3) + .collect(Collectors.toList()); + assertEquals(dataFiles.size(), 3); + filenames = Sets.newHashSet(); + for (HoodieDataFile status : dataFiles) { + filenames.add(status.getFileName()); + } + assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime1, 1, fileId1))); + assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, 1, fileId2))); + 
assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, 1, fileId3))); + + logFilesList = + rtView.getLatestFileSlicesBeforeOrOn("2016/05/01", commitTime3) + .map(slice -> slice.getLogFiles()) + .flatMap(logFileList -> logFileList).collect(Collectors.toList()); + assertEquals(logFilesList.size(), 1); + assertTrue(logFilesList.get(0).getFileName() + .equals(FSUtils.makeLogFileName(fileId2, HoodieLogFile.DELTA_EXTENSION, commitTime3, 0))); + } + + @Test + public void testStreamEveryVersionInPartition() throws IOException { + // Put some files in the partition + String fullPartitionPath = basePath + "/2016/05/01/"; + new File(fullPartitionPath).mkdirs(); + String commitTime1 = "1"; + String commitTime2 = "2"; + String commitTime3 = "3"; + String commitTime4 = "4"; + String fileId1 = UUID.randomUUID().toString(); + String fileId2 = UUID.randomUUID().toString(); + String fileId3 = UUID.randomUUID().toString(); + + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, 1, fileId1)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, 1, fileId1)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, 1, fileId2)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime2, 1, fileId2)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, 1, fileId2)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, 1, fileId3)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, 1, fileId3)) + .createNewFile(); + + new File(basePath + "/.hoodie/" + commitTime1 + ".commit").createNewFile(); + new File(basePath + "/.hoodie/" + commitTime2 + ".commit").createNewFile(); + new File(basePath + "/.hoodie/" + commitTime3 + ".commit").createNewFile(); + new File(basePath + "/.hoodie/" + commitTime4 + ".commit").createNewFile(); + + // Now we list the entire 
partition + FileStatus[] statuses = HoodieTestUtils.fs.listStatus(new Path(fullPartitionPath)); + assertEquals(7, statuses.length); + + refreshFsView(null); + List fileGroups = + fsView.getAllFileGroups("2016/05/01").collect(Collectors.toList()); + assertEquals(3, fileGroups.size()); + + for (HoodieFileGroup fileGroup : fileGroups) { + String fileId = fileGroup.getId(); + Set filenames = Sets.newHashSet(); + fileGroup.getAllDataFiles().forEach(dataFile -> { + assertEquals("All same fileId should be grouped", fileId, dataFile.getFileId()); + filenames.add(dataFile.getFileName()); + }); + if (fileId.equals(fileId1)) { + assertEquals(filenames, + Sets.newHashSet(FSUtils.makeDataFileName(commitTime1, 1, fileId1), + FSUtils.makeDataFileName(commitTime4, 1, fileId1))); + } else if (fileId.equals(fileId2)) { + assertEquals(filenames, + Sets.newHashSet(FSUtils.makeDataFileName(commitTime1, 1, fileId2), + FSUtils.makeDataFileName(commitTime2, 1, fileId2), + FSUtils.makeDataFileName(commitTime3, 1, fileId2))); + } else { + assertEquals(filenames, + Sets.newHashSet(FSUtils.makeDataFileName(commitTime3, 1, fileId3), + FSUtils.makeDataFileName(commitTime4, 1, fileId3))); + } + } + } + + @Test + public void streamLatestVersionInRange() throws IOException { + // Put some files in the partition + String fullPartitionPath = basePath + "/2016/05/01/"; + new File(fullPartitionPath).mkdirs(); + String commitTime1 = "1"; + String commitTime2 = "2"; + String commitTime3 = "3"; + String commitTime4 = "4"; + String fileId1 = UUID.randomUUID().toString(); + String fileId2 = UUID.randomUUID().toString(); + String fileId3 = UUID.randomUUID().toString(); + + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, 1, fileId1)) + .createNewFile(); + new File(fullPartitionPath + FSUtils + .makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime1, 0)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, 1, fileId1)) + .createNewFile(); + + 
new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, 1, fileId2)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime2, 1, fileId2)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, 1, fileId2)) + .createNewFile(); + new File(fullPartitionPath + FSUtils + .makeLogFileName(fileId2, HoodieLogFile.DELTA_EXTENSION, commitTime4, 0)) + .createNewFile(); + + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, 1, fileId3)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, 1, fileId3)) + .createNewFile(); + + new File(basePath + "/.hoodie/" + commitTime1 + ".commit").createNewFile(); + new File(basePath + "/.hoodie/" + commitTime2 + ".commit").createNewFile(); + new File(basePath + "/.hoodie/" + commitTime3 + ".commit").createNewFile(); + new File(basePath + "/.hoodie/" + commitTime4 + ".commit").createNewFile(); + + // Now we list the entire partition + FileStatus[] statuses = HoodieTestUtils.fs.listStatus(new Path(fullPartitionPath)); + assertEquals(9, statuses.length); + + refreshFsView(statuses); + List dataFiles = roView + .getLatestDataFilesInRange(Lists.newArrayList(commitTime2, commitTime3)) + .collect(Collectors.toList()); + assertEquals(3, dataFiles.size()); + Set filenames = Sets.newHashSet(); + for (HoodieDataFile status : dataFiles) { + filenames.add(status.getFileName()); + } + assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, 1, fileId2))); + assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, 1, fileId3))); + + List slices = rtView + .getLatestFileSliceInRange(Lists.newArrayList(commitTime3, commitTime4)) + .collect(Collectors.toList()); + assertEquals(3, slices.size()); + for (FileSlice slice : slices) { + if (slice.getFileId().equals(fileId1)) { + assertEquals(slice.getBaseCommitTime(), commitTime3); + assertTrue(slice.getDataFile().isPresent()); + 
assertEquals(slice.getLogFiles().count(), 0); + } else if (slice.getFileId().equals(fileId2)) { + assertEquals(slice.getBaseCommitTime(), commitTime4); + assertFalse(slice.getDataFile().isPresent()); + assertEquals(slice.getLogFiles().count(), 1); + } else if (slice.getFileId().equals(fileId3)) { + assertEquals(slice.getBaseCommitTime(), commitTime4); + assertTrue(slice.getDataFile().isPresent()); + assertEquals(slice.getLogFiles().count(), 0); + } + } + } + + @Test + public void streamLatestVersionsBefore() throws IOException { + // Put some files in the partition + String partitionPath = "2016/05/01/"; + String fullPartitionPath = basePath + "/" + partitionPath; + new File(fullPartitionPath).mkdirs(); + String commitTime1 = "1"; + String commitTime2 = "2"; + String commitTime3 = "3"; + String commitTime4 = "4"; + String fileId1 = UUID.randomUUID().toString(); + String fileId2 = UUID.randomUUID().toString(); + String fileId3 = UUID.randomUUID().toString(); + + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, 1, fileId1)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, 1, fileId1)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, 1, fileId2)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime2, 1, fileId2)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, 1, fileId2)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, 1, fileId3)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, 1, fileId3)) + .createNewFile(); + + new File(basePath + "/.hoodie/" + commitTime1 + ".commit").createNewFile(); + new File(basePath + "/.hoodie/" + commitTime2 + ".commit").createNewFile(); + new File(basePath + "/.hoodie/" + commitTime3 + ".commit").createNewFile(); + new File(basePath + "/.hoodie/" + commitTime4 + 
".commit").createNewFile(); + + // Now we list the entire partition + FileStatus[] statuses = HoodieTestUtils.fs.listStatus(new Path(fullPartitionPath)); + assertEquals(7, statuses.length); + + refreshFsView(null); + List dataFiles = + roView.getLatestDataFilesBeforeOrOn(partitionPath, commitTime2) + .collect(Collectors.toList()); + assertEquals(2, dataFiles.size()); + Set filenames = Sets.newHashSet(); + for (HoodieDataFile status : dataFiles) { + filenames.add(status.getFileName()); + } + assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime1, 1, fileId1))); + assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime2, 1, fileId2))); + } + + @Test + public void streamLatestVersions() throws IOException { + // Put some files in the partition + String partitionPath = "2016/05/01/"; + String fullPartitionPath = basePath + "/" + partitionPath; + new File(fullPartitionPath).mkdirs(); + String commitTime1 = "1"; + String commitTime2 = "2"; + String commitTime3 = "3"; + String commitTime4 = "4"; + String fileId1 = UUID.randomUUID().toString(); + String fileId2 = UUID.randomUUID().toString(); + String fileId3 = UUID.randomUUID().toString(); + + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, 1, fileId1)) + .createNewFile(); + new File(fullPartitionPath + FSUtils + .makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime1, 0)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, 1, fileId1)) + .createNewFile(); + new File(fullPartitionPath + FSUtils + .makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime4, 0)) + .createNewFile(); + + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, 1, fileId2)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime2, 1, fileId2)) + .createNewFile(); + new File(fullPartitionPath + FSUtils + .makeLogFileName(fileId2, HoodieLogFile.DELTA_EXTENSION, commitTime2, 0)) + .createNewFile(); + new 
File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, 1, fileId2)) + .createNewFile(); + + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, 1, fileId3)) + .createNewFile(); + new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, 1, fileId3)) + .createNewFile(); + + new File(basePath + "/.hoodie/" + commitTime1 + ".commit").createNewFile(); + new File(basePath + "/.hoodie/" + commitTime2 + ".commit").createNewFile(); + new File(basePath + "/.hoodie/" + commitTime3 + ".commit").createNewFile(); + new File(basePath + "/.hoodie/" + commitTime4 + ".commit").createNewFile(); + + // Now we list the entire partition + FileStatus[] statuses = HoodieTestUtils.fs.listStatus(new Path(fullPartitionPath)); + assertEquals(10, statuses.length); + + refreshFsView(statuses); + + List fileGroups = fsView + .getAllFileGroups(partitionPath) + .collect(Collectors.toList()); + assertEquals(3, fileGroups.size()); + for (HoodieFileGroup fileGroup : fileGroups) { + List slices = fileGroup.getAllFileSlices().collect(Collectors.toList()); + if (fileGroup.getId().equals(fileId1)) { + assertEquals(2, slices.size()); + assertEquals(commitTime4, slices.get(0).getBaseCommitTime()); + assertEquals(commitTime1, slices.get(1).getBaseCommitTime()); + } else if (fileGroup.getId().equals(fileId2)) { assertEquals(3, slices.size()); - for (FileSlice slice: slices) { - if (slice.getFileId().equals(fileId1)) { - assertEquals(slice.getBaseCommitTime(), commitTime3); - assertTrue(slice.getDataFile().isPresent()); - assertEquals(slice.getLogFiles().count(), 0); - } else if (slice.getFileId().equals(fileId2)) { - assertEquals(slice.getBaseCommitTime(), commitTime4); - assertFalse(slice.getDataFile().isPresent()); - assertEquals(slice.getLogFiles().count(), 1); - } else if (slice.getFileId().equals(fileId3)) { - assertEquals(slice.getBaseCommitTime(), commitTime4); - assertTrue(slice.getDataFile().isPresent()); - assertEquals(slice.getLogFiles().count(), 0); - } - } + 
assertEquals(commitTime3, slices.get(0).getBaseCommitTime()); + assertEquals(commitTime2, slices.get(1).getBaseCommitTime()); + assertEquals(commitTime1, slices.get(2).getBaseCommitTime()); + } else if (fileGroup.getId().equals(fileId3)) { + assertEquals(2, slices.size()); + assertEquals(commitTime4, slices.get(0).getBaseCommitTime()); + assertEquals(commitTime3, slices.get(1).getBaseCommitTime()); + } } - @Test - public void streamLatestVersionsBefore() throws IOException { - // Put some files in the partition - String partitionPath = "2016/05/01/"; - String fullPartitionPath = basePath + "/" + partitionPath; - new File(fullPartitionPath).mkdirs(); - String commitTime1 = "1"; - String commitTime2 = "2"; - String commitTime3 = "3"; - String commitTime4 = "4"; - String fileId1 = UUID.randomUUID().toString(); - String fileId2 = UUID.randomUUID().toString(); - String fileId3 = UUID.randomUUID().toString(); - - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, 1, fileId1)) - .createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, 1, fileId1)) - .createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, 1, fileId2)) - .createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime2, 1, fileId2)) - .createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, 1, fileId2)) - .createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, 1, fileId3)) - .createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, 1, fileId3)) - .createNewFile(); - - new File(basePath + "/.hoodie/" + commitTime1 + ".commit").createNewFile(); - new File(basePath + "/.hoodie/" + commitTime2 + ".commit").createNewFile(); - new File(basePath + "/.hoodie/" + commitTime3 + ".commit").createNewFile(); - new File(basePath + "/.hoodie/" + commitTime4 + ".commit").createNewFile(); - - // Now we list the entire partition - 
FileStatus[] statuses = HoodieTestUtils.fs.listStatus(new Path(fullPartitionPath)); - assertEquals(7, statuses.length); - - refreshFsView(null); - List dataFiles = - roView.getLatestDataFilesBeforeOrOn(partitionPath, commitTime2) - .collect(Collectors.toList()); - assertEquals(2, dataFiles.size()); - Set filenames = Sets.newHashSet(); - for (HoodieDataFile status : dataFiles) { - filenames.add(status.getFileName()); - } - assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime1, 1, fileId1))); - assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime2, 1, fileId2))); - } - - @Test - public void streamLatestVersions() throws IOException { - // Put some files in the partition - String partitionPath = "2016/05/01/"; - String fullPartitionPath = basePath + "/" + partitionPath; - new File(fullPartitionPath).mkdirs(); - String commitTime1 = "1"; - String commitTime2 = "2"; - String commitTime3 = "3"; - String commitTime4 = "4"; - String fileId1 = UUID.randomUUID().toString(); - String fileId2 = UUID.randomUUID().toString(); - String fileId3 = UUID.randomUUID().toString(); - - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, 1, fileId1)) - .createNewFile(); - new File(fullPartitionPath + FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime1, 0)) - .createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, 1, fileId1)) - .createNewFile(); - new File(fullPartitionPath + FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime4, 0)) - .createNewFile(); - - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, 1, fileId2)) - .createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime2, 1, fileId2)) - .createNewFile(); - new File(fullPartitionPath + FSUtils.makeLogFileName(fileId2, HoodieLogFile.DELTA_EXTENSION, commitTime2, 0)) - .createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, 1, fileId2)) - 
.createNewFile(); - - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, 1, fileId3)) - .createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, 1, fileId3)) - .createNewFile(); - - - new File(basePath + "/.hoodie/" + commitTime1 + ".commit").createNewFile(); - new File(basePath + "/.hoodie/" + commitTime2 + ".commit").createNewFile(); - new File(basePath + "/.hoodie/" + commitTime3 + ".commit").createNewFile(); - new File(basePath + "/.hoodie/" + commitTime4 + ".commit").createNewFile(); - - // Now we list the entire partition - FileStatus[] statuses = HoodieTestUtils.fs.listStatus(new Path(fullPartitionPath)); - assertEquals(10, statuses.length); - - refreshFsView(statuses); - - List fileGroups = fsView - .getAllFileGroups(partitionPath) - .collect(Collectors.toList()); - assertEquals(3, fileGroups.size()); - for (HoodieFileGroup fileGroup: fileGroups) { - List slices = fileGroup.getAllFileSlices().collect(Collectors.toList()); - if (fileGroup.getId().equals(fileId1)) { - assertEquals(2, slices.size()); - assertEquals(commitTime4, slices.get(0).getBaseCommitTime()); - assertEquals(commitTime1, slices.get(1).getBaseCommitTime()); - } else if (fileGroup.getId().equals(fileId2)) { - assertEquals(3, slices.size()); - assertEquals(commitTime3, slices.get(0).getBaseCommitTime()); - assertEquals(commitTime2, slices.get(1).getBaseCommitTime()); - assertEquals(commitTime1, slices.get(2).getBaseCommitTime()); - } else if (fileGroup.getId().equals(fileId3)) { - assertEquals(2, slices.size()); - assertEquals(commitTime4, slices.get(0).getBaseCommitTime()); - assertEquals(commitTime3, slices.get(1).getBaseCommitTime()); - } - } - - List statuses1 = - roView.getLatestDataFiles().collect(Collectors.toList()); - assertEquals(3, statuses1.size()); - Set filenames = Sets.newHashSet(); - for (HoodieDataFile status : statuses1) { - filenames.add(status.getFileName()); - } - 
assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime4, 1, fileId1))); - assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, 1, fileId2))); - assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime4, 1, fileId3))); + List statuses1 = + roView.getLatestDataFiles().collect(Collectors.toList()); + assertEquals(3, statuses1.size()); + Set filenames = Sets.newHashSet(); + for (HoodieDataFile status : statuses1) { + filenames.add(status.getFileName()); } + assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime4, 1, fileId1))); + assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, 1, fileId2))); + assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime4, 1, fileId3))); + } } diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/util/SchemaTestUtil.java b/hoodie-common/src/test/java/com/uber/hoodie/common/util/SchemaTestUtil.java index e42137391..9af169294 100644 --- a/hoodie-common/src/test/java/com/uber/hoodie/common/util/SchemaTestUtil.java +++ b/hoodie-common/src/test/java/com/uber/hoodie/common/util/SchemaTestUtil.java @@ -19,111 +19,111 @@ package com.uber.hoodie.common.util; import com.uber.hoodie.avro.MercifulJsonConverter; import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.exception.HoodieIOException; +import java.io.IOException; import java.net.URI; +import java.net.URISyntaxException; import java.nio.file.FileSystem; import java.nio.file.FileSystemNotFoundException; import java.nio.file.FileSystems; +import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.Paths; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.UUID; +import java.util.stream.Collectors; +import java.util.stream.Stream; import org.apache.avro.Schema; import org.apache.avro.generic.GenericDatumReader; import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; import org.apache.avro.io.DecoderFactory; 
-import java.io.IOException; -import java.net.URISyntaxException; -import java.nio.file.Files; -import java.nio.file.Paths; -import java.util.List; -import java.util.stream.Collectors; -import java.util.stream.Stream; - public class SchemaTestUtil { - public static Schema getSimpleSchema() throws IOException { - return new Schema.Parser() - .parse(SchemaTestUtil.class.getResourceAsStream("/simple-test.avro")); + + public static Schema getSimpleSchema() throws IOException { + return new Schema.Parser() + .parse(SchemaTestUtil.class.getResourceAsStream("/simple-test.avro")); + } + + public static List generateTestRecords(int from, int limit) + throws IOException, URISyntaxException { + return toRecords(getSimpleSchema(), getSimpleSchema(), from, limit); + } + + private static List toRecords(Schema writerSchema, Schema readerSchema, int from, + int limit) throws IOException, URISyntaxException { + GenericDatumReader reader = + new GenericDatumReader<>(writerSchema, readerSchema); + // Required to register the necessary JAR:// file system + URI resource = SchemaTestUtil.class.getClass().getResource("/sample.data").toURI(); + Path dataPath; + if (resource.toString().contains("!")) { + dataPath = uriToPath(resource); + } else { + dataPath = Paths.get(SchemaTestUtil.class.getClass().getResource("/sample.data").toURI()); } - public static List generateTestRecords(int from, int limit) - throws IOException, URISyntaxException { - return toRecords(getSimpleSchema(), getSimpleSchema(), from, limit); - } - - private static List toRecords(Schema writerSchema, Schema readerSchema, int from, - int limit) throws IOException, URISyntaxException { - GenericDatumReader reader = - new GenericDatumReader<>(writerSchema, readerSchema); - // Required to register the necessary JAR:// file system - URI resource = SchemaTestUtil.class.getClass().getResource("/sample.data").toURI(); - Path dataPath; - if(resource.toString().contains("!")) { - dataPath = uriToPath(resource); - } else { - 
dataPath = Paths.get(SchemaTestUtil.class.getClass().getResource("/sample.data").toURI()); - } - - try (Stream stream = Files.lines(dataPath)) { - return stream.skip(from).limit(limit).map(s -> { - try { - return reader.read(null, DecoderFactory.get().jsonDecoder(writerSchema, s)); - } catch (IOException e) { - throw new HoodieIOException("Could not read data from simple_data.json", e); - } - }).collect(Collectors.toList()); - } catch (IOException e) { - throw new HoodieIOException("Could not read data from simple_data.json", e); - } - } - - static Path uriToPath(URI uri) throws IOException { - final Map env = new HashMap<>(); - final String[] array = uri.toString().split("!"); - FileSystem fs; + try (Stream stream = Files.lines(dataPath)) { + return stream.skip(from).limit(limit).map(s -> { try { - fs = FileSystems.getFileSystem(URI.create(array[0])); - } catch (FileSystemNotFoundException e) { - fs = FileSystems.newFileSystem(URI.create(array[0]), env); + return reader.read(null, DecoderFactory.get().jsonDecoder(writerSchema, s)); + } catch (IOException e) { + throw new HoodieIOException("Could not read data from simple_data.json", e); } - return fs.getPath(array[1]); + }).collect(Collectors.toList()); + } catch (IOException e) { + throw new HoodieIOException("Could not read data from simple_data.json", e); } + } - public static List generateHoodieTestRecords(int from, int limit) - throws IOException, URISyntaxException { - List records = generateTestRecords(from, limit); - Schema hoodieFieldsSchema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); - return records.stream() - .map(s -> HoodieAvroUtils.rewriteRecord((GenericRecord) s, hoodieFieldsSchema)) - .map(p -> { - p.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, UUID.randomUUID().toString()); - p.put(HoodieRecord.PARTITION_PATH_METADATA_FIELD, "0000/00/00"); - return p; - }).collect( - Collectors.toList()); - + static Path uriToPath(URI uri) throws IOException { + final Map env = new HashMap<>(); + final 
String[] array = uri.toString().split("!"); + FileSystem fs; + try { + fs = FileSystems.getFileSystem(URI.create(array[0])); + } catch (FileSystemNotFoundException e) { + fs = FileSystems.newFileSystem(URI.create(array[0]), env); } + return fs.getPath(array[1]); + } - public static Schema getEvolvedSchema() throws IOException { - return new Schema.Parser() - .parse(SchemaTestUtil.class.getResourceAsStream("/simple-test-evolved.avro")); - } + public static List generateHoodieTestRecords(int from, int limit) + throws IOException, URISyntaxException { + List records = generateTestRecords(from, limit); + Schema hoodieFieldsSchema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); + return records.stream() + .map(s -> HoodieAvroUtils.rewriteRecord((GenericRecord) s, hoodieFieldsSchema)) + .map(p -> { + p.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, UUID.randomUUID().toString()); + p.put(HoodieRecord.PARTITION_PATH_METADATA_FIELD, "0000/00/00"); + return p; + }).collect( + Collectors.toList()); - public static List generateEvolvedTestRecords(int from, int limit) - throws IOException, URISyntaxException { - return toRecords(getSimpleSchema(), getEvolvedSchema(), from, limit); - } + } - public static Schema getComplexEvolvedSchema() throws IOException { - return new Schema.Parser() - .parse(SchemaTestUtil.class.getResourceAsStream("/complex-test-evolved.avro")); - } + public static Schema getEvolvedSchema() throws IOException { + return new Schema.Parser() + .parse(SchemaTestUtil.class.getResourceAsStream("/simple-test-evolved.avro")); + } - public static GenericRecord generateAvroRecordFromJson(Schema schema, int recordNumber, - String commitTime, String fileId) throws IOException { - TestRecord record = new TestRecord(commitTime, recordNumber, fileId); - MercifulJsonConverter converter = new MercifulJsonConverter(schema); - return converter.convert(record.toJsonString()); - } + public static List generateEvolvedTestRecords(int from, int limit) + throws IOException, 
URISyntaxException { + return toRecords(getSimpleSchema(), getEvolvedSchema(), from, limit); + } + + public static Schema getComplexEvolvedSchema() throws IOException { + return new Schema.Parser() + .parse(SchemaTestUtil.class.getResourceAsStream("/complex-test-evolved.avro")); + } + + public static GenericRecord generateAvroRecordFromJson(Schema schema, int recordNumber, + String commitTime, String fileId) throws IOException { + TestRecord record = new TestRecord(commitTime, recordNumber, fileId); + MercifulJsonConverter converter = new MercifulJsonConverter(schema); + return converter.convert(record.toJsonString()); + } } diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/util/TestFSUtils.java b/hoodie-common/src/test/java/com/uber/hoodie/common/util/TestFSUtils.java index c5d19b50e..edcc1509b 100644 --- a/hoodie-common/src/test/java/com/uber/hoodie/common/util/TestFSUtils.java +++ b/hoodie-common/src/test/java/com/uber/hoodie/common/util/TestFSUtils.java @@ -16,48 +16,47 @@ package com.uber.hoodie.common.util; -import org.junit.Test; +import static org.junit.Assert.assertTrue; import java.text.SimpleDateFormat; import java.util.Date; import java.util.UUID; - -import static org.junit.Assert.assertTrue; +import org.junit.Test; public class TestFSUtils { - @Test - public void testMakeDataFileName() { - String commitTime = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date()); - int taskPartitionId = 2; - String fileName = UUID.randomUUID().toString(); - assertTrue(FSUtils.makeDataFileName(commitTime, taskPartitionId, fileName) - .equals(fileName + "_" + taskPartitionId + "_" + commitTime + ".parquet")); - } + @Test + public void testMakeDataFileName() { + String commitTime = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date()); + int taskPartitionId = 2; + String fileName = UUID.randomUUID().toString(); + assertTrue(FSUtils.makeDataFileName(commitTime, taskPartitionId, fileName) + .equals(fileName + "_" + taskPartitionId + "_" + commitTime + 
".parquet")); + } - @Test - public void testMaskFileName() { - String commitTime = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date()); - int taskPartitionId = 2; - assertTrue(FSUtils.maskWithoutFileId(commitTime, taskPartitionId) - .equals("*_" + taskPartitionId + "_" + commitTime + ".parquet")); - } + @Test + public void testMaskFileName() { + String commitTime = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date()); + int taskPartitionId = 2; + assertTrue(FSUtils.maskWithoutFileId(commitTime, taskPartitionId) + .equals("*_" + taskPartitionId + "_" + commitTime + ".parquet")); + } - @Test - public void testGetCommitTime() { - String commitTime = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date()); - int taskPartitionId = 2; - String fileName = UUID.randomUUID().toString(); - String fullFileName = FSUtils.makeDataFileName(commitTime, taskPartitionId, fileName); - assertTrue(FSUtils.getCommitTime(fullFileName).equals(commitTime)); - } + @Test + public void testGetCommitTime() { + String commitTime = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date()); + int taskPartitionId = 2; + String fileName = UUID.randomUUID().toString(); + String fullFileName = FSUtils.makeDataFileName(commitTime, taskPartitionId, fileName); + assertTrue(FSUtils.getCommitTime(fullFileName).equals(commitTime)); + } - @Test - public void testGetFileNameWithoutMeta() { - String commitTime = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date()); - int taskPartitionId = 2; - String fileName = UUID.randomUUID().toString(); - String fullFileName = FSUtils.makeDataFileName(commitTime, taskPartitionId, fileName); - assertTrue(FSUtils.getFileId(fullFileName).equals(fileName)); - } + @Test + public void testGetFileNameWithoutMeta() { + String commitTime = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date()); + int taskPartitionId = 2; + String fileName = UUID.randomUUID().toString(); + String fullFileName = FSUtils.makeDataFileName(commitTime, taskPartitionId, fileName); + 
assertTrue(FSUtils.getFileId(fullFileName).equals(fileName)); + } } diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/util/TestNumericUtils.java b/hoodie-common/src/test/java/com/uber/hoodie/common/util/TestNumericUtils.java index 99f8a67f2..8a7141869 100644 --- a/hoodie-common/src/test/java/com/uber/hoodie/common/util/TestNumericUtils.java +++ b/hoodie-common/src/test/java/com/uber/hoodie/common/util/TestNumericUtils.java @@ -16,20 +16,22 @@ package com.uber.hoodie.common.util; +import static org.junit.Assert.assertTrue; + import org.junit.Test; -import static org.junit.Assert.*; public class TestNumericUtils { - @Test - public void testHumanReadableByteCount() { - assertTrue(NumericUtils.humanReadableByteCount(0).equals("0.0 B")); - assertTrue(NumericUtils.humanReadableByteCount(27).equals("27.0 B")); - assertTrue(NumericUtils.humanReadableByteCount(1023).equals("1023.0 B")); - assertTrue(NumericUtils.humanReadableByteCount(1024).equals("1.0 KB")); - assertTrue(NumericUtils.humanReadableByteCount(110592).equals("108.0 KB")); - assertTrue(NumericUtils.humanReadableByteCount(28991029248L).equals("27.0 GB")); - assertTrue(NumericUtils.humanReadableByteCount(1855425871872L).equals("1.7 TB")); - assertTrue(NumericUtils.humanReadableByteCount(9223372036854775807L).equals("8.0 EB")); - } + @Test + public void testHumanReadableByteCount() { + assertTrue(NumericUtils.humanReadableByteCount(0).equals("0.0 B")); + assertTrue(NumericUtils.humanReadableByteCount(27).equals("27.0 B")); + assertTrue(NumericUtils.humanReadableByteCount(1023).equals("1023.0 B")); + assertTrue(NumericUtils.humanReadableByteCount(1024).equals("1.0 KB")); + assertTrue(NumericUtils.humanReadableByteCount(110592).equals("108.0 KB")); + assertTrue(NumericUtils.humanReadableByteCount(28991029248L).equals("27.0 GB")); + assertTrue(NumericUtils.humanReadableByteCount(1855425871872L).equals("1.7 TB")); + assertTrue(NumericUtils.humanReadableByteCount(9223372036854775807L).equals("8.0 EB")); + 
+ } } diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/util/TestParquetUtils.java b/hoodie-common/src/test/java/com/uber/hoodie/common/util/TestParquetUtils.java index 4145ed6ed..266cb1158 100644 --- a/hoodie-common/src/test/java/com/uber/hoodie/common/util/TestParquetUtils.java +++ b/hoodie-common/src/test/java/com/uber/hoodie/common/util/TestParquetUtils.java @@ -16,10 +16,17 @@ package com.uber.hoodie.common.util; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + import com.uber.hoodie.avro.HoodieAvroWriteSupport; import com.uber.hoodie.common.BloomFilter; import com.uber.hoodie.common.model.HoodieRecord; - +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.UUID; import org.apache.avro.Schema; import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericRecord; @@ -31,62 +38,55 @@ import org.junit.Before; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.UUID; - -import static org.junit.Assert.*; - public class TestParquetUtils { - private String basePath; + private String basePath; - @Before - public void setup() throws IOException { - // Create a temp folder as the base path - TemporaryFolder folder = new TemporaryFolder(); - folder.create(); - basePath = folder.getRoot().getAbsolutePath(); + @Before + public void setup() throws IOException { + // Create a temp folder as the base path + TemporaryFolder folder = new TemporaryFolder(); + folder.create(); + basePath = folder.getRoot().getAbsolutePath(); + } + + @Test + public void testHoodieWriteSupport() throws Exception { + + List rowKeys = new ArrayList<>(); + for (int i = 0; i < 1000; i++) { + rowKeys.add(UUID.randomUUID().toString()); } - @Test - public void testHoodieWriteSupport() throws Exception { + // Write 
out a parquet file + Schema schema = HoodieAvroUtils.getRecordKeySchema(); + BloomFilter filter = new BloomFilter(1000, 0.0001); + HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport( + new AvroSchemaConverter().convert(schema), schema, filter); - List rowKeys = new ArrayList<>(); - for (int i = 0; i < 1000; i++) { - rowKeys.add(UUID.randomUUID().toString()); - } - - // Write out a parquet file - Schema schema = HoodieAvroUtils.getRecordKeySchema(); - BloomFilter filter = new BloomFilter(1000, 0.0001); - HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, filter); - - - String filePath = basePath + "/test.parquet"; - ParquetWriter writer = new ParquetWriter(new Path(filePath), - writeSupport, CompressionCodecName.GZIP, 120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE); - for (String rowKey : rowKeys) { - GenericRecord rec = new GenericData.Record(schema); - rec.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, rowKey); - writer.write(rec); - filter.add(rowKey); - } - writer.close(); - - - // Read and verify - List rowKeysInFile = new ArrayList<>(ParquetUtils.readRowKeysFromParquet(new Path(filePath))); - Collections.sort(rowKeysInFile); - Collections.sort(rowKeys); - - assertEquals("Did not read back the expected list of keys", rowKeys, rowKeysInFile); - BloomFilter filterInFile = ParquetUtils.readBloomFilterFromParquetMetadata(new Path(filePath)); - for (String rowKey : rowKeys) { - assertTrue("key should be found in bloom filter", filterInFile.mightContain(rowKey)); - } + String filePath = basePath + "/test.parquet"; + ParquetWriter writer = new ParquetWriter(new Path(filePath), + writeSupport, CompressionCodecName.GZIP, 120 * 1024 * 1024, + ParquetWriter.DEFAULT_PAGE_SIZE); + for (String rowKey : rowKeys) { + GenericRecord rec = new GenericData.Record(schema); + rec.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, rowKey); + writer.write(rec); + filter.add(rowKey); } + writer.close(); + + // 
Read and verify + List rowKeysInFile = new ArrayList<>( + ParquetUtils.readRowKeysFromParquet(new Path(filePath))); + Collections.sort(rowKeysInFile); + Collections.sort(rowKeys); + + assertEquals("Did not read back the expected list of keys", rowKeys, rowKeysInFile); + BloomFilter filterInFile = ParquetUtils.readBloomFilterFromParquetMetadata(new Path(filePath)); + for (String rowKey : rowKeys) { + assertTrue("key should be found in bloom filter", filterInFile.mightContain(rowKey)); + } + } } diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/util/TestRecord.java b/hoodie-common/src/test/java/com/uber/hoodie/common/util/TestRecord.java index 7852749a0..5df7f3252 100644 --- a/hoodie-common/src/test/java/com/uber/hoodie/common/util/TestRecord.java +++ b/hoodie-common/src/test/java/com/uber/hoodie/common/util/TestRecord.java @@ -17,82 +17,87 @@ package com.uber.hoodie.common.util; import com.fasterxml.jackson.annotation.JsonIgnoreProperties; -import org.codehaus.jackson.annotate.JsonAutoDetect; -import org.codehaus.jackson.annotate.JsonMethod; -import org.codehaus.jackson.map.ObjectMapper; - import java.io.IOException; import java.io.Serializable; import java.util.HashMap; import java.util.Map; +import org.codehaus.jackson.annotate.JsonAutoDetect; +import org.codehaus.jackson.annotate.JsonMethod; +import org.codehaus.jackson.map.ObjectMapper; @JsonIgnoreProperties(ignoreUnknown = true) @SuppressWarnings({"unused", "FieldCanBeLocal", "MismatchedQueryAndUpdateOfCollection"}) public class TestRecord implements Serializable { - class TestMapItemRecord implements Serializable { - private String item1; - private String item2; - TestMapItemRecord(String item1, String item2) { - this.item1 = item1; - this.item2 = item2; - } + class TestMapItemRecord implements Serializable { + + private String item1; + private String item2; + + TestMapItemRecord(String item1, String item2) { + this.item1 = item1; + this.item2 = item2; } + } - class TestNestedRecord implements 
Serializable { - private boolean isAdmin; - private String userId; + class TestNestedRecord implements Serializable { - TestNestedRecord(boolean isAdmin, String userId) { - this.isAdmin = isAdmin; - this.userId = userId; - } + private boolean isAdmin; + private String userId; + + TestNestedRecord(boolean isAdmin, String userId) { + this.isAdmin = isAdmin; + this.userId = userId; } + } - private String _hoodie_commit_time; - private String _hoodie_record_key; - private String _hoodie_partition_path; - private String _hoodie_file_name; - private String _hoodie_commit_seqno; + private String _hoodie_commit_time; + private String _hoodie_record_key; + private String _hoodie_partition_path; + private String _hoodie_file_name; + private String _hoodie_commit_seqno; - private String field1; - private String field2; - private String name; - private Integer favoriteIntNumber; - private Long favoriteNumber; - private Float favoriteFloatNumber; - private Double favoriteDoubleNumber; - private Map tags; - private TestNestedRecord testNestedRecord; - private String[] stringArray; + private String field1; + private String field2; + private String name; + private Integer favoriteIntNumber; + private Long favoriteNumber; + private Float favoriteFloatNumber; + private Double favoriteDoubleNumber; + private Map tags; + private TestNestedRecord testNestedRecord; + private String[] stringArray; - public TestRecord(String commitTime, int recordNumber, String fileId) { - this._hoodie_commit_time = commitTime; - this._hoodie_record_key = "key" + recordNumber; - this._hoodie_partition_path = commitTime; - this._hoodie_file_name = fileId; - this._hoodie_commit_seqno = commitTime + recordNumber; + public TestRecord(String commitTime, int recordNumber, String fileId) { + this._hoodie_commit_time = commitTime; + this._hoodie_record_key = "key" + recordNumber; + this._hoodie_partition_path = commitTime; + this._hoodie_file_name = fileId; + this._hoodie_commit_seqno = commitTime + recordNumber; 
- String commitTimeSuffix = "@" + commitTime; - int commitHashCode = commitTime.hashCode(); + String commitTimeSuffix = "@" + commitTime; + int commitHashCode = commitTime.hashCode(); - this.field1 = "field" + recordNumber; - this.field2 = "field" + recordNumber + commitTimeSuffix; - this.name = "name" + recordNumber; - this.favoriteIntNumber = recordNumber + commitHashCode; - this.favoriteNumber = (long)(recordNumber + commitHashCode); - this.favoriteFloatNumber = (float)((recordNumber + commitHashCode) / 1024.0); - this.favoriteDoubleNumber = (recordNumber + commitHashCode) / 1024.0; - this.tags = new HashMap<>(); - this.tags.put("mapItem1", new TestMapItemRecord("item" + recordNumber, "item" + recordNumber + commitTimeSuffix)); - this.tags.put("mapItem2", new TestMapItemRecord("item2" + recordNumber, "item2" + recordNumber + commitTimeSuffix)); - this.testNestedRecord = new TestNestedRecord(false, "UserId" + recordNumber + commitTimeSuffix); - this.stringArray = new String[]{"stringArray0" + commitTimeSuffix, "stringArray1" + commitTimeSuffix}; - } + this.field1 = "field" + recordNumber; + this.field2 = "field" + recordNumber + commitTimeSuffix; + this.name = "name" + recordNumber; + this.favoriteIntNumber = recordNumber + commitHashCode; + this.favoriteNumber = (long) (recordNumber + commitHashCode); + this.favoriteFloatNumber = (float) ((recordNumber + commitHashCode) / 1024.0); + this.favoriteDoubleNumber = (recordNumber + commitHashCode) / 1024.0; + this.tags = new HashMap<>(); + this.tags.put("mapItem1", + new TestMapItemRecord("item" + recordNumber, "item" + recordNumber + commitTimeSuffix)); + this.tags.put("mapItem2", + new TestMapItemRecord("item2" + recordNumber, "item2" + recordNumber + commitTimeSuffix)); + this.testNestedRecord = new TestNestedRecord(false, "UserId" + recordNumber + commitTimeSuffix); + this.stringArray = new String[]{"stringArray0" + commitTimeSuffix, + "stringArray1" + commitTimeSuffix}; + } - public String toJsonString() throws 
IOException { - ObjectMapper mapper = new ObjectMapper(); - mapper.setVisibility(JsonMethod.FIELD, JsonAutoDetect.Visibility.ANY); - return mapper.writerWithDefaultPrettyPrinter().writeValueAsString(this); - } + public String toJsonString() throws IOException { + ObjectMapper mapper = new ObjectMapper(); + mapper.setVisibility(JsonMethod.FIELD, JsonAutoDetect.Visibility.ANY); + return mapper.writerWithDefaultPrettyPrinter().writeValueAsString(this); + } } diff --git a/hoodie-common/src/test/resources/log4j-surefire.properties b/hoodie-common/src/test/resources/log4j-surefire.properties index 017045b23..ea3e93545 100644 --- a/hoodie-common/src/test/resources/log4j-surefire.properties +++ b/hoodie-common/src/test/resources/log4j-surefire.properties @@ -13,13 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. # - log4j.rootLogger=WARN, A1 log4j.category.com.uber=INFO log4j.category.com.uber.hoodie.table.log=WARN log4j.category.com.uber.hoodie.common.util=WARN log4j.category.org.apache.parquet.hadoop=WARN - # A1 is set to be a ConsoleAppender. log4j.appender.A1=org.apache.log4j.ConsoleAppender # A1 uses PatternLayout. diff --git a/hoodie-hadoop-mr/pom.xml b/hoodie-hadoop-mr/pom.xml index fe91cc91b..73ee95e51 100644 --- a/hoodie-hadoop-mr/pom.xml +++ b/hoodie-hadoop-mr/pom.xml @@ -15,7 +15,9 @@ ~ limitations under the License. 
--> - + hoodie com.uber.hoodie @@ -107,7 +109,8 @@ shade - ${project.build.directory}/dependency-reduced-pom.xml + ${project.build.directory}/dependency-reduced-pom.xml + true diff --git a/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/HoodieHiveUtil.java b/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/HoodieHiveUtil.java index 3eed58d67..12b4abf40 100644 --- a/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/HoodieHiveUtil.java +++ b/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/HoodieHiveUtil.java @@ -22,47 +22,48 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; public class HoodieHiveUtil { - public static final Logger LOG = - LogManager.getLogger(HoodieHiveUtil.class); - public static final String HOODIE_CONSUME_MODE_PATTERN = "hoodie.%s.consume.mode"; - public static final String HOODIE_START_COMMIT_PATTERN = "hoodie.%s.consume.start.timestamp"; - public static final String HOODIE_MAX_COMMIT_PATTERN = "hoodie.%s.consume.max.commits"; - public static final String INCREMENTAL_SCAN_MODE = "INCREMENTAL"; - public static final String LATEST_SCAN_MODE = "LATEST"; - public static final String DEFAULT_SCAN_MODE = LATEST_SCAN_MODE; - public static final int DEFAULT_MAX_COMMITS = 1; - public static final int MAX_COMMIT_ALL = -1; - public static final int DEFAULT_LEVELS_TO_BASEPATH = 3; + public static final Logger LOG = + LogManager.getLogger(HoodieHiveUtil.class); - public static Integer readMaxCommits(JobContext job, String tableName) { - String maxCommitName = String.format(HOODIE_MAX_COMMIT_PATTERN, tableName); - int maxCommits = job.getConfiguration().getInt(maxCommitName, DEFAULT_MAX_COMMITS); - if (maxCommits == MAX_COMMIT_ALL) { - maxCommits = Integer.MAX_VALUE; - } - LOG.info("Read max commits - " + maxCommits); - return maxCommits; + public static final String HOODIE_CONSUME_MODE_PATTERN = "hoodie.%s.consume.mode"; + public static final String HOODIE_START_COMMIT_PATTERN = "hoodie.%s.consume.start.timestamp"; + 
public static final String HOODIE_MAX_COMMIT_PATTERN = "hoodie.%s.consume.max.commits"; + public static final String INCREMENTAL_SCAN_MODE = "INCREMENTAL"; + public static final String LATEST_SCAN_MODE = "LATEST"; + public static final String DEFAULT_SCAN_MODE = LATEST_SCAN_MODE; + public static final int DEFAULT_MAX_COMMITS = 1; + public static final int MAX_COMMIT_ALL = -1; + public static final int DEFAULT_LEVELS_TO_BASEPATH = 3; + + public static Integer readMaxCommits(JobContext job, String tableName) { + String maxCommitName = String.format(HOODIE_MAX_COMMIT_PATTERN, tableName); + int maxCommits = job.getConfiguration().getInt(maxCommitName, DEFAULT_MAX_COMMITS); + if (maxCommits == MAX_COMMIT_ALL) { + maxCommits = Integer.MAX_VALUE; } + LOG.info("Read max commits - " + maxCommits); + return maxCommits; + } - public static String readStartCommitTime(JobContext job, String tableName) { - String startCommitTimestampName = String.format(HOODIE_START_COMMIT_PATTERN, tableName); - LOG.info("Read start commit time - " + job.getConfiguration().get(startCommitTimestampName)); - return job.getConfiguration().get(startCommitTimestampName); - } + public static String readStartCommitTime(JobContext job, String tableName) { + String startCommitTimestampName = String.format(HOODIE_START_COMMIT_PATTERN, tableName); + LOG.info("Read start commit time - " + job.getConfiguration().get(startCommitTimestampName)); + return job.getConfiguration().get(startCommitTimestampName); + } - public static String readMode(JobContext job, String tableName) { - String modePropertyName = String.format(HOODIE_CONSUME_MODE_PATTERN, tableName); - String mode =job.getConfiguration().get(modePropertyName, DEFAULT_SCAN_MODE); - LOG.info(modePropertyName + ": " + mode); - return mode; - } + public static String readMode(JobContext job, String tableName) { + String modePropertyName = String.format(HOODIE_CONSUME_MODE_PATTERN, tableName); + String mode = job.getConfiguration().get(modePropertyName, 
DEFAULT_SCAN_MODE); + LOG.info(modePropertyName + ": " + mode); + return mode; + } - public static Path getNthParent(Path path, int n) { - Path parent = path; - for (int i = 0; i < n; i++) { - parent = parent.getParent(); - } - return parent; + public static Path getNthParent(Path path, int n) { + Path parent = path; + for (int i = 0; i < n; i++) { + parent = parent.getParent(); } + return parent; + } } diff --git a/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/HoodieInputFormat.java b/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/HoodieInputFormat.java index 2b30a33b3..03e86d3bd 100644 --- a/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/HoodieInputFormat.java +++ b/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/HoodieInputFormat.java @@ -16,6 +16,10 @@ package com.uber.hoodie.hadoop; +import static parquet.filter2.predicate.FilterApi.and; +import static parquet.filter2.predicate.FilterApi.binaryColumn; +import static parquet.filter2.predicate.FilterApi.gt; + import com.uber.hoodie.common.model.HoodieDataFile; import com.uber.hoodie.common.model.HoodiePartitionMetadata; import com.uber.hoodie.common.model.HoodieRecord; @@ -26,6 +30,12 @@ import com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.common.table.view.HoodieTableFileSystemView; import com.uber.hoodie.exception.HoodieIOException; import com.uber.hoodie.exception.InvalidDatasetException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configurable; @@ -54,156 +64,151 @@ import parquet.hadoop.metadata.FileMetaData; import parquet.hadoop.metadata.ParquetMetadata; import parquet.io.api.Binary; -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; 
-import java.util.stream.Collectors; - -import static parquet.filter2.predicate.FilterApi.and; -import static parquet.filter2.predicate.FilterApi.binaryColumn; -import static parquet.filter2.predicate.FilterApi.gt; - /** - * HoodieInputFormat which understands the Hoodie File Structure and filters - * files based on the Hoodie Mode. If paths that does not correspond to a hoodie dataset - * then they are passed in as is (as what FileInputFormat.listStatus() would do). - * The JobConf could have paths from multipe Hoodie/Non-Hoodie datasets + * HoodieInputFormat which understands the Hoodie File Structure and filters files based on the + * Hoodie Mode. If paths that does not correspond to a hoodie dataset then they are passed in as is + * (as what FileInputFormat.listStatus() would do). The JobConf could have paths from multipe + * Hoodie/Non-Hoodie datasets */ @UseFileSplitsFromInputFormat public class HoodieInputFormat extends MapredParquetInputFormat implements Configurable { - public static final Log LOG = LogFactory.getLog(HoodieInputFormat.class); + public static final Log LOG = LogFactory.getLog(HoodieInputFormat.class); - protected Configuration conf; + protected Configuration conf; - @Override - public FileStatus[] listStatus(JobConf job) throws IOException { - // Get all the file status from FileInputFormat and then do the filter - FileStatus[] fileStatuses = super.listStatus(job); - Map> groupedFileStatus = groupFileStatus(fileStatuses); - LOG.info("Found a total of " + groupedFileStatus.size() + " groups"); - List returns = new ArrayList<>(); - for(Map.Entry> entry: groupedFileStatus.entrySet()) { - HoodieTableMetaClient metadata = entry.getKey(); - if (metadata == null) { - // Add all the paths which are not hoodie specific - returns.addAll(entry.getValue()); - continue; - } + @Override + public FileStatus[] listStatus(JobConf job) throws IOException { + // Get all the file status from FileInputFormat and then do the filter + FileStatus[] fileStatuses = 
super.listStatus(job); + Map> groupedFileStatus = groupFileStatus(fileStatuses); + LOG.info("Found a total of " + groupedFileStatus.size() + " groups"); + List returns = new ArrayList<>(); + for (Map.Entry> entry : groupedFileStatus.entrySet()) { + HoodieTableMetaClient metadata = entry.getKey(); + if (metadata == null) { + // Add all the paths which are not hoodie specific + returns.addAll(entry.getValue()); + continue; + } - FileStatus[] statuses = entry.getValue().toArray(new FileStatus[entry.getValue().size()]); - if (LOG.isDebugEnabled()) { - LOG.debug("Hoodie Metadata initialized with completed commit Ts as :" + metadata); - } - String tableName = metadata.getTableConfig().getTableName(); - String mode = HoodieHiveUtil.readMode(Job.getInstance(job), tableName); - // Get all commits, delta commits, compactions, as all of them produce a base parquet file today - HoodieTimeline timeline = metadata.getActiveTimeline().getCommitsAndCompactionsTimeline().filterCompletedInstants(); - TableFileSystemView.ReadOptimizedView roView = new HoodieTableFileSystemView(metadata, timeline, statuses); + FileStatus[] statuses = entry.getValue().toArray(new FileStatus[entry.getValue().size()]); + if (LOG.isDebugEnabled()) { + LOG.debug("Hoodie Metadata initialized with completed commit Ts as :" + metadata); + } + String tableName = metadata.getTableConfig().getTableName(); + String mode = HoodieHiveUtil.readMode(Job.getInstance(job), tableName); + // Get all commits, delta commits, compactions, as all of them produce a base parquet file today + HoodieTimeline timeline = metadata.getActiveTimeline().getCommitsAndCompactionsTimeline() + .filterCompletedInstants(); + TableFileSystemView.ReadOptimizedView roView = new HoodieTableFileSystemView(metadata, + timeline, statuses); - if (HoodieHiveUtil.INCREMENTAL_SCAN_MODE.equals(mode)) { - // this is of the form commitTs_partition_sequenceNumber - String lastIncrementalTs = HoodieHiveUtil.readStartCommitTime(Job.getInstance(job), 
tableName); - // Total number of commits to return in this batch. Set this to -1 to get all the commits. - Integer maxCommits = HoodieHiveUtil.readMaxCommits(Job.getInstance(job), tableName); - LOG.info("Last Incremental timestamp was set as " + lastIncrementalTs); - List commitsToReturn = - timeline.findInstantsAfter(lastIncrementalTs, maxCommits).getInstants() - .map(HoodieInstant::getTimestamp).collect(Collectors.toList()); - List filteredFiles = roView - .getLatestDataFilesInRange(commitsToReturn) - .collect(Collectors.toList()); - for (HoodieDataFile filteredFile : filteredFiles) { - LOG.info("Processing incremental hoodie file - " + filteredFile.getPath()); - filteredFile = checkFileStatus(filteredFile); - returns.add(filteredFile.getFileStatus()); - } - LOG.info( - "Total paths to process after hoodie incremental filter " + filteredFiles.size()); - } else { - // filter files on the latest commit found - List filteredFiles = roView.getLatestDataFiles().collect(Collectors.toList()); - LOG.info("Total paths to process after hoodie filter " + filteredFiles.size()); - for (HoodieDataFile filteredFile : filteredFiles) { - if (LOG.isDebugEnabled()) { - LOG.debug("Processing latest hoodie file - " + filteredFile.getPath()); - } - filteredFile = checkFileStatus(filteredFile); - returns.add(filteredFile.getFileStatus()); - } - } + if (HoodieHiveUtil.INCREMENTAL_SCAN_MODE.equals(mode)) { + // this is of the form commitTs_partition_sequenceNumber + String lastIncrementalTs = HoodieHiveUtil + .readStartCommitTime(Job.getInstance(job), tableName); + // Total number of commits to return in this batch. Set this to -1 to get all the commits. 
+ Integer maxCommits = HoodieHiveUtil.readMaxCommits(Job.getInstance(job), tableName); + LOG.info("Last Incremental timestamp was set as " + lastIncrementalTs); + List commitsToReturn = + timeline.findInstantsAfter(lastIncrementalTs, maxCommits).getInstants() + .map(HoodieInstant::getTimestamp).collect(Collectors.toList()); + List filteredFiles = roView + .getLatestDataFilesInRange(commitsToReturn) + .collect(Collectors.toList()); + for (HoodieDataFile filteredFile : filteredFiles) { + LOG.info("Processing incremental hoodie file - " + filteredFile.getPath()); + filteredFile = checkFileStatus(filteredFile); + returns.add(filteredFile.getFileStatus()); } - return returns.toArray(new FileStatus[returns.size()]); - + LOG.info( + "Total paths to process after hoodie incremental filter " + filteredFiles.size()); + } else { + // filter files on the latest commit found + List filteredFiles = roView.getLatestDataFiles() + .collect(Collectors.toList()); + LOG.info("Total paths to process after hoodie filter " + filteredFiles.size()); + for (HoodieDataFile filteredFile : filteredFiles) { + if (LOG.isDebugEnabled()) { + LOG.debug("Processing latest hoodie file - " + filteredFile.getPath()); + } + filteredFile = checkFileStatus(filteredFile); + returns.add(filteredFile.getFileStatus()); + } + } } + return returns.toArray(new FileStatus[returns.size()]); - /** - * Checks the file status for a race condition which can set the file size to 0. 1. - * HiveInputFormat does super.listStatus() and gets back a FileStatus[] 2. Then it creates the - * HoodieTableMetaClient for the paths listed. 3. Generation of splits looks at FileStatus size - * to create splits, which skips this file - */ - private HoodieDataFile checkFileStatus(HoodieDataFile dataFile) throws IOException { - Path dataPath = dataFile.getFileStatus().getPath(); + } + + /** + * Checks the file status for a race condition which can set the file size to 0. 1. 
+ * HiveInputFormat does super.listStatus() and gets back a FileStatus[] 2. Then it creates the + * HoodieTableMetaClient for the paths listed. 3. Generation of splits looks at FileStatus size to + * create splits, which skips this file + */ + private HoodieDataFile checkFileStatus(HoodieDataFile dataFile) throws IOException { + Path dataPath = dataFile.getFileStatus().getPath(); + try { + if (dataFile.getFileSize() == 0) { + FileSystem fs = dataPath.getFileSystem(conf); + LOG.info("Refreshing file status " + dataFile.getPath()); + return new HoodieDataFile(fs.getFileStatus(dataPath)); + } + return dataFile; + } catch (IOException e) { + throw new HoodieIOException("Could not get FileStatus on path " + dataPath); + } + } + + private Map> groupFileStatus(FileStatus[] fileStatuses) + throws IOException { + // This assumes the paths for different tables are grouped together + Map> grouped = new HashMap<>(); + HoodieTableMetaClient metadata = null; + String nonHoodieBasePath = null; + for (FileStatus status : fileStatuses) { + if (!status.getPath().getName().endsWith(".parquet")) { + //FIXME(vc): skip non parquet files for now. This wont be needed once log file name start with "." 
+ continue; + } + if ((metadata == null && nonHoodieBasePath == null) || (metadata == null && !status.getPath() + .toString() + .contains(nonHoodieBasePath)) || (metadata != null && !status.getPath().toString() + .contains(metadata.getBasePath()))) { try { - if (dataFile.getFileSize() == 0) { - FileSystem fs = dataPath.getFileSystem(conf); - LOG.info("Refreshing file status " + dataFile.getPath()); - return new HoodieDataFile(fs.getFileStatus(dataPath)); - } - return dataFile; - } catch (IOException e) { - throw new HoodieIOException("Could not get FileStatus on path " + dataPath); + metadata = getTableMetaClient(status.getPath().getFileSystem(conf), + status.getPath().getParent()); + nonHoodieBasePath = null; + } catch (InvalidDatasetException e) { + LOG.info("Handling a non-hoodie path " + status.getPath()); + metadata = null; + nonHoodieBasePath = + status.getPath().getParent().toString(); } - } - - private Map> groupFileStatus(FileStatus[] fileStatuses) - throws IOException { - // This assumes the paths for different tables are grouped together - Map> grouped = new HashMap<>(); - HoodieTableMetaClient metadata = null; - String nonHoodieBasePath = null; - for(FileStatus status: fileStatuses) { - if (!status.getPath().getName().endsWith(".parquet")) { - //FIXME(vc): skip non parquet files for now. This wont be needed once log file name start with "." 
- continue; - } - if ((metadata == null && nonHoodieBasePath == null) || (metadata == null && !status.getPath().toString() - .contains(nonHoodieBasePath)) || (metadata != null && !status.getPath().toString() - .contains(metadata.getBasePath()))) { - try { - metadata = getTableMetaClient(status.getPath().getFileSystem(conf), status.getPath().getParent()); - nonHoodieBasePath = null; - } catch (InvalidDatasetException e) { - LOG.info("Handling a non-hoodie path " + status.getPath()); - metadata = null; - nonHoodieBasePath = - status.getPath().getParent().toString(); - } - if(!grouped.containsKey(metadata)) { - grouped.put(metadata, new ArrayList<>()); - } - } - grouped.get(metadata).add(status); + if (!grouped.containsKey(metadata)) { + grouped.put(metadata, new ArrayList<>()); } - return grouped; + } + grouped.get(metadata).add(status); } + return grouped; + } - public void setConf(Configuration conf) { - this.conf = conf; - } + public void setConf(Configuration conf) { + this.conf = conf; + } - public Configuration getConf() { - return conf; - } + public Configuration getConf() { + return conf; + } - @Override - public RecordReader getRecordReader(final InputSplit split, - final JobConf job, final Reporter reporter) throws IOException { - // TODO enable automatic predicate pushdown after fixing issues + @Override + public RecordReader getRecordReader(final InputSplit split, + final JobConf job, final Reporter reporter) throws IOException { + // TODO enable automatic predicate pushdown after fixing issues // FileSplit fileSplit = (FileSplit) split; // HoodieTableMetadata metadata = getTableMetadata(fileSplit.getPath().getParent()); // String tableName = metadata.getTableName(); @@ -213,91 +218,83 @@ public class HoodieInputFormat extends MapredParquetInputFormat // FilterPredicate predicate = constructHoodiePredicate(job, tableName, split); // LOG.info("Setting parquet predicate push down as " + predicate); // ParquetInputFormat.setFilterPredicate(job, predicate); - 
//clearOutExistingPredicate(job); + //clearOutExistingPredicate(job); // } - return super.getRecordReader(split, job, reporter); - } + return super.getRecordReader(split, job, reporter); + } - /** - * Clears out the filter expression (if this is not done, then ParquetReader will override the FilterPredicate set) - * - * @param job - */ - private void clearOutExistingPredicate(JobConf job) { - job.unset(TableScanDesc.FILTER_EXPR_CONF_STR); - } + /** + * Clears out the filter expression (if this is not done, then ParquetReader will override the + * FilterPredicate set) + */ + private void clearOutExistingPredicate(JobConf job) { + job.unset(TableScanDesc.FILTER_EXPR_CONF_STR); + } - /** - * Constructs the predicate to push down to parquet storage. - * This creates the predicate for `hoodie_commit_time` > 'start_commit_time' and ANDs with the existing predicate if one is present already. - * - * @param job - * @param tableName - * @return - */ - private FilterPredicate constructHoodiePredicate(JobConf job, - String tableName, - InputSplit split) throws IOException { - FilterPredicate commitTimePushdown = constructCommitTimePushdownPredicate(job, tableName); - LOG.info("Commit time predicate - " + commitTimePushdown.toString()); - FilterPredicate existingPushdown = constructHQLPushdownPredicate(job, split); - LOG.info("Existing predicate - " + existingPushdown); + /** + * Constructs the predicate to push down to parquet storage. This creates the predicate for + * `hoodie_commit_time` > 'start_commit_time' and ANDs with the existing predicate if one is + * present already. 
+ */ + private FilterPredicate constructHoodiePredicate(JobConf job, + String tableName, + InputSplit split) throws IOException { + FilterPredicate commitTimePushdown = constructCommitTimePushdownPredicate(job, tableName); + LOG.info("Commit time predicate - " + commitTimePushdown.toString()); + FilterPredicate existingPushdown = constructHQLPushdownPredicate(job, split); + LOG.info("Existing predicate - " + existingPushdown); - FilterPredicate hoodiePredicate; - if (existingPushdown != null) { - hoodiePredicate = and(existingPushdown, commitTimePushdown); - } else { - hoodiePredicate = commitTimePushdown; - } - LOG.info("Hoodie Predicate - " + hoodiePredicate); - return hoodiePredicate; + FilterPredicate hoodiePredicate; + if (existingPushdown != null) { + hoodiePredicate = and(existingPushdown, commitTimePushdown); + } else { + hoodiePredicate = commitTimePushdown; } + LOG.info("Hoodie Predicate - " + hoodiePredicate); + return hoodiePredicate; + } - private FilterPredicate constructHQLPushdownPredicate(JobConf job, InputSplit split) - throws IOException { - String serializedPushdown = job.get(TableScanDesc.FILTER_EXPR_CONF_STR); - String columnNamesString = job.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR); - if (serializedPushdown == null || columnNamesString == null || serializedPushdown.isEmpty() - || columnNamesString.isEmpty()) { - return null; - } else { - SearchArgument sarg = - SearchArgumentFactory.create(Utilities.deserializeExpression(serializedPushdown)); - final Path finalPath = ((FileSplit) split).getPath(); - final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(job, finalPath); - final FileMetaData fileMetaData = parquetMetadata.getFileMetaData(); - return ParquetFilterPredicateConverter - .toFilterPredicate(sarg, fileMetaData.getSchema()); - } + private FilterPredicate constructHQLPushdownPredicate(JobConf job, InputSplit split) + throws IOException { + String serializedPushdown = 
job.get(TableScanDesc.FILTER_EXPR_CONF_STR); + String columnNamesString = job.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR); + if (serializedPushdown == null || columnNamesString == null || serializedPushdown.isEmpty() + || columnNamesString.isEmpty()) { + return null; + } else { + SearchArgument sarg = + SearchArgumentFactory.create(Utilities.deserializeExpression(serializedPushdown)); + final Path finalPath = ((FileSplit) split).getPath(); + final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(job, finalPath); + final FileMetaData fileMetaData = parquetMetadata.getFileMetaData(); + return ParquetFilterPredicateConverter + .toFilterPredicate(sarg, fileMetaData.getSchema()); } + } - private FilterPredicate constructCommitTimePushdownPredicate(JobConf job, String tableName) - throws IOException { - String lastIncrementalTs = HoodieHiveUtil.readStartCommitTime(Job.getInstance(job), tableName); - Operators.BinaryColumn sequenceColumn = - binaryColumn(HoodieRecord.COMMIT_TIME_METADATA_FIELD); - FilterPredicate p = gt(sequenceColumn, Binary.fromString(lastIncrementalTs)); - LOG.info("Setting predicate in InputFormat " + p.toString()); - return p; - } + private FilterPredicate constructCommitTimePushdownPredicate(JobConf job, String tableName) + throws IOException { + String lastIncrementalTs = HoodieHiveUtil.readStartCommitTime(Job.getInstance(job), tableName); + Operators.BinaryColumn sequenceColumn = + binaryColumn(HoodieRecord.COMMIT_TIME_METADATA_FIELD); + FilterPredicate p = gt(sequenceColumn, Binary.fromString(lastIncrementalTs)); + LOG.info("Setting predicate in InputFormat " + p.toString()); + return p; + } - /** - * Read the table metadata from a data path. 
This assumes certain hierarchy of files which - * should be changed once a better way is figured out to pass in the hoodie meta directory - * - * @param dataPath - * @return - * @throws IOException - */ - protected static HoodieTableMetaClient getTableMetaClient(FileSystem fs, Path dataPath) { - int levels = HoodieHiveUtil.DEFAULT_LEVELS_TO_BASEPATH; - if (HoodiePartitionMetadata.hasPartitionMetadata(fs, dataPath)) { - HoodiePartitionMetadata metadata = new HoodiePartitionMetadata(fs, dataPath); - metadata.readFromFS(); - levels = metadata.getPartitionDepth(); - } - Path baseDir = HoodieHiveUtil.getNthParent(dataPath, levels); - LOG.info("Reading hoodie metadata from path " + baseDir.toString()); - return new HoodieTableMetaClient(fs, baseDir.toString()); + /** + * Read the table metadata from a data path. This assumes certain hierarchy of files which should + * be changed once a better way is figured out to pass in the hoodie meta directory + */ + protected static HoodieTableMetaClient getTableMetaClient(FileSystem fs, Path dataPath) { + int levels = HoodieHiveUtil.DEFAULT_LEVELS_TO_BASEPATH; + if (HoodiePartitionMetadata.hasPartitionMetadata(fs, dataPath)) { + HoodiePartitionMetadata metadata = new HoodiePartitionMetadata(fs, dataPath); + metadata.readFromFS(); + levels = metadata.getPartitionDepth(); } + Path baseDir = HoodieHiveUtil.getNthParent(dataPath, levels); + LOG.info("Reading hoodie metadata from path " + baseDir.toString()); + return new HoodieTableMetaClient(fs, baseDir.toString()); + } } diff --git a/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/HoodieROTablePathFilter.java b/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/HoodieROTablePathFilter.java index 769bc4d52..c8ffbcc9a 100644 --- a/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/HoodieROTablePathFilter.java +++ b/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/HoodieROTablePathFilter.java @@ -21,7 +21,11 @@ import com.uber.hoodie.common.table.HoodieTableMetaClient; 
import com.uber.hoodie.common.table.view.HoodieTableFileSystemView; import com.uber.hoodie.exception.DatasetNotFoundException; import com.uber.hoodie.exception.HoodieException; - +import java.io.Serializable; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.stream.Collectors; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; @@ -29,150 +33,142 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.PathFilter; -import java.io.Serializable; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.stream.Collectors; - /** - * Given a path is a part of - * - Hoodie dataset = accepts ONLY the latest version of each path - * - Non-Hoodie dataset = then always accept + * Given a path is a part of - Hoodie dataset = accepts ONLY the latest version of each path - + * Non-Hoodie dataset = then always accept * * We can set this filter, on a query engine's Hadoop Config and if it respects path filters, then * you should be able to query both hoodie and non-hoodie datasets as you would normally do. 
* - * hadoopConf.setClass("mapreduce.input.pathFilter.class", - * com.uber.hoodie.hadoop.HoodieROTablePathFilter.class, - * org.apache.hadoop.fs.PathFilter.class) - * + * hadoopConf.setClass("mapreduce.input.pathFilter.class", com.uber.hoodie.hadoop.HoodieROTablePathFilter.class, + * org.apache.hadoop.fs.PathFilter.class) */ public class HoodieROTablePathFilter implements PathFilter, Serializable { - public static final Log LOG = LogFactory.getLog(HoodieROTablePathFilter.class); + public static final Log LOG = LogFactory.getLog(HoodieROTablePathFilter.class); - /** - * Its quite common, to have all files from a given partition path be passed into accept(), - * cache the check for hoodie metadata for known partition paths and the latest versions of files - */ - private HashMap> hoodiePathCache; + /** + * Its quite common, to have all files from a given partition path be passed into accept(), cache + * the check for hoodie metadata for known partition paths and the latest versions of files + */ + private HashMap> hoodiePathCache; - /** - * Paths that are known to be non-hoodie datasets. - */ - private HashSet nonHoodiePathCache; + /** + * Paths that are known to be non-hoodie datasets. 
+ */ + private HashSet nonHoodiePathCache; - public HoodieROTablePathFilter() { - hoodiePathCache = new HashMap<>(); - nonHoodiePathCache = new HashSet<>(); + public HoodieROTablePathFilter() { + hoodiePathCache = new HashMap<>(); + nonHoodiePathCache = new HashSet<>(); + } + + /** + * Obtain the path, two levels from provided path + * + * @return said path if available, null otherwise + */ + private Path safeGetParentsParent(Path path) { + if (path.getParent() != null && path.getParent().getParent() != null + && path.getParent().getParent().getParent() != null) { + return path.getParent().getParent().getParent(); } + return null; + } - /** - * Obtain the path, two levels from provided path - * - * @return said path if available, null otherwise - */ - private Path safeGetParentsParent(Path path) { - if (path.getParent() != null && path.getParent().getParent() != null && path.getParent().getParent().getParent() != null) { - return path.getParent().getParent().getParent(); - } - return null; + + @Override + public boolean accept(Path path) { + + if (LOG.isDebugEnabled()) { + LOG.debug("Checking acceptance for path " + path); } + Path folder = null; + try { + FileSystem fs = path.getFileSystem(new Configuration()); + if (fs.isDirectory(path)) { + return true; + } - - @Override - public boolean accept(Path path) { - + // Assumes path is a file + folder = path.getParent(); // get the immediate parent. + // Try to use the caches. 
+ if (nonHoodiePathCache.contains(folder.toString())) { if (LOG.isDebugEnabled()) { - LOG.debug("Checking acceptance for path " + path); + LOG.debug("Accepting non-hoodie path from cache: " + path); } - Path folder = null; + return true; + } + + if (hoodiePathCache.containsKey(folder.toString())) { + if (LOG.isDebugEnabled()) { + LOG.debug(String.format("%s Hoodie path checked against cache, accept => %s \n", + path, + hoodiePathCache.get(folder.toString()).contains(path))); + } + return hoodiePathCache.get(folder.toString()).contains(path); + } + + // Perform actual checking. + Path baseDir; + if (HoodiePartitionMetadata.hasPartitionMetadata(fs, folder)) { + HoodiePartitionMetadata metadata = new HoodiePartitionMetadata(fs, folder); + metadata.readFromFS(); + baseDir = HoodieHiveUtil.getNthParent(folder, metadata.getPartitionDepth()); + } else { + baseDir = safeGetParentsParent(folder); + } + + if (baseDir != null) { try { - FileSystem fs = path.getFileSystem(new Configuration()); - if (fs.isDirectory(path)) { - return true; - } + HoodieTableMetaClient metaClient = + new HoodieTableMetaClient(fs, baseDir.toString()); + HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, + metaClient.getActiveTimeline().getCommitTimeline() + .filterCompletedInstants(), + fs.listStatus(folder)); + List latestFiles = fsView + .getLatestDataFiles() + .collect(Collectors.toList()); + // populate the cache + if (!hoodiePathCache.containsKey(folder.toString())) { + hoodiePathCache.put(folder.toString(), new HashSet<>()); + } + LOG.info("Based on hoodie metadata from base path: " + baseDir.toString() + + ", caching " + latestFiles.size() + " files under " + folder); + for (HoodieDataFile lfile : latestFiles) { + hoodiePathCache.get(folder.toString()).add(new Path(lfile.getPath())); + } - // Assumes path is a file - folder = path.getParent(); // get the immediate parent. - // Try to use the caches. 
- if (nonHoodiePathCache.contains(folder.toString())) { - if (LOG.isDebugEnabled()) { - LOG.debug("Accepting non-hoodie path from cache: " + path); - } - return true; - } - - if (hoodiePathCache.containsKey(folder.toString())) { - if (LOG.isDebugEnabled()) { - LOG.debug(String.format("%s Hoodie path checked against cache, accept => %s \n", - path, - hoodiePathCache.get(folder.toString()).contains(path))); - } - return hoodiePathCache.get(folder.toString()).contains(path); - } - - // Perform actual checking. - Path baseDir; - if (HoodiePartitionMetadata.hasPartitionMetadata(fs, folder)) { - HoodiePartitionMetadata metadata = new HoodiePartitionMetadata(fs, folder); - metadata.readFromFS(); - baseDir = HoodieHiveUtil.getNthParent(folder, metadata.getPartitionDepth()); - } else { - baseDir = safeGetParentsParent(folder); - } - - if (baseDir != null) { - try { - HoodieTableMetaClient metaClient = - new HoodieTableMetaClient(fs, baseDir.toString()); - HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, - metaClient.getActiveTimeline().getCommitTimeline() - .filterCompletedInstants(), - fs.listStatus(folder)); - List latestFiles = fsView - .getLatestDataFiles() - .collect(Collectors.toList()); - // populate the cache - if (!hoodiePathCache.containsKey(folder.toString())) { - hoodiePathCache.put(folder.toString(), new HashSet<>()); - } - LOG.info("Based on hoodie metadata from base path: " + baseDir.toString() + - ", caching " + latestFiles.size() + " files under "+ folder); - for (HoodieDataFile lfile: latestFiles) { - hoodiePathCache.get(folder.toString()).add(new Path(lfile.getPath())); - } - - // accept the path, if its among the latest files. 
- if (LOG.isDebugEnabled()) { - LOG.debug(String.format("%s checked after cache population, accept => %s \n", - path, - hoodiePathCache.get(folder.toString()).contains(path))); - } - return hoodiePathCache.get(folder.toString()).contains(path); - } catch (DatasetNotFoundException e) { - // Non-hoodie path, accept it. - if (LOG.isDebugEnabled()) { - LOG.debug(String.format("(1) Caching non-hoodie path under %s \n", - folder.toString())); - } - nonHoodiePathCache.add(folder.toString()); - return true; - } - } else { - // files is at < 3 level depth in FS tree, can't be hoodie dataset - if (LOG.isDebugEnabled()) { - LOG.debug(String.format("(2) Caching non-hoodie path under %s \n", folder.toString())); - } - nonHoodiePathCache.add(folder.toString()); - return true; - } - } catch (Exception e) { - String msg = "Error checking path :" + path +", under folder: "+ folder; - LOG.error(msg, e); - throw new HoodieException(msg, e); + // accept the path, if its among the latest files. + if (LOG.isDebugEnabled()) { + LOG.debug(String.format("%s checked after cache population, accept => %s \n", + path, + hoodiePathCache.get(folder.toString()).contains(path))); + } + return hoodiePathCache.get(folder.toString()).contains(path); + } catch (DatasetNotFoundException e) { + // Non-hoodie path, accept it. 
+ if (LOG.isDebugEnabled()) { + LOG.debug(String.format("(1) Caching non-hoodie path under %s \n", + folder.toString())); + } + nonHoodiePathCache.add(folder.toString()); + return true; } + } else { + // files is at < 3 level depth in FS tree, can't be hoodie dataset + if (LOG.isDebugEnabled()) { + LOG.debug(String.format("(2) Caching non-hoodie path under %s \n", folder.toString())); + } + nonHoodiePathCache.add(folder.toString()); + return true; + } + } catch (Exception e) { + String msg = "Error checking path :" + path + ", under folder: " + folder; + LOG.error(msg, e); + throw new HoodieException(msg, e); } + } } diff --git a/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/UseFileSplitsFromInputFormat.java b/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/UseFileSplitsFromInputFormat.java index f69e93cff..01059d67b 100644 --- a/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/UseFileSplitsFromInputFormat.java +++ b/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/UseFileSplitsFromInputFormat.java @@ -24,13 +24,13 @@ import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** - * When annotated on a InputFormat, informs the query engines, - * that they should use the FileSplits provided by the input format - * to execute the queries + * When annotated on a InputFormat, informs the query engines, that they should use the FileSplits + * provided by the input format to execute the queries */ @Inherited @Documented @Target(ElementType.TYPE) @Retention(RetentionPolicy.RUNTIME) public @interface UseFileSplitsFromInputFormat { + } diff --git a/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/realtime/HoodieParquetSerde.java b/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/realtime/HoodieParquetSerde.java index 701ab90a9..53c7c8cf7 100644 --- a/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/realtime/HoodieParquetSerde.java +++ 
b/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/realtime/HoodieParquetSerde.java @@ -25,7 +25,7 @@ import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe; */ public class HoodieParquetSerde extends ParquetHiveSerDe { - public HoodieParquetSerde() { - super(); - } + public HoodieParquetSerde() { + super(); + } } diff --git a/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/realtime/HoodieRealtimeFileSplit.java b/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/realtime/HoodieRealtimeFileSplit.java index 0ce79fc60..5ba7545b7 100644 --- a/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/realtime/HoodieRealtimeFileSplit.java +++ b/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/realtime/HoodieRealtimeFileSplit.java @@ -18,81 +18,83 @@ package com.uber.hoodie.hadoop.realtime; -import org.apache.hadoop.mapred.FileSplit; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; +import org.apache.hadoop.mapred.FileSplit; /** * Filesplit that wraps the base split and a list of log files to merge deltas from. 
*/ public class HoodieRealtimeFileSplit extends FileSplit { - private List deltaFilePaths; + private List deltaFilePaths; - private String maxCommitTime; + private String maxCommitTime; - private String basePath; + private String basePath; - public HoodieRealtimeFileSplit() { - super(); + public HoodieRealtimeFileSplit() { + super(); + } + + public HoodieRealtimeFileSplit(FileSplit baseSplit, String basePath, List deltaLogFiles, + String maxCommitTime) throws IOException { + super(baseSplit.getPath(), baseSplit.getStart(), baseSplit.getLength(), + baseSplit.getLocations()); + this.deltaFilePaths = deltaLogFiles; + this.maxCommitTime = maxCommitTime; + this.basePath = basePath; + } + + public List getDeltaFilePaths() { + return deltaFilePaths; + } + + public String getMaxCommitTime() { + return maxCommitTime; + } + + public String getBasePath() { + return basePath; + } + + private static void writeString(String str, DataOutput out) throws IOException { + byte[] pathBytes = str.getBytes(StandardCharsets.UTF_8); + out.writeInt(pathBytes.length); + out.write(pathBytes); + } + + private static String readString(DataInput in) throws IOException { + byte[] pathBytes = new byte[in.readInt()]; + in.readFully(pathBytes); + return new String(pathBytes, StandardCharsets.UTF_8); + } + + + @Override + public void write(DataOutput out) throws IOException { + super.write(out); + + writeString(maxCommitTime, out); + out.writeInt(deltaFilePaths.size()); + for (String logFilePath : deltaFilePaths) { + writeString(logFilePath, out); } + } - public HoodieRealtimeFileSplit(FileSplit baseSplit, String basePath, List deltaLogFiles, String maxCommitTime) throws IOException { - super(baseSplit.getPath(), baseSplit.getStart(), baseSplit.getLength(), baseSplit.getLocations()); - this.deltaFilePaths = deltaLogFiles; - this.maxCommitTime = maxCommitTime; - this.basePath = basePath; - } - - public List getDeltaFilePaths() { - return deltaFilePaths; - } - - public String getMaxCommitTime() { - 
return maxCommitTime; - } - - public String getBasePath() { - return basePath; - } - - private static void writeString(String str, DataOutput out) throws IOException { - byte[] pathBytes = str.getBytes(StandardCharsets.UTF_8); - out.writeInt(pathBytes.length); - out.write(pathBytes); - } - - private static String readString(DataInput in) throws IOException { - byte[] pathBytes = new byte[in.readInt()]; - in.readFully(pathBytes); - return new String(pathBytes, StandardCharsets.UTF_8); - } - - - @Override - public void write(DataOutput out) throws IOException { - super.write(out); - - writeString(maxCommitTime, out); - out.writeInt(deltaFilePaths.size()); - for (String logFilePath: deltaFilePaths) { - writeString(logFilePath, out); - } - } - - @Override - public void readFields(DataInput in) throws IOException { - super.readFields(in); - - maxCommitTime = readString(in); - int totalLogFiles = in.readInt(); - deltaFilePaths = new ArrayList<>(totalLogFiles); - for (int i=0; i < totalLogFiles; i++) { - deltaFilePaths.add(readString(in)); - } + @Override + public void readFields(DataInput in) throws IOException { + super.readFields(in); + + maxCommitTime = readString(in); + int totalLogFiles = in.readInt(); + deltaFilePaths = new ArrayList<>(totalLogFiles); + for (int i = 0; i < totalLogFiles; i++) { + deltaFilePaths.add(readString(in)); } + } } diff --git a/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/realtime/HoodieRealtimeInputFormat.java b/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/realtime/HoodieRealtimeInputFormat.java index f4849109e..c92f0e593 100644 --- a/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/realtime/HoodieRealtimeInputFormat.java +++ b/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/realtime/HoodieRealtimeInputFormat.java @@ -19,9 +19,7 @@ package com.uber.hoodie.hadoop.realtime; import com.google.common.base.Preconditions; - import com.google.common.collect.Sets; - import com.uber.hoodie.common.model.FileSlice; import 
com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.table.HoodieTableMetaClient; @@ -32,7 +30,16 @@ import com.uber.hoodie.exception.HoodieException; import com.uber.hoodie.exception.HoodieIOException; import com.uber.hoodie.hadoop.HoodieInputFormat; import com.uber.hoodie.hadoop.UseFileSplitsFromInputFormat; - +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.Stream; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configurable; @@ -47,168 +54,168 @@ import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.RecordReader; import org.apache.hadoop.mapred.Reporter; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.function.Function; -import java.util.stream.Collectors; -import java.util.stream.Stream; - /** * Input Format, that provides a real-time view of data in a Hoodie dataset */ @UseFileSplitsFromInputFormat public class HoodieRealtimeInputFormat extends HoodieInputFormat implements Configurable { - public static final Log LOG = LogFactory.getLog(HoodieRealtimeInputFormat.class); + public static final Log LOG = LogFactory.getLog(HoodieRealtimeInputFormat.class); - // These positions have to be deterministic across all tables - public static final int HOODIE_COMMIT_TIME_COL_POS = 0; - public static final int HOODIE_RECORD_KEY_COL_POS = 2; - public static final int HOODIE_PARTITION_PATH_COL_POS = 3; + // These positions have to be deterministic across all tables + public static final int HOODIE_COMMIT_TIME_COL_POS = 0; + public static final int HOODIE_RECORD_KEY_COL_POS = 2; + 
public static final int HOODIE_PARTITION_PATH_COL_POS = 3; - @Override - public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { + @Override + public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { - Stream fileSplits = Arrays.stream(super.getSplits(job, numSplits)).map(is -> (FileSplit) is); + Stream fileSplits = Arrays.stream(super.getSplits(job, numSplits)) + .map(is -> (FileSplit) is); - // obtain all unique parent folders for splits - Map> partitionsToParquetSplits = fileSplits.collect(Collectors.groupingBy(split -> split.getPath().getParent())); - // TODO(vc): Should we handle also non-hoodie splits here? - Map metaClientMap = new HashMap<>(); - Map partitionsToMetaClient = partitionsToParquetSplits.keySet().stream() - .collect(Collectors.toMap(Function.identity(), p -> { - // find if we have a metaclient already for this partition. - Optional matchingBasePath = metaClientMap.keySet().stream() - .filter(basePath -> p.toString().startsWith(basePath)).findFirst(); - if (matchingBasePath.isPresent()) { - return metaClientMap.get(matchingBasePath.get()); - } + // obtain all unique parent folders for splits + Map> partitionsToParquetSplits = fileSplits + .collect(Collectors.groupingBy(split -> split.getPath().getParent())); + // TODO(vc): Should we handle also non-hoodie splits here? + Map metaClientMap = new HashMap<>(); + Map partitionsToMetaClient = partitionsToParquetSplits.keySet() + .stream() + .collect(Collectors.toMap(Function.identity(), p -> { + // find if we have a metaclient already for this partition. 
+ Optional matchingBasePath = metaClientMap.keySet().stream() + .filter(basePath -> p.toString().startsWith(basePath)).findFirst(); + if (matchingBasePath.isPresent()) { + return metaClientMap.get(matchingBasePath.get()); + } - try { - HoodieTableMetaClient metaClient = getTableMetaClient(p.getFileSystem(conf), p); - metaClientMap.put(metaClient.getBasePath(), metaClient); - return metaClient; - } catch (IOException e) { - throw new HoodieIOException("Error creating hoodie meta client against : " + p, e); - } - })); + try { + HoodieTableMetaClient metaClient = getTableMetaClient(p.getFileSystem(conf), p); + metaClientMap.put(metaClient.getBasePath(), metaClient); + return metaClient; + } catch (IOException e) { + throw new HoodieIOException("Error creating hoodie meta client against : " + p, e); + } + })); - // for all unique split parents, obtain all delta files based on delta commit timeline, grouped on file id - List rtSplits = new ArrayList<>(); - partitionsToParquetSplits.keySet().stream().forEach(partitionPath -> { - // for each partition path obtain the data & log file groupings, then map back to inputsplits - HoodieTableMetaClient metaClient = partitionsToMetaClient.get(partitionPath); - HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, metaClient.getActiveTimeline()); - String relPartitionPath = FSUtils.getRelativePartitionPath(new Path(metaClient.getBasePath()), partitionPath); + // for all unique split parents, obtain all delta files based on delta commit timeline, grouped on file id + List rtSplits = new ArrayList<>(); + partitionsToParquetSplits.keySet().stream().forEach(partitionPath -> { + // for each partition path obtain the data & log file groupings, then map back to inputsplits + HoodieTableMetaClient metaClient = partitionsToMetaClient.get(partitionPath); + HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, + metaClient.getActiveTimeline()); + String relPartitionPath = FSUtils + 
.getRelativePartitionPath(new Path(metaClient.getBasePath()), partitionPath); + try { + Stream latestFileSlices = fsView.getLatestFileSlices(relPartitionPath); + + // subgroup splits again by file id & match with log files. + Map> groupedInputSplits = partitionsToParquetSplits + .get(partitionPath).stream() + .collect(Collectors.groupingBy(split -> FSUtils.getFileId(split.getPath().getName()))); + latestFileSlices.forEach(fileSlice -> { + List dataFileSplits = groupedInputSplits.get(fileSlice.getFileId()); + dataFileSplits.forEach(split -> { try { - Stream latestFileSlices = fsView.getLatestFileSlices(relPartitionPath); - - // subgroup splits again by file id & match with log files. - Map> groupedInputSplits = partitionsToParquetSplits.get(partitionPath).stream() - .collect(Collectors.groupingBy(split -> FSUtils.getFileId(split.getPath().getName()))); - latestFileSlices.forEach(fileSlice -> { - List dataFileSplits = groupedInputSplits.get(fileSlice.getFileId()); - dataFileSplits.forEach(split -> { - try { - List logFilePaths = fileSlice.getLogFiles() - .map(logFile -> logFile.getPath().toString()) - .collect(Collectors.toList()); - // Get the maxCommit from the last delta or compaction or commit - when bootstrapped from COW table - String maxCommitTime = metaClient.getActiveTimeline() - .getTimelineOfActions( - Sets.newHashSet(HoodieTimeline.COMMIT_ACTION, - HoodieTimeline.COMPACTION_ACTION, - HoodieTimeline.DELTA_COMMIT_ACTION)) - .filterCompletedInstants().lastInstant().get().getTimestamp(); - rtSplits.add( - new HoodieRealtimeFileSplit(split, metaClient.getBasePath(), logFilePaths, maxCommitTime)); - } catch (IOException e) { - throw new HoodieIOException("Error creating hoodie real time split ", e); - } - }); - }); - } catch (Exception e) { - throw new HoodieException("Error obtaining data file/log file grouping: " + partitionPath, e); + List logFilePaths = fileSlice.getLogFiles() + .map(logFile -> logFile.getPath().toString()) + .collect(Collectors.toList()); 
+ // Get the maxCommit from the last delta or compaction or commit - when bootstrapped from COW table + String maxCommitTime = metaClient.getActiveTimeline() + .getTimelineOfActions( + Sets.newHashSet(HoodieTimeline.COMMIT_ACTION, + HoodieTimeline.COMPACTION_ACTION, + HoodieTimeline.DELTA_COMMIT_ACTION)) + .filterCompletedInstants().lastInstant().get().getTimestamp(); + rtSplits.add( + new HoodieRealtimeFileSplit(split, metaClient.getBasePath(), logFilePaths, + maxCommitTime)); + } catch (IOException e) { + throw new HoodieIOException("Error creating hoodie real time split ", e); } + }); }); - LOG.info("Returning a total splits of " + rtSplits.size()); - return rtSplits.toArray(new InputSplit[rtSplits.size()]); + } catch (Exception e) { + throw new HoodieException("Error obtaining data file/log file grouping: " + partitionPath, + e); + } + }); + LOG.info("Returning a total splits of " + rtSplits.size()); + return rtSplits.toArray(new InputSplit[rtSplits.size()]); + } + + + @Override + public FileStatus[] listStatus(JobConf job) throws IOException { + // Call the HoodieInputFormat::listStatus to obtain all latest parquet files, based on commit timeline. + return super.listStatus(job); + } + + /** + * Add a field to the existing fields projected + */ + private static Configuration addProjectionField(Configuration conf, String fieldName, + int fieldIndex) { + String readColNames = conf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, ""); + String readColIds = conf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, ""); + + String readColNamesPrefix = readColNames + ","; + if (readColNames == null || readColNames.isEmpty()) { + readColNamesPrefix = ""; + } + String readColIdsPrefix = readColIds + ","; + if (readColIds == null || readColIds.isEmpty()) { + readColIdsPrefix = ""; } - - @Override - public FileStatus[] listStatus(JobConf job) throws IOException { - // Call the HoodieInputFormat::listStatus to obtain all latest parquet files, based on commit timeline. 
- return super.listStatus(job); + if (!readColNames.contains(fieldName)) { + // If not already in the list - then add it + conf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, + readColNamesPrefix + fieldName); + conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, readColIdsPrefix + fieldIndex); + if (LOG.isDebugEnabled()) { + LOG.debug(String.format("Adding extra column " + fieldName + + ", to enable log merging cols (%s) ids (%s) ", + conf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR), + conf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR))); + } } + return conf; + } - /** - * Add a field to the existing fields projected - */ - private static Configuration addProjectionField(Configuration conf, String fieldName, - int fieldIndex) { - String readColNames = conf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, ""); - String readColIds = conf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, ""); + private static Configuration addRequiredProjectionFields(Configuration configuration) { + // Need this to do merge records in HoodieRealtimeRecordReader + configuration = addProjectionField(configuration, HoodieRecord.RECORD_KEY_METADATA_FIELD, + HOODIE_RECORD_KEY_COL_POS); + configuration = addProjectionField(configuration, HoodieRecord.COMMIT_TIME_METADATA_FIELD, + HOODIE_COMMIT_TIME_COL_POS); + configuration = addProjectionField(configuration, + HoodieRecord.PARTITION_PATH_METADATA_FIELD, HOODIE_PARTITION_PATH_COL_POS); + return configuration; + } - String readColNamesPrefix = readColNames + ","; - if (readColNames == null || readColNames.isEmpty()) { - readColNamesPrefix = ""; - } - String readColIdsPrefix = readColIds + ","; - if (readColIds == null || readColIds.isEmpty()) { - readColIdsPrefix = ""; - } + @Override + public RecordReader getRecordReader(final InputSplit split, + final JobConf job, + final Reporter reporter) throws IOException { + LOG.info("Creating record reader with readCols :" + job + 
.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR)); + // sanity check + Preconditions.checkArgument(split instanceof HoodieRealtimeFileSplit, + "HoodieRealtimeRecordReader can only work on HoodieRealtimeFileSplit and not with " + + split); + return new HoodieRealtimeRecordReader((HoodieRealtimeFileSplit) split, job, + super.getRecordReader(split, job, reporter)); + } - if (!readColNames.contains(fieldName)) { - // If not already in the list - then add it - conf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, - readColNamesPrefix + fieldName); - conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, readColIdsPrefix + fieldIndex); - if (LOG.isDebugEnabled()) { - LOG.debug(String.format("Adding extra column " + fieldName - + ", to enable log merging cols (%s) ids (%s) ", - conf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR), - conf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR))); - } - } - return conf; - } + @Override + public void setConf(Configuration conf) { + this.conf = addRequiredProjectionFields(conf); + } - private static Configuration addRequiredProjectionFields(Configuration configuration) { - // Need this to do merge records in HoodieRealtimeRecordReader - configuration = addProjectionField(configuration, HoodieRecord.RECORD_KEY_METADATA_FIELD, - HOODIE_RECORD_KEY_COL_POS); - configuration = addProjectionField(configuration, HoodieRecord.COMMIT_TIME_METADATA_FIELD, - HOODIE_COMMIT_TIME_COL_POS); - configuration = addProjectionField(configuration, - HoodieRecord.PARTITION_PATH_METADATA_FIELD, HOODIE_PARTITION_PATH_COL_POS); - return configuration; - } - - @Override - public RecordReader getRecordReader(final InputSplit split, - final JobConf job, - final Reporter reporter) throws IOException { - LOG.info("Creating record reader with readCols :" + job.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR)); - // sanity check - Preconditions.checkArgument(split instanceof HoodieRealtimeFileSplit, - "HoodieRealtimeRecordReader can only 
work on HoodieRealtimeFileSplit and not with " + split ); - return new HoodieRealtimeRecordReader((HoodieRealtimeFileSplit) split, job, super.getRecordReader(split, job, reporter)); - } - - @Override - public void setConf(Configuration conf) { - this.conf = addRequiredProjectionFields(conf); - } - - @Override - public Configuration getConf() { - return conf; - } + @Override + public Configuration getConf() { + return conf; + } } diff --git a/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/realtime/HoodieRealtimeRecordReader.java b/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/realtime/HoodieRealtimeRecordReader.java index 08a23d7ba..00ef57e42 100644 --- a/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/realtime/HoodieRealtimeRecordReader.java +++ b/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/realtime/HoodieRealtimeRecordReader.java @@ -18,14 +18,21 @@ package com.uber.hoodie.hadoop.realtime; -import com.uber.hoodie.common.model.HoodieAvroPayload; import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecordPayload; import com.uber.hoodie.common.table.log.HoodieCompactedLogRecordScanner; import com.uber.hoodie.common.util.FSUtils; -import com.uber.hoodie.common.util.ReflectionUtils; import com.uber.hoodie.exception.HoodieException; import com.uber.hoodie.exception.HoodieIOException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; +import java.util.stream.Collectors; import org.apache.avro.Schema; import org.apache.avro.generic.GenericArray; import org.apache.avro.generic.GenericFixed; @@ -51,291 +58,274 @@ import parquet.avro.AvroSchemaConverter; import parquet.hadoop.ParquetFileReader; import parquet.schema.MessageType; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; -import 
java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.TreeMap; -import java.util.stream.Collectors; - /** - * Record Reader implementation to merge fresh avro data with base parquet data, to support real time - * queries. + * Record Reader implementation to merge fresh avro data with base parquet data, to support real + * time queries. */ public class HoodieRealtimeRecordReader implements RecordReader { - private final RecordReader parquetReader; - private final HoodieRealtimeFileSplit split; - private final JobConf jobConf; + private final RecordReader parquetReader; + private final HoodieRealtimeFileSplit split; + private final JobConf jobConf; - public static final Log LOG = LogFactory.getLog(HoodieRealtimeRecordReader.class); + public static final Log LOG = LogFactory.getLog(HoodieRealtimeRecordReader.class); - private final HashMap deltaRecordMap; - private final MessageType baseFileSchema; + private final HashMap deltaRecordMap; + private final MessageType baseFileSchema; - public HoodieRealtimeRecordReader(HoodieRealtimeFileSplit split, - JobConf job, - RecordReader realReader) { - this.split = split; - this.jobConf = job; - this.parquetReader = realReader; - this.deltaRecordMap = new HashMap<>(); + public HoodieRealtimeRecordReader(HoodieRealtimeFileSplit split, + JobConf job, + RecordReader realReader) { + this.split = split; + this.jobConf = job; + this.parquetReader = realReader; + this.deltaRecordMap = new HashMap<>(); - LOG.info("cfg ==> " + job.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR)); - try { - baseFileSchema = readSchema(jobConf, split.getPath()); - readAndCompactLog(); - } catch (IOException e) { - throw new HoodieIOException( - "Could not create HoodieRealtimeRecordReader on path " + this.split.getPath(), e); - } + LOG.info("cfg ==> " + job.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR)); + try { + baseFileSchema = readSchema(jobConf, split.getPath()); + readAndCompactLog(); + } catch (IOException e) 
{ + throw new HoodieIOException( + "Could not create HoodieRealtimeRecordReader on path " + this.split.getPath(), e); + } + } + + /** + * Reads the schema from the parquet file. This is different from ParquetUtils as it uses the + * twitter parquet to support hive 1.1.0 + */ + private static MessageType readSchema(Configuration conf, Path parquetFilePath) { + try { + return ParquetFileReader.readFooter(conf, parquetFilePath).getFileMetaData() + .getSchema(); + } catch (IOException e) { + throw new HoodieIOException("Failed to read footer for parquet " + parquetFilePath, + e); + } + } + + + /** + * Goes through the log files and populates a map with latest version of each key logged, since + * the base split was written. + */ + private void readAndCompactLog() throws IOException { + Schema writerSchema = new AvroSchemaConverter().convert(baseFileSchema); + List projectionFields = orderFields( + jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR), + jobConf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR), + jobConf.get("partition_columns", "")); + // TODO(vc): In the future, the reader schema should be updated based on log files & be able to null out fields not present before + Schema readerSchema = generateProjectionSchema(writerSchema, projectionFields); + + LOG.info( + String.format("About to read compacted logs %s for base split %s, projecting cols %s", + split.getDeltaFilePaths(), split.getPath(), projectionFields)); + HoodieCompactedLogRecordScanner compactedLogRecordScanner = + new HoodieCompactedLogRecordScanner(FSUtils.getFs(), split.getBasePath(), + split.getDeltaFilePaths(), + readerSchema, split.getMaxCommitTime()); + // NOTE: HoodieCompactedLogRecordScanner will not return records for an in-flight commit + // but can return records for completed commits > the commit we are trying to read (if using readCommit() API) + for (HoodieRecord hoodieRecord : compactedLogRecordScanner) { + GenericRecord rec = (GenericRecord) 
hoodieRecord.getData().getInsertValue(readerSchema) + .get(); + String key = hoodieRecord.getRecordKey(); + // we assume, a later safe record in the log, is newer than what we have in the map & replace it. + ArrayWritable aWritable = (ArrayWritable) avroToArrayWritable(rec, writerSchema); + deltaRecordMap.put(key, aWritable); + if (LOG.isDebugEnabled()) { + LOG.debug("Log record : " + arrayWritableToString(aWritable)); + } + } + } + + private static String arrayWritableToString(ArrayWritable writable) { + if (writable == null) { + return "null"; } - /** - * Reads the schema from the parquet file. This is different from ParquetUtils as it uses the - * twitter parquet to support hive 1.1.0 - */ - private static MessageType readSchema(Configuration conf, Path parquetFilePath) { - try { - return ParquetFileReader.readFooter(conf, parquetFilePath).getFileMetaData() - .getSchema(); - } catch (IOException e) { - throw new HoodieIOException("Failed to read footer for parquet " + parquetFilePath, - e); - } + StringBuilder builder = new StringBuilder(); + Writable[] values = writable.get(); + builder.append(String.format("Size: %s,", values.length)); + for (Writable w : values) { + builder.append(w + " "); + } + return builder.toString(); + } + + /** + * Given a comma separated list of field names and positions at which they appear on Hive, return + * a ordered list of field names, that can be passed onto storage. + */ + public static List orderFields(String fieldNameCsv, String fieldOrderCsv, + String partitioningFieldsCsv) { + + String[] fieldOrders = fieldOrderCsv.split(","); + Set partitioningFields = Arrays.stream(partitioningFieldsCsv.split(",")) + .collect(Collectors.toSet()); + List fieldNames = Arrays.stream(fieldNameCsv.split(",")) + .filter(fn -> !partitioningFields.contains(fn)).collect( + Collectors.toList()); + + // Hive does not provide ids for partitioning fields, so check for lengths excluding that. 
+ if (fieldNames.size() != fieldOrders.length) { + throw new HoodieException(String.format( + "Error ordering fields for storage read. #fieldNames: %d, #fieldPositions: %d", + fieldNames.size(), fieldOrders.length)); + } + TreeMap orderedFieldMap = new TreeMap<>(); + for (int ox = 0; ox < fieldOrders.length; ox++) { + orderedFieldMap.put(Integer.parseInt(fieldOrders[ox]), fieldNames.get(ox)); + } + return new ArrayList<>(orderedFieldMap.values()); + } + + /** + * Generate a reader schema off the provided writeSchema, to just project out the provided + * columns + */ + public static Schema generateProjectionSchema(Schema writeSchema, List fieldNames) { + List projectedFields = new ArrayList<>(); + for (String fn : fieldNames) { + Schema.Field field = writeSchema.getField(fn); + if (field == null) { + throw new HoodieException("Field " + fn + " not found log schema. Query cannot proceed!"); + } + projectedFields + .add(new Schema.Field(field.name(), field.schema(), field.doc(), field.defaultValue())); } + return Schema.createRecord(projectedFields); + } - /** - * Goes through the log files and populates a map with latest version of each key logged, since the base split was written. 
- */ - private void readAndCompactLog() throws IOException { - Schema writerSchema = new AvroSchemaConverter().convert(baseFileSchema); - List projectionFields = orderFields( - jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR), - jobConf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR), - jobConf.get("partition_columns", "")); - // TODO(vc): In the future, the reader schema should be updated based on log files & be able to null out fields not present before - Schema readerSchema = generateProjectionSchema(writerSchema, projectionFields); + /** + * Convert the projected read from delta record into an array writable + */ + public static Writable avroToArrayWritable(Object value, Schema schema) { - LOG.info( - String.format("About to read compacted logs %s for base split %s, projecting cols %s", - split.getDeltaFilePaths(), split.getPath(), projectionFields)); - HoodieCompactedLogRecordScanner compactedLogRecordScanner = - new HoodieCompactedLogRecordScanner(FSUtils.getFs(), split.getBasePath(), split.getDeltaFilePaths(), - readerSchema, split.getMaxCommitTime()); - // NOTE: HoodieCompactedLogRecordScanner will not return records for an in-flight commit - // but can return records for completed commits > the commit we are trying to read (if using readCommit() API) - for (HoodieRecord hoodieRecord : compactedLogRecordScanner) { - GenericRecord rec = (GenericRecord) hoodieRecord.getData().getInsertValue(readerSchema) - .get(); - String key = hoodieRecord.getRecordKey(); - // we assume, a later safe record in the log, is newer than what we have in the map & replace it. 
- ArrayWritable aWritable = (ArrayWritable) avroToArrayWritable(rec, writerSchema); - deltaRecordMap.put(key, aWritable); - if (LOG.isDebugEnabled()) { - LOG.debug("Log record : " + arrayWritableToString(aWritable)); - } - } + // if value is null, make a NullWritable + if (value == null) { + return NullWritable.get(); } - private static String arrayWritableToString(ArrayWritable writable) { - if (writable == null) { - return "null"; + switch (schema.getType()) { + case STRING: + return new Text(value.toString()); + case BYTES: + return new BytesWritable((byte[]) value); + case INT: + return new IntWritable((Integer) value); + case LONG: + return new LongWritable((Long) value); + case FLOAT: + return new FloatWritable((Float) value); + case DOUBLE: + return new DoubleWritable((Double) value); + case BOOLEAN: + return new BooleanWritable((Boolean) value); + case NULL: + return NullWritable.get(); + case RECORD: + GenericRecord record = (GenericRecord) value; + Writable[] values1 = new Writable[schema.getFields().size()]; + int index1 = 0; + for (Schema.Field field : schema.getFields()) { + values1[index1++] = avroToArrayWritable(record.get(field.name()), field.schema()); } - - StringBuilder builder = new StringBuilder(); - Writable[] values = writable.get(); - builder.append(String.format("Size: %s,", values.length)); - for (Writable w: values) { - builder.append(w + " "); + return new ArrayWritable(Writable.class, values1); + case ENUM: + return new Text(value.toString()); + case ARRAY: + GenericArray arrayValue = (GenericArray) value; + Writable[] values2 = new Writable[arrayValue.size()]; + int index2 = 0; + for (Object obj : arrayValue) { + values2[index2++] = avroToArrayWritable(obj, schema.getElementType()); } - return builder.toString(); - } - - /** - * Given a comma separated list of field names and positions at which they appear on Hive, - * return a ordered list of field names, that can be passed onto storage. 
- * - * @param fieldNameCsv - * @param fieldOrderCsv - * @return - */ - public static List orderFields(String fieldNameCsv, String fieldOrderCsv, - String partitioningFieldsCsv) { - - String[] fieldOrders = fieldOrderCsv.split(","); - Set partitioningFields = Arrays.stream(partitioningFieldsCsv.split(",")) - .collect(Collectors.toSet()); - List fieldNames = Arrays.stream(fieldNameCsv.split(",")) - .filter(fn -> !partitioningFields.contains(fn)).collect( - Collectors.toList()); - - // Hive does not provide ids for partitioning fields, so check for lengths excluding that. - if (fieldNames.size() != fieldOrders.length) { - throw new HoodieException(String.format( - "Error ordering fields for storage read. #fieldNames: %d, #fieldPositions: %d", - fieldNames.size(), fieldOrders.length)); + return new ArrayWritable(Writable.class, values2); + case MAP: + Map mapValue = (Map) value; + Writable[] values3 = new Writable[mapValue.size()]; + int index3 = 0; + for (Object entry : mapValue.entrySet()) { + Map.Entry mapEntry = (Map.Entry) entry; + Writable[] mapValues = new Writable[2]; + mapValues[0] = new Text(mapEntry.getKey().toString()); + mapValues[1] = avroToArrayWritable(mapEntry.getValue(), schema.getValueType()); + values3[index3++] = new ArrayWritable(Writable.class, mapValues); } - TreeMap orderedFieldMap = new TreeMap<>(); - for (int ox = 0; ox < fieldOrders.length; ox++) { - orderedFieldMap.put(Integer.parseInt(fieldOrders[ox]), fieldNames.get(ox)); + return new ArrayWritable(Writable.class, values3); + case UNION: + List types = schema.getTypes(); + if (types.size() != 2) { + throw new IllegalArgumentException("Only support union with 2 fields"); } - return new ArrayList<>(orderedFieldMap.values()); - } - - /** - * Generate a reader schema off the provided writeSchema, to just project out - * the provided columns - * - * @param writeSchema - * @param fieldNames - * @return - */ - public static Schema generateProjectionSchema(Schema writeSchema, List fieldNames) { 
- List projectedFields = new ArrayList<>(); - for (String fn: fieldNames) { - Schema.Field field = writeSchema.getField(fn); - if (field == null) { - throw new HoodieException("Field "+ fn + " not found log schema. Query cannot proceed!"); - } - projectedFields.add(new Schema.Field(field.name(), field.schema(), field.doc(), field.defaultValue())); - } - - return Schema.createRecord(projectedFields); - } - - /** - * Convert the projected read from delta record into an array writable - * - * @param value - * @param schema - * @return - */ - public static Writable avroToArrayWritable(Object value, Schema schema) { - - // if value is null, make a NullWritable - if (value == null) { - return NullWritable.get(); - } - - switch (schema.getType()) { - case STRING: - return new Text(value.toString()); - case BYTES: - return new BytesWritable((byte[]) value); - case INT: - return new IntWritable((Integer) value); - case LONG: - return new LongWritable((Long) value); - case FLOAT: - return new FloatWritable((Float) value); - case DOUBLE: - return new DoubleWritable((Double) value); - case BOOLEAN: - return new BooleanWritable((Boolean) value); - case NULL: - return NullWritable.get(); - case RECORD: - GenericRecord record = (GenericRecord) value; - Writable[] values1 = new Writable[schema.getFields().size()]; - int index1 = 0; - for (Schema.Field field : schema.getFields()) { - values1[index1++] = avroToArrayWritable(record.get(field.name()), field.schema()); - } - return new ArrayWritable(Writable.class, values1); - case ENUM: - return new Text(value.toString()); - case ARRAY: - GenericArray arrayValue = (GenericArray) value; - Writable[] values2 = new Writable[arrayValue.size()]; - int index2 = 0; - for (Object obj : arrayValue) { - values2[index2++] = avroToArrayWritable(obj, schema.getElementType()); - } - return new ArrayWritable(Writable.class, values2); - case MAP: - Map mapValue = (Map) value; - Writable[] values3 = new Writable[mapValue.size()]; - int index3 = 0; - 
for (Object entry : mapValue.entrySet()) { - Map.Entry mapEntry = (Map.Entry) entry; - Writable[] mapValues = new Writable[2]; - mapValues[0] = new Text(mapEntry.getKey().toString()); - mapValues[1] = avroToArrayWritable(mapEntry.getValue(), schema.getValueType()); - values3[index3++] = new ArrayWritable(Writable.class, mapValues); - } - return new ArrayWritable(Writable.class, values3); - case UNION: - List types = schema.getTypes(); - if (types.size() != 2) { - throw new IllegalArgumentException("Only support union with 2 fields"); - } - Schema s1 = types.get(0); - Schema s2 = types.get(1); - if (s1.getType() == Schema.Type.NULL) { - return avroToArrayWritable(value, s2); - } else if (s2.getType() == Schema.Type.NULL) { - return avroToArrayWritable(value, s1); - } else { - throw new IllegalArgumentException("Only support union with null"); - } - case FIXED: - return new BytesWritable(((GenericFixed) value).bytes()); - } - return null; - } - - @Override - public boolean next(Void aVoid, ArrayWritable arrayWritable) throws IOException { - // Call the underlying parquetReader.next - which may replace the passed in ArrayWritable with a new block of values - boolean result = this.parquetReader.next(aVoid, arrayWritable); - if(!result) { - // if the result is false, then there are no more records - return false; + Schema s1 = types.get(0); + Schema s2 = types.get(1); + if (s1.getType() == Schema.Type.NULL) { + return avroToArrayWritable(value, s2); + } else if (s2.getType() == Schema.Type.NULL) { + return avroToArrayWritable(value, s1); } else { - // TODO(VC): Right now, we assume all records in log, have a matching base record. (which would be true until we have a way to index logs too) - // return from delta records map if we have some match. 
- String key = arrayWritable.get()[HoodieRealtimeInputFormat.HOODIE_RECORD_KEY_COL_POS].toString(); - if (LOG.isDebugEnabled()) { - LOG.debug(String.format("key %s, base values: %s, log values: %s", - key, arrayWritableToString(arrayWritable), arrayWritableToString(deltaRecordMap.get(key)))); - } - if (deltaRecordMap.containsKey(key)) { - Writable[] replaceValue = deltaRecordMap.get(key).get(); - Writable[] originalValue = arrayWritable.get(); - System.arraycopy(replaceValue, 0, originalValue, 0, originalValue.length); - arrayWritable.set(originalValue); - } - return true; + throw new IllegalArgumentException("Only support union with null"); } + case FIXED: + return new BytesWritable(((GenericFixed) value).bytes()); } + return null; + } - @Override - public Void createKey() { - return parquetReader.createKey(); + @Override + public boolean next(Void aVoid, ArrayWritable arrayWritable) throws IOException { + // Call the underlying parquetReader.next - which may replace the passed in ArrayWritable with a new block of values + boolean result = this.parquetReader.next(aVoid, arrayWritable); + if (!result) { + // if the result is false, then there are no more records + return false; + } else { + // TODO(VC): Right now, we assume all records in log, have a matching base record. (which would be true until we have a way to index logs too) + // return from delta records map if we have some match. 
+ String key = arrayWritable.get()[HoodieRealtimeInputFormat.HOODIE_RECORD_KEY_COL_POS] + .toString(); + if (LOG.isDebugEnabled()) { + LOG.debug(String.format("key %s, base values: %s, log values: %s", + key, arrayWritableToString(arrayWritable), + arrayWritableToString(deltaRecordMap.get(key)))); + } + if (deltaRecordMap.containsKey(key)) { + Writable[] replaceValue = deltaRecordMap.get(key).get(); + Writable[] originalValue = arrayWritable.get(); + System.arraycopy(replaceValue, 0, originalValue, 0, originalValue.length); + arrayWritable.set(originalValue); + } + return true; } + } - @Override - public ArrayWritable createValue() { - return parquetReader.createValue(); - } + @Override + public Void createKey() { + return parquetReader.createKey(); + } - @Override - public long getPos() throws IOException { - return parquetReader.getPos(); - } + @Override + public ArrayWritable createValue() { + return parquetReader.createValue(); + } - @Override - public void close() throws IOException { - parquetReader.close(); - } + @Override + public long getPos() throws IOException { + return parquetReader.getPos(); + } - @Override - public float getProgress() throws IOException { - return parquetReader.getProgress(); - } + @Override + public void close() throws IOException { + parquetReader.close(); + } + + @Override + public float getProgress() throws IOException { + return parquetReader.getProgress(); + } } diff --git a/hoodie-hadoop-mr/src/test/java/com/uber/hoodie/hadoop/AnnotationTest.java b/hoodie-hadoop-mr/src/test/java/com/uber/hoodie/hadoop/AnnotationTest.java index 9247582e0..ebe3fd805 100644 --- a/hoodie-hadoop-mr/src/test/java/com/uber/hoodie/hadoop/AnnotationTest.java +++ b/hoodie-hadoop-mr/src/test/java/com/uber/hoodie/hadoop/AnnotationTest.java @@ -17,22 +17,23 @@ package com.uber.hoodie.hadoop; -import org.junit.Test; -import static org.junit.Assert.*; +import static org.junit.Assert.assertTrue; + import java.lang.annotation.Annotation; +import 
org.junit.Test; public class AnnotationTest { - @Test - public void testAnnotation() { - assertTrue(HoodieInputFormat.class.isAnnotationPresent(UseFileSplitsFromInputFormat.class)); - Annotation[] annotations = HoodieInputFormat.class.getAnnotations(); - boolean found = false; - for (Annotation annotation : annotations) { - if ("UseFileSplitsFromInputFormat".equals(annotation.annotationType().getSimpleName())){ - found = true; - } - } - assertTrue(found); + @Test + public void testAnnotation() { + assertTrue(HoodieInputFormat.class.isAnnotationPresent(UseFileSplitsFromInputFormat.class)); + Annotation[] annotations = HoodieInputFormat.class.getAnnotations(); + boolean found = false; + for (Annotation annotation : annotations) { + if ("UseFileSplitsFromInputFormat".equals(annotation.annotationType().getSimpleName())) { + found = true; + } } + assertTrue(found); + } } diff --git a/hoodie-hadoop-mr/src/test/java/com/uber/hoodie/hadoop/HoodieInputFormatTest.java b/hoodie-hadoop-mr/src/test/java/com/uber/hoodie/hadoop/HoodieInputFormatTest.java index 997c91f22..5bedb3023 100644 --- a/hoodie-hadoop-mr/src/test/java/com/uber/hoodie/hadoop/HoodieInputFormatTest.java +++ b/hoodie-hadoop-mr/src/test/java/com/uber/hoodie/hadoop/HoodieInputFormatTest.java @@ -16,222 +16,235 @@ package com.uber.hoodie.hadoop; +import static org.junit.Assert.assertEquals; + import com.uber.hoodie.common.util.FSUtils; +import java.io.File; +import java.io.IOException; import org.apache.avro.Schema; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.io.ArrayWritable; -import org.apache.hadoop.mapred.*; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordReader; import org.junit.Before; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import java.io.File; -import java.io.IOException; - -import static 
org.junit.Assert.assertEquals; - public class HoodieInputFormatTest { - private HoodieInputFormat inputFormat; - private JobConf jobConf; - @Before public void setUp() { - inputFormat = new HoodieInputFormat(); - jobConf = new JobConf(); - inputFormat.setConf(jobConf); - } + private HoodieInputFormat inputFormat; + private JobConf jobConf; - @Rule public TemporaryFolder basePath = new TemporaryFolder(); + @Before + public void setUp() { + inputFormat = new HoodieInputFormat(); + jobConf = new JobConf(); + inputFormat.setConf(jobConf); + } - @Test public void testInputFormatLoad() throws IOException { - // initial commit - File partitionDir = InputFormatTestUtil.prepareDataset(basePath, 10, "100"); - InputFormatTestUtil.commit(basePath, "100"); + @Rule + public TemporaryFolder basePath = new TemporaryFolder(); - // Add the paths - FileInputFormat.setInputPaths(jobConf, partitionDir.getPath()); + @Test + public void testInputFormatLoad() throws IOException { + // initial commit + File partitionDir = InputFormatTestUtil.prepareDataset(basePath, 10, "100"); + InputFormatTestUtil.commit(basePath, "100"); - InputSplit[] inputSplits = inputFormat.getSplits(jobConf, 10); - assertEquals(10, inputSplits.length); + // Add the paths + FileInputFormat.setInputPaths(jobConf, partitionDir.getPath()); - FileStatus[] files = inputFormat.listStatus(jobConf); - assertEquals(10, files.length); - } + InputSplit[] inputSplits = inputFormat.getSplits(jobConf, 10); + assertEquals(10, inputSplits.length); - @Test public void testInputFormatUpdates() throws IOException { - // initial commit - File partitionDir = InputFormatTestUtil.prepareDataset(basePath, 10, "100"); - InputFormatTestUtil.commit(basePath, "100"); + FileStatus[] files = inputFormat.listStatus(jobConf); + assertEquals(10, files.length); + } - // Add the paths - FileInputFormat.setInputPaths(jobConf, partitionDir.getPath()); + @Test + public void testInputFormatUpdates() throws IOException { + // initial commit + File 
partitionDir = InputFormatTestUtil.prepareDataset(basePath, 10, "100"); + InputFormatTestUtil.commit(basePath, "100"); - FileStatus[] files = inputFormat.listStatus(jobConf); - assertEquals(10, files.length); + // Add the paths + FileInputFormat.setInputPaths(jobConf, partitionDir.getPath()); - // update files - InputFormatTestUtil.simulateUpdates(partitionDir, "100", 5, "200", true); - // Before the commit - files = inputFormat.listStatus(jobConf); - assertEquals(10, files.length); - ensureFilesInCommit( - "Commit 200 has not been committed. We should not see files from this commit", files, - "200", 0); - InputFormatTestUtil.commit(basePath, "200"); - files = inputFormat.listStatus(jobConf); - assertEquals(10, files.length); - ensureFilesInCommit( - "5 files have been updated to commit 200. We should see 5 files from commit 200 and 5 files from 100 commit", - files, "200", 5); - ensureFilesInCommit( - "5 files have been updated to commit 200. We should see 5 files from commit 100 and 5 files from 200 commit", - files, "100", 5); - } + FileStatus[] files = inputFormat.listStatus(jobConf); + assertEquals(10, files.length); - @Test public void testIncrementalSimple() throws IOException { - // initial commit - File partitionDir = InputFormatTestUtil.prepareDataset(basePath, 10, "100"); - InputFormatTestUtil.commit(basePath, "100"); + // update files + InputFormatTestUtil.simulateUpdates(partitionDir, "100", 5, "200", true); + // Before the commit + files = inputFormat.listStatus(jobConf); + assertEquals(10, files.length); + ensureFilesInCommit( + "Commit 200 has not been committed. We should not see files from this commit", files, + "200", 0); + InputFormatTestUtil.commit(basePath, "200"); + files = inputFormat.listStatus(jobConf); + assertEquals(10, files.length); + ensureFilesInCommit( + "5 files have been updated to commit 200. 
We should see 5 files from commit 200 and 5 files from 100 commit", + files, "200", 5); + ensureFilesInCommit( + "5 files have been updated to commit 200. We should see 5 files from commit 100 and 5 files from 200 commit", + files, "100", 5); + } - // Add the paths - FileInputFormat.setInputPaths(jobConf, partitionDir.getPath()); + @Test + public void testIncrementalSimple() throws IOException { + // initial commit + File partitionDir = InputFormatTestUtil.prepareDataset(basePath, 10, "100"); + InputFormatTestUtil.commit(basePath, "100"); - InputFormatTestUtil.setupIncremental(jobConf, "100", 1); + // Add the paths + FileInputFormat.setInputPaths(jobConf, partitionDir.getPath()); - FileStatus[] files = inputFormat.listStatus(jobConf); - assertEquals( - "We should exclude commit 100 when returning incremental pull with start commit time as 100", - 0, files.length); - } + InputFormatTestUtil.setupIncremental(jobConf, "100", 1); - @Test public void testIncrementalWithMultipleCommits() throws IOException { - // initial commit - File partitionDir = InputFormatTestUtil.prepareDataset(basePath, 10, "100"); - InputFormatTestUtil.commit(basePath, "100"); - // Add the paths - FileInputFormat.setInputPaths(jobConf, partitionDir.getPath()); - // update files - InputFormatTestUtil.simulateUpdates(partitionDir, "100", 5, "200", false); - InputFormatTestUtil.commit(basePath, "200"); + FileStatus[] files = inputFormat.listStatus(jobConf); + assertEquals( + "We should exclude commit 100 when returning incremental pull with start commit time as 100", + 0, files.length); + } - InputFormatTestUtil.simulateUpdates(partitionDir, "100", 4, "300", false); - InputFormatTestUtil.commit(basePath, "300"); + @Test + public void testIncrementalWithMultipleCommits() throws IOException { + // initial commit + File partitionDir = InputFormatTestUtil.prepareDataset(basePath, 10, "100"); + InputFormatTestUtil.commit(basePath, "100"); + // Add the paths + FileInputFormat.setInputPaths(jobConf, 
partitionDir.getPath()); + // update files + InputFormatTestUtil.simulateUpdates(partitionDir, "100", 5, "200", false); + InputFormatTestUtil.commit(basePath, "200"); - InputFormatTestUtil.simulateUpdates(partitionDir, "100", 3, "400", false); - InputFormatTestUtil.commit(basePath, "400"); + InputFormatTestUtil.simulateUpdates(partitionDir, "100", 4, "300", false); + InputFormatTestUtil.commit(basePath, "300"); - InputFormatTestUtil.simulateUpdates(partitionDir, "100", 2, "500", false); - InputFormatTestUtil.commit(basePath, "500"); + InputFormatTestUtil.simulateUpdates(partitionDir, "100", 3, "400", false); + InputFormatTestUtil.commit(basePath, "400"); - InputFormatTestUtil.simulateUpdates(partitionDir, "100", 1, "600", false); - InputFormatTestUtil.commit(basePath, "600"); + InputFormatTestUtil.simulateUpdates(partitionDir, "100", 2, "500", false); + InputFormatTestUtil.commit(basePath, "500"); - InputFormatTestUtil.setupIncremental(jobConf, "100", 1); - FileStatus[] files = inputFormat.listStatus(jobConf); - assertEquals("Pulling 1 commit from 100, should get us the 5 files committed at 200", 5, - files.length); - ensureFilesInCommit("Pulling 1 commit from 100, should get us the 5 files committed at 200", - files, "200", 5); + InputFormatTestUtil.simulateUpdates(partitionDir, "100", 1, "600", false); + InputFormatTestUtil.commit(basePath, "600"); - InputFormatTestUtil.setupIncremental(jobConf, "100", 3); - files = inputFormat.listStatus(jobConf); + InputFormatTestUtil.setupIncremental(jobConf, "100", 1); + FileStatus[] files = inputFormat.listStatus(jobConf); + assertEquals("Pulling 1 commit from 100, should get us the 5 files committed at 200", 5, + files.length); + ensureFilesInCommit("Pulling 1 commit from 100, should get us the 5 files committed at 200", + files, "200", 5); - assertEquals( - "Pulling 3 commits from 100, should get us the 3 files from 400 commit, 1 file from 300 commit and 1 file from 200 commit", - 5, files.length); - 
ensureFilesInCommit("Pulling 3 commits from 100, should get us the 3 files from 400 commit", - files, "400", 3); - ensureFilesInCommit("Pulling 3 commits from 100, should get us the 1 files from 300 commit", - files, "300", 1); - ensureFilesInCommit("Pulling 3 commits from 100, should get us the 1 files from 200 commit", - files, "200", 1); + InputFormatTestUtil.setupIncremental(jobConf, "100", 3); + files = inputFormat.listStatus(jobConf); - InputFormatTestUtil.setupIncremental(jobConf, "100", HoodieHiveUtil.MAX_COMMIT_ALL); - files = inputFormat.listStatus(jobConf); + assertEquals( + "Pulling 3 commits from 100, should get us the 3 files from 400 commit, 1 file from 300 commit and 1 file from 200 commit", + 5, files.length); + ensureFilesInCommit("Pulling 3 commits from 100, should get us the 3 files from 400 commit", + files, "400", 3); + ensureFilesInCommit("Pulling 3 commits from 100, should get us the 1 files from 300 commit", + files, "300", 1); + ensureFilesInCommit("Pulling 3 commits from 100, should get us the 1 files from 200 commit", + files, "200", 1); - assertEquals( - "Pulling all commits from 100, should get us the 1 file from each of 200,300,400,500,400 commits", - 5, files.length); - ensureFilesInCommit( - "Pulling all commits from 100, should get us the 1 files from 600 commit", files, "600", - 1); - ensureFilesInCommit( - "Pulling all commits from 100, should get us the 1 files from 500 commit", files, "500", - 1); - ensureFilesInCommit( - "Pulling all commits from 100, should get us the 1 files from 400 commit", files, "400", - 1); - ensureFilesInCommit( - "Pulling all commits from 100, should get us the 1 files from 300 commit", files, "300", - 1); - ensureFilesInCommit( - "Pulling all commits from 100, should get us the 1 files from 200 commit", files, "200", - 1); - } + InputFormatTestUtil.setupIncremental(jobConf, "100", HoodieHiveUtil.MAX_COMMIT_ALL); + files = inputFormat.listStatus(jobConf); - //TODO enable this after enabling predicate 
pushdown - public void testPredicatePushDown() throws IOException { - // initial commit - Schema schema = InputFormatTestUtil.readSchema("/sample1.avro"); - String commit1 = "20160628071126"; - File partitionDir = - InputFormatTestUtil.prepareParquetDataset(basePath, schema, 1, 10, commit1); - InputFormatTestUtil.commit(basePath, commit1); - // Add the paths - FileInputFormat.setInputPaths(jobConf, partitionDir.getPath()); - // check whether we have 10 records at this point - ensureRecordsInCommit("We need to have 10 records at this point for commit " + commit1, commit1, 10, 10); + assertEquals( + "Pulling all commits from 100, should get us the 1 file from each of 200,300,400,500,400 commits", + 5, files.length); + ensureFilesInCommit( + "Pulling all commits from 100, should get us the 1 files from 600 commit", files, "600", + 1); + ensureFilesInCommit( + "Pulling all commits from 100, should get us the 1 files from 500 commit", files, "500", + 1); + ensureFilesInCommit( + "Pulling all commits from 100, should get us the 1 files from 400 commit", files, "400", + 1); + ensureFilesInCommit( + "Pulling all commits from 100, should get us the 1 files from 300 commit", files, "300", + 1); + ensureFilesInCommit( + "Pulling all commits from 100, should get us the 1 files from 200 commit", files, "200", + 1); + } - // update 2 records in the original parquet file and save it as commit 200 - String commit2 = "20160629193623"; - InputFormatTestUtil.simulateParquetUpdates(partitionDir, schema, commit1, 10, 2, commit2); - InputFormatTestUtil.commit(basePath, commit2); + //TODO enable this after enabling predicate pushdown + public void testPredicatePushDown() throws IOException { + // initial commit + Schema schema = InputFormatTestUtil.readSchema("/sample1.avro"); + String commit1 = "20160628071126"; + File partitionDir = + InputFormatTestUtil.prepareParquetDataset(basePath, schema, 1, 10, commit1); + InputFormatTestUtil.commit(basePath, commit1); + // Add the paths + 
FileInputFormat.setInputPaths(jobConf, partitionDir.getPath()); + // check whether we have 10 records at this point + ensureRecordsInCommit("We need to have 10 records at this point for commit " + commit1, commit1, + 10, 10); - InputFormatTestUtil.setupIncremental(jobConf, commit1, 1); - // check whether we have 2 records at this point - ensureRecordsInCommit( - "We need to have 2 records that was modified at commit " + commit2 + " and no more", commit2, 2, 2); - // Make sure we have the 10 records if we roll back the stattime - InputFormatTestUtil.setupIncremental(jobConf, "0", 2); - ensureRecordsInCommit( - "We need to have 8 records that was modified at commit " + commit1 + " and no more", commit1, 8, 10); - ensureRecordsInCommit( - "We need to have 2 records that was modified at commit " + commit2 + " and no more", commit2, 2, 10); - } + // update 2 records in the original parquet file and save it as commit 200 + String commit2 = "20160629193623"; + InputFormatTestUtil.simulateParquetUpdates(partitionDir, schema, commit1, 10, 2, commit2); + InputFormatTestUtil.commit(basePath, commit2); - private void ensureRecordsInCommit(String msg, String commit, - int expectedNumberOfRecordsInCommit, int totalExpected) throws IOException { - int actualCount = 0; - int totalCount = 0; - InputSplit[] splits = inputFormat.getSplits(jobConf, 1); - for(InputSplit split:splits) { - RecordReader - recordReader = inputFormat.getRecordReader(split, jobConf, null); - Void key = recordReader.createKey(); - ArrayWritable writable = recordReader.createValue(); + InputFormatTestUtil.setupIncremental(jobConf, commit1, 1); + // check whether we have 2 records at this point + ensureRecordsInCommit( + "We need to have 2 records that was modified at commit " + commit2 + " and no more", + commit2, 2, 2); + // Make sure we have the 10 records if we roll back the stattime + InputFormatTestUtil.setupIncremental(jobConf, "0", 2); + ensureRecordsInCommit( + "We need to have 8 records that was 
modified at commit " + commit1 + " and no more", + commit1, 8, 10); + ensureRecordsInCommit( + "We need to have 2 records that was modified at commit " + commit2 + " and no more", + commit2, 2, 10); + } - while(recordReader.next(key, writable)) { - // writable returns an array with [field1, field2, _hoodie_commit_time, _hoodie_commit_seqno] - // Take the commit time and compare with the one we are interested in - if(commit.equals((writable.get()[2]).toString())) { - actualCount++; - } - totalCount++; - } + private void ensureRecordsInCommit(String msg, String commit, + int expectedNumberOfRecordsInCommit, int totalExpected) throws IOException { + int actualCount = 0; + int totalCount = 0; + InputSplit[] splits = inputFormat.getSplits(jobConf, 1); + for (InputSplit split : splits) { + RecordReader + recordReader = inputFormat.getRecordReader(split, jobConf, null); + Void key = recordReader.createKey(); + ArrayWritable writable = recordReader.createValue(); + + while (recordReader.next(key, writable)) { + // writable returns an array with [field1, field2, _hoodie_commit_time, _hoodie_commit_seqno] + // Take the commit time and compare with the one we are interested in + if (commit.equals((writable.get()[2]).toString())) { + actualCount++; } - assertEquals(msg, expectedNumberOfRecordsInCommit, actualCount); - assertEquals(msg, totalExpected, totalCount); + totalCount++; + } } + assertEquals(msg, expectedNumberOfRecordsInCommit, actualCount); + assertEquals(msg, totalExpected, totalCount); + } - public static void ensureFilesInCommit(String msg, FileStatus[] files, String commit, - int expected) { - int count = 0; - for (FileStatus file : files) { - String commitTs = FSUtils.getCommitTime(file.getPath().getName()); - if (commit.equals(commitTs)) { - count++; - } - } - assertEquals(msg, expected, count); + public static void ensureFilesInCommit(String msg, FileStatus[] files, String commit, + int expected) { + int count = 0; + for (FileStatus file : files) { + String 
commitTs = FSUtils.getCommitTime(file.getPath().getName()); + if (commit.equals(commitTs)) { + count++; + } } + assertEquals(msg, expected, count); + } } diff --git a/hoodie-hadoop-mr/src/test/java/com/uber/hoodie/hadoop/InputFormatTestUtil.java b/hoodie-hadoop-mr/src/test/java/com/uber/hoodie/hadoop/InputFormatTestUtil.java index ae57a4fa5..ac14e6484 100644 --- a/hoodie-hadoop-mr/src/test/java/com/uber/hoodie/hadoop/InputFormatTestUtil.java +++ b/hoodie-hadoop-mr/src/test/java/com/uber/hoodie/hadoop/InputFormatTestUtil.java @@ -16,20 +16,10 @@ package com.uber.hoodie.hadoop; -import com.uber.hoodie.avro.MercifulJsonConverter; import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieTestUtils; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.common.util.SchemaTestUtil; -import com.uber.hoodie.common.util.TestRecord; -import org.apache.avro.Schema; -import org.apache.avro.generic.GenericData; -import org.apache.avro.generic.GenericRecord; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.mapred.JobConf; -import org.apache.parquet.avro.AvroParquetWriter; -import org.junit.rules.TemporaryFolder; - import java.io.File; import java.io.FilenameFilter; import java.io.IOException; @@ -37,123 +27,139 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapred.JobConf; +import org.apache.parquet.avro.AvroParquetWriter; +import org.junit.rules.TemporaryFolder; public class InputFormatTestUtil { - public static File prepareDataset(TemporaryFolder basePath, int numberOfFiles, - String commitNumber) throws IOException { - basePath.create(); - HoodieTestUtils.init(basePath.getRoot().toString()); - File partitionPath = basePath.newFolder("2016", "05", "01"); - for (int i = 0; i < numberOfFiles; i++) { - File dataFile = - new 
File(partitionPath, FSUtils.makeDataFileName(commitNumber, 1, "fileid" + i)); - dataFile.createNewFile(); + + public static File prepareDataset(TemporaryFolder basePath, int numberOfFiles, + String commitNumber) throws IOException { + basePath.create(); + HoodieTestUtils.init(basePath.getRoot().toString()); + File partitionPath = basePath.newFolder("2016", "05", "01"); + for (int i = 0; i < numberOfFiles; i++) { + File dataFile = + new File(partitionPath, FSUtils.makeDataFileName(commitNumber, 1, "fileid" + i)); + dataFile.createNewFile(); + } + return partitionPath; + } + + public static void simulateUpdates(File directory, final String originalCommit, + int numberOfFilesUpdated, + String newCommit, boolean randomize) throws IOException { + List dataFiles = Arrays.asList(directory.listFiles(new FilenameFilter() { + @Override + public boolean accept(File dir, String name) { + String commitTs = FSUtils.getCommitTime(name); + return originalCommit.equals(commitTs); + } + })); + if (randomize) { + Collections.shuffle(dataFiles); + } + List toUpdateList = + dataFiles.subList(0, Math.min(numberOfFilesUpdated, dataFiles.size())); + for (File file : toUpdateList) { + String fileId = FSUtils.getFileId(file.getName()); + File dataFile = new File(directory, FSUtils.makeDataFileName(newCommit, 1, fileId)); + dataFile.createNewFile(); + } + } + + public static void commit(TemporaryFolder basePath, String commitNumber) throws IOException { + // create the commit + new File(basePath.getRoot().toString() + "/.hoodie/", commitNumber + ".commit").createNewFile(); + } + + public static void setupIncremental(JobConf jobConf, String startCommit, + int numberOfCommitsToPull) { + String modePropertyName = String.format(HoodieHiveUtil.HOODIE_CONSUME_MODE_PATTERN, + HoodieTestUtils.RAW_TRIPS_TEST_NAME); + jobConf.set(modePropertyName, HoodieHiveUtil.INCREMENTAL_SCAN_MODE); + + String startCommitTimestampName = String + .format(HoodieHiveUtil.HOODIE_START_COMMIT_PATTERN, 
HoodieTestUtils.RAW_TRIPS_TEST_NAME); + jobConf.set(startCommitTimestampName, startCommit); + + String maxCommitPulls = String + .format(HoodieHiveUtil.HOODIE_MAX_COMMIT_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME); + jobConf.setInt(maxCommitPulls, numberOfCommitsToPull); + } + + public static Schema readSchema(String location) throws IOException { + return new Schema.Parser().parse(InputFormatTestUtil.class.getResourceAsStream(location)); + } + + public static File prepareParquetDataset(TemporaryFolder basePath, Schema schema, + int numberOfFiles, int numberOfRecords, + String commitNumber) throws IOException { + basePath.create(); + HoodieTestUtils.init(basePath.getRoot().toString()); + File partitionPath = basePath.newFolder("2016", "05", "01"); + AvroParquetWriter parquetWriter; + for (int i = 0; i < numberOfFiles; i++) { + String fileId = FSUtils.makeDataFileName(commitNumber, 1, "fileid" + i); + File dataFile = + new File(partitionPath, fileId); + // dataFile.createNewFile(); + parquetWriter = new AvroParquetWriter(new Path(dataFile.getAbsolutePath()), + schema); + try { + for (GenericRecord record : generateAvroRecords(schema, numberOfRecords, commitNumber, + fileId)) { + parquetWriter.write(record); } - return partitionPath; + } finally { + parquetWriter.close(); + } } + return partitionPath; - public static void simulateUpdates(File directory, final String originalCommit, int numberOfFilesUpdated, - String newCommit, boolean randomize) throws IOException { - List dataFiles = Arrays.asList(directory.listFiles(new FilenameFilter() { - @Override public boolean accept(File dir, String name) { - String commitTs = FSUtils.getCommitTime(name); - return originalCommit.equals(commitTs); - } - })); - if(randomize) { - Collections.shuffle(dataFiles); - } - List toUpdateList = - dataFiles.subList(0, Math.min(numberOfFilesUpdated, dataFiles.size())); - for (File file : toUpdateList) { - String fileId = FSUtils.getFileId(file.getName()); - File dataFile = new 
File(directory, FSUtils.makeDataFileName(newCommit, 1, fileId)); - dataFile.createNewFile(); + } + + private static Iterable generateAvroRecords(Schema schema, + int numberOfRecords, String commitTime, String fileId) throws IOException { + List records = new ArrayList<>(numberOfRecords); + for (int i = 0; i < numberOfRecords; i++) { + records.add(SchemaTestUtil.generateAvroRecordFromJson(schema, i, commitTime, fileId)); + } + return records; + } + + public static void simulateParquetUpdates(File directory, Schema schema, String originalCommit, + int totalNumberOfRecords, int numberOfRecordsToUpdate, + String newCommit) throws IOException { + File fileToUpdate = directory.listFiles(new FilenameFilter() { + @Override + public boolean accept(File dir, String name) { + return name.endsWith("parquet"); + } + })[0]; + String fileId = FSUtils.getFileId(fileToUpdate.getName()); + File dataFile = new File(directory, FSUtils.makeDataFileName(newCommit, 1, fileId)); + AvroParquetWriter parquetWriter = new AvroParquetWriter(new Path(dataFile.getAbsolutePath()), + schema); + try { + for (GenericRecord record : generateAvroRecords(schema, totalNumberOfRecords, + originalCommit, fileId)) { + if (numberOfRecordsToUpdate > 0) { + // update this record + record.put(HoodieRecord.COMMIT_TIME_METADATA_FIELD, newCommit); + String oldSeqNo = (String) record.get(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD); + record.put(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD, + oldSeqNo.replace(originalCommit, newCommit)); + numberOfRecordsToUpdate--; } + parquetWriter.write(record); + } + } finally { + parquetWriter.close(); } - public static void commit(TemporaryFolder basePath, String commitNumber) throws IOException { - // create the commit - new File(basePath.getRoot().toString() + "/.hoodie/", commitNumber + ".commit").createNewFile(); - } - - public static void setupIncremental(JobConf jobConf, String startCommit, int numberOfCommitsToPull) { - String modePropertyName = 
String.format(HoodieHiveUtil.HOODIE_CONSUME_MODE_PATTERN, - HoodieTestUtils.RAW_TRIPS_TEST_NAME); - jobConf.set(modePropertyName, HoodieHiveUtil.INCREMENTAL_SCAN_MODE); - - String startCommitTimestampName = String.format(HoodieHiveUtil.HOODIE_START_COMMIT_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME); - jobConf.set(startCommitTimestampName, startCommit); - - String maxCommitPulls = String.format(HoodieHiveUtil.HOODIE_MAX_COMMIT_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME); - jobConf.setInt(maxCommitPulls, numberOfCommitsToPull); - } - - public static Schema readSchema(String location) throws IOException { - return new Schema.Parser().parse(InputFormatTestUtil.class.getResourceAsStream(location)); - } - - public static File prepareParquetDataset(TemporaryFolder basePath, Schema schema, int numberOfFiles, int numberOfRecords, - String commitNumber) throws IOException { - basePath.create(); - HoodieTestUtils.init(basePath.getRoot().toString()); - File partitionPath = basePath.newFolder("2016", "05", "01"); - AvroParquetWriter parquetWriter; - for (int i = 0; i < numberOfFiles; i++) { - String fileId = FSUtils.makeDataFileName(commitNumber, 1, "fileid" + i); - File dataFile = - new File(partitionPath, fileId); - // dataFile.createNewFile(); - parquetWriter = new AvroParquetWriter(new Path(dataFile.getAbsolutePath()), - schema); - try { - for (GenericRecord record : generateAvroRecords(schema, numberOfRecords, commitNumber, fileId)) { - parquetWriter.write(record); - } - } finally { - parquetWriter.close(); - } - } - return partitionPath; - - } - - private static Iterable generateAvroRecords(Schema schema, int numberOfRecords, String commitTime, String fileId) throws IOException { - List records = new ArrayList<>(numberOfRecords); - for(int i=0;i 0) { - // update this record - record.put(HoodieRecord.COMMIT_TIME_METADATA_FIELD, newCommit); - String oldSeqNo = (String) record.get(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD); - 
record.put(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD, - oldSeqNo.replace(originalCommit, newCommit)); - numberOfRecordsToUpdate--; - } - parquetWriter.write(record); - } - } finally { - parquetWriter.close(); - } - - } + } } diff --git a/hoodie-hadoop-mr/src/test/java/com/uber/hoodie/hadoop/TestHoodieROTablePathFilter.java b/hoodie-hadoop-mr/src/test/java/com/uber/hoodie/hadoop/TestHoodieROTablePathFilter.java index 7470e6bbb..e64f918d4 100644 --- a/hoodie-hadoop-mr/src/test/java/com/uber/hoodie/hadoop/TestHoodieROTablePathFilter.java +++ b/hoodie-hadoop-mr/src/test/java/com/uber/hoodie/hadoop/TestHoodieROTablePathFilter.java @@ -15,63 +15,66 @@ */ package com.uber.hoodie.hadoop; -import com.uber.hoodie.common.model.HoodieTestUtils; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import com.uber.hoodie.common.model.HoodieTestUtils; import com.uber.hoodie.common.table.HoodieTableMetaClient; +import java.io.File; +import java.io.IOException; import org.apache.hadoop.fs.Path; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import java.io.File; -import java.io.IOException; - -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; - /** */ public class TestHoodieROTablePathFilter { - @Test - public void testHoodiePaths() throws IOException { - // Create a temp folder as the base path - HoodieTableMetaClient metaClient = HoodieTestUtils.initOnTemp(); - String basePath = metaClient.getBasePath(); + @Test + public void testHoodiePaths() throws IOException { + // Create a temp folder as the base path + HoodieTableMetaClient metaClient = HoodieTestUtils.initOnTemp(); + String basePath = metaClient.getBasePath(); - HoodieTestUtils.createCommitFiles(basePath, "001", "002"); - HoodieTestUtils.createInflightCommitFiles(basePath, "003"); + HoodieTestUtils.createCommitFiles(basePath, "001", "002"); + HoodieTestUtils.createInflightCommitFiles(basePath, "003"); - 
HoodieTestUtils.createDataFile(basePath, "2017/01/01", "001", "f1"); - HoodieTestUtils.createDataFile(basePath, "2017/01/01", "001", "f2"); - HoodieTestUtils.createDataFile(basePath, "2017/01/01", "001", "f3"); - HoodieTestUtils.createDataFile(basePath, "2017/01/01", "002", "f2"); - HoodieTestUtils.createDataFile(basePath, "2017/01/01", "003", "f3"); + HoodieTestUtils.createDataFile(basePath, "2017/01/01", "001", "f1"); + HoodieTestUtils.createDataFile(basePath, "2017/01/01", "001", "f2"); + HoodieTestUtils.createDataFile(basePath, "2017/01/01", "001", "f3"); + HoodieTestUtils.createDataFile(basePath, "2017/01/01", "002", "f2"); + HoodieTestUtils.createDataFile(basePath, "2017/01/01", "003", "f3"); - HoodieROTablePathFilter pathFilter = new HoodieROTablePathFilter(); - Path partitionPath = new Path("file://" + basePath + File.separator + "2017/01/01"); - assertTrue("Directories should be accepted", pathFilter.accept(partitionPath)); + HoodieROTablePathFilter pathFilter = new HoodieROTablePathFilter(); + Path partitionPath = new Path("file://" + basePath + File.separator + "2017/01/01"); + assertTrue("Directories should be accepted", pathFilter.accept(partitionPath)); - assertTrue(pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "001", "f1")))); - assertFalse(pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "001", "f2")))); - assertTrue(pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "001", "f3")))); - assertTrue(pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "002", "f2")))); - assertFalse(pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "003", "f3")))); - } + assertTrue(pathFilter.accept(new Path( + "file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "001", "f1")))); + assertFalse(pathFilter.accept(new Path( + 
"file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "001", "f2")))); + assertTrue(pathFilter.accept(new Path( + "file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "001", "f3")))); + assertTrue(pathFilter.accept(new Path( + "file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "002", "f2")))); + assertFalse(pathFilter.accept(new Path( + "file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "003", "f3")))); + } - @Test - public void testNonHoodiePaths() throws IOException { - TemporaryFolder folder = new TemporaryFolder(); - folder.create(); - String basePath = folder.getRoot().getAbsolutePath(); - HoodieROTablePathFilter pathFilter = new HoodieROTablePathFilter(); + @Test + public void testNonHoodiePaths() throws IOException { + TemporaryFolder folder = new TemporaryFolder(); + folder.create(); + String basePath = folder.getRoot().getAbsolutePath(); + HoodieROTablePathFilter pathFilter = new HoodieROTablePathFilter(); - String path = basePath + File.separator + "nonhoodiefolder"; - new File(path).mkdirs(); - assertTrue(pathFilter.accept(new Path("file:///" + path))); + String path = basePath + File.separator + "nonhoodiefolder"; + new File(path).mkdirs(); + assertTrue(pathFilter.accept(new Path("file:///" + path))); - path = basePath + File.separator + "nonhoodiefolder/somefile"; - new File(path).createNewFile(); - assertTrue(pathFilter.accept(new Path("file:///" + path))); - } + path = basePath + File.separator + "nonhoodiefolder/somefile"; + new File(path).createNewFile(); + assertTrue(pathFilter.accept(new Path("file:///" + path))); + } } diff --git a/hoodie-hadoop-mr/src/test/java/com/uber/hoodie/hadoop/realtime/HoodieRealtimeRecordReaderTest.java b/hoodie-hadoop-mr/src/test/java/com/uber/hoodie/hadoop/realtime/HoodieRealtimeRecordReaderTest.java index 73595f6eb..84f02b868 100644 --- a/hoodie-hadoop-mr/src/test/java/com/uber/hoodie/hadoop/realtime/HoodieRealtimeRecordReaderTest.java +++ 
b/hoodie-hadoop-mr/src/test/java/com/uber/hoodie/hadoop/realtime/HoodieRealtimeRecordReaderTest.java @@ -19,6 +19,8 @@ package com.uber.hoodie.hadoop.realtime; +import static org.junit.Assert.assertTrue; + import com.google.common.collect.Maps; import com.uber.hoodie.common.model.HoodieLogFile; import com.uber.hoodie.common.model.HoodieTableType; @@ -30,6 +32,13 @@ import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.common.util.HoodieAvroUtils; import com.uber.hoodie.common.util.SchemaTestUtil; import com.uber.hoodie.hadoop.InputFormatTestUtil; +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; import org.apache.avro.Schema; import org.apache.avro.generic.IndexedRecord; import org.apache.hadoop.fs.Path; @@ -52,211 +61,234 @@ import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; - -import static org.junit.Assert.assertTrue; - public class HoodieRealtimeRecordReaderTest { - private JobConf jobConf; + private JobConf jobConf; - @Before - public void setUp() { - jobConf = new JobConf(); + @Before + public void setUp() { + jobConf = new JobConf(); + } + + @Rule + public TemporaryFolder basePath = new TemporaryFolder(); + + private HoodieLogFormat.Writer writeLogFile(File partitionDir, Schema schema, String fileId, + String baseCommit, String newCommit, int numberOfRecords) + throws InterruptedException, IOException { + HoodieLogFormat.Writer writer = HoodieLogFormat.newWriterBuilder() + .onParentPath(new Path(partitionDir.getPath())) + .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId(fileId) + .overBaseCommit(baseCommit).withFs(FSUtils.getFs()).build(); + List records = new ArrayList<>(); 
+ for (int i = 0; i < numberOfRecords; i++) { + records.add(SchemaTestUtil.generateAvroRecordFromJson(schema, i, newCommit, "fileid0")); } + Schema writeSchema = records.get(0).getSchema(); + Map metadata = Maps.newHashMap(); + metadata.put(HoodieLogBlock.LogMetadataType.INSTANT_TIME, newCommit); + HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, writeSchema, metadata); + writer = writer.appendBlock(dataBlock); + long size = writer.getCurrentSize(); + return writer; + } - @Rule - public TemporaryFolder basePath = new TemporaryFolder(); + @Test + public void testReader() throws Exception { + // initial commit + Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema()); + HoodieTestUtils + .initTableType(basePath.getRoot().getAbsolutePath(), HoodieTableType.MERGE_ON_READ); + String commitTime = "100"; + File partitionDir = InputFormatTestUtil + .prepareParquetDataset(basePath, schema, 1, 100, commitTime); + InputFormatTestUtil.commit(basePath, commitTime); + // Add the paths + FileInputFormat.setInputPaths(jobConf, partitionDir.getPath()); - private HoodieLogFormat.Writer writeLogFile(File partitionDir, Schema schema, String fileId, - String baseCommit, String newCommit, int numberOfRecords) throws InterruptedException,IOException { - HoodieLogFormat.Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(new Path(partitionDir.getPath())) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId(fileId) - .overBaseCommit(baseCommit).withFs(FSUtils.getFs()).build(); - List records = new ArrayList<>(); - for(int i=0; i < numberOfRecords; i++) { - records.add(SchemaTestUtil.generateAvroRecordFromJson(schema, i, newCommit, "fileid0")); - } - Schema writeSchema = records.get(0).getSchema(); - Map metadata = Maps.newHashMap(); - metadata.put(HoodieLogBlock.LogMetadataType.INSTANT_TIME, newCommit); - HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, writeSchema, metadata); - writer = 
writer.appendBlock(dataBlock); - long size = writer.getCurrentSize(); - return writer; - } + // update files or generate new log file + String newCommitTime = "101"; + HoodieLogFormat.Writer writer = writeLogFile(partitionDir, schema, "fileid0", commitTime, + newCommitTime, 100); + long size = writer.getCurrentSize(); + writer.close(); + assertTrue("block - size should be > 0", size > 0); - @Test - public void testReader() throws Exception { - // initial commit - Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema()); - HoodieTestUtils.initTableType(basePath.getRoot().getAbsolutePath(), HoodieTableType.MERGE_ON_READ); - String commitTime = "100"; - File partitionDir = InputFormatTestUtil.prepareParquetDataset(basePath, schema, 1, 100, commitTime); - InputFormatTestUtil.commit(basePath, commitTime); - // Add the paths - FileInputFormat.setInputPaths(jobConf, partitionDir.getPath()); + //create a split with baseFile (parquet file written earlier) and new log file(s) + String logFilePath = writer.getLogFile().getPath().toString(); + HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(new FileSplit(new Path(partitionDir + + "/fileid0_1_" + commitTime + ".parquet"), 0, 1, jobConf), basePath.getRoot().getPath(), + Arrays.asList(logFilePath), newCommitTime); - // update files or generate new log file - String newCommitTime = "101"; - HoodieLogFormat.Writer writer = writeLogFile(partitionDir, schema, "fileid0", commitTime, newCommitTime, 100); - long size = writer.getCurrentSize(); - writer.close(); - assertTrue("block - size should be > 0", size > 0); - - //create a split with baseFile (parquet file written earlier) and new log file(s) - String logFilePath = writer.getLogFile().getPath().toString(); - HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(new FileSplit(new Path(partitionDir - + "/fileid0_1_" + commitTime + ".parquet"),0,1,jobConf), basePath.getRoot().getPath(), Arrays.asList(logFilePath), newCommitTime); - - 
//create a RecordReader to be used by HoodieRealtimeRecordReader - RecordReader reader = - new MapredParquetInputFormat(). - getRecordReader(new FileSplit(split.getPath(), 0, - FSUtils.getFs().getLength(split.getPath()), (String[]) null), jobConf, null); - JobConf jobConf = new JobConf(); - List fields = schema.getFields(); - String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(",")); - String postions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(",")); - jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names); - jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions); - jobConf.set("partition_columns", "datestr"); - - //validate record reader compaction - HoodieRealtimeRecordReader recordReader = new HoodieRealtimeRecordReader(split, jobConf, reader); - - //use reader to read base Parquet File and log file, merge in flight and return latest commit - //here all 100 records should be updated, see above - Void key = recordReader.createKey(); - ArrayWritable value = recordReader.createValue(); - while(recordReader.next(key, value)) { - Writable[] values = value.get(); - //check if the record written is with latest commit, here "101" - Assert.assertEquals(values[0].toString(), newCommitTime); - key = recordReader.createKey(); - value = recordReader.createValue(); - } - } - - @Test - public void testReaderWithNestedAndComplexSchema() throws Exception { - // initial commit - Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getComplexEvolvedSchema()); - HoodieTestUtils.initTableType(basePath.getRoot().getAbsolutePath(), HoodieTableType.MERGE_ON_READ); - String commitTime = "100"; - int numberOfRecords = 100; - int numberOfLogRecords = numberOfRecords / 2; - File partitionDir = InputFormatTestUtil.prepareParquetDataset(basePath, schema, 1, numberOfRecords, commitTime); - InputFormatTestUtil.commit(basePath, commitTime); - // Add the paths - 
FileInputFormat.setInputPaths(jobConf, partitionDir.getPath()); - - // update files or generate new log file - String newCommitTime = "101"; - HoodieLogFormat.Writer writer = writeLogFile(partitionDir, schema, "fileid0", commitTime, newCommitTime, numberOfLogRecords); - long size = writer.getCurrentSize(); - writer.close(); - assertTrue("block - size should be > 0", size > 0); - - //create a split with baseFile (parquet file written earlier) and new log file(s) - String logFilePath = writer.getLogFile().getPath().toString(); - HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(new FileSplit(new Path(partitionDir - + "/fileid0_1_" + commitTime + ".parquet"),0,1,jobConf), basePath.getRoot().getPath(), Arrays.asList(logFilePath), newCommitTime); - - //create a RecordReader to be used by HoodieRealtimeRecordReader - RecordReader reader = - new MapredParquetInputFormat(). + //create a RecordReader to be used by HoodieRealtimeRecordReader + RecordReader reader = + new MapredParquetInputFormat(). 
getRecordReader(new FileSplit(split.getPath(), 0, - FSUtils.getFs().getLength(split.getPath()), (String[]) null), jobConf, null); - JobConf jobConf = new JobConf(); - List fields = schema.getFields(); + FSUtils.getFs().getLength(split.getPath()), (String[]) null), jobConf, null); + JobConf jobConf = new JobConf(); + List fields = schema.getFields(); + String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(",")); + String postions = fields.stream().map(f -> String.valueOf(f.pos())) + .collect(Collectors.joining(",")); + jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names); + jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions); + jobConf.set("partition_columns", "datestr"); - String names = fields.stream().map(f -> f.name()).collect(Collectors.joining(",")); - String positions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(",")); - jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names); - jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, positions); - jobConf.set("partition_columns", "datestr"); + //validate record reader compaction + HoodieRealtimeRecordReader recordReader = new HoodieRealtimeRecordReader(split, jobConf, + reader); - // validate record reader compaction - HoodieRealtimeRecordReader recordReader = new HoodieRealtimeRecordReader(split, jobConf, reader); - - // use reader to read base Parquet File and log file, merge in flight and return latest commit - // here the first 50 records should be updated, see above - Void key = recordReader.createKey(); - ArrayWritable value = recordReader.createValue(); - int numRecordsRead = 0; - while (recordReader.next(key, value)) { - int currentRecordNo = numRecordsRead; - ++numRecordsRead; - Writable[] values = value.get(); - String recordCommitTime; - //check if the record written is with latest commit, here "101" - if (numRecordsRead > numberOfLogRecords) { - recordCommitTime = commitTime; - } 
else { - recordCommitTime = newCommitTime; - } - String recordCommitTimeSuffix = "@" + recordCommitTime; - - Assert.assertEquals(values[0].toString(), recordCommitTime); - key = recordReader.createKey(); - value = recordReader.createValue(); - - // Assert type STRING - Assert.assertEquals("test value for field: field1", values[5].toString(), "field" + currentRecordNo); - Assert.assertEquals("test value for field: field2",values[6].toString(), "field" + currentRecordNo + recordCommitTimeSuffix); - Assert.assertEquals("test value for field: name", values[7].toString(), "name" + currentRecordNo); - - // Assert type INT - IntWritable intWritable = (IntWritable)values[8]; - Assert.assertEquals("test value for field: favoriteIntNumber", intWritable.get(), currentRecordNo + recordCommitTime.hashCode()); - - // Assert type LONG - LongWritable longWritable = (LongWritable)values[9]; - Assert.assertEquals("test value for field: favoriteNumber", longWritable.get(), currentRecordNo + recordCommitTime.hashCode()); - - // Assert type FLOAT - FloatWritable floatWritable = (FloatWritable)values[10]; - Assert.assertEquals("test value for field: favoriteFloatNumber", floatWritable.get(), (float)((currentRecordNo + recordCommitTime.hashCode()) / 1024.0), 0); - - // Assert type DOUBLE - DoubleWritable doubleWritable = (DoubleWritable)values[11]; - Assert.assertEquals("test value for field: favoriteDoubleNumber", doubleWritable.get(), (currentRecordNo + recordCommitTime.hashCode()) / 1024.0, 0); - - // Assert type MAP - ArrayWritable mapItem = (ArrayWritable)values[12]; - Writable[] mapItemValues = mapItem.get(); - ArrayWritable mapItemValue1 = (ArrayWritable)mapItemValues[0]; - ArrayWritable mapItemValue2 = (ArrayWritable)mapItemValues[1]; - Assert.assertEquals("test value for field: tags", mapItemValue1.get()[0].toString(), "mapItem1"); - Assert.assertEquals("test value for field: tags", mapItemValue2.get()[0].toString(), "mapItem2"); - ArrayWritable mapItemValue1value = 
(ArrayWritable)mapItemValue1.get()[1]; - ArrayWritable mapItemValue2value = (ArrayWritable)mapItemValue2.get()[1]; - Assert.assertEquals("test value for field: tags", mapItemValue1value.get().length, 2); - Assert.assertEquals("test value for field: tags", mapItemValue2value.get().length, 2); - Assert.assertEquals("test value for field: tags[\"mapItem1\"].item1", mapItemValue1value.get()[0].toString(), "item" + currentRecordNo); - Assert.assertEquals("test value for field: tags[\"mapItem2\"].item1", mapItemValue2value.get()[0].toString(), "item2" + currentRecordNo); - Assert.assertEquals("test value for field: tags[\"mapItem1\"].item2", mapItemValue1value.get()[1].toString(), "item" + currentRecordNo + recordCommitTimeSuffix); - Assert.assertEquals("test value for field: tags[\"mapItem2\"].item2", mapItemValue2value.get()[1].toString(), "item2" + currentRecordNo + recordCommitTimeSuffix); - - // Assert type RECORD - ArrayWritable recordItem = (ArrayWritable)values[13]; - Writable[] nestedRecord = recordItem.get(); - Assert.assertEquals("test value for field: testNestedRecord.isAdmin", ((BooleanWritable)nestedRecord[0]).get(), false); - Assert.assertEquals("test value for field: testNestedRecord.userId", nestedRecord[1].toString(), "UserId" + currentRecordNo + recordCommitTimeSuffix); - - // Assert type ARRAY - ArrayWritable arrayValue = (ArrayWritable)values[14]; - Writable[] arrayValues = arrayValue.get(); - for (int i = 0; i < arrayValues.length; i++) { - Assert.assertEquals("test value for field: stringArray", arrayValues[i].toString(), "stringArray" + i + recordCommitTimeSuffix); - } - } + //use reader to read base Parquet File and log file, merge in flight and return latest commit + //here all 100 records should be updated, see above + Void key = recordReader.createKey(); + ArrayWritable value = recordReader.createValue(); + while (recordReader.next(key, value)) { + Writable[] values = value.get(); + //check if the record written is with latest commit, here 
"101" + Assert.assertEquals(values[0].toString(), newCommitTime); + key = recordReader.createKey(); + value = recordReader.createValue(); } + } + + @Test + public void testReaderWithNestedAndComplexSchema() throws Exception { + // initial commit + Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getComplexEvolvedSchema()); + HoodieTestUtils + .initTableType(basePath.getRoot().getAbsolutePath(), HoodieTableType.MERGE_ON_READ); + String commitTime = "100"; + int numberOfRecords = 100; + int numberOfLogRecords = numberOfRecords / 2; + File partitionDir = InputFormatTestUtil + .prepareParquetDataset(basePath, schema, 1, numberOfRecords, commitTime); + InputFormatTestUtil.commit(basePath, commitTime); + // Add the paths + FileInputFormat.setInputPaths(jobConf, partitionDir.getPath()); + + // update files or generate new log file + String newCommitTime = "101"; + HoodieLogFormat.Writer writer = writeLogFile(partitionDir, schema, "fileid0", commitTime, + newCommitTime, numberOfLogRecords); + long size = writer.getCurrentSize(); + writer.close(); + assertTrue("block - size should be > 0", size > 0); + + //create a split with baseFile (parquet file written earlier) and new log file(s) + String logFilePath = writer.getLogFile().getPath().toString(); + HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(new FileSplit(new Path(partitionDir + + "/fileid0_1_" + commitTime + ".parquet"), 0, 1, jobConf), basePath.getRoot().getPath(), + Arrays.asList(logFilePath), newCommitTime); + + //create a RecordReader to be used by HoodieRealtimeRecordReader + RecordReader reader = + new MapredParquetInputFormat(). 
+ getRecordReader(new FileSplit(split.getPath(), 0, + FSUtils.getFs().getLength(split.getPath()), (String[]) null), jobConf, null); + JobConf jobConf = new JobConf(); + List fields = schema.getFields(); + + String names = fields.stream().map(f -> f.name()).collect(Collectors.joining(",")); + String positions = fields.stream().map(f -> String.valueOf(f.pos())) + .collect(Collectors.joining(",")); + jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names); + jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, positions); + jobConf.set("partition_columns", "datestr"); + + // validate record reader compaction + HoodieRealtimeRecordReader recordReader = new HoodieRealtimeRecordReader(split, jobConf, + reader); + + // use reader to read base Parquet File and log file, merge in flight and return latest commit + // here the first 50 records should be updated, see above + Void key = recordReader.createKey(); + ArrayWritable value = recordReader.createValue(); + int numRecordsRead = 0; + while (recordReader.next(key, value)) { + int currentRecordNo = numRecordsRead; + ++numRecordsRead; + Writable[] values = value.get(); + String recordCommitTime; + //check if the record written is with latest commit, here "101" + if (numRecordsRead > numberOfLogRecords) { + recordCommitTime = commitTime; + } else { + recordCommitTime = newCommitTime; + } + String recordCommitTimeSuffix = "@" + recordCommitTime; + + Assert.assertEquals(values[0].toString(), recordCommitTime); + key = recordReader.createKey(); + value = recordReader.createValue(); + + // Assert type STRING + Assert.assertEquals("test value for field: field1", values[5].toString(), + "field" + currentRecordNo); + Assert.assertEquals("test value for field: field2", values[6].toString(), + "field" + currentRecordNo + recordCommitTimeSuffix); + Assert.assertEquals("test value for field: name", values[7].toString(), + "name" + currentRecordNo); + + // Assert type INT + IntWritable intWritable = (IntWritable) 
values[8]; + Assert.assertEquals("test value for field: favoriteIntNumber", intWritable.get(), + currentRecordNo + recordCommitTime.hashCode()); + + // Assert type LONG + LongWritable longWritable = (LongWritable) values[9]; + Assert.assertEquals("test value for field: favoriteNumber", longWritable.get(), + currentRecordNo + recordCommitTime.hashCode()); + + // Assert type FLOAT + FloatWritable floatWritable = (FloatWritable) values[10]; + Assert.assertEquals("test value for field: favoriteFloatNumber", floatWritable.get(), + (float) ((currentRecordNo + recordCommitTime.hashCode()) / 1024.0), 0); + + // Assert type DOUBLE + DoubleWritable doubleWritable = (DoubleWritable) values[11]; + Assert.assertEquals("test value for field: favoriteDoubleNumber", doubleWritable.get(), + (currentRecordNo + recordCommitTime.hashCode()) / 1024.0, 0); + + // Assert type MAP + ArrayWritable mapItem = (ArrayWritable) values[12]; + Writable[] mapItemValues = mapItem.get(); + ArrayWritable mapItemValue1 = (ArrayWritable) mapItemValues[0]; + ArrayWritable mapItemValue2 = (ArrayWritable) mapItemValues[1]; + Assert.assertEquals("test value for field: tags", mapItemValue1.get()[0].toString(), + "mapItem1"); + Assert.assertEquals("test value for field: tags", mapItemValue2.get()[0].toString(), + "mapItem2"); + ArrayWritable mapItemValue1value = (ArrayWritable) mapItemValue1.get()[1]; + ArrayWritable mapItemValue2value = (ArrayWritable) mapItemValue2.get()[1]; + Assert.assertEquals("test value for field: tags", mapItemValue1value.get().length, 2); + Assert.assertEquals("test value for field: tags", mapItemValue2value.get().length, 2); + Assert.assertEquals("test value for field: tags[\"mapItem1\"].item1", + mapItemValue1value.get()[0].toString(), "item" + currentRecordNo); + Assert.assertEquals("test value for field: tags[\"mapItem2\"].item1", + mapItemValue2value.get()[0].toString(), "item2" + currentRecordNo); + Assert.assertEquals("test value for field: tags[\"mapItem1\"].item2", + 
mapItemValue1value.get()[1].toString(), + "item" + currentRecordNo + recordCommitTimeSuffix); + Assert.assertEquals("test value for field: tags[\"mapItem2\"].item2", + mapItemValue2value.get()[1].toString(), + "item2" + currentRecordNo + recordCommitTimeSuffix); + + // Assert type RECORD + ArrayWritable recordItem = (ArrayWritable) values[13]; + Writable[] nestedRecord = recordItem.get(); + Assert.assertEquals("test value for field: testNestedRecord.isAdmin", + ((BooleanWritable) nestedRecord[0]).get(), false); + Assert + .assertEquals("test value for field: testNestedRecord.userId", nestedRecord[1].toString(), + "UserId" + currentRecordNo + recordCommitTimeSuffix); + + // Assert type ARRAY + ArrayWritable arrayValue = (ArrayWritable) values[14]; + Writable[] arrayValues = arrayValue.get(); + for (int i = 0; i < arrayValues.length; i++) { + Assert.assertEquals("test value for field: stringArray", arrayValues[i].toString(), + "stringArray" + i + recordCommitTimeSuffix); + } + } + } } diff --git a/hoodie-hadoop-mr/src/test/resources/log4j-surefire.properties b/hoodie-hadoop-mr/src/test/resources/log4j-surefire.properties index 1c03f27e6..3613e7d12 100644 --- a/hoodie-hadoop-mr/src/test/resources/log4j-surefire.properties +++ b/hoodie-hadoop-mr/src/test/resources/log4j-surefire.properties @@ -13,11 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. # - log4j.rootLogger=WARN, A1 log4j.category.com.uber=INFO log4j.category.org.apache.parquet.hadoop=WARN - # A1 is set to be a ConsoleAppender. log4j.appender.A1=org.apache.log4j.ConsoleAppender # A1 uses PatternLayout. diff --git a/hoodie-hive/pom.xml b/hoodie-hive/pom.xml index 5fd1f6295..dfefe9bfb 100644 --- a/hoodie-hive/pom.xml +++ b/hoodie-hive/pom.xml @@ -15,7 +15,9 @@ ~ limitations under the License. 
--> - + hoodie com.uber.hoodie diff --git a/hoodie-hive/src/assembly/src.xml b/hoodie-hive/src/assembly/src.xml index adb5044b1..41f7b276d 100644 --- a/hoodie-hive/src/assembly/src.xml +++ b/hoodie-hive/src/assembly/src.xml @@ -15,8 +15,8 @@ --> + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.3 http://maven.apache.org/xsd/assembly-1.1.3.xsd"> jar-with-dependencies jar diff --git a/hoodie-hive/src/main/java/com/uber/hoodie/hive/HiveSyncConfig.java b/hoodie-hive/src/main/java/com/uber/hoodie/hive/HiveSyncConfig.java index 4f40355de..5379580ef 100644 --- a/hoodie-hive/src/main/java/com/uber/hoodie/hive/HiveSyncConfig.java +++ b/hoodie-hive/src/main/java/com/uber/hoodie/hive/HiveSyncConfig.java @@ -19,7 +19,6 @@ package com.uber.hoodie.hive; import com.beust.jcommander.Parameter; - import java.io.Serializable; import java.util.ArrayList; import java.util.List; diff --git a/hoodie-hive/src/main/java/com/uber/hoodie/hive/HiveSyncTool.java b/hoodie-hive/src/main/java/com/uber/hoodie/hive/HiveSyncTool.java index 1268e69e8..088e24a9c 100644 --- a/hoodie-hive/src/main/java/com/uber/hoodie/hive/HiveSyncTool.java +++ b/hoodie-hive/src/main/java/com/uber/hoodie/hive/HiveSyncTool.java @@ -26,6 +26,10 @@ import com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat; import com.uber.hoodie.hive.HoodieHiveClient.PartitionEvent; import com.uber.hoodie.hive.HoodieHiveClient.PartitionEvent.PartitionEventType; import com.uber.hoodie.hive.util.SchemaUtil; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.stream.Collectors; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.metastore.api.Partition; @@ -35,20 +39,14 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import parquet.schema.MessageType; -import java.util.List; -import java.util.Map; -import java.util.Optional; 
-import java.util.stream.Collectors; - /** - * Tool to sync a hoodie HDFS dataset with a hive metastore table. - * Either use it as a api HiveSyncTool.syncHoodieTable(HiveSyncConfig) - * or as a command line java -cp hoodie-hive.jar HiveSyncTool [args] + * Tool to sync a hoodie HDFS dataset with a hive metastore table. Either use it as a api + * HiveSyncTool.syncHoodieTable(HiveSyncConfig) or as a command line java -cp hoodie-hive.jar + * HiveSyncTool [args] * - * This utility will get the schema from the latest commit and will sync hive table schema - * Also this will sync the partitions incrementally - * (all the partitions modified since the last commit) + * This utility will get the schema from the latest commit and will sync hive table schema Also this + * will sync the partitions incrementally (all the partitions modified since the last commit) */ @SuppressWarnings("WeakerAccess") public class HiveSyncTool { @@ -64,7 +62,7 @@ public class HiveSyncTool { } public void syncHoodieTable() { - switch(hoodieHiveClient.getTableType()) { + switch (hoodieHiveClient.getTableType()) { case COPY_ON_WRITE: syncHoodieTable(false); break; @@ -125,15 +123,15 @@ public class HiveSyncTool { // Check and sync schema if (!tableExists) { LOG.info("Table " + cfg.tableName + " is not found. 
Creating it"); - if(!isRealTime) { + if (!isRealTime) { // TODO - RO Table for MOR only after major compaction (UnboundedCompaction is default for now) hoodieHiveClient.createTable(schema, HoodieInputFormat.class.getName(), - MapredParquetOutputFormat.class.getName(), ParquetHiveSerDe.class.getName()); + MapredParquetOutputFormat.class.getName(), ParquetHiveSerDe.class.getName()); } else { - // Custom serde will not work with ALTER TABLE REPLACE COLUMNS - // https://github.com/apache/hive/blob/release-1.1.0/ql/src/java/org/apache/hadoop/hive/ql/exec/DDLTask.java#L3488 - hoodieHiveClient.createTable(schema, HoodieRealtimeInputFormat.class.getName(), - MapredParquetOutputFormat.class.getName(), ParquetHiveSerDe.class.getName()); + // Custom serde will not work with ALTER TABLE REPLACE COLUMNS + // https://github.com/apache/hive/blob/release-1.1.0/ql/src/java/org/apache/hadoop/hive/ql/exec/DDLTask.java#L3488 + hoodieHiveClient.createTable(schema, HoodieRealtimeInputFormat.class.getName(), + MapredParquetOutputFormat.class.getName(), ParquetHiveSerDe.class.getName()); } } else { // Check if the dataset schema has evolved diff --git a/hoodie-hive/src/main/java/com/uber/hoodie/hive/HoodieHiveClient.java b/hoodie-hive/src/main/java/com/uber/hoodie/hive/HoodieHiveClient.java index 5f0dc9337..6fd5019df 100644 --- a/hoodie-hive/src/main/java/com/uber/hoodie/hive/HoodieHiveClient.java +++ b/hoodie-hive/src/main/java/com/uber/hoodie/hive/HoodieHiveClient.java @@ -198,8 +198,8 @@ public class HoodieHiveClient { } /** - * Iterate over the storage partitions and find if there are any new partitions that need - * to be added or updated. Generate a list of PartitionEvent based on the changes required. + * Iterate over the storage partitions and find if there are any new partitions that need to be + * added or updated. Generate a list of PartitionEvent based on the changes required. 
*/ List getPartitionEvents(List tablePartitions, List partitionStoragePartitions) { @@ -297,9 +297,9 @@ public class HoodieHiveClient { } /** - * Gets the schema for a hoodie dataset. - * Depending on the type of table, read from any file written in the latest commit. - * We will assume that the schema has not changed within a single atomic write. + * Gets the schema for a hoodie dataset. Depending on the type of table, read from any file + * written in the latest commit. We will assume that the schema has not changed within a single + * atomic write. * * @return Parquet schema for this dataset */ @@ -313,7 +313,8 @@ public class HoodieHiveClient { .orElseThrow(() -> new InvalidDatasetException(syncConfig.basePath)); HoodieCommitMetadata commitMetadata = HoodieCommitMetadata .fromBytes(activeTimeline.getInstantDetails(lastCommit).get()); - String filePath = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().findAny() + String filePath = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values() + .stream().findAny() .orElseThrow(() -> new IllegalArgumentException( "Could not find any data file written for commit " + lastCommit + ", could not get schema for dataset " + metaClient.getBasePath())); @@ -330,7 +331,8 @@ public class HoodieHiveClient { lastDeltaCommitAfterCompaction = metaClient.getActiveTimeline() .getDeltaCommitTimeline() .filterCompletedInstants() - .findInstantsAfter(lastCompactionCommit.get().getTimestamp(), Integer.MAX_VALUE).lastInstant(); + .findInstantsAfter(lastCompactionCommit.get().getTimestamp(), Integer.MAX_VALUE) + .lastInstant(); } LOG.info("Found the last delta commit after last compaction as " + lastDeltaCommitAfterCompaction); @@ -340,8 +342,9 @@ public class HoodieHiveClient { // read from the log file wrote commitMetadata = HoodieCommitMetadata .fromBytes(activeTimeline.getInstantDetails(lastDeltaCommit).get()); - filePath = 
commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().filter(s -> s.contains( - HoodieLogFile.DELTA_EXTENSION)).findAny() + filePath = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values() + .stream().filter(s -> s.contains( + HoodieLogFile.DELTA_EXTENSION)).findAny() .orElseThrow(() -> new IllegalArgumentException( "Could not find any data file written for commit " + lastDeltaCommit + ", could not get schema for dataset " + metaClient.getBasePath())); @@ -361,10 +364,6 @@ public class HoodieHiveClient { /** * Read schema from a data file from the last compaction commit done. - * - * @param lastCompactionCommitOpt - * @return - * @throws IOException */ @SuppressWarnings("OptionalUsedAsFieldOrParameterType") private MessageType readSchemaFromLastCompaction(Optional lastCompactionCommitOpt) @@ -377,7 +376,8 @@ public class HoodieHiveClient { // Read from the compacted file wrote HoodieCompactionMetadata compactionMetadata = HoodieCompactionMetadata .fromBytes(activeTimeline.getInstantDetails(lastCompactionCommit).get()); - String filePath = compactionMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().findAny() + String filePath = compactionMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values() + .stream().findAny() .orElseThrow(() -> new IllegalArgumentException( "Could not find any data file written for compaction " + lastCompactionCommit + ", could not get schema for dataset " + metaClient.getBasePath())); @@ -386,11 +386,6 @@ public class HoodieHiveClient { /** * Read the schema from the log file on path - * - * @param lastCompactionCommitOpt - * @param path - * @return - * @throws IOException */ @SuppressWarnings("OptionalUsedAsFieldOrParameterType") private MessageType readSchemaFromLogFile(Optional lastCompactionCommitOpt, @@ -422,7 +417,8 @@ public class HoodieHiveClient { + ". 
File does not exist."); } ParquetMetadata fileFooter = - ParquetFileReader.readFooter(fs.getConf(), parquetFilePath, ParquetMetadataConverter.NO_FILTER); + ParquetFileReader + .readFooter(fs.getConf(), parquetFilePath, ParquetMetadataConverter.NO_FILTER); return fileFooter.getFileMetaData().getSchema(); } @@ -530,7 +526,7 @@ public class HoodieHiveClient { if (connection != null) { connection.close(); } - if(client != null) { + if (client != null) { client.close(); } } catch (SQLException e) { diff --git a/hoodie-hive/src/main/java/com/uber/hoodie/hive/HoodieHiveSyncException.java b/hoodie-hive/src/main/java/com/uber/hoodie/hive/HoodieHiveSyncException.java index 8419fdfa7..d490ba061 100644 --- a/hoodie-hive/src/main/java/com/uber/hoodie/hive/HoodieHiveSyncException.java +++ b/hoodie-hive/src/main/java/com/uber/hoodie/hive/HoodieHiveSyncException.java @@ -18,23 +18,23 @@ package com.uber.hoodie.hive; public class HoodieHiveSyncException extends RuntimeException { - public HoodieHiveSyncException() { - super(); - } + public HoodieHiveSyncException() { + super(); + } - public HoodieHiveSyncException(String message) { - super(message); - } + public HoodieHiveSyncException(String message) { + super(message); + } - public HoodieHiveSyncException(String message, Throwable t) { - super(message, t); - } + public HoodieHiveSyncException(String message, Throwable t) { + super(message, t); + } - public HoodieHiveSyncException(Throwable t) { - super(t); - } + public HoodieHiveSyncException(Throwable t) { + super(t); + } - protected static String format(String message, Object... args) { - return String.format(String.valueOf(message), (Object[]) args); - } + protected static String format(String message, Object... 
args) { + return String.format(String.valueOf(message), (Object[]) args); + } } diff --git a/hoodie-hive/src/main/java/com/uber/hoodie/hive/PartitionValueExtractor.java b/hoodie-hive/src/main/java/com/uber/hoodie/hive/PartitionValueExtractor.java index 8ef9a88fd..794c262e3 100644 --- a/hoodie-hive/src/main/java/com/uber/hoodie/hive/PartitionValueExtractor.java +++ b/hoodie-hive/src/main/java/com/uber/hoodie/hive/PartitionValueExtractor.java @@ -21,11 +21,13 @@ package com.uber.hoodie.hive; import java.util.List; /** - * HDFS Path contain hive partition values for the keys it is partitioned on. - * This mapping is not straight forward and requires a pluggable implementation to extract the partition value from HDFS path. + * HDFS Path contain hive partition values for the keys it is partitioned on. This mapping is not + * straight forward and requires a pluggable implementation to extract the partition value from HDFS + * path. * * e.g. Hive table partitioned by datestr=yyyy-mm-dd and hdfs path /app/hoodie/dataset1/YYYY=[yyyy]/MM=[mm]/DD=[dd] */ public interface PartitionValueExtractor { + List extractPartitionValuesInPath(String partitionPath); } diff --git a/hoodie-hive/src/main/java/com/uber/hoodie/hive/SchemaDifference.java b/hoodie-hive/src/main/java/com/uber/hoodie/hive/SchemaDifference.java index 7435e803c..2e3b6c406 100644 --- a/hoodie-hive/src/main/java/com/uber/hoodie/hive/SchemaDifference.java +++ b/hoodie-hive/src/main/java/com/uber/hoodie/hive/SchemaDifference.java @@ -21,88 +21,92 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Lists; import com.google.common.collect.Maps; -import parquet.schema.MessageType; - import java.util.List; import java.util.Map; +import parquet.schema.MessageType; /** * Represents the schema difference between the storage schema and hive table schema */ public class SchemaDifference { + + private final MessageType storageSchema; + private final 
Map tableSchema; + private final List deleteColumns; + private final Map updateColumnTypes; + private final Map addColumnTypes; + + private SchemaDifference(MessageType storageSchema, Map tableSchema, + List deleteColumns, Map updateColumnTypes, + Map addColumnTypes) { + this.storageSchema = storageSchema; + this.tableSchema = tableSchema; + this.deleteColumns = ImmutableList.copyOf(deleteColumns); + this.updateColumnTypes = ImmutableMap.copyOf(updateColumnTypes); + this.addColumnTypes = ImmutableMap.copyOf(addColumnTypes); + } + + public List getDeleteColumns() { + return deleteColumns; + } + + public Map getUpdateColumnTypes() { + return updateColumnTypes; + } + + public Map getAddColumnTypes() { + return addColumnTypes; + } + + @Override + public String toString() { + return Objects.toStringHelper(this).add("deleteColumns", deleteColumns) + .add("updateColumnTypes", updateColumnTypes).add("addColumnTypes", addColumnTypes) + .toString(); + } + + public static Builder newBuilder(MessageType storageSchema, Map tableSchema) { + return new Builder(storageSchema, tableSchema); + } + + public boolean isEmpty() { + return deleteColumns.isEmpty() && updateColumnTypes.isEmpty() && addColumnTypes.isEmpty(); + } + + public static class Builder { + private final MessageType storageSchema; private final Map tableSchema; - private final List deleteColumns; - private final Map updateColumnTypes; - private final Map addColumnTypes; + private List deleteColumns; + private Map updateColumnTypes; + private Map addColumnTypes; - private SchemaDifference(MessageType storageSchema, Map tableSchema, - List deleteColumns, Map updateColumnTypes, Map addColumnTypes) { - this.storageSchema = storageSchema; - this.tableSchema = tableSchema; - this.deleteColumns = ImmutableList.copyOf(deleteColumns); - this.updateColumnTypes = ImmutableMap.copyOf(updateColumnTypes); - this.addColumnTypes = ImmutableMap.copyOf(addColumnTypes); + public Builder(MessageType storageSchema, Map tableSchema) { + 
this.storageSchema = storageSchema; + this.tableSchema = tableSchema; + deleteColumns = Lists.newArrayList(); + updateColumnTypes = Maps.newHashMap(); + addColumnTypes = Maps.newHashMap(); } - public List getDeleteColumns() { - return deleteColumns; + public Builder deleteTableColumn(String column) { + deleteColumns.add(column); + return this; } - public Map getUpdateColumnTypes() { - return updateColumnTypes; + public Builder updateTableColumn(String column, String storageColumnType) { + updateColumnTypes.put(column, storageColumnType); + return this; } - public Map getAddColumnTypes() { - return addColumnTypes; + public Builder addTableColumn(String name, String type) { + addColumnTypes.put(name, type); + return this; } - @Override public String toString() { - return Objects.toStringHelper(this).add("deleteColumns", deleteColumns) - .add("updateColumnTypes", updateColumnTypes).add("addColumnTypes", addColumnTypes) - .toString(); - } - - public static Builder newBuilder(MessageType storageSchema, Map tableSchema) { - return new Builder(storageSchema, tableSchema); - } - - public boolean isEmpty() { - return deleteColumns.isEmpty() && updateColumnTypes.isEmpty() && addColumnTypes.isEmpty(); - } - - public static class Builder { - private final MessageType storageSchema; - private final Map tableSchema; - private List deleteColumns; - private Map updateColumnTypes; - private Map addColumnTypes; - - public Builder(MessageType storageSchema, Map tableSchema) { - this.storageSchema = storageSchema; - this.tableSchema = tableSchema; - deleteColumns = Lists.newArrayList(); - updateColumnTypes = Maps.newHashMap(); - addColumnTypes = Maps.newHashMap(); - } - - public Builder deleteTableColumn(String column) { - deleteColumns.add(column); - return this; - } - - public Builder updateTableColumn(String column, String storageColumnType) { - updateColumnTypes.put(column, storageColumnType); - return this; - } - - public Builder addTableColumn(String name, String type) { - 
addColumnTypes.put(name, type); - return this; - } - - public SchemaDifference build() { - return new SchemaDifference(storageSchema, tableSchema, deleteColumns, updateColumnTypes, addColumnTypes); - } + public SchemaDifference build() { + return new SchemaDifference(storageSchema, tableSchema, deleteColumns, updateColumnTypes, + addColumnTypes); } + } } diff --git a/hoodie-hive/src/main/java/com/uber/hoodie/hive/SlashEncodedDayPartitionValueExtractor.java b/hoodie-hive/src/main/java/com/uber/hoodie/hive/SlashEncodedDayPartitionValueExtractor.java index b3071641b..956bbb8b2 100644 --- a/hoodie-hive/src/main/java/com/uber/hoodie/hive/SlashEncodedDayPartitionValueExtractor.java +++ b/hoodie-hive/src/main/java/com/uber/hoodie/hive/SlashEncodedDayPartitionValueExtractor.java @@ -23,9 +23,11 @@ import java.util.List; import org.joda.time.DateTime; import org.joda.time.format.DateTimeFormat; import org.joda.time.format.DateTimeFormatter; + /** - * HDFS Path contain hive partition values for the keys it is partitioned on. - * This mapping is not straight forward and requires a pluggable implementation to extract the partition value from HDFS path. + * HDFS Path contain hive partition values for the keys it is partitioned on. This mapping is not + * straight forward and requires a pluggable implementation to extract the partition value from HDFS + * path. 
* * This implementation extracts datestr=yyyy-mm-dd from path of type /yyyy/mm/dd */ diff --git a/hoodie-hive/src/main/java/com/uber/hoodie/hive/util/ColumnNameXLator.java b/hoodie-hive/src/main/java/com/uber/hoodie/hive/util/ColumnNameXLator.java index 64049c68e..a06494fe2 100644 --- a/hoodie-hive/src/main/java/com/uber/hoodie/hive/util/ColumnNameXLator.java +++ b/hoodie-hive/src/main/java/com/uber/hoodie/hive/util/ColumnNameXLator.java @@ -17,32 +17,32 @@ package com.uber.hoodie.hive.util; import com.google.common.collect.Maps; - import java.util.Iterator; import java.util.Map; public class ColumnNameXLator { - private static Map xformMap = Maps.newHashMap(); - public static String translateNestedColumn(String colName) { - Map.Entry entry; - for (Iterator i$ = xformMap.entrySet().iterator(); i$.hasNext(); - colName = colName.replaceAll((String) entry.getKey(), (String) entry.getValue())) { - entry = (Map.Entry) i$.next(); - } + private static Map xformMap = Maps.newHashMap(); - return colName; + public static String translateNestedColumn(String colName) { + Map.Entry entry; + for (Iterator i$ = xformMap.entrySet().iterator(); i$.hasNext(); + colName = colName.replaceAll((String) entry.getKey(), (String) entry.getValue())) { + entry = (Map.Entry) i$.next(); } - public static String translateColumn(String colName) { - return colName; - } + return colName; + } - public static String translate(String colName, boolean nestedColumn) { - return !nestedColumn ? translateColumn(colName) : translateNestedColumn(colName); - } + public static String translateColumn(String colName) { + return colName; + } - static { - xformMap.put("\\$", "_dollar_"); - } + public static String translate(String colName, boolean nestedColumn) { + return !nestedColumn ? 
translateColumn(colName) : translateNestedColumn(colName); + } + + static { + xformMap.put("\\$", "_dollar_"); + } } diff --git a/hoodie-hive/src/main/java/com/uber/hoodie/hive/util/SchemaUtil.java b/hoodie-hive/src/main/java/com/uber/hoodie/hive/util/SchemaUtil.java index 2a05ed1cf..9f16c777f 100644 --- a/hoodie-hive/src/main/java/com/uber/hoodie/hive/util/SchemaUtil.java +++ b/hoodie-hive/src/main/java/com/uber/hoodie/hive/util/SchemaUtil.java @@ -21,6 +21,10 @@ import com.google.common.collect.Sets; import com.uber.hoodie.hive.HiveSyncConfig; import com.uber.hoodie.hive.HoodieHiveSyncException; import com.uber.hoodie.hive.SchemaDifference; +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.Set; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import parquet.schema.DecimalMetadata; @@ -30,404 +34,386 @@ import parquet.schema.OriginalType; import parquet.schema.PrimitiveType; import parquet.schema.Type; -import java.io.IOException; -import java.util.List; -import java.util.Map; -import java.util.Set; - /** * Schema Utilities */ public class SchemaUtil { - private static Logger LOG = LoggerFactory.getLogger(SchemaUtil.class); - /** - * Get the schema difference between the storage schema and hive table schema - * - * @param storageSchema - * @param tableSchema - * @param partitionKeys - * @return - */ - public static SchemaDifference getSchemaDifference(MessageType storageSchema, - Map tableSchema, List partitionKeys) { - Map newTableSchema; - try { - newTableSchema = convertParquetSchemaToHiveSchema(storageSchema); - } catch (IOException e) { - throw new HoodieHiveSyncException("Failed to convert parquet schema to hive schema", - e); + private static Logger LOG = LoggerFactory.getLogger(SchemaUtil.class); + + /** + * Get the schema difference between the storage schema and hive table schema + */ + public static SchemaDifference getSchemaDifference(MessageType storageSchema, + Map tableSchema, List partitionKeys) { 
+ Map newTableSchema; + try { + newTableSchema = convertParquetSchemaToHiveSchema(storageSchema); + } catch (IOException e) { + throw new HoodieHiveSyncException("Failed to convert parquet schema to hive schema", + e); + } + LOG.info("Getting schema difference for " + tableSchema + "\r\n\r\n" + newTableSchema); + SchemaDifference.Builder schemaDiffBuilder = + SchemaDifference.newBuilder(storageSchema, tableSchema); + Set tableColumns = Sets.newHashSet(); + + for (Map.Entry field : tableSchema.entrySet()) { + String fieldName = field.getKey().toLowerCase(); + String tickSurroundedFieldName = tickSurround(fieldName); + if (!isFieldExistsInSchema(newTableSchema, tickSurroundedFieldName) && !partitionKeys + .contains(fieldName)) { + schemaDiffBuilder.deleteTableColumn(fieldName); + } else { + // check type + String tableColumnType = field.getValue(); + if (!isFieldExistsInSchema(newTableSchema, tickSurroundedFieldName)) { + if (partitionKeys.contains(fieldName)) { + // Partition key does not have to be part of the storage schema + continue; + } + // We will log this and continue. 
Hive schema is a superset of all parquet schemas + LOG.warn("Ignoring table column " + fieldName + + " as its not present in the parquet schema"); + continue; } - LOG.info("Getting schema difference for " + tableSchema + "\r\n\r\n" + newTableSchema); - SchemaDifference.Builder schemaDiffBuilder = - SchemaDifference.newBuilder(storageSchema, tableSchema); - Set tableColumns = Sets.newHashSet(); + tableColumnType = tableColumnType.replaceAll("\\s+", ""); - for (Map.Entry field : tableSchema.entrySet()) { - String fieldName = field.getKey().toLowerCase(); - String tickSurroundedFieldName = tickSurround(fieldName); - if (!isFieldExistsInSchema(newTableSchema, tickSurroundedFieldName) && !partitionKeys.contains(fieldName)) { - schemaDiffBuilder.deleteTableColumn(fieldName); - } else { - // check type - String tableColumnType = field.getValue(); - if (!isFieldExistsInSchema(newTableSchema, tickSurroundedFieldName)) { - if (partitionKeys.contains(fieldName)) { - // Partition key does not have to be part of the storage schema - continue; - } - // We will log this and continue. 
Hive schema is a superset of all parquet schemas - LOG.warn("Ignoring table column " + fieldName - + " as its not present in the parquet schema"); - continue; - } - tableColumnType = tableColumnType.replaceAll("\\s+", ""); + String expectedType = getExpectedType(newTableSchema, tickSurroundedFieldName); + expectedType = expectedType.replaceAll("\\s+", ""); + expectedType = expectedType.replaceAll("`", ""); - String expectedType = getExpectedType(newTableSchema, tickSurroundedFieldName); - expectedType = expectedType.replaceAll("\\s+", ""); - expectedType = expectedType.replaceAll("`", ""); - - if (!tableColumnType.equalsIgnoreCase(expectedType)) { - // check for incremental datasets, the schema type change is allowed as per evolution rules - if (!isSchemaTypeUpdateAllowed(tableColumnType, expectedType)) { - throw new HoodieHiveSyncException( - "Could not convert field Type from " + tableColumnType + " to " - + expectedType + " for field " + fieldName); - } - schemaDiffBuilder.updateTableColumn(fieldName, - getExpectedType(newTableSchema, tickSurroundedFieldName)); - } - } - tableColumns.add(tickSurroundedFieldName); + if (!tableColumnType.equalsIgnoreCase(expectedType)) { + // check for incremental datasets, the schema type change is allowed as per evolution rules + if (!isSchemaTypeUpdateAllowed(tableColumnType, expectedType)) { + throw new HoodieHiveSyncException( + "Could not convert field Type from " + tableColumnType + " to " + + expectedType + " for field " + fieldName); + } + schemaDiffBuilder.updateTableColumn(fieldName, + getExpectedType(newTableSchema, tickSurroundedFieldName)); } - - for (Map.Entry entry : newTableSchema.entrySet()) { - if (!tableColumns.contains(entry.getKey().toLowerCase())) { - schemaDiffBuilder.addTableColumn(entry.getKey(), entry.getValue()); - } - } - LOG.info("Difference between schemas: " + schemaDiffBuilder.build().toString()); - - return schemaDiffBuilder.build(); + } + tableColumns.add(tickSurroundedFieldName); } - private 
static String getExpectedType(Map newTableSchema, String fieldName) { - for (Map.Entry entry : newTableSchema.entrySet()) { - if (entry.getKey().toLowerCase().equals(fieldName)) { - return entry.getValue(); - } - } - return null; + for (Map.Entry entry : newTableSchema.entrySet()) { + if (!tableColumns.contains(entry.getKey().toLowerCase())) { + schemaDiffBuilder.addTableColumn(entry.getKey(), entry.getValue()); + } } + LOG.info("Difference between schemas: " + schemaDiffBuilder.build().toString()); - private static boolean isFieldExistsInSchema(Map newTableSchema, - String fieldName) { - for (String entry : newTableSchema.keySet()) { - if (entry.toLowerCase().equals(fieldName)) { - return true; - } - } - return false; + return schemaDiffBuilder.build(); + } + + private static String getExpectedType(Map newTableSchema, String fieldName) { + for (Map.Entry entry : newTableSchema.entrySet()) { + if (entry.getKey().toLowerCase().equals(fieldName)) { + return entry.getValue(); + } } + return null; + } + + private static boolean isFieldExistsInSchema(Map newTableSchema, + String fieldName) { + for (String entry : newTableSchema.keySet()) { + if (entry.toLowerCase().equals(fieldName)) { + return true; + } + } + return false; + } - /** - * Returns equivalent Hive table schema read from a parquet file - * - * @param messageType : Parquet Schema - * @return : Hive Table schema read from parquet file MAP[String,String] - * @throws IOException - */ - public static Map convertParquetSchemaToHiveSchema(MessageType messageType) - throws IOException { - Map schema = Maps.newLinkedHashMap(); - List parquetFields = messageType.getFields(); - for (Type parquetType : parquetFields) { - StringBuilder result = new StringBuilder(); - String key = parquetType.getName(); - if (parquetType.isRepetition(Type.Repetition.REPEATED)) { - result.append(createHiveArray(parquetType, "")); - } else { - result.append(convertField(parquetType)); + /** + * Returns equivalent Hive table schema read 
from a parquet file + * + * @param messageType : Parquet Schema + * @return : Hive Table schema read from parquet file MAP[String,String] + */ + public static Map convertParquetSchemaToHiveSchema(MessageType messageType) + throws IOException { + Map schema = Maps.newLinkedHashMap(); + List parquetFields = messageType.getFields(); + for (Type parquetType : parquetFields) { + StringBuilder result = new StringBuilder(); + String key = parquetType.getName(); + if (parquetType.isRepetition(Type.Repetition.REPEATED)) { + result.append(createHiveArray(parquetType, "")); + } else { + result.append(convertField(parquetType)); + } + + schema.put(hiveCompatibleFieldName(key, false), result.toString()); + } + return schema; + } + + /** + * Convert one field data type of parquet schema into an equivalent Hive schema + * + * @param parquetType : Single paruet field + * @return : Equivalent sHive schema + */ + private static String convertField(final Type parquetType) { + StringBuilder field = new StringBuilder(); + if (parquetType.isPrimitive()) { + final PrimitiveType.PrimitiveTypeName parquetPrimitiveTypeName = + parquetType.asPrimitiveType().getPrimitiveTypeName(); + final OriginalType originalType = parquetType.getOriginalType(); + if (originalType == OriginalType.DECIMAL) { + final DecimalMetadata decimalMetadata = + parquetType.asPrimitiveType().getDecimalMetadata(); + return field.append("DECIMAL(").append(decimalMetadata.getPrecision()). 
+ append(" , ").append(decimalMetadata.getScale()).append(")").toString(); + } + // TODO - fix the method naming here + return parquetPrimitiveTypeName + .convert(new PrimitiveType.PrimitiveTypeNameConverter() { + @Override + public String convertBOOLEAN( + PrimitiveType.PrimitiveTypeName primitiveTypeName) { + return "boolean"; } - schema.put(hiveCompatibleFieldName(key, false), result.toString()); - } - return schema; - } - - /** - * Convert one field data type of parquet schema into an equivalent Hive - * schema - * - * @param parquetType : Single paruet field - * @return : Equivalent sHive schema - */ - private static String convertField(final Type parquetType) { - StringBuilder field = new StringBuilder(); - if (parquetType.isPrimitive()) { - final PrimitiveType.PrimitiveTypeName parquetPrimitiveTypeName = - parquetType.asPrimitiveType().getPrimitiveTypeName(); - final OriginalType originalType = parquetType.getOriginalType(); - if (originalType == OriginalType.DECIMAL) { - final DecimalMetadata decimalMetadata = - parquetType.asPrimitiveType().getDecimalMetadata(); - return field.append("DECIMAL(").append(decimalMetadata.getPrecision()). 
- append(" , ").append(decimalMetadata.getScale()).append(")").toString(); + @Override + public String convertINT32(PrimitiveType.PrimitiveTypeName primitiveTypeName) { + return "int"; } - // TODO - fix the method naming here - return parquetPrimitiveTypeName - .convert(new PrimitiveType.PrimitiveTypeNameConverter() { - @Override - public String convertBOOLEAN( - PrimitiveType.PrimitiveTypeName primitiveTypeName) { - return "boolean"; - } - @Override - public String convertINT32(PrimitiveType.PrimitiveTypeName primitiveTypeName) { - return "int"; - } - - @Override - public String convertINT64(PrimitiveType.PrimitiveTypeName primitiveTypeName) { - return "bigint"; - } - - @Override - public String convertINT96(PrimitiveType.PrimitiveTypeName primitiveTypeName) { - return "timestamp-millis"; - } - - @Override - public String convertFLOAT(PrimitiveType.PrimitiveTypeName primitiveTypeName) { - return "float"; - } - - @Override - public String convertDOUBLE(PrimitiveType.PrimitiveTypeName primitiveTypeName) { - return "double"; - } - - @Override - public String convertFIXED_LEN_BYTE_ARRAY( - PrimitiveType.PrimitiveTypeName primitiveTypeName) { - return "binary"; - } - - @Override - public String convertBINARY(PrimitiveType.PrimitiveTypeName primitiveTypeName) { - if (originalType == OriginalType.UTF8 - || originalType == OriginalType.ENUM) { - return "string"; - } else { - return "binary"; - } - } - }); - } else { - GroupType parquetGroupType = parquetType.asGroupType(); - OriginalType originalType = parquetGroupType.getOriginalType(); - if (originalType != null) { - switch (originalType) { - case LIST: - if (parquetGroupType.getFieldCount() != 1) { - throw new UnsupportedOperationException( - "Invalid list type " + parquetGroupType); - } - Type elementType = parquetGroupType.getType(0); - if (!elementType.isRepetition(Type.Repetition.REPEATED)) { - throw new UnsupportedOperationException( - "Invalid list type " + parquetGroupType); - } - return 
createHiveArray(elementType, parquetGroupType.getName()); - case MAP: - if (parquetGroupType.getFieldCount() != 1 || parquetGroupType.getType(0) - .isPrimitive()) { - throw new UnsupportedOperationException( - "Invalid map type " + parquetGroupType); - } - GroupType mapKeyValType = parquetGroupType.getType(0).asGroupType(); - if (!mapKeyValType.isRepetition(Type.Repetition.REPEATED) || - !mapKeyValType.getOriginalType().equals(OriginalType.MAP_KEY_VALUE) || - mapKeyValType.getFieldCount() != 2) { - throw new UnsupportedOperationException( - "Invalid map type " + parquetGroupType); - } - Type keyType = mapKeyValType.getType(0); - if (!keyType.isPrimitive() || - !keyType.asPrimitiveType().getPrimitiveTypeName() - .equals(PrimitiveType.PrimitiveTypeName.BINARY) || - !keyType.getOriginalType().equals(OriginalType.UTF8)) { - throw new UnsupportedOperationException( - "Map key type must be binary (UTF8): " + keyType); - } - Type valueType = mapKeyValType.getType(1); - return createHiveMap(convertField(keyType), convertField(valueType)); - case ENUM: - case UTF8: - return "string"; - case MAP_KEY_VALUE: - // MAP_KEY_VALUE was supposed to be used to annotate key and - // value group levels in a - // MAP. However, that is always implied by the structure of - // MAP. Hence, PARQUET-113 - // dropped the requirement for having MAP_KEY_VALUE. 
- default: - throw new UnsupportedOperationException( - "Cannot convert Parquet type " + parquetType); - } - } else { - // if no original type then it's a record - return createHiveStruct(parquetGroupType.getFields()); + @Override + public String convertINT64(PrimitiveType.PrimitiveTypeName primitiveTypeName) { + return "bigint"; } - } - } - /** - * Return a 'struct' Hive schema from a list of Parquet fields - * - * @param parquetFields : list of parquet fields - * @return : Equivalent 'struct' Hive schema - */ - private static String createHiveStruct(List parquetFields) { - StringBuilder struct = new StringBuilder(); - struct.append("STRUCT< "); - for (Type field : parquetFields) { - //TODO: struct field name is only translated to support special char($) - //We will need to extend it to other collection type - struct.append(hiveCompatibleFieldName(field.getName(), true)).append(" : "); - struct.append(convertField(field)).append(", "); - } - struct.delete(struct.length() - 2, struct.length()); // Remove the last - // ", " - struct.append(">"); - String finalStr = struct.toString(); - // Struct cannot have - in them. userstore_udr_entities has uuid in struct. This breaks the schema. - // HDrone sync should not fail because of this. 
- finalStr = finalStr.replaceAll("-", "_"); - return finalStr; - } - - - private static String hiveCompatibleFieldName(String fieldName, boolean isNested) { - String result = fieldName; - if (isNested) { - result = ColumnNameXLator.translateNestedColumn(fieldName); - } - return tickSurround(result); - } - - private static String tickSurround(String result) { - if (!result.startsWith("`")) { - result = "`" + result; - } - if (!result.endsWith("`")) { - result = result + "`"; - } - return result; - } - - /** - * Create a 'Map' schema from Parquet map field - * - * @param keyType - * @param valueType - * @return - */ - private static String createHiveMap(String keyType, String valueType) { - return "MAP< " + keyType + ", " + valueType + ">"; - } - - /** - * Create an Array Hive schema from equivalent parquet list type - * - * @param elementType - * @param elementName - * @return - */ - private static String createHiveArray(Type elementType, String elementName) { - StringBuilder array = new StringBuilder(); - array.append("ARRAY< "); - if (elementType.isPrimitive()) { - array.append(convertField(elementType)); - } else { - final GroupType groupType = elementType.asGroupType(); - final List groupFields = groupType.getFields(); - if (groupFields.size() > 1 || (groupFields.size() == 1 && ( - elementType.getName().equals("array") || elementType.getName() - .equals(elementName + "_tuple")))) { - array.append(convertField(elementType)); - } else { - array.append(convertField(groupType.getFields().get(0))); + @Override + public String convertINT96(PrimitiveType.PrimitiveTypeName primitiveTypeName) { + return "timestamp-millis"; } + + @Override + public String convertFLOAT(PrimitiveType.PrimitiveTypeName primitiveTypeName) { + return "float"; + } + + @Override + public String convertDOUBLE(PrimitiveType.PrimitiveTypeName primitiveTypeName) { + return "double"; + } + + @Override + public String convertFIXED_LEN_BYTE_ARRAY( + PrimitiveType.PrimitiveTypeName primitiveTypeName) { 
+ return "binary"; + } + + @Override + public String convertBINARY(PrimitiveType.PrimitiveTypeName primitiveTypeName) { + if (originalType == OriginalType.UTF8 + || originalType == OriginalType.ENUM) { + return "string"; + } else { + return "binary"; + } + } + }); + } else { + GroupType parquetGroupType = parquetType.asGroupType(); + OriginalType originalType = parquetGroupType.getOriginalType(); + if (originalType != null) { + switch (originalType) { + case LIST: + if (parquetGroupType.getFieldCount() != 1) { + throw new UnsupportedOperationException( + "Invalid list type " + parquetGroupType); + } + Type elementType = parquetGroupType.getType(0); + if (!elementType.isRepetition(Type.Repetition.REPEATED)) { + throw new UnsupportedOperationException( + "Invalid list type " + parquetGroupType); + } + return createHiveArray(elementType, parquetGroupType.getName()); + case MAP: + if (parquetGroupType.getFieldCount() != 1 || parquetGroupType.getType(0) + .isPrimitive()) { + throw new UnsupportedOperationException( + "Invalid map type " + parquetGroupType); + } + GroupType mapKeyValType = parquetGroupType.getType(0).asGroupType(); + if (!mapKeyValType.isRepetition(Type.Repetition.REPEATED) || + !mapKeyValType.getOriginalType().equals(OriginalType.MAP_KEY_VALUE) || + mapKeyValType.getFieldCount() != 2) { + throw new UnsupportedOperationException( + "Invalid map type " + parquetGroupType); + } + Type keyType = mapKeyValType.getType(0); + if (!keyType.isPrimitive() || + !keyType.asPrimitiveType().getPrimitiveTypeName() + .equals(PrimitiveType.PrimitiveTypeName.BINARY) || + !keyType.getOriginalType().equals(OriginalType.UTF8)) { + throw new UnsupportedOperationException( + "Map key type must be binary (UTF8): " + keyType); + } + Type valueType = mapKeyValType.getType(1); + return createHiveMap(convertField(keyType), convertField(valueType)); + case ENUM: + case UTF8: + return "string"; + case MAP_KEY_VALUE: + // MAP_KEY_VALUE was supposed to be used to annotate key and + // 
value group levels in a + // MAP. However, that is always implied by the structure of + // MAP. Hence, PARQUET-113 + // dropped the requirement for having MAP_KEY_VALUE. + default: + throw new UnsupportedOperationException( + "Cannot convert Parquet type " + parquetType); } - array.append(">"); - return array.toString(); + } else { + // if no original type then it's a record + return createHiveStruct(parquetGroupType.getFields()); + } + } + } + + /** + * Return a 'struct' Hive schema from a list of Parquet fields + * + * @param parquetFields : list of parquet fields + * @return : Equivalent 'struct' Hive schema + */ + private static String createHiveStruct(List parquetFields) { + StringBuilder struct = new StringBuilder(); + struct.append("STRUCT< "); + for (Type field : parquetFields) { + //TODO: struct field name is only translated to support special char($) + //We will need to extend it to other collection type + struct.append(hiveCompatibleFieldName(field.getName(), true)).append(" : "); + struct.append(convertField(field)).append(", "); + } + struct.delete(struct.length() - 2, struct.length()); // Remove the last + // ", " + struct.append(">"); + String finalStr = struct.toString(); + // Struct cannot have - in them. userstore_udr_entities has uuid in struct. This breaks the schema. + // HDrone sync should not fail because of this. 
+ finalStr = finalStr.replaceAll("-", "_"); + return finalStr; + } + + + private static String hiveCompatibleFieldName(String fieldName, boolean isNested) { + String result = fieldName; + if (isNested) { + result = ColumnNameXLator.translateNestedColumn(fieldName); + } + return tickSurround(result); + } + + private static String tickSurround(String result) { + if (!result.startsWith("`")) { + result = "`" + result; + } + if (!result.endsWith("`")) { + result = result + "`"; + } + return result; + } + + /** + * Create a 'Map' schema from Parquet map field + */ + private static String createHiveMap(String keyType, String valueType) { + return "MAP< " + keyType + ", " + valueType + ">"; + } + + /** + * Create an Array Hive schema from equivalent parquet list type + */ + private static String createHiveArray(Type elementType, String elementName) { + StringBuilder array = new StringBuilder(); + array.append("ARRAY< "); + if (elementType.isPrimitive()) { + array.append(convertField(elementType)); + } else { + final GroupType groupType = elementType.asGroupType(); + final List groupFields = groupType.getFields(); + if (groupFields.size() > 1 || (groupFields.size() == 1 && ( + elementType.getName().equals("array") || elementType.getName() + .equals(elementName + "_tuple")))) { + array.append(convertField(elementType)); + } else { + array.append(convertField(groupType.getFields().get(0))); + } + } + array.append(">"); + return array.toString(); + } + + public static boolean isSchemaTypeUpdateAllowed(String prevType, String newType) { + if (prevType == null || prevType.trim().isEmpty() || + newType == null || newType.trim().isEmpty()) { + return false; + } + prevType = prevType.toLowerCase(); + newType = newType.toLowerCase(); + if (prevType.equals(newType)) { + return true; + } else if (prevType.equalsIgnoreCase("int") && newType.equalsIgnoreCase("bigint")) { + return true; + } else if (prevType.equalsIgnoreCase("float") && newType.equalsIgnoreCase("double")) { + return 
true; + } else if (prevType.contains("struct") && newType.toLowerCase().contains("struct")) { + return true; + } + return false; + } + + public static String generateSchemaString(MessageType storageSchema) throws IOException { + Map hiveSchema = convertParquetSchemaToHiveSchema(storageSchema); + StringBuilder columns = new StringBuilder(); + for (Map.Entry hiveSchemaEntry : hiveSchema.entrySet()) { + columns.append(hiveSchemaEntry.getKey()).append(" "); + columns.append(hiveSchemaEntry.getValue()).append(", "); + } + // Remove the last ", " + columns.delete(columns.length() - 2, columns.length()); + return columns.toString(); + } + + public static String generateCreateDDL(MessageType storageSchema, + HiveSyncConfig config, String inputFormatClass, + String outputFormatClass, String serdeClass) throws IOException { + Map hiveSchema = convertParquetSchemaToHiveSchema(storageSchema); + String columns = generateSchemaString(storageSchema); + + StringBuilder partitionFields = new StringBuilder(); + for (String partitionKey : config.partitionFields) { + partitionFields.append(partitionKey).append(" ") + .append(getPartitionKeyType(hiveSchema, partitionKey)); } - public static boolean isSchemaTypeUpdateAllowed(String prevType, String newType) { - if (prevType == null || prevType.trim().isEmpty() || - newType == null || newType.trim().isEmpty()) { - return false; - } - prevType = prevType.toLowerCase(); - newType = newType.toLowerCase(); - if (prevType.equals(newType)) { - return true; - } else if (prevType.equalsIgnoreCase("int") && newType.equalsIgnoreCase("bigint")) { - return true; - } else if (prevType.equalsIgnoreCase("float") && newType.equalsIgnoreCase("double")) { - return true; - } else if (prevType.contains("struct") && newType.toLowerCase().contains("struct")) { - return true; - } - return false; + StringBuilder sb = new StringBuilder("CREATE EXTERNAL TABLE IF NOT EXISTS "); + sb = sb.append(config.databaseName).append(".").append(config.tableName); + sb = 
sb.append("( ").append(columns).append(")"); + if (!config.partitionFields.isEmpty()) { + sb = sb.append(" PARTITIONED BY (").append(partitionFields).append(")"); } + sb = sb.append(" ROW FORMAT SERDE '").append(serdeClass).append("'"); + sb = sb.append(" STORED AS INPUTFORMAT '").append(inputFormatClass).append("'"); + sb = sb.append(" OUTPUTFORMAT '").append(outputFormatClass).append("' LOCATION '") + .append(config.basePath).append("'"); + return sb.toString(); + } - public static String generateSchemaString(MessageType storageSchema) throws IOException { - Map hiveSchema = convertParquetSchemaToHiveSchema(storageSchema); - StringBuilder columns = new StringBuilder(); - for (Map.Entry hiveSchemaEntry : hiveSchema.entrySet()) { - columns.append(hiveSchemaEntry.getKey()).append(" "); - columns.append(hiveSchemaEntry.getValue()).append(", "); - } - // Remove the last ", " - columns.delete(columns.length() - 2, columns.length()); - return columns.toString(); - } - - public static String generateCreateDDL(MessageType storageSchema, - HiveSyncConfig config, String inputFormatClass, - String outputFormatClass, String serdeClass) throws IOException { - Map hiveSchema = convertParquetSchemaToHiveSchema(storageSchema); - String columns = generateSchemaString(storageSchema); - - StringBuilder partitionFields = new StringBuilder(); - for (String partitionKey : config.partitionFields) { - partitionFields.append(partitionKey).append(" ") - .append(getPartitionKeyType(hiveSchema, partitionKey)); - } - - StringBuilder sb = new StringBuilder("CREATE EXTERNAL TABLE IF NOT EXISTS "); - sb = sb.append(config.databaseName).append(".").append(config.tableName); - sb = sb.append("( ").append(columns).append(")"); - if (!config.partitionFields.isEmpty()) { - sb = sb.append(" PARTITIONED BY (").append(partitionFields).append(")"); - } - sb = sb.append(" ROW FORMAT SERDE '").append(serdeClass).append("'"); - sb = sb.append(" STORED AS INPUTFORMAT '").append(inputFormatClass).append("'"); 
- sb = sb.append(" OUTPUTFORMAT '").append(outputFormatClass).append("' LOCATION '") - .append(config.basePath).append("'"); - return sb.toString(); - } - - private static String getPartitionKeyType(Map hiveSchema, String partitionKey) { - if (hiveSchema.containsKey(partitionKey)) { - return hiveSchema.get(partitionKey); - } - // Default the unknown partition fields to be String - // TODO - all partition fields should be part of the schema. datestr is treated as special. Dont do that - return "String"; + private static String getPartitionKeyType(Map hiveSchema, String partitionKey) { + if (hiveSchema.containsKey(partitionKey)) { + return hiveSchema.get(partitionKey); } + // Default the unknown partition fields to be String + // TODO - all partition fields should be part of the schema. datestr is treated as special. Dont do that + return "String"; + } } diff --git a/hoodie-hive/src/test/java/com/uber/hoodie/hive/HiveSyncToolTest.java b/hoodie-hive/src/test/java/com/uber/hoodie/hive/HiveSyncToolTest.java index 398d6e0a8..5250a660d 100644 --- a/hoodie-hive/src/test/java/com/uber/hoodie/hive/HiveSyncToolTest.java +++ b/hoodie-hive/src/test/java/com/uber/hoodie/hive/HiveSyncToolTest.java @@ -18,7 +18,9 @@ package com.uber.hoodie.hive; -import static org.junit.Assert.*; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; import com.uber.hoodie.common.util.SchemaTestUtil; import com.uber.hoodie.hive.HoodieHiveClient.PartitionEvent; @@ -52,9 +54,8 @@ public class HiveSyncToolTest { } /** - * Testing converting array types to Hive field declaration strings, - * according to the Parquet-113 spec: - * https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists + * Testing converting array types to Hive field declaration strings, according to the Parquet-113 + * spec: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists */ @Test public void 
testSchemaConvertArray() throws IOException { @@ -274,7 +275,8 @@ public class HiveSyncToolTest { assertTrue("Table " + TestUtil.hiveSyncConfig.tableName + " should exist after sync completes", hiveClient.doesTableExist()); assertEquals("Hive Schema should match the dataset schema + partition field", - hiveClient.getTableSchema().size(), SchemaTestUtil.getSimpleSchema().getFields().size() + 1); + hiveClient.getTableSchema().size(), + SchemaTestUtil.getSimpleSchema().getFields().size() + 1); assertEquals("Table partitions should match the number of partitions we wrote", 5, hiveClient.scanTablePartitions().size()); assertEquals("The last commit that was sycned should be updated in the TBLPROPERTIES", @@ -296,7 +298,8 @@ public class HiveSyncToolTest { TestUtil.getHiveConf(), TestUtil.fileSystem); assertEquals("Hive Schema should match the evolved dataset schema + partition field", - hiveClient.getTableSchema().size(), SchemaTestUtil.getEvolvedSchema().getFields().size() + 1); + hiveClient.getTableSchema().size(), + SchemaTestUtil.getEvolvedSchema().getFields().size() + 1); // Sync should add the one partition assertEquals("The 2 partitions we wrote should be added to hive", 6, hiveClient.scanTablePartitions().size()); @@ -307,33 +310,37 @@ public class HiveSyncToolTest { @Test public void testSyncMergeOnReadRT() - throws IOException, InitializationError, URISyntaxException, TException, InterruptedException { + throws IOException, InitializationError, URISyntaxException, TException, InterruptedException { String commitTime = "100"; String deltaCommitTime = "101"; String roTablename = TestUtil.hiveSyncConfig.tableName; - TestUtil.hiveSyncConfig.tableName = TestUtil.hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_REALTIME_TABLE; + TestUtil.hiveSyncConfig.tableName = + TestUtil.hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_REALTIME_TABLE; TestUtil.createMORDataset(commitTime, deltaCommitTime, 5); HoodieHiveClient hiveClientRT = new 
HoodieHiveClient(TestUtil.hiveSyncConfig, - TestUtil.getHiveConf(), TestUtil.fileSystem); + TestUtil.getHiveConf(), TestUtil.fileSystem); - assertFalse("Table " + TestUtil.hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_REALTIME_TABLE + " should not exist initially", - hiveClientRT.doesTableExist()); + assertFalse("Table " + TestUtil.hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_REALTIME_TABLE + + " should not exist initially", + hiveClientRT.doesTableExist()); // Lets do the sync HiveSyncTool tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), - TestUtil.fileSystem); + TestUtil.fileSystem); tool.syncHoodieTable(); - assertTrue("Table " + TestUtil.hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_REALTIME_TABLE + " should exist after sync completes", - hiveClientRT.doesTableExist()); + assertTrue("Table " + TestUtil.hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_REALTIME_TABLE + + " should exist after sync completes", + hiveClientRT.doesTableExist()); assertEquals("Hive Schema should match the dataset schema + partition field", - hiveClientRT.getTableSchema().size(), SchemaTestUtil.getSimpleSchema().getFields().size() + 1); + hiveClientRT.getTableSchema().size(), + SchemaTestUtil.getSimpleSchema().getFields().size() + 1); assertEquals("Table partitions should match the number of partitions we wrote", 5, - hiveClientRT.scanTablePartitions().size()); + hiveClientRT.scanTablePartitions().size()); assertEquals("The last commit that was sycned should be updated in the TBLPROPERTIES", - deltaCommitTime, - hiveClientRT.getLastCommitTimeSynced().get()); + deltaCommitTime, + hiveClientRT.getLastCommitTimeSynced().get()); // Now lets create more parititions and these are the only ones which needs to be synced DateTime dateTime = DateTime.now().plusDays(6); @@ -344,20 +351,21 @@ public class HiveSyncToolTest { TestUtil.addMORPartitions(1, true, false, dateTime, commitTime2, deltaCommitTime2); // Lets do the sync tool = new 
HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), - TestUtil.fileSystem); + TestUtil.fileSystem); tool.syncHoodieTable(); hiveClientRT = new HoodieHiveClient(TestUtil.hiveSyncConfig, - TestUtil.getHiveConf(), TestUtil.fileSystem); + TestUtil.getHiveConf(), TestUtil.fileSystem); assertEquals("Hive Schema should match the evolved dataset schema + partition field", - hiveClientRT.getTableSchema().size(), SchemaTestUtil.getEvolvedSchema().getFields().size() + 1); + hiveClientRT.getTableSchema().size(), + SchemaTestUtil.getEvolvedSchema().getFields().size() + 1); // Sync should add the one partition assertEquals("The 2 partitions we wrote should be added to hive", 6, - hiveClientRT.scanTablePartitions().size()); + hiveClientRT.scanTablePartitions().size()); assertEquals("The last commit that was sycned should be 103", - deltaCommitTime2, - hiveClientRT.getLastCommitTimeSynced().get()); + deltaCommitTime2, + hiveClientRT.getLastCommitTimeSynced().get()); TestUtil.hiveSyncConfig.tableName = roTablename; } -} \ No newline at end of file +} diff --git a/hoodie-hive/src/test/java/com/uber/hoodie/hive/TestUtil.java b/hoodie-hive/src/test/java/com/uber/hoodie/hive/TestUtil.java index bb7d6e3cc..2707377af 100644 --- a/hoodie-hive/src/test/java/com/uber/hoodie/hive/TestUtil.java +++ b/hoodie-hive/src/test/java/com/uber/hoodie/hive/TestUtil.java @@ -16,6 +16,9 @@ package com.uber.hoodie.hive; +import static com.uber.hoodie.common.model.HoodieTestUtils.DEFAULT_TASK_PARTITIONID; +import static org.junit.Assert.fail; + import com.google.common.collect.Lists; import com.google.common.collect.Maps; import com.google.common.collect.Sets; @@ -41,6 +44,15 @@ import com.uber.hoodie.common.table.log.block.HoodieLogBlock; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.common.util.SchemaTestUtil; import com.uber.hoodie.hive.util.HiveTestService; +import java.io.File; +import java.io.IOException; +import java.net.URISyntaxException; +import 
java.nio.charset.StandardCharsets; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; +import java.util.UUID; import org.apache.avro.Schema; import org.apache.avro.generic.IndexedRecord; import org.apache.commons.io.FileUtils; @@ -60,19 +72,6 @@ import org.joda.time.format.DateTimeFormat; import org.joda.time.format.DateTimeFormatter; import org.junit.runners.model.InitializationError; -import java.io.File; -import java.io.IOException; -import java.net.URISyntaxException; -import java.nio.charset.StandardCharsets; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Set; -import java.util.UUID; - -import static com.uber.hoodie.common.model.HoodieTestUtils.DEFAULT_TASK_PARTITIONID; -import static org.junit.Assert.fail; - @SuppressWarnings("SameParameterValue") public class TestUtil { @@ -161,7 +160,8 @@ public class TestUtil { boolean result = fileSystem.mkdirs(path); checkResult(result); DateTime dateTime = DateTime.now(); - HoodieCommitMetadata commitMetadata = createPartitions(numberOfPartitions, true, dateTime, commitTime); + HoodieCommitMetadata commitMetadata = createPartitions(numberOfPartitions, true, dateTime, + commitTime); createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName); createCommitFile(commitMetadata, commitTime); } @@ -177,16 +177,19 @@ public class TestUtil { boolean result = fileSystem.mkdirs(path); checkResult(result); DateTime dateTime = DateTime.now(); - HoodieCommitMetadata commitMetadata = createPartitions(numberOfPartitions, true, dateTime, commitTime); + HoodieCommitMetadata commitMetadata = createPartitions(numberOfPartitions, true, dateTime, + commitTime); createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName); - createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_REALTIME_TABLE); + createdTablesSet.add(hiveSyncConfig.databaseName + "." 
+ hiveSyncConfig.tableName + + HiveSyncTool.SUFFIX_REALTIME_TABLE); HoodieCompactionMetadata compactionMetadata = new HoodieCompactionMetadata(); commitMetadata.getPartitionToWriteStats() .forEach((key, value) -> value.stream().map(k -> new CompactionWriteStat(k, key, 0, 0, 0)) .forEach(l -> compactionMetadata.addWriteStat(key, l))); createCompactionCommitFile(compactionMetadata, commitTime); // Write a delta commit - HoodieCommitMetadata deltaMetadata = createLogFiles(commitMetadata.getPartitionToWriteStats(), true); + HoodieCommitMetadata deltaMetadata = createLogFiles(commitMetadata.getPartitionToWriteStats(), + true); createDeltaCommitFile(deltaMetadata, deltaCommitTime); } @@ -206,18 +209,20 @@ public class TestUtil { HoodieCommitMetadata commitMetadata = createPartitions(numberOfPartitions, isParquetSchemaSimple, startFrom, commitTime); createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName); - createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_REALTIME_TABLE); + createdTablesSet.add(hiveSyncConfig.databaseName + "." 
+ hiveSyncConfig.tableName + + HiveSyncTool.SUFFIX_REALTIME_TABLE); HoodieCompactionMetadata compactionMetadata = new HoodieCompactionMetadata(); commitMetadata.getPartitionToWriteStats() .forEach((key, value) -> value.stream().map(k -> new CompactionWriteStat(k, key, 0, 0, 0)) .forEach(l -> compactionMetadata.addWriteStat(key, l))); createCompactionCommitFile(compactionMetadata, commitTime); - HoodieCommitMetadata deltaMetadata = createLogFiles(commitMetadata.getPartitionToWriteStats(), isLogSchemaSimple); + HoodieCommitMetadata deltaMetadata = createLogFiles(commitMetadata.getPartitionToWriteStats(), + isLogSchemaSimple); createDeltaCommitFile(deltaMetadata, deltaCommitTime); } private static HoodieCommitMetadata createLogFiles( - Map> partitionWriteStats, boolean isLogSchemaSimple) + Map> partitionWriteStats, boolean isLogSchemaSimple) throws InterruptedException, IOException, URISyntaxException { HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata(); for (Entry> wEntry : partitionWriteStats.entrySet()) { @@ -246,7 +251,8 @@ public class TestUtil { Path partPath = new Path(hiveSyncConfig.basePath + "/" + partitionPath); fileSystem.makeQualified(partPath); fileSystem.mkdirs(partPath); - List writeStats = createTestData(partPath, isParquetSchemaSimple, commitTime); + List writeStats = createTestData(partPath, isParquetSchemaSimple, + commitTime); startFrom = startFrom.minusDays(1); writeStats.forEach(s -> commitMetadata.addWriteStat(partitionPath, s)); } diff --git a/hoodie-hive/src/test/java/com/uber/hoodie/hive/util/HiveTestService.java b/hoodie-hive/src/test/java/com/uber/hoodie/hive/util/HiveTestService.java index e9faa4536..26ed1b0f9 100644 --- a/hoodie-hive/src/test/java/com/uber/hoodie/hive/util/HiveTestService.java +++ b/hoodie-hive/src/test/java/com/uber/hoodie/hive/util/HiveTestService.java @@ -20,6 +20,13 @@ package com.uber.hoodie.hive.util; import com.google.common.base.Preconditions; import com.google.common.collect.Maps; import 
com.google.common.io.Files; +import java.io.File; +import java.io.IOException; +import java.net.InetSocketAddress; +import java.net.SocketException; +import java.util.Map; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; import org.apache.commons.io.FileUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; @@ -46,277 +53,274 @@ import org.apache.thrift.transport.TTransportFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.File; -import java.io.IOException; -import java.net.InetSocketAddress; -import java.net.SocketException; -import java.util.Map; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; - public class HiveTestService { - private static final Logger LOG = LoggerFactory.getLogger(HiveTestService.class); + private static final Logger LOG = LoggerFactory.getLogger(HiveTestService.class); - private static final int CONNECTION_TIMEOUT = 30000; + private static final int CONNECTION_TIMEOUT = 30000; - /** - * Configuration settings - */ - private Configuration hadoopConf; - private String workDir; - private String bindIP = "127.0.0.1"; - private int metastorePort = 9083; - private int serverPort = 9999; - private boolean clean = true; + /** + * Configuration settings + */ + private Configuration hadoopConf; + private String workDir; + private String bindIP = "127.0.0.1"; + private int metastorePort = 9083; + private int serverPort = 9999; + private boolean clean = true; - private Map sysProps = Maps.newHashMap(); - private ExecutorService executorService; - private TServer tServer; - private HiveServer2 hiveServer; + private Map sysProps = Maps.newHashMap(); + private ExecutorService executorService; + private TServer tServer; + private HiveServer2 hiveServer; - public HiveTestService(Configuration configuration) { - this.workDir = Files.createTempDir().getAbsolutePath(); + public HiveTestService(Configuration configuration) { + 
this.workDir = Files.createTempDir().getAbsolutePath(); + } + + public Configuration getHadoopConf() { + return hadoopConf; + } + + public HiveServer2 start() throws IOException { + Preconditions + .checkState(workDir != null, "The work dir must be set before starting cluster."); + + if (hadoopConf == null) { + hadoopConf = new Configuration(); } - public Configuration getHadoopConf() { - return hadoopConf; + String localHiveLocation = getHiveLocation(workDir); + if (clean) { + LOG.info( + "Cleaning Hive cluster data at: " + localHiveLocation + " and starting fresh."); + File file = new File(localHiveLocation); + FileUtils.deleteDirectory(file); } - public HiveServer2 start() throws IOException { - Preconditions - .checkState(workDir != null, "The work dir must be set before starting cluster."); + HiveConf serverConf = configureHive(hadoopConf, localHiveLocation); - if (hadoopConf == null) { - hadoopConf = new Configuration(); - } + executorService = Executors.newSingleThreadExecutor(); + tServer = startMetaStore(bindIP, metastorePort, serverConf); - String localHiveLocation = getHiveLocation(workDir); - if (clean) { - LOG.info( - "Cleaning Hive cluster data at: " + localHiveLocation + " and starting fresh."); - File file = new File(localHiveLocation); - FileUtils.deleteDirectory(file); - } + hiveServer = startHiveServer(serverConf); - HiveConf serverConf = configureHive(hadoopConf, localHiveLocation); - - executorService = Executors.newSingleThreadExecutor(); - tServer = startMetaStore(bindIP, metastorePort, serverConf); - - hiveServer = startHiveServer(serverConf); - - String serverHostname; - if (bindIP.equals("0.0.0.0")) { - serverHostname = "localhost"; - } else { - serverHostname = bindIP; - } - if (!waitForServerUp(serverConf, serverHostname, metastorePort, CONNECTION_TIMEOUT)) { - throw new IOException("Waiting for startup of standalone server"); - } - - LOG.info("Hive Minicluster service started."); - return hiveServer; + String serverHostname; + if 
(bindIP.equals("0.0.0.0")) { + serverHostname = "localhost"; + } else { + serverHostname = bindIP; + } + if (!waitForServerUp(serverConf, serverHostname, metastorePort, CONNECTION_TIMEOUT)) { + throw new IOException("Waiting for startup of standalone server"); } - public void stop() throws IOException { - resetSystemProperties(); - if (tServer != null) { - tServer.stop(); - } - if (hiveServer != null) { - hiveServer.stop(); - } - LOG.info("Hive Minicluster service shut down."); - tServer = null; - hiveServer = null; - hadoopConf = null; + LOG.info("Hive Minicluster service started."); + return hiveServer; + } + + public void stop() throws IOException { + resetSystemProperties(); + if (tServer != null) { + tServer.stop(); + } + if (hiveServer != null) { + hiveServer.stop(); + } + LOG.info("Hive Minicluster service shut down."); + tServer = null; + hiveServer = null; + hadoopConf = null; + } + + private HiveConf configureHive(Configuration conf, String localHiveLocation) + throws IOException { + conf.set("hive.metastore.local", "false"); + conf.set(HiveConf.ConfVars.METASTOREURIS.varname, + "thrift://" + bindIP + ":" + metastorePort); + conf.set(HiveConf.ConfVars.HIVE_SERVER2_THRIFT_BIND_HOST.varname, bindIP); + conf.setInt(HiveConf.ConfVars.HIVE_SERVER2_THRIFT_PORT.varname, serverPort); + // The following line to turn of SASL has no effect since HiveAuthFactory calls + // 'new HiveConf()'. This is fixed by https://issues.apache.org/jira/browse/HIVE-6657, + // in Hive 0.14. + // As a workaround, the property is set in hive-site.xml in this module. 
+ //conf.set(HiveConf.ConfVars.HIVE_SERVER2_AUTHENTICATION.varname, "NOSASL"); + File localHiveDir = new File(localHiveLocation); + localHiveDir.mkdirs(); + File metastoreDbDir = new File(localHiveDir, "metastore_db"); + conf.set(HiveConf.ConfVars.METASTORECONNECTURLKEY.varname, + "jdbc:derby:" + metastoreDbDir.getPath() + ";create=true"); + File derbyLogFile = new File(localHiveDir, "derby.log"); + derbyLogFile.createNewFile(); + setSystemProperty("derby.stream.error.file", derbyLogFile.getPath()); + conf.set(HiveConf.ConfVars.METASTOREWAREHOUSE.varname, + Files.createTempDir().getAbsolutePath()); + + return new HiveConf(conf, this.getClass()); + } + + private boolean waitForServerUp(HiveConf serverConf, String hostname, int port, int timeout) { + long start = System.currentTimeMillis(); + while (true) { + try { + new HiveMetaStoreClient(serverConf); + return true; + } catch (MetaException e) { + // ignore as this is expected + LOG.info("server " + hostname + ":" + port + " not up " + e); + } + + if (System.currentTimeMillis() > start + timeout) { + break; + } + try { + Thread.sleep(250); + } catch (InterruptedException e) { + // ignore + } + } + return false; + } + + private void setSystemProperty(String name, String value) { + if (!sysProps.containsKey(name)) { + String currentValue = System.getProperty(name); + sysProps.put(name, currentValue); + } + if (value != null) { + System.setProperty(name, value); + } else { + System.getProperties().remove(name); + } + } + + private void resetSystemProperties() { + for (Map.Entry entry : sysProps.entrySet()) { + if (entry.getValue() != null) { + System.setProperty(entry.getKey(), entry.getValue()); + } else { + System.getProperties().remove(entry.getKey()); + } + } + sysProps.clear(); + } + + private static String getHiveLocation(String baseLocation) { + return baseLocation + Path.SEPARATOR + "hive"; + } + + private HiveServer2 startHiveServer(HiveConf serverConf) { + HiveServer2 hiveServer = new HiveServer2(); + 
hiveServer.init(serverConf); + hiveServer.start(); + return hiveServer; + } + + // XXX: From org.apache.hadoop.hive.metastore.HiveMetaStore, + // with changes to support binding to a specified IP address (not only 0.0.0.0) + + + private static final class ChainedTTransportFactory extends TTransportFactory { + + private final TTransportFactory parentTransFactory; + private final TTransportFactory childTransFactory; + + private ChainedTTransportFactory(TTransportFactory parentTransFactory, + TTransportFactory childTransFactory) { + this.parentTransFactory = parentTransFactory; + this.childTransFactory = childTransFactory; } - private HiveConf configureHive(Configuration conf, String localHiveLocation) - throws IOException { - conf.set("hive.metastore.local", "false"); - conf.set(HiveConf.ConfVars.METASTOREURIS.varname, - "thrift://" + bindIP + ":" + metastorePort); - conf.set(HiveConf.ConfVars.HIVE_SERVER2_THRIFT_BIND_HOST.varname, bindIP); - conf.setInt(HiveConf.ConfVars.HIVE_SERVER2_THRIFT_PORT.varname, serverPort); - // The following line to turn of SASL has no effect since HiveAuthFactory calls - // 'new HiveConf()'. This is fixed by https://issues.apache.org/jira/browse/HIVE-6657, - // in Hive 0.14. - // As a workaround, the property is set in hive-site.xml in this module. 
- //conf.set(HiveConf.ConfVars.HIVE_SERVER2_AUTHENTICATION.varname, "NOSASL"); - File localHiveDir = new File(localHiveLocation); - localHiveDir.mkdirs(); - File metastoreDbDir = new File(localHiveDir, "metastore_db"); - conf.set(HiveConf.ConfVars.METASTORECONNECTURLKEY.varname, - "jdbc:derby:" + metastoreDbDir.getPath() + ";create=true"); - File derbyLogFile = new File(localHiveDir, "derby.log"); - derbyLogFile.createNewFile(); - setSystemProperty("derby.stream.error.file", derbyLogFile.getPath()); - conf.set(HiveConf.ConfVars.METASTOREWAREHOUSE.varname, - Files.createTempDir().getAbsolutePath()); + @Override + public TTransport getTransport(TTransport trans) { + return childTransFactory.getTransport(parentTransFactory.getTransport(trans)); + } + } - return new HiveConf(conf, this.getClass()); + + private static final class TServerSocketKeepAlive extends TServerSocket { + + public TServerSocketKeepAlive(int port) throws TTransportException { + super(port, 0); } - private boolean waitForServerUp(HiveConf serverConf, String hostname, int port, int timeout) { - long start = System.currentTimeMillis(); - while (true) { - try { - new HiveMetaStoreClient(serverConf); - return true; - } catch (MetaException e) { - // ignore as this is expected - LOG.info("server " + hostname + ":" + port + " not up " + e); - } - - if (System.currentTimeMillis() > start + timeout) { - break; - } - try { - Thread.sleep(250); - } catch (InterruptedException e) { - // ignore - } - } - return false; + public TServerSocketKeepAlive(InetSocketAddress address) throws TTransportException { + super(address, 0); } - private void setSystemProperty(String name, String value) { - if (!sysProps.containsKey(name)) { - String currentValue = System.getProperty(name); - sysProps.put(name, currentValue); - } - if (value != null) { - System.setProperty(name, value); - } else { - System.getProperties().remove(name); - } + @Override + protected TSocket acceptImpl() throws TTransportException { + TSocket ts = 
super.acceptImpl(); + try { + ts.getSocket().setKeepAlive(true); + } catch (SocketException e) { + throw new TTransportException(e); + } + return ts; } + } - private void resetSystemProperties() { - for (Map.Entry entry : sysProps.entrySet()) { - if (entry.getValue() != null) { - System.setProperty(entry.getKey(), entry.getValue()); - } else { - System.getProperties().remove(entry.getKey()); - } - } - sysProps.clear(); - } - - private static String getHiveLocation(String baseLocation) { - return baseLocation + Path.SEPARATOR + "hive"; - } - - private HiveServer2 startHiveServer(HiveConf serverConf) { - HiveServer2 hiveServer = new HiveServer2(); - hiveServer.init(serverConf); - hiveServer.start(); - return hiveServer; - } - - // XXX: From org.apache.hadoop.hive.metastore.HiveMetaStore, - // with changes to support binding to a specified IP address (not only 0.0.0.0) - - - private static final class ChainedTTransportFactory extends TTransportFactory { - private final TTransportFactory parentTransFactory; - private final TTransportFactory childTransFactory; - - private ChainedTTransportFactory(TTransportFactory parentTransFactory, - TTransportFactory childTransFactory) { - this.parentTransFactory = parentTransFactory; - this.childTransFactory = childTransFactory; - } - - @Override public TTransport getTransport(TTransport trans) { - return childTransFactory.getTransport(parentTransFactory.getTransport(trans)); - } - } - - - private static final class TServerSocketKeepAlive extends TServerSocket { - public TServerSocketKeepAlive(int port) throws TTransportException { - super(port, 0); - } - - public TServerSocketKeepAlive(InetSocketAddress address) throws TTransportException { - super(address, 0); - } - - @Override protected TSocket acceptImpl() throws TTransportException { - TSocket ts = super.acceptImpl(); - try { - ts.getSocket().setKeepAlive(true); - } catch (SocketException e) { - throw new TTransportException(e); - } - return ts; - } - } - - public TServer 
startMetaStore(String forceBindIP, int port, HiveConf conf) throws IOException { - try { - // Server will create new threads up to max as necessary. After an idle - // period, it will destory threads to keep the number of threads in the - // pool to min. - int minWorkerThreads = conf.getIntVar(HiveConf.ConfVars.METASTORESERVERMINTHREADS); - int maxWorkerThreads = conf.getIntVar(HiveConf.ConfVars.METASTORESERVERMAXTHREADS); - boolean tcpKeepAlive = conf.getBoolVar(HiveConf.ConfVars.METASTORE_TCP_KEEP_ALIVE); - boolean useFramedTransport = - conf.getBoolVar(HiveConf.ConfVars.METASTORE_USE_THRIFT_FRAMED_TRANSPORT); - - // don't support SASL yet - //boolean useSasl = conf.getBoolVar(HiveConf.ConfVars.METASTORE_USE_THRIFT_SASL); - - TServerTransport serverTransport; - if (forceBindIP != null) { - InetSocketAddress address = new InetSocketAddress(forceBindIP, port); - serverTransport = - tcpKeepAlive ? new TServerSocketKeepAlive(address) : new TServerSocket(address); - - } else { - serverTransport = - tcpKeepAlive ? new TServerSocketKeepAlive(port) : new TServerSocket(port); - } - - TProcessor processor; - TTransportFactory transFactory; - - IHMSHandler handler = (IHMSHandler) HiveMetaStore - .newRetryingHMSHandler("new db based metaserver", conf, true); - - if (conf.getBoolVar(HiveConf.ConfVars.METASTORE_EXECUTE_SET_UGI)) { - transFactory = useFramedTransport ? - new ChainedTTransportFactory(new TFramedTransport.Factory(), - new TUGIContainingTransport.Factory()) : - new TUGIContainingTransport.Factory(); - - processor = new TUGIBasedProcessor(handler); - LOG.info("Starting DB backed MetaStore Server with SetUGI enabled"); - } else { - transFactory = - useFramedTransport ? 
new TFramedTransport.Factory() : new TTransportFactory(); - processor = new TSetIpAddressProcessor(handler); - LOG.info("Starting DB backed MetaStore Server"); - } - - TThreadPoolServer.Args args = - new TThreadPoolServer.Args(serverTransport).processor(processor) - .transportFactory(transFactory).protocolFactory(new TBinaryProtocol.Factory()) - .minWorkerThreads(minWorkerThreads).maxWorkerThreads(maxWorkerThreads); - - final TServer tServer = new TThreadPoolServer(args); - executorService.submit(new Runnable() { - @Override public void run() { - tServer.serve(); - } - }); - return tServer; - } catch (Throwable x) { - throw new IOException(x); + public TServer startMetaStore(String forceBindIP, int port, HiveConf conf) throws IOException { + try { + // Server will create new threads up to max as necessary. After an idle + // period, it will destory threads to keep the number of threads in the + // pool to min. + int minWorkerThreads = conf.getIntVar(HiveConf.ConfVars.METASTORESERVERMINTHREADS); + int maxWorkerThreads = conf.getIntVar(HiveConf.ConfVars.METASTORESERVERMAXTHREADS); + boolean tcpKeepAlive = conf.getBoolVar(HiveConf.ConfVars.METASTORE_TCP_KEEP_ALIVE); + boolean useFramedTransport = + conf.getBoolVar(HiveConf.ConfVars.METASTORE_USE_THRIFT_FRAMED_TRANSPORT); + + // don't support SASL yet + //boolean useSasl = conf.getBoolVar(HiveConf.ConfVars.METASTORE_USE_THRIFT_SASL); + + TServerTransport serverTransport; + if (forceBindIP != null) { + InetSocketAddress address = new InetSocketAddress(forceBindIP, port); + serverTransport = + tcpKeepAlive ? new TServerSocketKeepAlive(address) : new TServerSocket(address); + + } else { + serverTransport = + tcpKeepAlive ? 
new TServerSocketKeepAlive(port) : new TServerSocket(port); + } + + TProcessor processor; + TTransportFactory transFactory; + + IHMSHandler handler = (IHMSHandler) HiveMetaStore + .newRetryingHMSHandler("new db based metaserver", conf, true); + + if (conf.getBoolVar(HiveConf.ConfVars.METASTORE_EXECUTE_SET_UGI)) { + transFactory = useFramedTransport ? + new ChainedTTransportFactory(new TFramedTransport.Factory(), + new TUGIContainingTransport.Factory()) : + new TUGIContainingTransport.Factory(); + + processor = new TUGIBasedProcessor(handler); + LOG.info("Starting DB backed MetaStore Server with SetUGI enabled"); + } else { + transFactory = + useFramedTransport ? new TFramedTransport.Factory() : new TTransportFactory(); + processor = new TSetIpAddressProcessor(handler); + LOG.info("Starting DB backed MetaStore Server"); + } + + TThreadPoolServer.Args args = + new TThreadPoolServer.Args(serverTransport).processor(processor) + .transportFactory(transFactory).protocolFactory(new TBinaryProtocol.Factory()) + .minWorkerThreads(minWorkerThreads).maxWorkerThreads(maxWorkerThreads); + + final TServer tServer = new TThreadPoolServer(args); + executorService.submit(new Runnable() { + @Override + public void run() { + tServer.serve(); } + }); + return tServer; + } catch (Throwable x) { + throw new IOException(x); } + } } diff --git a/hoodie-hive/src/test/resources/log4j-surefire.properties b/hoodie-hive/src/test/resources/log4j-surefire.properties index cc6a57052..8027f04d8 100644 --- a/hoodie-hive/src/test/resources/log4j-surefire.properties +++ b/hoodie-hive/src/test/resources/log4j-surefire.properties @@ -13,12 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. # - log4j.rootLogger=WARN, A1 log4j.category.com.uber=INFO log4j.category.org.apache.parquet.hadoop=WARN log4j.category.parquet.hadoop=WARN - # A1 is set to be a ConsoleAppender. log4j.appender.A1=org.apache.log4j.ConsoleAppender # A1 uses PatternLayout. 
diff --git a/hoodie-spark/pom.xml b/hoodie-spark/pom.xml index 114dd516f..5f3763220 100644 --- a/hoodie-spark/pom.xml +++ b/hoodie-spark/pom.xml @@ -17,216 +17,218 @@ ~ --> - - - hoodie - com.uber.hoodie - 0.4.1-SNAPSHOT - - 4.0.0 - + + + hoodie com.uber.hoodie - hoodie-spark - jar + 0.4.1-SNAPSHOT + + 4.0.0 - - 1.2.17 - 4.10 - + com.uber.hoodie + hoodie-spark + jar - - - scala-tools.org - Scala-tools Maven2 Repository - http://scala-tools.org/repo-releases - - + + 1.2.17 + 4.10 + - - - - - net.alchim31.maven - scala-maven-plugin - 3.3.1 - - - org.apache.maven.plugins - maven-compiler-plugin - 2.0.2 - - - + + + scala-tools.org + Scala-tools Maven2 Repository + http://scala-tools.org/repo-releases + + - - - org.apache.maven.plugins - maven-dependency-plugin - - - copy-dependencies - prepare-package - - copy-dependencies - - - ${project.build.directory}/lib - true - true - true - - - - - - net.alchim31.maven - scala-maven-plugin - - - scala-compile-first - process-resources - - add-source - compile - - - - scala-test-compile - process-test-resources - - testCompile - - - - - - org.apache.maven.plugins - maven-compiler-plugin - - - compile - - compile - - - - - - org.apache.rat - apache-rat-plugin - - - + + + + + net.alchim31.maven + scala-maven-plugin + 3.3.1 + + + org.apache.maven.plugins + maven-compiler-plugin + 2.0.2 + + + + + + + org.apache.maven.plugins + maven-dependency-plugin + + + copy-dependencies + prepare-package + + copy-dependencies + + + ${project.build.directory}/lib + true + true + true + + + + + + net.alchim31.maven + scala-maven-plugin + + + scala-compile-first + process-resources + + add-source + compile + + + + scala-test-compile + process-test-resources + + testCompile + + + + + + org.apache.maven.plugins + maven-compiler-plugin + + + compile + + compile + + + + + + org.apache.rat + apache-rat-plugin + + + - - - org.scala-lang - scala-library - ${scala.version} - - - org.scalatest - scalatest_2.11 - 3.0.1 - test - - - org.apache.spark - 
spark-core_2.11 - - - org.apache.spark - spark-sql_2.11 - - - com.databricks - spark-avro_2.11 - 3.2.0 - - - com.fasterxml.jackson.core - jackson-annotations - - - org.codehaus.jackson - jackson-mapper-asl - + + + org.scala-lang + scala-library + ${scala.version} + + + org.scalatest + scalatest_2.11 + 3.0.1 + test + + + org.apache.spark + spark-core_2.11 + + + org.apache.spark + spark-sql_2.11 + + + com.databricks + spark-avro_2.11 + 3.2.0 + + + com.fasterxml.jackson.core + jackson-annotations + + + org.codehaus.jackson + jackson-mapper-asl + - - org.apache.hadoop - hadoop-client - - - javax.servlet - * - - - provided - + + org.apache.hadoop + hadoop-client + + + javax.servlet + * + + + provided + - - org.apache.hadoop - hadoop-common - provided - + + org.apache.hadoop + hadoop-common + provided + - - log4j - log4j - ${log4j.version} - - - org.apache.avro - avro - + + log4j + log4j + ${log4j.version} + + + org.apache.avro + avro + - - org.apache.commons - commons-lang3 - + + org.apache.commons + commons-lang3 + - - org.apache.commons - commons-configuration2 - + + org.apache.commons + commons-configuration2 + - - com.uber.hoodie - hoodie-client - ${project.version} - - - com.uber.hoodie - hoodie-common - ${project.version} - - - com.uber.hoodie - hoodie-hadoop-mr - ${project.version} - - - junit - junit-dep - ${junit.version} - test - + + com.uber.hoodie + hoodie-client + ${project.version} + + + com.uber.hoodie + hoodie-common + ${project.version} + + + com.uber.hoodie + hoodie-hadoop-mr + ${project.version} + + + junit + junit-dep + ${junit.version} + test + - - com.uber.hoodie - hoodie-client - ${project.version} - test-jar - test - - - com.uber.hoodie - hoodie-common - ${project.version} - test-jar - test - - + + com.uber.hoodie + hoodie-client + ${project.version} + test-jar + test + + + com.uber.hoodie + hoodie-common + ${project.version} + test-jar + test + + diff --git a/hoodie-spark/src/main/java/com/uber/hoodie/BaseAvroPayload.java 
b/hoodie-spark/src/main/java/com/uber/hoodie/BaseAvroPayload.java index 45022f28a..26e9cd31e 100644 --- a/hoodie-spark/src/main/java/com/uber/hoodie/BaseAvroPayload.java +++ b/hoodie-spark/src/main/java/com/uber/hoodie/BaseAvroPayload.java @@ -18,8 +18,8 @@ package com.uber.hoodie; -import org.apache.avro.generic.GenericRecord; import java.io.Serializable; +import org.apache.avro.generic.GenericRecord; /** * Base class for all AVRO record based payloads, that can be ordered based on a field @@ -27,23 +27,23 @@ import java.io.Serializable; public abstract class BaseAvroPayload implements Serializable { - /** - * Avro data extracted from the source - */ - protected final GenericRecord record; + /** + * Avro data extracted from the source + */ + protected final GenericRecord record; - /** - * For purposes of preCombining - */ - protected final Comparable orderingVal; + /** + * For purposes of preCombining + */ + protected final Comparable orderingVal; - /** - * - * @param record - * @param orderingVal - */ - public BaseAvroPayload(GenericRecord record, Comparable orderingVal) { - this.record = record; - this.orderingVal = orderingVal; - } + /** + * + * @param record + * @param orderingVal + */ + public BaseAvroPayload(GenericRecord record, Comparable orderingVal) { + this.record = record; + this.orderingVal = orderingVal; + } } diff --git a/hoodie-spark/src/main/java/com/uber/hoodie/DataSourceUtils.java b/hoodie-spark/src/main/java/com/uber/hoodie/DataSourceUtils.java index 1af5c199e..b16202cff 100644 --- a/hoodie-spark/src/main/java/com/uber/hoodie/DataSourceUtils.java +++ b/hoodie-spark/src/main/java/com/uber/hoodie/DataSourceUtils.java @@ -27,126 +27,130 @@ import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.exception.HoodieException; import com.uber.hoodie.exception.HoodieNotSupportedException; import com.uber.hoodie.index.HoodieIndex; - +import java.io.IOException; +import java.util.List; +import java.util.Map; import 
org.apache.avro.generic.GenericRecord; import org.apache.commons.configuration.PropertiesConfiguration; import org.apache.commons.lang3.reflect.ConstructorUtils; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; -import java.io.IOException; -import java.util.List; -import java.util.Map; - /** * Utilities used throughout the data source */ public class DataSourceUtils { - /** - * Obtain value of the provided field as string, denoted by dot notation. e.g: a.b.c - */ - public static String getNestedFieldValAsString(GenericRecord record, String fieldName) { - String[] parts = fieldName.split("\\."); - GenericRecord valueNode = record; - for (int i = 0; i < parts.length; i++) { - String part = parts[i]; - Object val = valueNode.get(part); - if (val == null) { - break; - } + /** + * Obtain value of the provided field as string, denoted by dot notation. e.g: a.b.c + */ + public static String getNestedFieldValAsString(GenericRecord record, String fieldName) { + String[] parts = fieldName.split("\\."); + GenericRecord valueNode = record; + for (int i = 0; i < parts.length; i++) { + String part = parts[i]; + Object val = valueNode.get(part); + if (val == null) { + break; + } - // return, if last part of name - if (i == parts.length - 1) { - return val.toString(); - } else { - // VC: Need a test here - if (!(val instanceof GenericRecord)) { - throw new HoodieException("Cannot find a record at part value :" + part); - } - valueNode = (GenericRecord) val; - } + // return, if last part of name + if (i == parts.length - 1) { + return val.toString(); + } else { + // VC: Need a test here + if (!(val instanceof GenericRecord)) { + throw new HoodieException("Cannot find a record at part value :" + part); } - throw new HoodieException(fieldName + " field not found in record"); + valueNode = (GenericRecord) val; + } } + throw new HoodieException(fieldName + " field not found in record"); + } - /** - * Create a key generator class via 
reflection, passing in any configs needed - */ - public static KeyGenerator createKeyGenerator(String keyGeneratorClass, PropertiesConfiguration cfg) throws IOException { - try { - return (KeyGenerator) ConstructorUtils.invokeConstructor(Class.forName(keyGeneratorClass), (Object) cfg); - } catch (Throwable e) { - throw new IOException("Could not load key generator class " + keyGeneratorClass, e); - } + /** + * Create a key generator class via reflection, passing in any configs needed + */ + public static KeyGenerator createKeyGenerator(String keyGeneratorClass, + PropertiesConfiguration cfg) throws IOException { + try { + return (KeyGenerator) ConstructorUtils + .invokeConstructor(Class.forName(keyGeneratorClass), (Object) cfg); + } catch (Throwable e) { + throw new IOException("Could not load key generator class " + keyGeneratorClass, e); } + } - /** - * Create a payload class via reflection, passing in an ordering/precombine value. - */ - public static HoodieRecordPayload createPayload(String payloadClass, GenericRecord record, Comparable orderingVal) throws IOException { - try { - return (HoodieRecordPayload) ConstructorUtils.invokeConstructor(Class.forName(payloadClass), (Object) record, (Object) orderingVal); - } catch (Throwable e) { - throw new IOException("Could not create payload for class: " + payloadClass, e); - } + /** + * Create a payload class via reflection, passing in an ordering/precombine value. 
+ */ + public static HoodieRecordPayload createPayload(String payloadClass, GenericRecord record, + Comparable orderingVal) throws IOException { + try { + return (HoodieRecordPayload) ConstructorUtils + .invokeConstructor(Class.forName(payloadClass), (Object) record, (Object) orderingVal); + } catch (Throwable e) { + throw new IOException("Could not create payload for class: " + payloadClass, e); } + } - public static void checkRequiredProperties(PropertiesConfiguration configuration, List checkPropNames) { - checkPropNames.stream().forEach(prop -> { - if (!configuration.containsKey(prop)) { - throw new HoodieNotSupportedException("Required property " + prop + " is missing"); - } - }); + public static void checkRequiredProperties(PropertiesConfiguration configuration, + List checkPropNames) { + checkPropNames.stream().forEach(prop -> { + if (!configuration.containsKey(prop)) { + throw new HoodieNotSupportedException("Required property " + prop + " is missing"); + } + }); + } + + public static HoodieWriteClient createHoodieClient(JavaSparkContext jssc, + String schemaStr, + String basePath, + String tblName, + Map parameters) throws Exception { + HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder() + .combineInput(true, true) + .withPath(basePath) + .withAutoCommit(false) + .withSchema(schemaStr) + .forTable(tblName) + .withIndexConfig( + HoodieIndexConfig.newBuilder() + .withIndexType(HoodieIndex.IndexType.BLOOM) + .build()) + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withPayloadClass(parameters.get(DataSourceWriteOptions.PAYLOAD_CLASS_OPT_KEY())) + .build()) + // override above with Hoodie configs specified as options. 
+ .withProps(parameters) + .build(); + + return new HoodieWriteClient<>(jssc, writeConfig); + } + + + public static JavaRDD doWriteOperation(HoodieWriteClient client, + JavaRDD hoodieRecords, + String commitTime, + String operation) { + if (operation.equals(DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL())) { + return client.bulkInsert(hoodieRecords, commitTime); + } else if (operation.equals(DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL())) { + return client.insert(hoodieRecords, commitTime); + } else { + //default is upsert + return client.upsert(hoodieRecords, commitTime); } + } - public static HoodieWriteClient createHoodieClient(JavaSparkContext jssc, - String schemaStr, - String basePath, - String tblName, - Map parameters) throws Exception { - HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder() - .combineInput(true, true) - .withPath(basePath) - .withAutoCommit(false) - .withSchema(schemaStr) - .forTable(tblName) - .withIndexConfig( - HoodieIndexConfig.newBuilder() - .withIndexType(HoodieIndex.IndexType.BLOOM) - .build()) - .withCompactionConfig(HoodieCompactionConfig.newBuilder() - .withPayloadClass(parameters.get(DataSourceWriteOptions.PAYLOAD_CLASS_OPT_KEY())).build()) - // override above with Hoodie configs specified as options. 
- .withProps(parameters) - .build(); - - return new HoodieWriteClient<>(jssc, writeConfig); - } - - - public static JavaRDD doWriteOperation(HoodieWriteClient client, - JavaRDD hoodieRecords, - String commitTime, - String operation) { - if (operation.equals(DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL())) { - return client.bulkInsert(hoodieRecords, commitTime); - } else if (operation.equals(DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL())) { - return client.insert(hoodieRecords, commitTime); - } else { - //default is upsert - return client.upsert(hoodieRecords, commitTime); - } - } - - public static HoodieRecord createHoodieRecord(GenericRecord gr, - Comparable orderingVal, - HoodieKey hKey, - String payloadClass) throws IOException { - HoodieRecordPayload payload = DataSourceUtils.createPayload( - payloadClass, - gr, - orderingVal); - return new HoodieRecord<>(hKey, payload); - } + public static HoodieRecord createHoodieRecord(GenericRecord gr, + Comparable orderingVal, + HoodieKey hKey, + String payloadClass) throws IOException { + HoodieRecordPayload payload = DataSourceUtils.createPayload( + payloadClass, + gr, + orderingVal); + return new HoodieRecord<>(hKey, payload); + } } diff --git a/hoodie-spark/src/main/java/com/uber/hoodie/HoodieDataSourceHelpers.java b/hoodie-spark/src/main/java/com/uber/hoodie/HoodieDataSourceHelpers.java index d43a81dd6..fb8db4a94 100644 --- a/hoodie-spark/src/main/java/com/uber/hoodie/HoodieDataSourceHelpers.java +++ b/hoodie-spark/src/main/java/com/uber/hoodie/HoodieDataSourceHelpers.java @@ -19,68 +19,62 @@ package com.uber.hoodie; import com.google.common.collect.Sets; - import com.uber.hoodie.common.model.HoodieTableType; import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline; import com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.table.HoodieTable; - -import 
org.apache.hadoop.fs.FileSystem; - import java.util.List; import java.util.stream.Collectors; +import org.apache.hadoop.fs.FileSystem; /** - * List of helpers to aid, construction of instanttime for read and write operations using datasource + * List of helpers to aid, construction of instanttime for read and write operations using + * datasource */ public class HoodieDataSourceHelpers { - /** - * Checks if the Hoodie dataset has new data since given timestamp. This can be subsequently - * fed to an incremental view read, to perform incremental processing. - */ - public static boolean hasNewCommits(FileSystem fs, String basePath, String commitTimestamp) { - return listCommitsSince(fs, basePath, commitTimestamp).size() > 0; - } + /** + * Checks if the Hoodie dataset has new data since given timestamp. This can be subsequently fed + * to an incremental view read, to perform incremental processing. + */ + public static boolean hasNewCommits(FileSystem fs, String basePath, String commitTimestamp) { + return listCommitsSince(fs, basePath, commitTimestamp).size() > 0; + } - /** - * Get a list of instant times that have occurred, from the given instant timestamp. - * - * @param instantTimestamp - */ - public static List listCommitsSince(FileSystem fs, String basePath, String instantTimestamp) { - HoodieTimeline timeline = allCompletedCommitsCompactions(fs, basePath); - return timeline.findInstantsAfter(instantTimestamp, Integer.MAX_VALUE).getInstants() - .map(HoodieInstant::getTimestamp).collect(Collectors.toList()); - } + /** + * Get a list of instant times that have occurred, from the given instant timestamp. 
+ */ + public static List listCommitsSince(FileSystem fs, String basePath, + String instantTimestamp) { + HoodieTimeline timeline = allCompletedCommitsCompactions(fs, basePath); + return timeline.findInstantsAfter(instantTimestamp, Integer.MAX_VALUE).getInstants() + .map(HoodieInstant::getTimestamp).collect(Collectors.toList()); + } - /** - * Returns the last successful write operation's instant time - */ - public static String latestCommit(FileSystem fs, String basePath) { - HoodieTimeline timeline = allCompletedCommitsCompactions(fs, basePath); - return timeline.lastInstant().get().getTimestamp(); - } + /** + * Returns the last successful write operation's instant time + */ + public static String latestCommit(FileSystem fs, String basePath) { + HoodieTimeline timeline = allCompletedCommitsCompactions(fs, basePath); + return timeline.lastInstant().get().getTimestamp(); + } - /** - * Obtain all the commits, compactions that have occurred on the timeline, whose - * instant times could be fed into the datasource options. - * - * @param fs - * @param basePath - */ - public static HoodieTimeline allCompletedCommitsCompactions(FileSystem fs, String basePath) { - HoodieTable table = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(fs, basePath, true), null); - if (table.getMetaClient().getTableType().equals(HoodieTableType.MERGE_ON_READ)) { - return table.getActiveTimeline().getTimelineOfActions( - Sets.newHashSet(HoodieActiveTimeline.COMPACTION_ACTION, - HoodieActiveTimeline.DELTA_COMMIT_ACTION) - ); - } else { - return table.getCompletedCompactionCommitTimeline(); - } + /** + * Obtain all the commits, compactions that have occurred on the timeline, whose instant times + * could be fed into the datasource options. 
+ */ + public static HoodieTimeline allCompletedCommitsCompactions(FileSystem fs, String basePath) { + HoodieTable table = HoodieTable + .getHoodieTable(new HoodieTableMetaClient(fs, basePath, true), null); + if (table.getMetaClient().getTableType().equals(HoodieTableType.MERGE_ON_READ)) { + return table.getActiveTimeline().getTimelineOfActions( + Sets.newHashSet(HoodieActiveTimeline.COMPACTION_ACTION, + HoodieActiveTimeline.DELTA_COMMIT_ACTION) + ); + } else { + return table.getCompletedCompactionCommitTimeline(); } + } } diff --git a/hoodie-spark/src/main/java/com/uber/hoodie/KeyGenerator.java b/hoodie-spark/src/main/java/com/uber/hoodie/KeyGenerator.java index 58bd3e909..f23148ad3 100644 --- a/hoodie-spark/src/main/java/com/uber/hoodie/KeyGenerator.java +++ b/hoodie-spark/src/main/java/com/uber/hoodie/KeyGenerator.java @@ -19,29 +19,24 @@ package com.uber.hoodie; import com.uber.hoodie.common.model.HoodieKey; - +import java.io.Serializable; import org.apache.avro.generic.GenericRecord; import org.apache.commons.configuration.PropertiesConfiguration; -import java.io.Serializable; - /** * Abstract class to extend for plugging in extraction of {@link com.uber.hoodie.common.model.HoodieKey} * from an Avro record */ public abstract class KeyGenerator implements Serializable { - protected transient PropertiesConfiguration config; + protected transient PropertiesConfiguration config; - protected KeyGenerator(PropertiesConfiguration config) { - this.config = config; - } + protected KeyGenerator(PropertiesConfiguration config) { + this.config = config; + } - /** - * Generate a Hoodie Key out of provided generic record. - * - * @param record - * @return - */ - public abstract HoodieKey getKey(GenericRecord record); + /** + * Generate a Hoodie Key out of provided generic record. 
+ */ + public abstract HoodieKey getKey(GenericRecord record); } diff --git a/hoodie-spark/src/main/java/com/uber/hoodie/OverwriteWithLatestAvroPayload.java b/hoodie-spark/src/main/java/com/uber/hoodie/OverwriteWithLatestAvroPayload.java index e7877aafc..74424ac36 100644 --- a/hoodie-spark/src/main/java/com/uber/hoodie/OverwriteWithLatestAvroPayload.java +++ b/hoodie-spark/src/main/java/com/uber/hoodie/OverwriteWithLatestAvroPayload.java @@ -20,49 +20,49 @@ package com.uber.hoodie; import com.uber.hoodie.common.model.HoodieRecordPayload; import com.uber.hoodie.common.util.HoodieAvroUtils; - +import java.io.IOException; +import java.util.Optional; import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; -import java.io.IOException; -import java.util.Optional; - /** * Default payload used for delta streamer. * - * 1. preCombine - Picks the latest delta record for a key, based on an ordering field - * 2. combineAndGetUpdateValue/getInsertValue - Simply overwrites storage with latest delta record + * 1. preCombine - Picks the latest delta record for a key, based on an ordering field 2. 
+ * combineAndGetUpdateValue/getInsertValue - Simply overwrites storage with latest delta record */ -public class OverwriteWithLatestAvroPayload extends BaseAvroPayload implements HoodieRecordPayload { +public class OverwriteWithLatestAvroPayload extends BaseAvroPayload implements + HoodieRecordPayload { - /** - * - * @param record - * @param orderingVal - */ - public OverwriteWithLatestAvroPayload(GenericRecord record, Comparable orderingVal) { - super(record, orderingVal); - } + /** + * + * @param record + * @param orderingVal + */ + public OverwriteWithLatestAvroPayload(GenericRecord record, Comparable orderingVal) { + super(record, orderingVal); + } - @Override - public OverwriteWithLatestAvroPayload preCombine(OverwriteWithLatestAvroPayload another) { - // pick the payload with greatest ordering value - if (another.orderingVal.compareTo(orderingVal) > 0) { - return another; - } else { - return this; - } + @Override + public OverwriteWithLatestAvroPayload preCombine(OverwriteWithLatestAvroPayload another) { + // pick the payload with greatest ordering value + if (another.orderingVal.compareTo(orderingVal) > 0) { + return another; + } else { + return this; } + } - @Override - public Optional combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) throws IOException { - // combining strategy here trivially ignores currentValue on disk and writes this record - return getInsertValue(schema); - } + @Override + public Optional combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) + throws IOException { + // combining strategy here trivially ignores currentValue on disk and writes this record + return getInsertValue(schema); + } - @Override - public Optional getInsertValue(Schema schema) throws IOException { - return Optional.of(HoodieAvroUtils.rewriteRecord(record, schema)); - } + @Override + public Optional getInsertValue(Schema schema) throws IOException { + return Optional.of(HoodieAvroUtils.rewriteRecord(record, schema)); + } } diff 
--git a/hoodie-spark/src/main/java/com/uber/hoodie/SimpleKeyGenerator.java b/hoodie-spark/src/main/java/com/uber/hoodie/SimpleKeyGenerator.java index c5733856a..6c15ce434 100644 --- a/hoodie-spark/src/main/java/com/uber/hoodie/SimpleKeyGenerator.java +++ b/hoodie-spark/src/main/java/com/uber/hoodie/SimpleKeyGenerator.java @@ -20,32 +20,33 @@ package com.uber.hoodie; import com.uber.hoodie.common.model.HoodieKey; import com.uber.hoodie.exception.HoodieException; - import org.apache.avro.generic.GenericRecord; import org.apache.commons.configuration.PropertiesConfiguration; /** - * Simple key generator, which takes names of fields to be used for recordKey and partitionPath - * as configs. + * Simple key generator, which takes names of fields to be used for recordKey and partitionPath as + * configs. */ public class SimpleKeyGenerator extends KeyGenerator { - protected final String recordKeyField; + protected final String recordKeyField; - protected final String partitionPathField; + protected final String partitionPathField; - public SimpleKeyGenerator(PropertiesConfiguration config) { - super(config); - this.recordKeyField = config.getString(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY()); - this.partitionPathField = config.getString(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY()); - } - - @Override - public HoodieKey getKey(GenericRecord record) { - if (recordKeyField == null || partitionPathField == null) { - throw new HoodieException("Unable to find field names for record key or partition path in cfg"); - } - return new HoodieKey(DataSourceUtils.getNestedFieldValAsString(record, recordKeyField), - DataSourceUtils.getNestedFieldValAsString(record, partitionPathField)); + public SimpleKeyGenerator(PropertiesConfiguration config) { + super(config); + this.recordKeyField = config.getString(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY()); + this.partitionPathField = config + .getString(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY()); + } + + @Override 
+ public HoodieKey getKey(GenericRecord record) { + if (recordKeyField == null || partitionPathField == null) { + throw new HoodieException( + "Unable to find field names for record key or partition path in cfg"); } + return new HoodieKey(DataSourceUtils.getNestedFieldValAsString(record, recordKeyField), + DataSourceUtils.getNestedFieldValAsString(record, partitionPathField)); + } } diff --git a/hoodie-spark/src/main/scala/com/uber/hoodie/AvroConversionUtils.scala b/hoodie-spark/src/main/scala/com/uber/hoodie/AvroConversionUtils.scala index 82cf7cc14..684024887 100644 --- a/hoodie-spark/src/main/scala/com/uber/hoodie/AvroConversionUtils.scala +++ b/hoodie-spark/src/main/scala/com/uber/hoodie/AvroConversionUtils.scala @@ -115,12 +115,12 @@ object AvroConversionUtils { def convertStructTypeToAvroSchema(structType: StructType, structName: String, - recordNamespace: String) : Schema = { + recordNamespace: String): Schema = { val builder = SchemaBuilder.record(structName).namespace(recordNamespace) SchemaConverters.convertStructToAvro(structType, builder, recordNamespace) } - def convertAvroSchemaToStructType(avroSchema: Schema) : StructType = { + def convertAvroSchemaToStructType(avroSchema: Schema): StructType = { SchemaConverters.toSqlType(avroSchema).dataType.asInstanceOf[StructType]; } } diff --git a/hoodie-spark/src/main/scala/com/uber/hoodie/package.scala b/hoodie-spark/src/main/scala/com/uber/hoodie/package.scala index 63265f3f5..390f07b81 100644 --- a/hoodie-spark/src/main/scala/com/uber/hoodie/package.scala +++ b/hoodie-spark/src/main/scala/com/uber/hoodie/package.scala @@ -20,6 +20,7 @@ package com.uber.hoodie import org.apache.spark.sql.{DataFrame, DataFrameReader, DataFrameWriter} package object hoodie { + /** * Adds a method, `hoodie`, to DataFrameWriter */ @@ -33,4 +34,5 @@ package object hoodie { implicit class AvroDataFrameReader(reader: DataFrameReader) { def avro: String => DataFrame = reader.format("com.uber.hoodie").load } + } diff --git 
a/hoodie-spark/src/test/java/DataSourceTestUtils.java b/hoodie-spark/src/test/java/DataSourceTestUtils.java index 47f069ee1..a39c42ac3 100644 --- a/hoodie-spark/src/test/java/DataSourceTestUtils.java +++ b/hoodie-spark/src/test/java/DataSourceTestUtils.java @@ -18,9 +18,6 @@ import com.uber.hoodie.common.TestRawTripPayload; import com.uber.hoodie.common.model.HoodieRecord; - -import org.apache.spark.api.java.JavaRDD; - import java.io.IOException; import java.util.List; import java.util.Optional; @@ -31,20 +28,21 @@ import java.util.stream.Collectors; */ public class DataSourceTestUtils { - public static Optional convertToString(HoodieRecord record) { - try { - String str = ((TestRawTripPayload) record.getData()).getJsonData(); - str = "{" + str.substring(str.indexOf("\"timestamp\":")); - return Optional.of(str.replaceAll("}", ", \"partition\": \"" + record.getPartitionPath() + "\"}")); - } catch (IOException e) { - return Optional.empty(); - } + public static Optional convertToString(HoodieRecord record) { + try { + String str = ((TestRawTripPayload) record.getData()).getJsonData(); + str = "{" + str.substring(str.indexOf("\"timestamp\":")); + return Optional + .of(str.replaceAll("}", ", \"partition\": \"" + record.getPartitionPath() + "\"}")); + } catch (IOException e) { + return Optional.empty(); } + } - public static List convertToStringList(List records) { - return records.stream().map(hr -> convertToString(hr)) - .filter(os -> os.isPresent()) - .map(os -> os.get()) - .collect(Collectors.toList()); - } + public static List convertToStringList(List records) { + return records.stream().map(hr -> convertToString(hr)) + .filter(os -> os.isPresent()) + .map(os -> os.get()) + .collect(Collectors.toList()); + } } diff --git a/hoodie-spark/src/test/java/HoodieJavaApp.java b/hoodie-spark/src/test/java/HoodieJavaApp.java index c61d8cca4..ff2a03416 100644 --- a/hoodie-spark/src/test/java/HoodieJavaApp.java +++ b/hoodie-spark/src/test/java/HoodieJavaApp.java @@ -25,7 +25,7 
@@ import com.uber.hoodie.HoodieDataSourceHelpers; import com.uber.hoodie.common.HoodieTestDataGenerator; import com.uber.hoodie.common.model.HoodieTableType; import com.uber.hoodie.config.HoodieWriteConfig; - +import java.util.List; import org.apache.hadoop.fs.FileSystem; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -35,113 +35,123 @@ import org.apache.spark.sql.Row; import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; -import java.util.List; /** * Sample program that writes & reads hoodie datasets via the Spark datasource */ public class HoodieJavaApp { - @Parameter(names={"--table-path", "-p"}, description = "path for Hoodie sample table") - private String tablePath = "file:///tmp/hoodie/sample-table"; + @Parameter(names = {"--table-path", "-p"}, description = "path for Hoodie sample table") + private String tablePath = "file:///tmp/hoodie/sample-table"; - @Parameter(names={"--table-name", "-n"}, description = "table name for Hoodie sample table") - private String tableName = "hoodie_test"; + @Parameter(names = {"--table-name", "-n"}, description = "table name for Hoodie sample table") + private String tableName = "hoodie_test"; - @Parameter(names={"--table-type", "-t"}, description = "One of COPY_ON_WRITE or MERGE_ON_READ") - private String tableType = HoodieTableType.COPY_ON_WRITE.name(); + @Parameter(names = {"--table-type", "-t"}, description = "One of COPY_ON_WRITE or MERGE_ON_READ") + private String tableType = HoodieTableType.COPY_ON_WRITE.name(); - @Parameter(names = {"--help", "-h"}, help = true) - public Boolean help = false; + @Parameter(names = {"--help", "-h"}, help = true) + public Boolean help = false; - private static Logger logger = LogManager.getLogger(HoodieJavaApp.class); + private static Logger logger = LogManager.getLogger(HoodieJavaApp.class); - public static void main(String[] args) throws Exception { - HoodieJavaApp cli = new HoodieJavaApp(); - JCommander cmd = new JCommander(cli, 
args); + public static void main(String[] args) throws Exception { + HoodieJavaApp cli = new HoodieJavaApp(); + JCommander cmd = new JCommander(cli, args); - if (cli.help) { - cmd.usage(); - System.exit(1); - } - cli.run(); + if (cli.help) { + cmd.usage(); + System.exit(1); } + cli.run(); + } - public void run() throws Exception { + public void run() throws Exception { - // Spark session setup.. - SparkSession spark = SparkSession.builder() - .appName("Hoodie Spark APP") - .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") - .master("local[1]") - .getOrCreate(); - JavaSparkContext jssc = new JavaSparkContext(spark.sparkContext()); - FileSystem fs = FileSystem.get(jssc.hadoopConfiguration()); + // Spark session setup.. + SparkSession spark = SparkSession.builder() + .appName("Hoodie Spark APP") + .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") + .master("local[1]") + .getOrCreate(); + JavaSparkContext jssc = new JavaSparkContext(spark.sparkContext()); + FileSystem fs = FileSystem.get(jssc.hadoopConfiguration()); - // Generator of some records to be loaded in. - HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); + // Generator of some records to be loaded in. + HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); - /** - * Commit with only inserts - */ - // Generate some input.. - List records1 = DataSourceTestUtils.convertToStringList(dataGen.generateInserts("001"/* ignore */, 100)); - Dataset inputDF1 = spark.read().json(jssc.parallelize(records1, 2)); + /** + * Commit with only inserts + */ + // Generate some input.. 
+ List records1 = DataSourceTestUtils + .convertToStringList(dataGen.generateInserts("001"/* ignore */, 100)); + Dataset inputDF1 = spark.read().json(jssc.parallelize(records1, 2)); - // Save as hoodie dataset (copy on write) - inputDF1.write() - .format("com.uber.hoodie") // specify the hoodie source - .option("hoodie.insert.shuffle.parallelism", "2") // any hoodie client config can be passed like this - .option("hoodie.upsert.shuffle.parallelism", "2") // full list in HoodieWriteConfig & its package - .option(DataSourceWriteOptions.OPERATION_OPT_KEY(), DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL()) // insert - .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key") // This is the record key - .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition") // this is the partition to place it into - .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp") // use to combine duplicate records in input/with disk val - .option(HoodieWriteConfig.TABLE_NAME, tableName) // Used by hive sync and queries - .mode(SaveMode.Overwrite) // This will remove any existing data at path below, and create a new dataset if needed - .save(tablePath); // ultimately where the dataset will be placed - String commitInstantTime1 = HoodieDataSourceHelpers.latestCommit(fs, tablePath); - logger.info("First commit at instant time :" + commitInstantTime1); + // Save as hoodie dataset (copy on write) + inputDF1.write() + .format("com.uber.hoodie") // specify the hoodie source + .option("hoodie.insert.shuffle.parallelism", + "2") // any hoodie client config can be passed like this + .option("hoodie.upsert.shuffle.parallelism", + "2") // full list in HoodieWriteConfig & its package + .option(DataSourceWriteOptions.OPERATION_OPT_KEY(), + DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL()) // insert + .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), + "_row_key") // This is the record key + 
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), + "partition") // this is the partition to place it into + .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), + "timestamp") // use to combine duplicate records in input/with disk val + .option(HoodieWriteConfig.TABLE_NAME, tableName) // Used by hive sync and queries + .mode( + SaveMode.Overwrite) // This will remove any existing data at path below, and create a new dataset if needed + .save(tablePath); // ultimately where the dataset will be placed + String commitInstantTime1 = HoodieDataSourceHelpers.latestCommit(fs, tablePath); + logger.info("First commit at instant time :" + commitInstantTime1); - /** - * Commit that updates records - */ - List records2 = DataSourceTestUtils.convertToStringList(dataGen.generateUpdates("002"/* ignore */, 100)); - Dataset inputDF2 = spark.read().json(jssc.parallelize(records2, 2)); - inputDF2.write() - .format("com.uber.hoodie") - .option("hoodie.insert.shuffle.parallelism", "2") - .option("hoodie.upsert.shuffle.parallelism", "2") - .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key") - .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition") - .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp") - .option(HoodieWriteConfig.TABLE_NAME, tableName) - .mode(SaveMode.Append) - .save(tablePath); - String commitInstantTime2 = HoodieDataSourceHelpers.latestCommit(fs, tablePath); - logger.info("Second commit at instant time :" + commitInstantTime1); + /** + * Commit that updates records + */ + List records2 = DataSourceTestUtils + .convertToStringList(dataGen.generateUpdates("002"/* ignore */, 100)); + Dataset inputDF2 = spark.read().json(jssc.parallelize(records2, 2)); + inputDF2.write() + .format("com.uber.hoodie") + .option("hoodie.insert.shuffle.parallelism", "2") + .option("hoodie.upsert.shuffle.parallelism", "2") + .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key") + 
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition") + .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp") + .option(HoodieWriteConfig.TABLE_NAME, tableName) + .mode(SaveMode.Append) + .save(tablePath); + String commitInstantTime2 = HoodieDataSourceHelpers.latestCommit(fs, tablePath); + logger.info("Second commit at instant time :" + commitInstantTime1); - /** - * Read & do some queries - */ - Dataset hoodieROViewDF = spark.read() - .format("com.uber.hoodie") - // pass any path glob, can include hoodie & non-hoodie datasets - .load(tablePath + "/*/*/*/*"); - hoodieROViewDF.registerTempTable("hoodie_ro"); - spark.sql("describe hoodie_ro").show(); - // all trips whose fare was greater than 2. - spark.sql("select fare, begin_lon, begin_lat, timestamp from hoodie_ro where fare > 2.0").show(); + /** + * Read & do some queries + */ + Dataset hoodieROViewDF = spark.read() + .format("com.uber.hoodie") + // pass any path glob, can include hoodie & non-hoodie datasets + .load(tablePath + "/*/*/*/*"); + hoodieROViewDF.registerTempTable("hoodie_ro"); + spark.sql("describe hoodie_ro").show(); + // all trips whose fare was greater than 2. + spark.sql("select fare, begin_lon, begin_lat, timestamp from hoodie_ro where fare > 2.0") + .show(); + /** + * Consume incrementally, only changes in commit 2 above. + */ + Dataset hoodieIncViewDF = spark.read().format("com.uber.hoodie") + .option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY(), + DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL()) + .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), + commitInstantTime1) // Only changes in write 2 above + .load(tablePath); // For incremental view, pass in the root/base path of dataset - /** - * Consume incrementally, only changes in commit 2 above. 
- */ - Dataset hoodieIncViewDF = spark.read().format("com.uber.hoodie") - .option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY(), DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL()) - .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), commitInstantTime1) // Only changes in write 2 above - .load(tablePath); // For incremental view, pass in the root/base path of dataset - - logger.info("You will only see records from : " + commitInstantTime2); - hoodieIncViewDF.groupBy(hoodieIncViewDF.col("_hoodie_commit_time")).count().show(); - } + logger.info("You will only see records from : " + commitInstantTime2); + hoodieIncViewDF.groupBy(hoodieIncViewDF.col("_hoodie_commit_time")).count().show(); + } } diff --git a/hoodie-spark/src/test/resources/log4j-surefire.properties b/hoodie-spark/src/test/resources/log4j-surefire.properties index 490c6411d..daf8d28c1 100644 --- a/hoodie-spark/src/test/resources/log4j-surefire.properties +++ b/hoodie-spark/src/test/resources/log4j-surefire.properties @@ -20,7 +20,6 @@ log4j.category.com.uber.hoodie.io=WARN log4j.category.com.uber.hoodie.common=WARN log4j.category.com.uber.hoodie.table.log=WARN log4j.category.org.apache.parquet.hadoop=WARN - # A1 is set to be a ConsoleAppender. log4j.appender.A1=org.apache.log4j.ConsoleAppender # A1 uses PatternLayout. 
diff --git a/hoodie-spark/src/test/scala/DataSourceDefaultsTest.scala b/hoodie-spark/src/test/scala/DataSourceDefaultsTest.scala index 2996c46fd..a2f82af41 100644 --- a/hoodie-spark/src/test/scala/DataSourceDefaultsTest.scala +++ b/hoodie-spark/src/test/scala/DataSourceDefaultsTest.scala @@ -16,9 +16,9 @@ * */ -import com.uber.hoodie.{DataSourceWriteOptions, OverwriteWithLatestAvroPayload, SimpleKeyGenerator} import com.uber.hoodie.common.util.SchemaTestUtil import com.uber.hoodie.exception.HoodieException +import com.uber.hoodie.{DataSourceWriteOptions, OverwriteWithLatestAvroPayload, SimpleKeyGenerator} import org.apache.avro.generic.GenericRecord import org.apache.commons.configuration.PropertiesConfiguration import org.junit.Assert._ @@ -31,7 +31,7 @@ import org.scalatest.junit.AssertionsForJUnit class DataSourceDefaultsTest extends AssertionsForJUnit { val schema = SchemaTestUtil.getComplexEvolvedSchema - var baseRecord : GenericRecord = null + var baseRecord: GenericRecord = null @Before def initialize(): Unit = { baseRecord = SchemaTestUtil @@ -39,12 +39,13 @@ class DataSourceDefaultsTest extends AssertionsForJUnit { } - private def getKeyConfig(recordKeyFieldName: String, paritionPathField: String): PropertiesConfiguration = { + private def getKeyConfig(recordKeyFieldName: String, paritionPathField: String): PropertiesConfiguration = { val props = new PropertiesConfiguration() props.addProperty(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, recordKeyFieldName) props.addProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY, paritionPathField) props } + @Test def testSimpleKeyGenerator() = { // top level, valid fields val hk1 = new SimpleKeyGenerator(getKeyConfig("field1", "name")).getKey(baseRecord) diff --git a/hoodie-spark/src/test/scala/DataSourceTest.scala b/hoodie-spark/src/test/scala/DataSourceTest.scala index 764206f24..b9fed1cfe 100644 --- a/hoodie-spark/src/test/scala/DataSourceTest.scala +++ b/hoodie-spark/src/test/scala/DataSourceTest.scala 
@@ -20,11 +20,11 @@ import com.uber.hoodie.common.HoodieTestDataGenerator import com.uber.hoodie.common.util.FSUtils import com.uber.hoodie.config.HoodieWriteConfig import com.uber.hoodie.{DataSourceReadOptions, DataSourceWriteOptions, HoodieDataSourceHelpers} -import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.hadoop.fs.FileSystem import org.apache.spark.sql._ import org.junit.Assert._ -import org.junit.{Before, Test} import org.junit.rules.TemporaryFolder +import org.junit.{Before, Test} import org.scalatest.junit.AssertionsForJUnit import scala.collection.JavaConversions._ @@ -44,8 +44,8 @@ class DataSourceTest extends AssertionsForJUnit { DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY -> "timestamp", HoodieWriteConfig.TABLE_NAME -> "hoodie_test" ) - var basePath : String = null - var fs : FileSystem = null + var basePath: String = null + var fs: FileSystem = null @Before def initialize() { spark = SparkSession.builder diff --git a/hoodie-utilities/pom.xml b/hoodie-utilities/pom.xml index 901e6bf5f..a6df1595f 100644 --- a/hoodie-utilities/pom.xml +++ b/hoodie-utilities/pom.xml @@ -15,282 +15,284 @@ ~ limitations under the License. 
--> - - - hoodie - com.uber.hoodie - 0.4.1-SNAPSHOT - - 4.0.0 + + + hoodie + com.uber.hoodie + 0.4.1-SNAPSHOT + + 4.0.0 - hoodie-utilities - jar + hoodie-utilities + jar - - - - org.jacoco - jacoco-maven-plugin - - - org.apache.maven.plugins - maven-compiler-plugin - - 1.8 - 1.8 - - - - org.apache.maven.plugins - maven-assembly-plugin - 2.4.1 - - - src/assembly/src.xml - - - - com.uber.hoodie.utilities.deltastreamer.HoodieDeltaStreamer - - + + + + org.jacoco + jacoco-maven-plugin + + + org.apache.maven.plugins + maven-compiler-plugin + + 1.8 + 1.8 + + + + org.apache.maven.plugins + maven-assembly-plugin + 2.4.1 + + + src/assembly/src.xml + + + + com.uber.hoodie.utilities.deltastreamer.HoodieDeltaStreamer + + - - - - make-assembly - - package - - single - - - - - + + + + make-assembly + + package + + single + + + + + - - - src/main/resources - - - src/test/resources - - - + + + src/main/resources + + + src/test/resources + + + - - - org.apache.spark - spark-sql_2.11 - + + + org.apache.spark + spark-sql_2.11 + - - com.uber.hoodie - hoodie-common - ${project.version} - + + com.uber.hoodie + hoodie-common + ${project.version} + - - com.uber.hoodie - hoodie-common - ${project.version} - test-jar - test - + + com.uber.hoodie + hoodie-common + ${project.version} + test-jar + test + - - com.uber.hoodie - hoodie-spark - ${project.version} - + + com.uber.hoodie + hoodie-spark + ${project.version} + - - org.apache.hadoop - hadoop-hdfs - tests - - - - org.mortbay.jetty - * - - - javax.servlet.jsp - * - - - javax.servlet - * - - - - - org.apache.hadoop - hadoop-common - tests - - - org.mortbay.jetty - * - - - javax.servlet.jsp - * - - - javax.servlet - * - - - + + org.apache.hadoop + hadoop-hdfs + tests + + + + org.mortbay.jetty + * + + + javax.servlet.jsp + * + + + javax.servlet + * + + + + + org.apache.hadoop + hadoop-common + tests + + + org.mortbay.jetty + * + + + javax.servlet.jsp + * + + + javax.servlet + * + + + - - com.uber.hoodie - hoodie-hive - ${project.version} - - 
- javax.servlet - servlet-api - - - + + com.uber.hoodie + hoodie-hive + ${project.version} + + + javax.servlet + servlet-api + + + - - com.uber.hoodie - hoodie-client - ${project.version} - + + com.uber.hoodie + hoodie-client + ${project.version} + - - com.uber.hoodie - hoodie-client - ${project.version} - test-jar - test - + + com.uber.hoodie + hoodie-client + ${project.version} + test-jar + test + - - org.apache.hive - hive-jdbc - ${hive.version}-cdh${cdh.version} - standalone - - - org.slf4j - slf4j-api - - - javax.servlet - servlet-api - - - + + org.apache.hive + hive-jdbc + ${hive.version}-cdh${cdh.version} + standalone + + + org.slf4j + slf4j-api + + + javax.servlet + servlet-api + + + - - commons-dbcp - commons-dbcp - - - org.apache.httpcomponents - httpcore - + + commons-dbcp + commons-dbcp + + + org.apache.httpcomponents + httpcore + - - log4j - log4j - - - org.slf4j - slf4j-api - + + log4j + log4j + + + org.slf4j + slf4j-api + - - org.apache.hadoop - hadoop-mapreduce-client-common - - - javax.servlet - servlet-api - - - + + org.apache.hadoop + hadoop-mapreduce-client-common + + + javax.servlet + servlet-api + + + - - org.apache.hadoop - hadoop-client - - - javax.servlet - servlet-api - - - + + org.apache.hadoop + hadoop-client + + + javax.servlet + servlet-api + + + - - org.apache.spark - spark-core_2.11 - - - javax.servlet - servlet-api - - - + + org.apache.spark + spark-core_2.11 + + + javax.servlet + servlet-api + + + - - org.apache.spark - spark-streaming_2.11 - ${spark.version} - provided - + + org.apache.spark + spark-streaming_2.11 + ${spark.version} + provided + - - org.apache.spark - spark-streaming-kafka-0-8_2.11 - ${spark.version} - + + org.apache.spark + spark-streaming-kafka-0-8_2.11 + ${spark.version} + - - - org.antlr - stringtemplate - 4.0.2 - + + + org.antlr + stringtemplate + 4.0.2 + - - com.beust - jcommander - + + com.beust + jcommander + - - org.mockito - mockito-all - 1.10.19 - test - - - org.apache.avro - avro-mapred - 
1.7.6-cdh5.7.2 - + + org.mockito + mockito-all + 1.10.19 + test + + + org.apache.avro + avro-mapred + 1.7.6-cdh5.7.2 + - - org.apache.parquet - parquet-avro - + + org.apache.parquet + parquet-avro + - - org.apache.parquet - parquet-hadoop - + + org.apache.parquet + parquet-hadoop + - - com.twitter - bijection-avro_2.11 - 0.9.2 - + + com.twitter + bijection-avro_2.11 + 0.9.2 + - + diff --git a/hoodie-utilities/src/assembly/src.xml b/hoodie-utilities/src/assembly/src.xml index 77b5f87e3..aa2fbcd21 100644 --- a/hoodie-utilities/src/assembly/src.xml +++ b/hoodie-utilities/src/assembly/src.xml @@ -15,8 +15,8 @@ --> + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.3 http://maven.apache.org/xsd/assembly-1.1.3.xsd"> bin jar @@ -40,9 +40,9 @@ - - - + + + diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HDFSParquetImporter.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HDFSParquetImporter.java index a8338b727..27f264974 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HDFSParquetImporter.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HDFSParquetImporter.java @@ -60,253 +60,255 @@ import org.apache.spark.api.java.function.Function; import org.apache.spark.api.java.function.VoidFunction; import scala.Tuple2; -public class HDFSParquetImporter implements Serializable{ +public class HDFSParquetImporter implements Serializable { - private static volatile Logger logger = LogManager.getLogger(HDFSParquetImporter.class); - private final Config cfg; - private final transient FileSystem fs; - public static final SimpleDateFormat PARTITION_FORMATTER = new SimpleDateFormat("yyyy/MM/dd"); + private static volatile Logger logger = LogManager.getLogger(HDFSParquetImporter.class); + private final Config cfg; + private final transient FileSystem fs; + public static final SimpleDateFormat PARTITION_FORMATTER = new 
SimpleDateFormat("yyyy/MM/dd"); - public HDFSParquetImporter( - Config cfg) throws IOException { - this.cfg = cfg; - fs = FSUtils.getFs(); + public HDFSParquetImporter( + Config cfg) throws IOException { + this.cfg = cfg; + fs = FSUtils.getFs(); + } + + public static class FormatValidator implements IValueValidator { + + List validFormats = Arrays.asList("parquet"); + + @Override + public void validate(String name, String value) throws ParameterException { + if (value == null || !validFormats.contains(value)) { + throw new ParameterException(String + .format("Invalid format type: value:%s: supported formats:%s", value, + validFormats)); + } + } + } + + public static class SourceTypeValidator implements IValueValidator { + + List validSourceTypes = Arrays.asList("hdfs"); + + @Override + public void validate(String name, String value) throws ParameterException { + if (value == null || !validSourceTypes.contains(value)) { + throw new ParameterException(String + .format("Invalid source type: value:%s: supported source types:%s", value, + validSourceTypes)); + } + } + } + + public static class Config implements Serializable { + + @Parameter(names = {"--src-path", + "-sp"}, description = "Base path for the input dataset", required = true) + public String srcPath = null; + @Parameter(names = {"--src-type", + "-st"}, description = "Source type for the input dataset", required = true, + validateValueWith = SourceTypeValidator.class) + public String srcType = null; + @Parameter(names = {"--target-path", + "-tp"}, description = "Base path for the target hoodie dataset", required = true) + public String targetPath = null; + @Parameter(names = {"--table-name", "-tn"}, description = "Table name", required = true) + public String tableName = null; + @Parameter(names = {"--table-type", "-tt"}, description = "Table type", required = true) + public String tableType = null; + @Parameter(names = {"--row-key-field", + "-rk"}, description = "Row key field name", required = true) + 
public String rowKey = null; + @Parameter(names = {"--partition-key-field", + "-pk"}, description = "Partition key field name", required = true) + public String partitionKey = null; + @Parameter(names = {"--parallelism", + "-pl"}, description = "Parallelism for hoodie insert", required = true) + public int parallelism = 1; + @Parameter(names = {"--schema-file", + "-sf"}, description = "path for Avro schema file", required = true) + public String schemaFile = null; + @Parameter(names = {"--format", + "-f"}, description = "Format for the input data.", required = false, + validateValueWith = FormatValidator.class) + public String format = null; + @Parameter(names = {"--spark-master", + "-ms"}, description = "Spark master", required = false) + public String sparkMaster = null; + @Parameter(names = {"--spark-memory", + "-sm"}, description = "spark memory to use", required = true) + public String sparkMemory = null; + @Parameter(names = {"--retry", + "-rt"}, description = "number of retries", required = false) + public int retry = 0; + @Parameter(names = {"--help", "-h"}, help = true) + public Boolean help = false; + } + + public static void main(String args[]) throws Exception { + final HDFSParquetImporter.Config cfg = new HDFSParquetImporter.Config(); + JCommander cmd = new JCommander(cfg, args); + if (cfg.help || args.length == 0) { + cmd.usage(); + System.exit(1); + } + HDFSParquetImporter dataImporter = new HDFSParquetImporter(cfg); + dataImporter.dataImport(dataImporter.getSparkContext(), cfg.retry); + } + + private JavaSparkContext getSparkContext() { + SparkConf sparkConf = new SparkConf().setAppName("hoodie-data-importer-" + cfg.tableName); + sparkConf.setMaster(cfg.sparkMaster); + + if (cfg.sparkMaster.startsWith("yarn")) { + sparkConf.set("spark.eventLog.overwrite", "true"); + sparkConf.set("spark.eventLog.enabled", "true"); } - public static class FormatValidator implements IValueValidator { - List validFormats = Arrays.asList("parquet"); + 
sparkConf.set("spark.driver.maxResultSize", "2g"); + sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + sparkConf.set("spark.executor.memory", cfg.sparkMemory); + // Configure hadoop conf + sparkConf.set("spark.hadoop.mapred.output.compress", "true"); + sparkConf.set("spark.hadoop.mapred.output.compression.codec", "true"); + sparkConf.set("spark.hadoop.mapred.output.compression.codec", + "org.apache.hadoop.io.compress.GzipCodec"); + sparkConf.set("spark.hadoop.mapred.output.compression.type", "BLOCK"); + + sparkConf = HoodieWriteClient.registerClasses(sparkConf); + return new JavaSparkContext(sparkConf); + } + + private String getSchema() throws Exception { + // Read schema file. + Path p = new Path(cfg.schemaFile); + if (!fs.exists(p)) { + throw new Exception( + String.format("Could not find - %s - schema file.", cfg.schemaFile)); + } + long len = fs.getFileStatus(p).getLen(); + ByteBuffer buf = ByteBuffer.allocate((int) len); + FSDataInputStream inputStream = null; + try { + inputStream = fs.open(p); + inputStream.readFully(0, buf.array(), 0, buf.array().length); + } finally { + if (inputStream != null) { + inputStream.close(); + } + } + return new String(buf.array()); + } + + public int dataImport(JavaSparkContext jsc, int retry) throws Exception { + int ret = -1; + try { + // Verify that targetPath is not present. + if (fs.exists(new Path(cfg.targetPath))) { + throw new HoodieIOException( + String.format("Make sure %s is not present.", cfg.targetPath)); + } + do { + ret = dataImport(jsc); + } while (ret != 0 && retry-- > 0); + } catch (Throwable t) { + logger.error(t); + } + return ret; + } + + @VisibleForTesting + protected int dataImport(JavaSparkContext jsc) throws IOException { + try { + if (fs.exists(new Path(cfg.targetPath))) { + // cleanup target directory. + fs.delete(new Path(cfg.targetPath), true); + } + + //Get schema. + String schemaStr = getSchema(); + + // Initialize target hoodie table. 
+ Properties properties = new Properties(); + properties.put(HoodieTableConfig.HOODIE_TABLE_NAME_PROP_NAME, cfg.tableName); + properties.put(HoodieTableConfig.HOODIE_TABLE_TYPE_PROP_NAME, cfg.tableType); + HoodieTableMetaClient.initializePathAsHoodieDataset(fs, cfg.targetPath, properties); + + HoodieWriteClient client = createHoodieClient(jsc, cfg.targetPath, schemaStr, + cfg.parallelism); + + Job job = Job.getInstance(jsc.hadoopConfiguration()); + // To parallelize reading file status. + job.getConfiguration().set(FileInputFormat.LIST_STATUS_NUM_THREADS, "1024"); + AvroReadSupport.setAvroReadSchema(jsc.hadoopConfiguration(), + (new Schema.Parser().parse(schemaStr))); + ParquetInputFormat.setReadSupportClass(job, (AvroReadSupport.class)); + + JavaRDD> hoodieRecords = jsc + .newAPIHadoopFile(cfg.srcPath, ParquetInputFormat.class, Void.class, + GenericRecord.class, job.getConfiguration()) + // To reduce large number of tasks. + .coalesce(16 * cfg.parallelism) + .map(new Function, HoodieRecord>() { + @Override + public HoodieRecord call(Tuple2 entry) + throws Exception { + GenericRecord genericRecord = entry._2(); + Object partitionField = genericRecord.get(cfg.partitionKey); + if (partitionField == null) { + throw new HoodieIOException( + "partition key is missing. :" + cfg.partitionKey); + } + Object rowField = genericRecord.get(cfg.rowKey); + if (rowField == null) { + throw new HoodieIOException( + "row field is missing. :" + cfg.rowKey); + } + long ts = (long) ((Double) partitionField * 1000l); + String partitionPath = PARTITION_FORMATTER.format(new Date(ts)); + return new HoodieRecord( + new HoodieKey((String) rowField, partitionPath), + new HoodieJsonPayload(genericRecord.toString())); + } + } + ); + // Get commit time. 
+ String commitTime = client.startCommit(); + + JavaRDD writeResponse = client.bulkInsert(hoodieRecords, commitTime); + Accumulator errors = jsc.accumulator(0); + writeResponse.foreach(new VoidFunction() { @Override - public void validate(String name, String value) throws ParameterException { - if (value == null || !validFormats.contains(value)) { - throw new ParameterException(String - .format("Invalid format type: value:%s: supported formats:%s", value, - validFormats)); - } + public void call(WriteStatus writeStatus) throws Exception { + if (writeStatus.hasErrors()) { + errors.add(1); + logger.error(String.format("Error processing records :writeStatus:%s", + writeStatus.getStat().toString())); + } } + }); + if (errors.value() == 0) { + logger.info(String + .format("Dataset imported into hoodie dataset with %s commit time.", + commitTime)); + return 0; + } + logger.error(String.format("Import failed with %d errors.", errors.value())); + } catch (Throwable t) { + logger.error("Error occurred.", t); } + return -1; + } - public static class SourceTypeValidator implements IValueValidator { - List validSourceTypes = Arrays.asList("hdfs"); - - @Override - public void validate(String name, String value) throws ParameterException { - if (value == null || !validSourceTypes.contains(value)) { - throw new ParameterException(String - .format("Invalid source type: value:%s: supported source types:%s", value, - validSourceTypes)); - } - } - } - - public static class Config implements Serializable { - - @Parameter(names = {"--src-path", - "-sp"}, description = "Base path for the input dataset", required = true) - public String srcPath = null; - @Parameter(names = {"--src-type", - "-st"}, description = "Source type for the input dataset", required = true, - validateValueWith = SourceTypeValidator.class) - public String srcType = null; - @Parameter(names = {"--target-path", - "-tp"}, description = "Base path for the target hoodie dataset", required = true) - public String 
targetPath = null; - @Parameter(names = {"--table-name", "-tn"}, description = "Table name", required = true) - public String tableName = null; - @Parameter(names = {"--table-type", "-tt"}, description = "Table type", required = true) - public String tableType = null; - @Parameter(names = {"--row-key-field", - "-rk"}, description = "Row key field name", required = true) - public String rowKey = null; - @Parameter(names = {"--partition-key-field", - "-pk"}, description = "Partition key field name", required = true) - public String partitionKey = null; - @Parameter(names = {"--parallelism", - "-pl"}, description = "Parallelism for hoodie insert", required = true) - public int parallelism = 1; - @Parameter(names = {"--schema-file", - "-sf"}, description = "path for Avro schema file", required = true) - public String schemaFile = null; - @Parameter(names = {"--format", - "-f"}, description = "Format for the input data.", required = false, - validateValueWith = FormatValidator.class) - public String format = null; - @Parameter(names = {"--spark-master", - "-ms"}, description = "Spark master", required = false) - public String sparkMaster = null; - @Parameter(names = {"--spark-memory", - "-sm"}, description = "spark memory to use", required = true) - public String sparkMemory = null; - @Parameter(names = {"--retry", - "-rt"}, description = "number of retries", required = false) - public int retry = 0; - @Parameter(names = {"--help", "-h"}, help = true) - public Boolean help = false; - } - - public static void main(String args[]) throws Exception { - final HDFSParquetImporter.Config cfg = new HDFSParquetImporter.Config(); - JCommander cmd = new JCommander(cfg, args); - if (cfg.help || args.length == 0) { - cmd.usage(); - System.exit(1); - } - HDFSParquetImporter dataImporter = new HDFSParquetImporter(cfg); - dataImporter.dataImport(dataImporter.getSparkContext(), cfg.retry); - } - - private JavaSparkContext getSparkContext() { - SparkConf sparkConf = new 
SparkConf().setAppName("hoodie-data-importer-" + cfg.tableName); - sparkConf.setMaster(cfg.sparkMaster); - - if (cfg.sparkMaster.startsWith("yarn")) { - sparkConf.set("spark.eventLog.overwrite", "true"); - sparkConf.set("spark.eventLog.enabled", "true"); - } - - sparkConf.set("spark.driver.maxResultSize", "2g"); - sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - sparkConf.set("spark.executor.memory", cfg.sparkMemory); - - // Configure hadoop conf - sparkConf.set("spark.hadoop.mapred.output.compress", "true"); - sparkConf.set("spark.hadoop.mapred.output.compression.codec", "true"); - sparkConf.set("spark.hadoop.mapred.output.compression.codec", - "org.apache.hadoop.io.compress.GzipCodec"); - sparkConf.set("spark.hadoop.mapred.output.compression.type", "BLOCK"); - - sparkConf = HoodieWriteClient.registerClasses(sparkConf); - return new JavaSparkContext(sparkConf); - } - - private String getSchema() throws Exception { - // Read schema file. - Path p = new Path(cfg.schemaFile); - if (!fs.exists(p)) { - throw new Exception( - String.format("Could not find - %s - schema file.", cfg.schemaFile)); - } - long len = fs.getFileStatus(p).getLen(); - ByteBuffer buf = ByteBuffer.allocate((int) len); - FSDataInputStream inputStream = null; - try { - inputStream = fs.open(p); - inputStream.readFully(0, buf.array(), 0, buf.array().length); - } - finally { - if (inputStream != null) - inputStream.close(); - } - return new String(buf.array()); - } - - public int dataImport(JavaSparkContext jsc, int retry) throws Exception { - int ret = -1; - try { - // Verify that targetPath is not present. 
- if (fs.exists(new Path(cfg.targetPath))) { - throw new HoodieIOException( - String.format("Make sure %s is not present.", cfg.targetPath)); - } - do { - ret = dataImport(jsc); - } while (ret != 0 && retry-- > 0); - } catch (Throwable t) { - logger.error(t); - } - return ret; - } - - @VisibleForTesting - protected int dataImport(JavaSparkContext jsc) throws IOException { - try { - if (fs.exists(new Path(cfg.targetPath))) { - // cleanup target directory. - fs.delete(new Path(cfg.targetPath), true); - } - - //Get schema. - String schemaStr = getSchema(); - - // Initialize target hoodie table. - Properties properties = new Properties(); - properties.put(HoodieTableConfig.HOODIE_TABLE_NAME_PROP_NAME, cfg.tableName); - properties.put(HoodieTableConfig.HOODIE_TABLE_TYPE_PROP_NAME, cfg.tableType); - HoodieTableMetaClient.initializePathAsHoodieDataset(fs, cfg.targetPath, properties); - - HoodieWriteClient client = createHoodieClient(jsc, cfg.targetPath, schemaStr, - cfg.parallelism); - - Job job = Job.getInstance(jsc.hadoopConfiguration()); - // To parallelize reading file status. - job.getConfiguration().set(FileInputFormat.LIST_STATUS_NUM_THREADS, "1024"); - AvroReadSupport.setAvroReadSchema(jsc.hadoopConfiguration(), - (new Schema.Parser().parse(schemaStr))); - ParquetInputFormat.setReadSupportClass(job, (AvroReadSupport.class)); - - JavaRDD> hoodieRecords = jsc - .newAPIHadoopFile(cfg.srcPath, ParquetInputFormat.class, Void.class, - GenericRecord.class, job.getConfiguration()) - // To reduce large number of tasks. - .coalesce(16 * cfg.parallelism) - .map(new Function, HoodieRecord>() { - @Override - public HoodieRecord call(Tuple2 entry) - throws Exception { - GenericRecord genericRecord = entry._2(); - Object partitionField = genericRecord.get(cfg.partitionKey); - if (partitionField == null) { - throw new HoodieIOException( - "partition key is missing. 
:" + cfg.partitionKey); - } - Object rowField = genericRecord.get(cfg.rowKey); - if (rowField == null) { - throw new HoodieIOException( - "row field is missing. :" + cfg.rowKey); - } - long ts = (long) ((Double) partitionField * 1000l); - String partitionPath = PARTITION_FORMATTER.format(new Date(ts)); - return new HoodieRecord( - new HoodieKey((String) rowField, partitionPath), - new HoodieJsonPayload(genericRecord.toString())); - } - } - ); - // Get commit time. - String commitTime = client.startCommit(); - - JavaRDD writeResponse = client.bulkInsert(hoodieRecords, commitTime); - Accumulator errors = jsc.accumulator(0); - writeResponse.foreach(new VoidFunction() { - @Override - public void call(WriteStatus writeStatus) throws Exception { - if (writeStatus.hasErrors()) { - errors.add(1); - logger.error(String.format("Error processing records :writeStatus:%s", - writeStatus.getStat().toString())); - } - } - }); - if (errors.value() == 0) { - logger.info(String - .format("Dataset imported into hoodie dataset with %s commit time.", - commitTime)); - return 0; - } - logger.error(String.format("Import failed with %d errors.", errors.value())); - } catch (Throwable t) { - logger.error("Error occurred.", t); - } - return -1; - } - - private static HoodieWriteClient createHoodieClient(JavaSparkContext jsc, String basePath, - String schemaStr, int parallelism) throws Exception { - HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) - .withParallelism(parallelism, parallelism).withSchema(schemaStr) - .combineInput(true, true).withIndexConfig( - HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) - .build(); - return new HoodieWriteClient(jsc, config); - } + private static HoodieWriteClient createHoodieClient(JavaSparkContext jsc, String basePath, + String schemaStr, int parallelism) throws Exception { + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) + .withParallelism(parallelism, 
parallelism).withSchema(schemaStr) + .combineInput(true, true).withIndexConfig( + HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) + .build(); + return new HoodieWriteClient(jsc, config); + } } diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HiveIncrementalPuller.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HiveIncrementalPuller.java index 7c6230d5b..61aec29fb 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HiveIncrementalPuller.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HiveIncrementalPuller.java @@ -25,19 +25,6 @@ import com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.exception.HoodieException; import com.uber.hoodie.utilities.exception.HoodieIncrementalPullException; import com.uber.hoodie.utilities.exception.HoodieIncrementalPullSQLException; - -import org.apache.commons.dbcp.BasicDataSource; -import org.apache.commons.io.IOUtils; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.permission.FsAction; -import org.apache.hadoop.fs.permission.FsPermission; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; -import org.stringtemplate.v4.ST; - -import javax.sql.DataSource; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; @@ -50,301 +37,343 @@ import java.util.List; import java.util.Optional; import java.util.Scanner; import java.util.stream.Collectors; +import javax.sql.DataSource; +import org.apache.commons.dbcp.BasicDataSource; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.permission.FsAction; +import org.apache.hadoop.fs.permission.FsPermission; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; 
+import org.stringtemplate.v4.ST; /** - * Utility to pull data after a given commit, based on the supplied HiveQL and save the delta as another hive temporary table. + * Utility to pull data after a given commit, based on the supplied HiveQL and save the delta as + * another hive temporary table. * * Current Limitations: * - * - Only the source table can be incrementally pulled (usually the largest table) - * - The incrementally pulled table can't be referenced more than once. + * - Only the source table can be incrementally pulled (usually the largest table) - The + * incrementally pulled table can't be referenced more than once. */ public class HiveIncrementalPuller { - private static Logger log = LogManager.getLogger(HiveIncrementalPuller.class); - private static String driverName = "org.apache.hive.jdbc.HiveDriver"; + private static Logger log = LogManager.getLogger(HiveIncrementalPuller.class); + private static String driverName = "org.apache.hive.jdbc.HiveDriver"; - public static class Config implements Serializable { - @Parameter(names = {"--hiveUrl"}) public String hiveJDBCUrl = - "jdbc:hive2://localhost:10014/;transportMode=http;httpPath=hs2"; - @Parameter(names = {"--hiveUser"}) public String hiveUsername = "hive"; - @Parameter(names = {"--hivePass"}) public String hivePassword = ""; - @Parameter(names = {"--queue"}) public String yarnQueueName = "hadoop-queue"; - @Parameter(names = {"--tmp"}) public String hoodieTmpDir = "/app/hoodie/intermediate"; - @Parameter(names = {"--extractSQLFile"}, required = true) public String incrementalSQLFile; - @Parameter(names = {"--sourceDb"}, required = true) public String sourceDb; - @Parameter(names = {"--sourceTable"}, required = true) public String sourceTable; - @Parameter(names = {"--targetDb"}) public String targetDb; - @Parameter(names = {"--targetTable"}, required = true) public String targetTable; - @Parameter(names = {"--tmpdb"}) public String tmpDb = "tmp"; - @Parameter(names = {"--fromCommitTime"}) public 
String fromCommitTime; - @Parameter(names = {"--maxCommits"}) public int maxCommits = 3; - @Parameter(names = {"--help", "-h"}, help = true) public Boolean help = false; - @Parameter(names = {"--storageFormat"}) public String tempTableStorageFormat = "AVRO"; + public static class Config implements Serializable { + + @Parameter(names = {"--hiveUrl"}) + public String hiveJDBCUrl = + "jdbc:hive2://localhost:10014/;transportMode=http;httpPath=hs2"; + @Parameter(names = {"--hiveUser"}) + public String hiveUsername = "hive"; + @Parameter(names = {"--hivePass"}) + public String hivePassword = ""; + @Parameter(names = {"--queue"}) + public String yarnQueueName = "hadoop-queue"; + @Parameter(names = {"--tmp"}) + public String hoodieTmpDir = "/app/hoodie/intermediate"; + @Parameter(names = {"--extractSQLFile"}, required = true) + public String incrementalSQLFile; + @Parameter(names = {"--sourceDb"}, required = true) + public String sourceDb; + @Parameter(names = {"--sourceTable"}, required = true) + public String sourceTable; + @Parameter(names = {"--targetDb"}) + public String targetDb; + @Parameter(names = {"--targetTable"}, required = true) + public String targetTable; + @Parameter(names = {"--tmpdb"}) + public String tmpDb = "tmp"; + @Parameter(names = {"--fromCommitTime"}) + public String fromCommitTime; + @Parameter(names = {"--maxCommits"}) + public int maxCommits = 3; + @Parameter(names = {"--help", "-h"}, help = true) + public Boolean help = false; + @Parameter(names = {"--storageFormat"}) + public String tempTableStorageFormat = "AVRO"; + } + + static { + try { + Class.forName(driverName); + } catch (ClassNotFoundException e) { + throw new IllegalStateException("Could not find " + driverName + " in classpath. ", e); } + } - static { - try { - Class.forName(driverName); - } catch (ClassNotFoundException e) { - throw new IllegalStateException("Could not find " + driverName + " in classpath. 
", e); + private Connection connection; + protected final Config config; + private final ST incrementalPullSQLtemplate; + + public HiveIncrementalPuller(Config config) throws IOException { + this.config = config; + validateConfig(config); + String templateContent = IOUtils + .toString(this.getClass().getResourceAsStream("IncrementalPull.sqltemplate")); + incrementalPullSQLtemplate = new ST(templateContent); + } + + private void validateConfig(Config config) { + if (config.maxCommits == -1) { + config.maxCommits = Integer.MAX_VALUE; + } + } + + public void saveDelta() throws IOException { + Configuration conf = new Configuration(); + FileSystem fs = FileSystem.get(conf); + Statement stmt = null; + try { + if (config.fromCommitTime == null) { + config.fromCommitTime = inferCommitTime(fs); + log.info("FromCommitTime inferred as " + config.fromCommitTime); + } + + log.info("FromCommitTime - " + config.fromCommitTime); + String sourceTableLocation = getTableLocation(config.sourceDb, config.sourceTable); + String lastCommitTime = getLastCommitTimePulled(fs, sourceTableLocation); + if (lastCommitTime == null) { + log.info("Nothing to pull. However we will continue to create a empty table"); + lastCommitTime = config.fromCommitTime; + } + + Connection conn = getConnection(); + stmt = conn.createStatement(); + // drop the temp table if exists + String tempDbTable = config.tmpDb + "." 
+ config.targetTable + "__" + config.sourceTable; + String tempDbTablePath = + config.hoodieTmpDir + "/" + config.targetTable + "__" + config.sourceTable + "/" + + lastCommitTime; + executeStatement("drop table " + tempDbTable, stmt); + deleteHDFSPath(fs, tempDbTablePath); + if (!ensureTempPathExists(fs, lastCommitTime)) { + throw new IllegalStateException( + "Could not create target path at " + new Path(config.hoodieTmpDir, + config.targetTable + "/" + lastCommitTime)); + } + + initHiveBeelineProperties(stmt); + executeIncrementalSQL(tempDbTable, tempDbTablePath, stmt); + log.info("Finished HoodieReader execution"); + } catch (SQLException e) { + log.error("Exception when executing SQL", e); + throw new IOException("Could not scan " + config.sourceTable + " incrementally", e); + } finally { + try { + if (stmt != null) { + stmt.close(); } + } catch (SQLException e) { + log.error("Could not close the resultset opened ", e); + } + } + } + + private void executeIncrementalSQL(String tempDbTable, String tempDbTablePath, Statement stmt) + throws FileNotFoundException, SQLException { + incrementalPullSQLtemplate.add("tempDbTable", tempDbTable); + incrementalPullSQLtemplate.add("tempDbTablePath", tempDbTablePath); + + String storedAsClause = getStoredAsClause(); + + incrementalPullSQLtemplate.add("storedAsClause", storedAsClause); + String incrementalSQL = + new Scanner(new File(config.incrementalSQLFile)).useDelimiter("\\Z").next(); + if (!incrementalSQL.contains(config.sourceDb + "." + config.sourceTable)) { + log.info("Incremental SQL does not have " + config.sourceDb + "." + config.sourceTable + + ", which means its pulling from a different table. Fencing this from happening."); + throw new HoodieIncrementalPullSQLException( + "Incremental SQL does not have " + config.sourceDb + "." 
+ config.sourceTable); + } + if (!incrementalSQL.contains("`_hoodie_commit_time` > '%targetBasePath'")) { + log.info("Incremental SQL : " + incrementalSQL + + " does not contain `_hoodie_commit_time` > '%targetBasePath'. Please add this clause for incremental to work properly."); + throw new HoodieIncrementalPullSQLException( + "Incremental SQL does not have clause `_hoodie_commit_time` > '%targetBasePath', which means its not pulling incrementally"); } - private Connection connection; - protected final Config config; - private final ST incrementalPullSQLtemplate; + incrementalPullSQLtemplate + .add("incrementalSQL", String.format(incrementalSQL, config.fromCommitTime)); + String sql = incrementalPullSQLtemplate.render(); + // Check if the SQL is pulling from the right database + executeStatement(sql, stmt); + } - public HiveIncrementalPuller(Config config) throws IOException { - this.config = config; - validateConfig(config); - String templateContent = IOUtils.toString(this.getClass().getResourceAsStream("IncrementalPull.sqltemplate")); - incrementalPullSQLtemplate = new ST(templateContent); + private String getStoredAsClause() { + if (config.tempTableStorageFormat.equalsIgnoreCase("json")) { + // Special case for json + // default json serde does not support having same key even if its under multiple depths + return "ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe' STORED AS TEXTFILE"; } + return "STORED AS " + config.tempTableStorageFormat; + } - private void validateConfig(Config config) { - if(config.maxCommits == -1) { - config.maxCommits = Integer.MAX_VALUE; + private void initHiveBeelineProperties(Statement stmt) throws SQLException { + log.info("Setting up Hive JDBC Session with properties"); + // set the queue + executeStatement("set mapred.job.queue.name=" + config.yarnQueueName, stmt); + // Set the inputformat to HoodieCombineHiveInputFormat + executeStatement( + "set hive.input.format=com.uber.hoodie.hadoop.hive.HoodieCombineHiveInputFormat", 
stmt); + // Allow queries without partition predicate + executeStatement("set hive.strict.checks.large.query=false", stmt); + // Dont gather stats for the table created + executeStatement("set hive.stats.autogather=false", stmt); + // Set the hoodie modie + executeStatement("set hoodie." + config.sourceTable + ".consume.mode=INCREMENTAL", stmt); + // Set the from commit time + executeStatement("set hoodie." + config.sourceTable + ".consume.start.timestamp=" + + config.fromCommitTime, stmt); + // Set number of commits to pull + executeStatement("set hoodie." + config.sourceTable + ".consume.max.commits=" + String + .valueOf(config.maxCommits), stmt); + } + + private boolean deleteHDFSPath(FileSystem fs, String path) throws IOException { + log.info("Deleting path " + path); + return fs.delete(new Path(path), true); + } + + private void executeStatement(String sql, Statement stmt) throws SQLException { + log.info("Executing: " + sql); + stmt.execute(sql); + } + + private String inferCommitTime(FileSystem fs) throws SQLException, IOException { + log.info("FromCommitTime not specified. Trying to infer it from Hoodie dataset " + + config.targetDb + "." + config.targetTable); + String targetDataLocation = getTableLocation(config.targetDb, config.targetTable); + return scanForCommitTime(fs, targetDataLocation); + } + + private String getTableLocation(String db, String table) throws SQLException { + ResultSet resultSet = null; + Statement stmt = null; + try { + Connection conn = getConnection(); + stmt = conn.createStatement(); + resultSet = stmt.executeQuery("describe formatted `" + db + "." + table + "`"); + while (resultSet.next()) { + if (resultSet.getString(1).trim().equals("Location:")) { + log.info("Inferred table location for " + db + "." 
+ table + " as " + resultSet + .getString(2)); + return resultSet.getString(2); } - } - - public void saveDelta() throws IOException { - Configuration conf = new Configuration(); - FileSystem fs = FileSystem.get(conf); - Statement stmt = null; - try { - if (config.fromCommitTime == null) { - config.fromCommitTime = inferCommitTime(fs); - log.info("FromCommitTime inferred as " + config.fromCommitTime); - } - - log.info("FromCommitTime - " + config.fromCommitTime); - String sourceTableLocation = getTableLocation(config.sourceDb, config.sourceTable); - String lastCommitTime = getLastCommitTimePulled(fs, sourceTableLocation); - if (lastCommitTime == null) { - log.info("Nothing to pull. However we will continue to create a empty table"); - lastCommitTime = config.fromCommitTime; - } - - Connection conn = getConnection(); - stmt = conn.createStatement(); - // drop the temp table if exists - String tempDbTable = config.tmpDb + "." + config.targetTable + "__" + config.sourceTable; - String tempDbTablePath = config.hoodieTmpDir + "/" + config.targetTable + "__" + config.sourceTable + "/" + lastCommitTime; - executeStatement("drop table " + tempDbTable, stmt); - deleteHDFSPath(fs, tempDbTablePath); - if (!ensureTempPathExists(fs, lastCommitTime)) { - throw new IllegalStateException( - "Could not create target path at " + new Path(config.hoodieTmpDir, - config.targetTable + "/" + lastCommitTime)); - } - - initHiveBeelineProperties(stmt); - executeIncrementalSQL(tempDbTable, tempDbTablePath, stmt); - log.info("Finished HoodieReader execution"); - } catch (SQLException e) { - log.error("Exception when executing SQL", e); - throw new IOException("Could not scan " + config.sourceTable + " incrementally", e); - } finally { - try { - if (stmt != null) - stmt.close(); - } catch (SQLException e) { - log.error("Could not close the resultset opened ", e); - } + } + } catch (SQLException e) { + throw new HoodieIncrementalPullException( + "Failed to get data location for table " + db + 
"." + table, e); + } finally { + try { + if (stmt != null) { + stmt.close(); } - } - - private void executeIncrementalSQL(String tempDbTable, String tempDbTablePath, Statement stmt) - throws FileNotFoundException, SQLException { - incrementalPullSQLtemplate.add("tempDbTable", tempDbTable); - incrementalPullSQLtemplate.add("tempDbTablePath", tempDbTablePath); - - String storedAsClause = getStoredAsClause(); - - incrementalPullSQLtemplate.add("storedAsClause", storedAsClause); - String incrementalSQL = - new Scanner(new File(config.incrementalSQLFile)).useDelimiter("\\Z").next(); - if (!incrementalSQL.contains(config.sourceDb + "." + config.sourceTable)) { - log.info("Incremental SQL does not have " + config.sourceDb + "." + config.sourceTable - + ", which means its pulling from a different table. Fencing this from happening."); - throw new HoodieIncrementalPullSQLException( - "Incremental SQL does not have " + config.sourceDb + "." + config.sourceTable); - } - if (!incrementalSQL.contains("`_hoodie_commit_time` > '%targetBasePath'")) { - log.info("Incremental SQL : " + incrementalSQL - + " does not contain `_hoodie_commit_time` > '%targetBasePath'. 
Please add this clause for incremental to work properly."); - throw new HoodieIncrementalPullSQLException( - "Incremental SQL does not have clause `_hoodie_commit_time` > '%targetBasePath', which means its not pulling incrementally"); + if (resultSet != null) { + resultSet.close(); } + } catch (SQLException e) { + log.error("Could not close the resultset opened ", e); + } + } + return null; + } - incrementalPullSQLtemplate - .add("incrementalSQL", String.format(incrementalSQL, config.fromCommitTime)); - String sql = incrementalPullSQLtemplate.render(); - // Check if the SQL is pulling from the right database - executeStatement(sql, stmt); + private String scanForCommitTime(FileSystem fs, String targetDataPath) throws IOException { + if (targetDataPath == null) { + throw new IllegalArgumentException( + "Please specify either --fromCommitTime or --targetDataPath"); + } + if (!fs.exists(new Path(targetDataPath)) || !fs.exists(new Path(targetDataPath + "/.hoodie"))) { + return "0"; + } + HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, targetDataPath); + + Optional + lastCommit = metadata.getActiveTimeline().getCommitsAndCompactionsTimeline() + .filterCompletedInstants().lastInstant(); + if (lastCommit.isPresent()) { + return lastCommit.get().getTimestamp(); + } + return "0"; + } + + private boolean ensureTempPathExists(FileSystem fs, String lastCommitTime) + throws IOException { + Path targetBaseDirPath = new Path(config.hoodieTmpDir, + config.targetTable + "__" + config.sourceTable); + if (!fs.exists(targetBaseDirPath)) { + log.info("Creating " + targetBaseDirPath + " with permission drwxrwxrwx"); + boolean result = FileSystem.mkdirs(fs, targetBaseDirPath, + new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL)); + if (!result) { + throw new HoodieException( + "Could not create " + targetBaseDirPath + " with the required permissions"); + } } - private String getStoredAsClause() { - if(config.tempTableStorageFormat.equalsIgnoreCase("json")) { - // 
Special case for json - // default json serde does not support having same key even if its under multiple depths - return "ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe' STORED AS TEXTFILE"; - } - return "STORED AS " + config.tempTableStorageFormat; + Path targetPath = new Path(targetBaseDirPath, lastCommitTime); + if (fs.exists(targetPath)) { + boolean result = fs.delete(targetPath, true); + if (!result) { + throw new HoodieException( + "Could not delete existing " + targetPath); + } } + log.info("Creating " + targetPath + " with permission drwxrwxrwx"); + return FileSystem.mkdirs(fs, targetBaseDirPath, + new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL)); + } - private void initHiveBeelineProperties(Statement stmt) throws SQLException { - log.info("Setting up Hive JDBC Session with properties"); - // set the queue - executeStatement("set mapred.job.queue.name=" + config.yarnQueueName, stmt); - // Set the inputformat to HoodieCombineHiveInputFormat - executeStatement("set hive.input.format=com.uber.hoodie.hadoop.hive.HoodieCombineHiveInputFormat", stmt); - // Allow queries without partition predicate - executeStatement("set hive.strict.checks.large.query=false", stmt); - // Dont gather stats for the table created - executeStatement("set hive.stats.autogather=false", stmt); - // Set the hoodie modie - executeStatement("set hoodie." + config.sourceTable + ".consume.mode=INCREMENTAL", stmt); - // Set the from commit time - executeStatement("set hoodie." + config.sourceTable + ".consume.start.timestamp=" - + config.fromCommitTime, stmt); - // Set number of commits to pull - executeStatement("set hoodie." 
+ config.sourceTable + ".consume.max.commits=" + String - .valueOf(config.maxCommits), stmt); + private String getLastCommitTimePulled(FileSystem fs, String sourceTableLocation) + throws IOException { + HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, sourceTableLocation); + List commitsToSync = metadata.getActiveTimeline().getCommitsAndCompactionsTimeline() + .filterCompletedInstants() + .findInstantsAfter(config.fromCommitTime, config.maxCommits).getInstants() + .map(HoodieInstant::getTimestamp) + .collect(Collectors.toList()); + if (commitsToSync.isEmpty()) { + log.warn("Nothing to sync. All commits in " + config.sourceTable + " are " + metadata + .getActiveTimeline().getCommitsAndCompactionsTimeline().filterCompletedInstants() + .getInstants() + .collect(Collectors.toList()) + " and from commit time is " + + config.fromCommitTime); + return null; } + log.info("Syncing commits " + commitsToSync); + return commitsToSync.get(commitsToSync.size() - 1); + } - private boolean deleteHDFSPath(FileSystem fs, String path) throws IOException { - log.info("Deleting path " + path); - return fs.delete(new Path(path), true); + private Connection getConnection() throws SQLException { + if (connection == null) { + DataSource ds = getDatasource(); + log.info("Getting Hive Connection from Datasource " + ds); + this.connection = ds.getConnection(); } + return connection; + } - private void executeStatement(String sql, Statement stmt) throws SQLException { - log.info("Executing: " + sql); - stmt.execute(sql); - } - - private String inferCommitTime(FileSystem fs) throws SQLException, IOException { - log.info("FromCommitTime not specified. Trying to infer it from Hoodie dataset " - + config.targetDb + "." 
+ config.targetTable); - String targetDataLocation = getTableLocation(config.targetDb, config.targetTable); - return scanForCommitTime(fs, targetDataLocation); - } - - private String getTableLocation(String db, String table) throws SQLException { - ResultSet resultSet = null; - Statement stmt = null; - try { - Connection conn = getConnection(); - stmt = conn.createStatement(); - resultSet = stmt.executeQuery("describe formatted `" + db + "." + table + "`"); - while (resultSet.next()) { - if (resultSet.getString(1).trim().equals("Location:")) { - log.info("Inferred table location for " + db + "." + table + " as " + resultSet - .getString(2)); - return resultSet.getString(2); - } - } - } catch (SQLException e) { - throw new HoodieIncrementalPullException( - "Failed to get data location for table " + db + "." + table, e); - } finally { - try { - if (stmt != null) - stmt.close(); - if (resultSet != null) - resultSet.close(); - } catch (SQLException e) { - log.error("Could not close the resultset opened ", e); - } - } - return null; - } - - private String scanForCommitTime(FileSystem fs, String targetDataPath) throws IOException { - if(targetDataPath == null) { - throw new IllegalArgumentException("Please specify either --fromCommitTime or --targetDataPath"); - } - if(!fs.exists(new Path(targetDataPath)) || !fs.exists(new Path(targetDataPath + "/.hoodie"))) { - return "0"; - } - HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, targetDataPath); - - Optional - lastCommit = metadata.getActiveTimeline().getCommitsAndCompactionsTimeline().filterCompletedInstants().lastInstant(); - if(lastCommit.isPresent()) { - return lastCommit.get().getTimestamp(); - } - return "0"; - } - - private boolean ensureTempPathExists(FileSystem fs, String lastCommitTime) - throws IOException { - Path targetBaseDirPath = new Path(config.hoodieTmpDir, config.targetTable + "__" + config.sourceTable); - if(!fs.exists(targetBaseDirPath)) { - log.info("Creating " + targetBaseDirPath + " 
with permission drwxrwxrwx"); - boolean result = FileSystem.mkdirs(fs, targetBaseDirPath, - new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL)); - if (!result) { - throw new HoodieException( - "Could not create " + targetBaseDirPath + " with the required permissions"); - } - } - - Path targetPath = new Path(targetBaseDirPath, lastCommitTime); - if(fs.exists(targetPath)) { - boolean result = fs.delete(targetPath, true); - if (!result) { - throw new HoodieException( - "Could not delete existing " + targetPath); - } - } - log.info("Creating " + targetPath + " with permission drwxrwxrwx"); - return FileSystem.mkdirs(fs, targetBaseDirPath, - new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL)); - } - - private String getLastCommitTimePulled(FileSystem fs, String sourceTableLocation) throws IOException { - HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, sourceTableLocation); - List commitsToSync = metadata.getActiveTimeline().getCommitsAndCompactionsTimeline().filterCompletedInstants() - .findInstantsAfter(config.fromCommitTime, config.maxCommits).getInstants().map(HoodieInstant::getTimestamp) - .collect(Collectors.toList()); - if (commitsToSync.isEmpty()) { - log.warn("Nothing to sync. 
All commits in " + config.sourceTable + " are " + metadata - .getActiveTimeline().getCommitsAndCompactionsTimeline().filterCompletedInstants().getInstants() - .collect(Collectors.toList()) + " and from commit time is " - + config.fromCommitTime); - return null; - } - log.info("Syncing commits " + commitsToSync); - return commitsToSync.get(commitsToSync.size() - 1); - } - - private Connection getConnection() throws SQLException { - if (connection == null) { - DataSource ds = getDatasource(); - log.info("Getting Hive Connection from Datasource " + ds); - this.connection = ds.getConnection(); - } - return connection; - } - - private DataSource getDatasource() { - BasicDataSource ds = new BasicDataSource(); - ds.setDriverClassName(driverName); - ds.setUrl(config.hiveJDBCUrl); - ds.setUsername(config.hiveUsername); - ds.setPassword(config.hivePassword); - return ds; - } - - public static void main(String[] args) throws IOException { - final Config cfg = new Config(); - JCommander cmd = new JCommander(cfg, args); - if (cfg.help || args.length == 0) { - cmd.usage(); - System.exit(1); - } - new HiveIncrementalPuller(cfg).saveDelta(); + private DataSource getDatasource() { + BasicDataSource ds = new BasicDataSource(); + ds.setDriverClassName(driverName); + ds.setUrl(config.hiveJDBCUrl); + ds.setUsername(config.hiveUsername); + ds.setPassword(config.hivePassword); + return ds; + } + + public static void main(String[] args) throws IOException { + final Config cfg = new Config(); + JCommander cmd = new JCommander(cfg, args); + if (cfg.help || args.length == 0) { + cmd.usage(); + System.exit(1); } + new HiveIncrementalPuller(cfg).saveDelta(); + } } diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HoodieSnapshotCopier.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HoodieSnapshotCopier.java index 9bb7869bd..0a74e2036 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HoodieSnapshotCopier.java +++ 
b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HoodieSnapshotCopier.java @@ -20,7 +20,6 @@ package com.uber.hoodie.utilities; import com.beust.jcommander.JCommander; import com.beust.jcommander.Parameter; - import com.uber.hoodie.common.model.HoodieDataFile; import com.uber.hoodie.common.model.HoodiePartitionMetadata; import com.uber.hoodie.common.table.HoodieTableConfig; @@ -30,7 +29,12 @@ import com.uber.hoodie.common.table.TableFileSystemView; import com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.common.table.view.HoodieTableFileSystemView; import com.uber.hoodie.common.util.FSUtils; - +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; +import java.util.stream.Stream; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileUtil; @@ -39,140 +43,154 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; - import scala.Tuple2; -import java.io.IOException; -import java.io.Serializable; -import java.util.ArrayList; -import java.util.List; -import java.util.Optional; -import java.util.stream.Stream; - /** - * Hoodie snapshot copy job which copies latest files from all partitions to another place, for snapshot backup. + * Hoodie snapshot copy job which copies latest files from all partitions to another place, for + * snapshot backup. 
*/ public class HoodieSnapshotCopier implements Serializable { - private static Logger logger = LogManager.getLogger(HoodieSnapshotCopier.class); - static class Config implements Serializable { - @Parameter(names = {"--base-path", "-bp"}, description = "Hoodie table base path", required = true) - String basePath = null; + private static Logger logger = LogManager.getLogger(HoodieSnapshotCopier.class); - @Parameter(names = {"--output-path", "-op"}, description = "The snapshot output path", required = true) - String outputPath = null; + static class Config implements Serializable { - @Parameter(names = {"--date-partitioned", "-dp"}, description = "Can we assume date partitioning?") - boolean shouldAssumeDatePartitioning = false; + @Parameter(names = {"--base-path", + "-bp"}, description = "Hoodie table base path", required = true) + String basePath = null; + + @Parameter(names = {"--output-path", + "-op"}, description = "The snapshot output path", required = true) + String outputPath = null; + + @Parameter(names = {"--date-partitioned", + "-dp"}, description = "Can we assume date partitioning?") + boolean shouldAssumeDatePartitioning = false; + } + + public void snapshot(JavaSparkContext jsc, String baseDir, final String outputDir, + final boolean shouldAssumeDatePartitioning) throws IOException { + FileSystem fs = FSUtils.getFs(); + final HoodieTableMetaClient tableMetadata = new HoodieTableMetaClient(fs, baseDir); + final TableFileSystemView.ReadOptimizedView fsView = new HoodieTableFileSystemView( + tableMetadata, + tableMetadata.getActiveTimeline().getCommitsAndCompactionsTimeline() + .filterCompletedInstants()); + // Get the latest commit + Optional latestCommit = tableMetadata.getActiveTimeline() + .getCommitsAndCompactionsTimeline().filterCompletedInstants().lastInstant(); + if (!latestCommit.isPresent()) { + logger.warn("No commits present. 
Nothing to snapshot"); + return; } + final String latestCommitTimestamp = latestCommit.get().getTimestamp(); + logger.info(String + .format("Starting to snapshot latest version files which are also no-late-than %s.", + latestCommitTimestamp)); - public void snapshot(JavaSparkContext jsc, String baseDir, final String outputDir, final boolean shouldAssumeDatePartitioning) throws IOException { - FileSystem fs = FSUtils.getFs(); - final HoodieTableMetaClient tableMetadata = new HoodieTableMetaClient(fs, baseDir); - final TableFileSystemView.ReadOptimizedView fsView = new HoodieTableFileSystemView(tableMetadata, - tableMetadata.getActiveTimeline().getCommitsAndCompactionsTimeline().filterCompletedInstants()); - // Get the latest commit - Optional latestCommit = tableMetadata.getActiveTimeline() - .getCommitsAndCompactionsTimeline().filterCompletedInstants().lastInstant(); - if(!latestCommit.isPresent()) { - logger.warn("No commits present. Nothing to snapshot"); - return; - } - final String latestCommitTimestamp = latestCommit.get().getTimestamp(); - logger.info(String.format("Starting to snapshot latest version files which are also no-late-than %s.", latestCommitTimestamp)); + List partitions = FSUtils + .getAllPartitionPaths(fs, baseDir, shouldAssumeDatePartitioning); + if (partitions.size() > 0) { + logger.info(String.format("The job needs to copy %d partitions.", partitions.size())); - List partitions = FSUtils.getAllPartitionPaths(fs, baseDir, shouldAssumeDatePartitioning); - if (partitions.size() > 0) { - logger.info(String.format("The job needs to copy %d partitions.", partitions.size())); + // Make sure the output directory is empty + Path outputPath = new Path(outputDir); + if (fs.exists(outputPath)) { + logger.warn( + String.format("The output path %targetBasePath already exists, deleting", outputPath)); + fs.delete(new Path(outputDir), true); + } - // Make sure the output directory is empty - Path outputPath = new Path(outputDir); - if (fs.exists(outputPath)) 
{ - logger.warn(String.format("The output path %targetBasePath already exists, deleting", outputPath)); - fs.delete(new Path(outputDir), true); + jsc.parallelize(partitions, partitions.size()) + .flatMap(partition -> { + // Only take latest version files <= latestCommit. + FileSystem fs1 = FSUtils.getFs(); + List> filePaths = new ArrayList<>(); + Stream dataFiles = fsView + .getLatestDataFilesBeforeOrOn(partition, latestCommitTimestamp); + dataFiles.forEach( + hoodieDataFile -> filePaths.add(new Tuple2<>(partition, hoodieDataFile.getPath()))); + + // also need to copy over partition metadata + Path partitionMetaFile = new Path(new Path(baseDir, partition), + HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE); + if (fs1.exists(partitionMetaFile)) { + filePaths.add(new Tuple2<>(partition, partitionMetaFile.toString())); } - jsc.parallelize(partitions, partitions.size()) - .flatMap(partition -> { - // Only take latest version files <= latestCommit. - FileSystem fs1 = FSUtils.getFs(); - List> filePaths = new ArrayList<>(); - Stream dataFiles = fsView.getLatestDataFilesBeforeOrOn(partition, latestCommitTimestamp); - dataFiles.forEach(hoodieDataFile -> filePaths.add(new Tuple2<>(partition, hoodieDataFile.getPath()))); + return filePaths.iterator(); + }).foreach(tuple -> { + String partition = tuple._1(); + Path sourceFilePath = new Path(tuple._2()); + Path toPartitionPath = new Path(outputDir, partition); + FileSystem fs1 = FSUtils.getFs(); - // also need to copy over partition metadata - Path partitionMetaFile = new Path(new Path(baseDir, partition), HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE); - if (fs1.exists(partitionMetaFile)) { - filePaths.add(new Tuple2<>(partition, partitionMetaFile.toString())); - } + if (!fs1.exists(toPartitionPath)) { + fs1.mkdirs(toPartitionPath); + } + FileUtil.copy(fs1, sourceFilePath, fs1, + new Path(toPartitionPath, sourceFilePath.getName()), false, fs1.getConf()); + }); - return filePaths.iterator(); - }).foreach(tuple -> { - 
String partition = tuple._1(); - Path sourceFilePath = new Path(tuple._2()); - Path toPartitionPath = new Path(outputDir, partition); - FileSystem fs1 = FSUtils.getFs(); - - if (!fs1.exists(toPartitionPath)) { - fs1.mkdirs(toPartitionPath); - } - FileUtil.copy(fs1, sourceFilePath, fs1, - new Path(toPartitionPath, sourceFilePath.getName()), false, fs1.getConf()); - }); - - // Also copy the .commit files - logger.info(String.format("Copying .commit files which are no-late-than %s.", latestCommitTimestamp)); - FileStatus[] commitFilesToCopy = fs.listStatus( - new Path(baseDir + "/" + HoodieTableMetaClient.METAFOLDER_NAME), (commitFilePath) -> { - if (commitFilePath.getName().equals(HoodieTableConfig.HOODIE_PROPERTIES_FILE)) { - return true; - } else { - String commitTime = - FSUtils.getCommitFromCommitFile(commitFilePath.getName()); - return HoodieTimeline.compareTimestamps(commitTime, latestCommitTimestamp, HoodieTimeline.LESSER_OR_EQUAL); - } - }); - for (FileStatus commitStatus : commitFilesToCopy) { - Path targetFilePath = new Path( - outputDir + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitStatus - .getPath().getName()); - if (! 
fs.exists(targetFilePath.getParent())) { - fs.mkdirs(targetFilePath.getParent()); - } - if (fs.exists(targetFilePath)) { - logger.error(String.format("The target output commit file (%targetBasePath) already exists.", targetFilePath)); - } - FileUtil.copy(fs, commitStatus.getPath(), fs, targetFilePath, false, fs.getConf()); + // Also copy the .commit files + logger.info( + String.format("Copying .commit files which are no-late-than %s.", latestCommitTimestamp)); + FileStatus[] commitFilesToCopy = fs.listStatus( + new Path(baseDir + "/" + HoodieTableMetaClient.METAFOLDER_NAME), (commitFilePath) -> { + if (commitFilePath.getName().equals(HoodieTableConfig.HOODIE_PROPERTIES_FILE)) { + return true; + } else { + String commitTime = + FSUtils.getCommitFromCommitFile(commitFilePath.getName()); + return HoodieTimeline.compareTimestamps(commitTime, latestCommitTimestamp, + HoodieTimeline.LESSER_OR_EQUAL); } - } else { - logger.info("The job has 0 partition to copy."); + }); + for (FileStatus commitStatus : commitFilesToCopy) { + Path targetFilePath = new Path( + outputDir + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitStatus + .getPath().getName()); + if (!fs.exists(targetFilePath.getParent())) { + fs.mkdirs(targetFilePath.getParent()); } - - // Create the _SUCCESS tag - Path successTagPath = new Path(outputDir + "/_SUCCESS"); - if (!fs.exists(successTagPath)) { - logger.info("Creating _SUCCESS under targetBasePath: " + outputDir); - fs.createNewFile(successTagPath); + if (fs.exists(targetFilePath)) { + logger.error(String + .format("The target output commit file (%targetBasePath) already exists.", + targetFilePath)); } + FileUtil.copy(fs, commitStatus.getPath(), fs, targetFilePath, false, fs.getConf()); + } + } else { + logger.info("The job has 0 partition to copy."); } - public static void main(String[] args) throws IOException { - // Take input configs - final Config cfg = new Config(); - new JCommander(cfg, args); - logger.info(String.format("Snapshot hoodie 
table from %targetBasePath to %targetBasePath", cfg.basePath, cfg.outputPath)); - - // Create a spark job to do the snapshot copy - SparkConf sparkConf = new SparkConf().setAppName("Hoodie-snapshot-copier"); - sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - JavaSparkContext jsc = new JavaSparkContext(sparkConf); - logger.info("Initializing spark job."); - - // Copy - HoodieSnapshotCopier copier = new HoodieSnapshotCopier(); - copier.snapshot(jsc, cfg.basePath, cfg.outputPath, cfg.shouldAssumeDatePartitioning); - - // Stop the job - jsc.stop(); + // Create the _SUCCESS tag + Path successTagPath = new Path(outputDir + "/_SUCCESS"); + if (!fs.exists(successTagPath)) { + logger.info("Creating _SUCCESS under targetBasePath: " + outputDir); + fs.createNewFile(successTagPath); } + } + + public static void main(String[] args) throws IOException { + // Take input configs + final Config cfg = new Config(); + new JCommander(cfg, args); + logger.info(String + .format("Snapshot hoodie table from %targetBasePath to %targetBasePath", cfg.basePath, + cfg.outputPath)); + + // Create a spark job to do the snapshot copy + SparkConf sparkConf = new SparkConf().setAppName("Hoodie-snapshot-copier"); + sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + JavaSparkContext jsc = new JavaSparkContext(sparkConf); + logger.info("Initializing spark job."); + + // Copy + HoodieSnapshotCopier copier = new HoodieSnapshotCopier(); + copier.snapshot(jsc, cfg.basePath, cfg.outputPath, cfg.shouldAssumeDatePartitioning); + + // Stop the job + jsc.stop(); + } } diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/UtilHelpers.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/UtilHelpers.java index 69ad2e7e7..502e36e83 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/UtilHelpers.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/UtilHelpers.java @@ -18,16 +18,12 @@ package 
com.uber.hoodie.utilities; -import com.uber.hoodie.common.model.HoodieRecordPayload; import com.uber.hoodie.exception.HoodieIOException; -import com.uber.hoodie.exception.HoodieNotSupportedException; -import com.uber.hoodie.KeyGenerator; +import com.uber.hoodie.utilities.exception.HoodieDeltaStreamerException; import com.uber.hoodie.utilities.schema.SchemaProvider; import com.uber.hoodie.utilities.sources.Source; -import com.uber.hoodie.utilities.exception.HoodieDeltaStreamerException; import com.uber.hoodie.utilities.sources.SourceDataFormat; - -import org.apache.avro.generic.GenericRecord; +import java.io.IOException; import org.apache.commons.configuration.ConfigurationException; import org.apache.commons.configuration.PropertiesConfiguration; import org.apache.commons.lang3.reflect.ConstructorUtils; @@ -36,50 +32,49 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.spark.api.java.JavaSparkContext; -import java.io.IOException; -import java.util.List; - /** * Bunch of helper methods */ public class UtilHelpers { - public static Source createSource(String sourceClass, PropertiesConfiguration cfg, JavaSparkContext jssc, SourceDataFormat dataFormat, SchemaProvider schemaProvider) throws IOException { - try { - return (Source) ConstructorUtils.invokeConstructor(Class.forName(sourceClass), (Object) cfg, (Object) jssc, (Object) dataFormat, (Object) schemaProvider); - } catch (Throwable e) { - throw new IOException("Could not load source class " + sourceClass, e); - } + public static Source createSource(String sourceClass, PropertiesConfiguration cfg, + JavaSparkContext jssc, SourceDataFormat dataFormat, SchemaProvider schemaProvider) + throws IOException { + try { + return (Source) ConstructorUtils + .invokeConstructor(Class.forName(sourceClass), (Object) cfg, (Object) jssc, + (Object) dataFormat, (Object) schemaProvider); + } catch (Throwable e) { + throw new IOException("Could not load source class " + sourceClass, e); } + 
} - public static SchemaProvider createSchemaProvider(String schemaProviderClass, PropertiesConfiguration cfg) throws IOException { - try { - return (SchemaProvider) ConstructorUtils.invokeConstructor(Class.forName(schemaProviderClass), (Object) cfg); - } catch (Throwable e) { - throw new IOException("Could not load schema provider class " + schemaProviderClass, e); - } + public static SchemaProvider createSchemaProvider(String schemaProviderClass, + PropertiesConfiguration cfg) throws IOException { + try { + return (SchemaProvider) ConstructorUtils + .invokeConstructor(Class.forName(schemaProviderClass), (Object) cfg); + } catch (Throwable e) { + throw new IOException("Could not load schema provider class " + schemaProviderClass, e); } + } - /** - * - * TODO: Support hierarchical config files (see CONFIGURATION-609 for sample) - * - * @param fs - * @param cfgPath - * @return - */ - public static PropertiesConfiguration readConfig(FileSystem fs, Path cfgPath) { - try { - FSDataInputStream in = fs.open(cfgPath); - PropertiesConfiguration config = new PropertiesConfiguration(); - config.load(in); - in.close(); - return config; - } catch (IOException e) { - throw new HoodieIOException("Unable to read config file at :" + cfgPath, e); - } catch (ConfigurationException e) { - throw new HoodieDeltaStreamerException("Invalid configs found in config file at :" + cfgPath, e); - } + /** + * TODO: Support hierarchical config files (see CONFIGURATION-609 for sample) + */ + public static PropertiesConfiguration readConfig(FileSystem fs, Path cfgPath) { + try { + FSDataInputStream in = fs.open(cfgPath); + PropertiesConfiguration config = new PropertiesConfiguration(); + config.load(in); + in.close(); + return config; + } catch (IOException e) { + throw new HoodieIOException("Unable to read config file at :" + cfgPath, e); + } catch (ConfigurationException e) { + throw new HoodieDeltaStreamerException("Invalid configs found in config file at :" + cfgPath, + e); } + } } diff --git 
a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/deltastreamer/HoodieDeltaStreamer.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/deltastreamer/HoodieDeltaStreamer.java index 02fc0d7ce..ad8ccd2ed 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/deltastreamer/HoodieDeltaStreamer.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/deltastreamer/HoodieDeltaStreamer.java @@ -47,6 +47,13 @@ import com.uber.hoodie.utilities.schema.SchemaProvider; import com.uber.hoodie.utilities.sources.DFSSource; import com.uber.hoodie.utilities.sources.Source; import com.uber.hoodie.utilities.sources.SourceDataFormat; +import java.io.IOException; +import java.io.Serializable; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Optional; +import java.util.Properties; import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.commons.configuration.PropertiesConfiguration; @@ -60,299 +67,326 @@ import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import scala.collection.JavaConversions; -import java.io.IOException; -import java.io.Serializable; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Optional; -import java.util.Properties; - /** - * An Utility which can incrementally take the output from {@link HiveIncrementalPuller} and apply it to the target dataset. - * Does not maintain any state, queries at runtime to see how far behind the target dataset is from - * the source dataset. This can be overriden to force sync from a timestamp. + * An Utility which can incrementally take the output from {@link HiveIncrementalPuller} and apply + * it to the target dataset. Does not maintain any state, queries at runtime to see how far behind + * the target dataset is from the source dataset. This can be overriden to force sync from a + * timestamp. 
*/ public class HoodieDeltaStreamer implements Serializable { - private static volatile Logger log = LogManager.getLogger(HoodieDeltaStreamer.class); + private static volatile Logger log = LogManager.getLogger(HoodieDeltaStreamer.class); - private static String CHECKPOINT_KEY = "deltastreamer.checkpoint.key"; + private static String CHECKPOINT_KEY = "deltastreamer.checkpoint.key"; - private final Config cfg; + private final Config cfg; - /** - * Source to pull deltas from - */ - private transient Source source; + /** + * Source to pull deltas from + */ + private transient Source source; - /** - * Schema provider that supplies the command for reading the input and writing out the - * target table. - */ - private transient SchemaProvider schemaProvider; + /** + * Schema provider that supplies the command for reading the input and writing out the target + * table. + */ + private transient SchemaProvider schemaProvider; - /** - * Extract the key for the target dataset - */ - private KeyGenerator keyGenerator; + /** + * Extract the key for the target dataset + */ + private KeyGenerator keyGenerator; - /** - * Filesystem used - */ - private transient FileSystem fs; + /** + * Filesystem used + */ + private transient FileSystem fs; - /** - * Timeline with completed commits - */ - private transient Optional commitTimelineOpt; + /** + * Timeline with completed commits + */ + private transient Optional commitTimelineOpt; - /** - * Spark context - */ - private transient JavaSparkContext jssc; + /** + * Spark context + */ + private transient JavaSparkContext jssc; - public HoodieDeltaStreamer(Config cfg) throws IOException { - this.cfg = cfg; - this.fs = FSUtils.getFs(); + public HoodieDeltaStreamer(Config cfg) throws IOException { + this.cfg = cfg; + this.fs = FSUtils.getFs(); + + if (fs.exists(new Path(cfg.targetBasePath))) { + HoodieTableMetaClient meta = new HoodieTableMetaClient(fs, cfg.targetBasePath); + this.commitTimelineOpt = Optional + 
.of(meta.getActiveTimeline().getCommitsAndCompactionsTimeline() + .filterCompletedInstants()); + } else { + this.commitTimelineOpt = Optional.empty(); + } + + //TODO(vc) Should these be passed from outside? + initSchemaProvider(); + initKeyGenerator(); + this.jssc = getSparkContext(); + + initSource(); + } + + private void initSource() throws IOException { + // Create the source & schema providers + PropertiesConfiguration sourceCfg = UtilHelpers.readConfig(fs, new Path(cfg.sourceConfigProps)); + log.info("Creating source " + cfg.sourceClassName + " with configs : " + sourceCfg.toString()); + this.source = UtilHelpers + .createSource(cfg.sourceClassName, sourceCfg, jssc, cfg.sourceFormat, schemaProvider); + } + + private void initSchemaProvider() throws IOException { + PropertiesConfiguration schemaCfg = UtilHelpers + .readConfig(fs, new Path(cfg.schemaProviderConfigProps)); + log.info( + "Creating schema provider " + cfg.schemaProviderClassName + " with configs : " + schemaCfg + .toString()); + this.schemaProvider = UtilHelpers.createSchemaProvider(cfg.schemaProviderClassName, schemaCfg); + } + + private void initKeyGenerator() throws IOException { + PropertiesConfiguration keygenCfg = UtilHelpers.readConfig(fs, new Path(cfg.keyGeneratorProps)); + log.info("Creating key generator " + cfg.keyGeneratorClass + " with configs : " + keygenCfg + .toString()); + this.keyGenerator = DataSourceUtils.createKeyGenerator(cfg.keyGeneratorClass, keygenCfg); + } - if (fs.exists(new Path(cfg.targetBasePath))) { - HoodieTableMetaClient meta = new HoodieTableMetaClient(fs, cfg.targetBasePath); - this.commitTimelineOpt = Optional.of(meta.getActiveTimeline().getCommitsAndCompactionsTimeline().filterCompletedInstants()); + private JavaSparkContext getSparkContext() { + SparkConf sparkConf = new SparkConf() + .setAppName("hoodie-delta-streamer-" + cfg.targetTableName); + //sparkConf.setMaster(cfg.sparkMaster); + sparkConf.setMaster("local[2]"); + sparkConf.set("spark.serializer", 
"org.apache.spark.serializer.KryoSerializer"); + sparkConf.set("spark.driver.maxResultSize", "2g"); + + // Configure hadoop conf + sparkConf.set("spark.hadoop.mapred.output.compress", "true"); + sparkConf.set("spark.hadoop.mapred.output.compression.codec", "true"); + sparkConf.set("spark.hadoop.mapred.output.compression.codec", + "org.apache.hadoop.io.compress.GzipCodec"); + sparkConf.set("spark.hadoop.mapred.output.compression.type", "BLOCK"); + + sparkConf = HoodieWriteClient.registerClasses(sparkConf); + // register the schemas, so that shuffle does not serialize the full schemas + List schemas = Arrays + .asList(schemaProvider.getSourceSchema(), schemaProvider.getTargetSchema()); + sparkConf.registerAvroSchemas(JavaConversions.asScalaBuffer(schemas).toList()); + return new JavaSparkContext(sparkConf); + } + + private void sync() throws Exception { + // Retrieve the previous round checkpoints, if any + Optional resumeCheckpointStr = Optional.empty(); + if (commitTimelineOpt.isPresent()) { + Optional lastCommit = commitTimelineOpt.get().lastInstant(); + if (lastCommit.isPresent()) { + HoodieCommitMetadata commitMetadata = + HoodieCommitMetadata + .fromBytes(commitTimelineOpt.get().getInstantDetails(lastCommit.get()).get()); + if (commitMetadata.getMetadata(CHECKPOINT_KEY) != null) { + resumeCheckpointStr = Optional.of(commitMetadata.getMetadata(CHECKPOINT_KEY)); } else { - this.commitTimelineOpt = Optional.empty(); + throw new HoodieDeltaStreamerException( + "Unable to find previous checkpoint. Please double check if this table " + + "was indeed built via delta streamer "); } + } + } else { + Properties properties = new Properties(); + properties.put(HoodieWriteConfig.TABLE_NAME, cfg.targetTableName); + HoodieTableMetaClient + .initializePathAsHoodieDataset(FSUtils.getFs(), cfg.targetBasePath, properties); + } + log.info("Checkpoint to resume from : " + resumeCheckpointStr); - //TODO(vc) Should these be passed from outside? 
- initSchemaProvider(); - initKeyGenerator(); - this.jssc = getSparkContext(); + // Pull the data from the source & prepare the write + Pair>, String> dataAndCheckpoint = source + .fetchNewData(resumeCheckpointStr, cfg.maxInputBytes); - initSource(); + if (!dataAndCheckpoint.getKey().isPresent()) { + log.info("No new data, nothing to commit.. "); + return; } - private void initSource() throws IOException { - // Create the source & schema providers - PropertiesConfiguration sourceCfg = UtilHelpers.readConfig(fs, new Path(cfg.sourceConfigProps)); - log.info("Creating source " + cfg.sourceClassName + " with configs : " + sourceCfg.toString()); - this.source = UtilHelpers.createSource(cfg.sourceClassName, sourceCfg, jssc, cfg.sourceFormat, schemaProvider); + JavaRDD avroRDD = dataAndCheckpoint.getKey().get(); + JavaRDD records = avroRDD + .map(gr -> { + HoodieRecordPayload payload = DataSourceUtils.createPayload( + cfg.payloadClassName, + gr, + (Comparable) gr.get(cfg.sourceOrderingField)); + return new HoodieRecord<>(keyGenerator.getKey(gr), payload); + }); + + // Perform the write + HoodieWriteConfig hoodieCfg = getHoodieClientConfig(cfg.hoodieClientProps); + HoodieWriteClient client = new HoodieWriteClient<>(jssc, hoodieCfg); + String commitTime = client.startCommit(); + log.info("Starting commit : " + commitTime); + + JavaRDD writeStatusRDD; + if (cfg.operation == Operation.INSERT) { + writeStatusRDD = client.insert(records, commitTime); + } else if (cfg.operation == Operation.UPSERT) { + writeStatusRDD = client.upsert(records, commitTime); + } else { + throw new HoodieDeltaStreamerException("Unknown operation :" + cfg.operation); } - private void initSchemaProvider() throws IOException { - PropertiesConfiguration schemaCfg = UtilHelpers.readConfig(fs, new Path(cfg.schemaProviderConfigProps)); - log.info("Creating schema provider " + cfg.schemaProviderClassName + " with configs : " + schemaCfg.toString()); - this.schemaProvider = 
UtilHelpers.createSchemaProvider(cfg.schemaProviderClassName, schemaCfg); + // Simply commit for now. TODO(vc): Support better error handlers later on + HashMap checkpointCommitMetadata = new HashMap<>(); + checkpointCommitMetadata.put(CHECKPOINT_KEY, dataAndCheckpoint.getValue()); + + boolean success = client + .commit(commitTime, writeStatusRDD, Optional.of(checkpointCommitMetadata)); + if (success) { + log.info("Commit " + commitTime + " successful!"); + // TODO(vc): Kick off hive sync from here. + + } else { + log.info("Commit " + commitTime + " failed!"); } + client.close(); + } - private void initKeyGenerator() throws IOException { - PropertiesConfiguration keygenCfg = UtilHelpers.readConfig(fs, new Path(cfg.keyGeneratorProps)); - log.info("Creating key generator " + cfg.keyGeneratorClass + " with configs : " + keygenCfg.toString()); - this.keyGenerator = DataSourceUtils.createKeyGenerator(cfg.keyGeneratorClass, keygenCfg); + private HoodieWriteConfig getHoodieClientConfig(String hoodieClientCfgPath) throws Exception { + return HoodieWriteConfig.newBuilder() + .combineInput(true, true) + .withPath(cfg.targetBasePath) + .withAutoCommit(false) + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withPayloadClass(OverwriteWithLatestAvroPayload.class.getName()).build()) + .withSchema(schemaProvider.getTargetSchema().toString()) + .forTable(cfg.targetTableName) + .withIndexConfig( + HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) + .fromInputStream(fs.open(new Path(hoodieClientCfgPath))) + .build(); + } + + private enum Operation { + UPSERT, + INSERT + } + + private class OperationConvertor implements IStringConverter { + + @Override + public Operation convert(String value) throws ParameterException { + return Operation.valueOf(value); } + } + private class SourceFormatConvertor implements IStringConverter { - private JavaSparkContext getSparkContext() { - SparkConf sparkConf = new 
SparkConf().setAppName("hoodie-delta-streamer-" + cfg.targetTableName); - //sparkConf.setMaster(cfg.sparkMaster); - sparkConf.setMaster("local[2]"); - sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - sparkConf.set("spark.driver.maxResultSize", "2g"); - - // Configure hadoop conf - sparkConf.set("spark.hadoop.mapred.output.compress", "true"); - sparkConf.set("spark.hadoop.mapred.output.compression.codec", "true"); - sparkConf.set("spark.hadoop.mapred.output.compression.codec", - "org.apache.hadoop.io.compress.GzipCodec"); - sparkConf.set("spark.hadoop.mapred.output.compression.type", "BLOCK"); - - sparkConf = HoodieWriteClient.registerClasses(sparkConf); - // register the schemas, so that shuffle does not serialize the full schemas - List schemas = Arrays.asList(schemaProvider.getSourceSchema(), schemaProvider.getTargetSchema()); - sparkConf.registerAvroSchemas(JavaConversions.asScalaBuffer(schemas).toList()); - return new JavaSparkContext(sparkConf); + @Override + public SourceDataFormat convert(String value) throws ParameterException { + return SourceDataFormat.valueOf(value); } + } - private void sync() throws Exception { - // Retrieve the previous round checkpoints, if any - Optional resumeCheckpointStr = Optional.empty(); - if (commitTimelineOpt.isPresent()) { - Optional lastCommit = commitTimelineOpt.get().lastInstant(); - if (lastCommit.isPresent()) { - HoodieCommitMetadata commitMetadata = - HoodieCommitMetadata.fromBytes(commitTimelineOpt.get().getInstantDetails(lastCommit.get()).get()); - if (commitMetadata.getMetadata(CHECKPOINT_KEY) != null) { - resumeCheckpointStr = Optional.of(commitMetadata.getMetadata(CHECKPOINT_KEY)); - } else { - throw new HoodieDeltaStreamerException("Unable to find previous checkpoint. 
Please double check if this table " + - "was indeed built via delta streamer "); - } - } - } else { - Properties properties = new Properties(); - properties.put(HoodieWriteConfig.TABLE_NAME, cfg.targetTableName); - HoodieTableMetaClient.initializePathAsHoodieDataset(FSUtils.getFs(), cfg.targetBasePath, properties); - } - log.info("Checkpoint to resume from : " + resumeCheckpointStr); + public static class Config implements Serializable { + + /** + * TARGET CONFIGS + **/ + @Parameter(names = { + "--target-base-path"}, description = "base path for the target hoodie dataset", required = true) + public String targetBasePath; + + // TODO: How to obtain hive configs to register? + @Parameter(names = { + "--target-table"}, description = "name of the target table in Hive", required = true) + public String targetTableName; + + @Parameter(names = {"--hoodie-client-config"}, description = + "path to properties file on localfs or dfs, with hoodie client config. Sane defaults" + + "are used, but recommend use to provide basic things like metrics endpoints, hive configs etc") + public String hoodieClientProps = null; + + /** + * SOURCE CONFIGS + **/ + @Parameter(names = {"--source-class"}, description = + "subclass of com.uber.hoodie.utilities.sources.Source to use to read data. " + + "built-in options: com.uber.hoodie.utilities.common.{DFSSource (default), KafkaSource, HiveIncrPullSource}") + public String sourceClassName = DFSSource.class.getName(); + + @Parameter(names = {"--source-config"}, description = + "path to properties file on localfs or dfs, with source configs. " + + "For list of acceptable properties, refer the source class", required = true) + public String sourceConfigProps = null; + + @Parameter(names = {"--source-format"}, description = + "Format of data in source, JSON (default), Avro. 
All source data is " + + "converted to Avro using the provided schema in any case", converter = SourceFormatConvertor.class) + public SourceDataFormat sourceFormat = SourceDataFormat.JSON; + + @Parameter(names = {"--source-ordering-field"}, description = + "Field within source record to decide how to break ties between " + + " records with same key in input data. Default: 'ts' holding unix timestamp of record") + public String sourceOrderingField = "ts"; + + @Parameter(names = {"--key-generator-class"}, description = + "Subclass of com.uber.hoodie.utilities.common.KeyExtractor to generate" + + "a HoodieKey from the given avro record. Built in: SimpleKeyGenerator (Uses provided field names as recordkey & partitionpath. " + + + "Nested fields specified via dot notation, e.g: a.b.c)") + public String keyGeneratorClass = SimpleKeyGenerator.class.getName(); + + @Parameter(names = {"--key-generator-config"}, description = + "Path to properties file on localfs or dfs, with KeyGenerator configs. " + + "For list of acceptable properites, refer the KeyGenerator class", required = true) + public String keyGeneratorProps = null; + + @Parameter(names = {"--payload-class"}, description = + "subclass of HoodieRecordPayload, that works off a GenericRecord. " + + "Default: SourceWrapperPayload. Implement your own, if you want to do something other than overwriting existing value") + public String payloadClassName = OverwriteWithLatestAvroPayload.class.getName(); + + @Parameter(names = {"--schemaprovider-class"}, description = + "subclass of com.uber.hoodie.utilities.schema.SchemaProvider " + + "to attach schemas to input & target table data, built in options: FilebasedSchemaProvider") + public String schemaProviderClassName = FilebasedSchemaProvider.class.getName(); + + @Parameter(names = {"--schemaprovider-config"}, description = + "path to properties file on localfs or dfs, with schema configs. 
" + + "For list of acceptable properties, refer the schema provider class", required = true) + public String schemaProviderConfigProps = null; - // Pull the data from the source & prepare the write - Pair>, String> dataAndCheckpoint = source.fetchNewData(resumeCheckpointStr, cfg.maxInputBytes); + /** + * Other configs + **/ + @Parameter(names = { + "--max-input-bytes"}, description = "Maximum number of bytes to read from source. Default: 1TB") + public long maxInputBytes = 1L * 1024 * 1024 * 1024 * 1024; - if (!dataAndCheckpoint.getKey().isPresent()) { - log.info("No new data, nothing to commit.. "); - return; - } - - JavaRDD avroRDD = dataAndCheckpoint.getKey().get(); - JavaRDD records = avroRDD - .map(gr -> { - HoodieRecordPayload payload = DataSourceUtils.createPayload( - cfg.payloadClassName, - gr, - (Comparable) gr.get(cfg.sourceOrderingField)); - return new HoodieRecord<>(keyGenerator.getKey(gr), payload); - }); + @Parameter(names = {"--op"}, description = + "Takes one of these values : UPSERT (default), INSERT (use when input " + + "is purely new data/inserts to gain speed)", converter = OperationConvertor.class) + public Operation operation = Operation.UPSERT; - // Perform the write - HoodieWriteConfig hoodieCfg = getHoodieClientConfig(cfg.hoodieClientProps); - HoodieWriteClient client = new HoodieWriteClient<>(jssc, hoodieCfg); - String commitTime = client.startCommit(); - log.info("Starting commit : " + commitTime); + @Parameter(names = {"--help", "-h"}, help = true) + public Boolean help = false; + } - JavaRDD writeStatusRDD; - if (cfg.operation == Operation.INSERT) { - writeStatusRDD = client.insert(records, commitTime); - } else if (cfg.operation == Operation.UPSERT) { - writeStatusRDD = client.upsert(records, commitTime); - } else { - throw new HoodieDeltaStreamerException("Unknown operation :" + cfg.operation); - } - - // Simply commit for now. 
TODO(vc): Support better error handlers later on - HashMap checkpointCommitMetadata = new HashMap<>(); - checkpointCommitMetadata.put(CHECKPOINT_KEY, dataAndCheckpoint.getValue()); - - boolean success = client.commit(commitTime, writeStatusRDD, Optional.of(checkpointCommitMetadata)); - if (success) { - log.info("Commit " + commitTime + " successful!"); - // TODO(vc): Kick off hive sync from here. - - } else { - log.info("Commit " + commitTime + " failed!"); - } - client.close(); - } - - private HoodieWriteConfig getHoodieClientConfig(String hoodieClientCfgPath) throws Exception { - return HoodieWriteConfig.newBuilder() - .combineInput(true, true) - .withPath(cfg.targetBasePath) - .withAutoCommit(false) - .withCompactionConfig(HoodieCompactionConfig.newBuilder() - .withPayloadClass(OverwriteWithLatestAvroPayload.class.getName()).build()) - .withSchema(schemaProvider.getTargetSchema().toString()) - .forTable(cfg.targetTableName) - .withIndexConfig( - HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) - .fromInputStream(fs.open(new Path(hoodieClientCfgPath))) - .build(); - } - - private enum Operation { - UPSERT, - INSERT - } - - private class OperationConvertor implements IStringConverter { - @Override - public Operation convert(String value) throws ParameterException { - return Operation.valueOf(value); - } - } - - private class SourceFormatConvertor implements IStringConverter { - @Override - public SourceDataFormat convert(String value) throws ParameterException { - return SourceDataFormat.valueOf(value); - } - } - - public static class Config implements Serializable { - - /** TARGET CONFIGS **/ - @Parameter(names = {"--target-base-path"}, description = "base path for the target hoodie dataset", required = true) - public String targetBasePath; - - // TODO: How to obtain hive configs to register? 
- @Parameter(names = {"--target-table"}, description = "name of the target table in Hive", required = true) - public String targetTableName; - - @Parameter(names = {"--hoodie-client-config"}, description = "path to properties file on localfs or dfs, with hoodie client config. Sane defaults" + - "are used, but recommend use to provide basic things like metrics endpoints, hive configs etc") - public String hoodieClientProps = null; - - /** SOURCE CONFIGS **/ - @Parameter(names = {"--source-class"}, description = "subclass of com.uber.hoodie.utilities.sources.Source to use to read data. " + - "built-in options: com.uber.hoodie.utilities.common.{DFSSource (default), KafkaSource, HiveIncrPullSource}") - public String sourceClassName = DFSSource.class.getName(); - - @Parameter(names = {"--source-config"}, description = "path to properties file on localfs or dfs, with source configs. " + - "For list of acceptable properties, refer the source class", required = true) - public String sourceConfigProps = null; - - @Parameter(names = {"--source-format"}, description = "Format of data in source, JSON (default), Avro. All source data is " + - "converted to Avro using the provided schema in any case", converter = SourceFormatConvertor.class) - public SourceDataFormat sourceFormat = SourceDataFormat.JSON; - - @Parameter(names = {"--source-ordering-field"}, description = "Field within source record to decide how to break ties between " + - " records with same key in input data. Default: 'ts' holding unix timestamp of record") - public String sourceOrderingField = "ts"; - - @Parameter(names = {"--key-generator-class"}, description = "Subclass of com.uber.hoodie.utilities.common.KeyExtractor to generate" + - "a HoodieKey from the given avro record. Built in: SimpleKeyGenerator (Uses provided field names as recordkey & partitionpath. 
" + - "Nested fields specified via dot notation, e.g: a.b.c)") - public String keyGeneratorClass = SimpleKeyGenerator.class.getName(); - - @Parameter(names = {"--key-generator-config"}, description = "Path to properties file on localfs or dfs, with KeyGenerator configs. " + - "For list of acceptable properites, refer the KeyGenerator class", required = true) - public String keyGeneratorProps = null; - - @Parameter(names = {"--payload-class"}, description = "subclass of HoodieRecordPayload, that works off a GenericRecord. " + - "Default: SourceWrapperPayload. Implement your own, if you want to do something other than overwriting existing value") - public String payloadClassName = OverwriteWithLatestAvroPayload.class.getName(); - - @Parameter(names = {"--schemaprovider-class"}, description = "subclass of com.uber.hoodie.utilities.schema.SchemaProvider " + - "to attach schemas to input & target table data, built in options: FilebasedSchemaProvider") - public String schemaProviderClassName = FilebasedSchemaProvider.class.getName(); - - @Parameter(names = {"--schemaprovider-config"}, description = "path to properties file on localfs or dfs, with schema configs. " + - "For list of acceptable properties, refer the schema provider class", required = true) - public String schemaProviderConfigProps = null; - - - /** Other configs **/ - @Parameter(names = {"--max-input-bytes"}, description = "Maximum number of bytes to read from source. 
Default: 1TB") - public long maxInputBytes = 1L * 1024 * 1024 * 1024 * 1024; - - @Parameter(names = {"--op"}, description = "Takes one of these values : UPSERT (default), INSERT (use when input " + - "is purely new data/inserts to gain speed)", converter = OperationConvertor.class) - public Operation operation = Operation.UPSERT; - - - @Parameter(names = {"--help", "-h"}, help = true) - public Boolean help = false; - } - - public static void main(String[] args) throws Exception { - final Config cfg = new Config(); - JCommander cmd = new JCommander(cfg, args); - if (cfg.help || args.length == 0) { - cmd.usage(); - System.exit(1); - } - new HoodieDeltaStreamer(cfg).sync(); + public static void main(String[] args) throws Exception { + final Config cfg = new Config(); + JCommander cmd = new JCommander(cfg, args); + if (cfg.help || args.length == 0) { + cmd.usage(); + System.exit(1); } + new HoodieDeltaStreamer(cfg).sync(); + } } diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/exception/HoodieDeltaStreamerException.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/exception/HoodieDeltaStreamerException.java index c99197b8a..40031aa09 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/exception/HoodieDeltaStreamerException.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/exception/HoodieDeltaStreamerException.java @@ -21,11 +21,12 @@ package com.uber.hoodie.utilities.exception; import com.uber.hoodie.exception.HoodieException; public class HoodieDeltaStreamerException extends HoodieException { - public HoodieDeltaStreamerException(String msg, Throwable e) { - super(msg, e); - } - public HoodieDeltaStreamerException(String msg) { - super(msg); - } + public HoodieDeltaStreamerException(String msg, Throwable e) { + super(msg, e); + } + + public HoodieDeltaStreamerException(String msg) { + super(msg); + } } diff --git 
a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/exception/HoodieIncrementalPullException.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/exception/HoodieIncrementalPullException.java index a939d8cc6..79092bacf 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/exception/HoodieIncrementalPullException.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/exception/HoodieIncrementalPullException.java @@ -19,15 +19,15 @@ package com.uber.hoodie.utilities.exception; import com.uber.hoodie.exception.HoodieException; - import java.sql.SQLException; public class HoodieIncrementalPullException extends HoodieException { - public HoodieIncrementalPullException(String msg, SQLException e) { - super(msg, e); - } - public HoodieIncrementalPullException(String msg) { - super(msg); - } + public HoodieIncrementalPullException(String msg, SQLException e) { + super(msg, e); + } + + public HoodieIncrementalPullException(String msg) { + super(msg); + } } diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/exception/HoodieIncrementalPullSQLException.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/exception/HoodieIncrementalPullSQLException.java index 3089631b7..008d4d0d8 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/exception/HoodieIncrementalPullSQLException.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/exception/HoodieIncrementalPullSQLException.java @@ -21,11 +21,12 @@ package com.uber.hoodie.utilities.exception; import java.sql.SQLException; public class HoodieIncrementalPullSQLException extends HoodieIncrementalPullException { - public HoodieIncrementalPullSQLException(String msg, SQLException e) { - super(msg, e); - } - public HoodieIncrementalPullSQLException(String msg) { - super(msg); - } + public HoodieIncrementalPullSQLException(String msg, SQLException e) { + super(msg, e); + } + + public HoodieIncrementalPullSQLException(String msg) { 
+ super(msg); + } } diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/keygen/TimestampBasedKeyGenerator.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/keygen/TimestampBasedKeyGenerator.java index 5c67bbd8b..d9da949b5 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/keygen/TimestampBasedKeyGenerator.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/keygen/TimestampBasedKeyGenerator.java @@ -23,83 +23,86 @@ import com.uber.hoodie.SimpleKeyGenerator; import com.uber.hoodie.common.model.HoodieKey; import com.uber.hoodie.exception.HoodieNotSupportedException; import com.uber.hoodie.utilities.exception.HoodieDeltaStreamerException; - -import org.apache.avro.generic.GenericRecord; -import org.apache.commons.configuration.PropertiesConfiguration; - import java.io.Serializable; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Arrays; import java.util.Date; import java.util.TimeZone; +import org.apache.avro.generic.GenericRecord; +import org.apache.commons.configuration.PropertiesConfiguration; /** * Key generator, that relies on timestamps for partitioning field. Still picks record key by name. 
- * */ public class TimestampBasedKeyGenerator extends SimpleKeyGenerator { - enum TimestampType implements Serializable { - UNIX_TIMESTAMP, - DATE_STRING, - MIXED + enum TimestampType implements Serializable { + UNIX_TIMESTAMP, + DATE_STRING, + MIXED + } + + private final TimestampType timestampType; + + private SimpleDateFormat inputDateFormat; + + private final String outputDateFormat; + + + /** + * Supported configs + */ + static class Config { + + // One value from TimestampType above + private static final String TIMESTAMP_TYPE_FIELD_PROP = "hoodie.deltastreamer.keygen.timebased.timestamp.type"; + private static final String TIMESTAMP_INPUT_DATE_FORMAT_PROP = "hoodie.deltastreamer.keygen.timebased.input.dateformat"; + private static final String TIMESTAMP_OUTPUT_DATE_FORMAT_PROP = "hoodie.deltastreamer.keygen.timebased.output.dateformat"; + } + + public TimestampBasedKeyGenerator(PropertiesConfiguration config) { + super(config); + DataSourceUtils.checkRequiredProperties(config, + Arrays.asList(Config.TIMESTAMP_TYPE_FIELD_PROP, Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP)); + this.timestampType = TimestampType.valueOf(config.getString(Config.TIMESTAMP_TYPE_FIELD_PROP)); + this.outputDateFormat = config.getString(Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP); + + if (timestampType == TimestampType.DATE_STRING || timestampType == TimestampType.MIXED) { + DataSourceUtils + .checkRequiredProperties(config, Arrays.asList(Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP)); + this.inputDateFormat = new SimpleDateFormat( + config.getString(Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP)); + this.inputDateFormat.setTimeZone(TimeZone.getTimeZone("GMT")); } + } - private final TimestampType timestampType; + @Override + public HoodieKey getKey(GenericRecord record) { + Object partitionVal = record.get(partitionPathField); + SimpleDateFormat partitionPathFormat = new SimpleDateFormat(outputDateFormat); + partitionPathFormat.setTimeZone(TimeZone.getTimeZone("GMT")); - private SimpleDateFormat 
inputDateFormat; + try { + long unixTime; + if (partitionVal instanceof Double) { + unixTime = ((Double) partitionVal).longValue(); + } else if (partitionVal instanceof Float) { + unixTime = ((Float) partitionVal).longValue(); + } else if (partitionVal instanceof Long) { + unixTime = (Long) partitionVal; + } else if (partitionVal instanceof String) { + unixTime = inputDateFormat.parse(partitionVal.toString()).getTime() / 1000; + } else { + throw new HoodieNotSupportedException( + "Unexpected type for partition field: " + partitionVal.getClass().getName()); + } - private final String outputDateFormat; - - - /** - * Supported configs - */ - static class Config { - // One value from TimestampType above - private static final String TIMESTAMP_TYPE_FIELD_PROP = "hoodie.deltastreamer.keygen.timebased.timestamp.type"; - private static final String TIMESTAMP_INPUT_DATE_FORMAT_PROP = "hoodie.deltastreamer.keygen.timebased.input.dateformat"; - private static final String TIMESTAMP_OUTPUT_DATE_FORMAT_PROP = "hoodie.deltastreamer.keygen.timebased.output.dateformat"; - } - - public TimestampBasedKeyGenerator(PropertiesConfiguration config) { - super(config); - DataSourceUtils.checkRequiredProperties(config, Arrays.asList(Config.TIMESTAMP_TYPE_FIELD_PROP, Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP)); - this.timestampType = TimestampType.valueOf(config.getString(Config.TIMESTAMP_TYPE_FIELD_PROP)); - this.outputDateFormat = config.getString(Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP); - - if (timestampType == TimestampType.DATE_STRING || timestampType == TimestampType.MIXED) { - DataSourceUtils.checkRequiredProperties(config, Arrays.asList(Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP)); - this.inputDateFormat = new SimpleDateFormat(config.getString(Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP)); - this.inputDateFormat.setTimeZone(TimeZone.getTimeZone("GMT")); - } - } - - @Override - public HoodieKey getKey(GenericRecord record) { - Object partitionVal = record.get(partitionPathField); - 
SimpleDateFormat partitionPathFormat = new SimpleDateFormat(outputDateFormat); - partitionPathFormat.setTimeZone(TimeZone.getTimeZone("GMT")); - - try { - long unixTime; - if (partitionVal instanceof Double) { - unixTime = ((Double) partitionVal).longValue(); - } else if (partitionVal instanceof Float) { - unixTime = ((Float) partitionVal).longValue(); - } else if (partitionVal instanceof Long) { - unixTime = (Long) partitionVal; - } else if (partitionVal instanceof String) { - unixTime = inputDateFormat.parse(partitionVal.toString()).getTime() / 1000; - } else { - throw new HoodieNotSupportedException("Unexpected type for partition field: "+ partitionVal.getClass().getName()); - } - - return new HoodieKey(record.get(recordKeyField).toString(), - partitionPathFormat.format(new Date(unixTime * 1000))); - } catch (ParseException pe) { - throw new HoodieDeltaStreamerException("Unable to parse input partition field :" + partitionVal, pe); - } + return new HoodieKey(record.get(recordKeyField).toString(), + partitionPathFormat.format(new Date(unixTime * 1000))); + } catch (ParseException pe) { + throw new HoodieDeltaStreamerException( + "Unable to parse input partition field :" + partitionVal, pe); } + } } diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/schema/FilebasedSchemaProvider.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/schema/FilebasedSchemaProvider.java index 6a77632c3..f6ea67f01 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/schema/FilebasedSchemaProvider.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/schema/FilebasedSchemaProvider.java @@ -21,54 +21,56 @@ package com.uber.hoodie.utilities.schema; import com.uber.hoodie.DataSourceUtils; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.exception.HoodieIOException; - +import java.io.IOException; +import java.util.Arrays; import org.apache.avro.Schema; import 
org.apache.commons.configuration.PropertiesConfiguration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import java.io.IOException; -import java.util.Arrays; - /** * A simple schema provider, that reads off files on DFS */ public class FilebasedSchemaProvider extends SchemaProvider { - /** - * Configs supported - */ - static class Config { - private static final String SOURCE_SCHEMA_FILE_PROP = "hoodie.deltastreamer.filebased.schemaprovider.source.schema.file"; - private static final String TARGET_SCHEMA_FILE_PROP = "hoodie.deltastreamer.filebased.schemaprovider.target.schema.file"; + /** + * Configs supported + */ + static class Config { + + private static final String SOURCE_SCHEMA_FILE_PROP = "hoodie.deltastreamer.filebased.schemaprovider.source.schema.file"; + private static final String TARGET_SCHEMA_FILE_PROP = "hoodie.deltastreamer.filebased.schemaprovider.target.schema.file"; + } + + private final FileSystem fs; + + private final Schema sourceSchema; + + private final Schema targetSchema; + + public FilebasedSchemaProvider(PropertiesConfiguration config) { + super(config); + this.fs = FSUtils.getFs(); + + DataSourceUtils.checkRequiredProperties(config, + Arrays.asList(Config.SOURCE_SCHEMA_FILE_PROP, Config.TARGET_SCHEMA_FILE_PROP)); + try { + this.sourceSchema = new Schema.Parser() + .parse(fs.open(new Path(config.getString(Config.SOURCE_SCHEMA_FILE_PROP)))); + this.targetSchema = new Schema.Parser() + .parse(fs.open(new Path(config.getString(Config.TARGET_SCHEMA_FILE_PROP)))); + } catch (IOException ioe) { + throw new HoodieIOException("Error reading schema", ioe); } + } - private final FileSystem fs; + @Override + public Schema getSourceSchema() { + return sourceSchema; + } - private final Schema sourceSchema; - - private final Schema targetSchema; - - public FilebasedSchemaProvider(PropertiesConfiguration config) { - super(config); - this.fs = FSUtils.getFs(); - - DataSourceUtils.checkRequiredProperties(config, 
Arrays.asList(Config.SOURCE_SCHEMA_FILE_PROP, Config.TARGET_SCHEMA_FILE_PROP)); - try { - this.sourceSchema = new Schema.Parser().parse(fs.open(new Path(config.getString(Config.SOURCE_SCHEMA_FILE_PROP)))); - this.targetSchema = new Schema.Parser().parse(fs.open(new Path(config.getString(Config.TARGET_SCHEMA_FILE_PROP)))); - } catch (IOException ioe) { - throw new HoodieIOException("Error reading schema", ioe); - } - } - - @Override - public Schema getSourceSchema() { - return sourceSchema; - } - - @Override - public Schema getTargetSchema() { - return targetSchema; - } + @Override + public Schema getTargetSchema() { + return targetSchema; + } } diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/schema/SchemaProvider.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/schema/SchemaProvider.java index b3f385bf9..3a192581c 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/schema/SchemaProvider.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/schema/SchemaProvider.java @@ -18,22 +18,22 @@ package com.uber.hoodie.utilities.schema; +import java.io.Serializable; import org.apache.avro.Schema; import org.apache.commons.configuration.PropertiesConfiguration; -import java.io.Serializable; /** * Class to provide schema for reading data and also writing into a Hoodie table */ public abstract class SchemaProvider implements Serializable { - protected PropertiesConfiguration config; + protected PropertiesConfiguration config; - protected SchemaProvider(PropertiesConfiguration config) { - this.config = config; - } + protected SchemaProvider(PropertiesConfiguration config) { + this.config = config; + } - public abstract Schema getSourceSchema(); + public abstract Schema getSourceSchema(); - public abstract Schema getTargetSchema(); + public abstract Schema getTargetSchema(); } diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroConvertor.java 
b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroConvertor.java index a2c1db220..083ecb7d3 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroConvertor.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroConvertor.java @@ -21,73 +21,71 @@ package com.uber.hoodie.utilities.sources; import com.twitter.bijection.Injection; import com.twitter.bijection.avro.GenericAvroCodecs; import com.uber.hoodie.avro.MercifulJsonConverter; - +import java.io.IOException; +import java.io.Serializable; import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; -import java.io.IOException; -import java.io.Serializable; - /** - * Convert a variety of {@link SourceDataFormat} into Avro GenericRecords. Has a bunch of lazy fields - * to circumvent issues around serializing these objects from driver to executors + * Convert a variety of {@link SourceDataFormat} into Avro GenericRecords. Has a bunch of lazy + * fields to circumvent issues around serializing these objects from driver to executors */ public class AvroConvertor implements Serializable { - /** - * To be lazily inited on executors - */ - private transient Schema schema; + /** + * To be lazily inited on executors + */ + private transient Schema schema; - private final String schemaStr; + private final String schemaStr; - /** - * To be lazily inited on executors - */ - private transient MercifulJsonConverter jsonConverter; + /** + * To be lazily inited on executors + */ + private transient MercifulJsonConverter jsonConverter; - /** - * To be lazily inited on executors - */ - private transient Injection recordInjection; + /** + * To be lazily inited on executors + */ + private transient Injection recordInjection; - public AvroConvertor(String schemaStr) { - this.schemaStr = schemaStr; + public AvroConvertor(String schemaStr) { + this.schemaStr = schemaStr; + } + + + private void initSchema() { + if (schema == null) { + Schema.Parser 
parser = new Schema.Parser(); + schema = parser.parse(schemaStr); } + } - - private void initSchema() { - if (schema == null) { - Schema.Parser parser = new Schema.Parser(); - schema = parser.parse(schemaStr); - } + private void initInjection() { + if (recordInjection == null) { + recordInjection = GenericAvroCodecs.toBinary(schema); } + } - private void initInjection() { - if (recordInjection == null) { - recordInjection = GenericAvroCodecs.toBinary(schema); - } - } - - private void initJsonConvertor() { - if (jsonConverter == null) { - jsonConverter = new MercifulJsonConverter(schema); - } + private void initJsonConvertor() { + if (jsonConverter == null) { + jsonConverter = new MercifulJsonConverter(schema); } + } - public GenericRecord fromJson(String json) throws IOException { - initSchema(); - initJsonConvertor(); - return jsonConverter.convert(json); - } + public GenericRecord fromJson(String json) throws IOException { + initSchema(); + initJsonConvertor(); + return jsonConverter.convert(json); + } - public GenericRecord fromAvroBinary(byte[] avroBinary) throws IOException { - initSchema(); - initInjection(); - return recordInjection.invert(avroBinary).get(); - } + public GenericRecord fromAvroBinary(byte[] avroBinary) throws IOException { + initSchema(); + initInjection(); + return recordInjection.invert(avroBinary).get(); + } } diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/DFSSource.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/DFSSource.java index 49c333782..128a449a4 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/DFSSource.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/DFSSource.java @@ -23,7 +23,12 @@ import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.exception.HoodieIOException; import com.uber.hoodie.exception.HoodieNotSupportedException; import com.uber.hoodie.utilities.schema.SchemaProvider; - +import 
java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Optional; +import java.util.stream.Collectors; import org.apache.avro.generic.GenericRecord; import org.apache.avro.mapred.AvroKey; import org.apache.avro.mapreduce.AvroKeyInputFormat; @@ -40,113 +45,121 @@ import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Optional; -import java.util.stream.Collectors; - /** * Source to read data from a given DFS directory structure, incrementally */ public class DFSSource extends Source { - /** - * Configs supported - */ - static class Config { - private final static String ROOT_INPUT_PATH_PROP = "hoodie.deltastreamer.source.dfs.root"; - } - - private final static List IGNORE_FILEPREFIX_LIST = Arrays.asList(".", "_"); - - private final transient FileSystem fs; - - public DFSSource(PropertiesConfiguration config, JavaSparkContext sparkContext, SourceDataFormat dataFormat, SchemaProvider schemaProvider) { - super(config, sparkContext, dataFormat, schemaProvider); - this.fs = FSUtils.getFs(); - DataSourceUtils.checkRequiredProperties(config, Arrays.asList(Config.ROOT_INPUT_PATH_PROP)); + /** + * Configs supported + */ + static class Config { + + private final static String ROOT_INPUT_PATH_PROP = "hoodie.deltastreamer.source.dfs.root"; + } + + private final static List IGNORE_FILEPREFIX_LIST = Arrays.asList(".", "_"); + + private final transient FileSystem fs; + + public DFSSource(PropertiesConfiguration config, JavaSparkContext sparkContext, + SourceDataFormat dataFormat, SchemaProvider schemaProvider) { + super(config, sparkContext, dataFormat, schemaProvider); + this.fs = FSUtils.getFs(); + DataSourceUtils.checkRequiredProperties(config, Arrays.asList(Config.ROOT_INPUT_PATH_PROP)); + } + + + public static 
JavaRDD fromAvroFiles(final AvroConvertor convertor, String pathStr, + JavaSparkContext sparkContext) { + JavaPairRDD avroRDD = sparkContext.newAPIHadoopFile(pathStr, + AvroKeyInputFormat.class, + AvroKey.class, + NullWritable.class, + sparkContext.hadoopConfiguration()); + return avroRDD.keys().map(r -> ((GenericRecord) r.datum())); + } + + public static JavaRDD fromJsonFiles(final AvroConvertor convertor, String pathStr, + JavaSparkContext sparkContext) { + return sparkContext.textFile(pathStr).map((String j) -> { + return convertor.fromJson(j); + }); + } + + public static JavaRDD fromFiles(SourceDataFormat dataFormat, + final AvroConvertor convertor, String pathStr, JavaSparkContext sparkContext) { + if (dataFormat == SourceDataFormat.AVRO) { + return DFSSource.fromAvroFiles(convertor, pathStr, sparkContext); + } else if (dataFormat == SourceDataFormat.JSON) { + return DFSSource.fromJsonFiles(convertor, pathStr, sparkContext); + } else { + throw new HoodieNotSupportedException("Unsupported data format :" + dataFormat); } + } - public static JavaRDD fromAvroFiles(final AvroConvertor convertor, String pathStr, JavaSparkContext sparkContext) { - JavaPairRDD avroRDD = sparkContext.newAPIHadoopFile(pathStr, - AvroKeyInputFormat.class, - AvroKey.class, - NullWritable.class, - sparkContext.hadoopConfiguration()); - return avroRDD.keys().map(r -> ((GenericRecord) r.datum())); - } + @Override + public Pair>, String> fetchNewData( + Optional lastCheckpointStr, long maxInputBytes) { - public static JavaRDD fromJsonFiles(final AvroConvertor convertor, String pathStr, JavaSparkContext sparkContext) { - return sparkContext.textFile(pathStr).map((String j) -> { - return convertor.fromJson(j); - }); - } - - public static JavaRDD fromFiles(SourceDataFormat dataFormat, final AvroConvertor convertor, String pathStr, JavaSparkContext sparkContext) { - if (dataFormat == SourceDataFormat.AVRO) { - return DFSSource.fromAvroFiles(convertor, pathStr, sparkContext); - } else if 
(dataFormat == SourceDataFormat.JSON) { - return DFSSource.fromJsonFiles(convertor, pathStr, sparkContext); - } else { - throw new HoodieNotSupportedException("Unsupported data format :" + dataFormat); + try { + // obtain all eligible files under root folder. + List eligibleFiles = new ArrayList<>(); + RemoteIterator fitr = fs + .listFiles(new Path(config.getString(Config.ROOT_INPUT_PATH_PROP)), true); + while (fitr.hasNext()) { + LocatedFileStatus fileStatus = fitr.next(); + if (fileStatus.isDirectory() || + IGNORE_FILEPREFIX_LIST.stream() + .filter(pfx -> fileStatus.getPath().getName().startsWith(pfx)).count() > 0) { + continue; } - } + eligibleFiles.add(fileStatus); + } + // sort them by modification time. + eligibleFiles.sort((FileStatus f1, FileStatus f2) -> Long.valueOf(f1.getModificationTime()) + .compareTo(Long.valueOf(f2.getModificationTime()))); - - @Override - public Pair>, String> fetchNewData(Optional lastCheckpointStr, long maxInputBytes) { - - try { - // obtain all eligible files under root folder. - List eligibleFiles = new ArrayList<>(); - RemoteIterator fitr = fs.listFiles(new Path(config.getString(Config.ROOT_INPUT_PATH_PROP)), true); - while (fitr.hasNext()) { - LocatedFileStatus fileStatus = fitr.next(); - if (fileStatus.isDirectory() || - IGNORE_FILEPREFIX_LIST.stream().filter(pfx -> fileStatus.getPath().getName().startsWith(pfx)).count() > 0) { - continue; - } - eligibleFiles.add(fileStatus); - } - // sort them by modification time. 
- eligibleFiles.sort((FileStatus f1, FileStatus f2) -> Long.valueOf(f1.getModificationTime()).compareTo(Long.valueOf(f2.getModificationTime()))); - - // Filter based on checkpoint & input size, if needed - long currentBytes = 0; - long maxModificationTime = Long.MIN_VALUE; - List filteredFiles = new ArrayList<>(); - for (FileStatus f : eligibleFiles) { - if (lastCheckpointStr.isPresent() && f.getModificationTime() <= Long.valueOf(lastCheckpointStr.get())) { - // skip processed files - continue; - } - - maxModificationTime = f.getModificationTime(); - currentBytes += f.getLen(); - filteredFiles.add(f); - if (currentBytes >= maxInputBytes) { - // we have enough data, we are done - break; - } - } - - // no data to read - if (filteredFiles.size() == 0) { - return new ImmutablePair<>(Optional.empty(), lastCheckpointStr.isPresent() ? lastCheckpointStr.get() : String.valueOf(Long.MIN_VALUE)); - } - - // read the files out. - String pathStr = filteredFiles.stream().map(f -> f.getPath().toString()).collect(Collectors.joining(",")); - String schemaStr = schemaProvider.getSourceSchema().toString(); - final AvroConvertor avroConvertor = new AvroConvertor(schemaStr); - - return new ImmutablePair<>(Optional.of(DFSSource.fromFiles(dataFormat, avroConvertor, pathStr, sparkContext)), - String.valueOf(maxModificationTime)); - } catch (IOException ioe) { - throw new HoodieIOException("Unable to read from source from checkpoint: " + lastCheckpointStr, ioe); + // Filter based on checkpoint & input size, if needed + long currentBytes = 0; + long maxModificationTime = Long.MIN_VALUE; + List filteredFiles = new ArrayList<>(); + for (FileStatus f : eligibleFiles) { + if (lastCheckpointStr.isPresent() && f.getModificationTime() <= Long + .valueOf(lastCheckpointStr.get())) { + // skip processed files + continue; } + + maxModificationTime = f.getModificationTime(); + currentBytes += f.getLen(); + filteredFiles.add(f); + if (currentBytes >= maxInputBytes) { + // we have enough data, we are 
done + break; + } + } + + // no data to read + if (filteredFiles.size() == 0) { + return new ImmutablePair<>(Optional.empty(), + lastCheckpointStr.isPresent() ? lastCheckpointStr.get() + : String.valueOf(Long.MIN_VALUE)); + } + + // read the files out. + String pathStr = filteredFiles.stream().map(f -> f.getPath().toString()) + .collect(Collectors.joining(",")); + String schemaStr = schemaProvider.getSourceSchema().toString(); + final AvroConvertor avroConvertor = new AvroConvertor(schemaStr); + + return new ImmutablePair<>( + Optional.of(DFSSource.fromFiles(dataFormat, avroConvertor, pathStr, sparkContext)), + String.valueOf(maxModificationTime)); + } catch (IOException ioe) { + throw new HoodieIOException( + "Unable to read from source from checkpoint: " + lastCheckpointStr, ioe); } + } } diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/HiveIncrPullSource.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/HiveIncrPullSource.java index 4aceb88b3..08c919366 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/HiveIncrPullSource.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/HiveIncrPullSource.java @@ -22,7 +22,13 @@ import com.uber.hoodie.DataSourceUtils; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.exception.HoodieIOException; import com.uber.hoodie.utilities.schema.SchemaProvider; - +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Optional; +import java.util.stream.Collectors; import org.apache.avro.generic.GenericRecord; import org.apache.commons.configuration.PropertiesConfiguration; import org.apache.commons.lang3.tuple.ImmutablePair; @@ -35,98 +41,98 @@ import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; -import java.io.IOException; -import 
java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.Optional; -import java.util.stream.Collectors; - /** * Source to read deltas produced by {@link com.uber.hoodie.utilities.HiveIncrementalPuller}, commit * by commit and apply to the target table * * The general idea here is to have commits sync across the data pipeline. * - * [Source Tables(s)] ====> HiveIncrementalScanner ==> incrPullRootPath ==> targetTable - * {c1,c2,c3,...} {c1,c2,c3,...} {c1,c2,c3,...} + * [Source Tables(s)] ====> HiveIncrementalScanner ==> incrPullRootPath ==> targetTable + * {c1,c2,c3,...} {c1,c2,c3,...} {c1,c2,c3,...} * * This produces beautiful causality, that makes data issues in ETLs very easy to debug - * */ public class HiveIncrPullSource extends Source { - private static volatile Logger log = LogManager.getLogger(HiveIncrPullSource.class); + private static volatile Logger log = LogManager.getLogger(HiveIncrPullSource.class); - private final transient FileSystem fs; + private final transient FileSystem fs; - private final String incrPullRootPath; + private final String incrPullRootPath; - /** - * Configs supported - */ - static class Config { - private final static String ROOT_INPUT_PATH_PROP = "hoodie.deltastreamer.source.incrpull.root"; + /** + * Configs supported + */ + static class Config { + + private final static String ROOT_INPUT_PATH_PROP = "hoodie.deltastreamer.source.incrpull.root"; + } + + public HiveIncrPullSource(PropertiesConfiguration config, JavaSparkContext sparkContext, + SourceDataFormat dataFormat, SchemaProvider schemaProvider) { + super(config, sparkContext, dataFormat, schemaProvider); + this.fs = FSUtils.getFs(); + DataSourceUtils.checkRequiredProperties(config, Arrays.asList(Config.ROOT_INPUT_PATH_PROP)); + this.incrPullRootPath = config.getString(Config.ROOT_INPUT_PATH_PROP); + } + + /** + * Finds the first commit from source, greater than the target's last commit, and reads it out. 
+ */ + private Optional findCommitToPull(Optional latestTargetCommit) + throws IOException { + + log.info("Looking for commits "); + + FileStatus[] commitTimePaths = fs.listStatus(new Path(incrPullRootPath)); + List commitTimes = new ArrayList<>(commitTimePaths.length); + for (FileStatus commitTimePath : commitTimePaths) { + String[] splits = commitTimePath.getPath().toString().split("/"); + commitTimes.add(splits[splits.length - 1]); + } + Collections.sort(commitTimes); + log.info("Retrieved commit times " + commitTimes); + + if (!latestTargetCommit.isPresent()) { + // start from the beginning + return Optional.of(commitTimes.get(0)); } - public HiveIncrPullSource(PropertiesConfiguration config, JavaSparkContext sparkContext, SourceDataFormat dataFormat, SchemaProvider schemaProvider) { - super(config, sparkContext, dataFormat, schemaProvider); - this.fs = FSUtils.getFs(); - DataSourceUtils.checkRequiredProperties(config, Arrays.asList(Config.ROOT_INPUT_PATH_PROP)); - this.incrPullRootPath = config.getString(Config.ROOT_INPUT_PATH_PROP); + for (String commitTime : commitTimes) { + //TODO(vc): Add an option to delete consumed commits + if (commitTime.compareTo(latestTargetCommit.get()) > 0) { + return Optional.of(commitTime); + } } + return Optional.empty(); + } - /** - * Finds the first commit from source, greater than the target's last commit, and reads it out. - */ - private Optional findCommitToPull(Optional latestTargetCommit) throws IOException { + @Override + public Pair>, String> fetchNewData( + Optional lastCheckpointStr, long maxInputBytes) { + try { + // find the source commit to pull + Optional commitToPull = findCommitToPull(lastCheckpointStr); - log.info("Looking for commits "); + if (!commitToPull.isPresent()) { + return new ImmutablePair<>(Optional.empty(), + lastCheckpointStr.isPresent() ? 
lastCheckpointStr.get() : ""); + } - FileStatus[] commitTimePaths = fs.listStatus(new Path(incrPullRootPath)); - List commitTimes = new ArrayList<>(commitTimePaths.length); - for (FileStatus commitTimePath : commitTimePaths) { - String[] splits = commitTimePath.getPath().toString().split("/"); - commitTimes.add(splits[splits.length - 1]); - } - Collections.sort(commitTimes); - log.info("Retrieved commit times " + commitTimes); - - if (!latestTargetCommit.isPresent()) { - // start from the beginning - return Optional.of(commitTimes.get(0)); - } - - for (String commitTime : commitTimes) { - //TODO(vc): Add an option to delete consumed commits - if (commitTime.compareTo(latestTargetCommit.get()) > 0) { - return Optional.of(commitTime); - } - } - return Optional.empty(); - } - - @Override - public Pair>, String> fetchNewData(Optional lastCheckpointStr, long maxInputBytes) { - try { - // find the source commit to pull - Optional commitToPull = findCommitToPull(lastCheckpointStr); - - if (!commitToPull.isPresent()) { - return new ImmutablePair<>(Optional.empty(), lastCheckpointStr.isPresent() ? lastCheckpointStr.get() : ""); - } - - // read the files out. - List commitDeltaFiles = Arrays.asList(fs.listStatus(new Path(incrPullRootPath, commitToPull.get()))); - String pathStr = commitDeltaFiles.stream().map(f -> f.getPath().toString()).collect(Collectors.joining(",")); - String schemaStr = schemaProvider.getSourceSchema().toString(); - final AvroConvertor avroConvertor = new AvroConvertor(schemaStr); - return new ImmutablePair<>(Optional.of(DFSSource.fromFiles(dataFormat, avroConvertor, pathStr, sparkContext)), - String.valueOf(commitToPull.get())); - } catch (IOException ioe) { - throw new HoodieIOException("Unable to read from source from checkpoint: " + lastCheckpointStr, ioe); - } + // read the files out. 
+ List commitDeltaFiles = Arrays + .asList(fs.listStatus(new Path(incrPullRootPath, commitToPull.get()))); + String pathStr = commitDeltaFiles.stream().map(f -> f.getPath().toString()) + .collect(Collectors.joining(",")); + String schemaStr = schemaProvider.getSourceSchema().toString(); + final AvroConvertor avroConvertor = new AvroConvertor(schemaStr); + return new ImmutablePair<>( + Optional.of(DFSSource.fromFiles(dataFormat, avroConvertor, pathStr, sparkContext)), + String.valueOf(commitToPull.get())); + } catch (IOException ioe) { + throw new HoodieIOException( + "Unable to read from source from checkpoint: " + lastCheckpointStr, ioe); } + } } diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/KafkaSource.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/KafkaSource.java index f1738a385..2f2941e5d 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/KafkaSource.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/KafkaSource.java @@ -22,20 +22,6 @@ import com.uber.hoodie.DataSourceUtils; import com.uber.hoodie.exception.HoodieNotSupportedException; import com.uber.hoodie.utilities.exception.HoodieDeltaStreamerException; import com.uber.hoodie.utilities.schema.SchemaProvider; - -import org.apache.avro.generic.GenericRecord; -import org.apache.commons.configuration.PropertiesConfiguration; -import org.apache.commons.lang3.tuple.ImmutablePair; -import org.apache.commons.lang3.tuple.Pair; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.streaming.kafka.KafkaCluster; -import org.apache.spark.streaming.kafka.KafkaUtils; -import org.apache.spark.streaming.kafka.OffsetRange; -import kafka.common.TopicAndPartition; - import java.nio.charset.Charset; import java.util.Arrays; import java.util.Comparator; @@ -48,8 +34,19 @@ import 
java.util.Spliterators; import java.util.stream.Collectors; import java.util.stream.Stream; import java.util.stream.StreamSupport; - +import kafka.common.TopicAndPartition; import kafka.serializer.DefaultDecoder; +import org.apache.avro.generic.GenericRecord; +import org.apache.commons.configuration.PropertiesConfiguration; +import org.apache.commons.lang3.tuple.ImmutablePair; +import org.apache.commons.lang3.tuple.Pair; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.streaming.kafka.KafkaCluster; +import org.apache.spark.streaming.kafka.KafkaUtils; +import org.apache.spark.streaming.kafka.OffsetRange; import scala.Predef; import scala.Tuple2; import scala.collection.JavaConverters; @@ -65,183 +62,192 @@ import scala.util.Either; */ public class KafkaSource extends Source { - private static volatile Logger log = LogManager.getLogger(KafkaSource.class); + private static volatile Logger log = LogManager.getLogger(KafkaSource.class); - static class CheckpointUtils { + static class CheckpointUtils { - /** - * Reconstruct checkpoint from string. - * - * @param checkpointStr - * @return - */ - public static HashMap strToOffsets(String checkpointStr) { - HashMap offsetMap = new HashMap<>(); - String[] splits = checkpointStr.split(","); - String topic = splits[0]; - for (int i = 1; i < splits.length; i++) { - String[] subSplits = splits[i].split(":"); - offsetMap.put(new TopicAndPartition(topic, Integer.parseInt(subSplits[0])), - new KafkaCluster.LeaderOffset("", -1, Long.parseLong(subSplits[1]))); - } - return offsetMap; - } - - /** - * String representation of checkpoint - * - * Format: - * topic1,0:offset0,1:offset1,2:offset2, ..... - * - * @param offsetMap - * @return - */ - public static String offsetsToStr(HashMap offsetMap) { - StringBuilder sb = new StringBuilder(); - // atleast 1 partition will be present. 
- sb.append(offsetMap.entrySet().stream().findFirst().get().getKey().topic() + ","); - sb.append(offsetMap.entrySet().stream() - .map(e -> String.format("%s:%d",e.getKey().partition(), e.getValue().offset())) - .collect(Collectors.joining(","))); - return sb.toString(); - } - - public static OffsetRange[] computeOffsetRanges(HashMap fromOffsetMap, - HashMap toOffsetMap) { - Comparator byPartition = (OffsetRange o1, OffsetRange o2) -> { - return Integer.valueOf(o1.partition()).compareTo(Integer.valueOf(o2.partition())); - }; - List offsetRanges = toOffsetMap.entrySet().stream().map(e -> { - TopicAndPartition tp = e.getKey(); - long fromOffset = -1; - if (fromOffsetMap.containsKey(tp)){ - fromOffset = fromOffsetMap.get(tp).offset(); - } - return OffsetRange.create(tp, fromOffset, e.getValue().offset()); - }).sorted(byPartition).collect(Collectors.toList()); - - OffsetRange[] ranges = new OffsetRange[offsetRanges.size()]; - return offsetRanges.toArray(ranges); - } - - public static long totalNewMessages(OffsetRange[] ranges) { - long totalMsgs = 0; - for (OffsetRange range: ranges) { - totalMsgs += Math.max(range.untilOffset()-range.fromOffset(), 0); - } - return totalMsgs; - } + /** + * Reconstruct checkpoint from string. + */ + public static HashMap strToOffsets( + String checkpointStr) { + HashMap offsetMap = new HashMap<>(); + String[] splits = checkpointStr.split(","); + String topic = splits[0]; + for (int i = 1; i < splits.length; i++) { + String[] subSplits = splits[i].split(":"); + offsetMap.put(new TopicAndPartition(topic, Integer.parseInt(subSplits[0])), + new KafkaCluster.LeaderOffset("", -1, Long.parseLong(subSplits[1]))); + } + return offsetMap; } /** - * Helpers to deal with tricky scala <=> java conversions. (oh my!) + * String representation of checkpoint + * + * Format: topic1,0:offset0,1:offset1,2:offset2, ..... 
*/ - static class ScalaHelpers { - public static Map toScalaMap(HashMap m) { - return JavaConverters.mapAsScalaMapConverter(m).asScala().toMap( - Predef.>conforms() - ); - } - - public static Set toScalaSet(HashSet s) { - return JavaConverters.asScalaSetConverter(s).asScala().toSet(); - } - - public static java.util.Map toJavaMap(Map m) { - return JavaConverters.mapAsJavaMapConverter(m).asJava(); - } + public static String offsetsToStr( + HashMap offsetMap) { + StringBuilder sb = new StringBuilder(); + // atleast 1 partition will be present. + sb.append(offsetMap.entrySet().stream().findFirst().get().getKey().topic() + ","); + sb.append(offsetMap.entrySet().stream() + .map(e -> String.format("%s:%d", e.getKey().partition(), e.getValue().offset())) + .collect(Collectors.joining(","))); + return sb.toString(); } + public static OffsetRange[] computeOffsetRanges( + HashMap fromOffsetMap, + HashMap toOffsetMap) { + Comparator byPartition = (OffsetRange o1, OffsetRange o2) -> { + return Integer.valueOf(o1.partition()).compareTo(Integer.valueOf(o2.partition())); + }; + List offsetRanges = toOffsetMap.entrySet().stream().map(e -> { + TopicAndPartition tp = e.getKey(); + long fromOffset = -1; + if (fromOffsetMap.containsKey(tp)) { + fromOffset = fromOffsetMap.get(tp).offset(); + } + return OffsetRange.create(tp, fromOffset, e.getValue().offset()); + }).sorted(byPartition).collect(Collectors.toList()); - /** - * Configs to be passed for this source. 
All standard Kafka consumer configs are also - * respected - */ - static class Config { - private final static String KAFKA_TOPIC_NAME = "hoodie.deltastreamer.source.kafka.topic"; - private final static String DEFAULT_AUTO_RESET_OFFSET = "largest"; + OffsetRange[] ranges = new OffsetRange[offsetRanges.size()]; + return offsetRanges.toArray(ranges); } + public static long totalNewMessages(OffsetRange[] ranges) { + long totalMsgs = 0; + for (OffsetRange range : ranges) { + totalMsgs += Math.max(range.untilOffset() - range.fromOffset(), 0); + } + return totalMsgs; + } + } - private HashMap kafkaParams; + /** + * Helpers to deal with tricky scala <=> java conversions. (oh my!) + */ + static class ScalaHelpers { - private final String topicName; - - public KafkaSource(PropertiesConfiguration config, JavaSparkContext sparkContext, SourceDataFormat dataFormat, SchemaProvider schemaProvider) { - super(config, sparkContext, dataFormat, schemaProvider); - - kafkaParams = new HashMap<>(); - Stream keys = StreamSupport.stream(Spliterators.spliteratorUnknownSize(config.getKeys(), Spliterator.NONNULL), false); - keys.forEach(k -> kafkaParams.put(k, config.getString(k))); - - DataSourceUtils.checkRequiredProperties(config, Arrays.asList(Config.KAFKA_TOPIC_NAME)); - topicName = config.getString(Config.KAFKA_TOPIC_NAME); + public static Map toScalaMap(HashMap m) { + return JavaConverters.mapAsScalaMapConverter(m).asScala().toMap( + Predef.>conforms() + ); } - @Override - public Pair>, String> fetchNewData(Optional lastCheckpointStr, long maxInputBytes) { - - // Obtain current metadata for the topic - KafkaCluster cluster = new KafkaCluster(ScalaHelpers.toScalaMap(kafkaParams)); - Either, Set> either = cluster.getPartitions(ScalaHelpers.toScalaSet(new HashSet<>(Arrays.asList(topicName)))); - if (either.isLeft()) { - // log errors. and bail out. 
- throw new HoodieDeltaStreamerException("Error obtaining partition metadata", either.left().get().head()); - } - Set topicPartitions = either.right().get(); - - // Determine the offset ranges to read from - HashMap fromOffsets; - if (lastCheckpointStr.isPresent()) { - fromOffsets = CheckpointUtils.strToOffsets(lastCheckpointStr.get()); - } else { - String autoResetValue = config.getString("auto.offset.reset", Config.DEFAULT_AUTO_RESET_OFFSET); - if (autoResetValue.equals("smallest")) { - fromOffsets = new HashMap(ScalaHelpers.toJavaMap(cluster.getEarliestLeaderOffsets(topicPartitions).right().get())); - } else if (autoResetValue.equals("largest")) { - fromOffsets = new HashMap(ScalaHelpers.toJavaMap(cluster.getLatestLeaderOffsets(topicPartitions).right().get())); - } else { - throw new HoodieNotSupportedException("Auto reset value must be one of 'smallest' or 'largest' "); - } - } - - // Always read until the latest offset - HashMap toOffsets = new HashMap(ScalaHelpers.toJavaMap(cluster.getLatestLeaderOffsets(topicPartitions).right().get())); - - - // Come up with final set of OffsetRanges to read (account for new partitions) - // TODO(vc): Respect maxInputBytes, by estimating number of messages to read each batch from partition size - OffsetRange[] offsetRanges = CheckpointUtils.computeOffsetRanges(fromOffsets, toOffsets); - long totalNewMsgs = CheckpointUtils.totalNewMessages(offsetRanges); - if (totalNewMsgs <= 0) { - return new ImmutablePair<>(Optional.empty(), lastCheckpointStr.isPresent() ? 
lastCheckpointStr.get() : CheckpointUtils.offsetsToStr(toOffsets)); - } else { - log.info("About to read " + totalNewMsgs + " from Kafka for topic :" + topicName); - } - - - // Perform the actual read from Kafka - JavaRDD kafkaRDD = KafkaUtils.createRDD( - sparkContext, - byte[].class, - byte[].class, - DefaultDecoder.class, - DefaultDecoder.class, - kafkaParams, - offsetRanges).values(); - - // Produce a RDD[GenericRecord] - final AvroConvertor avroConvertor = new AvroConvertor(schemaProvider.getSourceSchema().toString()); - JavaRDD newDataRDD; - if (dataFormat == SourceDataFormat.AVRO) { - newDataRDD = kafkaRDD.map(bytes -> avroConvertor.fromAvroBinary(bytes)); - } else if (dataFormat == SourceDataFormat.JSON) { - newDataRDD = kafkaRDD.map(bytes -> avroConvertor.fromJson(new String(bytes, Charset.forName("utf-8")))); - } else { - throw new HoodieNotSupportedException("Unsupport data format :" + dataFormat); - } - - return new ImmutablePair<>(Optional.of(newDataRDD), CheckpointUtils.offsetsToStr(toOffsets)); + public static Set toScalaSet(HashSet s) { + return JavaConverters.asScalaSetConverter(s).asScala().toSet(); } + + public static java.util.Map toJavaMap(Map m) { + return JavaConverters.mapAsJavaMapConverter(m).asJava(); + } + } + + + /** + * Configs to be passed for this source. 
All standard Kafka consumer configs are also respected + */ + static class Config { + + private final static String KAFKA_TOPIC_NAME = "hoodie.deltastreamer.source.kafka.topic"; + private final static String DEFAULT_AUTO_RESET_OFFSET = "largest"; + } + + + private HashMap kafkaParams; + + private final String topicName; + + public KafkaSource(PropertiesConfiguration config, JavaSparkContext sparkContext, + SourceDataFormat dataFormat, SchemaProvider schemaProvider) { + super(config, sparkContext, dataFormat, schemaProvider); + + kafkaParams = new HashMap<>(); + Stream keys = StreamSupport + .stream(Spliterators.spliteratorUnknownSize(config.getKeys(), Spliterator.NONNULL), false); + keys.forEach(k -> kafkaParams.put(k, config.getString(k))); + + DataSourceUtils.checkRequiredProperties(config, Arrays.asList(Config.KAFKA_TOPIC_NAME)); + topicName = config.getString(Config.KAFKA_TOPIC_NAME); + } + + @Override + public Pair>, String> fetchNewData( + Optional lastCheckpointStr, long maxInputBytes) { + + // Obtain current metadata for the topic + KafkaCluster cluster = new KafkaCluster(ScalaHelpers.toScalaMap(kafkaParams)); + Either, Set> either = cluster + .getPartitions(ScalaHelpers.toScalaSet(new HashSet<>(Arrays.asList(topicName)))); + if (either.isLeft()) { + // log errors. and bail out. 
+ throw new HoodieDeltaStreamerException("Error obtaining partition metadata", + either.left().get().head()); + } + Set topicPartitions = either.right().get(); + + // Determine the offset ranges to read from + HashMap fromOffsets; + if (lastCheckpointStr.isPresent()) { + fromOffsets = CheckpointUtils.strToOffsets(lastCheckpointStr.get()); + } else { + String autoResetValue = config + .getString("auto.offset.reset", Config.DEFAULT_AUTO_RESET_OFFSET); + if (autoResetValue.equals("smallest")) { + fromOffsets = new HashMap(ScalaHelpers + .toJavaMap(cluster.getEarliestLeaderOffsets(topicPartitions).right().get())); + } else if (autoResetValue.equals("largest")) { + fromOffsets = new HashMap( + ScalaHelpers.toJavaMap(cluster.getLatestLeaderOffsets(topicPartitions).right().get())); + } else { + throw new HoodieNotSupportedException( + "Auto reset value must be one of 'smallest' or 'largest' "); + } + } + + // Always read until the latest offset + HashMap toOffsets = new HashMap( + ScalaHelpers.toJavaMap(cluster.getLatestLeaderOffsets(topicPartitions).right().get())); + + // Come up with final set of OffsetRanges to read (account for new partitions) + // TODO(vc): Respect maxInputBytes, by estimating number of messages to read each batch from partition size + OffsetRange[] offsetRanges = CheckpointUtils.computeOffsetRanges(fromOffsets, toOffsets); + long totalNewMsgs = CheckpointUtils.totalNewMessages(offsetRanges); + if (totalNewMsgs <= 0) { + return new ImmutablePair<>(Optional.empty(), + lastCheckpointStr.isPresent() ? 
lastCheckpointStr.get() + : CheckpointUtils.offsetsToStr(toOffsets)); + } else { + log.info("About to read " + totalNewMsgs + " from Kafka for topic :" + topicName); + } + + // Perform the actual read from Kafka + JavaRDD kafkaRDD = KafkaUtils.createRDD( + sparkContext, + byte[].class, + byte[].class, + DefaultDecoder.class, + DefaultDecoder.class, + kafkaParams, + offsetRanges).values(); + + // Produce a RDD[GenericRecord] + final AvroConvertor avroConvertor = new AvroConvertor( + schemaProvider.getSourceSchema().toString()); + JavaRDD newDataRDD; + if (dataFormat == SourceDataFormat.AVRO) { + newDataRDD = kafkaRDD.map(bytes -> avroConvertor.fromAvroBinary(bytes)); + } else if (dataFormat == SourceDataFormat.JSON) { + newDataRDD = kafkaRDD + .map(bytes -> avroConvertor.fromJson(new String(bytes, Charset.forName("utf-8")))); + } else { + throw new HoodieNotSupportedException("Unsupport data format :" + dataFormat); + } + + return new ImmutablePair<>(Optional.of(newDataRDD), CheckpointUtils.offsetsToStr(toOffsets)); + } } diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/Source.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/Source.java index b44ca614f..d8ff58e89 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/Source.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/Source.java @@ -19,50 +19,46 @@ package com.uber.hoodie.utilities.sources; import com.uber.hoodie.utilities.schema.SchemaProvider; - +import java.io.Serializable; +import java.util.Optional; import org.apache.avro.generic.GenericRecord; import org.apache.commons.configuration.PropertiesConfiguration; import org.apache.commons.lang3.tuple.Pair; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; -import java.io.Serializable; -import java.util.Optional; - /** * Represents a source from which we can tail data. Assumes a constructor that takes properties. 
*/ public abstract class Source implements Serializable { - protected transient PropertiesConfiguration config; + protected transient PropertiesConfiguration config; - protected transient JavaSparkContext sparkContext; + protected transient JavaSparkContext sparkContext; - protected transient SourceDataFormat dataFormat; + protected transient SourceDataFormat dataFormat; - protected transient SchemaProvider schemaProvider; + protected transient SchemaProvider schemaProvider; - protected Source(PropertiesConfiguration config, JavaSparkContext sparkContext, SourceDataFormat dataFormat, SchemaProvider schemaProvider) { - this.config = config; - this.sparkContext = sparkContext; - this.dataFormat = dataFormat; - this.schemaProvider = schemaProvider; - } + protected Source(PropertiesConfiguration config, JavaSparkContext sparkContext, + SourceDataFormat dataFormat, SchemaProvider schemaProvider) { + this.config = config; + this.sparkContext = sparkContext; + this.dataFormat = dataFormat; + this.schemaProvider = schemaProvider; + } - /** - * Fetches new data upto maxInputBytes, from the provided checkpoint and returns an RDD of the data, - * as well as the checkpoint to be written as a result of that. - * - * @param lastCheckpointStr - * @param maxInputBytes - * @return - */ - public abstract Pair>, String> fetchNewData(Optional lastCheckpointStr, - long maxInputBytes); + /** + * Fetches new data upto maxInputBytes, from the provided checkpoint and returns an RDD of the + * data, as well as the checkpoint to be written as a result of that. 
+ */ + public abstract Pair>, String> fetchNewData( + Optional lastCheckpointStr, + long maxInputBytes); - public PropertiesConfiguration getConfig() { - return config; - } + public PropertiesConfiguration getConfig() { + return config; + } } diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/SourceDataFormat.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/SourceDataFormat.java index 229d7ff39..12596a7cd 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/SourceDataFormat.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/SourceDataFormat.java @@ -22,8 +22,8 @@ package com.uber.hoodie.utilities.sources; * Format of the data within source. */ public enum SourceDataFormat { - AVRO, // No conversion needed explicitly to avro - JSON, // we will try to convert to avro - ROW, // Will be added later, so we can plug/play with spark sources. - CUSTOM // the source is responsible for conversion to avro. + AVRO, // No conversion needed explicitly to avro + JSON, // we will try to convert to avro + ROW, // Will be added later, so we can plug/play with spark sources. + CUSTOM // the source is responsible for conversion to avro. 
} diff --git a/hoodie-utilities/src/main/resources/delta-streamer-config/hoodie-client.properties b/hoodie-utilities/src/main/resources/delta-streamer-config/hoodie-client.properties index 0a578f439..81f928b01 100644 --- a/hoodie-utilities/src/main/resources/delta-streamer-config/hoodie-client.properties +++ b/hoodie-utilities/src/main/resources/delta-streamer-config/hoodie-client.properties @@ -15,5 +15,4 @@ # # # - hoodie.upsert.shuffle.parallelism=2 diff --git a/hoodie-utilities/src/main/resources/delta-streamer-config/key-generator.properties b/hoodie-utilities/src/main/resources/delta-streamer-config/key-generator.properties index e98189d99..c75201780 100644 --- a/hoodie-utilities/src/main/resources/delta-streamer-config/key-generator.properties +++ b/hoodie-utilities/src/main/resources/delta-streamer-config/key-generator.properties @@ -15,6 +15,5 @@ # # # - hoodie.datasource.write.recordkey.field=_row_key hoodie.datasource.write.partitionpath.field=driver diff --git a/hoodie-utilities/src/main/resources/delta-streamer-config/schema-provider.properties b/hoodie-utilities/src/main/resources/delta-streamer-config/schema-provider.properties index 187cd1193..1842069de 100644 --- a/hoodie-utilities/src/main/resources/delta-streamer-config/schema-provider.properties +++ b/hoodie-utilities/src/main/resources/delta-streamer-config/schema-provider.properties @@ -14,7 +14,5 @@ # limitations under the License. 
# # - - hoodie.deltastreamer.filebased.schemaprovider.source.schema.file=file:///Users/vinoth/bin/hoodie/hoodie-utilities/src/main/resources/delta-streamer-config/source.avsc hoodie.deltastreamer.filebased.schemaprovider.target.schema.file=file:///Users/vinoth/bin/hoodie/hoodie-utilities/src/main/resources/delta-streamer-config/target.avsc diff --git a/hoodie-utilities/src/main/resources/delta-streamer-config/source.properties b/hoodie-utilities/src/main/resources/delta-streamer-config/source.properties index 85489c5ec..6e698db1d 100644 --- a/hoodie-utilities/src/main/resources/delta-streamer-config/source.properties +++ b/hoodie-utilities/src/main/resources/delta-streamer-config/source.properties @@ -15,10 +15,8 @@ # # # - # DFS Source hoodie.deltastreamer.source.dfs.root=file:///tmp/hoodie-dfs-input - # Kafka Source hoodie.deltastreamer.source.kafka.topic=uber_trips metadata.broker.list=localhost:9092 diff --git a/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/TestHDFSParquetImporter.java b/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/TestHDFSParquetImporter.java index 38feb184c..af0a52330 100644 --- a/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/TestHDFSParquetImporter.java +++ b/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/TestHDFSParquetImporter.java @@ -55,237 +55,240 @@ import org.junit.BeforeClass; import org.junit.Test; public class TestHDFSParquetImporter implements Serializable { - private static String dfsBasePath; - private static HdfsTestService hdfsTestService; - private static MiniDFSCluster dfsCluster; - private static DistributedFileSystem dfs; + + private static String dfsBasePath; + private static HdfsTestService hdfsTestService; + private static MiniDFSCluster dfsCluster; + private static DistributedFileSystem dfs; - @BeforeClass - public static void initClass() throws Exception { - hdfsTestService = new HdfsTestService(); - dfsCluster = hdfsTestService.start(true); + @BeforeClass + public static 
void initClass() throws Exception { + hdfsTestService = new HdfsTestService(); + dfsCluster = hdfsTestService.start(true); - // Create a temp folder as the base path - dfs = dfsCluster.getFileSystem(); - dfsBasePath = dfs.getWorkingDirectory().toString(); - dfs.mkdirs(new Path(dfsBasePath)); - FSUtils.setFs(dfs); + // Create a temp folder as the base path + dfs = dfsCluster.getFileSystem(); + dfsBasePath = dfs.getWorkingDirectory().toString(); + dfs.mkdirs(new Path(dfsBasePath)); + FSUtils.setFs(dfs); + } + + @AfterClass + public static void cleanupClass() throws Exception { + if (hdfsTestService != null) { + hdfsTestService.stop(); } + FSUtils.setFs(null); + } - @AfterClass - public static void cleanupClass() throws Exception { - if (hdfsTestService != null) { - hdfsTestService.stop(); + /** + * Test successful data import with retries. + */ + @Test + public void testDatasetImportWithRetries() throws Exception { + JavaSparkContext jsc = null; + try { + jsc = getJavaSparkContext(); + + // Test root folder. + String basePath = (new Path(dfsBasePath, + Thread.currentThread().getStackTrace()[1].getMethodName())).toString(); + + // Hoodie root folder + Path hoodieFolder = new Path(basePath, "testTarget"); + + // Create schema file. + String schemaFile = new Path(basePath, "file.schema").toString(); + + //Create generic records. 
+ Path srcFolder = new Path(basePath, "testSrc"); + createRecords(srcFolder); + + HDFSParquetImporter.Config cfg = getHDFSParquetImporterConfig(srcFolder.toString(), + hoodieFolder.toString(), + "testTable", "COPY_ON_WRITE", "_row_key", "timestamp", + 1, schemaFile); + AtomicInteger retry = new AtomicInteger(3); + AtomicInteger fileCreated = new AtomicInteger(0); + HDFSParquetImporter dataImporter = new HDFSParquetImporter(cfg) { + @Override + protected int dataImport(JavaSparkContext jsc) throws IOException { + int ret = super.dataImport(jsc); + if (retry.decrementAndGet() == 0) { + fileCreated.incrementAndGet(); + createSchemaFile(schemaFile); + } + + return ret; } - FSUtils.setFs(null); - } + }; + // Schema file is not created so this operation should fail. + assertEquals(0, dataImporter.dataImport(jsc, retry.get())); + assertEquals(retry.get(), -1); + assertEquals(fileCreated.get(), 1); - /** - * Test successful data import with retries. - */ - @Test - public void testDatasetImportWithRetries() throws Exception { - JavaSparkContext jsc = null; - try { - jsc = getJavaSparkContext(); + // Check if + // 1. .commit file is present + // 2. number of records in each partition == 24 + // 3. total number of partitions == 4; + boolean isCommitFilePresent = false; + Map recordCounts = new HashMap(); + RemoteIterator hoodieFiles = dfs.listFiles(hoodieFolder, true); + while (hoodieFiles.hasNext()) { + LocatedFileStatus f = hoodieFiles.next(); + isCommitFilePresent = + isCommitFilePresent || f.getPath().toString().endsWith(HoodieTimeline.COMMIT_EXTENSION); - // Test root folder. - String basePath = (new Path(dfsBasePath, - Thread.currentThread().getStackTrace()[1].getMethodName())).toString(); - - // Hoodie root folder - Path hoodieFolder = new Path(basePath, "testTarget"); - - // Create schema file. - String schemaFile = new Path(basePath, "file.schema").toString(); - - - //Create generic records. 
- Path srcFolder = new Path(basePath, "testSrc"); - createRecords(srcFolder); - - HDFSParquetImporter.Config cfg = getHDFSParquetImporterConfig(srcFolder.toString(), hoodieFolder.toString(), - "testTable", "COPY_ON_WRITE", "_row_key", "timestamp", - 1, schemaFile); - AtomicInteger retry = new AtomicInteger(3); - AtomicInteger fileCreated = new AtomicInteger(0); - HDFSParquetImporter dataImporter = new HDFSParquetImporter(cfg) { - @Override - protected int dataImport(JavaSparkContext jsc) throws IOException { - int ret = super.dataImport(jsc); - if (retry.decrementAndGet() == 0) { - fileCreated.incrementAndGet(); - createSchemaFile(schemaFile); - } - - return ret; - } - }; - // Schema file is not created so this operation should fail. - assertEquals(0, dataImporter.dataImport(jsc, retry.get())); - assertEquals(retry.get(), -1); - assertEquals(fileCreated.get(), 1); - - // Check if - // 1. .commit file is present - // 2. number of records in each partition == 24 - // 3. total number of partitions == 4; - boolean isCommitFilePresent = false; - Map recordCounts = new HashMap(); - RemoteIterator hoodieFiles = dfs.listFiles(hoodieFolder, true); - while (hoodieFiles.hasNext()) { - LocatedFileStatus f = hoodieFiles.next(); - isCommitFilePresent = isCommitFilePresent || f.getPath().toString().endsWith(HoodieTimeline.COMMIT_EXTENSION); - - if (f.getPath().toString().endsWith("parquet")) { - SQLContext sc = new SQLContext(jsc); - String partitionPath = f.getPath().getParent().toString(); - long count = sc.read().parquet(f.getPath().toString()).count(); - if (!recordCounts.containsKey(partitionPath)) recordCounts.put(partitionPath, 0L); - recordCounts.put(partitionPath, recordCounts.get(partitionPath) + count); - } - } - assertTrue("commit file is missing", isCommitFilePresent); - assertEquals("partition is missing", 4, recordCounts.size()); - for (Entry e : recordCounts.entrySet()) { - assertEquals( "missing records", 24, e.getValue().longValue()); - } - } finally { - if (jsc 
!= null) { - jsc.stop(); - } + if (f.getPath().toString().endsWith("parquet")) { + SQLContext sc = new SQLContext(jsc); + String partitionPath = f.getPath().getParent().toString(); + long count = sc.read().parquet(f.getPath().toString()).count(); + if (!recordCounts.containsKey(partitionPath)) { + recordCounts.put(partitionPath, 0L); + } + recordCounts.put(partitionPath, recordCounts.get(partitionPath) + count); } + } + assertTrue("commit file is missing", isCommitFilePresent); + assertEquals("partition is missing", 4, recordCounts.size()); + for (Entry e : recordCounts.entrySet()) { + assertEquals("missing records", 24, e.getValue().longValue()); + } + } finally { + if (jsc != null) { + jsc.stop(); + } } + } - private void createRecords(Path srcFolder) throws ParseException, IOException { - Path srcFile = new Path(srcFolder.toString(), "file1.parquet"); - long startTime = HoodieActiveTimeline.COMMIT_FORMATTER.parse("20170203000000").getTime() / 1000; - List records = new ArrayList(); - for (long recordNum = 0; recordNum < 96; recordNum++) { - records.add(HoodieTestDataGenerator - .generateGenericRecord(Long.toString(recordNum), "rider-" + recordNum, - "driver-" + recordNum, startTime + TimeUnit.HOURS.toSeconds(recordNum))); - } - ParquetWriter writer = AvroParquetWriter - .builder(srcFile) - .withSchema(HoodieTestDataGenerator.avroSchema) - .withConf(new Configuration()) - .build(); - for (GenericRecord record : records) { - writer.write(record); - } - writer.close(); + private void createRecords(Path srcFolder) throws ParseException, IOException { + Path srcFile = new Path(srcFolder.toString(), "file1.parquet"); + long startTime = HoodieActiveTimeline.COMMIT_FORMATTER.parse("20170203000000").getTime() / 1000; + List records = new ArrayList(); + for (long recordNum = 0; recordNum < 96; recordNum++) { + records.add(HoodieTestDataGenerator + .generateGenericRecord(Long.toString(recordNum), "rider-" + recordNum, + "driver-" + recordNum, startTime + 
TimeUnit.HOURS.toSeconds(recordNum))); } - - private void createSchemaFile(String schemaFile) throws IOException { - FSDataOutputStream schemaFileOS = dfs.create(new Path(schemaFile)); - schemaFileOS.write(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA.getBytes()); - schemaFileOS.close(); + ParquetWriter writer = AvroParquetWriter + .builder(srcFile) + .withSchema(HoodieTestDataGenerator.avroSchema) + .withConf(new Configuration()) + .build(); + for (GenericRecord record : records) { + writer.write(record); } + writer.close(); + } - /** - * Tests for scheme file. - * 1. File is missing. - * 2. File has invalid data. - */ - @Test - public void testSchemaFile() throws Exception { - JavaSparkContext jsc = null; - try { - jsc = getJavaSparkContext(); + private void createSchemaFile(String schemaFile) throws IOException { + FSDataOutputStream schemaFileOS = dfs.create(new Path(schemaFile)); + schemaFileOS.write(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA.getBytes()); + schemaFileOS.close(); + } - // Test root folder. - String basePath = (new Path(dfsBasePath, - Thread.currentThread().getStackTrace()[1].getMethodName())).toString(); - // Hoodie root folder - Path hoodieFolder = new Path(basePath, "testTarget"); - Path srcFolder = new Path(basePath.toString(), "srcTest"); - Path schemaFile = new Path(basePath.toString(), "missingFile.schema"); - HDFSParquetImporter.Config cfg = getHDFSParquetImporterConfig(srcFolder.toString(), hoodieFolder.toString(), - "testTable", "COPY_ON_WRITE", "_row_key", "timestamp", - 1, schemaFile.toString()); - HDFSParquetImporter dataImporter = new HDFSParquetImporter(cfg); - // Should fail - return : -1. - assertEquals(-1, dataImporter.dataImport(jsc, 0)); + /** + * Tests for scheme file. 1. File is missing. 2. File has invalid data. 
+ */ + @Test + public void testSchemaFile() throws Exception { + JavaSparkContext jsc = null; + try { + jsc = getJavaSparkContext(); - dfs.create(schemaFile).write("Random invalid schema data".getBytes()); - // Should fail - return : -1. - assertEquals(-1, dataImporter.dataImport(jsc, 0)); + // Test root folder. + String basePath = (new Path(dfsBasePath, + Thread.currentThread().getStackTrace()[1].getMethodName())).toString(); + // Hoodie root folder + Path hoodieFolder = new Path(basePath, "testTarget"); + Path srcFolder = new Path(basePath.toString(), "srcTest"); + Path schemaFile = new Path(basePath.toString(), "missingFile.schema"); + HDFSParquetImporter.Config cfg = getHDFSParquetImporterConfig(srcFolder.toString(), + hoodieFolder.toString(), + "testTable", "COPY_ON_WRITE", "_row_key", "timestamp", + 1, schemaFile.toString()); + HDFSParquetImporter dataImporter = new HDFSParquetImporter(cfg); + // Should fail - return : -1. + assertEquals(-1, dataImporter.dataImport(jsc, 0)); - } finally { - if (jsc != null) { - jsc.stop(); - } - } + dfs.create(schemaFile).write("Random invalid schema data".getBytes()); + // Should fail - return : -1. + assertEquals(-1, dataImporter.dataImport(jsc, 0)); + + } finally { + if (jsc != null) { + jsc.stop(); + } } + } - /** - * Test for missing rowKey and partitionKey. - */ - @Test - public void testRowAndPartitionKey() throws Exception { - JavaSparkContext jsc = null; - try { - jsc = getJavaSparkContext(); + /** + * Test for missing rowKey and partitionKey. + */ + @Test + public void testRowAndPartitionKey() throws Exception { + JavaSparkContext jsc = null; + try { + jsc = getJavaSparkContext(); - // Test root folder. - String basePath = (new Path(dfsBasePath, - Thread.currentThread().getStackTrace()[1].getMethodName())).toString(); - // Hoodie root folder - Path hoodieFolder = new Path(basePath, "testTarget"); + // Test root folder. 
+ String basePath = (new Path(dfsBasePath, + Thread.currentThread().getStackTrace()[1].getMethodName())).toString(); + // Hoodie root folder + Path hoodieFolder = new Path(basePath, "testTarget"); - //Create generic records. - Path srcFolder = new Path(basePath, "testSrc"); - createRecords(srcFolder); + //Create generic records. + Path srcFolder = new Path(basePath, "testSrc"); + createRecords(srcFolder); - // Create schema file. - Path schemaFile = new Path(basePath.toString(), "missingFile.schema"); - createSchemaFile(schemaFile.toString()); + // Create schema file. + Path schemaFile = new Path(basePath.toString(), "missingFile.schema"); + createSchemaFile(schemaFile.toString()); - HDFSParquetImporter dataImporter; - HDFSParquetImporter.Config cfg; + HDFSParquetImporter dataImporter; + HDFSParquetImporter.Config cfg; - // Check for invalid row key. - cfg = getHDFSParquetImporterConfig(srcFolder.toString(), hoodieFolder.toString(), - "testTable", "COPY_ON_WRITE", "invalidRowKey", "timestamp", - 1, schemaFile.toString()); - dataImporter = new HDFSParquetImporter(cfg); - assertEquals(-1, dataImporter.dataImport(jsc, 0)); + // Check for invalid row key. + cfg = getHDFSParquetImporterConfig(srcFolder.toString(), hoodieFolder.toString(), + "testTable", "COPY_ON_WRITE", "invalidRowKey", "timestamp", + 1, schemaFile.toString()); + dataImporter = new HDFSParquetImporter(cfg); + assertEquals(-1, dataImporter.dataImport(jsc, 0)); - // Check for invalid partition key. - cfg = getHDFSParquetImporterConfig(srcFolder.toString(), hoodieFolder.toString(), - "testTable", "COPY_ON_WRITE", "_row_key", "invalidTimeStamp", - 1, schemaFile.toString()); - dataImporter = new HDFSParquetImporter(cfg); - assertEquals(-1, dataImporter.dataImport(jsc, 0)); + // Check for invalid partition key. 
+ cfg = getHDFSParquetImporterConfig(srcFolder.toString(), hoodieFolder.toString(), + "testTable", "COPY_ON_WRITE", "_row_key", "invalidTimeStamp", + 1, schemaFile.toString()); + dataImporter = new HDFSParquetImporter(cfg); + assertEquals(-1, dataImporter.dataImport(jsc, 0)); - } finally { - if (jsc != null) { - jsc.stop(); - } - } + } finally { + if (jsc != null) { + jsc.stop(); + } } + } - private HDFSParquetImporter.Config getHDFSParquetImporterConfig(String srcPath, String targetPath, - String tableName, String tableType, String rowKey, String partitionKey, int parallelism, - String schemaFile) { - HDFSParquetImporter.Config cfg = new HDFSParquetImporter.Config(); - cfg.srcPath = srcPath; - cfg.targetPath = targetPath; - cfg.tableName = tableName; - cfg.tableType = tableType; - cfg.rowKey = rowKey; - cfg.partitionKey = partitionKey; - cfg.parallelism = parallelism; - cfg.schemaFile = schemaFile; - return cfg; - } + private HDFSParquetImporter.Config getHDFSParquetImporterConfig(String srcPath, String targetPath, + String tableName, String tableType, String rowKey, String partitionKey, int parallelism, + String schemaFile) { + HDFSParquetImporter.Config cfg = new HDFSParquetImporter.Config(); + cfg.srcPath = srcPath; + cfg.targetPath = targetPath; + cfg.tableName = tableName; + cfg.tableType = tableType; + cfg.rowKey = rowKey; + cfg.partitionKey = partitionKey; + cfg.parallelism = parallelism; + cfg.schemaFile = schemaFile; + return cfg; + } - private JavaSparkContext getJavaSparkContext() { - // Initialize a local spark env - SparkConf sparkConf = new SparkConf().setAppName("TestConversionCommand").setMaster("local[1]"); - sparkConf = HoodieWriteClient.registerClasses(sparkConf); - return new JavaSparkContext(HoodieReadClient.addHoodieSupport(sparkConf)); - } + private JavaSparkContext getJavaSparkContext() { + // Initialize a local spark env + SparkConf sparkConf = new SparkConf().setAppName("TestConversionCommand").setMaster("local[1]"); + sparkConf = 
HoodieWriteClient.registerClasses(sparkConf); + return new JavaSparkContext(HoodieReadClient.addHoodieSupport(sparkConf)); + } } diff --git a/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/TestHoodieSnapshotCopier.java b/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/TestHoodieSnapshotCopier.java index 33459b9fc..6f9acc489 100644 --- a/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/TestHoodieSnapshotCopier.java +++ b/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/TestHoodieSnapshotCopier.java @@ -16,9 +16,15 @@ package com.uber.hoodie.utilities; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + import com.uber.hoodie.common.HoodieTestDataGenerator; import com.uber.hoodie.common.model.HoodieTestUtils; import com.uber.hoodie.common.util.FSUtils; +import java.io.File; +import java.io.IOException; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.spark.SparkConf; @@ -28,124 +34,129 @@ import org.junit.Before; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import java.io.File; -import java.io.IOException; - -import static org.junit.Assert.*; - public class TestHoodieSnapshotCopier { - private String rootPath = null; - private String basePath = null; - private String outputPath = null; - private FileSystem fs = null; - private JavaSparkContext jsc = null; - @Before - public void init() throws IOException { - // Prepare directories - TemporaryFolder folder = new TemporaryFolder(); - folder.create(); - rootPath = folder.getRoot().getAbsolutePath(); - basePath = rootPath + "/" + HoodieTestUtils.RAW_TRIPS_TEST_NAME; - HoodieTestUtils.init(basePath); - outputPath = rootPath + "/output"; - fs = FSUtils.getFs(); - // Start a local Spark job - SparkConf conf = new SparkConf().setAppName("snapshot-test-job").setMaster("local[2]"); - jsc = new JavaSparkContext(conf); + private String rootPath = 
null; + private String basePath = null; + private String outputPath = null; + private FileSystem fs = null; + private JavaSparkContext jsc = null; + + @Before + public void init() throws IOException { + // Prepare directories + TemporaryFolder folder = new TemporaryFolder(); + folder.create(); + rootPath = folder.getRoot().getAbsolutePath(); + basePath = rootPath + "/" + HoodieTestUtils.RAW_TRIPS_TEST_NAME; + HoodieTestUtils.init(basePath); + outputPath = rootPath + "/output"; + fs = FSUtils.getFs(); + // Start a local Spark job + SparkConf conf = new SparkConf().setAppName("snapshot-test-job").setMaster("local[2]"); + jsc = new JavaSparkContext(conf); + } + + @Test + public void testEmptySnapshotCopy() throws IOException { + // There is no real data (only .hoodie directory) + assertEquals(fs.listStatus(new Path(basePath)).length, 1); + assertFalse(fs.exists(new Path(outputPath))); + + // Do the snapshot + HoodieSnapshotCopier copier = new HoodieSnapshotCopier(); + copier.snapshot(jsc, basePath, outputPath, true); + + // Nothing changed; we just bail out + assertEquals(fs.listStatus(new Path(basePath)).length, 1); + assertFalse(fs.exists(new Path(outputPath + "/_SUCCESS"))); + } + + //TODO - uncomment this after fixing test failures + //@Test + public void testSnapshotCopy() throws Exception { + // Generate some commits and corresponding parquets + String commitTime1 = "20160501010101"; + String commitTime2 = "20160502020601"; + String commitTime3 = "20160506030611"; + new File(basePath + "/.hoodie").mkdirs(); + new File(basePath + "/.hoodie/hoodie.properties").createNewFile(); + // Only first two have commit files + new File(basePath + "/.hoodie/" + commitTime1 + ".commit").createNewFile(); + new File(basePath + "/.hoodie/" + commitTime2 + ".commit").createNewFile(); + new File(basePath + "/.hoodie/" + commitTime3 + ".inflight").createNewFile(); + + // Some parquet files + new File(basePath + "/2016/05/01/").mkdirs(); + new File(basePath + 
"/2016/05/02/").mkdirs(); + new File(basePath + "/2016/05/06/").mkdirs(); + HoodieTestDataGenerator.writePartitionMetadata(fs, + new String[]{"2016/05/01", "2016/05/02", "2016/05/06"}, + basePath); + // Make commit1 + File file11 = new File( + basePath + "/2016/05/01/" + FSUtils.makeDataFileName(commitTime1, 1, "id11")); + file11.createNewFile(); + File file12 = new File( + basePath + "/2016/05/02/" + FSUtils.makeDataFileName(commitTime1, 1, "id12")); + file12.createNewFile(); + File file13 = new File( + basePath + "/2016/05/06/" + FSUtils.makeDataFileName(commitTime1, 1, "id13")); + file13.createNewFile(); + + // Make commit2 + File file21 = new File( + basePath + "/2016/05/01/" + FSUtils.makeDataFileName(commitTime2, 1, "id21")); + file21.createNewFile(); + File file22 = new File( + basePath + "/2016/05/02/" + FSUtils.makeDataFileName(commitTime2, 1, "id22")); + file22.createNewFile(); + File file23 = new File( + basePath + "/2016/05/06/" + FSUtils.makeDataFileName(commitTime2, 1, "id23")); + file23.createNewFile(); + + // Make commit3 + File file31 = new File( + basePath + "/2016/05/01/" + FSUtils.makeDataFileName(commitTime3, 1, "id31")); + file31.createNewFile(); + File file32 = new File( + basePath + "/2016/05/02/" + FSUtils.makeDataFileName(commitTime3, 1, "id32")); + file32.createNewFile(); + File file33 = new File( + basePath + "/2016/05/06/" + FSUtils.makeDataFileName(commitTime3, 1, "id33")); + file33.createNewFile(); + + // Do a snapshot copy + HoodieSnapshotCopier copier = new HoodieSnapshotCopier(); + copier.snapshot(jsc, basePath, outputPath, false); + + // Check results + assertTrue(fs.exists(new Path(outputPath + "/2016/05/01/" + file11.getName()))); + assertTrue(fs.exists(new Path(outputPath + "/2016/05/02/" + file12.getName()))); + assertTrue(fs.exists(new Path(outputPath + "/2016/05/06/" + file13.getName()))); + assertTrue(fs.exists(new Path(outputPath + "/2016/05/01/" + file21.getName()))); + assertTrue(fs.exists(new Path(outputPath + 
"/2016/05/02/" + file22.getName()))); + assertTrue(fs.exists(new Path(outputPath + "/2016/05/06/" + file23.getName()))); + assertFalse(fs.exists(new Path(outputPath + "/2016/05/01/" + file31.getName()))); + assertFalse(fs.exists(new Path(outputPath + "/2016/05/02/" + file32.getName()))); + assertFalse(fs.exists(new Path(outputPath + "/2016/05/06/" + file33.getName()))); + + assertTrue(fs.exists(new Path(outputPath + "/.hoodie/" + commitTime1 + ".commit"))); + assertTrue(fs.exists(new Path(outputPath + "/.hoodie/" + commitTime2 + ".commit"))); + assertFalse(fs.exists(new Path(outputPath + "/.hoodie/" + commitTime3 + ".commit"))); + assertFalse(fs.exists(new Path(outputPath + "/.hoodie/" + commitTime3 + ".inflight"))); + assertTrue(fs.exists(new Path(outputPath + "/.hoodie/hoodie.properties"))); + + assertTrue(fs.exists(new Path(outputPath + "/_SUCCESS"))); + } + + @After + public void cleanup() { + if (rootPath != null) { + new File(rootPath).delete(); } - - @Test - public void testEmptySnapshotCopy() throws IOException { - // There is no real data (only .hoodie directory) - assertEquals(fs.listStatus(new Path(basePath)).length, 1); - assertFalse(fs.exists(new Path(outputPath))); - - // Do the snapshot - HoodieSnapshotCopier copier = new HoodieSnapshotCopier(); - copier.snapshot(jsc, basePath, outputPath, true); - - // Nothing changed; we just bail out - assertEquals(fs.listStatus(new Path(basePath)).length, 1); - assertFalse(fs.exists(new Path(outputPath + "/_SUCCESS"))); - } - - //TODO - uncomment this after fixing test failures - //@Test - public void testSnapshotCopy() throws Exception { - // Generate some commits and corresponding parquets - String commitTime1 = "20160501010101"; - String commitTime2 = "20160502020601"; - String commitTime3 = "20160506030611"; - new File(basePath + "/.hoodie").mkdirs(); - new File(basePath + "/.hoodie/hoodie.properties").createNewFile(); - // Only first two have commit files - new File(basePath + "/.hoodie/" + commitTime1 + 
".commit").createNewFile(); - new File(basePath + "/.hoodie/" + commitTime2 + ".commit").createNewFile(); - new File(basePath + "/.hoodie/" + commitTime3 + ".inflight").createNewFile(); - - // Some parquet files - new File(basePath + "/2016/05/01/").mkdirs(); - new File(basePath + "/2016/05/02/").mkdirs(); - new File(basePath + "/2016/05/06/").mkdirs(); - HoodieTestDataGenerator.writePartitionMetadata(fs, - new String[] {"2016/05/01", "2016/05/02", "2016/05/06"}, - basePath); - // Make commit1 - File file11 = new File(basePath + "/2016/05/01/" + FSUtils.makeDataFileName(commitTime1, 1, "id11")); - file11.createNewFile(); - File file12 = new File(basePath + "/2016/05/02/" + FSUtils.makeDataFileName(commitTime1, 1, "id12")); - file12.createNewFile(); - File file13 = new File(basePath + "/2016/05/06/" + FSUtils.makeDataFileName(commitTime1, 1, "id13")); - file13.createNewFile(); - - // Make commit2 - File file21 = new File(basePath + "/2016/05/01/" + FSUtils.makeDataFileName(commitTime2, 1, "id21")); - file21.createNewFile(); - File file22 = new File(basePath + "/2016/05/02/" + FSUtils.makeDataFileName(commitTime2, 1, "id22")); - file22.createNewFile(); - File file23 = new File(basePath + "/2016/05/06/" + FSUtils.makeDataFileName(commitTime2, 1, "id23")); - file23.createNewFile(); - - // Make commit3 - File file31 = new File(basePath + "/2016/05/01/" + FSUtils.makeDataFileName(commitTime3, 1, "id31")); - file31.createNewFile(); - File file32 = new File(basePath + "/2016/05/02/" + FSUtils.makeDataFileName(commitTime3, 1, "id32")); - file32.createNewFile(); - File file33 = new File(basePath + "/2016/05/06/" + FSUtils.makeDataFileName(commitTime3, 1, "id33")); - file33.createNewFile(); - - // Do a snapshot copy - HoodieSnapshotCopier copier = new HoodieSnapshotCopier(); - copier.snapshot(jsc, basePath, outputPath, false); - - // Check results - assertTrue(fs.exists(new Path(outputPath + "/2016/05/01/" + file11.getName()))); - assertTrue(fs.exists(new Path(outputPath + 
"/2016/05/02/" + file12.getName()))); - assertTrue(fs.exists(new Path(outputPath + "/2016/05/06/" + file13.getName()))); - assertTrue(fs.exists(new Path(outputPath + "/2016/05/01/" + file21.getName()))); - assertTrue(fs.exists(new Path(outputPath + "/2016/05/02/" + file22.getName()))); - assertTrue(fs.exists(new Path(outputPath + "/2016/05/06/" + file23.getName()))); - assertFalse(fs.exists(new Path(outputPath + "/2016/05/01/" + file31.getName()))); - assertFalse(fs.exists(new Path(outputPath + "/2016/05/02/" + file32.getName()))); - assertFalse(fs.exists(new Path(outputPath + "/2016/05/06/" + file33.getName()))); - - assertTrue(fs.exists(new Path(outputPath + "/.hoodie/" + commitTime1 + ".commit"))); - assertTrue(fs.exists(new Path(outputPath + "/.hoodie/" + commitTime2 + ".commit"))); - assertFalse(fs.exists(new Path(outputPath + "/.hoodie/" + commitTime3 + ".commit"))); - assertFalse(fs.exists(new Path(outputPath + "/.hoodie/" + commitTime3 + ".inflight"))); - assertTrue(fs.exists(new Path(outputPath + "/.hoodie/hoodie.properties"))); - - assertTrue(fs.exists(new Path(outputPath + "/_SUCCESS"))); - } - - @After - public void cleanup() { - if (rootPath != null) { - new File(rootPath).delete(); - } - if (jsc != null) { - jsc.stop(); - } + if (jsc != null) { + jsc.stop(); } + } } diff --git a/hoodie-utilities/src/test/resources/log4j-surefire.properties b/hoodie-utilities/src/test/resources/log4j-surefire.properties index eab225528..3613e7d12 100644 --- a/hoodie-utilities/src/test/resources/log4j-surefire.properties +++ b/hoodie-utilities/src/test/resources/log4j-surefire.properties @@ -16,7 +16,6 @@ log4j.rootLogger=WARN, A1 log4j.category.com.uber=INFO log4j.category.org.apache.parquet.hadoop=WARN - # A1 is set to be a ConsoleAppender. log4j.appender.A1=org.apache.log4j.ConsoleAppender # A1 uses PatternLayout. diff --git a/pom.xml b/pom.xml index 1c61798d4..b19b801d2 100644 --- a/pom.xml +++ b/pom.xml @@ -15,666 +15,673 @@ ~ limitations under the License. 
--> - - 4.0.0 + + 4.0.0 - com.uber.hoodie - hoodie - pom - 0.4.1-SNAPSHOT - Hoodie is a Apache Spark library that provides the ability to efficiently do incremental processing on datasets in HDFS - https://github.com/uber/hoodie - Hoodie + com.uber.hoodie + hoodie + pom + 0.4.1-SNAPSHOT + Hoodie is a Apache Spark library that provides the ability to efficiently do + incremental processing on datasets in HDFS + + https://github.com/uber/hoodie + Hoodie - - hoodie-common - hoodie-client - hoodie-cli - hoodie-hadoop-mr - hoodie-hive - hoodie-utilities - hoodie-spark - + + hoodie-common + hoodie-client + hoodie-cli + hoodie-hadoop-mr + hoodie-hive + hoodie-utilities + hoodie-spark + - - - Apache License, Version 2.0 - http://www.apache.org/licenses/LICENSE-2.0.txt - repo - A business-friendly OSS license - - + + + Apache License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + A business-friendly OSS license + + - - Uber Technologies Inc. - http://www.uber.com/ - + + Uber Technologies Inc. 
+ http://www.uber.com/ + - - - vinoth - Vinoth Chandar - Uber - - - prasanna - Prasanna Rajaperumal - Uber - - + + + vinoth + Vinoth Chandar + Uber + + + prasanna + Prasanna Rajaperumal + Uber + + - - - Wei Yan - Uber - - - Siddhartha Gunda - Uber - - - Omkar Joshi - Uber - - - Zeeshan Qureshi - Shopify - - - Kathy Ge - Shopify - - - Kaushik Devarajaiah - Uber - - - Nishith Agarwal - Uber - - + + + Wei Yan + Uber + + + Siddhartha Gunda + Uber + + + Omkar Joshi + Uber + + + Zeeshan Qureshi + Shopify + + + Kathy Ge + Shopify + + + Kaushik Devarajaiah + Uber + + + Nishith Agarwal + Uber + + - 2015-2016 + 2015-2016 + + + com.google.code.gson + gson + 2.3.1 + test + + + junit + junit + ${junit.version} + test + + + + + 2.10 + 2.6 + 2.19.1 + 1.8.1 + 4.11 + 1.9.5 + 1.2.17 + 5.7.2 + 2.6.0 + 1.1.0 + 3.1.1 + 2.1.0 + 2.11.8 + 2.11 + + + + scm:git:git@github.com:uber/hoodie.git + scm:git:git@github.com:uber/hoodie.git + git@github.com:uber/hoodie.git + HEAD + + + + + User List + hoodie-user@googlegroups.com + https://groups.google.com/d/forum/hoodie-user/ + + + Developer List + hoodie-dev@googlegroups.com + https://groups.google.com/d/forum/hoodie-dev/ + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + + 1.8 + 1.8 + + + + org.apache.maven.plugins + maven-release-plugin + 2.5.3 + + true + false + release + deploy + + + + org.apache.maven.plugins + maven-surefire-plugin + ${maven-surefire-plugin.version} + + + ${surefireArgLine} + + + file:${project.build.testOutputDirectory}/log4j-surefire.properties + + + + + **/IT*.java + + + + + + + + + maven-dependency-plugin + ${maven-dependency-plugin.version} + + + maven-jar-plugin + ${maven-jar-plugin.version} + + + org.jacoco + jacoco-maven-plugin + 0.7.8 + + + + pre-unit-test + + prepare-agent + + + + ${project.build.directory}/coverage-reports/jacoco-ut.exec + + surefireArgLine + + + + + post-unit-test + test + + report + + + + ${project.build.directory}/coverage-reports/jacoco-ut.exec + + 
${project.reporting.outputDirectory}/jacoco-ut + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + org.apache.rat + apache-rat-plugin + 0.11 + + + **/.* + **/*.txt + **/*.sh + **/*.log + **/dependency-reduced-pom.xml + **/test/resources/*.avro + **/test/resources/*.data + **/test/resources/*.schema + **/test/resources/*.csv + **/main/avro/*.avsc + **/target/* + + + + + package + + check + + + + + + org.apache.avro + avro-maven-plugin + 1.7.6 + + + generate-sources + + schema + + + ${project.basedir}/src/main/avro/ + ${project.build.directory}/generated-sources/src/main/java/ + + String + + + + + + + + + - - com.google.code.gson - gson - 2.3.1 - test - - - junit - junit - ${junit.version} - test - + + + com.beust + jcommander + 1.48 + + + + log4j + log4j + ${log4j.version} + + + + org.apache.hadoop + hadoop-client + ${hadoop.version}-cdh${cdh.version} + provided + + + + org.apache.parquet + parquet-avro + ${parquet.version} + + + + org.apache.parquet + parquet-hadoop + ${parquet.version} + + + + org.apache.avro + avro-mapred + 1.7.7 + + + + + com.google.guava + guava + 15.0 + + + + + org.apache.hadoop + hadoop-common + ${hadoop.version}-cdh${cdh.version} + provided + + + org.apache.hadoop + hadoop-hdfs + ${hadoop.version}-cdh${cdh.version} + provided + + + org.apache.hadoop + hadoop-auth + ${hadoop.version}-cdh${cdh.version} + provided + + + org.apache.hive + hive-common + ${hive.version}-cdh${cdh.version} + provided + + + org.apache.hadoop + hadoop-mapreduce-client-core + ${hadoop.version}-cdh${cdh.version} + provided + + + org.apache.hadoop + hadoop-mapreduce-client-common + 2.6.0-cdh5.7.2 + provided + + + org.apache.hive + hive-exec + 1.1.0-cdh5.7.2 + provided + + + commons-logging + commons-logging + 1.2 + + + + + + + com.twitter + parquet-hadoop-bundle + 1.5.0-cdh5.7.2 + + + com.twitter + parquet-hive-bundle + 1.5.0 + + + com.twitter + parquet-avro + 1.5.0-cdh5.7.2 + + + + org.apache.parquet + parquet-hive-bundle + 1.8.1 + + + + org.apache.spark + 
spark-core_2.11 + ${spark.version} + provided + + + org.apache.spark + spark-sql_2.11 + ${spark.version} + provided + + + + org.apache.hbase + hbase-client + 1.0.0 + + + + org.apache.avro + avro + 1.7.6-cdh5.7.2 + + + org.slf4j + slf4j-api + + + + + + + io.dropwizard.metrics + metrics-graphite + ${metrics.version} + + + io.dropwizard.metrics + metrics-core + ${metrics.version} + + + + xerces + xercesImpl + 2.9.1 + + + xalan + xalan + 2.7.1 + + + + commons-dbcp + commons-dbcp + 1.4 + + + org.apache.httpcomponents + httpcore + 4.3.2 + + + org.slf4j + slf4j-api + 1.7.5 + + + org.slf4j + slf4j-log4j12 + 1.7.5 + + + + org.apache.commons + commons-configuration2 + 2.1.1 + + + + com.fasterxml.jackson.core + jackson-annotations + 2.6.0 + + + org.codehaus.jackson + jackson-mapper-asl + 1.9.13 + + + + org.apache.hive + hive-jdbc + ${hive.version}-cdh${cdh.version} + + + + org.apache.hive + hive-service + ${hive.version}-cdh${cdh.version} + + + org.apache.hive + hive-metastore + ${hive.version}-cdh${cdh.version} + + + org.apache.commons + commons-lang3 + 3.4 + + + + junit + junit + 4.12 + + + org.apache.hadoop + hadoop-hdfs + tests + ${hadoop.version}-cdh${cdh.version} + + + org.apache.hadoop + hadoop-common + tests + ${hadoop.version}-cdh${cdh.version} + + + org.mockito + mockito-all + test + 1.10.19 + + + + com.esotericsoftware + kryo + 4.0.0 + test + - - 2.10 - 2.6 - 2.19.1 - 1.8.1 - 4.11 - 1.9.5 - 1.2.17 - 5.7.2 - 2.6.0 - 1.1.0 - 3.1.1 - 2.1.0 - 2.11.8 - 2.11 - + - - scm:git:git@github.com:uber/hoodie.git - scm:git:git@github.com:uber/hoodie.git - git@github.com:uber/hoodie.git - HEAD - + + + cloudera-repo-releases + https://repository.cloudera.com/artifactory/public/ + + - - - User List - hoodie-user@googlegroups.com - https://groups.google.com/d/forum/hoodie-user/ - - - Developer List - hoodie-dev@googlegroups.com - https://groups.google.com/d/forum/hoodie-dev/ - - + + + ossrh + https://oss.sonatype.org/content/repositories/snapshots + + + ossrh + 
https://oss.sonatype.org/service/local/staging/deploy/maven2/ + + - + + + release + + + deployArtifacts + true + + + - - org.apache.maven.plugins - maven-compiler-plugin - - 1.8 - 1.8 - - - - org.apache.maven.plugins - maven-release-plugin - 2.5.3 - - true - false - release - deploy - - - - org.apache.maven.plugins - maven-surefire-plugin - ${maven-surefire-plugin.version} - - - ${surefireArgLine} - - file:${project.build.testOutputDirectory}/log4j-surefire.properties - - - - **/IT*.java - - - + + org.apache.maven.plugins + maven-source-plugin + 2.2.1 + + + attach-sources + + jar-no-fork + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + 2.9.1 + + + attach-javadocs + + jar + + + + + -Xdoclint:none + + + + org.apache.maven.plugins + maven-gpg-plugin + 1.5 + + + sign-artifacts + verify + + sign + + + + + + org.sonatype.plugins + nexus-staging-maven-plugin + 1.6.2 + true + + ossrh + https://oss.sonatype.org/ + true + + + + + - - - - maven-dependency-plugin - ${maven-dependency-plugin.version} - - - maven-jar-plugin - ${maven-jar-plugin.version} - - - org.jacoco - jacoco-maven-plugin - 0.7.8 - - - - pre-unit-test - - prepare-agent - - - - ${project.build.directory}/coverage-reports/jacoco-ut.exec - - surefireArgLine - - - - - post-unit-test - test - - report - - - - ${project.build.directory}/coverage-reports/jacoco-ut.exec - - ${project.reporting.outputDirectory}/jacoco-ut - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - org.apache.rat - apache-rat-plugin - 0.11 - - - **/.* - **/*.txt - **/*.sh - **/*.log - **/dependency-reduced-pom.xml - **/test/resources/*.avro - **/test/resources/*.data - **/test/resources/*.schema - **/test/resources/*.csv - **/main/avro/*.avsc - **/target/* - - - - - package - - check - - - - - - org.apache.avro - avro-maven-plugin - 1.7.6 - - - generate-sources - - schema - - - ${project.basedir}/src/main/avro/ - ${project.build.directory}/generated-sources/src/main/java/ - String - - - - - - - - - - - - - com.beust - 
jcommander - 1.48 - - - - log4j - log4j - ${log4j.version} - - - - org.apache.hadoop - hadoop-client - ${hadoop.version}-cdh${cdh.version} - provided - - - - org.apache.parquet - parquet-avro - ${parquet.version} - - - - org.apache.parquet - parquet-hadoop - ${parquet.version} - - - - org.apache.avro - avro-mapred - 1.7.7 - - - - - com.google.guava - guava - 15.0 - - - - - org.apache.hadoop - hadoop-common - ${hadoop.version}-cdh${cdh.version} - provided - - - org.apache.hadoop - hadoop-hdfs - ${hadoop.version}-cdh${cdh.version} - provided - - - org.apache.hadoop - hadoop-auth - ${hadoop.version}-cdh${cdh.version} - provided - - - org.apache.hive - hive-common - ${hive.version}-cdh${cdh.version} - provided - - - org.apache.hadoop - hadoop-mapreduce-client-core - ${hadoop.version}-cdh${cdh.version} - provided - - - org.apache.hadoop - hadoop-mapreduce-client-common - 2.6.0-cdh5.7.2 - provided - - - org.apache.hive - hive-exec - 1.1.0-cdh5.7.2 - provided - - - commons-logging - commons-logging - 1.2 - - - - - - - com.twitter - parquet-hadoop-bundle - 1.5.0-cdh5.7.2 - - - com.twitter - parquet-hive-bundle - 1.5.0 - - - com.twitter - parquet-avro - 1.5.0-cdh5.7.2 - - - - org.apache.parquet - parquet-hive-bundle - 1.8.1 - - - - org.apache.spark - spark-core_2.11 - ${spark.version} - provided - - - org.apache.spark - spark-sql_2.11 - ${spark.version} - provided - - - - org.apache.hbase - hbase-client - 1.0.0 - - - - org.apache.avro - avro - 1.7.6-cdh5.7.2 - - - org.slf4j - slf4j-api - - - - - - - io.dropwizard.metrics - metrics-graphite - ${metrics.version} - - - io.dropwizard.metrics - metrics-core - ${metrics.version} - - - - xerces - xercesImpl - 2.9.1 - - - xalan - xalan - 2.7.1 - - - - commons-dbcp - commons-dbcp - 1.4 - - - org.apache.httpcomponents - httpcore - 4.3.2 - - - org.slf4j - slf4j-api - 1.7.5 - - - org.slf4j - slf4j-log4j12 - 1.7.5 - - - - org.apache.commons - commons-configuration2 - 2.1.1 - - - - com.fasterxml.jackson.core - jackson-annotations - 2.6.0 
- - - org.codehaus.jackson - jackson-mapper-asl - 1.9.13 - - - - org.apache.hive - hive-jdbc - ${hive.version}-cdh${cdh.version} - - - - org.apache.hive - hive-service - ${hive.version}-cdh${cdh.version} - - - org.apache.hive - hive-metastore - ${hive.version}-cdh${cdh.version} - - - org.apache.commons - commons-lang3 - 3.4 - - - - junit - junit - 4.12 - - - org.apache.hadoop - hadoop-hdfs - tests - ${hadoop.version}-cdh${cdh.version} - - - org.apache.hadoop - hadoop-common - tests - ${hadoop.version}-cdh${cdh.version} - - - org.mockito - mockito-all - test - 1.10.19 - - - - com.esotericsoftware - kryo - 4.0.0 - test - - - - - - - - cloudera-repo-releases - https://repository.cloudera.com/artifactory/public/ - - - - - - ossrh - https://oss.sonatype.org/content/repositories/snapshots - - - ossrh - https://oss.sonatype.org/service/local/staging/deploy/maven2/ - - - - - - release - - - deployArtifacts - true - - - - - - org.apache.maven.plugins - maven-source-plugin - 2.2.1 - - - attach-sources - - jar-no-fork - - - - - - org.apache.maven.plugins - maven-javadoc-plugin - 2.9.1 - - - attach-javadocs - - jar - - - - - -Xdoclint:none - - - - org.apache.maven.plugins - maven-gpg-plugin - 1.5 - - - sign-artifacts - verify - - sign - - - - - - org.sonatype.plugins - nexus-staging-maven-plugin - 1.6.2 - true - - ossrh - https://oss.sonatype.org/ - true - - - - - - - - - GitHub - https://github.com/uber/hoodie/issues - + + GitHub + https://github.com/uber/hoodie/issues +